**Twitter Sentiment Analysis using Naive Bayes Algorithm**

In [1]:
#importing nltk to download twitter_samples,stopwords
import nltk

In [2]:
#we have to download those to our pc
nltk.download('twitter_samples')
nltk.download('stopwords')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
#importing from nltk-corpus
from nltk.corpus import twitter_samples,stopwords

In [4]:
pos_tweets=twitter_samples.strings('positive_tweets.json')
neg_tweets=twitter_samples.strings('negative_tweets.json')

In [5]:
#type
type(pos_tweets)

list

In [6]:
#an eg, first tweet
pos_tweets[0]

'#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)'

In [7]:
#length of pos tweets
len(pos_tweets)

5000

In [8]:
#len of neg tweets
len(neg_tweets)

5000

In [9]:
#an eg of neg tweet,
neg_tweets[22]

'just want to play video games/watch movies with someone :('

here we have 5000 positive tweets and 5000 negative tweets. 
both are located in like strings in a list.


# 1. Train Test split,

In [10]:
#train - 80% of tweets from both
train_pos = pos_tweets[:4000]
train_neg = neg_tweets[:4000]
train = train_pos + train_neg
test_pos = pos_tweets[4000:]
test_neg = neg_tweets[4000:] 
test = test_pos + test_neg

#length confirmation
print(f'length of train data : {len(train)}')
print(f'length of test data : {len(test)}')

length of train data : 8000
length of test data : 2000


Now we have 8000 train data(includes 4000 pos and 4000 neg)
and 2000 test data(includes 1000 pos and 1000 neg)

# 2. Making Label for both train and test set

In [11]:
import numpy as np
train_label=np.append((np.ones(4000)),np.zeros(4000))
test_label=np.append((np.ones(1000)),np.zeros(1000))
#shapes
print(train_label.shape)
print(test_label.shape)

(8000,)
(2000,)


# 3. Processing Data

In [12]:
#let's see one tweet, how is looking
train[0]

'#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)'

here we can see a string which includes words,hashtag,symbols, etc meaningless terms. remove those to get it easy

* feature extraction is very important before feeding into any ML models.

In [13]:
import re
import string

from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [14]:
#making a function to process
def process(tweet):
  #input : a tweet
  #output : a list of cleaned words
  
  tweet=re.sub(r'#','',tweet)  #hashtag removals
  tweet=re.sub(r'\$\w*','',tweet)  #stockticket removals
  tweet=re.sub(r'https?:\/\/.*[\r\n]*','',tweet)  #hyperlinks removals
  tweet=re.sub(r'^RT[\s]+','',tweet)   #retweets removals

  tokenizer=TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
  tokens=tokenizer.tokenize(tweet)

  eng_stopwords=stopwords.words('english')
  punctuations=string.punctuation
  stemmer=PorterStemmer()

  cleaned_tokens=[]
  for word in tokens:
    if (word not in eng_stopwords and word not in punctuations):
      stem_word=stemmer.stem(word)
      cleaned_tokens.append(stem_word)
  return cleaned_tokens



In [15]:
#eg of process
process(train[0])

['followfriday', 'top', 'engag', 'member', 'commun', 'week', ':)']

Rename inputs and outputs

In [16]:
X_train=train
y_train=train_label
X_test=test
y_test=test_label

# Feature Extraction

making a dictionary, which contains {(word,label):counts,........}

key   > (word,label)

value > counts...how much appears

In [17]:
X_train[:3]        #first 3 tweets

['#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)',
 '@Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!',
 '@DespiteOfficial we had a listen last night :) As You Bleed is an amazing track. When are you in Scotland?!']

In [18]:
y_train

array([1., 1., 1., ..., 0., 0., 0.])

In [19]:
result={}
for y,tweet in zip(y_train,X_train):
  for word in process(tweet):
    pair=(word,y)
    if pair in result:
      result[pair] += 1
    else:
      result[pair] = 1

result

{('followfriday', 1.0): 23,
 ('top', 1.0): 30,
 ('engag', 1.0): 7,
 ('member', 1.0): 14,
 ('commun', 1.0): 27,
 ('week', 1.0): 72,
 (':)', 1.0): 2847,
 ('hey', 1.0): 60,
 ('jame', 1.0): 7,
 ('odd', 1.0): 2,
 (':/', 1.0): 5,
 ('pleas', 1.0): 80,
 ('call', 1.0): 27,
 ('contact', 1.0): 4,
 ('centr', 1.0): 1,
 ('02392441234', 1.0): 1,
 ('abl', 1.0): 6,
 ('assist', 1.0): 1,
 ('mani', 1.0): 28,
 ('thank', 1.0): 504,
 ('listen', 1.0): 14,
 ('last', 1.0): 39,
 ('night', 1.0): 55,
 ('bleed', 1.0): 2,
 ('amaz', 1.0): 41,
 ('track', 1.0): 5,
 ('scotland', 1.0): 2,
 ('congrat', 1.0): 15,
 ('yeaaah', 1.0): 1,
 ('yipppi', 1.0): 1,
 ('accnt', 1.0): 2,
 ('verifi', 1.0): 2,
 ('rqst', 1.0): 1,
 ('succeed', 1.0): 1,
 ('got', 1.0): 57,
 ('blue', 1.0): 8,
 ('tick', 1.0): 1,
 ('mark', 1.0): 1,
 ('fb', 1.0): 4,
 ('profil', 1.0): 2,
 ('15', 1.0): 4,
 ('day', 1.0): 187,
 ('one', 1.0): 90,
 ('irresist', 1.0): 2,
 ('flipkartfashionfriday', 1.0): 16,
 ('like', 1.0): 187,
 ('keep', 1.0): 55,
 ('love', 1.0): 336,
 

In [20]:
len(result)

11346

# Training

Naive bayes

* less time to train and predict

1.Identify number of classes

prior and logprior

lod(number of pos egs/number of neg egs)

In [21]:
#here,
#log(4000/4000)
np.log(4000/4000)

0.0

2.likelihood and loglikelihood

In [22]:
# to get logprior and loglikelihood for any word

find the frequencies,

In [23]:
def lookup(result, word, label):
    '''
    Input:
        freqs: a dictionary with the frequency of each pair (or tuple)
        word: the word to look up
        label: the label corresponding to the word
    Output:
        n: the number of times the word with its corresponding label appears.
    '''
    n = 0  # freqs.get((word, label), 0)

    pair = (word, label)
    if (pair in result):
        n = result[pair]

    return n

In [24]:
def NB(result,X_train,y_train):
  #to find number of unique words in the dictionary
  vocab=set([pair[0] for pair in result.keys()])
  V=len(vocab)     #length
  
  #to find logprior
  #logprior=np.log(number of total pos)-np.log(number of total neg)
  D_pos=len(list(filter(lambda x : x>0, y_train)))
  D_neg=len(list(filter(lambda x: x<=0, y_train)))
  logprior=np.log(D_pos)-np.log(D_neg)

  #to find loglikelihood
  #(freq_pos+1 / N_pos+V)
  #(freq_neg+1 / N_neg+V)
  N_pos=N_neg=0
  for pair in result.keys():
    if pair[1]>0:
      N_pos += result[pair]
    else:
      N_neg += result[pair]

  loglikelihood={}
  for word in vocab:
    freq_pos=lookup(result, word, 1)
    freq_neg=lookup(result, word, 0)

    p_pos=(freq_pos+1)/(N_pos+V)
    p_neg=(freq_neg+1)/(N_neg+V)

    loglikelihood[word] = np.log(p_pos/p_neg)

  return logprior,loglikelihood

In [25]:
NB(result,X_train,y_train)

(0.0,
 {'ff__special': 0.6985591249960175,
  'ms': -0.6877352361238731,
  'lorm': -0.6877352361238731,
  'sabadodeganarseguidor': -0.4000531636720922,
  'ko': -0.8055182717802566,
  '1.300': -0.6877352361238731,
  'taco': 0.005411944436072202,
  'manni': 0.6985591249960175,
  'hidden': 0.6985591249960175,
  'areadi': -0.6877352361238731,
  'ant': -1.0932003442320375,
  'scientist': 0.005411944436072202,
  'mari': 0.6985591249960175,
  'houston': 0.6985591249960175,
  '0ne': 1.104024233104182,
  'fineandyu': -0.6877352361238731,
  'llaollao': -0.6877352361238731,
  'nighti': 0.6985591249960175,
  'etid': -0.6877352361238731,
  'wetherspoon': -0.6877352361238731,
  'mommi': -0.6877352361238731,
  'straplin': -0.6877352361238731,
  'exclud': -0.6877352361238731,
  'assist': 0.6985591249960175,
  'decent': 0.005411944436072202,
  'artist': -0.4000531636720922,
  '🌎': 1.104024233104182,
  'icloud': -0.6877352361238731,
  'routt': 0.6985591249960175,
  'blurri': 0.6985591249960175,
  'avril'

In [26]:
logprior,loglikelihood=NB(result,X_train,y_train)
len(loglikelihood)

9089

# Test score

if score > 0 ............positive tweet

if score < 0 ............negative tweet 

In [27]:
#score of a tweet= logprior + loglikelihood

In [44]:
def naive_bayes_predict(tweet, logprior,loglikelihood ):
    
    word_l = process(tweet)

    # initialize probability to zero
    p = 0

    # add the logprior
    p += logprior

    for word in word_l:

        # check if the word exists in the loglikelihood dictionary
        if word in loglikelihood:
            # add the log likelihood of that word to the probability
            p += loglikelihood[word]

    

    return p


In [45]:
naive_bayes_predict('i am sad', logprior,loglikelihood )

-2.817949103177132

# Test Accuracy

In [48]:
#def test_accuracy(X_test,y_test,logprior,loglikelihood):
y_hat=[]
for tweet in X_test:
  if naive_bayes_predict(tweet,logprior,loglikelihood) > 0:
    y_hat_i=1
  else:
    y_hat_i=0
  y_hat.append(y_hat_i)

error = np.mean(np.absolute(y_hat-y_test))
acc=1-error
acc
  

0.994

99.4 % Accuracy on test set