# Training Naive Bayes
### Steps:

In [46]:
!pip install nltk



In [47]:
import nltk
import re 
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords #corpus is a list of strings/text
stop_words = stopwords.words('english')
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\lgene\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [48]:
from nltk.tokenize import TweetTokenizer
from nltk.corpus import twitter_samples #downloads the .json file of twitter_samples data 

import numpy as np
import pandas as pd

In [49]:
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

print('Number of positive tweets: ', len(positive_tweets))
print('Number of negative tweets: ', len(negative_tweets))

print('\nThe type of postive_tweets is: ', type(positive_tweets))
print('The type of a tweet entry is: ', type(positive_tweets[0]))

Number of positive tweets:  5000
Number of negative tweets:  5000

The type of postive_tweets is:  <class 'list'>
The type of a tweet entry is:  <class 'str'>


### Text Preprocessing

In [50]:
#apply the following regular expression using "sub" which replaces a specific substring with a specific substring, here we replace those substrings with an empty string
def clean_text(tweet):
    #clean text 
    tweet=re.sub('(#|@)\w*',"",tweet)# \w [a-z|A_Z|0-9|_] #remoce hashtage ,username
    tweet=re.sub("https?:\/\/\S+","",tweet) #remove hyperlink
    tweet=re.sub("(\?|!)+"," ",tweet) #remve (?!)
    tweet=re.sub("\s\d+\s","",tweet) # 33  
    tweet=re.sub("(\.|\,)+","",tweet) #remove . ,
    tweet=re.sub("^\s+","",tweet) #remove space 
    tweet=re.sub("\s+$","",tweet)#remove space 
    tweet = re.sub(r'[^\w\s]', '', tweet)# remove emoji, although emojis are a great indicator of sentiment and could be added to the frequency table to judge sentiment but for now I will exclude them and can include them when I enhance this model
    
    return tweet 

In [51]:
from nltk.corpus import stopwords
stop_words=stopwords.words('english')

In [52]:
def process_sentence(tweets):
    clean_tweets=[]
    for tweet in tweets:
        tweet=clean_text(tweet) #remove all unneededd regexes 
        tweet=tweet.split() #split words using space into a list of strings to analyze them
        c_tweet=[word.lower() for word in tweet if word.lower() not in stop_words] #using list comprehension remove stop wods & convert to lower case
        #stemming 
        ps=PorterStemmer()
        clean_tweet=[ps.stem(word) for word in c_tweet]#convert word to  base 
        clean_tweets.append(clean_tweet) #a list of lists where each lists represents a single tweet 

    return clean_tweets

In [53]:
positive_tweets=process_sentence(positive_tweets)
negative_tweets=process_sentence(negative_tweets)

In [54]:
for i in range(10):
    print("#" , i)
    print(positive_tweets[i])

# 0
['top', 'engag', 'member', 'commun', 'week']
# 1
['hey', 'jame', 'odd', 'pleas', 'call', 'contact', 'centr', 'onand', 'abl', 'assist', 'mani', 'thank']
# 2
['listen', 'last', 'night', 'bleed', 'amaz', 'track', 'scotland']
# 3
['congrat']
# 4
['yeaaaah', 'yippppi', 'accnt', 'verifi', 'rqst', 'succeed', 'got', 'blue', 'tick', 'mark', 'fb', 'profil', 'inday']
# 5
['one', 'irresist']
# 6
['dont', 'like', 'keep', 'love', 'custom', 'wait', 'long', 'hope', 'enjoy', 'happi', 'friday', 'lwwf']
# 7
['second', 'thought', 'there', 'enough', 'time', 'dd', 'new', 'short', 'enter', 'system', 'sheep', 'must', 'buy']
# 8
['jgh', 'go', 'bayan', 'bye']
# 9
['act', 'mischiev', 'call', 'etl', 'layer', 'inhous', 'wareh', 'app', 'katamari', 'well', 'name', 'impli', 'p']


In [55]:
for i in range(10):
    print("#" , i)
    print(negative_tweets[i])

# 0
['hopeless', 'tmr']
# 1
['everyth', 'kid', 'section', 'ikea', 'cute', 'shame', 'im', 'nearlyinmonth']
# 2
['heart', 'slide', 'wast', 'basket']
# 3
['hate', 'japanes', 'call', 'bani']
# 4
['dang', 'start', 'next', 'week', 'work']
# 5
['oh', 'god', 'babi', 'face']
# 6
['make', 'smile']
# 7
['work', 'neighbour', 'motor', 'ask', 'said', 'hate', 'updat', 'search']
# 8
['sialan']
# 9
['athabasca', 'glacier']


In [66]:
#create labels for positive and negative tweets
positive_labels=[1]*len(positive_tweets) # "1" represents positive tweets
negative_labels=[0]*len(negative_tweets) # "0" represents negative tweets

#positive_labels.extend(negative_labels)
#positive_tweets.extend(negative_tweets)

labels=positive_labels+negative_labels
tweets=positive_tweets+negative_tweets

print("len tweets",len(tweets))
print("len labels",len(labels))

len tweets 10006
len labels 10006


In [68]:
for i in range(10): 
    print(tweets[i] , labels[i])

['top', 'engag', 'member', 'commun', 'week'] 1
['hey', 'jame', 'odd', 'pleas', 'call', 'contact', 'centr', 'onand', 'abl', 'assist', 'mani', 'thank'] 1
['listen', 'last', 'night', 'bleed', 'amaz', 'track', 'scotland'] 1
['congrat'] 1
['yeaaaah', 'yippppi', 'accnt', 'verifi', 'rqst', 'succeed', 'got', 'blue', 'tick', 'mark', 'fb', 'profil', 'inday'] 1
['one', 'irresist'] 1
['dont', 'like', 'keep', 'love', 'custom', 'wait', 'long', 'hope', 'enjoy', 'happi', 'friday', 'lwwf'] 1
['second', 'thought', 'there', 'enough', 'time', 'dd', 'new', 'short', 'enter', 'system', 'sheep', 'must', 'buy'] 1
['jgh', 'go', 'bayan', 'bye'] 1
['act', 'mischiev', 'call', 'etl', 'layer', 'inhous', 'wareh', 'app', 'katamari', 'well', 'name', 'impli', 'p'] 1


In [69]:
#shuffle two lists
import random 

zip_list=list(zip(tweets,labels)) #converts to tupple
random.shuffle(zip_list)
tweets,labels=zip(*zip_list)
tweets[1]
labels[1]

0

In [70]:
#after shuffling 
for i in range(10): 
    print(tweets[i] , labels[i])

['im', 'hope', 'come', 'back', 'au', 'afford', 'go', 'dont', 'think', 'ill', 'abl'] 0
['fbc', 'mess', 'dm'] 0
['happi', 'birthday', 'gorgeou', 'hope', 'ill', 'see', 'fair'] 1
['omg', 'realli', 'im', 'sorri', 'babe'] 0
['still', 'fck', 'nae', 'nae', 'liter', 'made', 'fuck', 'cri', 'bc', 'deep'] 0
['everyon', 'holiday'] 0
['favourit', 'cream', 'soda'] 1
['hrryok', 'fact', 'harri', 'still', 'hold', 'mom', 'hand'] 0
['yeah', 'dude', 'keep', 'calm', 'brace'] 0
['get', 'ive', 'tri', 'age', 'joy'] 0


Expected format of the frequency table I am building below is a dictionary containing the word as key and the value is list of two members one representing the count of positives and the other representing the count of negative value

In [40]:
## Build freq table
def build_freq(tweets,labels):
    freq={}

    #iterat tweets
    for i in range(len(tweets)):
        #iterate on each word in each tweets
        for word in tweets[i]:
            key=word
            if key not in freq.keys(): #initialize this key + values
                if labels[i]==1:
                    freq[key]=[1,0]
                else:
                    freq[key]=[0,1]
            else: #increments the value already existing key
                if labels[i]==1:
                    freq[key][0]+=1
                else:
                    freq[key][1]+=1

    return freq

In [41]:
freq_table=build_freq(tweets,labels)
freq_table
sum_pos_freq=0
sum_neg_freq=0
for key in freq_table.keys():
    sum_pos_freq+=freq_table[key][0]
    sum_neg_freq+=freq_table[key][1]
print(sum_pos_freq,sum_neg_freq)

64409 31818


"V" is count of unique words 

In [42]:
V=len(freq_table)
print("V : ",V)

V :  11047


### Building probability dictionary

probability of current word in a class formula 
http://localhost:8888/view/NLP_Binary_Sentiment_Naive_Bayes_Classifier/log%20likelehood%20of%20a%20word.png

In [71]:
def build_propability(freq_table,sum_pos_freq,sum_neg_freq,V):
    prop_dict={}

    #key :word ,value:[p(w/pos),p(w/neg)]
    for key in freq_table.keys():
        prop_dict[key]=[((freq_table[key][0]+1)/(sum_pos_freq+V)),((freq_table[key][1]+1)/(sum_neg_freq+V))]

    return prop_dict
prop_dict=build_propability(freq_table,sum_pos_freq,sum_neg_freq,V)
prop_dict

{'wowwww': [3.9758269720101784e-05, 2.332905633967106e-05],
 'thank': [0.009793787107718405, 0.0024728799720051325],
 'lot': [0.001524067005937235, 0.0006532135775107897],
 'team': [0.0005698685326547922, 0.0003499358450950659],
 ':)': [0.04431721798134012, 2.332905633967106e-05],
 '❤💙💚💕❤💙💚💕': [6.62637828668363e-05, 2.332905633967106e-05],
 'final': [0.0008746819338422392, 0.0008398460282281582],
 'ep': [7.951653944020357e-05, 9.331622535868424e-05],
 'got': [0.0021734520780322306, 0.0022862475212877638],
 'go': [0.004930025445292621, 0.005225708620086317],
 ':(': [0.05075805767599661, 0.08932695672460049],
 'great': [0.002584287531806616, 0.0005365682958124344],
 'session': [0.00011927480916030534, 0.00013997433803802638],
 'hope': [0.0031806615776081423, 0.002332905633967106],
 ':-)': [0.008601039016115352, 2.332905633967106e-05],
 "what'": [0.0004373409669211196, 0.00037326490143473696],
 'snapchat': [0.0008084181509754028, 0.0009798203662661844],
 'actual': [0.0005433630195080577, 

log likelehood of a word formula 
http://localhost:8888/view/NLP_Binary_Sentiment_Naive_Bayes_Classifier/probability%20of%20current%20word%20in%20a%20class.png

In [72]:
import numpy as np
def Naive_Bayes_inference(tweets,prop_dict):
    results=[]

    for tweet in tweets:
        #for one word in a single tweet I add the log likelehopod of all words in a tweet to judge the whole tweet's sentiment
        result=0
        for word in tweet:
            try:
                result+=np.log(prop_dict[word][0]/prop_dict[word][1])
            #to handle zero division exception
            except:
                result+=0
        results.append(result)
    return results

In [74]:
## evaluate Model
y_pred=Naive_Bayes_inference(tweets,prop_dict)


In [75]:
#if the predicted y is larger then 0 , set to 1 which means positive sentiment and if less than 0 then set it to -1 which sets negatove sentiment
y_p=[1 if y>=0 else 0 for y in y_pred]


In [76]:
from sklearn.metrics import accuracy_score
print("Accuracy Score : ",accuracy_score(y_p,labels))

Accuracy Score :  0.8286028382970217


## Make Prediction 

In [77]:
tweets[0],labels[0]

(['im',
  'hope',
  'come',
  'back',
  'au',
  'afford',
  'go',
  'dont',
  'think',
  'ill',
  'abl'],
 0)

In [78]:
Naive_Bayes_inference([tweets[0]],prop_dict)

[-1.3958125287152436]

## Testing the model with five random tweets

In [79]:
test_tweets=["@metalgear_jp @Kojima_Hideo I want you're T-shirts ! They are so cool ! :D","Stats for the day have arrived. 2 new followers and NO unfollowers :) via http://t.co/xxlXs6xYwe.",
             "Dang that is some rad @AbzuGame #fanart! :D https://t.co/bI8k8tb9ht","Can u feel it? :((:( #exo http://t.co/ghsa262ORm","@seanactual You mean you're not offering? :("]


In [81]:
x = process_sentence(test_tweets)
y_not_cleaned = Naive_Bayes_inference(x,prop_dict)

y=[1 if y>=0 else 0 for y in y_not_cleaned]


## The classifier predicted the first three tweets are of a +Ve sentiment and the last two tweets are of a -ve sentiment

In [82]:
y

[1, 1, 1, 0, 0]