# Sentiment Analysis using Naive Bayes from Scratch 

Sentiment analysis is performed on the Twitter dataset `twitter_samples` that ships with NLTK.

In [2]:
from nltk.corpus import stopwords, twitter_samples
import numpy as np
import pandas as pd
import nltk
import string
from nltk.tokenize import TweetTokenizer

In [3]:
import re                                  # library for regular expression operations
import string                              # for string operations

from nltk.corpus import stopwords          # module for stop words that come with NLTK
from nltk.stem import PorterStemmer        # module for stemming
from nltk.tokenize import TweetTokenizer   # module for tokenizing strings

In [4]:
## process_tweet takes a single raw tweet and returns its list of cleaned, stemmed tokens

def process_tweet(tweet):
    """Clean one raw tweet and return its stemmed tokens.

    The text is stripped of stock tickers, a leading "RT" marker, hyperlinks
    and '#' signs, then tokenized with TweetTokenizer (case not preserved,
    handles stripped, reduce_len enabled). English stop words and punctuation
    tokens are dropped and every remaining token is Porter-stemmed.

    Input:
        tweet: a string containing the raw tweet text
    Output:
        a list of stemmed tokens, in their original order
    """
    stemmer = PorterStemmer()
    # A set gives constant-time membership tests; the list returned by NLTK
    # would otherwise be scanned once for every token.
    stopwords_english = set(stopwords.words('english'))

    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)

    # remove old style retweet text "RT" (only at the very start of the tweet)
    tweet = re.sub(r'^RT[\s]+', '', tweet)

    # remove hyperlinks; note the pattern consumes everything from the URL
    # to the end of that line, plus any line breaks that follow
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)

    # remove hashtags only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)

    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    # string.punctuation is a str, so the second test is a substring check:
    # single punctuation characters (and runs that appear verbatim in it)
    # are dropped, while emoticons such as ':)' are kept.
    return [
        stemmer.stem(word)
        for word in tweet_tokens
        if word not in stopwords_english and word not in string.punctuation
    ]

In [5]:
# load the labelled positive and negative tweets from the NLTK corpus
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

# splitting each class into two pieces: the first 4000 tweets for training,
# the remainder for testing (validation set)
train_pos, test_pos = all_positive_tweets[:4000], all_positive_tweets[4000:]
train_neg, test_neg = all_negative_tweets[:4000], all_negative_tweets[4000:]

# in both splits the positive tweets come first, followed by the negatives
train_x = train_pos + train_neg
test_x = test_pos + test_neg

## label arrays in the same order: 1 for each positive tweet, 0 for each negative one
train_y = np.concatenate((np.ones(len(train_pos)), np.zeros(len(train_neg))))
test_y = np.concatenate((np.ones(len(test_pos)), np.zeros(len(test_neg))))

In [8]:
# one line per split: shape of the training labels, then of the test labels
print(train_y.shape, test_y.shape, sep='\n')

(8000,)
(2000,)


## Implementing helper functions

To train naive bayes model, we need to compute a dictionary where the keys are a tuple (word, label) and 
the values are the corresponding frequency. The labels we'll use here are 1 for positive and 0 for negative.

Here we create a function count_tweets that takes a list of tweets and their labels as input, cleans each tweet, and returns a dictionary.

* The key in the dictionary is a tuple containing the stemmed word and its class label, e.g. ("happi",1).
* The value is the number of times this word appears with that label in the given collection of tweets (an integer).

In [9]:
def count_tweets(result, tweets, ys):
    '''
    Tally how often every (word, label) pair occurs across the tweets.

    Input:
        result: a dictionary that will be used to map each pair to its frequency
                (it is updated in place)
        tweets: a list of tweets
        ys: a list corresponding to the sentiment of each tweet (either 0 or 1)
    Output:
        result: the same dictionary, mapping each pair to its frequency
    '''
    for label, text in zip(ys, tweets):
        for token in process_tweet(text):
            # the key is the (word, label) tuple
            key = (token, label)

            # a key that is not present yet counts from 0, so its first
            # occurrence stores 1; otherwise the stored count grows by 1
            result[key] = result.get(key, 0) + 1

    return result

## Training model using Naive Bayes

Create freqs dictionary
* Given count_tweets() function, we compute a dictionary called freqs that contains all the frequencies.
* In this freqs dictionary, the key is the tuple (word, label)
* The value is the number of times it has appeared.

In [10]:
# start from an empty dictionary and count every (word, label) pair in the training split
freqs = count_tweets(result={}, tweets=train_x, ys=train_y)

In [11]:
## training naive bayes

def train_naive_bayes(freqs, train_x, train_y):
    '''
    Fit the Naive Bayes parameters from word counts and document labels.

    Input:
        freqs: dictionary from (word, label) to how often the word appears
        train_x: a list of tweets (not used by the computation; kept so
                 existing callers keep working)
        train_y: a list of labels corresponding to the tweets (0,1)
    Output:
        logprior: log(D_pos) - log(D_neg), where D_pos and D_neg are the
                  numbers of positive and negative documents
        loglikelihood: dictionary mapping each word to
                  log(P(word|pos)) - log(P(word|neg)), where
                  P(word|class) = (freq + 1) / (N_class + V)
                  (add-one smoothing)
    '''
    # V, the number of unique words in the vocabulary
    vocab = {pair[0] for pair in freqs}
    V = len(vocab)

    # N_pos and N_neg, the total number of positive and negative word
    # occurrences, accumulated in one pass over the counts
    N_pos = N_neg = 0
    for pair, count in freqs.items():
        # a label greater than zero is positive, anything else is negative
        if pair[1] > 0:
            N_pos += count
        else:
            N_neg += count

    # D, the number of documents
    D = len(train_y)

    # D_pos, the number of positive documents (labels are 0/1, so their sum)
    D_pos = np.sum(train_y)

    # D_neg, the number of negative documents
    D_neg = D - D_pos

    logprior = np.log(D_pos) - np.log(D_neg)

    loglikelihood = {}
    for word in vocab:
        # positive and negative frequency of the word (0 when unseen in a class)
        freq_pos = freqs.get((word, 1), 0)
        freq_neg = freqs.get((word, 0), 0)

        # smoothed probability of the word in each class; the +1 keeps the
        # logarithm finite for words seen in only one class
        p_w_pos = (freq_pos + 1) / (N_pos + V)
        p_w_neg = (freq_neg + 1) / (N_neg + V)

        loglikelihood[word] = np.log(p_w_pos) - np.log(p_w_neg)

    return logprior, loglikelihood


In [15]:
# fit the model on the training split, then show the log prior and the
# number of words that received a log likelihood
logprior, loglikelihood = train_naive_bayes(freqs, train_x, train_y)
print(logprior, len(loglikelihood), sep='\n')

0.0
9089


## Testing naive bayes

Implement the naive_bayes_predict function to make predictions on tweets.

* The function takes in the tweet, logprior, loglikelihood.
* It returns a log-odds score rather than a probability: a score above 0 indicates positive sentiment, otherwise negative.
* For each tweet, sum up loglikelihoods of each word in the tweet.
* Also add the logprior to this sum to get the predicted sentiment of that tweet.

In [12]:
def naive_bayes_predict(tweet, logprior, loglikelihood):
    '''
    Score a tweet with the trained Naive Bayes parameters.

    Input:
        tweet: a string
        logprior: a number
        loglikelihood: a dictionary of words mapping to numbers
    Output:
        p: logprior plus the sum of the loglikelihoods of each processed
           word of the tweet that is found in the dictionary (a number)
    '''
    # start the score from the log prior
    p = 0 + logprior

    # process the tweet to get a list of words
    for word in process_tweet(tweet):
        # a word missing from the dictionary contributes nothing to the score
        p += loglikelihood.get(word, 0)

    return p

### Implement test_naive_bayes to check the accuracy of your predictions.

* The function takes in your test_x, test_y, log_prior, and loglikelihood
* It returns the accuracy of your model.
* First, use the naive_bayes_predict function to make predictions for each tweet in test_x.
* Then compare the predictions with test_y to compute the accuracy.

In [13]:
"""
Input:
    test_x: A list of tweets
    test_y: the corresponding labels for the list of tweets
    logprior: the logprior
    loglikelihood: a dictionary with the loglikelihoods for each word
Output:
    accuracy: (# of tweets classified correctly)/(total # of tweets)
"""

def test_naive_bayes(test_x, test_y, logprior, loglikelihood):
    """
    Measure how often the classifier's prediction matches the true label.

    Input:
        test_x: A list of tweets
        test_y: the corresponding labels (0 or 1); a list or a NumPy array,
                flattened before use
        logprior: the logprior
        loglikelihood: a dictionary with the loglikelihoods for each word
    Output:
        accuracy: (# of tweets classified correctly)/(total # of tweets)
    """
    # the predicted class is 1 when the score is > 0, otherwise 0
    y_hats = np.array([
        1 if naive_bayes_predict(tweet, logprior, loglikelihood) > 0 else 0
        for tweet in test_x
    ])

    # Coerce the labels to a flat array: a plain list cannot be subtracted
    # from a list, and a column vector would broadcast against the
    # predictions instead of pairing with them element by element.
    labels = np.asarray(test_y).ravel()

    # error is the average of the absolute values of the differences between
    # the predictions and the labels (each difference is 0 or 1)
    error = np.mean(np.abs(y_hats - labels))

    # accuracy is 1 minus the error
    return 1 - error


In [16]:
## accuracy of the trained model on the held-out test split, to 4 decimals

print(
    "Naive Bayes accuracy = %0.4f"
    % test_naive_bayes(test_x, test_y, logprior, loglikelihood)
)

Naive Bayes accuracy = 0.9940


In [23]:
# score a handful of hand-written sentences and report the sign of each score
for tweet in [
    'I am happy',
    'I am bad',
    'this movie should have been great.',
    'What a sad day!',
    'great',
    'great great',
    'great great great',
    'great great great great',
]:
    p = naive_bayes_predict(tweet, logprior, loglikelihood)
    print(f'{tweet} -> {p:.2f}')
    # a score above 0 is reported as positive, everything else as negative
    print(
        'This sentence reflects positive sentiment'
        if p > 0
        else 'This sentence reflects negative sentiment'
    )

I am happy -> 2.15
This sentence reflects positive sentiment
I am bad -> -1.29
This sentence reflects negative sentiment
this movie should have been great. -> 2.14
This sentence reflects positive sentiment
What a sad day! -> -2.32
This sentence reflects negative sentiment
great -> 2.14
This sentence reflects positive sentiment
great great -> 4.28
This sentence reflects positive sentiment
great great great -> 6.41
This sentence reflects positive sentiment
great great great great -> 8.55
This sentence reflects positive sentiment


In [25]:
## testing with custom tweet

my_tweet = 'I am happy because I am learning :)'

# p is the raw score returned by naive_bayes_predict for this tweet;
# elsewhere in this file a score above 0 is treated as positive sentiment
p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
print(p)

9.574768961173337


In [None]:
## if the score p is greater than 0 the sentiment is positive, otherwise it is negative