# Sentiment Analysis of tweets with Naive Bayes

## Imports and loading the dataset:

In [1]:
import numpy as np
import nltk
from nltk.corpus import twitter_samples, stopwords
from nltk.tokenize import TweetTokenizer

import string
import pandas as pd

In [2]:
# Download these if you haven't already
# nltk.download('stopwords')
# nltk.download('twitter_samples')

In [3]:
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

train_pos = positive_tweets[:4000]
test_pos  = positive_tweets[4000:]
train_neg = negative_tweets[:4000]
test_neg  = negative_tweets[4000:]

X_train   = train_pos + train_neg
y_train = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0)

X_test    = test_pos + test_neg
y_test = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)

In [4]:
print(y_train.shape)
print(y_test.shape)

(8000, 1)
(2000, 1)


## Preprocessing function

In [5]:
import re            # Regular Expressions
import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
english_stopwords = stopwords.words('english')
stemmer = PorterStemmer()

def preprocess_tweet(tweet):
    
    '''
        Input: Tweet, a string
        Output: A processed, tokenized (array) of words, after removing stopwords and stemming
    '''
    
    # Re operations
    tweet = re.sub(r'^RT[\s]+', '', tweet)               # Remove retweet text "RT"
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)   # Remove https://, website links
    tweet = re.sub(r'#', '', tweet)                      # Remove hashtags
    
    # Tokenize
    tokenizer = TweetTokenizer(preserve_case = False, reduce_len = True, strip_handles = True)
    tweet_tokens = tokenizer.tokenize(tweet)

    # Remove Stopwords
    cleaned_tweet = []
    for token in tweet_tokens:
        if token in english_stopwords or token in string.punctuation:
            continue
        else:
            cleaned_tweet.append(token)
        
    # Stemming
    stemmed_tweet = []

    for token in cleaned_tweet:
        stemmed_tweet.append(stemmer.stem(token))
        
    return stemmed_tweet

In [6]:
custom_tweet = "RT @Twitter @chapagain Hello There! Have a great day. :) #good #morning http://chapagain.com.np"

# print cleaned tweet
print(preprocess_tweet(custom_tweet))

['hello', 'great', 'day', ':)', 'good', 'morn']


## Build Class frequencies for each word

In [7]:
def build_frequencies(tweets, y_list):
    
    '''
        Inputs: 
        Tweets: The entire list of tweets, each of which is a string
        y_list: Labels for tweets
        
        Output:
        The word frequency dictionary
    
    '''

    # y: Sentiment label of each tweet (Actual labels)
    y_list = np.squeeze(y_list).tolist()        # Squeezing to a list so that we can zip it.
    
    word_freqs = {}
    for y, tweet in zip(y_list, tweets):
        preprocessed_tweet = preprocess_tweet(tweet)
        for word in preprocessed_tweet:
            pair = (word, y)
            if(word_freqs.get(pair) is None):
                word_freqs[pair] = 1
            else:
                word_freqs[pair] += 1
                
    return word_freqs

In [8]:
# Testing your function
tweets = ['i am happy', 'i am tricked', 'i am sad', 'i am tired', 'i am tired']
ys = [1, 0, 0, 0, 0]
result = build_frequencies(tweets, ys)
print(result)

{('happi', 1): 1, ('trick', 0): 1, ('sad', 0): 1, ('tire', 0): 2}


# Training Naive Bayes

Naive Bayes may be used for sentiment analysis, one of the benefits is it's short training and prediction times

### Steps:
1. Identify the number of classes. Here we have 2: Pos and Neg

2. Create a probability for each class. 
                                        Prob(D_pos) = D_pos/D;
                                        Prob(D_neg) = D_neg/D;
   D_pos: total number of +ve tweets, D_neg: total number of -ve tweets, D: Total tweets = len(y_train)

3. Calculate logprior, especially useful when the dataset is unbalanced: np.log(D_pos) - np.log(D_neg)

4. - $freq_{pos}$ and $freq_{neg}$ are the frequencies of that specific word in the positive or negative class. In other words, the positive frequency of a word is the number of times the word is counted with the label of 1.
   - $N_{pos}$ and $N_{neg}$ are the total number of positive and negative words for all documents (for all tweets), respectively.
   - $V$ is the number of unique words in the entire set of documents, for all classes, whether positive or negative.
   
   $$ P(W_{pos}) = \frac{freq_{pos} + 1}{N_{pos} + V}\tag{4} $$
   $$ P(W_{neg}) = \frac{freq_{neg} + 1}{N_{neg} + V}\tag{5} $$
   
5. Calculate loglikelihood for the WORD: 
   $$\text{loglikelihood} = \log \left(\frac{P(W_{pos})}{P(W_{neg})} \right)\tag{6}$$

First, we create the frequencies dictionary

In [9]:
freqs = build_frequencies(X_train, y_train)

In [10]:
def train_naive_bayes(X_train, y_train, freqs):
    
    '''
        Input: X_train and y_train, the tweets (all strings) along with the real classification
        freqs: The dict created by running build_frequencies on X_train
        
        Output:
        Returns logprior and loglikelihood. This is the "model", using this we can predict the classification of any tweet, 
        or on the test set
    
    '''
    
    # Define our return values
    loglikelihood = {}   # Dictionary storing loglikelihood of EACH WORD
    logprior      = 0
    
    # V is the length of unique words found
    vocab = set(pair[0] for pair in freqs)
    V = len(vocab)
    
    N_pos = 0
    N_neg = 0
    
    for pair in freqs.keys(): # A pair is (word, 1) or (word, 0) and its value is the number of times it appears in the data
        if(pair[1] == 1):
            N_pos += freqs.get(pair)
        else:
            N_neg += freqs.get(pair)
            
    # Documents (tweets)
    D = len(y_train)
    D_pos = len(list(filter(lambda x: x == 1, y_train)))
    D_neg = len(list(filter(lambda x: x == 0, y_train)))   
    
    logprior = np.log(D_pos/D_neg)
    
    # Filling in loglikelihood for every word in the vocabulary
    for word in vocab:
        # Get the positive and negative frequency of the word
        freq_pos = 0 if (freqs.get((word, 1)) is None) else freqs.get((word, 1))
        freq_neg = 0 if (freqs.get((word, 0)) is None) else freqs.get((word, 0))

        # Calculate the probability that each word is positive, and negative
        probability_w_pos = (freq_pos + 1)/(N_pos + V)
        probability_w_neg = (freq_neg + 1)/(N_neg + V)

        # Calculate the log likelihood of the word and store it in the loglikelihood dictionary
        loglikelihood[word] = np.log(probability_w_pos/probability_w_neg)
    
    return logprior, loglikelihood

In [11]:
logprior, loglikelihood = train_naive_bayes(X_train, y_train, freqs)
print(logprior) # Will be 0 here since D_pos = D_neg
print(len(loglikelihood))

0.0
9083


# Testing Naive Bayes

Given a tweet; logprior; and loglikelihood for words in the vocabulary, calculate the prediction using the formula: (Note: both logprior and loglikelihood are the results from training Naive Bayes on the training set

$$ p = logprior + \sum_i^N (loglikelihood_i)$$

### On a single tweet:

In [12]:
def predict_tweet_naive_bayes(tweet, logprior, loglikelihood):
    
    '''
        Given the model (logprior, loglikelihood) and a single tweet (string), return the prediction value p.
        Where, p > 0 implies positive sentiment and vice versa.
        No need to preprocess, provide raw tweet, we call preprocess_tweet() from here
    
    '''
    sum_loglikelihood = 0
    
    processed_tweet = preprocess_tweet(tweet)
    
    for word in processed_tweet:
        if(word in loglikelihood.keys()):
            sum_loglikelihood += loglikelihood[word]
    
    p = logprior + sum_loglikelihood
    
    return p

In [13]:
my_tweet = 'I am happy because I am learning :)'
p = predict_tweet_naive_bayes(my_tweet, logprior, loglikelihood)
print('The expected output is', p)

The expected output is 9.574522506155166


### On the Test Dataset

In [14]:
def test_set_naive_bayes(X_test, y_test, logprior, loglikelihood):
        
    '''
        Return test set accuracy given the model trained (logprior, loglikelihood)
    '''    
    
    y_hat = []
    y_test = np.squeeze(y_test).tolist()
    
    for tweet in X_test:
        p_tweet = predict_tweet_naive_bayes(tweet, logprior, loglikelihood)
        
        if(p_tweet > 0):
            y_hat.append(1)
        else:
            y_hat.append(0)

    error = (np.int8(y_hat != y_test)).sum()/len(y_test)

    accuracy = 1 - error

    return accuracy

In [16]:
print("Naive Bayes accuracy on the test set = %0.4f" %(test_set_naive_bayes(X_test, y_test, logprior, loglikelihood)))

Naive Bayes accuracy on the test set = 0.9995
