<a href="https://colab.research.google.com/github/KavehKadkhoda/Sentiment-Analysis/blob/main/7_NaiveBayes_for_sentiment_analysis_on_tweets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Train a naive bayes model on a sentiment analysis task
# Test using your model
# Compute ratios of positive words to negative words
# Do some error analysis
# Predict on your own tweet

# More info:
# https://docs.google.com/document/d/1s7mi6aRC2anRmvK5AbZQiQSUwDMRVVHdM0MLkPcFXrM/edit?usp=sharing

In [2]:
import pdb
from nltk.corpus import stopwords, twitter_samples
import numpy as np
import pandas as pd
import nltk
import string
from nltk.tokenize import TweetTokenizer
from os import getcwd

nltk.download('twitter_samples')
nltk.download('stopwords')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
import re
import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

from matplotlib.patches import Ellipse
import matplotlib.transforms as transforms

import numpy as np # Library for linear algebra and math utils

def process_tweet(tweet):
    '''
    Clean, tokenize and stem a single tweet.

    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of stemmed tokens, in their original order, with
            stopwords and punctuation tokens removed

    '''
    stemmer = PorterStemmer()
    # A set gives constant-time membership tests inside the token loop below;
    # the membership results are the same as with the original list.
    stopwords_english = set(stopwords.words('english'))

    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT" (only at the very start of the tweet)
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks (everything up to the next whitespace)
    tweet = re.sub(r'https?://[^\s\n\r]+', '', tweet)
    # only the hash sign is removed; the hashtag word itself is kept
    tweet = re.sub(r'#', '', tweet)

    # lower-case, drop @handles and tokenize
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    # NOTE: `word not in string.punctuation` is a substring test against the
    # punctuation string, so single punctuation characters are dropped while
    # multi-character tokens such as ':)' are kept.
    tweets_clean = [
        stemmer.stem(word)
        for word in tweet_tokens
        if word not in stopwords_english and word not in string.punctuation
    ]

    return tweets_clean

In [4]:
def lookup(freqs, word, label):
    '''
    Return how often a (word, label) pair was counted.

    Input:
        freqs: a dictionary with the frequency of each pair (or tuple)
        word: the word to look up
        label: the label corresponding to the word
    Output:
        n: the number of times the word with its corresponding label appears,
           or 0 when the pair is not present in freqs.
    '''
    # A missing pair simply means the word was never seen with that label.
    return freqs.get((word, label), 0)

In [5]:
# Load the positive and negative tweet collections
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

# The first 4000 tweets of each class are used for training,
# the remainder for testing (validation set)
train_pos, test_pos = all_positive_tweets[:4000], all_positive_tweets[4000:]
train_neg, test_neg = all_negative_tweets[:4000], all_negative_tweets[4000:]

# Positive tweets come first, followed by the negative ones
train_x = train_pos + train_neg
test_x = test_pos + test_neg

# Labels follow the same order: 1 for positive, 0 for negative.
# Lengths are taken from the slices to avoid assumptions about the corpus size.
train_y = np.concatenate((np.ones(len(train_pos)), np.zeros(len(train_neg))))
test_y = np.concatenate((np.ones(len(test_pos)), np.zeros(len(test_neg))))

In [6]:
# Part 1: Process the Data
# For any machine learning project, once you've gathered the data, the first step is to process it to make useful inputs to your model.

# Remove noise: You will first want to remove noise from your data -- that is, remove words that don't tell you much about the content. 
# These include all common words like 'I, you, are, is, etc...' that would not give us enough information on the sentiment.
# We'll also remove stock market tickers, retweet markers, hyperlinks and the '#' sign of hashtags (the hashtag word itself is kept),
# because they tell you little about the sentiment.
# You also want to remove all the punctuation from a tweet. 
# The reason for doing this is because we want to treat words with or without the punctuation as the same word, 
# instead of treating "happy", "happy?", "happy!", "happy," and "happy." as different words.
# Finally you want to use stemming to only keep track of one variation of each word. In other words, 
# we'll treat "motivation", "motivated", and "motivate" similarly by grouping them within the same stem of "motiv-".
# We have given you the function process_tweet that does this for you.

In [7]:
# Sample tweet containing a retweet marker, handles, hashtags and a link
custom_tweet = "RT @Twitter @chapagain Hello There! Have a great day. :) #good #morning http://chapagain.com.np"

# Show the tokens that survive the cleaning pipeline
print(process_tweet(tweet=custom_tweet))

['hello', 'great', 'day', ':)', 'good', 'morn']


In [8]:
# Part 1.1 Implementing your helper functions
# To help you train your naive bayes model, you will need to compute a dictionary 
# where the keys are a tuple (word, label) and the values are the corresponding frequency. 
# Note that the labels we'll use here are 1 for positive and 0 for negative.

# You will also implement a lookup helper function that takes in the freqs dictionary, 
# a word, and a label (1 or 0) and returns the number of times that word and label tuple appears in the collection of tweets.

# For example: given a list of tweets ["i am rather excited", "you are rather happy"] and the label 1, the function will return a dictionary that contains the following key-value pairs:

# { ("rather", 1): 2, ("happi", 1) : 1, ("excit", 1) : 1 }

# Notice how for each word in the given string, the same label 1 is assigned to each word.
# Notice how the words "i" and "am" are not saved, since they were removed by process_tweet because they are stopwords.
# Notice how the word "rather" appears twice in the list of tweets, and so its count value is 2.

# Create a function count_tweets that takes a list of tweets as input, cleans all of them, and returns a dictionary.

# The key in the dictionary is a tuple containing the stemmed word and its class label, e.g. ("happi",1).
# The value is the number of times this word appears in the given collection of tweets (an integer).

In [9]:
def count_tweets(result, tweets, ys):
    '''
    Accumulate (word, label) frequencies over a collection of tweets.

    Input:
        result: a dictionary that will be used to map each pair to its frequency;
            it is updated in place, so existing counts are added to
        tweets: a list of tweets
        ys: a list corresponding to the sentiment of each tweet (either 0 or 1)
    Output:
        result: the same dictionary, mapping each (word, label) pair to its frequency
    '''
    for label, text in zip(ys, tweets):
        for stem in process_tweet(text):
            key = (stem, label)
            # start from 0 for a pair seen for the first time
            result[key] = result.get(key, 0) + 1

    return result

In [10]:
# Testing your function

# Five tiny tweets: one positive, four negative ('i am tired' appears twice)
tweets = ['i am happy', 'i am tricked', 'i am sad', 'i am tired', 'i am tired']
ys = [1, 0, 0, 0, 0]

# count_tweets fills `result` in place and also returns it for display
result = {}
count_tweets(result, tweets, ys)

# Expected Output: {('happi', 1): 1, ('trick', 0): 1, ('sad', 0): 1, ('tire', 0): 2}

{('happi', 1): 1, ('sad', 0): 1, ('tire', 0): 2, ('trick', 0): 1}

In [11]:
# Part 2: Train your model using Naive Bayes. Naive Bayes is an algorithm that can be used for sentiment analysis.
# It takes a short time to train and also has a short prediction time.

# Count every (word, label) pair of the training set, starting from an empty dictionary
freqs = count_tweets(result={}, tweets=train_x, ys=train_y)

In [12]:
def train_naive_bayes(freqs, train_x, train_y):
    '''
    Estimate the Naive Bayes log prior and per-word log likelihood ratios.

    Input:
        freqs: dictionary from (word, label) to how often the word appears
        train_x: a list of tweets (not read by this function; the counts in
            freqs already summarise the tweets)
        train_y: a list of labels corresponding to the tweets (0,1)
    Output:
        logprior: log(D_pos) - log(D_neg), where D_pos / D_neg are the numbers
            of positive / negative documents in train_y
        loglikelihood: dictionary mapping each vocabulary word to
            log(P(word | pos) / P(word | neg)), with add-one (Laplace) smoothing
    '''
    # V: number of unique words in the vocabulary
    vocab = {word for word, _ in freqs}
    V = len(vocab)

    # N_pos / N_neg: total word counts per class; any label > 0 counts as positive
    N_pos = N_neg = 0
    for (_, label), count in freqs.items():
        if label > 0:
            N_pos += count
        else:
            N_neg += count

    # D, D_pos, D_neg: number of documents overall and per class
    D = len(train_y)
    D_pos = sum(1 for y in train_y if y > 0)
    D_neg = D - D_pos

    logprior = np.log(D_pos) - np.log(D_neg)

    def smoothed_log_ratio(word):
        # add-one smoothing keeps both probabilities strictly positive
        p_w_pos = (lookup(freqs, word, 1) + 1) / (N_pos + V)
        p_w_neg = (lookup(freqs, word, 0) + 1) / (N_neg + V)
        return np.log(p_w_pos / p_w_neg)

    loglikelihood = {word: smoothed_log_ratio(word) for word in vocab}

    return logprior, loglikelihood

In [13]:
logprior, loglikelihood = train_naive_bayes(freqs, train_x, train_y)
# Show the log prior and the vocabulary size, one per line
print(logprior, len(loglikelihood), sep='\n')

# Expected Output:
# 0.0
# 9165

0.0
9165


In [None]:
# Part 3: Test your naive bayes 
# Now that we have the logprior and loglikelihood, we can test the naive bayes function by making predictions on some tweets!

# Note
# Note we calculate the prior from the training data, and that the training data is evenly split between positive and negative labels
#  (4000 positive and 4000 negative tweets). This means that the ratio of positive to negative is 1, and the logprior is 0.

# The value of 0.0 means that when we add the logprior to the log likelihood, 
# we're just adding zero to the log likelihood. However, please remember to include the logprior, 
# because whenever the data is not perfectly balanced, the logprior will be a non-zero value.

In [14]:
# naive_bayes_predict

def naive_bayes_predict(tweet, logprior, loglikelihood):
    '''
    Score a tweet with the trained Naive Bayes model.

    Input:
        tweet: a string
        logprior: a number
        loglikelihood: a dictionary of words mapping to numbers
    Output:
        p: logprior plus the sum of the log likelihoods of every processed word
           of the tweet that is present in the dictionary; words that are not
           in the dictionary contribute nothing

    '''
    # keep only the contributions of words the model knows about
    known_scores = [
        loglikelihood[token]
        for token in process_tweet(tweet)
        if token in loglikelihood
    ]

    # start from the prior, then accumulate the word contributions in order
    score = 0 + logprior
    for contribution in known_scores:
        score += contribution

    return score

In [15]:
# Experiment with your own tweet.
my_tweet = 'She smiled.'
# A positive score means the model considers the tweet positive
p = naive_bayes_predict(tweet=my_tweet, logprior=logprior, loglikelihood=loglikelihood)
print('The expected output is', p)

# Expected Output:
# The expected output is around 1.55
# The sentiment is positive.

The expected output is 1.5577981920239676


In [None]:
# Implement test_naive_bayes

# Instructions:
# Implement test_naive_bayes to check the accuracy of your predictions.
# The function takes in your test_x, test_y, log_prior, and loglikelihood
# It returns the accuracy of your model.
# First, use naive_bayes_predict function to make predictions for each tweet in test_x.

In [16]:
# test_naive_bayes

def test_naive_bayes(test_x, test_y, logprior, loglikelihood, naive_bayes_predict=naive_bayes_predict):
    """
    Measure the accuracy of the Naive Bayes classifier on labelled tweets.

    Input:
        test_x: A list of tweets
        test_y: the corresponding labels for the list of tweets (0 or 1); may be
            a plain list or a numpy array, including a column vector of shape (m, 1)
        logprior: the logprior
        loglikelihood: a dictionary with the loglikelihoods for each word
        naive_bayes_predict: the scoring function, called as
            naive_bayes_predict(tweet, logprior, loglikelihood)
    Output:
        accuracy: (# of tweets classified correctly)/(total # of tweets)
    """
    # a strictly positive score is classified as 1, anything else as 0
    y_hats = np.array([
        1 if naive_bayes_predict(tweet, logprior, loglikelihood) > 0 else 0
        for tweet in test_x
    ])

    # Coerce the labels to a flat array: subtracting from a plain list would
    # raise TypeError, and an (m, 1) column would broadcast to an (m, m) matrix.
    labels = np.asarray(test_y).ravel()

    # error is the average of the absolute values of the differences between
    # the predictions and the labels
    error = np.mean(np.absolute(y_hats - labels))

    # Accuracy is 1 minus the error
    return 1 - error

In [17]:
# Evaluate on the held-out tweets, then report with four decimals
nb_accuracy = test_naive_bayes(test_x, test_y, logprior, loglikelihood)
print("Naive Bayes accuracy = %0.4f" % nb_accuracy)

# Expected Accuracy:

# Naive Bayes accuracy = 0.9955

Naive Bayes accuracy = 0.9955


In [18]:
# Run this cell to test your function
sample_tweets = [
    'I am happy',
    'I am bad',
    'this movie should have been great.',
    'great',
    'great great',
    'great great great',
    'great great great great',
]
# Each repetition of a word adds its log likelihood to the score again
for tweet in sample_tweets:
    p = naive_bayes_predict(tweet, logprior, loglikelihood)
    print(f'{tweet} -> {p:.2f}')

# Expected Output:
# I am happy -> 2.14
# I am bad -> -1.31
# this movie should have been great. -> 2.12
# great -> 2.13
# great great -> 4.26
# great great great -> 6.39
# great great great great -> 8.52

I am happy -> 2.14
I am bad -> -1.31
this movie should have been great. -> 2.12
great -> 2.13
great great -> 4.26
great great great -> 6.39
great great great great -> 8.52


In [19]:
# Feel free to check the sentiment of your own tweet below
my_tweet = 'Stupidity is knowing the truth, seeing the truth but still believing the lies. And that is more infectious than any other disease.'
# The score is the cell's last expression, so the notebook displays it
naive_bayes_predict(tweet=my_tweet, logprior=logprior, loglikelihood=loglikelihood)

-3.2399026615032165

In [None]:
# Part 4: Filter words by Ratio of positive to negative counts
# Some words have more positive counts than others, and can be considered "more positive". 
# Likewise, some words can be considered more negative than others.
# One way for us to define the level of positiveness or negativeness, 
# without calculating the log likelihood, is to compare the positive to negative frequency of the word.
# Note that we can also use the log likelihood calculations to compare relative positivity or negativity of words.
# We can calculate the ratio of positive to negative frequencies of a word.
# Once we're able to calculate these ratios, we can also filter a subset of words that have a minimum ratio of positivity / negativity or higher.
# Similarly, we can also filter a subset of words 
# that have a maximum ratio of positivity / negativity or lower (words that are at least as negative, or even more negative than a given threshold).

In [20]:
# Implement get_ratio()
# Given the freqs dictionary of words and a particular word, use lookup(freqs,word,1) to get the positive count of the word.
# Similarly, use the lookup() function to get the negative count of that word.
# Calculate the ratio of positive divided by negative counts

# get_ratio

def get_ratio(freqs, word):
    '''
    Compute the smoothed positive-to-negative count ratio of a word.

    Input:
        freqs: dictionary containing the words, keyed by (word, label)
        word: the word whose counts are looked up

    Output: a dictionary with keys 'positive', 'negative', and 'ratio',
        where ratio = (positive + 1) / (negative + 1).
        Example: {'positive': 10, 'negative': 20, 'ratio': 0.524}
    '''
    # label 1 holds the positive counts, label 0 the negative counts
    positive = lookup(freqs, word, 1)
    negative = lookup(freqs, word, 0)

    # adding 1 to both counts avoids a division by zero for words never seen as negative
    return {
        'positive': positive,
        'negative': negative,
        'ratio': (positive + 1) / (negative + 1),
    }

In [None]:
# Implement get_words_by_threshold(freqs,label,threshold)
# If we set the label to 1, then we'll look for all words whose ratio of positive/negative counts is at least as high as the threshold.
# If we set the label to 0, then we'll look for all words whose ratio of positive/negative counts is at most as low as the threshold.
# Use the get_ratio function to get a dictionary containing the positive count, negative count, and the ratio of positive to negative counts.
# Append the get_ratio dictionary inside another dictionary, 
# where the key is the word, and the value is the dictionary pos_neg_ratio that is returned by the get_ratio function. 
# An example key-value pair would have this structure:
# {'happi':{'positive': 10, 'negative': 20, 'ratio': 0.524}}

In [21]:
# get_words_by_threshold

def get_words_by_threshold(freqs, label, threshold, get_ratio=get_ratio):
    '''
    Select the words whose positive/negative ratio passes a threshold.

    Input:
        freqs: dictionary of words, keyed by (word, label)
        label: 1 to keep words with ratio >= threshold,
               0 to keep words with ratio <= threshold
        threshold: ratio that will be used as the cutoff for including a word in the returned dictionary
        get_ratio: function called as get_ratio(freqs, word) that returns a
            dictionary containing at least the key 'ratio'
    Output:
        word_list: dictionary (despite its name) containing the word and information on its
        positive count, negative count, and ratio of positive to negative counts.
        example of a key value pair:
        {'happi':
            {'positive': 10, 'negative': 20, 'ratio': 0.524}
        }
    '''
    word_list = {}

    for word, _ in freqs.keys():
        # counts and ratio for this word
        stats = get_ratio(freqs, word)

        # positive filter: ratio at or above the threshold
        passes_positive = label == 1 and stats['ratio'] >= threshold

        # negative filter: ratio at or below the threshold
        if passes_positive or (label == 0 and stats['ratio'] <= threshold):
            word_list[word] = stats

        # any other word (or any other label value) is left out

    return word_list


In [None]:
# Part 5: Error Analysis
# In this part you will see some tweets that your model misclassified. 
# Why do you think the misclassifications happened? Were there any assumptions made by your naive bayes model?

In [22]:
# Some error analysis done for you
print('Truth Predicted Tweet')
for x, y in zip(test_x, test_y):
    y_hat = naive_bayes_predict(x, logprior, loglikelihood)
    # same decision rule as the accuracy test: a positive score means class 1
    predicted_positive = np.sign(y_hat) > 0
    if y != predicted_positive:
        # print the processed tokens, dropping non-ASCII characters
        cleaned = ' '.join(process_tweet(x)).encode('ascii', 'ignore')
        print('%d\t%0.2f\t%s' % (y, predicted_positive, cleaned))

Truth Predicted Tweet
1	0.00	b'truli later move know queen bee upward bound movingonup'
1	0.00	b'new report talk burn calori cold work harder warm feel better weather :p'
1	0.00	b'harri niall 94 harri born ik stupid wanna chang :D'
1	0.00	b'park get sunlight'
1	0.00	b'uff itna miss karhi thi ap :p'
0	1.00	b'hello info possibl interest jonatha close join beti :( great'
0	1.00	b'u prob fun david'
0	1.00	b'pat jay'
0	1.00	b'sr financi analyst expedia inc bellevu wa financ expediajob job job hire'


In [None]:
# Part 6: Predict with your own tweet
# In this part you can predict the sentiment of your own tweet.

In [23]:
# Test with your own tweet - feel free to modify `my_tweet`
my_tweet = 'If your theory doesnt agree with experiment, its wrong. It doesnt matter how beautiful it is. Brain'

# A positive score means the model predicts positive sentiment
p = naive_bayes_predict(tweet=my_tweet, logprior=logprior, loglikelihood=loglikelihood)
print(p)

5.048583353113135
