In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
import string
import nltk
import pdb

from nltk.corpus import stopwords, twitter_samples
from nltk.tokenize import TweetTokenizer
from os import getcwd

In [4]:
filepath = f"{getcwd()}/Datasets/"
nltk.data.path.append(filepath)

In [5]:
# get the sets of positive and negative tweets
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

# split the data into two pieces, one for training and one for testing (validation set)
test_pos = all_positive_tweets[4000:]
train_pos = all_positive_tweets[:4000]
test_neg = all_negative_tweets[4000:]
train_neg = all_negative_tweets[:4000]

train_x = train_pos + train_neg
test_x = test_pos + test_neg

# avoid assumptions about the length of all_positive_tweets
train_y = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))
test_y = np.append(np.ones(len(test_pos)), np.zeros(len(test_neg)))

In [6]:
from utils import process_tweet

In [7]:
custom_tweet = "RT @Twitter @chapagain Hello There! Have a great day. :) #good #morning http://chapagain.com.np"

# print cleaned tweet
print(process_tweet(custom_tweet))

['hello', 'great', 'day', ':)', 'good', 'morn']


In [8]:
def count_tweets(result, tweets, ys):
    '''
    Input:
        result: a dictionary that will be used to map each pair to its frequency
        tweets: a list of tweets
        ys: a list corresponding to the sentiment of each tweet (either 0 or 1)
    Output:
        result: a dictionary mapping each pair to its frequency
    '''
    
    for y, tweet in zip(ys, tweets):
        for word in process_tweet(tweet):
            # define the key, which is the word and label tuple
            pair = (word, y)

            # if the key exists in the dictionary, increment the count
            if pair in result:
                result[pair] += 1

            # else, if the key is new, add it to the dictionary and set the count to 1
            else:
                result[pair] = 1
    return result

In [9]:
result = {}
tweets = ['i am happy', 'i am tricked', 'i am sad', 'i am tired', 'i am tired']
ys = [1, 0, 0, 0, 0]
count_tweets(result, tweets, ys)

{('happi', 1): 1, ('trick', 0): 1, ('sad', 0): 1, ('tire', 0): 2}

In [10]:
freqs = count_tweets({}, train_x, train_y)

In [11]:
def lookup(freqs, word, label):
    n = 0
    pair = (word, label)
    if (pair in freqs):
        n = freqs[pair]
    return n

In [12]:
def train_naive_bayes(freqs, train_x, train_y):
    """
    Input: freqs, train_x, train_y
    Output: logprior, loglikelihood
    """
    loglikelihood = {}
    logprior = 0
    
    vocab = set([pair[0] for pair in freqs.keys()])
    V = len(vocab)
    
    N_pos = N_neg = 0
    for pair in freqs.keys():
        if pair[1] > 0:
            N_pos += freqs[pair]
        else:
            N_neg += freqs[pair]
            
    # Calculate D, the number documents
    D = len(train_y)
    
    # Calculate D_pos, the number of Positive Documents
    D_pos = sum(train_y == 1)
    
    # Calculate D_Neg
    D_neg = D - D_pos
    
    # Calculate LogPrior
    logprior = np.log(D_pos) - np.log(D_neg)
    
    for word in vocab:
        freq_pos = lookup(freqs, word, 1)
        freq_neg = lookup(freqs, word, 0)
        
        p_w_pos = (freq_pos + 1)/(D_pos + V)
        p_w_neg = (freq_neg + 1)/(D_neg + V)
        
        loglikelihood[word] = np.log(p_w_pos/p_w_neg)
    
    return logprior, loglikelihood

In [14]:
logprior, loglikelihood = train_naive_bayes(freqs, train_x, train_y)
print(logprior)
print(len(loglikelihood))

0.0
9085


In [15]:
def naive_bayes_predict(tweet, logprior, loglikelihood):
    word_l = process_tweet(tweet)
    
    p = 0
    
    p += logprior
    
    for word in word_l:
        if word in loglikelihood:
            p += loglikelihood[word]
            
    return p

In [16]:
my_tweet = 'She smiled.'
p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
print(f"The Expected Output is: {p}")

The Expected Output is: 1.5686159179138452


In [18]:
def test_naive_bayes(test_x, test_y, logprior, loglikelihood):
    accuracy = 0
    
    y_hat = []
    
    for tweet in test_x:
        if naive_bayes_predict(tweet, logprior, loglikelihood) > 0:
            y_hat_i = 1
        else:
            y_hat_i = 0
    
    error = (1/len(test_y)) * np.sum(np.abs(np.subtract(y_hat, test_y, dtype=np.float)))
    
    accuracy = 1 - error
    
    return f"{accuracy*100}%"

In [19]:
for tweet in ['I am happy', 'I am bad', 'this movie should have been great.', 'great', 'great great', 'great great great', 'great great great great']:
    # print( '%s -> %f' % (tweet, naive_bayes_predict(tweet, logprior, loglikelihood)))
    p = naive_bayes_predict(tweet, logprior, loglikelihood)
#     print(f'{tweet} -> {p:.2f} ({p_category})')
    print(f'{tweet} -> {p:.2f}')

I am happy -> 2.14
I am bad -> -1.30
this movie should have been great. -> 2.13
great -> 2.13
great great -> 4.27
great great great -> 6.40
great great great great -> 8.53
