In [53]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [54]:
TweetUrl='https://github.com/aasiaeet/cse5522data/raw/master/db3_final_clean.csv'
tweet_dataframe=pd.read_csv(TweetUrl)

## <font color = 'red'>Part1</font>


In [55]:
# wordDict maps words to id
# X is the document-word matrix holding the presence/absence of words in each tweet
wordDict = {}
idCounter = 0
for i in range(tweet_dataframe.shape[0]):
    allWords = tweet_dataframe.iloc[i,1].split(" ")
    for word in allWords:
        if word not in wordDict:
            wordDict[word] = idCounter
            idCounter += 1
X = np.zeros((tweet_dataframe.shape[0], idCounter),dtype='float')

In [56]:
for i in range(tweet_dataframe.shape[0]):
    allWords = tweet_dataframe.iloc[i,1].split(" ")
    for word in allWords:
        X[i, wordDict[word]]  = 1

In [57]:
y = np.array(tweet_dataframe.iloc[:,2])

In [58]:
#numNeg = np.where(y > 0)[0][0] - 1
numNeg = np.sum(y<=0) #More robust and, IMHO, easier to understand
numPos = len(y) - numNeg
probNeg = numNeg / (numNeg + numPos)
probPos = 1 - probNeg

In [59]:
# compute three distributions (four variables):
def compute_distros(x,y):
    # probWordGivenPositive: P(word|Sentiment = +ive)
    probWordGivenPositive=np.sum(x[y>=0,:],axis=0) #Sum each word (column) to count how many times each word shows up (in positive examples)
    probWordGivenPositive=probWordGivenPositive/np.sum(y>=0) #Divide by total number of (positive) examples to give distribution

    # probWordGivenNegative: P(word|Sentiment = -ive)
    probWordGivenNegative=np.sum(x[y<0,:],axis=0)
    probWordGivenNegative=probWordGivenNegative/np.sum(y<0)

    # priorPositive: P(Sentiment = +ive)
    priorPositive = np.sum(y>=0)/y.shape[0] #Number of positive examples vs. all examples
    # priorNegative: P(Sentiment = -ive)
    priorNegative = 1 - priorPositive
    # (note these last two form one distribution)

    return probWordGivenPositive, probWordGivenNegative, priorPositive, priorNegative


In [60]:
# compute the following:
# logProbWordPresentGivenPositive
# logProbWordAbsentGivenPositive
# logProbWordPresentGivenNegative
# logProbWordAbsentGivenNegative
# logPriorPositive
# logPriorNegative
def compute_logdistros(distros, min_prob):
    #Assume missing words are simply very rare
    #So, assign minimum probability to very small elements (e.g. 0 elements)
    distros=np.where(distros>=min_prob,distros,min_prob)
    #Also need to consider minimum probability for "not" distribution
    distros=np.where(distros<=(1-min_prob),distros,1-min_prob)

    #Note: Another option is to set the log for missing words to 0
    #      This is equivalent to simply ignoring the word (since logP==0 is the same as P==1)

    return np.log(distros), np.log(1-distros)



In [61]:
# classifyNB: 
#   words - vector of words of the tweet (binary vector)
#   logProbWordPresentGivenPositive - log P(x_j = 1|+)
#   logProbWordAbsentGivenPositive  - log P(x_j = 0|+)
#   logProbWordPresentGivenNegative - log P(x_j = 1|-)
#   logProbWordAbsentGivenNegative  - log P(x_j = 0|-)
#   logPriorPositive - log P(+)
#   logPriorNegative - log P(-)
#   returns (label of x according to the NB classification rule, confidence about the label)

# Note: you can also change the function definition if you wish to encapsulate all six log probs
# as one model; just make sure to follow through below

def classify_NB(words,logProbWordPresentGivenPositive, logProbWordAbsentGivenPositive, 
               logProbWordPresentGivenNegative, logProbWordAbsentGivenNegative, 
               logPriorPositive, logPriorNegative, ignore_absent):
    if ignore_absent:
        posSum = logPriorPositive + words.dot(logProbWordPresentGivenPositive)
        negSum = logPriorNegative + words.dot(logProbWordPresentGivenNegative)
    else:
        posSum = logPriorPositive + np.sum(logProbWordPresentGivenPositive[words == 1, ]) + np.sum(logProbWordAbsentGivenPositive[words == 0, ])
        negSum = logPriorNegative + np.sum(logProbWordPresentGivenNegative[words == 1, ]) + np.sum(logProbWordAbsentGivenNegative[words == 0, ])
    return 1 if posSum>negSum else -1

In [62]:
# testNB: Classify all xTest
#   xTest - test data features
#   yTest - true label of test data
#   logProbWordPresentGivenPositive - log P(x_j = 1|+)
#   logProbWordAbsentGivenPositive  - log P(x_j = 0|+)
#   logProbWordPresentGivenNegative - log P(x_j = 1|-)
#   logProbWordAbsentGivenNegative  - log P(x_j = 0|-)
#   logPriorPositive - log P(+)
#   logPriorNegative - log P(-)
#   returns Average test error
def test_NB(xTest, yTest, 
           logProbWordPresentGivenPositive, logProbWordAbsentGivenPositive, 
           logProbWordPresentGivenNegative, logProbWordAbsentGivenNegative, 
           logPriorPositive, logPriorNegative, ignore_absent):
    error = 0
    for i in range(xTest.shape[0]):
        prediction = classify_NB(xTest[i, ],logProbWordPresentGivenPositive,
                                logProbWordAbsentGivenPositive,logProbWordPresentGivenNegative, 
                                logProbWordAbsentGivenNegative,logPriorPositive, logPriorNegative, ignore_absent)
        if (prediction!=yTest[i]):
            error += 1
    return error/xTest.shape[0]

### Compute accuracy ignoring absent words and incorporating absent words

In [63]:
total_error_ignore_absent = []
total_error_incorporate_absent = []
seed_of_split= 42 #Grader could change this value for training/test set randomization
xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size = 0.2, random_state = seed_of_split)
probWordGivenPositive, probWordGivenNegative, priorPositive, priorNegative = compute_distros(xTrain,yTrain)
min_prob = 1/yTrain.shape[0] #Assume very rare words only appeared once
logProbWordPresentGivenPositive, logProbWordAbsentGivenPositive = compute_logdistros(probWordGivenPositive,min_prob)
logProbWordPresentGivenNegative, logProbWordAbsentGivenNegative = compute_logdistros(probWordGivenNegative,min_prob)
logPriorPositive, logPriorNegative = compute_logdistros(priorPositive,min_prob)
total_error_ignore_absent.append(test_NB(xTest, yTest, 
                                       logProbWordPresentGivenPositive, logProbWordAbsentGivenPositive, 
                                       logProbWordPresentGivenNegative, logProbWordAbsentGivenNegative, 
                                       logPriorPositive, logPriorNegative, ignore_absent = True))
total_error_incorporate_absent.append(test_NB(xTest, yTest, 
                                       logProbWordPresentGivenPositive, logProbWordAbsentGivenPositive, 
                                       logProbWordPresentGivenNegative, logProbWordAbsentGivenNegative, 
                                       logPriorPositive, logPriorNegative, ignore_absent = False))

print ("Average Accuracy when ignore absent words is : " + str(1-np.mean(total_error_ignore_absent)))
print ("Average Accuracy when incoporating absent words is : " + str(1-np.mean(total_error_incorporate_absent)))


Average Accuracy when ignore absent words is : 0.827027027027027
Average Accuracy when incoporating absent words is : 0.8162162162162162


## Report For Part1
    When ignoring absent words and only calculating values for words present in the tweet, the accuracy is higher than incorporting absent words in a tweet. 

## <font color = 'red'>Part 2 Sticky Terms</font>

In [64]:
# Funtion to calculate PMI according to top numbers of sticky terms
def calculate_PMI(tweet_dataframe, top_number):
    w1_and_w2 = {}
    words = {}
    PMI = {}

    for i in range(tweet_dataframe.shape[0]):
        allWords = tweet_dataframe.iloc[i,1].split(" ")
        for index in range(len(allWords)):
            word = allWords[index]
            if word in words:
                words[word] += 1
            else:
                words[word] = 1
            if index < len(allWords)-1:
                word_pre = allWords[index]
                word_post = allWords[index+1]
                if (word_pre,word_post) in w1_and_w2:
                    w1_and_w2[word_pre,word_post] +=1
                else:
                    w1_and_w2[word_pre,word_post] = 1
    w1_and_w2= dict(sorted(w1_and_w2.items(), key=lambda x: x[1], reverse=True))
    words = dict(sorted(words.items(), key=lambda x:x[1], reverse = True))
        
    for (word_pre, word_post) in w1_and_w2:
        probability_sticky  = w1_and_w2[word_pre, word_post] / sum(w1_and_w2.values())
        probability_word_pre = words[word_pre] / sum(words.values())
        probability_word_post = words[word_post] / sum(words.values())
        PMI[word_pre, word_post] = probability_sticky/ (probability_word_pre * probability_word_post)
    PMI = sorted(PMI.items(), key=lambda x:x[1], reverse = True)
    
    return dict(PMI[0:top_number])
    


In [65]:
# Function to calculate the accuracy of the Naive Bayes Model
def calculate_accuracy(tweet_dataframe,PMI):
    # wordDict maps words to id
    # X is the document-word matrix holding the presence/absence of words in each tweet
    wordDict = {}
    idCounter = 0
    for i in range(tweet_dataframe.shape[0]):
        allWords = tweet_dataframe.iloc[i,1].split(" ")
        for word in allWords:
            if word not in wordDict:
                wordDict[word] = idCounter
                idCounter += 1
        for index in range(len(allWords)-1):
            word_pre = allWords[index]
            word_post = allWords[index+1]
            if (word_pre,word_post) in PMI.keys():
                if (word_pre,word_post) not in wordDict:
                    wordDict[word_pre, word_post] = idCounter
                    idCounter += 1

    X = np.zeros((tweet_dataframe.shape[0], idCounter),dtype='float')
    for i in range(tweet_dataframe.shape[0]):
        allWords = tweet_dataframe.iloc[i,1].split(" ")
        for index in range(len(allWords)-1):
            word_pre = allWords[index]
            word_post = allWords[index+1]
            X[i, wordDict[word_pre]]  = 1
            if (word_pre,word_post) in PMI.keys():
                 X[i, wordDict[word_pre, word_post]]  = 1

    total_error_ignore_absent = []
    total_error_incorporate_absent = []
    seed_of_split = 42 # Grader can change this value for training/test set randomization
    xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size = 0.2,random_state = seed_of_split)
    probWordGivenPositive, probWordGivenNegative, priorPositive, priorNegative = compute_distros(xTrain,yTrain)
    min_prob = 1/yTrain.shape[0] #Assume very rare words only appeared once
    logProbWordPresentGivenPositive, logProbWordAbsentGivenPositive = compute_logdistros(probWordGivenPositive,min_prob)
    logProbWordPresentGivenNegative, logProbWordAbsentGivenNegative = compute_logdistros(probWordGivenNegative,min_prob)
    logPriorPositive, logPriorNegative = compute_logdistros(priorPositive,min_prob)
    total_error_ignore_absent.append(test_NB(xTest, yTest, 
                                           logProbWordPresentGivenPositive, logProbWordAbsentGivenPositive, 
                                           logProbWordPresentGivenNegative, logProbWordAbsentGivenNegative, 
                                           logPriorPositive, logPriorNegative, ignore_absent = True))
    total_error_incorporate_absent.append(test_NB(xTest, yTest, 
                                           logProbWordPresentGivenPositive, logProbWordAbsentGivenPositive, 
                                           logProbWordPresentGivenNegative, logProbWordAbsentGivenNegative, 
                                           logPriorPositive, logPriorNegative, ignore_absent = False))
    return [total_error_ignore_absent,total_error_incorporate_absent]

## Get top <font color ="red">100 </font>PMI of sticky terms and accuracy after adding them as additional features in Naive Bayes Model

In [66]:
top_number = 100
PMI = calculate_PMI(tweet_dataframe, top_number)
print ("Top 100 sticky terms: ")
for terms in PMI.keys():
    print (terms)
total_error_ignore_absent,total_error_incorporate_absent = calculate_accuracy(tweet_dataframe,PMI)
print ("Average Accuracy when ignore absent wors is : " + str(1-np.mean(total_error_ignore_absent)))
print ("Average Accuracy when incoporating absent wors is : " + str(1-np.mean(total_error_incorporate_absent)))


Top 100 sticky terms: 
('sb', 'xlv')
('mormon', 'heroin')
('board', 'retreat')
('listing', 'appt')
('gurrl', 'lemme')
('hats', 'coats')
('fuckkkk', 'thaaaa')
('\nboo', 'ed')
('pegged', 'wannanewcar')
('wannanewcar', 'sorryxterra')
('subra', 'inet')
('grrrrrrrrr', 'lazyass')
('buzzzzing', 'battleoffife')
('childrens', 'parties')
('parties', 'difficult')
('mankato', 'benefiting')
('benefiting', 'leep')
('comparison', 'goodbyebluesky')
('total', 'horseshit')
('electrical', 'appliances')
('easily', 'amused')
('amused', 'perhaps')
('perhaps', 'bite')
('siesta', 'key')
('key', 'headn')
('effing', 'snowflakes')
('fort', 'collins')
('conditioner', 'kicks')
('latest', 'piano')
('piano', 'composition')
('homie', 'tj')
('chip', 'pancakes')
('dai', 'laew')
('alyson', 'kenward')
('outt', 'goodthing')
('operation', 'ashy')
('ashy', 'larry')
('leaking', 'fuckyouweather')
('fuckyouweather', 'notchill')
('nerves', 'shannon')
('shannon', 'freeman')
('brightness', 'unreasonable')
('burberry', 'scarf')
('

## Get top <font color ="red">200 </font>PMI of sticky terms and accuracy after adding them as additional features in Naive Bayes Model

In [67]:
top_number = 200
PMI = calculate_PMI(tweet_dataframe, top_number)
print ("Top 200 sticky terms: ")
for terms in PMI.keys():
    print (terms)
total_error_ignore_absent,total_error_incorporate_absent = calculate_accuracy(tweet_dataframe,PMI)
print ("Average Accuracy when ignore absent wors is : " + str(1-np.mean(total_error_ignore_absent)))
print ("Average Accuracy when incoporating absent wors is : " + str(1-np.mean(total_error_incorporate_absent)))


Top 200 sticky terms: 
('sb', 'xlv')
('mormon', 'heroin')
('board', 'retreat')
('listing', 'appt')
('gurrl', 'lemme')
('hats', 'coats')
('fuckkkk', 'thaaaa')
('\nboo', 'ed')
('pegged', 'wannanewcar')
('wannanewcar', 'sorryxterra')
('subra', 'inet')
('grrrrrrrrr', 'lazyass')
('buzzzzing', 'battleoffife')
('childrens', 'parties')
('parties', 'difficult')
('mankato', 'benefiting')
('benefiting', 'leep')
('comparison', 'goodbyebluesky')
('total', 'horseshit')
('electrical', 'appliances')
('easily', 'amused')
('amused', 'perhaps')
('perhaps', 'bite')
('siesta', 'key')
('key', 'headn')
('effing', 'snowflakes')
('fort', 'collins')
('conditioner', 'kicks')
('latest', 'piano')
('piano', 'composition')
('homie', 'tj')
('chip', 'pancakes')
('dai', 'laew')
('alyson', 'kenward')
('outt', 'goodthing')
('operation', 'ashy')
('ashy', 'larry')
('leaking', 'fuckyouweather')
('fuckyouweather', 'notchill')
('nerves', 'shannon')
('shannon', 'freeman')
('brightness', 'unreasonable')
('burberry', 'scarf')
('

## Get top <font color ="red">500 </font>PMI of sticky terms and accuracy after adding them as additional features in Naive Bayes Model

In [68]:
top_number = 500
PMI = calculate_PMI(tweet_dataframe, top_number)
print ("Top 500 sticky terms: ")
for terms in PMI.keys():
    print (terms)
total_error_ignore_absent,total_error_incorporate_absent = calculate_accuracy(tweet_dataframe,PMI)
print ("Average Accuracy when ignore absent wors is : " + str(1-np.mean(total_error_ignore_absent)))
print ("Average Accuracy when incoporating absent wors is : " + str(1-np.mean(total_error_incorporate_absent)))


Top 500 sticky terms: 
('sb', 'xlv')
('mormon', 'heroin')
('board', 'retreat')
('listing', 'appt')
('gurrl', 'lemme')
('hats', 'coats')
('fuckkkk', 'thaaaa')
('\nboo', 'ed')
('pegged', 'wannanewcar')
('wannanewcar', 'sorryxterra')
('subra', 'inet')
('grrrrrrrrr', 'lazyass')
('buzzzzing', 'battleoffife')
('childrens', 'parties')
('parties', 'difficult')
('mankato', 'benefiting')
('benefiting', 'leep')
('comparison', 'goodbyebluesky')
('total', 'horseshit')
('electrical', 'appliances')
('easily', 'amused')
('amused', 'perhaps')
('perhaps', 'bite')
('siesta', 'key')
('key', 'headn')
('effing', 'snowflakes')
('fort', 'collins')
('conditioner', 'kicks')
('latest', 'piano')
('piano', 'composition')
('homie', 'tj')
('chip', 'pancakes')
('dai', 'laew')
('alyson', 'kenward')
('outt', 'goodthing')
('operation', 'ashy')
('ashy', 'larry')
('leaking', 'fuckyouweather')
('fuckyouweather', 'notchill')
('nerves', 'shannon')
('shannon', 'freeman')
('brightness', 'unreasonable')
('burberry', 'scarf')
('

Average Accuracy when ignore absent wors is : 0.8162162162162162
Average Accuracy when incoporating absent wors is : 0.8148648648648649


## Report For Part 2
    After adding sticky terms, the accuracy of Naive Bayes Model decreases. However, accuracy doesn't change although sticky terms added into model changed. It may caused by same training/test split randomization. Also, without regrading absent words, the accuracy is higher than incorporating absent words.