# NaiveBayes_Classifier Sentiment Analysis

In [1]:
from __future__ import division
from codecs import open
import numpy as np
import pandas as pd
from collections import Iterable
import math

def read_documents(doc_file):
    docs = []
    labels = []
    with open(doc_file, encoding='utf-8') as f:
        for line in f:
            words = line.strip().split()
            docs.append(words[3:])
            labels.append(words[1])
    return docs, labels

  """


In [2]:
all_docs, all_labels = read_documents('all_sentiment_shuffled.txt')

split_point = int(0.80*len(all_docs))
train_docs = all_docs[:split_point]
train_labels = all_labels[:split_point]  
val_docs = all_docs[split_point:]
val_labels = all_labels[split_point:]

In [3]:
vocabulary = set()               #Create the vocabulary of all unique words
for review in train_docs:
    for word in review:
        vocabulary.add(word)

In [4]:
def train_nb(train_docs, train_labels, vocabulary, alpha = 1): # If non given, alpha is 1. Alpha is the smoothing parameter we are going to use 
    
    word_appear_in_pos_neg_review = {word: [0]*2 for word in vocabulary} #every word has two indices, one for negative and one for positve appearances 

    # we will use index 0 for negative and 1 for positive reviews 
    S_Neg_Pos = [0,0]            # List with the Sums of all words contained in negative and positive labeled reviews
    
    for i in range(len(train_docs)):
        if train_labels[i] == 'neg':
            for word in train_docs[i]:
                word_appear_in_pos_neg_review[word][0] += 1
                S_Neg_Pos[0] += 1
        else:
            for word in train_docs[i]:
                word_appear_in_pos_neg_review[word][1] += 1
                S_Neg_Pos[1] += 1
                
    likelihood_dict = {word: [0]*2 for word in vocabulary}   # Dictionary for the likelihood of words being in positive or negative reviews 
     
    for word in vocabulary:
        likelihood_dict[word][0] = (word_appear_in_pos_neg_review[word][0] + alpha)/ (S_Neg_Pos[0] + alpha*len(vocabulary))
        likelihood_dict[word][1] = (word_appear_in_pos_neg_review[word][1] + alpha)/ (S_Neg_Pos[1] + alpha*len(vocabulary))
        
    total_neg_reviews = 0    #Sum of all negative reviews
    total_pos_reviews = 0    #Sum of all positive reviews

    for i in range(len(train_labels)):
        if train_labels[i] == 'neg':
            total_neg_reviews += 1
        else:
            total_pos_reviews += 1      
        
    prior_probs = [0,0]                           # We estimate the prior probabilities of negative and positive reviews
    prior_probs[0] = total_neg_reviews/len(train_docs)
    prior_probs[1] = total_pos_reviews/len(train_docs)     
    
    return likelihood_dict, prior_probs, S_Neg_Pos
    
    

In [5]:
likelihood_di, prior_probab, S_Neg_Pos = train_nb(train_docs, train_labels, vocabulary)

In [6]:
def score_doc_label(document, likelihood_dict, prior_probab, vocabulary):
    score_prob = [0,0]
    score_prob[0] = prior_probab[0]
    score_prob[1] = prior_probab[1]
    for word in document:
        if word in vocabulary:
            if (score_prob[0]>= 10**(-305)) and (score_prob[1]>= 10**(-305)):  # We put that limitation here, since python 
                score_prob[0] *= likelihood_dict[word][0]                      # cannot handle very very small numbers and converts them  
                score_prob[1] *= likelihood_dict[word][1]                      # into minus infinity instead.
            
        # We chose to ignore the words that are not included in the vocabulary, since that gave us better accuracy results
        # Otherwise, we would include them with the following way
        
        #else:
            #score_prob[0] *= alpha /(S_Neg_Pos[0]+alpha*len(vocabulary))    # We would have to include alpha in the parameters of   
            #score_prob[1] *= alpha /(S_Neg_Pos[1]+alpha*len(vocabulary))    # the function if we run this else condition
        # print (score_prob)
         
    score_prob = np.log(score_prob)    #Converting the probability into log probability
    return score_prob  

In [7]:
doc = ['a', 'top-quality', 'performance']   # We can put any review to test our classifier

In [8]:
score = score_doc_label(doc, likelihood_di, prior_probab, vocabulary)  # Run this to compute the score that we need for the 
                                                                       # classification below 

In [9]:
print(score) # We get the log probability array 

[-13.48689174 -12.80785836]


In [10]:
new_score = np.exp(score)   # Run this if you want to convert the log probability that appeared above to a normal probability array
new_score

array([1.38904828e-06, 2.73916245e-06])

In [11]:
def classify_nb(document, score):
    if score[0] < score[1]:
        guess = "pos"
    else:
        guess = "neg"
    #else:
        #guess = "Cannot decide: Equally likely to be positive or negative review"    # In the extreme case that positive and negative have the exact same score 
        
    return guess

In [12]:
guess = classify_nb(doc, score)
guess

'pos'

In [13]:
#Evaluating the classifier

def classify_documents(docs, vocabulary, likelihood_di, prior_probab):  
    
    label_list = []
    score_sentence = []
    
    for doc in docs:
        score = score_doc_label(doc, likelihood_di, prior_probab, vocabulary)
        # print(score)
        guess = classify_nb(doc, score)
        label_list.append(guess)
        score_sentence.append(score)
    
    return label_list, score_sentence

In [14]:
guess_labels, score_sentence = classify_documents(val_docs, vocabulary, likelihood_di, prior_probab)

In [15]:
def accuracy(true_labels, guessed_labels):
    correct_count = 0
    for i in range(len(true_labels)):
        if true_labels[i] == guessed_labels[i]:
            correct_count += 1
            
    acc = correct_count/len(true_labels)     
    return acc

In [16]:
accur = accuracy(val_labels, guess_labels)
accur

0.7960553923625682

In [17]:
def precision(val_labels, guess_labels):
    true_pos = 0
    fals_pos = 0
    
    for label_t, label_p in zip (val_labels,guess_labels):
        if label_t == 'pos' and label_p == 'pos':
            true_pos += 1
        if label_t == 'neg' and label_p == 'pos':
            fals_pos += 1
    
    total = true_pos + fals_pos
    
    precis = true_pos/total
    
    return precis

In [18]:
precis = precision(val_labels, guess_labels)
precis

0.8018099547511313

In [19]:
def recall(val_labels, guess_labels):
    true_pos = 0
    fals_neg = 0
    
    for label_t, label_p in zip (val_labels,guess_labels):
        if label_t == 'pos' and label_p == 'pos':
            true_pos += 1
        if label_t == 'pos' and label_p == 'neg':
            fals_neg += 1
    
    total = true_pos + fals_neg
    
    recall = true_pos/total
    
    return recall

In [20]:
recall_ = recall(val_labels, guess_labels)
recall_

0.7684301821335646

In [21]:
def f_score(val_labels, guess_labels):
    precis = precision(val_labels, guess_labels)
    rec = recall(val_labels, guess_labels)
    
    f = 2 * (precis * rec) / (precis +rec)
    return f

In [22]:
f_1 = f_score(val_labels, guess_labels)
f_1

0.7847652790079717

### Error Analysis

In [23]:
def find_errors_indices(val_labels = val_labels, guess_labels = guess_labels):
    false_predict = []     # List in which we keep all the indices of the misclassified documents
    for i in range(len(val_labels)):
        if (val_labels[i] == 'pos' and guess_labels[i] == 'neg') or (val_labels[i] == 'neg' and guess_labels[i] == 'pos'):
            false_predict.append(i)
    return false_predict                   

In [24]:
false_pred = find_errors_indices(val_labels, guess_labels)

In [25]:
df = pd.DataFrame((val_docs, score_sentence, val_labels, guess_labels))
df = df.T
df = df.rename(columns = {0: "Review", 1: "Neg_Pos", 2: "True_Label", 3: "Guess_Label"} )

In [26]:
df.head()

Unnamed: 0,Review,Neg_Pos,True_Label,Guess_Label
0,"[do, not, buy, this, iron, ., it, 's, fabulous...","[-476.1805840041536, -488.49501638443303]",neg,neg
1,"[the, series, just, keeps, on, getting, better...","[-478.8695832044195, -476.2881359649154]",pos,pos
2,"[i, bought, this, apple, humidifier, in, early...","[-686.6196141857182, -709.1696617728866]",neg,neg
3,"[i, highly, recommend, this, super, wide, angl...","[-364.6633195247102, -353.71965625805865]",pos,pos
4,"[this, is, one, of, the, better, historical, d...","[-706.3890416473079, -695.9936451532267]",pos,pos


In [27]:
df2 = df[((df['True_Label'] == 'neg') & (df['Guess_Label'] == 'pos')) | ((df['True_Label'] == 'pos') & (df['Guess_Label'] == 'neg'))]

In [28]:
df2

Unnamed: 0,Review,Neg_Pos,True_Label,Guess_Label
9,"[i, agree, with, other, reviewers, that, it, f...","[-522.2997780517946, -517.1997485904006]",neg,pos
10,"[this, camera, has, a, very, poor, lens, ., at...","[-480.25153518481494, -476.1033272001385]",neg,pos
12,"[this, book, offers, more, information, about,...","[-474.45296619422317, -473.1350092725393]",neg,pos
14,"[i, was, able, to, scout, out, the, different,...","[-329.82249598660104, -329.9073854379372]",pos,neg
15,"[i, have, been, a, fan, since, valotte, ., and...","[-699.5516101932159, -703.9787347446481]",pos,neg
...,...,...,...,...
2372,"[the, text, is, ok, and, the, way, the, story,...","[-674.8046173025884, -672.2745429779512]",neg,pos
2376,"[i, 'm, not, sure, what, compelled, me, to, se...","[-707.6027104635142, -704.8330676653848]",neg,pos
2377,"[after, watching, "", harsh, times, "", we, wond...","[-705.381183513228, -703.0399534529643]",neg,pos
2378,"[the, story, here, dose, n't, matter, ., the, ...","[-704.3883541325342, -702.7512472813743]",neg,pos


In [29]:
index_list = []                     # This list will contain the 10 most wrongly directed misclassified document indices
for j in range(10):
    max_diff = abs(df2['Neg_Pos'][9][0] - df2['Neg_Pos'][9][1])
    index = 9
    for i in false_pred:
        if (abs(df2['Neg_Pos'][i][0] - df2['Neg_Pos'][i][1]) > max_diff) and (i not in index_list):
            max_diff = abs(df2['Neg_Pos'][i][0] - df2['Neg_Pos'][i][1])
            index = i
    index_list.append(index)
    

In [30]:
index_list

[1287, 173, 231, 1626, 1756, 2181, 121, 1551, 1856, 2254]

In [31]:
df2.loc[1287]

Review         [i, 've, been, using, norton, antivirus, 2002,...
Neg_Pos                 [-684.8728016026175, -706.3533809219927]
True_Label                                                   pos
Guess_Label                                                  neg
Name: 1287, dtype: object

In [32]:
print(abs(df['Neg_Pos'][1287][0] - df['Neg_Pos'][1287][1]))    # This is the difference in the log probabilities

21.480579319375124


In [33]:
to_read = " ".join(val_docs[1287])
print(to_read)

i 've been using norton antivirus 2002 with annual updates of patches forever with norton internet security 2003. now that it 's time to renewal norton antivirus , i thought i switch to trend micro internet secuirty 2005 after all the glowing wins it received from certain mainstream computer magazines . i like norton 2002 but it was a memory resident hog in the taskmanager considering i was running a p3 500 laptop with 196mb on winxp pro , resources are valuable . also newer norton antivirus version were a hit and miss . after also reading reviews at amazon , i took the plunge with trend micro . my review of this product can be best summarize along with it 's relations to norton antivirus 2002 ( updated ) . pros of trend micro 2005 : 1. fastest virus scan i 've seen . i have not find any virus in my system yet whereas norton would fail in about .01% of the time ( i.e. one stalling virus every year ) . 2. check incoming and outgoing mail like norton 3. scan selected files as i chose jus

CROSS VALIDATION

In [34]:
N=10    # We will implement the 10-fold Cross Validation, therefore we give the number of folds N=10.

In [35]:
# K-Fold Cross Validation

S_accur = 0                 # For calculating the accuracy later of cross validation below
S_f_1 = 0                   # Same for F1 score

for fold_nbr in range(N):
    split_point_1 = int(float(fold_nbr)/N*len(all_docs))
    split_point_2 = int(float(fold_nbr+1)/N*len(all_docs))

    train_docs_fold = all_docs[:split_point_1] + all_docs[split_point_2:]
    train_labels_fold = all_labels[:split_point_1] + all_labels[split_point_2:]
    
    new_vocabulary = set()               #Create the vocabulary of all unique words
    for review in train_docs_fold:
        for word in review:
            new_vocabulary.add(word)
    
    val_docs_fold = all_docs[split_point_1:split_point_2]
    val_labels_fold = all_labels[split_point_1:split_point_2]
    
    likelihood_di, prior_probab, S_Neg_Pos = train_nb(train_docs_fold, train_labels_fold, new_vocabulary)
    guess_labels, score_sentence = classify_documents(val_docs_fold, new_vocabulary, likelihood_di, prior_probab)
    
    accur = accuracy(val_labels_fold, guess_labels)
    S_accur += accur
    
    f_1 = f_score(val_labels_fold, guess_labels)
    S_f_1 += f_1

final_accur = S_accur / N    
final_f_1 = S_f_1 / N

In [36]:
final_accur

0.7884844527468318

In [37]:
final_f_1

0.7833510236318384

In [38]:
def accuracy_cross(true_labels, guessed_labels):   # for the  leave one out, as there is no index. 
    correct_count = 0
    for i in range(len(true_labels)):
        if true_labels == guessed_labels:
            correct_count += 1
            
    acc = correct_count/len(true_labels)     
    return acc

In [39]:
#Leave-one-out Cross Validation

S_accur = 0                 # Same as above, for metrics calculations
S_f_1 = 0                   
list_guesses = []           # We need these lists for the computation of the F1 score later on
list_val_labels = []

for i in range (100):
    train_docs_fold = all_docs[:i] + all_docs[i+1:]
    train_labels_fold = all_labels[:i] + all_labels[i+1:]
    
    new_vocabulary = set()               #Create the vocabulary of all unique words
    for review in train_docs_fold:
        for word in review:
            new_vocabulary.add(word)
    
    val_docs_fold = all_docs[i]
    val_labels_fold = all_labels[i]
    list_val_labels.append(val_labels_fold)
    
    likelihood_di, prior_probab, S_Neg_Pos = train_nb(train_docs_fold, train_labels_fold, new_vocabulary)
    guess_labels, score_sentence = classify_documents([val_docs_fold], new_vocabulary, likelihood_di, prior_probab)
    list_guesses.append(guess_labels)
    
    accur = accuracy_cross([val_labels_fold], guess_labels)
    S_accur += accur
    

In [40]:
print(S_accur)

79.0


In [41]:
final_accur = S_accur / 100
final_accur

0.79

Here, we noticed that our list_guesses was nested. Thus giving errors. We fixed this by unflattening it. 

In [42]:
def flatten(lis):
     for item in lis:
        if isinstance(item, Iterable) and not isinstance(item, str):
            for x in flatten(item):
                 yield x
        else:        
            yield item

In [43]:
list_guesses = list(flatten(list_guesses))

In [44]:
f1_score = f_score(list_val_labels, list_guesses)
f1_score

0.8108108108108109