###  Import Required intially required Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", None)

### **Loading Data**

#### In the first step, I have extracted the documents and labels from the source file

In [2]:
from codecs import open
from __future__ import division
from collections import Counter

# Defining a function that reads the document and extract the documents and labels

def read_documents(doc_file):  
    docs = []
    labels = []
    with open(doc_file, encoding='utf-8') as f:
        for line in f:
            words = line.strip().split()
            docs.append(words[3:])
            labels.append(words[1])
    return docs, labels

### **Data Preprocessing**

#### Here the data is divided in to train and evaluation samples

In [3]:
# Preparing the data in order to feed it to the classifier

all_docs, all_labels = read_documents('sentiments.txt')

split_point = int(0.80*len(all_docs))
train_docs = all_docs[:split_point]
train_labels = all_labels[:split_point]  
eval_docs = all_docs[split_point:]
eval_labels = all_labels[split_point:]

### **Building the Model**

#### In this step, the model is defined, probabilities are calculated and required parameters are extracted

In [4]:
def train_nb(documents, labels):
    
    # Initializing an instance of counter to train labels which occur frequently
    label_train = Counter()
    
    for i in labels:
        label_train[i] += 1       
        
    # Probablitly that the document is annotated as positive
    prob_pos = label_train['pos']/len(documents)
    
    # Probablitly that the document is annotated as negative
    prob_neg = label_train['neg']/len(documents) 
    
    print('Probability of review being POSITIVE is: ', prob_pos)
    print('Probability of review being NEGATIVE is: ', prob_neg)
 

    # List of Positive reviews in the documet
    pos_train = []
    
    # List of Negative reviews in the documet
    neg_train = []
    
    for i, j in zip(documents, labels):
        if j == 'pos':
            pos_train.append(i)
        else:
            neg_train.append(i)
            
    # Frequency of Words in Postive documents list  
    freq_pos = Counter(w for doc in pos_train for w in doc)
    
    # Frequency of Words in Negative documents list
    freq_neg = Counter(w for doc in neg_train for w in doc)
    
        
    total_freq_pos = sum(list(freq_pos.values()))
    total_freq_neg = sum(list(freq_neg.values()))
    
    posprobs = freq_pos
    negprobs = freq_neg

    for i in posprobs:
        posprobs[i] = posprobs[i] / total_freq_pos
        
    for i in negprobs:
        negprobs[i] = negprobs[i] / total_freq_neg
        
    
    return(posprobs, negprobs, prob_pos, prob_neg, total_freq_pos, total_freq_neg)


#### Calcuating the guess of the classifier

In [5]:
# Defining a fucntion which return the guess of the classifier

def classify_nb(prob_pos, prob_neg):
    
    if(prob_pos > prob_neg):
        guess = 'pos'
    else:
        guess = 'neg'
        
    return(guess)

#### Document Classification

In [6]:
# Writing a function which classifies documents and returns predicted labels

def classify_documents(docs, alpha, Posprobs, Negprobs, PriorPositive, PriorNegative, totalPos, totalNeg):
    
    predicted_labels = []
    NchoicesP = len(Posprobs) # uses for laplace smoothing
    NchoicesN = len(Negprobs)
    
    for doc in docs:
            
        probOfAReviewBeingPos = 0
        probOfAReviewBeingNeg = 0
        
        for word in doc: 
         
            if Posprobs[word] > 0:                                  
                probOfAReviewBeingPos = probOfAReviewBeingPos +  np.log((Posprobs[word] * totalPos + alpha)/(alpha * NchoicesP + totalPos))
            else:                                                
                probOfAReviewBeingPos = probOfAReviewBeingPos + np.log(alpha / (alpha * NchoicesP + totalPos))
                
                
            if Negprobs[word] > 0:
                probOfAReviewBeingNeg = probOfAReviewBeingNeg + np.log((Negprobs[word] * totalNeg + alpha) / (alpha * NchoicesN + totalNeg))
            else:
                probOfAReviewBeingNeg = probOfAReviewBeingNeg + np.log(alpha / (alpha * NchoicesN + totalNeg))
            
            
        posteriorPosPerDoc = probOfAReviewBeingPos + np.log(PriorPositive)
        posteriorNegPerDoc = probOfAReviewBeingNeg + np.log(PriorNegative)
        
        predicted_labels.append(classify_nb(posteriorPosPerDoc, posteriorNegPerDoc)) 
        
    return(predicted_labels)

### **Evaluation**

#### Checking the accuracy of the classifier

In [7]:
# This function returns the classifier accuracy

def accuracy(true_labels, guessed_labels):
    counter = 0
    for i in range(len(guessed_labels)):
        if(guessed_labels[i] == true_labels[i]):
            counter += 1
    return counter / len(guessed_labels)

In [8]:
# Calculating final results

Posprobs, Negprobs, prob_pos, prob_neg, totalPos, totalNeg = train_nb(train_docs, train_labels)

alpha_laplace = 1.0
predicted_labels1 = classify_documents(eval_docs,alpha_laplace,Posprobs,Negprobs,prob_pos,prob_neg, totalPos, totalNeg)
Accuracyy = accuracy(eval_labels, predicted_labels1)

print('The accuracy of predicted labels is: ', Accuracyy)

Probability of review being POSITIVE is:  0.5183673469387755
Probability of review being NEGATIVE is:  0.4816326530612245
The accuracy of predicted labels is:  0.6684782608695652


#### We get around **66 %** accuracy, but we aim to imporove it, so, let's try cross validation for 5 folds

In [11]:
# Writing a function which returns the Log of Probablities

def score_doc_label(docs, alpha, Posprobs, Negprobs, PriorPositive, PriorNegative, totalPos, totalNeg):
    
    NchoicesP = len(Posprobs) # uses for laplace smoothing
    NchoicesN = len(Negprobs)
    
    for doc in docs:
            
        probOfAReviewBeingPos = 0
        probOfAReviewBeingNeg = 0
        
        for word in doc: 
         
            if Posprobs[word] > 0:                                  
                probOfAReviewBeingPos = probOfAReviewBeingPos +  np.log((Posprobs[word] * totalPos + alpha)/(alpha * NchoicesP + totalPos))
            else:                                                
                probOfAReviewBeingPos = probOfAReviewBeingPos + np.log(alpha / (alpha * NchoicesP + totalPos))
                
                
            if Negprobs[word] > 0:
                probOfAReviewBeingNeg = probOfAReviewBeingNeg + np.log((Negprobs[word] * totalNeg + alpha) / (alpha * NchoicesN + totalNeg))
            else:
                probOfAReviewBeingNeg = probOfAReviewBeingNeg + np.log(alpha / (alpha * NchoicesN + totalNeg))
            
            
        posteriorPosPerDoc = probOfAReviewBeingPos + np.log(PriorPositive)
        posteriorNegPerDoc = probOfAReviewBeingNeg + np.log(PriorNegative)
        
        expProbPos = np.exp(posteriorPosPerDoc)
        expProbNeg = np.exp(posteriorNegPerDoc)
        
        print("The Log Probability of the Document is ", expProbPos)
        print("The Log Probability of the Document is ", expProbNeg)
    
    return(expProbPos, expProbNeg)

In [12]:
# Check for the log probablities

print(score_doc_label(eval_docs, alpha_laplace, Posprobs, Negprobs, prob_pos, prob_neg, totalPos, totalNeg))

The Log Probability of the Document is  3.205434637166961e-62
The Log Probability of the Document is  1.2231134276912487e-61
The Log Probability of the Document is  0.0
The Log Probability of the Document is  0.0
The Log Probability of the Document is  0.0
The Log Probability of the Document is  0.0
The Log Probability of the Document is  4.928189062668924e-69
The Log Probability of the Document is  6.519391611443618e-67
The Log Probability of the Document is  0.0
The Log Probability of the Document is  0.0
The Log Probability of the Document is  1.1352595255398278e-75
The Log Probability of the Document is  6.278246185076333e-76
The Log Probability of the Document is  7.950470978141997e-170
The Log Probability of the Document is  4.2410456127912103e-175
The Log Probability of the Document is  9.20311148e-316
The Log Probability of the Document is  1.996613e-318
The Log Probability of the Document is  2.999626599277605e-286
The Log Probability of the Document is  1.0125621392984049e-28

### **Cross Validation**

In [9]:
# Cross validation Results for 5 Folds

def cross_val(N = 5):
    all_docs, all_labels = read_documents('sentiments.txt')
    results = []
    for fold_nbr in range(N):
        split_point_1 = int(float(fold_nbr)/N*len(all_docs))
        split_point_2 = int(float(fold_nbr+1)/N*len(all_docs))
        train_docs = all_docs[:split_point_1] + all_docs[split_point_2:]
        eval_docs = all_docs[split_point_1:split_point_2]
        trained_data = train_nb(train_docs, all_labels)
        results.append(trained_data[0])
    return(results)

In [10]:
# Checking the results

cross_val(5)

Probability of review being POSITIVE is:  0.6331521739130435
Probability of review being NEGATIVE is:  0.6154891304347826
Probability of review being POSITIVE is:  0.6340136054421769
Probability of review being NEGATIVE is:  0.6163265306122448
Probability of review being POSITIVE is:  0.6340136054421769
Probability of review being NEGATIVE is:  0.6163265306122448
Probability of review being POSITIVE is:  0.6340136054421769
Probability of review being NEGATIVE is:  0.6163265306122448
Probability of review being POSITIVE is:  0.6340136054421769
Probability of review being NEGATIVE is:  0.6163265306122448


[Counter({'the': 0.046632124352331605,
          'reviewer': 0.00016654330125832716,
          'who': 0.0013508512213175426,
          'compared': 0.00014803849000740192,
          'massage': 5.551443375277572e-05,
          'feature': 7.401924500370096e-05,
          'to': 0.023501110288675055,
          'that': 0.010658771280532938,
          'of': 0.01800518134715026,
          'a': 0.021373056994818652,
          'cell': 1.850481125092524e-05,
          'phone': 9.25240562546262e-05,
          'perhaps': 0.00022205773501110288,
          'said': 0.0005181347150259067,
          'it': 0.017135455218356774,
          'best': 0.0007586972612879349,
          ',': 0.03693560325684678,
          'so': 0.003367875647668394,
          'suffice': 3.700962250185048e-05,
          'say': 0.0007957068837897854,
          'one': 0.003830495928941525,
          'can': 0.001998519615099926,
          'essentially': 9.25240562546262e-05,
          'ignore': 3.700962250185048e-05,
          'aspec

### **Result**

#### The average accuracy achieved by the cross validation is **~62.3 %**