In [9]:
import pandas as pd
import numpy as np
import string
from collections import Counter
import random

In [10]:
# load doc into memory
def load_doc(filename, encoding=None):
    """Read a whole text file and return its contents as one string.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding handed to ``open``. ``None`` (the default)
            keeps the platform's default encoding, exactly as before.

    Returns:
        The complete contents of the file.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises.
    with open(filename, 'r', encoding=encoding) as file:
        return file.read()

# break the text into one entry per line
def sentence(text):
    """Return the pieces of ``text`` that are separated by newline characters.

    Only the newline character is treated as a separator, so text that ends
    with a newline produces a final empty string in the returned list.
    """
    return text.split("\n")
    

# load the document
# The path has no leading separator, so it is resolved relative to the
# working directory the notebook runs in.
filename = 'a1_data/a1_d3.txt'
# Entire file contents as a single string.
text = load_doc(filename)
# One list entry per newline-separated line of the file.
sentences = sentence(text)

In [11]:
# text preprocessing
# Lower-case every line so that tokens compare case-insensitively.
lower_case_sentences = [line.lower() for line in sentences]

# Strip ASCII punctuation. The tab between review and label is not part of
# string.punctuation, so it survives and can be used as the field separator.
no_punctuations = [
    ''.join(c for c in line if c not in string.punctuation)
    for line in lower_case_sentences
]

# Split each line into its tab-separated fields (review text, sentiment label).
# Blank lines - typically the empty string left after the file's trailing
# newline - are skipped. This replaces removing "whatever row is last", which
# dropped a real review when the file had no trailing newline and could remove
# an earlier row that merely compared equal to the last one.
clean_data = [line.split('\t') for line in no_punctuations if line.strip()]

In [12]:
# Tabulate the cleaned rows: first field is the review text, second field is
# its sentiment label. Labels are kept as the strings produced by splitting
# the line; no numeric conversion happens here.
df = pd.DataFrame(clean_data, columns =['Review', 'Sentiment'])
print(df)

                                                Review Sentiment
0    so there is no way for me to plug it in here i...         0
1                            good case excellent value         1
2                                great for the jawbone         1
3    tied to charger for conversations lasting more...         0
4                                     the mic is great         1
..                                                 ...       ...
995  the screen does get smudged easily because it ...         0
996  what a piece of junk i lose more calls on this...         0
997                        item does not match picture         0
998  the only thing that disappoint me is the infra...         0
999  you can not answer calls with the unit never w...         0

[1000 rows x 2 columns]


In [13]:
# Split a dataset into n_folds folds
def cross_validation(df, n_folds):
    """Randomly partition the rows of ``df`` into ``n_folds`` equal-sized folds.

    Rows are drawn without replacement from a copy of ``df``, so the input is
    not modified. When the number of rows is not divisible by ``n_folds`` the
    leftover rows are not assigned to any fold.

    Args:
        df: Sequence of rows (the notebook passes a list of lists).
        n_folds: Number of folds to create; must be at least 1.

    Returns:
        A list of ``n_folds`` lists, each holding ``len(df) // n_folds`` rows.

    Raises:
        ValueError: If ``n_folds`` is smaller than 1.
    """
    if n_folds < 1:
        raise ValueError("n_folds must be a positive integer")
    df_split = []
    remaining = list(df)
    # Fold size follows the requested number of folds (it used to be fixed at 5).
    fold_size = len(remaining) // n_folds
    for _ in range(n_folds):
        fold = []
        while len(fold) < fold_size:
            index = random.randrange(0, len(remaining))
            fold.append(remaining.pop(index))
        df_split.append(fold)
    return df_split

# Partition the cleaned rows into 5 random folds.
folds = cross_validation(clean_data, 5)
# For each fold, build a train/test split: the training rows are every other
# fold flattened into one list, the test rows are copies of the held-out fold.
# NOTE(review): train_set and test_set are rebound on every pass and nothing is
# evaluated inside the loop, so only the split produced by the final fold
# reaches the code below; the earlier splits are discarded. Confirm whether
# per-fold training and scoring was intended.
for fold in folds:
    train_set = list(folds)
    # list.remove drops the first fold that compares equal to the held-out one.
    train_set.remove(fold)
    # Concatenate the remaining folds into a single flat list of rows.
    train_set = sum(train_set, [])
    test_set = list()
    for row in fold:
        # Copy each row so the test set does not share row lists with the fold.
        row_copy = list(row)
        test_set.append(row_copy)

# Frames for the split left over from the last loop iteration.
train_df = pd.DataFrame(train_set, columns =['Review', 'Sentiment'])
test_df = pd.DataFrame(test_set,columns =['Review', 'Sentiment'])
# Labels are compared as the strings '1' (positive) and '0' (negative).
train_df_positive = train_df.loc[train_df['Sentiment']=='1']
train_df_negative = train_df.loc[train_df['Sentiment']=='0']

# Build the vocabulary of a set of reviews together with word frequencies
def vocab_freq(train_df):
    """Return the distinct words of the ``Review`` column and their counts.

    Every review is split on single spaces, the resulting tokens are pooled,
    and ``np.unique`` produces the distinct tokens with how often each occurs.

    Args:
        train_df: DataFrame with a ``Review`` column of strings.

    Returns:
        Tuple ``(vocab, count)`` of two arrays of equal length, where
        ``count[k]`` is the number of occurrences of ``vocab[k]``.
    """
    tokens = [
        token
        for review in train_df['Review'].values.tolist()
        for token in review.split(' ')
    ]
    vocab, count = np.unique(np.array(tokens), return_counts=True)
    return (vocab, count)

# Vocabulary and word frequencies for the positive reviews, the negative
# reviews, and the whole training set
vocab_positive, count_positive = vocab_freq(train_df_positive)
vocab_negative, count_negative = vocab_freq(train_df_negative)
vocab_total, count_total = vocab_freq(train_df)


# Prior probability P(C) of each class.
# Each training review carries one sentiment value, so counting the values
# counts the reviews per class.
train_sentiments = train_df['Sentiment'].values
sentiment, count = np.unique(train_sentiments, return_counts=True)

# np.unique returns its values sorted, so with the labels '0' and '1' the
# negative count sits at index 0 and the positive count at index 1.
positive_review_count, negative_review_count = count[1], count[0]

prob_positive = positive_review_count / (positive_review_count + negative_review_count)
prob_negative = negative_review_count / (positive_review_count + negative_review_count)

In [14]:
# Tokenise the held-out reviews: one list of space-separated words per review
test_sentences = test_df['Review'].values
test_sentences_list = test_sentences.tolist()
words_test = [review.split(' ') for review in test_sentences_list]

# Naive Bayes score P(d|C) * P(C) of every test review d for one class C
def posterior_prob(train_df, vocab, count, words_test, prob_class, class_count, total_count=None):
    """Score each tokenised test review against a single class.

    For every word of a review the factor ``(n + 1) / denominator`` is
    multiplied into the likelihood, where ``n`` is the word's frequency in the
    class (0 when the word is not in ``vocab``) and
    ``denominator = sum(count) + sum(total_count) + 1``. The likelihood is then
    multiplied by the class prior.

    NOTE(review): textbook add-one smoothing divides by (tokens in the class +
    vocabulary size); the denominator used here adds the total token count
    instead. The original arithmetic is preserved - confirm it is intended.

    Args:
        train_df: Unused; kept so existing calls keep working.
        vocab: Distinct words seen for the class.
        count: Frequency of each word in ``vocab`` (same order).
        words_test: List of reviews, each a list of words.
        prob_class: Prior probability P(C) of the class.
        class_count: Unused; kept so existing calls keep working.
        total_count: Word frequencies over the whole training set. When
            omitted, the notebook-level ``count_total`` is used, as before.

    Returns:
        List with one score per review, in the order of ``words_test``.
    """
    if total_count is None:
        # Backward-compatible fallback to the notebook-level totals.
        total_count = count_total

    # Constant for the whole call, so compute it once instead of per word.
    denominator = np.sum(count) + np.sum(total_count) + 1

    # Word -> class frequency lookup. setdefault keeps the first occurrence,
    # matching a left-to-right search through vocab.
    class_counts = {}
    for word, word_count in zip(np.asarray(vocab).tolist(), count):
        class_counts.setdefault(word, word_count)

    scores = []
    for review_words in words_test:
        likelihood_prob = 1
        for word in review_words:
            # Unseen words contribute (0 + 1) / denominator.
            likelihood_prob *= (class_counts.get(word, 0) + 1) / denominator
        # P(d|C) * P(C)
        scores.append(prob_class * likelihood_prob)
    return scores
            
# Score every test review once against the positive class and once against
# the negative class
posterior_prob_positive = posterior_prob(
    train_df=train_df,
    vocab=vocab_positive,
    count=count_positive,
    words_test=words_test,
    prob_class=prob_positive,
    class_count=positive_review_count,
)
posterior_prob_negative = posterior_prob(
    train_df=train_df,
    vocab=vocab_negative,
    count=count_negative,
    words_test=words_test,
    prob_class=prob_negative,
    class_count=negative_review_count,
)

In [15]:
# F1 between two collections of labels, based on their overlap as sets
def f1_score_single(y_actual, y_pred):
    """Return the F1 score of the set of predicted labels against the actual set.

    Both arguments are converted to sets. Precision is the share of predicted
    labels that are also actual labels, recall the share of actual labels that
    were predicted. Returns 0.0 when the two sets have nothing in common.
    """
    actual_labels, predicted_labels = set(y_actual), set(y_pred)
    overlap = len(actual_labels.intersection(predicted_labels))
    if overlap == 0:
        return 0.
    precision = 1.0 * overlap / len(predicted_labels)
    recall = 1.0 * overlap / len(actual_labels)
    return 2 * precision * recall / (precision + recall)
    
def f1_score(y_actual, y_pred, positive_label="1"):
    """Return the binary F1 score of ``y_pred`` against ``y_actual``.

    The previous implementation averaged a per-sample set overlap, which for
    single-character labels is 1 on a match and 0 otherwise - i.e. it returned
    the accuracy fraction rather than F1. This version counts true positives,
    false positives and false negatives for ``positive_label`` and combines
    precision and recall in the usual way.

    Args:
        y_actual: Iterable of true labels.
        y_pred: Iterable of predicted labels, paired with ``y_actual`` by
            position.
        positive_label: Label treated as the positive class. Defaults to the
            string "1"; labels are compared with ``==``, so pass the matching
            type if labels are not strings.

    Returns:
        F1 as a float in [0, 1]; 0.0 when there are no true positives.
    """
    true_positives = false_positives = false_negatives = 0
    for actual, predicted in zip(y_actual, y_pred):
        if predicted == positive_label:
            if actual == positive_label:
                true_positives += 1
            else:
                false_positives += 1
        elif actual == positive_label:
            false_negatives += 1

    # Without a true positive, precision and recall are 0 or undefined.
    if true_positives == 0:
        return 0.0
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)
    return 2 * precision * recall / (precision + recall)

def accuracy_metric(actual, predicted):
    """Return the percentage of positions where ``predicted`` equals ``actual``.

    Values are paired by position. Both inputs are materialised as lists first,
    so a pandas Series is compared element by element regardless of its index
    labels (integer indexing on a Series looks up labels, which fails for any
    index other than 0..n-1).

    Args:
        actual: Iterable of true labels.
        predicted: Iterable of predicted labels, at least as long as ``actual``.

    Returns:
        Accuracy as a float between 0.0 and 100.0.

    Raises:
        IndexError: If ``predicted`` has fewer entries than ``actual``.
        ZeroDivisionError: If ``actual`` is empty.
    """
    actual_values = list(actual)
    predicted_values = list(predicted)
    if len(predicted_values) < len(actual_values):
        raise IndexError("predicted has fewer entries than actual")
    correct = sum(1 for a, p in zip(actual_values, predicted_values) if a == p)
    return correct / float(len(actual_values)) * 100.0

In [16]:
# Predict the sentiment: the class with the larger posterior score wins, and
# ties fall to "0"
test_predict = [
    "1" if positive > negative else "0"
    for positive, negative in zip(posterior_prob_positive, posterior_prob_negative)
]

test_df['Predicted Sentiment'] = test_predict
# Keep the results under names that do not shadow the metric functions:
# assigning the score to `f1_score` replaced the function with a number, so
# running this cell a second time failed.
f1_value = f1_score(test_df['Sentiment'], test_df['Predicted Sentiment'])
accuracy = accuracy_metric(test_df['Sentiment'], test_df['Predicted Sentiment'])
print(test_df)
print(f1_value)
print(accuracy)

                                                Review Sentiment  \
0                                         fast service         1   
1    sound quality on both end is excellent i use h...         1   
2    this phone is slim and light and the display i...         1   
3                   plantronics bluetooth excelent buy         1   
4                                          great phone         1   
..                                                 ...       ...   
195  the design might be ergonomic in theory but i ...         0   
196  this phone is pretty sturdy and ive never had ...         1   
197  excellent product i am very satisfied with the...         1   
198         yes its shiny on front side  and i love it         1   
199        i wasted my little money with this earpiece         0   

    Predicted Sentiment  
0                     1  
1                     1  
2                     1  
3                     1  
4                     1  
..                  ...  
1