In [88]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import glob
import os
from sklearn.feature_extraction.text import CountVectorizer

In [110]:
### READ ALL SENTENCES OF POSITIVE AND NEGATIVE REVIEWS OF TRAIN AND TEST SETS ###
train_pos_path = '/Users/justinlacoste/downloads/aclImdb/train/pos'
train_neg_path = '/Users/justinlacoste/downloads/aclImdb/train/neg'
test_pos_path = '/Users/justinlacoste/downloads/aclImdb/test/pos'
test_neg_path = '/Users/justinlacoste/downloads/aclImdb/test/neg'

x_train_pos_raw_reviews = []
x_train_neg_raw_reviews = []
x_test_pos_raw_reviews = []
x_test_neg_raw_reviews = []


def read_all_files_in_directory(path_from_aclImdb, array_to_append_to):
    directory = os.fsencode(path_from_aclImdb)

    for file in os.listdir(directory):
        filename = os.fsdecode(file)
        f = open(f"{path_from_aclImdb}/{filename}", "r")
        array_to_append_to.append(f.read())


read_all_files_in_directory(train_pos_path, x_train_pos_raw_reviews)
read_all_files_in_directory(train_neg_path, x_train_neg_raw_reviews)
read_all_files_in_directory(test_pos_path, x_test_pos_raw_reviews)
read_all_files_in_directory(test_neg_path, x_test_neg_raw_reviews)

In [112]:
### EXTRACT WORD FREQUENCIES PER CLASS MANUALLY ###

def array_of_sentences_to_word_frequency_dict(array_of_sentences):
    array_of_words = [x.split() for x in array_of_sentences]
    array_of_words_count = sum(len(x) for x in array_of_words)
    print(f"Total number of words in reviews: {array_of_words_count}")

    array_of_words_dict = {}
    for x in array_of_words:
        for word in x:
            if word in array_of_words_dict:
                array_of_words_dict[word] += 1
            else:
                array_of_words_dict[word] = 1

    return array_of_words_dict



x_train_pos_raw_reviews_words_dict = array_of_sentences_to_word_frequency_dict(x_train_pos_raw_reviews)
x_train_neg_raw_reviews_words_dict = array_of_sentences_to_word_frequency_dict(x_train_neg_raw_reviews)
x_test_pos_raw_reviews_words_dict = array_of_sentences_to_word_frequency_dict(x_test_pos_raw_reviews)
x_test_neg_raw_reviews_words_dict = array_of_sentences_to_word_frequency_dict(x_test_neg_raw_reviews)

Total number of words in reviews: 2958832
Total number of words in reviews: 2885848
Total number of words in reviews: 2862401
Total number of words in reviews: 2850766


In [169]:
### EXTRACT WORD FREQUENCIES PER CLASS SEMI-MANUALLY ###
import string

def array_of_sentences_to_word_frequency_dict(array_of_sentences):
    array_of_words = [x.split() for x in array_of_sentences]
    array_of_words = [[y.strip(string.punctuation) for y in x] for x in array_of_words]
    array_of_words_count = sum(len(x) for x in array_of_words)
    print(f"Total number of words in reviews: {array_of_words_count}")

    array_of_words_dict = {}
    for x in array_of_words:
        for word in x:
            if word in array_of_words_dict:
                array_of_words_dict[word] += 1
            else:
                array_of_words_dict[word] = 1

    return array_of_words_dict



x_train_pos_raw_reviews_words_dict = array_of_sentences_to_word_frequency_dict(x_train_pos_raw_reviews)
x_train_neg_raw_reviews_words_dict = array_of_sentences_to_word_frequency_dict(x_train_neg_raw_reviews)
x_test_pos_raw_reviews_words_dict = array_of_sentences_to_word_frequency_dict(x_test_pos_raw_reviews)
x_test_neg_raw_reviews_words_dict = array_of_sentences_to_word_frequency_dict(x_test_neg_raw_reviews)

Total number of words in reviews: 2958832
Total number of words in reviews: 2885848
Total number of words in reviews: 2862401
Total number of words in reviews: 2850766


In [92]:
### EXTRACT WORD FREQUENCIES PER CLASS WITH COUNTVECTORIZER() ###
vectorizer_pos = CountVectorizer()
vectorizer_pos.fit_transform(x_train_pos_raw_reviews)
vocab_pos_dictionnary = vectorizer_pos.vocabulary_

vectorizer_neg = CountVectorizer()
vectorizer_neg.fit_transform(x_train_neg_raw_reviews)
vocab_neg_dictionnary = vectorizer_neg.vocabulary_

In [162]:
#THERE IS AN ERROR, CAUSE THE WORD COUNT IS DEFINITELY TOO HIGH
print(sum(vocab_pos_dictionnary.values()))
print(sum(x_test_pos_raw_reviews_words_dict.values()))

1553390191
2862401


In [166]:
print(vocab_pos_dictionnary["spectacular"])
print(x_test_pos_raw_reviews_words_dict["spectacular"])

46366
93


In [172]:
### MULTINOMAIL NAIVE BAYES MODEL ###
class NaiveBayes():
    def __init__(self):
        #total_word_count = sum(vocab_dictionnary.values())
        self.estimates_pos = {} #frequency of each word (# of times it appears in positive reviews divided by the total number of words in the positive reviews)
        self.estimates_neg = {}
        #SET PRIORS
        self.prior_pos = 0.5
        self.prior_neg = 0.5
        self.alpha_laplace_smooth = 1
        self.beta_laplace_smooth = 1

        #TO DRASTICALLY SPEED UP FUTURE COMPUTATIONS
        self.sum_of_pos_probabilities = 0
        self.sum_of_neg_probabilities = 0

        #WE WANT TO CREATE 2 ARRAY OF THE FREQUENCY OF EACH WORD IN THE VOCABULARY FOR EACH CLASS
        # for key, value in vocab_dictionnary.items():
        #     self.estimates_pos[key] = value / size_pos
        #     self.estimates_neg[key] = value / size_neg
        
        # frequency_of_positives=[0.6, 0.2, 0.4, 0.3, ...] (ex: frequency of the word "the" is number of times it appears divided by the total number of words in the positive reviews)
        # frequency_of_negatives=[0.2, 0.3, 0.1, 0.1, ...]



    def fit(self, x_pos_dict, x_neg_dict):
        # MAXIMUM LIKELIKELIHOOD ESTIMATES
        self.estimates_pos = {}
        self.estimates_neg = {}
        pos_word_count = sum(x_pos_dict.values())
        neg_word_count = sum(x_neg_dict.values())
        for key, value in x_pos_dict.items():
            self.estimates_pos[key] = value / pos_word_count
        for key, value in x_neg_dict.items():
            self.estimates_neg[key] = value / neg_word_count

        for key, value in self.estimates_pos.items():
            self.sum_of_pos_probabilities += np.log(1 - value)
        for key, value in self.estimates_neg.items():
            self.sum_of_neg_probabilities += np.log(1 - value)


    def predict(self, x):

        probability_of_positive = np.log(self.prior_pos)
        probability_of_negative = np.log(self.prior_neg)

        for key, value in self.estimates_pos.items():
            
            #if the word is in the review, we add the log probability of the word to the probability of the review being positive
            if key in x:
                probability_of_positive += np.log(value)
            else:
                probability_of_positive += np.log(1 - value)
            
            #add code to account for case that we have not seen the word in the training set

        for key, value in self.estimates_neg.items():

            #if the word is in the review, we add the log probability of the word to the probability of the review being negative
            if key in x:
                probability_of_negative += np.log(value)
            else:
                probability_of_negative += np.log(1 - value)

            #add code to account for case that we have not seen the word in the training set
        
        #print(f"{probability_of_positive} --- {probability_of_negative}")
        if probability_of_positive > probability_of_negative:
            return 1
        else:
            return 0
        
    def predict_2(self, x):
        
        for word in x:
            if word in self.estimates_pos:
                self.sum_of_pos_probabilities -= np.log(1 - self.estimates_pos[word])
                self.sum_of_pos_probabilities += np.log(self.estimates_pos[word])

            if word in self.estimates_neg:
                self.sum_of_neg_probabilities -= np.log(1 - self.estimates_neg[word])
                self.sum_of_neg_probabilities += np.log(self.estimates_neg[word])

        if self.sum_of_pos_probabilities > self.sum_of_neg_probabilities:
            return 1
        else:
            return 0
        
    def evaluate_acc(self, array_of_sentences, array_of_labels):
        
        num_correct = 0

        for index, sentence in enumerate(array_of_sentences):
            word_array = sentence.split()
            prediction = self.predict_2(word_array)
            if prediction == array_of_labels[index]:
                num_correct += 1
            
        
        return num_correct / len(array_of_sentences)


In [173]:
model = NaiveBayes()
model.fit(x_train_pos_raw_reviews_words_dict, x_train_neg_raw_reviews_words_dict)

In [174]:
print(model.predict_2(['this', 'movie', 'is', 'awesome.']))

0


In [175]:
#EVALUATE MODEL
positive_labels = [1] * len(x_test_pos_raw_reviews)
negative_labels = [0] * len(x_test_neg_raw_reviews)
x_test_labels = positive_labels + negative_labels
x_test_sentences = x_test_pos_raw_reviews + x_test_neg_raw_reviews
print(model.evaluate_acc(x_test_sentences, x_test_labels))

0.65644
