# NAIVE BAYES METHOD 
### BUILD FROM SCRATCH TO IMPLEMENT SENDIMENT ANALYSIS

In [296]:
# REFE"RENCES
# https://stats.stackexchange.com/questions/323859/why-is-uniform-prior-on-logx-equal-to-1-x-prior-on-x
# https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string
# https://streamsql.io/blog/sentiment-analysis
# https://medium.com/syncedreview/applying-multinomial-naive-bayes-to-nlp-problems-a-practical-explanation-4f5271768ebf
# https://docs.python.org/3/library/glob.html
# https://github.com/filipkny/MediumRare/blob/master/NAIVE_BAYES/NaiveBayes.py

In [297]:
import csv
import math
import numpy as np
import re
import os
import glob
from collections import defaultdict

# Preprocess Step
- Obtain data, Clean html tags, remove unnecessary symbols

In [298]:
def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext

def getData(path, extension):
    data = []
    for filepath in glob.glob(os.path.join(path, extension)):
        with open(filepath) as f:
            content = f.read()
            data.append(content)
    return data

def preProcess(dataset):
    data = [cleanhtml(review).lower().replace(",", "").replace(".", "").replace("!", "").replace("?", "")
           .replace(";", "").replace(":", "").replace("*", "")
           .replace("(", "").replace(")", "")
           .replace("/", "").split(' ') for review in dataset]
    return data

# INVERTED VOCABULARY DEVELOPMENT
def vocab_counter(dataset, all_vocabularies = []):
    inv_vocab_dict = defaultdict(lambda:0)
    for data in dataset:
#         print(data)
#         Calculating the number of documents the word appears in
        words_in_data_pt = []
        for word in data:
#             print(word)
            if word not in words_in_data_pt and word in all_vocabularies:
#                 print('imporved')
                inv_vocab_dict[word] += 1
                words_in_data_pt.append(word)
    return inv_vocab_dict

# LOW OCCURANCE WORDS REMOVED
def vocab_removal(dataset):
    all_data = list(dataset.keys())
    for data in all_data:
        if dataset[data]<5:
            del dataset[data]
    return dataset

In [299]:
# OBTAINING DATA & PREPROCESSING =========================================================================
neg_docs = []
pos_docs = []
train = []

train_neg = preProcess(getData('aclImdb/train/neg', '*.txt'))[0:1000]
train_pos = preProcess(getData('aclImdb/train/pos', '*.txt'))[0:1000]
test_neg = preProcess(getData('aclImdb/test/neg', '*.txt'))[0:1000]
test_pos = preProcess(getData('aclImdb/test/pos', '*.txt'))[0:1000]
vocab = getData('aclImdb/', '*.vocab')[0].split('\n')         # All vocabularies in the dataset



In [300]:
# test_neg

In [301]:
# INVERTED VOCABULARY DEVELOPMENT & VOCABULARY REMOVAL ===================================================
train_neg_vocab_counter = vocab_removal(vocab_counter(train_neg, all_vocabularies=vocab))
train_pos_vocab_counter = vocab_removal(vocab_counter(train_pos, all_vocabularies=vocab))
train_vocab_counter = vocab_removal(vocab_counter(train_pos+train_neg, all_vocabularies=vocab))
test_neg_vocab_counter = vocab_removal(vocab_counter(test_neg, all_vocabularies=vocab))
test_pos_vocab_counter = vocab_removal(vocab_counter(test_pos, all_vocabularies=vocab))
test_vocab_counter = vocab_removal(vocab_counter(test_pos+test_neg, all_vocabularies=vocab))



# CALCULATION OF PROBABILITIES
- 1000 data points were sampled for the calculations of probability
- 200 data points were sampled for testing and cross validation

In [302]:
def calculate_prob_occurrence(occurance, data_count, vocab_count, alpha=None):
    if alpha==None:
        return occurance / data_count
    else:
        return (occurance + alpha) / (data_count + vocab_count * alpha)


# def calculate_prob_pos_or_neg(flag, dataset):
def prob_positive(pos_data, data_count):
    return len(pos_data)/data_count

# CALCULATING CLASSIFIER PROBABILITY OF POSITIVE
def probability_of_positive_given_search(search, pos_data, data_count, train_pos_vocab_counter, train_vocab_counter):
#     POSITIVE PROBABILITY
    probability_of_positive = prob_positive(pos_data, data_count)
#     CONDITIONAL PROBABILITY GIVEN POSITIVE
    probability_of_search_given_positive = calculate_prob_occurrence(train_pos_vocab_counter[search], len(train_pos), len(train_pos_vocab_counter))
#     TOTAL PROBABILITY
    probability_of_search = calculate_prob_occurrence(train_vocab_counter[search], len(train_pos), len(train_vocab_counter))
    if probability_of_search!=0:
        return((probability_of_positive * probability_of_search_given_positive) / probability_of_search)
    else:
        return 0
    
# These are the required 2 calcuations from the syllabus
prob_neg = calculate_prob_occurrence(train_vocab_counter['the'], len(train_neg+train_pos), len(train_vocab_counter))
print('Probability of "the" is ', prob_neg)


prob_the_given_pos = calculate_prob_occurrence(train_pos_vocab_counter['the'], len(train_pos), len(train_pos_vocab_counter))
print('Probability of "the", given Positive, is', prob_the_given_pos)


Probability of "the" is  0.9915
Probability of "the", given Positive, is 0.989


# VALIDATION

In [306]:
def accuracy(pred, data_set, pos_dataset):
    correct_count = 0
    total_rows = data_set.shape[0]
    
    for p,d in enumerate(data_set):
        if pos_dataset[p] == pred:
            correct_count += 1
             
    return (correct_count / total_rows)


def predict_word_review(search, pos_data, data_count, train_pos_vocab_counter, train_vocab_counter):
    prob = probability_of_positive_given_search(search, pos_data, data_count, train_pos_vocab_counter, train_vocab_counter)
    if prob >= 0.50:
        return 1
    else:
        return 0

def cross_validation(k, search, data_set, pos_dataset):
    k_fold_accuracies = []
    increment = (int)(np.array(data_set).shape[0] / k)
    for i in range(k):
        temp_test_set = np.array(data_set[i*increment:(i+1)*increment])
        output = predict_word_review(search, pos_dataset, len(data_set), temp_dev_set_vocab_list, temp_dev_set_vocab_list)
        output = np.array(output)
        pos_dataset = np.array(pos_dataset)
        temp_accuracy = accuracy(output, temp_test_set, pos_dataset)

        k_fold_accuracies.append(temp_accuracy)
    return (sum(k_fold_accuracies) / len(k_fold_accuracies))

In [307]:
temp_dev_set_vocab_list = vocab_removal(vocab_counter(dataset, all_vocabularies = []))
test_all = test_neg[0:200]+test_pos[0:200]
test_all= np.array([[i, 0] for i in test_neg[0:200]]+[[i, 1] for i in test_neg[0:200]])

np.random.shuffle(test_all)
test = [i[0] for i in test_all]
label = [i[1] for i in test_all]

# Cross Validation Results
- The chance of positive given 'the' is evaluated below

In [308]:
cross_validation(5, 'the', test, label)

0.475