In [18]:
import math
import os
from collections import Counter

import numpy as np

In [1]:
# Colab-only: mount Google Drive so the dataset files under /content/drive/
# are visible to the notebook.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [10]:
# Directory on the mounted Drive that holds the train / validation / test
# input files; later cells build their file paths from it.
base_path = '/content/drive/My Drive/Colab/Processed_Dataset'
base_path

'/content/drive/My Drive/Colab/Processed_Dataset'

In [11]:
def get_X_Y_from(file):
    """Read a UTF-16 dataset file into parallel document / label lists.

    Each non-blank line is split on whitespace; the last token is taken as
    the class label and every token before it as the words of the document.

    Args:
        file: path of the file to read (opened with ``encoding='utf16'``).

    Returns:
        ``(X, Y)`` where ``X`` is a list of token lists and ``Y`` the list of
        labels, in file order. ``len(X) == len(Y)``.
    """
    X, Y = [], []
    with open(file, 'r', encoding='utf16') as f:
        # Iterate the file lazily instead of materialising every line first.
        for line in f:
            tokens = line.split()
            if not tokens:
                # Blank / whitespace-only lines (e.g. a trailing newline) have
                # no label token; indexing tokens[-1] would raise IndexError.
                continue
            X.append(tokens[:-1])
            Y.append(tokens[-1])
    return X, Y

In [None]:
# Toy corpus for hand-checking the classifier: five tiny tokenised documents
# with their class labels, plus a single test document.
# NOTE: X_train / Y_train are reassigned from the real dataset files in a
# later cell, so these values only matter while experimenting.
X_train = [
    ["cricket","very","small","insect"],
    ["play","music"],
    ["play","play","cricket","football"],
    ["like","singing"],
    ["insect","small","live"]
]

# One label per document in X_train, in the same order.
Y_train = ["Biology","Music","Sports","Music","Biology"]

# Single held-out document and its expected label.
X_test = [["want","play","cricket"]]
Y_test = ["Sports"]


# set comprehension to find unique words (vocabulary of the training corpus)
# unique_words = {word for doc in X_train for word in doc}
# print(unique_words)

{'insect', 'very', 'football', 'singing', 'music', 'small', 'like', 'live', 'cricket', 'play'}


In [12]:
class ClassWordCounter:
    """Mutable record of the counts collected for one class label.

    Attributes:
        n_docs: number of documents seen for the class.
        n_words: total number of word tokens across those documents.
        word_counter: per-word occurrence counts for the class (the notebook
            passes a ``collections.Counter``).
    """

    def __init__(self,n_docs,n_words,word_counter):
        # Plain attribute storage; get_class_word_counters updates these
        # fields in place while it scans the training data.
        self.n_docs, self.n_words, self.word_counter = n_docs, n_words, word_counter

    def __repr__(self):
        """Compact, constructor-like representation."""
        return f"ClassWordCounter({self.n_docs},{self.n_words},{self.word_counter})"

    def __str__(self):
        """Multi-line, human-readable summary (one field per line)."""
        return f"n_docs: {self.n_docs}\nn_words: {self.n_words}\nword_counter: {self.word_counter}"
    
    
    
def get_class_word_counters(X_train,Y_train) -> dict:
    """Aggregate per-class document, token and word counts from training data.

    Args:
        X_train: iterable of documents, each a list of word tokens.
        Y_train: iterable of class labels, paired positionally with X_train
            (pairs are formed with ``zip``, so extra items on the longer side
            are ignored).

    Returns:
        dict mapping each class label to a ``ClassWordCounter`` holding the
        number of documents, the total number of tokens and a ``Counter`` of
        word occurrences for that class. Keys appear in order of first
        occurrence in ``Y_train``.
    """
    class_word_counters = dict()
    for cls, doc in zip(Y_train, X_train):
        stats = class_word_counters.get(cls)
        if stats is None:
            # First document of this class: start from empty counts.
            stats = class_word_counters[cls] = ClassWordCounter(0, 0, Counter())
        stats.n_docs += 1
        stats.n_words += len(doc)
        # Counter.update adds the counts in place. The previous
        # `+= Counter(doc)` built a temporary Counter and re-scanned the whole
        # accumulated counter on every document; the resulting counts are the
        # same because every count added here is positive.
        stats.word_counter.update(doc)
    return class_word_counters


# class_word_counters = get_class_word_counters(X_train,Y_train)
# class_word_counters

In [13]:
def naive_bayes_predict(document,class_word_counters,n_docs,n_unique_words,alpha):
    """Predict the class of a tokenised document with multinomial Naive Bayes.

    For every class the score is

        log P(cls) + sum over words of
            log((count(word, cls) + alpha) / (n_words(cls) + alpha * n_unique_words))

    Scores are accumulated in log space: multiplying the raw probabilities
    underflows to 0.0 for long documents, which made every class tie and the
    first class win regardless of the evidence.

    Args:
        document: list of word tokens to classify. May be empty, in which
            case the class with the largest prior is returned.
        class_word_counters: dict mapping class label to an object exposing
            ``n_docs``, ``n_words`` and ``word_counter`` (indexable by word).
        n_docs: total number of training documents (denominator of the prior).
        n_unique_words: vocabulary size used by the smoothing term.
        alpha: additive smoothing factor.

    Returns:
        The class label with the highest score. Ties go to the class that
        comes first in ``class_word_counters``.

    Raises:
        ValueError: if ``class_word_counters`` is empty.
        ZeroDivisionError: if ``n_docs`` is 0, or if a class has no words and
            ``alpha * n_unique_words`` is 0 while ``alpha`` is positive.
    """
    log_scores = dict()
    for cls, class_word_counter in class_word_counters.items():
        # prior: P(cls)
        prior = class_word_counter.n_docs / n_docs
        log_score = math.log(prior) if prior > 0 else -math.inf

        n_cls = class_word_counter.n_words
        for word in document:
            n_word_cls = class_word_counter.word_counter[word]
            numerator = n_word_cls + alpha
            if numerator <= 0:
                # Zero-probability word (only possible without smoothing):
                # the class is ruled out; avoid log(0).
                log_score = -math.inf
                break
            # likelihood with smoothing factor:
            # P(word|cls) = (n_word_cls + alpha) / (n_cls + alpha * n_unique_words)
            log_score += math.log(numerator / (n_cls + (alpha * n_unique_words)))

        log_scores[cls] = log_score

    prediction = max(log_scores, key=lambda key: log_scores[key])
    return prediction


# n_docs = len(X_train)
# n_unique_words = len(unique_words)
# document = X_test[0]
# naive_bayes_predict(document,class_word_counters,n_docs,n_unique_words,1)

In [14]:
def performance_evaluation(X_train, Y_train, X_test, Y_test, alpha_vals):
    """Measure Naive Bayes accuracy on a test set for several smoothing factors.

    The class statistics are built once from the training data, then every
    test document is classified once per value in ``alpha_vals``. Progress and
    a per-alpha summary are printed.

    Args:
        X_train: training documents (lists of word tokens).
        Y_train: training labels, aligned with ``X_train``.
        X_test: documents to classify.
        Y_test: expected labels, aligned with ``X_test``.
        alpha_vals: iterable of smoothing factors to evaluate.

    Returns:
        List of accuracies in percent, one per entry of ``alpha_vals`` and in
        the same order. An empty test set yields 0.0 for every alpha (it used
        to raise ZeroDivisionError).
    """
    # get the proper input format
    class_word_counters = get_class_word_counters(X_train,Y_train)
    n_docs = len(X_train)
    n_unique_words = len({word for doc in X_train for word in doc})

    stat = []
    for alpha in alpha_vals:
        print(f"---alpha = {alpha:.2f}------")
        total, correct = len(X_test), 0
        # Report progress roughly every 20%; at least every document when the
        # test set has fewer than five documents.
        interval = max(total//5, 1)
        for cur, (doc, actual_class) in enumerate(zip(X_test, Y_test), start=1):
            prediction = naive_bayes_predict(doc, class_word_counters,
                                             n_docs, n_unique_words, alpha)

            if prediction == actual_class:
                correct += 1

            if cur % interval == 0:
                print(f"Completed: {cur*100/total:.1f}%")

        # Guard the division so an empty test set does not crash the sweep.
        accuracy = (correct*100)/(total) if total else 0.0

        print( "--------------------")
        print(f"Correct : {correct}")
        print(f"Total   : {total}")
        print(f"Accuracy: {accuracy:.2f}%")
        print( "--------------------")
        stat.append(accuracy)
    return stat

In [16]:
# paths of the training and validation splits inside base_path
train_input_file = base_path + "/train.in"
validation_input_file = base_path + "/validation.in"

# Load the training split; the two printed numbers (documents, labels)
# should match.
X_train,Y_train = get_X_Y_from(train_input_file)
print(len(X_train),len(Y_train))

# Load the validation split used for choosing the smoothing factor.
X_validation,Y_validation = get_X_Y_from(validation_input_file)
print(len(X_validation),len(Y_validation))

5500 5500
2200 2200


In [30]:
# hyper parameters: ten evenly spaced smoothing factors from 0.1 to 1.0
alpha_vals = np.linspace(0.1, 1.0, num=10)
# alpha_vals = [.9,1.0]
print(alpha_vals)

# get performance: validation accuracy (in percent) for every alpha
stats = performance_evaluation(X_train,Y_train, X_validation,Y_validation, alpha_vals)

# generate stat in markdown for NB
# The encoding is explicit so topic names read from the UTF-16 dataset are
# written the same way on every platform.
with open("NB_stat.md","w",encoding="utf-8") as out:
    print("# Naive Bayes", file=out)
    print("\n## Topics", file=out)
    # sorted(): plain set iteration order can differ between kernel sessions,
    # which would reorder the topic list every time the report is regenerated.
    for topic in sorted(set(Y_train)):
        print(f"- {topic}", file=out)

    print(f"\n## Training Data Size\n- **{len(Y_train)}** documents", file=out)
    print(f"\n## Validation Data Size\n- **{len(Y_validation)}** documents", file=out)

    print("\n## Accuracy for Different Smoothing Factors ($\\alpha$)\n", file=out)
    print("| Serial | alpha | Accuracy |",file=out)
    print("| --- | --- | --- |",file=out)

    # One table row per alpha, numbered from 1.
    for serial,(alpha,accuracy) in enumerate(zip(alpha_vals,stats), start=1):
        print(f"| {serial} | {alpha:.2f} | {accuracy:.2f}% |",file=out)

[0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1. ]
---alpha = 0.10------
Completed: 20.0%
Completed: 40.0%
Completed: 60.0%
Completed: 80.0%
Completed: 100.0%
--------------------
Correct : 1575
Total   : 2200
Accuracy: 71.59%
--------------------
---alpha = 0.20------
Completed: 20.0%
Completed: 40.0%
Completed: 60.0%
Completed: 80.0%
Completed: 100.0%
--------------------
Correct : 1576
Total   : 2200
Accuracy: 71.64%
--------------------
---alpha = 0.30------
Completed: 20.0%
Completed: 40.0%
Completed: 60.0%
Completed: 80.0%
Completed: 100.0%
--------------------
Correct : 1574
Total   : 2200
Accuracy: 71.55%
--------------------
---alpha = 0.40------
Completed: 20.0%
Completed: 40.0%
Completed: 60.0%
Completed: 80.0%
Completed: 100.0%
--------------------
Correct : 1567
Total   : 2200
Accuracy: 71.23%
--------------------
---alpha = 0.50------
Completed: 20.0%
Completed: 40.0%
Completed: 60.0%
Completed: 80.0%
Completed: 100.0%
--------------------
Correct : 1567
Total   : 2200
Accuracy: 7

In [31]:
# Colab-only: trigger a browser download of the markdown report written by
# the previous cell.
from google.colab import files
files.download('NB_stat.md') 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [33]:
# test the best performing NB
# Validation results from the alpha sweep (serial | alpha | accuracy):
# | 1 | 0.10 | 71.59% |
# | 2 | 0.20 | 71.64% |
# | 3 | 0.30 | 71.55% |
# | 4 | 0.40 | 71.23% |
# | 5 | 0.50 | 71.23% |
# | 6 | 0.60 | 71.18% |
# | 7 | 0.70 | 71.09% |
# | 8 | 0.80 | 70.82% |
# | 9 | 0.90 | 70.59% |
# | 10 | 1.00 | 70.55% |
# alpha = 0.20 has the highest validation accuracy in the table above.
best_alpha = 0.20
# Number of test files to evaluate: test_itr_0.in .. test_itr_{n_iter-1}.in
n_iter = 50

# Accuracy (in percent) of each test iteration, in file order.
NB_test_itr_accuracy = []
for itr in range(n_iter):
    print(f"---Test---Iteration {itr + 1}---")
    
    input_file = f"{base_path}/test_itr_{itr}.in"
    X_test,Y_test = get_X_Y_from(input_file)

    # NOTE: performance_evaluation rebuilds the class statistics from
    # X_train / Y_train on every call, so the training counts are recomputed
    # for each iteration. It returns one accuracy per alpha; only one alpha
    # is passed here, hence [0].
    accuracy_vals = performance_evaluation(X_train,Y_train,X_test,Y_test,[best_alpha])
    NB_test_itr_accuracy.append(accuracy_vals[0])

---Test---Iteration 1---
---alpha = 0.20------
Completed: 20.0%
Completed: 40.0%
Completed: 60.0%
Completed: 80.0%
Completed: 100.0%
--------------------
Correct : 78
Total   : 110
Accuracy: 70.91%
--------------------
---Test---Iteration 2---
---alpha = 0.20------
Completed: 20.0%
Completed: 40.0%
Completed: 60.0%
Completed: 80.0%
Completed: 100.0%
--------------------
Correct : 87
Total   : 110
Accuracy: 79.09%
--------------------
---Test---Iteration 3---
---alpha = 0.20------
Completed: 20.0%
Completed: 40.0%
Completed: 60.0%
Completed: 80.0%
Completed: 100.0%
--------------------
Correct : 85
Total   : 110
Accuracy: 77.27%
--------------------
---Test---Iteration 4---
---alpha = 0.20------
Completed: 20.0%
Completed: 40.0%
Completed: 60.0%
Completed: 80.0%
Completed: 100.0%
--------------------
Correct : 78
Total   : 110
Accuracy: 70.91%
--------------------
---Test---Iteration 5---
---alpha = 0.20------
Completed: 20.0%
Completed: 40.0%
Completed: 60.0%
Completed: 80.0%
Completed

In [34]:
# Dump the chosen smoothing factor and the per-iteration test accuracies
# (in percent) so they can be copied out of the notebook.
print("best_alpha",best_alpha)
print("NB_test_itr_accuracy = ",NB_test_itr_accuracy)

best_alpha 0.2
NB_test_itr_accuracy =  [70.9090909090909, 79.0909090909091, 77.27272727272727, 70.9090909090909, 64.54545454545455, 72.72727272727273, 73.63636363636364, 76.36363636363636, 72.72727272727273, 70.0, 71.81818181818181, 75.45454545454545, 73.63636363636364, 67.27272727272727, 70.0, 71.81818181818181, 75.45454545454545, 71.81818181818181, 70.9090909090909, 72.72727272727273, 70.0, 75.45454545454545, 78.18181818181819, 76.36363636363636, 77.27272727272727, 78.18181818181819, 74.54545454545455, 77.27272727272727, 71.81818181818181, 78.18181818181819, 78.18181818181819, 77.27272727272727, 67.27272727272727, 70.0, 70.0, 72.72727272727273, 69.0909090909091, 73.63636363636364, 77.27272727272727, 82.72727272727273, 69.0909090909091, 79.0909090909091, 71.81818181818181, 69.0909090909091, 74.54545454545455, 80.0, 70.9090909090909, 69.0909090909091, 74.54545454545455, 74.54545454545455]
