In [7]:
# Library imports
import math
import os
import re
import string
from collections import defaultdict

import numpy as np
from sklearn import naive_bayes
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
import logging
from datetime import datetime

import numpy as np

logger = logging.getLogger(__name__)
np.random.seed(2019)  # fixed seed for NumPy's global random state

# One log file per run, named after the start time of this cell.
log_path = datetime.now().strftime('./logs/%Y-%m-%d-%H-%M-%S.log')
# basicConfig opens the log file right away and fails with FileNotFoundError if
# the directory is missing, so create it first (no-op when it already exists).
os.makedirs(os.path.dirname(log_path), exist_ok=True)
logging.basicConfig(filename=log_path, level=logging.INFO)
# Also echo log records to the console/notebook output (StreamHandler default stream).
logging.getLogger().addHandler(logging.StreamHandler())

#### Implementing a Bernoulli Naive Bayes Classifier

In [1]:
class BernoulliNB(object):
    """Bernoulli Naive Bayes classifier for binary (0/1) features and labels.

    Feature likelihoods use Laplace (add-one) smoothing; class priors are the
    plain relative class frequencies. Both attributes are ``None`` until
    :meth:`fit` has been called.
    """
    # p_y:   array of shape (1, 1, 2); after fit, p_y[0, 0, c] is P(y = c).
    # p_x_y: array of shape (n_features, 2, 2); after fit, p_x_y[j, v, c] is the
    #        smoothed estimate of P(x_j = v | y = c).
    p_y, p_x_y = None, None

    @staticmethod
    def _log2(p):
        """Return log2(p), mapping p == 0 to -inf instead of raising ValueError."""
        return math.log2(p) if p != 0 else float('-inf')

    def fit(self, x, y):
        """Estimate class priors and per-feature likelihoods from training data.

        Args:
            x: 2-D array (n_samples, n_features); each entry is truncated with
                ``int()`` and must then be 0 or 1.
            y: iterable of n_samples labels; each is truncated with ``int()``
                and must then be 0 or 1.

        Returns:
            self, so calls can be chained (``BernoulliNB().fit(x, y).predict(...)``).
        """
        self.p_y = np.zeros((1, 1, 2), dtype=np.float64)
        # Start every (feature, value, class) count at 1: Laplace smoothing.
        self.p_x_y = np.ones((x.shape[1], 2, 2), dtype=np.float64)

        for x_i, y_i in zip(x, y):
            label = int(y_i)
            self.p_y[0, 0, label] += 1  # class count, normalised to P(y) below
            for j, x_i_j in enumerate(x_i):
                self.p_x_y[j, int(x_i_j), label] += 1  # count of (x_j = v, y = c)

        # (count + 1) / (class_count + 2); p_y still holds raw class counts here
        # and broadcasts over the feature and value axes.
        self.p_x_y /= self.p_y + 2
        self.p_y /= self.p_y.sum()
        return self

    def predict(self, x):
        """Predict a 0/1 label for each row of ``x``.

        Args:
            x: iterable of feature rows laid out like the training data; entries
                are truncated with ``int()`` exactly as in :meth:`fit`, so float
                arrays holding 0.0/1.0 are accepted.

        Returns:
            1-D ``np.float64`` array with 1.0 where the log-odds of class 1 versus
            class 0 is >= 0 (ties go to class 1) and 0.0 otherwise.
        """
        # Prior log-odds is the same for every sample, so compute it once.
        # _log2 keeps this finite-or-infinite instead of raising when the
        # training labels contained only one class (prior of exactly 0 or 1).
        p_pos = self.p_y[0, 0, 1]
        prior_log_odds = self._log2(p_pos) - self._log2(1 - p_pos)

        pr = []
        for x_i in x:
            lo = prior_log_odds
            for j, x_i_j in enumerate(x_i):
                v = int(x_i_j)  # same cast as fit(); a raw float is not a valid index
                lo += math.log2(self.p_x_y[j, v, 1]) - math.log2(self.p_x_y[j, v, 0])
            pr.append(int(lo >= 0))
        return np.array(pr, dtype=np.float64)

#### Loading Data from Disk

In [3]:
def load_data(dp):
    """Read every file in directory ``dp`` and return the contents in numeric order.

    Files are sorted by the integer obtained from the file name with its last
    four characters removed (e.g. ``12.txt`` -> 12), so ``2.txt`` comes before
    ``10.txt``.

    Args:
        dp: path of the directory to read; a trailing path separator is optional.

    Returns:
        list of str: the full text of each file, in sorted order.

    Raises:
        ValueError: if a file name minus its last four characters is not an integer.
        OSError: if the directory cannot be listed or a file cannot be opened.
    """
    file_names = sorted(os.listdir(dp), key=lambda name: int(name[:-4]))
    texts = []
    for fn in file_names:
        # os.path.join gives the right path whether or not dp ends with a separator.
        with open(os.path.join(dp, fn), 'r') as f:
            texts.append(f.read())
    return texts

# Load the raw documents. dtype=str is the builtin that the removed `np.str`
# alias used to point to (deprecated in NumPy 1.20, removed in 1.24).
x_tr_pos = np.array(load_data('./dataset/train/pos/'), dtype=str)
x_tr_neg = np.array(load_data('./dataset/train/neg/'), dtype=str)
# Training set: documents from pos/ first, then neg/, with labels 1.0 and 0.0
# in the same order.
x_tr = np.concatenate((x_tr_pos, x_tr_neg), axis=0)
y_tr = np.concatenate((np.ones_like(x_tr_pos, dtype=np.float64), np.zeros_like(x_tr_neg, dtype=np.float64)), axis=0)
x_ts = np.array(load_data('./dataset/test/'), dtype=str)

# The per-class arrays are no longer needed once concatenated.
del x_tr_pos
del x_tr_neg

#### Applying Feature Construction pipeline

In [4]:
# A token is a run of word characters or one single punctuation character.
# string.punctuation contains ']', '\', '^' and '-', which are special inside a
# regex character class; re.escape keeps them literal. Unescaped, the sequence
# '\]' is read as an escaped ']', so a backslash is never matched, and the bare
# '[' triggers a "possible nested set" FutureWarning on Python 3.7+.
token_pattern = r'\w+|[%s]' % re.escape(string.punctuation)
# binary=True records presence/absence (0/1) of each 1- to 3-gram, not its count.
cnt = CountVectorizer(token_pattern=token_pattern,
                      ngram_range=(1, 3),
                      binary=True)
# NOTE: x_tr / x_ts are rebound from raw text to vectorised features here, so
# this cell cannot be re-run without first re-running the data-loading cell.
x_tr = cnt.fit_transform(x_tr)
x_ts = cnt.transform(x_ts)

#### Training the Classifier and Testing It on a Few Elements of the Test Data

In [10]:
# Fit the hand-written classifier on the first 10 feature columns (densified),
# then predict the first 100 test rows using the same 10 columns.
cl_bnb = BernoulliNB()
cl_bnb.fit(x_tr[:, :10].toarray(), y_tr)
cl_bnb_prd = cl_bnb.predict(x_ts[:100, :10].toarray())

#### Testing Our Implementation Against the sklearn Implementation

In [11]:
# Reference run: scikit-learn's BernoulliNB on exactly the same slices
# (first 10 feature columns for training, first 100 test rows for prediction).
sk_bnb = naive_bayes.BernoulliNB()
sk_bnb.fit(x_tr[:, :10].toarray(), y_tr)
sk_bnb_prd = sk_bnb.predict(x_ts[:100, :10].toarray())

In [15]:
# Compare our predictions with scikit-learn's, position by position.
mismatches = [
    idx
    for idx, (ours, theirs) in enumerate(zip(cl_bnb_prd, sk_bnb_prd))
    if ours != theirs
]
# zip() stops at the shorter input, so a length difference must be checked explicitly.
correct = len(cl_bnb_prd) == len(sk_bnb_prd) and not mismatches
# Fail loudly so that a disagreement cannot pass unnoticed as "no output".
assert correct, (
    "Predictions differ from scikit-learn: lengths {n_ours} vs {n_theirs}, "
    "mismatching indices {idx}".format(
        n_ours=len(cl_bnb_prd), n_theirs=len(sk_bnb_prd), idx=mismatches)
)
print("Implementation Verified")

Implementation Verified


**Our implementation of Bernoulli Naive Bayes gives the same results as the implementation from the Python scikit-learn library, which confirms its correctness.**