In [2]:
import tensorflow as tf
import numpy as np

# Load the IMDB reviews as sequences of integer word ids, keeping only ids
# below 4000 (num_words).
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.imdb.load_data(num_words=4000)

# Build the id -> word lookup. The ids stored in the reviews are shifted by 3
# relative to the ranks returned by get_word_index(); ids 0, 1 and 2 are given
# placeholder tokens here.
word_index = tf.keras.datasets.imdb.get_word_index()
index2word = {rank + 3: word for word, rank in word_index.items()}
index2word.update({0: '[pad]', 1: '[bos]', 2: '[oov]'})

# Replace every id sequence with a single space-separated string of words.
x_train = np.array([' '.join(index2word[idx] for idx in text) for text in x_train])
x_test = np.array([' '.join(index2word[idx] for idx in text) for text in x_test])


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


In [3]:
# Collect the distinct whitespace-separated tokens of the training reviews.
#
# The set is filled incrementally, so no intermediate list holding every token
# occurrence is built. It is then sorted into a list: iterating a set of
# strings yields an order that can change from one interpreter session to the
# next, which would make any per-word feature column derived from the
# vocabulary refer to a different word on each run.
unique_tokens = set()
for text in x_train:
  unique_tokens.update(text.split())

vocabulary = sorted(unique_tokens)
print(len(vocabulary))

3998


In [4]:
from tqdm import tqdm

def binarize_texts(texts, vocab):
  """Turn texts into a binary word-presence matrix.

  Args:
    texts: iterable of strings; each is split on whitespace into tokens.
    vocab: iterable of tokens; it is iterated once per text, and its
      iteration order defines the column order of the result.

  Returns:
    np.ndarray with one row per text and one column per vocabulary token,
    holding 1 where the token occurs in the text and 0 otherwise.
  """
  rows = list()
  for text in tqdm(texts):
    # A set makes each membership test O(1); scanning the token list for
    # every vocabulary word is what made the original loops take minutes.
    present = set(text.split())
    rows.append([1 if vocab_token in present else 0 for vocab_token in vocab])
  return np.array(rows)


# Encode both splits against the same vocabulary so their columns line up.
x_train_binary = binarize_texts(x_train, vocabulary)
x_test_binary = binarize_texts(x_test, vocabulary)

100%|██████████| 25000/25000 [07:21<00:00, 56.58it/s]
100%|██████████| 25000/25000 [07:06<00:00, 58.66it/s]


Calculating the general probability P(Xi=1|C=1)


In [None]:
# Smoothed log-score, per feature column i, for X_i = 1 among reviews with
# y_train == 1.
#
# For each column, count the class-1 reviews whose entry is 1; add one to each
# count and divide by the sum of the counts over all columns plus 4000
# (presumably the num_words cap used when loading the data - confirm).
# NOTE(review): the denominator is shared by all columns instead of being the
# number of class-1 reviews, so this is not the per-feature Bernoulli estimate
# (count + 1) / (n_class + 2). The formula is kept unchanged so this table
# stays consistent with the other three tables used by predict().
#
# The counting is done with numpy instead of a nested Python loop, and no
# variable named `sum` is created, so the builtin stays usable.
in_class_X1C1 = np.asarray(y_train) == 1
counts_X1C1 = (np.asarray(x_train_binary) == 1)[in_class_X1C1].sum(axis=0)
total_X1C1 = counts_X1C1.sum()
prop_list_X1C1 = np.log((counts_X1C1 + 1) / (total_X1C1 + 4000)).tolist()
print(prop_list_X1C1)

[-8.701477873781075, -10.386265223456961, -9.748998501291473, -9.523558737716913, -6.962144201892869, -9.16505061269652, -8.151121411258448, -6.767738260346016, -9.210960314000598, -10.056023536586386, -9.760559323692549, -9.6823071267928, -8.891896640807648, -9.760559323692549, -10.025251877919631, -9.523558737716913, -9.910841526741887, -10.32308632183543, -8.72596889378937, -10.364759018235999, -8.746846478810925, -8.446585624152284, -9.693118042897016, -10.120562057723957, -10.04051935005042, -10.477237001662688, -8.590889113758719, -11.170384182222634, -10.28308098722173, -9.016409588680231, -10.104032755772746, -9.808187372681804, -9.4108855752128, -10.025251877919631, -9.726270250213917, -8.315009266363566, -10.718399058479577, -7.685487781683832, -9.748998501291473, -9.4108855752128, -10.225922573381782, -8.70551828331808, -8.6239196394352, -9.061615025448278, -9.038756887372227, -9.570515720804684, -9.924447178797665, -9.884173279659725, -8.365581839999198, -10.104032755772746

Calculating the general probability P(Xi=1|C=0)

In [None]:
# Smoothed log-score, per feature column i, for X_i = 1 among reviews with
# y_train == 0.
#
# For each column, count the class-0 reviews whose entry is 1; add one to each
# count and divide by the sum of the counts over all columns plus 4000
# (presumably the num_words cap used when loading the data - confirm).
# NOTE(review): the denominator is shared by all columns instead of being the
# number of class-0 reviews, so this is not the per-feature Bernoulli estimate
# (count + 1) / (n_class + 2). The formula is kept unchanged so this table
# stays consistent with the other three tables used by predict().
#
# The counting is done with numpy instead of a nested Python loop, and no
# variable named `sum` is created, so the builtin stays usable.
in_class_X1C0 = np.asarray(y_train) == 0
counts_X1C0 = (np.asarray(x_train_binary) == 1)[in_class_X1C0].sum(axis=0)
total_X1C0 = counts_X1C0.sum()
prop_list_X1C0 = np.log((counts_X1C0 + 1) / (total_X1C0 + 4000)).tolist()
print(prop_list_X1C0)

[-9.415904617703728, -9.910600859539835, -10.100954588031053, -9.49189052468165, -6.8756478728325625, -9.051939240502316, -7.980046098567716, -6.893759622775609, -9.612968456234887, -9.42406792834289, -9.545957745951926, -10.185037705241594, -8.981064900915658, -9.897355632789814, -10.100954588031053, -9.483156844712894, -9.47449878196978, -9.937629531927755, -9.080594496262693, -11.13704651971783, -8.747450049734153, -9.063302999152631, -9.171843167727838, -9.871380146386553, -9.809248365279547, -10.117215108902833, -9.184663856156899, -9.018602820234724, -8.663568565753453, -9.622918787088054, -10.117215108902833, -10.167645962529726, -10.39944757658705, -9.979593731026787, -9.706300396027105, -7.817914091109978, -10.672740911586732, -7.806466705269627, -8.895370179810776, -11.13704651971783, -9.603116159791874, -9.086425416573485, -8.428996318615619, -8.880981442358676, -8.839017243259645, -9.937629531927755, -10.239104926511871, -9.773741676822638, -8.857450944948482, -10.133744410

Calculating the general probability P(Xi=0|C=1)

In [None]:
# Smoothed log-score, per feature column i, for X_i = 0 among reviews with
# y_train == 1.
#
# For each column, count the class-1 reviews whose entry is 0; add one to each
# count and divide by the sum of the counts over all columns plus 4000
# (presumably the num_words cap used when loading the data - confirm).
# NOTE(review): the denominator is shared by all columns instead of being the
# number of class-1 reviews, so this is not the per-feature Bernoulli estimate
# (count + 1) / (n_class + 2). The formula is kept unchanged so this table
# stays consistent with the other three tables used by predict().
#
# The counting is done with numpy instead of a nested Python loop, and no
# variable named `sum` is created, so the builtin stays usable. The original
# normalisation loop also took its length from prop_list_X1C0, a table built
# in a different cell; this version depends only on its own counts.
in_class_X0C1 = np.asarray(y_train) == 1
counts_X0C1 = (np.asarray(x_train_binary) == 0)[in_class_X0C1].sum(axis=0)
total_X0C1 = counts_X0C1.sum()
prop_list_X0C1 = np.log((counts_X0C1 + 1) / (total_X0C1 + 4000)).tolist()

print(prop_list_X0C1)

[-8.283303267830611, -8.266953247602377, -8.270250263173944, -8.272023885051336, -8.383111880771798, -8.275823558162369, -8.298266960668526, -8.410813976740782, -8.275256733579397, -8.268399379452376, -8.27016971869321, -8.270733666341592, -8.279800351943763, -8.27016971869321, -8.268560189937567, -8.272023885051336, -8.269203690603273, -8.267194124395063, -8.282813751626009, -8.26703353341993, -8.28240600442362, -8.289196229719781, -8.270653082917558, -8.268077836041245, -8.268479781462469, -8.266632168774766, -8.285672646773138, -8.264948194197757, -8.267354741163798, -8.277850558491542, -8.268158212201719, -8.26984760562874, -8.273073414877624, -8.268560189937567, -8.270411371600263, -8.29289704400985, -8.265910118119624, -8.319616509477195, -8.270250263173944, -8.273073414877624, -8.267595714698038, -8.283221665153999, -8.2849367213137, -8.277201471247658, -8.27752596220532, -8.271620512854346, -8.269123230370699, -8.269364630492571, -8.291415074365988, -8.268158212201719, -8.27396

Calculating the general probability P(Xi=0|C=0)

In [None]:
# Smoothed log-score, per feature column i, for X_i = 0 among reviews with
# y_train == 0.
#
# For each column, count the class-0 reviews whose entry is 0; add one to each
# count and divide by the sum of the counts over all columns plus 4000
# (presumably the num_words cap used when loading the data - confirm).
# NOTE(review): the denominator is shared by all columns instead of being the
# number of class-0 reviews, so this is not the per-feature Bernoulli estimate
# (count + 1) / (n_class + 2). The formula is kept unchanged so this table
# stays consistent with the other three tables used by predict().
#
# The counting is done with numpy instead of a nested Python loop, and no
# variable named `sum` is created, so the builtin stays usable.
in_class_X0C0 = np.asarray(y_train) == 0
counts_X0C0 = (np.asarray(x_train_binary) == 0)[in_class_X0C0].sum(axis=0)
total_X0C0 = counts_X0C0.sum()
prop_list_X0C0 = np.log((counts_X0C0 + 1) / (total_X0C0 + 4000)).tolist()
print(prop_list_X0C0)

[-8.272746126153393, -8.268876089879209, -8.267830527392032, -8.272019352592109, -8.396139019061431, -8.277117894774312, -8.305091746816718, -8.393583340415926, -8.27097050018243, -8.272665347446615, -8.27153513016315, -8.26742867888351, -8.278173218166488, -8.268956563061563, -8.267830527392032, -8.272100079131002, -8.272180812187196, -8.26871516293969, -8.276712297526812, -8.264620251761784, -8.28224256536257, -8.276955636133684, -8.275496491905923, -8.269117528856679, -8.26952005673717, -8.267750144770712, -8.275334496117459, -8.277604828723229, -8.28395664160969, -8.270889864775379, -8.267750144770712, -8.267509035667675, -8.266545180171368, -8.268473821077775, -8.27024501548655, -8.312713543418663, -8.265662461340227, -8.313302248704579, -8.27955493903491, -8.264620251761784, -8.271051142092075, -8.276631197814673, -8.289609382668342, -8.279798970434827, -8.280531422146108, -8.26871516293969, -8.26718764726703, -8.269761651262202, -8.280205821817631, -8.267669768610238, -8.2688760

In [None]:
from sklearn.metrics import classification_report
def predict(features=None, log_probs=None):
  """Label binary feature rows with the hand-computed naive Bayes tables.

  For every row, a score is accumulated for each class: it starts at
  log(0.5) and, for each column, adds the "X=1" table entry when the row holds
  1 in that column and the "X=0" table entry otherwise.

  Args:
    features: 2-D array-like of feature rows. Defaults to the notebook's
      global `x_test_binary`.
    log_probs: optional 4-tuple of per-column log tables in the order
      (X=1 | class 1, X=1 | class 0, X=0 | class 1, X=0 | class 0).
      Defaults to the globals `prop_list_X1C1`, `prop_list_X1C0`,
      `prop_list_X0C1`, `prop_list_X0C0`.

  Returns:
    list of int: 1 where the class-1 score is strictly greater than the
    class-0 score, otherwise 0 (ties are labelled 0).

  Raises:
    ValueError: if the four tables differ in length, or a row's length does
      not match the tables.
  """
  if features is None:
    features = x_test_binary
  if log_probs is None:
    log_probs = (prop_list_X1C1, prop_list_X1C0, prop_list_X0C1, prop_list_X0C0)

  tables = [np.asarray(table, dtype=float) for table in log_probs]
  if len({table.shape for table in tables}) != 1:
    raise ValueError("all four log-probability tables must have the same length")
  lp_x1_pos, lp_x1_neg, lp_x0_pos, lp_x0_neg = tables

  log_prior = np.log(0.5)
  y_predict = list()
  for row in np.asarray(features):
    if row.shape != lp_x1_pos.shape:
      raise ValueError("feature row length does not match the log-probability tables")
    # Any value other than 1 is treated as "word absent", like the original
    # if/else on test[i] == 1.
    present = row == 1
    prop_pos = log_prior + np.where(present, lp_x1_pos, lp_x0_pos).sum()
    prop_neg = log_prior + np.where(present, lp_x1_neg, lp_x0_neg).sum()
    y_predict.append(1 if prop_pos > prop_neg else 0)
  return y_predict
    
# Compare the labels returned by predict() with y_test and print scikit-learn's
# classification report for the two classes 0 and 1.
print(classification_report(y_test, predict(), labels=[0, 1]))

              precision    recall  f1-score   support

           0       0.83      0.85      0.84     12500
           1       0.85      0.83      0.84     12500

    accuracy                           0.84     25000
   macro avg       0.84      0.84      0.84     25000
weighted avg       0.84      0.84      0.84     25000



In [None]:
from sklearn.naive_bayes import BernoulliNB
# Reference model: fit scikit-learn's BernoulliNB on the same binary features
# and print its classification report on the test split, for comparison with
# the hand-written classifier.
nb = BernoulliNB()
nb.fit(X=x_train_binary, y=y_train)

from sklearn.metrics import classification_report
print(classification_report(y_true=y_test, y_pred=nb.predict(x_test_binary)))

              precision    recall  f1-score   support

           0       0.84      0.84      0.84     12500
           1       0.84      0.84      0.84     12500

    accuracy                           0.84     25000
   macro avg       0.84      0.84      0.84     25000
weighted avg       0.84      0.84      0.84     25000

