In [2]:
import tensorflow as tf
import numpy as np

(x_train, y_train), (x_test, y_test) = tf.keras.datasets.imdb.load_data(num_words=4000)

word_index = tf.keras.datasets.imdb.get_word_index()
index2word = dict((i + 3, word) for (word, i) in word_index.items())
index2word[0] = '[pad]'
index2word[1] = '[bos]'
index2word[2] = '[oov]'
x_train = np.array([' '.join([index2word[idx] for idx in text]) for text in x_train])
x_test = np.array([' '.join([index2word[idx] for idx in text]) for text in x_test])


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


In [3]:
vocabulary = list()
for text in x_train:
  tokens = text.split()
  vocabulary.extend(tokens)

vocabulary = set(vocabulary)
print(len(vocabulary))

3998


In [4]:
from tqdm import tqdm

x_train_binary = list()
x_test_binary = list()

for text in tqdm(x_train):
  tokens = text.split()
  binary_vector = list()
  for vocab_token in vocabulary:
    if vocab_token in tokens:
      binary_vector.append(1)
    else:
      binary_vector.append(0)
  x_train_binary.append(binary_vector)

x_train_binary = np.array(x_train_binary)

for text in tqdm(x_test):
  tokens = text.split()
  binary_vector = list()
  for vocab_token in vocabulary:
    if vocab_token in tokens:
      binary_vector.append(1)
    else:
      binary_vector.append(0)
  x_test_binary.append(binary_vector)

x_test_binary = np.array(x_test_binary)

100%|██████████| 25000/25000 [07:21<00:00, 56.58it/s]
100%|██████████| 25000/25000 [07:06<00:00, 58.66it/s]


Calculation the general propabilite of P(Xi=1|C=1)


In [None]:
prop_list_X1C1=list()
sum=0
for i in range(len(vocabulary)):
  count_1=0
  for j in range(len(x_train_binary)):
    if(x_train_binary[j][i]==1 and y_train[j]==1):
      count_1=count_1+1
      sum+=1
  prop_list_X1C1.append(count_1)
for i in range(len(prop_list_X1C1)):
  prop_list_X1C1[i]=np.log((prop_list_X1C1[i]+1)/(sum+4000))
print(prop_list_X1C1)

Calculation the general propabilite of P(Xi=1|C=0)

In [None]:
prop_list_X1C0=list()
sum=0
for i in range(len(vocabulary)):
  count_1=0
  for j in range(len(x_train_binary)):
    if(x_train_binary[j][i]==1 and y_train[j]==0):
      count_1=count_1+1
      sum+=1
  prop_list_X1C0.append(count_1)
for i in range(len(prop_list_X1C0)):
  prop_list_X1C0[i]=np.log((prop_list_X1C0[i]+1)/(sum+4000))
print(prop_list_X1C0)

Calculation the general propabilite of P(Xi=0|C=1)

In [None]:
prop_list_X0C1=list()
sum=0
for i in range(len(vocabulary)):
  count_1=0
  for j in range(len(x_train_binary)):
    if(x_train_binary[j][i]==0 and y_train[j]==1):
      count_1=count_1+1
      sum+=1
  prop_list_X0C1.append(count_1)
  
for i in range(len(prop_list_X1C0)):
  prop_list_X0C1[i]=np.log((prop_list_X0C1[i]+1)/((sum+4000)))
  
print(prop_list_X0C1)

Calculation the general propabilite of P(Xi=0|C=0)

In [None]:
prop_list_X0C0=list()
sum=0
for i in range(len(vocabulary)):
  count_1=0
  for j in range(len(x_train_binary)):
    if(x_train_binary[j][i]==0 and y_train[j]==0):
      count_1=count_1+1
      sum+=1
  prop_list_X0C0.append(count_1)
for i in range(len(prop_list_X0C0)):
  prop_list_X0C0[i]=np.log((prop_list_X0C0[i]+1)/(sum+4000))
print(prop_list_X0C0)

In [None]:
from sklearn.metrics import classification_report
def predict():
  y_predict=list()
  count_neg=0
  count_pos=0
  for test in x_test_binary:
    prop_pos=np.log(0.5)
    prop_neg=np.log(0.5)
    for i in range(len(test)):
      if(test[i]==1):
        prop_pos+=(prop_list_X1C1[i])
        prop_neg+=(prop_list_X1C0[i])
      else:
        prop_pos+=(prop_list_X0C1[i])
        prop_neg+=(prop_list_X0C0[i])

    if ((prop_pos)>(prop_neg)):
      y_predict.append(1)
    elif((prop_neg)>(prop_pos)):
      y_predict.append(0)
    else:
      y_predict.append(0)
  return y_predict    
    
print(classification_report(y_test, predict(), labels=[0, 1]))

              precision    recall  f1-score   support

           0       0.83      0.85      0.84     12500
           1       0.85      0.83      0.84     12500

    accuracy                           0.84     25000
   macro avg       0.84      0.84      0.84     25000
weighted avg       0.84      0.84      0.84     25000



In [None]:
from sklearn.naive_bayes import BernoulliNB
nb= BernoulliNB()

nb.fit(x_train_binary,y_train)

from sklearn.metrics import classification_report
print(classification_report(y_test, nb.predict(x_test_binary)))

              precision    recall  f1-score   support

           0       0.84      0.84      0.84     12500
           1       0.84      0.84      0.84     12500

    accuracy                           0.84     25000
   macro avg       0.84      0.84      0.84     25000
weighted avg       0.84      0.84      0.84     25000

