# Imports

In [40]:
import numpy as np
import pandas as pd
import math
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from IPython.display import display
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
pd.set_option('display.colheader_justify', 'center')
pd.set_option('display.precision', 3)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from tensorflow.keras.utils import plot_model
from IPython.display import Image

# Fetching data from imdb

In [41]:
def data_fetch():
    (x_train, y_train), (x_test, y_test) = tf.keras.datasets.imdb.load_data()

    word_index = tf.keras.datasets.imdb.get_word_index()
    index2word = dict((i + 3, word) for (word, i) in word_index.items())
    index2word[0] = '[pad]' #padding
    index2word[1] = '[bos]' #begin of sentence
    index2word[2] = '[oov]' # out of vocabulary
    x_train = np.array([' '.join([index2word[idx] for idx in text]) for text in x_train])
    x_test = np.array([' '.join([index2word[idx] for idx in text]) for text in x_test])
    return x_train, y_train, x_test, y_test

# Vectorizing examples

In [42]:
# def vectorize_examples(vocabulary, x_train):
#     binary_vectorizer = CountVectorizer(binary=True, vocabulary=vocabulary.keys())
#     x_train_binary = binary_vectorizer.fit_transform(x_train)
#     x_train_binary = x_train_binary.toarray()
#     return x_train_binary

def vectorize_examples(vocabulary, x_train):
    binary_vectorizer = CountVectorizer(binary=True, vocabulary=vocabulary.keys())
    x_train_binary = binary_vectorizer.fit_transform(x_train)
    x_train_binary = x_train_binary.toarray()
    return x_train_binary

# Information Gain

In [43]:
def calculate_ig(classes_vector, feature):
        classes = set(classes_vector)

        HC = 0
        for c in classes:
            PC = list(classes_vector).count(c) / len(classes_vector)  # P(C=c)
            HC += - PC * math.log(PC, 2)  # H(C)
            # print('Overall Entropy:', HC)  # entropy for C variable

        feature_values = set(feature)  # 0 or 1 in this example
        HC_feature = 0
        for value in feature_values:
            # pf --> P(X=x)
            pf = list(feature).count(value) / len(feature)  # count occurences of value 
            indices = [i for i in range(len(feature)) if feature[i] == value]  # rows (examples) that have X=x

            classes_of_feat = [classes_vector[i] for i in indices]  # category of examples listed in indices above
            for c in classes:
                # pcf --> P(C=c|X=x)
                pcf = classes_of_feat.count(c) / len(classes_of_feat)  # given X=x, count C
                if pcf != 0: 
                    # - P(X=x) * P(C=c|X=x) * log2(P(C=c|X=x))
                    temp_H = - pf * pcf * math.log(pcf, 2)
                    # sum for all values of C (class) and X (values of specific feature)
                    HC_feature += temp_H

        ig = HC - HC_feature
        return ig

# Vocabulary

In [44]:
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm

def create_vocabulary(x_train,y_train, n, k, m, l):
    words_frequency_dict = dict()

    for review in x_train:
        distinct_words = set(review.split())

        for word in distinct_words:
            if word in words_frequency_dict:
                words_frequency_dict[word] += 1
            else:
                words_frequency_dict[word] = 1
    
    # Remove specific words from the dictionary
    for special_word in ['[bos]', '[pad]', '[oov]']:
        words_frequency_dict.pop(special_word, None)
   


    # Sort words based on their frequency in descending order
    remaining_words = sorted(words_frequency_dict.items(), key=lambda x: x[1], reverse=True)
    # Exclude the top n and bottom k words
    remaining_words = remaining_words[n:-k] 
    # Convert remaining_words back into a dictionary
    remaining_words = dict(remaining_words)

    # Create a new dictionary which shows the IG
    IG_dict = dict()
    x_train_binary = vectorize_examples(remaining_words, x_train)
    

    for i in tqdm(range(len(remaining_words))):
        # word = [example[i] for example in x_train_binary.T]
        word = [example[i] for example in x_train_binary]
        IG_dict[list(remaining_words.keys())[i]] = calculate_ig(y_train, word)

    # Sort words based on Information Gain in ascending order
    remaining_words = sorted(IG_dict.items(), key=lambda x: x[1], reverse=True)
    # Select the top l words
    remaining_words = remaining_words[:l] 
    # Convert remaining_words back into a dictionary
    remaining_words_dict = dict(remaining_words)

    return remaining_words_dict


In [45]:
x_train, y_train, x_test, y_test = data_fetch()
vocabulary = create_vocabulary(x_train,y_train, 50, 85000, 2500, 1000)
x_train_binary = vectorize_examples(vocabulary, x_train)
x_test_binary = vectorize_examples(vocabulary, x_test)
print(x_train_binary.shape)

100%|██████████| 3026/3026 [00:32<00:00, 94.46it/s] 


(25000, 1000)


# Naive Bayes 

In [98]:
class NaiveBayesCustom():

    def __init__(self):
        self.class0_prob = None
        self.class1_prob = None
        self.features_probs = None

    def fit(self,x_train_binary, y_train):

        # Calculate prior probabilites P(C=0) and P(C=1)
        total_samples = len(y_train)
        class0_samples = np.sum(y_train == 0)
        class0_prob = class0_samples / total_samples
        class1_prob = (total_samples - class0_samples) / total_samples

        self.class0_prob = class0_prob
        self.class1_prob = class1_prob

        # Calculate the likelihood
        self.feature_probs = np.zeros((2,x_train_binary.shape[1]))

        # Select samples belonging to class 0,1
        X_0 = []
        X_1 = []  #alliws np.array

        for i in range(x_train_binary.shape[1]):
            if y_train[i] == 0:
                X_0.append(x_train_binary[i])
            else:
                X_1.append(x_train_binary[i])
            
        # Convert lists to numpy arrays
        X_0 = np.array(X_0)
        X_1 = np.array(X_1)

        # Calculate the probability of each feature being 0 given the class
        self.feature_probs[0] = (X_0.sum(axis=0) + 1) / (len(X_0) + 2)   
        # Calculate the probability of each feature being 1 given the class
        self.feature_probs[1] = (X_1.sum(axis=0) + 1) / (len(X_1) + 2)   
        

    def predict(self, x_test_binary):
        
        sum_prob0=0
        sum_prob1=0

        num_features = x_test_binary.shape[1]
        y_predict =[]

        # Calculating P(C=1 | x_test_binary) and P(C=0 | x_test_binary)
        
        for x_test in x_test_binary:
            # sum_prob0 = sum( math.log(self.feature_probs[0][i]) if x_test[i] == 1 else  math.log(1-self.feature_probs[0][i]) for i in range(num_features) )
            # sum_prob1 = sum( math.log(self.feature_probs[1][i]) if x_test[i] == 1 else  math.log(1-self.feature_probs[1][i]) for i in range(num_features) )

            # sum_prob1 = math.log(self.class1_prob) + sum_prob1
            # sum_prob0 = math.log(self.class0_prob) + sum_prob0

            feature_prob_0 = np.log(self.feature_probs[0])
            feature_prob_1 = np.log(self.feature_probs[1])
            feature_prob_0 = np.sum(feature_prob_0 * x_test + np.log(1 - np.exp(feature_prob_0) * x_test), axis=0)
            feature_prob_1 = np.sum(feature_prob_1 * x_test + np.log(1 - np.exp(feature_prob_1) * x_test), axis=0)
            sum_prob0 = np.log(self.class0_prob) + feature_prob_0
            sum_prob1 = np.log(self.class1_prob) +feature_prob_1

            if (sum_prob1 > sum_prob0):
                y_predict.append(1)
            elif(sum_prob1 < sum_prob0):
                y_predict.append(0)
            else:
                y_predict.append(1 if self.class1_prob > self.class0_prob else 0)


        return y_predict


# Training And Testing - Naive Bayes

                    # 1. Custom Naive Bayes
                

In [99]:
nbc = NaiveBayesCustom()
nbc.fit(x_train_binary, y_train)
print(classification_report(y_train, nbc.predict(x_train_binary),zero_division=1))
print(classification_report(y_test, nbc.predict(x_test_binary), zero_division=1))

              precision    recall  f1-score   support

           0       0.82      0.88      0.85     12500
           1       0.87      0.81      0.84     12500

    accuracy                           0.85     25000
   macro avg       0.85      0.85      0.84     25000
weighted avg       0.85      0.85      0.84     25000

              precision    recall  f1-score   support

           0       0.81      0.88      0.85     12500
           1       0.87      0.80      0.83     12500

    accuracy                           0.84     25000
   macro avg       0.84      0.84      0.84     25000
weighted avg       0.84      0.84      0.84     25000



                        # 2. Scikit-Learn

In [49]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()
nb.fit(x_train_binary, y_train)
print(classification_report(y_train, nb.predict(x_train_binary),
                            zero_division=1))
print(classification_report(y_test, nb.predict(x_test_binary),
                            zero_division=1))

              precision    recall  f1-score   support

           0       0.83      0.85      0.84     12500
           1       0.85      0.82      0.83     12500

    accuracy                           0.84     25000
   macro avg       0.84      0.84      0.84     25000
weighted avg       0.84      0.84      0.84     25000

              precision    recall  f1-score   support

           0       0.81      0.84      0.82     12500
           1       0.84      0.80      0.82     12500

    accuracy                           0.82     25000
   macro avg       0.82      0.82      0.82     25000
weighted avg       0.82      0.82      0.82     25000



# Logistic Regression

In [None]:
class LogisticRegression():
    def __init__(self, n_iters, learning_rate, regularizator):
        self.n_iters = n_iters
        self.learing_rate = learning_rate
        self.regularizator = regularizator
        self.weights = None

    def sigmoid(t):
        return 1 / (1 + np.exp(-t))
    
    def fit(self,x_train_binary, y_train):
        self.weights = np.random.rand(1, x_train_binary.shape[1])

        i=1
        s=0

        
        while (i<200):
            #-------SHUFFLE-------------
            # Create a permutation index
            permutation_index = np.random.permutation(len(y_train))

            # Use the permutation index to shuffle both arrays
            shuffled_y_train = y_train[permutation_index]
            shuffled_x_train_binary = x_train_binary[permutation_index]

            

# Training And Testing Logistic Regression

In [None]:
# from sklearn.linear_model import LogisticRegression

# #Algorithm to use in the optimization problem.
# #Each solver tries to find the parameter weights that minimize a cost function
# log = LogisticRegression()
# log.fit(x_train_imdb_binary, y_train_imdb)

# from sklearn.metrics import classification_report
# print(classification_report(y_test_imdb, log.predict(x_test_imdb_binary)))