# Imports

In [1]:
import numpy as np
import pandas as pd
import math
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from IPython.display import display
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
pd.set_option('display.colheader_justify', 'center')
pd.set_option('display.precision', 3)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from tensorflow.keras.utils import plot_model
from IPython.display import Image

# Fetching data from imdb

In [2]:
def data_fetch():
    (x_train, y_train), (x_test, y_test) = tf.keras.datasets.imdb.load_data()

    word_index = tf.keras.datasets.imdb.get_word_index()
    index2word = dict((i + 3, word) for (word, i) in word_index.items())
    index2word[0] = '[pad]' #padding
    index2word[1] = '[bos]' #begin of sentence
    index2word[2] = '[oov]' # out of vocabulary
    x_train = np.array([' '.join([index2word[idx] for idx in text]) for text in x_train])
    x_test = np.array([' '.join([index2word[idx] for idx in text]) for text in x_test])
    return x_train, y_train, x_test, y_test

# Vectorizing examples

In [3]:
def vectorize_examples(vocabulary, x_train):
    binary_vectorizer = CountVectorizer(binary=True, vocabulary=vocabulary.keys())
    x_train_binary = binary_vectorizer.fit_transform(x_train)
    x_train_binary = x_train_binary.toarray()
    return x_train_binary

# Information Gain

In [4]:
def calculate_ig(classes_vector, feature):
        classes = set(classes_vector)

        HC = 0
        for c in classes:
            PC = list(classes_vector).count(c) / len(classes_vector)  # P(C=c)
            HC += - PC * math.log(PC, 2)  # H(C)
            # print('Overall Entropy:', HC)  # entropy for C variable

        feature_values = set(feature)  # 0 or 1 in this example
        HC_feature = 0
        for value in feature_values:
            # pf --> P(X=x)
            pf = list(feature).count(value) / len(feature)  # count occurences of value 
            indices = [i for i in range(len(feature)) if feature[i] == value]  # rows (examples) that have X=x

            classes_of_feat = [classes_vector[i] for i in indices]  # category of examples listed in indices above
            for c in classes:
                # pcf --> P(C=c|X=x)
                pcf = classes_of_feat.count(c) / len(classes_of_feat)  # given X=x, count C
                if pcf != 0: 
                    # - P(X=x) * P(C=c|X=x) * log2(P(C=c|X=x))
                    temp_H = - pf * pcf * math.log(pcf, 2)
                    # sum for all values of C (class) and X (values of specific feature)
                    HC_feature += temp_H

        ig = HC - HC_feature
        return ig

# Vocabulary

In [5]:
def create_vocabulary(x_train, m, n, k, l):
    # n most frequent
    # k less frequent
    # ig 
    
    words_frequency_Dict = dict()

    for review in x_train:
        # I need a list with the distinct words of every review
        distinct_words = set(review.split())

        for word in distinct_words:
            if word in words_frequency_Dict.keys():
                words_frequency_Dict[word] += 1
            else:
                words_frequency_Dict[word] = 1
    
    words_frequency_Dict.pop('[bos]',' ')
    words_frequency_Dict.pop('[pad]',' ')
    words_frequency_Dict.pop('[oov]',' ')
                
    # Sort words based on their frequency in descending order
    remaining_words = sorted(words_frequency_Dict.items(), key=lambda x: x[1], reverse=True)

     # Exclude the top n and bottom k words
    remaining_words = remaining_words[n:-k] if k > 0 else remaining_words[n:]

    #create new dictionary which shows the IG
    IG_Dict = dict()
    x_train_binary = vectorize_examples(remaining_words, x_train)
    for i in tqdm(range(len(remaining_words))):
        word = [example[i] for example in x_train_binary]
        IG_Dict[list(remaining_words.keys())[i]] = IG(y_train, word)

    remaining_words = sorted(IG_Dict.items(), key=lambda x: x[1])
    remaining_words = remaining_words[:l] 

   # Convert remaining_words back into a dictionary
    remaining_words_dict = dict(remaining_words)

    return remaining_words_dict

In [6]:
x_train, y_train, x_test, y_test = data_fetch()
vocabulary = create_vocabulary(x_train, 2500, 50, 85000, 1000)
print(vocabulary)

AttributeError: 'list' object has no attribute 'keys'

# Naive Bayes 

In [None]:
# class NaiveBayesCustom():

#     def __init__(self):
#         self.class0_prob = None
#         self.class1_prob = None
#         self.features_probs = None

#     def fit(self,x_train_binary, y_train):

#         # Calculate prior probabilites P(C=0) and P(C=1)
#         total_samples = len(y_train)
#         class0_samples = np.sum(y_train == 0)
#         class0_prob = class0_samples / total_samples

#         class1_prob = (total_samples - class0_samples) / total_samples

#         # !!!!!!!!!!!!!!!!!!!!!!
#         self.class0_prob = class0_prob
#         self.class1_prob = class1_prob

#         # Calculate the likelihood
#         self.feature_probs = {}

#         # Select samples belonging to class 0,1
#         X_0 = x_train_binary[y == 0]
#         X_1 = x_train_binary[y == 1]

#         # Calculate the probability of each feature being 1 given the class
#         # feature_probs_c = [np.mean(X_c[:, i]) 
#         for i in range(num_features)]
        
#         self.feature_probs[c] = feature_probs_c




#     def predict(self, x_test_binary):
