In [102]:
from sklearn.preprocessing import LabelEncoder
import numpy as np
log = np.log
α = 1

In [103]:
class BinarrayNaiveBayes:
    def fit(self, paragraphs: list[list[str]], labels: list[int]):
        unique_words = list(set([item for sublist in paragraphs for item in sublist]))

        self.label_encoder = LabelEncoder()
        self.label_encoder.fit_transform(unique_words)
        self.N = len(unique_words)

        # CALCULATING CONDITIONAL PROBABILTIES FOR WORD GIVEN BINARY CLASS LABEL (P(word|0) and P(word|1))
        self.conditional_probabilities = np.array(
            [[α for _ in range(self.N + 1)], [α for _ in range(self.N + 1)]],
            dtype=float,
        )
        # Here, we are reserving the Nth position for the conditional probability of an unkown word that isn't present in the training dataset

        for j in range(len(paragraphs)):
            words = paragraphs[j]
            class_index = labels[j]
            for i in range(len(words)):
                encoded_word = self.label_encoder.transform([words[i]])[0]
                self.conditional_probabilities[class_index][encoded_word] += 1

        self.N_0 = sum(self.conditional_probabilities[0])
        self.N_1 = sum(self.conditional_probabilities[1])

        self.conditional_probabilities[0] = self.conditional_probabilities[0] / self.N_0
        self.conditional_probabilities[1] = self.conditional_probabilities[1] / self.N_1

        # CALCULATING the probability of getting each of the binary classes from the training dataset
        self.class_probability = [
            self.N_0 / (self.N_0 + self.N_1),
            self.N_1 / (self.N_0 + self.N_1),
        ]

    def get_probabilitiy_for_class(self, class_index: int, features: list[int]):
        return (
            self.class_probability[class_index]
            * [self.conditional_probabilities[class_index][i] for i in features].prod()
        )

    def get_log_probabilitiy_for_class(self, class_index: int, features: list[int]):
        return log(self.class_probability[class_index]) + sum(
            [log(self.conditional_probabilities[class_index][i]) for i in features]
        )

    def predict(self, features: list[str]):
        features = features.copy()
        for i in range(len(features)):
            try:
                features[i] = self.label_encoder.transform([features[i]])[0]
            except ValueError as e:
                features[i] = self.N
        return (
            0
            if self.get_log_probabilitiy_for_class(0, features)
            > self.get_log_probabilitiy_for_class(1, features)
            else 1
        )
        # get_probabilitiy_for_class(0,features) > get_probabilitiy_for_class(1,features)

    def predictY(self, X: np.array):
        # same thing as predict() but for an array of x vectors(=X), instead of a one x vector
        return [self.predict(X[i]) for i in range(len(X))]

    def score(self, X, Y):
        Y_pred = np.array(self.predictY(X))
        return 1 - sum((Y - Y_pred) ** 2) / len(Y_pred)

testing

In [104]:
clf = BinarrayNaiveBayes()

clf.fit([["Money","Money","Pay"],["Hi","Dear","omg","bruh"]],[0,1])

In [105]:
print(clf.conditional_probabilities)

[[0.1        0.1        0.3        0.2        0.1        0.1
  0.1       ]
 [0.18181818 0.18181818 0.09090909 0.09090909 0.18181818 0.18181818
  0.09090909]]


In [106]:
clf.predict(["Money","omg","omg","bruh","Bruh","Bruh","Bruh"])

1

# Application

## EMAIL SPAM DETECTION USING NAIVE BAYES

In [107]:
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv("data/spam.csv").head(1000)
# df.head()

In [108]:
paragraphs = [df.Message.iloc[i].replace(".", "").split() for i in range(len(df))]
labels = [1 if df.Category.iloc[i] == "spam" else 0 for i in range(len(df))]

In [109]:
X_Train, X_Test, Y_Train,Y_test = train_test_split(paragraphs,labels, train_size=0.5)

In [110]:
clf.fit(X_Train,Y_Train)

Making a prediction

In [111]:
X_Test[2], "spam" if Y_test[2] == 1 else "not spam"

(['If',
  'u',
  'sending',
  'her',
  'home',
  'first',
  "it's",
  'ok',
  'lor',
  "I'm",
  'not',
  'ready',
  'yet'],
 'not spam')

In [112]:
"spam" if clf.predict(X_Test[2]) else "not spam"

'not spam'

Checking accuracy

In [113]:
clf.score(X_Test,Y_test)

0.904