In [34]:
from collections import defaultdict
import numpy as np

In [35]:
# My initial code (no chatgpt help)

class NB1:
    """Multinomial Naive Bayes text classifier with add-one (Laplace) smoothing.

    Documents are tokenized with ``str.split()`` (whitespace only, case
    sensitive). ``train_NB`` fits the model and ``test_NB`` classifies a
    single document.
    """

    def __init__(self):

        # log P(c) for every class seen during training.
        self.logprior = defaultdict(float)
        # log P(w | c), keyed by the tuple (word, class).
        self.loglikelihood = defaultdict(float)
        # Vocabulary and label set; written by train_NB, read by test_NB.
        self.V = set()
        self.unique_classes = set()

    def train_NB(self, D: list, C: list):
        """Fit log priors and smoothed log likelihoods from labelled documents.

        Args:
            D (list of str): Training documents.
            C (list): Class label of each document, aligned with ``D`` by index.

        Returns:
            tuple: ``(logprior, loglikelihood, V)`` where ``logprior`` maps
            class -> log P(class), ``loglikelihood`` maps (word, class) ->
            log P(word | class), and ``V`` is the vocabulary set.

        Raises:
            ValueError: If ``D`` and ``C`` have different lengths.
        """
        if len(D) != len(C):
            raise ValueError(
                f"D and C must have the same length, got {len(D)} and {len(C)}"
            )

        ND = len(D)
        # Tokenize every document exactly once.
        tokenized = [d.split() for d in D]

        # Start from a clean state so re-training never keeps stale entries.
        self.logprior = defaultdict(float)
        self.loglikelihood = defaultdict(float)
        self.V = {w for words in tokenized for w in words}
        self.unique_classes = set(C)

        # Single pass over the corpus: documents, tokens and per-word counts
        # per class. Counting whole tokens (instead of str.count on the raw
        # document) avoids matching a word inside a longer word.
        doc_count = defaultdict(int)
        total_count = defaultdict(int)
        bow = {c: defaultdict(int) for c in self.unique_classes}
        for words, c in zip(tokenized, C):
            doc_count[c] += 1
            total_count[c] += len(words)
            for w in words:
                bow[c][w] += 1

        vocab_size = len(self.V)
        for c in self.unique_classes:
            self.logprior[c] = np.log(doc_count[c] / ND)

            # Add-one smoothing: every vocabulary word gets one pseudo-count.
            denominator = total_count[c] + vocab_size
            class_counts = bow[c]
            for w in self.V:
                self.loglikelihood[w, c] = np.log(
                    (class_counts.get(w, 0) + 1) / denominator
                )

        return self.logprior, self.loglikelihood, self.V

    def test_NB(self, testdoc: str):
        """Return the most likely class for ``testdoc`` and its log score.

        Words that are not in the training vocabulary are ignored.

        Args:
            testdoc (str): The document to classify.

        Returns:
            tuple: ``(class_result, max_pred)``. If the model has not been
            trained, this is ``(None, float("-inf"))``.
        """
        # Filter out-of-vocabulary words once instead of once per class.
        known_words = [w for w in testdoc.split() if w in self.V]

        max_pred, class_result = float("-inf"), None
        for c in self.unique_classes:
            pred = self.logprior[c]
            for w in known_words:
                pred += self.loglikelihood[w, c]
            if pred > max_pred:
                max_pred = pred
                class_result = c

        return class_result, max_pred

In [36]:
# code with chatgpt improvements

class NB2:
    """Multinomial Naive Bayes text classifier with add-one (Laplace) smoothing.

    Documents are tokenized with ``str.split()`` (whitespace only, case
    sensitive). The fitted parameters live on the instance: ``logprior``,
    ``loglikelihood``, ``V`` and ``unique_classes``.
    """

    def __init__(self):

        # log P(c) for every class seen during training.
        self.logprior = defaultdict(float)
        # log P(w | c), keyed by the tuple (word, class).
        self.loglikelihood = defaultdict(float)
        # Training vocabulary and the set of class labels.
        self.V = set()
        self.unique_classes = set()

    def train_NB(self, D: list, C: list):
        """
        Trains the Naive Bayes classifier by calculating log prior and log likelihood for each class.

        Args:
            D (list of str): List of training documents.
            C (list of str): Corresponding class labels for the documents.

        Returns:
            None. The results are stored on the instance:
            logprior (defaultdict): Log prior probabilities for each class.
            loglikelihood (defaultdict): Log likelihood of each word given each class (with Laplace smoothing).
            V (set): The full vocabulary of unique words across all documents.
            unique_classes (set): The distinct class labels.

        Raises:
            ValueError: If D and C have different lengths.

        Notes:
            - The log prior is based on the fraction of documents per class.
            - Log likelihood is computed with Laplace smoothing to handle zero counts.
            - Vocabulary includes all words in the training set, used for all classes.
            - Each call replaces the previous model; nothing is carried over.
        """
        # Validate before touching any state so a bad call keeps the old model.
        if len(D) != len(C):
            raise ValueError(
                f"D and C must have the same length, got {len(D)} and {len(C)}"
            )

        ND = len(D)

        # Reset so that re-training does not mix in the previous vocabulary,
        # classes or likelihood entries.
        self.logprior = defaultdict(float)
        self.loglikelihood = defaultdict(float)
        self.V = set()
        self.unique_classes = set(C)

        # One pass over the corpus instead of one pass per class.
        class_doc_count = defaultdict(int)
        class_total_words_count = defaultdict(int)
        class_bow = {c: defaultdict(int) for c in self.unique_classes}
        for doc, c in zip(D, C):
            words = doc.split()
            self.V.update(words)
            class_doc_count[c] += 1
            class_total_words_count[c] += len(words)
            bow = class_bow[c]
            for w in words:
                bow[w] += 1

        vocab_size = len(self.V)
        for c in self.unique_classes:
            self.logprior[c] = np.log(class_doc_count[c] / ND)

            # Add-one smoothing: every vocabulary word gets one pseudo-count.
            denominator = class_total_words_count[c] + vocab_size
            bow = class_bow[c]
            for w in self.V:
                self.loglikelihood[w, c] = np.log((bow.get(w, 0) + 1) / denominator)

    def test_NB(self, testdoc: str):
        """
        Predicts the most likely class for a given test document using Naive Bayes.

        Words that are not in the training vocabulary are ignored.

        Args:
            testdoc (str): The document to classify.

        Returns:
            class_result (str): The predicted class label.
            max_pred (float): The log probability of the predicted class.

        Raises:
            ValueError: If the model has not been trained (no classes known).
        """
        if not self.unique_classes:
            raise ValueError("test_NB called before train_NB: the model has no classes")

        # Filter out-of-vocabulary words once instead of once per class.
        known_words = [w for w in testdoc.split() if w in self.V]

        class_preds = {}
        for c in self.unique_classes:
            pred = self.logprior[c]
            for w in known_words:
                pred += self.loglikelihood[w, c]
            class_preds[c] = pred

        class_result, max_pred = max(class_preds.items(), key=lambda item: item[1])

        return class_result, max_pred



In [37]:
import time

In [38]:
# Toy training corpus: three short documents (D) and their class labels (C),
# paired by position, i.e. C[i] is the label of D[i].
D = ["The Helmy Omar", "Muhammad Helmy Moustafa", "Muhammad Salah Goals"]
C = ["me", "me", "not me"]

In [39]:
nb1 = NB1()

# perf_counter() is a monotonic, high-resolution clock meant for timing short
# intervals; time.time() is wall-clock time and can be coarse or jump.
st = time.perf_counter()
res1 = nb1.train_NB(D, C)
endt = time.perf_counter()

# Elapsed training time in seconds.
print(endt - st, "\n")
# (logprior, loglikelihood, V) as returned by NB1.train_NB.
res1

0.00030493736267089844 



(defaultdict(float,
             {'me': -0.40546510810816444, 'not me': -1.0986122886681098}),
 defaultdict(float,
             {('Muhammad', 'me'): -1.8718021769015913,
              ('Goals', 'me'): -2.5649493574615367,
              ('Helmy', 'me'): -1.466337068793427,
              ('The', 'me'): -1.8718021769015913,
              ('Omar', 'me'): -1.8718021769015913,
              ('Moustafa', 'me'): -1.8718021769015913,
              ('Salah', 'me'): -2.5649493574615367,
              ('Muhammad', 'not me'): -1.6094379124341003,
              ('Goals', 'not me'): -1.6094379124341003,
              ('Helmy', 'not me'): -2.3025850929940455,
              ('The', 'not me'): -2.3025850929940455,
              ('Omar', 'not me'): -2.3025850929940455,
              ('Moustafa', 'not me'): -2.3025850929940455,
              ('Salah', 'not me'): -1.6094379124341003}),
 {'Goals', 'Helmy', 'Moustafa', 'Muhammad', 'Omar', 'Salah', 'The'})

In [40]:
nb2 = NB2()

# perf_counter() is a monotonic, high-resolution clock meant for timing short
# intervals; time.time() is wall-clock time and can be coarse or jump.
st = time.perf_counter()
res2 = nb2.train_NB(D, C)
endt = time.perf_counter()

# Elapsed training time in seconds.
print(endt - st, "\n")
# NB2.train_NB has no return statement, so res2 is None and nothing is rendered
# here; the fitted values are on nb2.logprior, nb2.loglikelihood and nb2.V.
res2

0.00025773048400878906 



In [41]:
testdoc = "Muhammad Helmy Goals"

# perf_counter() is a monotonic, high-resolution clock meant for timing short
# intervals; time.time() is wall-clock time and can be coarse or jump.
st = time.perf_counter()
class_pred = nb2.test_NB(testdoc)
endt = time.perf_counter()

# Elapsed prediction time in seconds, then the (label, log score) pair.
print(endt - st, "\n")
print(class_pred)

0.00017118453979492188 

('me', -6.30855371126472)


# Load, train, and test on Arabic corpus

In [42]:
import pandas as pd

Link for the dataset: https://www.kaggle.com/datasets/mustafahowr/arabic-text-classification

In [43]:
# Load the Arabic topic-classification CSV into a DataFrame.
# NOTE(review): absolute Google Drive path — presumably only resolves in a Colab
# session with Drive mounted at /content/drive; confirm before running elsewhere.
dataset = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Speech and Language Processing/chapter_4/Arabic_classifcation.csv")
# Bare expression so the notebook renders the frame.
dataset

Unnamed: 0,Text,topic
0,استمر المنتخب اليوناني في تفوقه على نظيره المص...,sport
1,استمرار تفوق اليونان,sport
2,البطولة صعبة على جميع الفريق، وبطولات الكؤوس ل...,sport
3,إذن لماذا وافقت على القيام بالمهمة؟,sport
4,أتوجه بجزيل الشكر إلى رئيس القادسية السابق، فو...,sport
...,...,...
394,من جهة أخرى، يأمل بعض حلفاء الولايات المتحدة ...,Economy
395,وحاولت كانبرا إقناع الولايات المتحدة باستبعاد...,Economy
396,وتبرز مخاوف على صعيد السوق المحلية من أن هذه ...,Economy
397,وأوضح ستيف شوبو وزير التجارة الأسترالي لقناة ...,Economy


In [44]:
# The label column is expected to be named ' topic ' (with surrounding spaces) in
# the source CSV; rename it to 'topic' so it can be accessed without the padding.
dataset = dataset.rename(columns={' topic ': 'topic'})
# Distinct class labels present in the corpus.
dataset['topic'].unique()

array(['sport', 'Politics', 'Technology', 'Economy'], dtype=object)

In [45]:
# Number of rows per topic, to check how balanced the classes are.
dataset['topic'].value_counts()

Unnamed: 0_level_0,count
topic,Unnamed: 1_level_1
Politics,100
Technology,100
Economy,100
sport,99


In [46]:
# Untrained classifier for the Arabic topic corpus.
naive_bayes_classifier = NB2()

In [47]:
# train_NB calls C.count(c) and indexes D[i] positionally, so the pandas Series
# are converted to plain Python lists first.
naive_bayes_classifier.train_NB(list(dataset['Text']), list(dataset['topic']))

In [50]:
# English: "How many goals did Aboutrika score in his career?"
naive_bayes_classifier.test_NB("كم عدد الأهداف التي سجلها أبو تريكة في مسيرته؟")

('Politics', -36.88821674855835)

This was pretty bad.

In [52]:
# English: "The government of the Palestinian people has moved into a new transitional phase."
naive_bayes_classifier.test_NB("انتقلت حكومة الشعب الفلسطيني إلى مرحلة انتقالية جديدة")

('Technology', -23.64824632988303)

:(

In [53]:
# English: "This is the last time I use the computer."
naive_bayes_classifier.test_NB("هذه آخر مرة استعمل فيها الحاسب الآلي")

('Technology', -41.529745058543)

In [54]:
# English: "Getting paid in US dollars is better than getting paid in Egyptian pounds."
naive_bayes_classifier.test_NB("القبض بالدولار الأمريكي أفضل من القبض بالجنيه المصري")

('sport', -49.201791251649176)

# Let's try an English corpus

Link for the dataset: https://www.kaggle.com/datasets/divu2001/text-with-sentiment

In [55]:
# Load the English emotion-labelled CSV; this rebinds `dataset`, replacing the
# Arabic frame loaded earlier.
# NOTE(review): absolute Google Drive path — presumably only resolves in a Colab
# session with Drive mounted at /content/drive; confirm before running elsewhere.
dataset = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Speech and Language Processing/chapter_4/English_classification.csv")
# Bare expression so the notebook renders the frame.
dataset

Unnamed: 0,Emotion,Text
0,neutral,Why ?
1,joy,Sage Act upgrade on my to do list for tommorow.
2,sadness,ON THE WAY TO MY HOMEGIRL BABY FUNERAL!!! MAN ...
3,joy,Such an eye ! The true hazel eye-and so brill...
4,joy,@Iluvmiasantos ugh babe.. hugggzzz for u .! b...
...,...,...
34787,surprise,@MichelGW have you gift! Hope you like it! It'...
34788,joy,The world didnt give it to me..so the world MO...
34789,anger,A man robbed me today .
34790,fear,"Youu call it JEALOUSY, I call it of #Losing YO..."


In [56]:
# Distinct emotion labels present in the corpus.
dataset['Emotion'].unique()

array(['neutral', 'joy', 'sadness', 'fear', 'surprise', 'anger', 'shame',
       'disgust'], dtype=object)

In [58]:
# Number of rows per emotion, to check how balanced the classes are.
dataset['Emotion'].value_counts()

Unnamed: 0_level_0,count
Emotion,Unnamed: 1_level_1
joy,11045
sadness,6722
fear,5410
anger,4297
surprise,4062
neutral,2254
disgust,856
shame,146


In [59]:
# Start from a new, untrained classifier for the English corpus (rebinds the
# name previously used for the Arabic model).
naive_bayes_classifier = NB2()

In [60]:
# train_NB calls C.count(c) and indexes D[i] positionally, so the pandas Series
# are converted to plain Python lists first.
naive_bayes_classifier.train_NB(list(dataset['Text']), list(dataset['Emotion']))

In [61]:
# Sentence containing the emotion word "happy".
naive_bayes_classifier.test_NB("I am happy I will play football next Thursday")

('joy', -64.1453137302326)

In [63]:
# Negated phrasing ("not satisfied"); test_NB scores each whitespace token on
# its own, so "not" and "satisfied" contribute independently.
naive_bayes_classifier.test_NB("I am not satisfied with the Arabic results")

('joy', -57.27996338899456)

In [64]:
# Sentence whose cue word is "bad".
naive_bayes_classifier.test_NB("I feel bad because of the Arabic results")

('joy', -55.92252021268611)

In [66]:
# Same sentence template with "angry" as the emotion word.
naive_bayes_classifier.test_NB("I am angry because of the Arabic results")

('joy', -57.580782522382776)

In [67]:
# Same sentence template with "sad" as the emotion word.
naive_bayes_classifier.test_NB("I am sad because of the Arabic results")

('sadness', -55.34771756283331)

In [68]:
# Baseline with no emotion word at all: only "I" and "am".
naive_bayes_classifier.test_NB("I am")

('joy', -12.133585636378418)

In [69]:
# Baseline plus the single emotion word "shocked".
naive_bayes_classifier.test_NB("I am shocked")

('sadness', -23.009726209252143)

In [70]:
# Baseline plus the single emotion word "afraid".
naive_bayes_classifier.test_NB("I am afraid")

('fear', -18.814966752646733)

In [71]:
# "afraid" again, this time inside a longer sentence.
naive_bayes_classifier.test_NB("I am afraid I will keep doing bad habits")

('fear', -66.71502159364552)

In [72]:
# Emotion word "ashamed"; note that str.split() keeps "can't" as one token.
naive_bayes_classifier.test_NB("I am ashamed I can't stop bad habits")

('sadness', -61.45739837342197)

In [73]:
# Input containing the class name "shame" itself; per the value counts above,
# "shame" is the rarest class with 146 rows.
naive_bayes_classifier.test_NB("shame on me")

('joy', -22.8196242319304)

This might be because of the unbalanced data, in which the joy and sadness classes are dominant.