#My Bayesian Text Classifier

In [390]:
class My_NB_Text_Classifier:

  def __init__(self, sens, stop_words):

    self.sens = {i.lower():sens[i] for i in sens}
    self.stw = stop_words
    self.prob_positive = sum(sens.values()) / len(sens)
    self.prob_negative = 1 - self.prob_positive
    self.positives, self.positive_words_p_hats = {}, {}
    self.negatives, self.negative_words_p_hats = {}, {}
    self.extra = list("!@#$%^&*()./-+~0123456789")

  def _remove_spaces_of_lists(self, words):

    while '' in words:

      words.pop(words.index(''))

    return words

  def _clean_and_remove_stop_words_(self, sen):

    #Cleaning
    for sym in self.extra:

      sen = sen.replace(sym, "")

    #Extract words
    words = sen.split(" ")

    #removing stop word
    for w in words:

      if w in self.stw:

        words[words.index(w)] = ""

    return words

  def _add_word_to_pn(self, words, label):

    if label:

      for w in words:

        if w in self.positives:

          self.positives[w] += 1

        else:

          self.positives[w] = 1

    else:

      for w in words:

        if w in self.negatives:

          self.negatives[w] += 1

        else:

          self.negatives[w] = 1

  def fit(self):

    for key in self.sens:

      sentence, label = key, self.sens[key]

      words = self._clean_and_remove_stop_words_(sentence)

      self._add_word_to_pn(words, label)

    self.positives.pop('')
    self.negatives.pop('')

  def predict(self, new_sen, verbose=True):

    positives_copy, negatives_copy = self.positives, self.negatives

    new_sen = new_sen.lower()

    ws = self._clean_and_remove_stop_words_(new_sen)
    
    ws = self._remove_spaces_of_lists(ws)

    self.being_positive, self.being_negative = 1, 1

    #Evaluate of being POSITIVE
    for w in ws:

      if w not in positives_copy:

        positives_copy[w] = 0

    #Pˆ(wk |vj) = (nk + 1) / (n + |Vocabulary|)
    positive_length = sum(positives_copy.values())
    positive_vocab_length = len(positives_copy.values())

    for wp in positives_copy:

      self.positive_words_p_hats[wp] = (positives_copy[wp] + 1) / (positive_length + positive_vocab_length)

    for w in ws:

      self.being_positive *= self.positive_words_p_hats[w]


    #Evaluate of being NEGATIVE
    for w in ws:

      if w not in negatives_copy:

        negatives_copy[w] = 0

    #Pˆ(wk |vj) = (nk + 1) / (n + |Vocabulary|)
    negative_length = sum(negatives_copy.values())
    negative_vocab_length = len(negatives_copy.values())

    for wp in negatives_copy:

      self.negative_words_p_hats[wp] = (negatives_copy[wp] + 1) / (negative_length + negative_vocab_length)

    for w in ws:

      self.being_negative *= self.negative_words_p_hats[w]

    self.prob_of_being_postive = self.being_positive / (self.being_positive + self.being_negative)
    self.prob_of_being_negative = 1 - self.prob_of_being_postive


    if self.being_positive > self.being_negative:

      if verbose:

        print(f"POSITIVE\nProbability(positive): {self.prob_of_being_postive:.3f}\nProbability(negative): {self.prob_of_being_negative:.3f}")

      return 1

    else:

      if verbose:

        print(f"NEGATIVE\nProbability(positive): {self.prob_of_being_postive:.3f}\nProbability(negative): {self.prob_of_being_negative:.3f}")

      return 0

#Testing on a Small Training Set

In [391]:
# Words ignored by the classifier. Every entry is lower-case, matching the
# lower-cased sentences the classifier compares them against.
stop_words = ['ourselves', 'hers', 'between', 'yourself', 'but', 'again', 'there',
              'about', 'once', 'during', 'out', 'very', 'having', 'with', 'they',
              'own', 'an', 'be', 'some', 'for', 'do', 'its', 'yours', 'such', 'into',
              'of', 'most', 'itself', 'other', 'off', 'is', 'am', 'or', 'who',
              'as', 'from', 'him', 'each', 'the', 'themselves', 'until', 'below', 'are',
              'we', 'these', 'your', 'his', 'through', 'don', 'nor', 'me', 'were', 'her',
              'more', 'himself', 'this', 'down', 'should', 'our', 'their', 'while', 'above',
              'both', 'up', 'to', 'ours', 'had', 'she', 'all', 'when', 'at', 'any',
              'before', 'them', 'same', 'and', 'been', 'have', 'in', 'will', 'on', 'does',
              'yourselves', 'then', 'that', 'because', 'what', 'over', 'why', 'so', 'can',
              'did', 'now', 'under', 'he', 'you', 'herself', 'has', 'just', 'where',
              'too', 'only', 'myself', 'which', 'those', 'i', 'after', 'few', 'whom',
              'being', 'if', 'theirs', 'my', 'against', 'a', 'by', 'doing', 'it', 'how',
              'further', 'was', 'here', 'than']

# Toy training set: sentence -> label (1 = positive, 0 = negative).
# The mixed casing (e.g. "smArt", "YoU are GoOd") is lower-cased by the classifier.
sens = {"I am good":1, "I am bad":0, "You are smArt":1, "She is stupid":0, "We are fine":1,
        "Awful result":0, "I hate you":0, "We are soulmates":1, "YoU are GoOd":1}

In [392]:
# Build the classifier on the toy sentences, then count the words per class.
MNTC = My_NB_Text_Classifier(sens, stop_words)
    
MNTC.fit()

In [393]:
# The only training words in this sentence ("bad", "awful") come from negative examples.
MNTC.predict("Every tHinG was BAd. How AWFuL iT waS!")

NEGATIVE
Probability(positive): 0.154
Probability(negative): 0.846


0

In [394]:
# Same sentence with one positive training word ("good") appended.
MNTC.predict("Every tHinG was BAd. How AWFuL iT waS! BUt it was ALSO GOOD")

NEGATIVE
Probability(positive): 0.429
Probability(negative): 0.571


0

In [395]:
# A second positive training word ("fine") is appended.
MNTC.predict("Every tHinG was BAd. How AWFuL iT waS! BUt it was ALSO GOOD. ACTUALLY, The food was FInE")

POSITIVE
Probability(positive): 0.721
Probability(negative): 0.279


1

In [396]:
# A third positive training word ("smart") is appended.
MNTC.predict("Every tHinG was BAd. How AWFuL iT waS! BUt it was ALSO GOOD. ACTUALLY, The food was FInE. The manager was smarT")

POSITIVE
Probability(positive): 0.911
Probability(negative): 0.089


1

As can be seen, the more positive words are added to the sentence, the higher the predicted probability of the positive class becomes.

#Testing on a Large Training Set

In [397]:
import pandas as pd

# Load the restaurant reviews; the code below uses the text column "Review"
# and the label column "Liked".
df = pd.read_csv("/content/Restaurant_Reviews.csv")

# Preview the first five rows.
df.head(5)

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [398]:
# Number of rows in the training split: the first 80% of the data frame.
n = int(0.8 * len(df))

# Map each training review to its label. A review text that occurs twice
# keeps the label of its last occurrence.
train_set = {df['Review'][idx]: df['Liked'][idx] for idx in range(n)}

In [399]:
# The remaining rows (from index n to the end) form the held-out test split,
# again as review text -> label.
test_set = {df['Review'][idx]: df['Liked'][idx] for idx in range(n, len(df))}

In [400]:
# Report the number of distinct reviews in each split.
print(f"{len(train_set)} {len(test_set)}")

800 200


In [401]:
# Rebuild the classifier on the restaurant-review training split and fit it.
MNTC = My_NB_Text_Classifier(train_set, stop_words)
    
MNTC.fit()

In [402]:
# Classify every held-out review silently, in the iteration order of test_set.
predicted_labels = [MNTC.predict(s, verbose=False) for s in test_set]

In [403]:
# Ground-truth labels, in the same iteration order as the predictions.
real_labels = [test_set[review] for review in test_set]

In [443]:
from sklearn.metrics import accuracy_score
import emojis

# accuracy_score is documented as accuracy_score(y_true, y_pred), so the
# ground truth goes first. Accuracy is symmetric, so the value is unchanged.
print(emojis.encode(f"Accuracy: {100 * accuracy_score(real_labels, predicted_labels):.1f}% :sunglasses:"))

Accuracy: 83.5% 😎


In [448]:
# Sanity check on a plainly positive sentence. The return value is assigned
# to `_` so the notebook shows only the printed verdict.
sentence = "It was good"

_ = MNTC.predict(sentence)

POSITIVE
Probability(positive): 0.747
Probability(negative): 0.253


In [451]:
# As discussed in class, this method cannot classify "It was not bad"
# correctly: each word is scored independently, so "not" cannot negate "bad".
sentence = "It was not bad"

_ = MNTC.predict(sentence)

NEGATIVE
Probability(positive): 0.014
Probability(negative): 0.986
