<a href="https://colab.research.google.com/github/JonNData/naive_bayes/blob/master/notebooks/Naive_Bayes_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import numpy as np

## Multinomial Naive Bayes
Input X: array of messages  
Input y: array of labels

In [29]:
class MNNaiveBayes:
  def __init__(self, k=0.5):
    self.k = k
    self.cat0_count = 0
    self.cat1_count = 0
    self.total_count = self.cat0_count + self.cat1_count
    self.cat_0_prior = 0
    self.cat_1_prior = 0
    self.cat_0_prior, self.cat_1_prior
    self.word_probs = []
    self.vocab = []

  def tokenize(self, document):
    """
    Take in a document and return a list of words
    """
    doc = document.lower()
    # remove non-alpha characters
    stop_chars = '''0123456789!()-[]{};:'"\,<>./?@#$%^&*_~'''
    
    tokens = ""
    # iterate through and make each token
    for char in doc:
      if char not in stop_chars:
        tokens += char

    return tokens.split() # now a list of tokens
  
  def count_words(self, X, y):
    """
    X is an array of documents
    y is an array of targets, 0 or 1
    Output a dictionary of {word: (cat0_count, cat1_count)...}
    """
    counts = {}
    for document in X:
      for category in y:
        for token in self.tokenize(document):
          # Initialize a dict entry with 0 counts
          if token not in counts:
            counts[token] = [0,0]
          # Now that it exists, add to the category count for that word
          counts[token][category] += 1
    return counts

  def prior_prob(self, counts):
    
    # Iterate through counts dict and add up each word count by category
    cat0_word_count = cat1_word_count = 0
    for word, (cat0_count, cat1_count) in counts.items():
        cat0_word_count += cat0_count
        cat1_word_count += cat1_count

    # save attributes to the class
    self.cat0_count = cat0_word_count
    self.cat1_count = cat1_word_count
    self.total_count = self.cat0_count + self.cat1_count

    # Get the prior prob by dividing words in each cat by total words
    cat_0_prior = cat0_word_count / self.total_count
    cat_1_prior = cat1_word_count / self.total_count
    return cat_0_prior, cat_1_prior

  def word_probabilities(self, counts):
    """turn the word_counts into a list of triplets
    word, p(w | cat0), and p(w | cat1)"""
    # Here we apply the smoothing term, self.k, so that words that aren't in
    # the category don't get calculated as 0
    self.vocab = [word for word, (cat0, cat1) in counts.items()]
    return [(word,
    (cat0 + self.k) / (self.cat0_count + 2 * self.k),
    (cat1 + self.k) / (self.cat1_count + 2 * self.k))
    for word, (cat0, cat1) in counts.items()]

  def fit(self, X, y):
    # Take all these functions and establish probabilities of input
    counts = self.count_words(X, y)
    self.cat_0_prior, self.cat_1_prior = self.prior_prob(counts)
    self.word_probs = self.word_probabilities(counts)

  def predict(self, test_corpus):
    # Split the text into tokens,
    # For each category: calculate the probability of each word in that cat
    # find the product of all of them and the prior prob of that cat
    y_pred = []
    for document in test_corpus:
      # Every document get their own prediction probability
      log_prob_cat0 = log_prob_cat1 = 0.0
      tokens = self.tokenize(document)
        # Iterate through the training vocabulary and add any log probs that match
        # if no match don't do anything. We just need a score for each category/doc
      for word, prob_cat0, prob_cat1 in self.word_probs:
        if word in tokens:
          # Because of 'overflow' best to add the log probs together and exp
          log_prob_cat0 += np.log(prob_cat0)
          log_prob_cat1 += np.log(prob_cat1)
        # get each of the category predictions including the prior
      cat_0_pred = self.cat_0_prior * np.exp(log_prob_cat0)
      cat_1_pred = self.cat_1_prior * np.exp(log_prob_cat1)
      if cat_0_pred >= cat_1_pred:
        y_pred.append(0)
      else:
        y_pred.append(1)
    return y_pred
      


## Let's test it. Later this will be a pytest

In [19]:
# Data will be from reddit. 
# Train on 10 r/worldnews titles and 10 r/aww titles
# test on 2 r/aww and 2 r/worldnews
# category_0 = r/worldnews = 0 
# category_1 = r/aww = 1
worldnews = ["Uighur group calls for China to lose 2022 Games over 'genocide'", 
     "Polish Towns That Declared Themselves ‘L.G.B.T. Free’ Are Denied E.U. Funds",
     "Michelle Bolsonaro, Brazil's First Lady, Tests Positive For Coronavirus",
     "Border officials crack down on Americans travelling through B.C. to Alaska",
     "Hong Kong bans 11 pro-democracy figures from legislative election | Hong Kong Free Press HKFP",
     "The 3 women who have brought COVID into Queensland have been charged with falsifying documents and fraud",
     "UK KFC admits a third of its chickens suffer painful inflammation - Fast food giant KFC has laid bare the realities of chicken production after admitting to poor welfare conditions among its suppliers.",
     "Chile picks Japan's trans-Pacific cable route in snub to China",
     "Hackers post fake stories on real news sites 'to discredit Nato'",
     "Prostate cancer can be detected by a new blood test which also reveals the severity of the disease with 99 per cent accuracy"
    ]
aww = [
       "This little cutie climbed up on me while I applied to adopt her",
       "Here is a happy duckling to make your day better!",
       "Adorable cutie",
       "12 years ago she came running up to me on a dirt road and sat on my foot clinging to my ankle crying. Today I present to you my kitty Izzy.",
       "Very talented Otter",
       "A dog at the shelter I work at is teaching me how to smile.",
       "The best seat in the house",
       "A Stork couple celebrating their first egg ",
       "She turned 6 last week. Everyone still thinks she's a kitten.",
       "My gf and I rescued this little guy today.... meet max everyone"
]
X = worldnews + aww
y = [0]*10 + [1]*10

X_test = [
          "Toronto emerging as tech superpower as immigrants choose Canada over US",
          """Egypt imprisons female TikTok influencers: A court in Cairo has sentenced six young female bloggers to prison for up to two years — not for political offenses, but for violating "public morals." Activists have called the ruling an "outrageous attack on civil liberties.""",
          "The mixed kitten seeds grew well this year.",
          "My wife just sent me this photo of our cat at the vet. Safe to say she’s a little scared."
]

In [31]:
mnnb = MNNaiveBayes()
mnnb.fit(X,y)
mnnb.predict(["cat", "cute", "dog"])

[0, 0, 0]