In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import time

In [None]:
# P(Y=k) is the fraction of training labels equal to k: np.sum(test_labels == k) / len(test_labels)
# vectors_train holds the binary word-occurrence matrix needed to count each word
# use get_feature_names_out to build a dictionary mapping each word to its count
# that dictionary is stored in an outer dict that has the labels as keys
# to find P(xj | Y=k), find the indices where test_labels == k, then sum the rows of vectors_train at those indices

In [None]:
# TODO: Handle cases where two examples are permutations of one another
# TODO: Handle Laplace smoothing, i.e. a word from the test corpus that is not in the word list

In [None]:
#' '.join(map(str, train_corpus[0]))

'This is the first document.'

In [65]:
class NB():
  """Bernoulli Naive Bayes text classifier over binary word-presence features.

  For every label k the model estimates
    P(Y=k)       = N_k / N
    theta_{j,k}  = (docs of label k containing word j + 1) / (N_k + 2)
  where N_k is the number of training documents with label k. The "+ 2" is
  Laplace smoothing over the two outcomes (present / absent) of each binary
  feature, which keeps every theta strictly inside (0, 1).

  Attributes set by learn():
    word_list              -- vocabulary array from the fitted vectorizer
    num_samples            -- number of training documents
    labels                 -- sorted array of distinct labels
    label_count            -- dict: label -> number of training documents
    word_count_given_label -- dict: label -> {word -> document count}
  """

  def __init__(self):
    # binary=True: a feature records whether a word occurs, not how often.
    self.vectorizer = CountVectorizer(binary = True)

  def learn(self, train_corpus, test_labels):
    """Fit the vocabulary and the per-label word counts.

    Args:
      train_corpus: sequence of n raw document strings (one string per
        document, not pre-tokenised).
      test_labels: sequence of n labels for train_corpus. Despite the name,
        these are the *training* labels; the name is kept for compatibility.

    Raises:
      AssertionError: if the two inputs differ in length.
    """
    # asarray leaves ndarrays untouched and additionally accepts plain lists.
    train_corpus = np.asarray(train_corpus)
    test_labels  = np.asarray(test_labels)
    assert len(train_corpus) == len(test_labels), \
        "train_corpus and test_labels must have the same length"

    # Keep the document-term matrix sparse; only per-label column sums are
    # needed, so there is no reason to densify it.
    vectors_train    = self.vectorizer.fit_transform(train_corpus)
    self.word_list   = self.vectorizer.get_feature_names_out()
    self.num_samples = len(test_labels)

    unique, counts   = np.unique(test_labels, return_counts=True)
    self.label_count = dict(zip(unique, counts))
    self.labels      = unique

    self.word_count_given_label = {}
    for label in self.labels:
      indices = np.where(test_labels == label)[0]
      # Column sums over the rows of this label = number of documents of this
      # label that contain each word (the matrix is binary).
      tot_word_count = np.asarray(vectors_train[indices].sum(axis=0)).ravel()
      self.word_count_given_label[label] = dict(zip(self.word_list, tot_word_count))

  def predict(self, test_corpus):
    """Return the most probable label for each tokenised test sample.

    Args:
      test_corpus: iterable of samples, each an iterable of string tokens
        (e.g. a 2-d array with one row of tokens per sample). Tokens are
        compared verbatim against the vocabulary; CountVectorizer's default
        analyzer lowercases text and drops punctuation, so tokens should be
        normalised the same way to match.

    Returns:
      A list with one predicted label per sample. Ties go to the label that
      comes first in self.labels.

    Scores are accumulated in log space so that large vocabularies cannot
    underflow to zero probability. A token outside the vocabulary is treated
    as a feature never seen in any class (smoothed probability 1 / (N_k + 2))
    and, like every other binary feature, is counted once per sample however
    often it repeats.
    """
    vocab_index = {word: i for i, word in enumerate(self.word_list)}
    n_labels    = len(self.labels)

    # Per-label terms that do not depend on the sample, computed once:
    #   baseline[k]       log P(Y=k) + sum_j log(1 - theta_jk)  (all words absent)
    #   presence_gain[k]  log(theta_jk) - log(1 - theta_jk)     (word j is present)
    #   unseen_penalty[k] log(1 / (N_k + 2))                    (one unseen word)
    baseline       = np.empty(n_labels)
    presence_gain  = np.empty((n_labels, len(self.word_list)))
    unseen_penalty = np.empty(n_labels)
    for k, label in enumerate(self.labels):
      n_k    = self.label_count[label]
      counts = np.array(
          [self.word_count_given_label[label][word] for word in self.word_list],
          dtype=float)
      theta      = (counts + 1.0) / (n_k + 2.0)
      log_absent = np.log1p(-theta)
      baseline[k]       = np.log(n_k / self.num_samples) + log_absent.sum()
      presence_gain[k]  = np.log(theta) - log_absent
      unseen_penalty[k] = -np.log(n_k + 2.0)

    predictions = []
    for corpus in test_corpus:
      tokens  = set(corpus)
      # Sorted so the floating-point summation order is deterministic.
      present = sorted(vocab_index[tok] for tok in tokens if tok in vocab_index)
      n_unseen = len(tokens) - len(present)

      scores = (baseline
                + presence_gain[:, np.array(present, dtype=np.intp)].sum(axis=1)
                + n_unseen * unseen_penalty)
      predictions.append(self.labels[int(np.argmax(scores))])

    return predictions

In [71]:
# Toy corpus: four one-sentence documents, each assigned its own class label.
train_corpus = np.array([
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
])
test_labels = np.array(['1', '2', '3', '4'])

# A single pre-tokenised test sample; 'harro' and 'pink' do not occur in any
# of the training sentences above.
test_corpus = np.array([['harro', 'first', 'pink', 'third']])

model = NB()
model.learn(train_corpus, test_labels)
model.predict(test_corpus)

['1']

In [73]:
# Training data
X_train = np.array([
    "I love this movie",
    "This movie is great",
    "A movie like this is great",
    "I hate this movie",
    "This movie is terrible"
])

# Corresponding labels
y_train = np.array([1, 1, 1, 0, 0])  # 1 for positive sentiment, 0 for negative sentiment


model = NB()
model.learn(X_train, y_train)
X_test = np.array([
    "I love this movie, but terrible in some parts d dfa fa sf asdf adf dasf asdf df df dfd",
    "I hate this great movie",
    "This movie is terrible"
])


for instance in X_test:
  print(model.predict(np.array([instance.split()])))

[0]
[1]
[0]
