In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [45]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import time
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop
from spacy.lang.en.stop_words import STOP_WORDS as en_stop
import random
import re

seed = 10
random.seed(seed)
np.random.seed(seed)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# to find P(Y=k), just do test_labels.where(k)/len(test_labels)
# vectos_train will contain all the information I need to create the counting for each word
# use the get_feature_names to create a dictionary to map to all counts
# this dictionary would be contained in another dict that has the labels as keys
# to find P(xj,k), find the indices where test_labels = (k), then find

In [None]:
# TODO: Handle cases where two examples are permutations of another
# TODO: Handle laplace smoothing, i.e. word from corpus not in word list

In [None]:
# worth looking into stripping accents, and sklearn feature extraction libarary including idf extractor

In [46]:
path = '/content/drive/MyDrive/Olympus in the Sky/McGill/2024 - Winter/ECSE 551/Data/'
df = pd.read_csv(path + 'train.csv', encoding = "ISO-8859-1")

training_data = df['body'].apply(lambda x: x.replace('_', ' ')).to_numpy()
training_labels = df['subreddit'].to_numpy()

indices = np.random.permutation(len(training_data))
training_data = training_data[indices]
training_labels = training_labels[indices]

test_split = 0.2

(training_data, testing_data, training_labels, testing_labels) = train_test_split(training_data, training_labels,  test_size = int(len(training_data)*test_split), random_state=seed)

spacy_stopwords_list = list(fr_stop) + list(en_stop)
nltk_stopwords_list = stopwords.words('english') + stopwords.words('french')

In [100]:
class NB():
  def __init__(self):
    spacy_stopwords_list = list(fr_stop) + list(en_stop)
    nltk_stopwords_list = stopwords.words('english') + stopwords.words('french')
    self.vectorizer = CountVectorizer(binary = True, max_features = 3000, stop_words = list(set().union(spacy_stopwords_list, nltk_stopwords_list)))
    self.no_pred = []

  # currently train_corpus expects a np array containing a list of strings, not array of words, nxm
  # test_label is an nx1 array
  def train(self, train_corpus, test_labels):
    assert len(train_corpus)  == len(test_labels)
    assert type(train_corpus) == type(test_labels) == np.ndarray

    vectors_train    = self.vectorizer.fit_transform(train_corpus).todense()
    self.word_list   = self.vectorizer.get_feature_names_out()
    self.num_samples = len(test_labels)
    self.word_count  = dict(zip(self.word_list, np.array(vectors_train.sum(axis=0))[0]))

    unique, counts   = np.unique(test_labels, return_counts=True)
    self.label_count = dict(zip(unique, counts))
    self.labels      = unique

    self.word_count_given_label = {}
    for label in self.labels:
      indices = np.where(test_labels == label)[0]
      tot_word_count = np.array(vectors_train[indices].sum(axis=0))[0]
      self.word_count_given_label[label] = {self.word_list[i] : tot_word_count[i] for i in range(len(tot_word_count))}
      # print(label, self.word_count_given_label[label])

  # Assuming test_corpus is 2-d array where each test sample is a string
  def predict(self, test_corpus):

    predictions = []
    pattern = re.compile(r"(?u)\b\w\w+\b")
    for index, corpus in enumerate(test_corpus):

      best_label = ''
      best_prob  = -np.inf

      # Text processing based on CountVectorizer's regex
      corpus = corpus.lower().replace('_', ' ')
      corpus_words = list(set(re.findall(pattern, corpus)))

      for label in self.labels:
        p_of_y = self.label_count[label]/self.num_samples

        p_of_x_given_y = 1

        for word in self.word_list:
          if word in corpus_words:
            xj = 1
          else:
            xj = 0

          theta_xj_k      = (self.word_count_given_label[label][word] + 1) / (self.label_count[label] + len(self.labels))
          p_of_x_given_y *= (theta_xj_k**(xj) * (1-theta_xj_k)**(1-xj))
          # print(label, word, p_of_x_given_y, theta_xj_k, (theta_xj_k**(xj) * (1-theta_xj_k)**(1-xj)))

        unseen_words = [new_word for new_word in corpus_words if new_word not in self.word_list]
        for word in unseen_words:
          # p_of_x_given_y *= 1/(self.label_count[label] + len(self.labels))
          p_of_x_given_y *= 1/(len(self.labels)) # When there are a large number of unseen words, the p_of_x_given_y basically becomes 0
        # print(len(unseen_words), len(corpus_words))

        p_of_y_given_x = np.log(p_of_y * p_of_x_given_y)
        # print(p_of_y, p_of_x_given_y)
        # print(label, p_of_y_given_x)

        # print(p_of_y_given_x, best_prob)
        if p_of_y_given_x > best_prob:
          best_prob = p_of_y_given_x
          best_label = label

      if best_label == '':
        self.no_pred.append(corpus)
      #   best_label = random.choice(self.labels)

      predictions.append(best_label)

    return predictions

In [107]:
def k_fold_validation(x_train_data, y_train_data, K = 10):
  assert type(x_train_data) == np.ndarray and type(y_train_data) == np.ndarray

  len_of_data  = len(x_train_data) - len(x_train_data) % K
  data_x       = x_train_data[:len_of_data]
  data_y       = y_train_data[:len_of_data]
  size_of_fold = len(x_train_data[:len_of_data]) // K
  validation_error = 0

  error    = []
  pred     = []
  timings  = []
  no_preds = []

  naive_bayes = NB()

  for i in range(K):

    naive_bayes = NB()

    if i != K-1:
      validation_fold_x = data_x[i*size_of_fold:(i+1)*size_of_fold]
      validation_fold_y = data_y[i*size_of_fold:(i+1)*size_of_fold]

      training_folds_x  = np.concatenate((data_x[:i*size_of_fold], data_x[(i+1)*size_of_fold:]))
      training_folds_y  = np.concatenate((data_y[:i*size_of_fold], data_y[(i+1)*size_of_fold:]))

    else:
      validation_fold_x = data_x[i*size_of_fold:]
      validation_fold_y = data_y[i*size_of_fold:]

      training_folds_x  = data_x[:i*size_of_fold]
      training_folds_y  = data_y[:i*size_of_fold]

    start_time = time.time()
    naive_bayes.train(training_folds_x, training_folds_y)
    end_time = time.time()

    timings.append(end_time - start_time)

    pred_valid = naive_bayes.predict(validation_fold_x)
    no_pred = naive_bayes.no_pred
    pred_train = naive_bayes.predict(training_folds_x)

    fold_error = {}
    fold_error['validation'] = 1 - accuracy_score(pred_valid, validation_fold_y)
    validation_error        += 1 - accuracy_score(pred_valid, validation_fold_y)
    fold_error['train']      = 1 - accuracy_score(pred_train, training_folds_y)

    model_pred = {}
    model_pred['validation'] = (pred_valid, validation_fold_y)
    model_pred['train']      = (pred_train, training_folds_y )

    error.append(fold_error)
    pred.append(model_pred)
    no_preds.append(no_pred)

  info = {'error': error, 'pred': pred, 'time': timings, 'no_preds' : no_preds}

  return validation_error/K, info

# Simple test cases for NB

In [98]:
train_corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
     'Is this the first document?',]
test_labels = np.array(['1', '2', '3', '4'])
train_corpus = np.array(train_corpus)
test_corpus = ['harro first pink haha']
model = NB()
model.train(train_corpus, test_labels)
model.predict(test_corpus)



['3']

In [88]:
# Training data
X_train = np.array([
    "I love this movie",
    "This movie is great",
    "A movie like this is great",
    "I hate this movie",
    "This movie is terrible"
])

# Corresponding labels
y_train = np.array([1, 1, 1, 0, 0])  # 1 for positive sentiment, 0 for negative sentiment


model = NB()
model.train(X_train, y_train)
X_test = [
    "I love this movie terrible", # Example of both class being viable but the second being chosen due to label eval order in NB
    "I hate this great movie",
    "This movie is terrible"
]


print(model.predict(X_test))

[0, 1, 0]


# K-fold for NB

In [None]:
# TODO: How to select best performing split
# TODO: No predictions given, divide by zero warnings might be the cause (Solved?)
# TODO: Prune weird words from word lsit like numebrs and etc
# TODO: Add lemitization

In [101]:
model = NB()
model.train(training_data, training_labels)
pred = model.predict(testing_data)
(pred == testing_labels).sum()/len(testing_labels)

  p_of_y_given_x = np.log(p_of_y * p_of_x_given_y)


0.6357142857142857

In [90]:
data = model.no_pred

debug = NB()
debug.train(training_data, training_labels)
pred = debug.predict(data)
pred

[]

In [108]:
mean_error, info = k_fold_validation(training_data, training_labels)

  p_of_y_given_x = np.log(p_of_y * p_of_x_given_y)
  p_of_y_given_x = np.log(p_of_y * p_of_x_given_y)
  p_of_y_given_x = np.log(p_of_y * p_of_x_given_y)
  p_of_y_given_x = np.log(p_of_y * p_of_x_given_y)
  p_of_y_given_x = np.log(p_of_y * p_of_x_given_y)
  p_of_y_given_x = np.log(p_of_y * p_of_x_given_y)
  p_of_y_given_x = np.log(p_of_y * p_of_x_given_y)
  p_of_y_given_x = np.log(p_of_y * p_of_x_given_y)
  p_of_y_given_x = np.log(p_of_y * p_of_x_given_y)
  p_of_y_given_x = np.log(p_of_y * p_of_x_given_y)


In [109]:
mean_error

0.3589285714285714

In [113]:
info['no_preds']
# Seems like one post is really long with different words which is causing an issue

[['kinda tired of people who keep complaining about paris like it?s really any significant difference from other cities around the world of comparable size and population density. \n\ntrash? i?ve seen piles higher than me in nyc. \n\nsome areas stink? i?ve been in parts of miami in summer you can barely breathe for the exhaust and mangrove/saltwater rot.\n\nproblems with safety or homeless/panhandlers? try checking out hollywood on any average weekday. \n\nsubway/trains are often breaking down? see chicago or manhattan.\n\ni get it. paris syndrome is a bitch. i visited for the first time last february and thought i was being savvy to buy a five day subway/bus pass my first day?then the next four days was all uber rides because the subway, bus and train workers declared a multi-day strike. i was disappointed by that, but i otherwise still enjoyed my trip.\n\ni feel like people are really just complaining because paris didn?t live up to their fantasized expectations, not because any of i

# Actual Test Case

In [114]:
model = NB()
model.train(training_data, training_labels)

test_df = pd.read_csv(path + 'test.csv', encoding = "ISO-8859-1")
test_data = test_df['body'].to_numpy()

predictions = model.predict(test_data)



In [115]:
pd.DataFrame({'Subreddit':predictions}).reset_index().rename(columns={"index": "Id"}).to_csv('results.csv', index=False)

# Extra

In [None]:
!pip install wordninja
import wordninja

In [None]:
' '.join(wordninja.split('This document is the spider-man document,egxnd3mtd2l6lxnlcnaih0j1cybhbmqgq29hy2ggq2hhcnrlcibtzwxib3vybmuybhaagbyyhjileaaygaqyiguyhgmycxaagiaegiofgiydmgsqabiabbikbrigazileaaygaqyiguyhgmycxaagiaegiofgiydsoobuksdwkkbcaf4azabajgbnwkgaewnqgedmi04uaedyaea.'))

'This document is the spider man document eg x nd 3 m td 2 l 6 l xn lc nai h 0 j 1 cy bh bm qg q 29 hy 2 g gq 2 h hcn rlc ibt zw xi b 3 vy b muy b haag by y hj ilea a yg aq yi guy hg my cx a agia eg i of gi yd mg sq abi abb ik brig a zile a a yg aq yi guy hg my cx a agia eg i of gi yd soo buk s dw k kb caf 4 az abaj gb nw kg a ew n qg edm i 04 uae dy a ea'

In [116]:
sorted(model.word_count.items(), key=lambda item: item[1], reverse = True)[:-1]

[('like', 220),
 ('people', 213),
 ('time', 143),
 ('think', 125),
 ('years', 115),
 ('good', 106),
 ('paris', 100),
 ('london', 98),
 ('know', 95),
 ('https', 93),
 ('going', 88),
 ('way', 88),
 ('work', 87),
 ('want', 85),
 ('need', 81),
 ('got', 79),
 ('place', 79),
 ('lot', 76),
 ('find', 73),
 ('new', 67),
 ('www', 64),
 ('city', 63),
 ('live', 62),
 ('look', 61),
 ('best', 59),
 ('better', 59),
 ('great', 59),
 ('things', 59),
 ('sure', 57),
 ('faire', 56),
 ('right', 56),
 ('tre', 56),
 ('use', 56),
 ('actually', 54),
 ('bit', 54),
 ('feel', 53),
 ('long', 53),
 ('ago', 52),
 ('year', 52),
 ('thing', 51),
 ('bien', 50),
 ('help', 50),
 ('10', 49),
 ('getting', 49),
 ('pay', 49),
 ('day', 48),
 ('life', 48),
 ('home', 47),
 ('money', 47),
 ('come', 46),
 ('places', 46),
 ('said', 46),
 ('house', 45),
 ('job', 44),
 ('old', 44),
 ('probably', 44),
 ('tr', 44),
 ('maybe', 43),
 ('area', 42),
 ('free', 42),
 ('love', 42),
 ('public', 42),
 ('street', 42),
 ('friends', 41),
 ('lookin