In [5]:
!pip install pytorch-nlp

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [6]:
import sys
import numpy as np
import random as rn
import pandas as pd
from torchnlp.datasets import imdb_dataset
import random
import string 
import re
import math
from sklearn.metrics import classification_report
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# download stopwords
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Loading and preprocessing data 

In [8]:
# load dataset
train_long = imdb_dataset(train=True)
test_long = imdb_dataset(test=True)
random.shuffle(train_long)
random.shuffle(test_long)
train = []
test = []

# only use a subsample of the data as to not run into memory overflow errors
for i in range(2000):
  train.append(train_long[i])

for i in range(500):
  test.append(test_long[i])

df_train = pd.DataFrame.from_dict(train)
df_test =  pd.DataFrame.from_dict(test)

# clean and tokenize textual data
def clean_data(words):
  #Set to lower case
  words = words.lower()

  # removing all the html tags from our data
  words = re.sub(r'<.*?>', '', words)

  # remove numbers
  words = re.sub(r'\d+', '', words)

  #Get rid of all punctiation and stop words
  no_punc = []
  tokens = nltk.word_tokenize(words)

  for w in tokens:
    if len(w) > 2 and w not in stopwords.words('english') and w not in set(string.punctuation): 
      no_punc.append(w)
    
  words = ' '.join(no_punc)
  words = words.strip()

  #Lemmatizes the words, puts the words in their most 
  #Basic form 
  lemmatizer = WordNetLemmatizer() 

  lemmas = []

  for w in nltk.word_tokenize(words):
    lem = lemmatizer.lemmatize(w, pos="v")

    if len(lem) > 2: 
      lemmas.append(lem)
  
  words = ' '.join(lemmas)
  return words.strip()


df_train['text'] = df_train['text'].apply(clean_data)
df_test['text'] = df_test['text'].apply(clean_data)

df_test.head()

Unnamed: 0,text,sentiment
0,n't get wrong assume movie would stupid honest...,neg
1,movie almost generation-defining importance be...,pos
2,previously see abridge print present david she...,pos
3,jane porter former love interest harry holt ne...,pos
4,admit tsui hark one kind n't top person strong...,pos


### Naive Bayes binary classification model

In [11]:
from tables.utils import count_logged_instances


class naive_bayes():

  def __init__(self):
    #Total number of words associated 
    #with positive or negative sentiment 
    self.num_messages = {}
    #Holds the priors for positive and 
    #negaitbe sentiment
    self.log_class_priors = {}
    #Holds the counts of each word in 
    #the data
    self.word_counts = {}
    #The vocabulary of our data 
    #i.e all the words seen during training 
    self.vocab = set()

    self.total_words = {}

  #Helper method to get the total counts of 
  #each word out of a sentance
  def get_word_counts(self, words):
    counts = {}
    for w in words: 
      counts[w] = counts.get(w, 0.0) + 1.0

    return counts


  def fit(self, data):

    #Gets the total number of positive and negative messages
    self.num_messages['pos'] = len(data[data['sentiment'] == 'pos'])
    self.num_messages['neg'] = len(data[data['sentiment'] == 'neg'])

    #Set up our priors which in this case are bernoulli r.v 
    #Get one for both pos and neg reviews
    self.log_class_priors['pos'] = math.log(self.num_messages['pos'] / len(data))
    self.log_class_priors['neg'] = math.log(self.num_messages['neg'] / len(data))
    

    #Get's the total count of every word in our training dataset
    #For both positive and negative reviews 
    self.word_counts['pos'] = {}
    self.word_counts['neg'] = {}
    self.total_words['pos'] = 0
    self.total_words['neg'] = 0

    for cur_text, cur_sent in zip(data['text'], data['sentiment']):

        counts = self.get_word_counts(nltk.word_tokenize(cur_text))
        for word, count in counts.items():
            if word not in self.vocab:
                self.vocab.add(word)
            if word not in self.word_counts[cur_sent]:
                self.word_counts[cur_sent][word] = 0.0
 
            self.word_counts[cur_sent][word] += count
            self.total_words[cur_sent] += count

    

  def predict(self, pred_data):
    pred = []

    #Go through each text to get a prediction
    for cur in pred_data:
      counts = self.get_word_counts(nltk.word_tokenize(cur))
      pos_score = 0
      neg_score = 0

      for cur_word, _ in counts.items():
        #If we have no seen this word during training skip it
        if cur_word not in self.vocab: 
          continue
        
        #Finding our multinomial liklehood for pos and neg 
        #with laplace smoothing 

        log_pos = math.log((self.word_counts['pos'].get(cur_word, 0.0) + 1) / (self.num_messages['pos'] + len(self.vocab)))
        #log_pos = math.log((self.word_counts['pos'].get(cur_word, 0.0) + 1) / (self.total_words['pos']))

        log_neg = math.log((self.word_counts['neg'].get(cur_word, 0.0) + 1) / (self.num_messages['neg'] + len(self.vocab)))
        #log_neg = math.log((self.word_counts['neg'].get(cur_word, 0.0) + 1) / (self.total_words['neg']))

        #Add the likleihood term to our total 
        #Scores for pos and neg 
        pos_score += log_pos

        neg_score += log_neg

      #Add our priors to our score
      pos_score += self.log_class_priors['pos']
      neg_score += self.log_class_priors['neg']

      #pick the larger score 
      if pos_score > neg_score:
          pred.append('pos')
      else:
          pred.append('neg')
    return pred


  def evaluate_acc(self, y_true, y_pred):
    total = 0
    for count, _ in enumerate(y_pred):
      if y_true[count] == y_pred[count]:
        total += 1
      acc = total/len(y_pred)

    print(acc)
    return classification_report(y_true, y_pred)


NB = naive_bayes()

NB.fit(df_train)

pred_text = df_test["text"]

preds = NB.predict(pred_text)


acc = NB.evaluate_acc(df_test["sentiment"], preds)
print(acc)




0.778
              precision    recall  f1-score   support

         neg       0.92      0.61      0.74       253
         pos       0.70      0.95      0.81       247

    accuracy                           0.78       500
   macro avg       0.81      0.78      0.77       500
weighted avg       0.82      0.78      0.77       500

