<a href="https://colab.research.google.com/github/Moukthika1253/NaiveBayesClassifier-Rotten-Tomato-reviews/blob/main/NBC_Rotten_tomato_reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
import string,re
import nltk
from nltk import word_tokenize
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from nltk.stem import PorterStemmer
import math

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
rt_df=pd.read_csv('/content/rt_reviews.csv',encoding='latin-1')

In [177]:
X_train,X_remaining, y_train, y_remaining= train_test_split(rt_df['Review'],rt_df['Freshness'],train_size=0.5)
X_test, X_dev, y_test, y_dev = train_test_split(X_remaining,y_remaining,test_size=0.3)
train_set=pd.concat([X_train,y_train],axis=1).reset_index(drop=True)
dev_set=pd.concat([X_dev,y_dev],axis=1).reset_index(drop=True)
test_set=pd.concat([X_test,y_test],axis=1).reset_index(drop=True)
print(f"Train set size:  ",train_set.shape)
print(f"Dev set size:  ",dev_set.shape)
print(f"Test set size:  ",test_set.shape)

Train set size:   (240000, 2)
Dev set size:   (72000, 2)
Test set size:   (168000, 2)


In [178]:
train_set.head()

Unnamed: 0,Review,Freshness
0,I just am unsure the story of a white girl wh...,rotten
1,No Stone Unturned at times veers close to a r...,rotten
2,"The real problem is that, in (presumably) try...",rotten
3,"There were tears on my cheeks, but I wasn't f...",rotten
4,at #4 on the top10 Indie films in the iTunes ...,fresh


 **Pre-processing**

In [298]:
def preprocessing(review):
  stop_words=stopwords.words('english')
  word_lemmatizer=WordNetLemmatizer()
  tokenized_words=[word for word in word_tokenize(review)]
  words=[word.lower() for word in tokenized_words if word.lower() not in stop_words]
  for i in range(len(words)):
   words[i]=''.join([word for word in words[i] if word not in (string.punctuation)])
  no_emptywords=[word for word in words if word]
  no_extraspace=[re.sub(r'\s\s+', '',word) for word in no_emptywords]
  lemmatized_words=[word_lemmatizer.lemmatize(word) for word in no_extraspace]
  return lemmatized_words 
  

In [181]:
train_set['Review']=train_set['Review'].apply(preprocessing)
train_set.head()

Unnamed: 0,Review,Freshness
0,"[unsure, story, white, girl, find, inner, stre...",rotten
1,"[stone, unturned, time, veers, close, rant]",rotten
2,"[real, problem, presumably, trying, ambiguous,...",rotten
3,"[tear, cheek, nt, feeling, thing]",rotten
4,"[4, top10, indie, film, itunes, store]",fresh


In [205]:
review_list=[]
for i,review in train_set['Review'].items():
  for word in review:
    review_list.append(word)
review_list=list(set(review_list))
tot_words=len(review_list)
print(f"total number of words in the list: {tot_words}")

total number of words in the list: 86737


In [315]:
pos_reviews=0
neg_reviews=0
num_pos_words=0
num_neg_words=0
pos_words=[]
neg_words=[]
word_dict=defaultdict(lambda:[0,0])
freshness=list(train_set['Freshness'])

In [316]:
for i, review in enumerate(list(train_set['Review'])):
  if  freshness[i] == 'fresh':
    pos_reviews=pos_reviews+1
  else:
    neg_reviews=neg_reviews+1
  for word in (review):
    if freshness[i] == 'fresh':
      word_dict[word][1]+=1
      num_pos_words+=1
      pos_words.append(word)
    else:
      word_dict[word][0]+=1
      num_neg_words+=1
      neg_words.append(word)

In [245]:
(word_dict)

defaultdict(<function __main__.<lambda>()>,
            {'unsure': [58, 25],
             'story': [6709, 7728],
             'white': [566, 435],
             'girl': [735, 816],
             'find': [1669, 1796],
             'inner': [133, 181],
             'strength': [164, 477],
             'learning': [48, 71],
             'black': [695, 946],
             'culture': [303, 591],
             'using': [159, 269],
             'win': [161, 357],
             'whiterun': [1, 0],
             'dance': [240, 258],
             'contest': [44, 11],
             'inspiring': [111, 270],
             'everyone': [658, 702],
             'put': [865, 886],
             'effort': [1169, 657],
             'convincing': [248, 292],
             'stone': [265, 219],
             'unturned': [13, 5],
             'time': [5286, 5332],
             'veers': [106, 74],
             'close': [539, 637],
             'rant': [13, 12],
             'real': [1786, 2011],
             'problem': 

**calculate probability of a word given review type**

In [246]:
prob_rotten=train_set['Freshness'].value_counts(normalize=True)['rotten']
prob_fresh=train_set['Freshness'].value_counts(normalize=True)['fresh']
print(f'P(fresh) = {prob_fresh} P(rotten) = {prob_rotten}')

P(fresh) = 0.500325 P(rotten) = 0.499675


**Without Smoothing**

In [247]:
def word_alone_prob(word):
  return (math.log(word_dict[word][1]/pos_words))+math.log(word_dict[word][0]/neg_words)

In [248]:
word_alone_prob('silverlining')

ValueError: ignored

In [249]:
def word_prob(reviewType, word):
  if reviewType == 'fresh':
    return math.log(word_dict[word][1]/pos_words)
  else:
    return math.log(word_dict[word][0]/neg_words)
  

In [250]:
word_prob('fresh','silverlining')

ValueError: ignored

**After Smoothing**

In [251]:
alpha=1
def word_only_prob(word):
  return (math.log((word_dict[word][1]+alpha)/(pos_words+tot_words)))+(math.log((word_dict[word][0]+alpha)/(neg_words+tot_words)))

In [252]:
word_only_prob('silverlining')

-27.908537563201342

In [332]:
def cond_prob(reviewType, word):
  if reviewType == 'fresh':
    return math.log((word_dict[word][1]+alpha)/(num_pos_words+(tot_words)))
  else:
    return math.log((word_dict[word][0]+alpha)/(num_neg_words+(tot_words)))

In [254]:
cond_prob('rotten','silverlining')


-13.593358390067088

In [277]:
def review_probability(reviewType, review):
  if reviewType == 'fresh':
    prob=math.log(pos_reviews/len(rt_df))
  else:
    prob=math.log(neg_reviews/len(rt_df))
  preprocessed_reviews=[]
  preprocessed_reviews=preprocessing(review)
  for word in preprocessed_reviews:
     prob=prob+cond_prob(reviewType,word)
  return prob
    

In [309]:
print(review_probability('fresh','Absolutely awful in absolutely the most boring way possible'))
print(review_probability('rotten','Absolutely awful in absolutely the most boring way possible'))

-51.81855502305216
-47.82086749034597


In [312]:
def predict_class(sentence):
 if review_probability('fresh',sentence) > review_probability('rotten',sentence):
   return "fresh"
 else:
    return "rotten"

In [313]:
predict_class('Absolutely awful in absolutely the most boring way possible')

'rotten'

In [353]:
def topmost_prob(pos_word):
  p={}
  for i in range(len(pos_word)):
   num=(cond_prob('fresh',pos_word[i]))
   p[pos_word[i]]=num
  return p

In [368]:
def topmost_negprob(neg_word):
  p={}
  for i in range(len(neg_word)):
   num=(cond_prob('rotten',neg_word[i]))
   p[neg_word[i]]=num
  return p

**Top 10 words in class=Fresh**

In [365]:
p=topmost_prob(pos_words)
top10_words=sorted(p.items(),key=lambda x:x[1],reverse=True)
print("Top 10 words         P[class='fresh'|word]")
for i, (k,v) in enumerate(top10_words[:10]):
 print(k,"                    ", v)


Top 10 words         P[class='fresh'|word]
s                      -3.4810641526239197
film                      -4.217400321190946
movie                      -4.6706566689524145
one                      -5.026120046446683
nt                      -5.08798168409165
story                      -5.362444406027385
like                      -5.416950187533028
make                      -5.467388108289406
performance                      -5.626556866090487
character                      -5.713828673711295


**Top 10 words in class=Rotten**

In [369]:
p=topmost_negprob(neg_words)
top10_words=sorted(p.items(),key=lambda x:x[1],reverse=True)
print("Top 10 words         P[class='rotten'|word]")
for i, (k,v) in enumerate(top10_words[:10]):
 print(k,"                    ", v)

Top 10 words         P[class='rotten'|word]
s                      -3.5105955826734117
film                      -4.400011594467477
movie                      -4.43531181281204
nt                      -4.544890098017452
like                      -5.023477876371845
one                      -5.17731238065581
much                      -5.457132835158628
story                      -5.475151340661307
make                      -5.485788941907875
character                      -5.5453691477260225


**Accuracy on dev dataset**

In [284]:
correct_pred=0
incorrect_pred=0
freshness=list(dev_set['Freshness'])
for i, word in enumerate(list(dev_set['Review'])):
  if predict_class(word) == freshness[i]:
    correct_pred+=1
  else:
    incorrect_pred+=1

In [317]:
print("Accuracy on Dev dataset: ",correct_pred/(correct_pred+incorrect_pred))

Accuracy on Dev dataset:  0.7960416666666666


**Testing on Test dataset**

In [370]:
correct_pred=0
incorrect_pred=0
freshness=list(test_set['Freshness'])
for i, word in enumerate(list(test_set['Review'])):
  if predict_class(word) == freshness[i]:
    correct_pred+=1
  else:
    incorrect_pred+=1

In [371]:
print("Accuracy on Test dataset: ",correct_pred/(correct_pred+incorrect_pred))

Accuracy on Test dataset:  0.79725
