<a href="https://www.kaggle.com/code/nsff591/imdb-sentiment-analysis-bow-tfidf-0-8726?scriptVersionId=97815935" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Natural Language Processing: Sentiment Analysis (IMDB reviews)

## Importing the Libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

import re
import nltk
import sys

from IPython.display import clear_output

print("Importing Complete!")

## Importing the Dataset

In [None]:
# Load the IMDB reviews CSV from the Kaggle input directory into a DataFrame.
dataset_path = '../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv'
imdb_data = pd.read_csv(dataset_path)

# Report the (rows, columns) shape, then preview the first ten rows.
print(imdb_data.shape)
imdb_data.head(10)



## Cleaning the Text

In [None]:
# Fetch the NLTK stopword corpus so stopwords.words(...) can be used
# by the cleaning step.
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from bs4 import BeautifulSoup
# pickle is used to cache the cleaned reviews on disk.
import pickle

In [None]:
# Clean every review (strip HTML, drop bracketed text and non-alphanumeric
# characters, lowercase, remove stopwords, stem) and collect the results.
imdb_data_len = len(imdb_data.iloc[:, 0])

# 'not' is kept out of the stopword list because it carries sentiment.
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')
# Build the lookup set once instead of once per word inside the loop.
stopword_set = set(all_stopwords)

ps = PorterStemmer()

# Raw strings avoid invalid-escape warnings; compiling hoists the patterns
# out of the loop.
bracket_pattern = re.compile(r'\[[^]]*\]')
# The range must be A-Z: the original 'A-z' also matched the ASCII
# characters between 'Z' and 'a' ([ \ ] ^ _ `), so they were never removed.
non_alnum_pattern = re.compile(r'[^a-zA-Z0-9]')

# new list with cleaned data
corpus = []
for raw_review in imdb_data['review']:
  review = BeautifulSoup(raw_review, "html.parser").get_text()
  review = bracket_pattern.sub(' ', review)
  review = non_alnum_pattern.sub(' ', review)
  review = review.lower()
  review = review.split()

  review = [ps.stem(word) for word in review if word not in stopword_set]
  review = ' '.join(review)

  corpus.append(review)

print("Done cleaning the data!")

### We save the cleaned reviews into a pickle file so we don't have to clean them every time we run the code

In [None]:
# Cache the cleaned corpus on disk; the context manager guarantees the
# file is flushed and closed even if pickling fails.
with open("./imdb_cleaned_reviews.p", "wb") as corpus_file:
  pickle.dump(corpus, corpus_file)

In [None]:
# Reload the cached corpus, closing the file handle when done.
# NOTE: only unpickle files this notebook wrote itself; unpickling
# untrusted data can execute arbitrary code.
with open("./imdb_cleaned_reviews.p", "rb") as corpus_file:
  corpus = pickle.load(corpus_file)

In [None]:
# Sanity check: show the first cleaned review.
first_cleaned_review = corpus[0]
print(first_cleaned_review)

## Creating the bag of words model

In [None]:
# Bag-of-words features, with the vocabulary capped at 1500 terms.
cv = CountVectorizer(max_features=1500)
bow_matrix = cv.fit_transform(corpus)

# Dense count matrix used as model input.
x = bow_matrix.toarray()
# Labels are taken from the second column of the dataset.
y = imdb_data.iloc[:, 1].values

In [None]:
# Number of bag-of-words features per review (columns of x).
print(x.shape[1])

In [None]:
# TF-IDF features over the same corpus, also capped at 1500 terms.
tv = TfidfVectorizer(max_features=1500)
tfidf_matrix = tv.fit_transform(corpus)
x_tv = tfidf_matrix.toarray()

## Splitting the data into training and test sets

In [None]:
# Hold out 20% of the reviews for testing. Both feature sets are split
# with the same seed and size so they share the same options.
split_options = {'test_size': 0.2, 'random_state': 0}
x_train, x_test, y_train, y_test = train_test_split(x, y, **split_options)
x_tv_train, x_tv_test, y_tv_train, y_tv_test = train_test_split(x_tv, y, **split_options)

## Training the Naive Bayes model on the training set

In [None]:
# One Gaussian Naive Bayes model per feature representation.
classifier = GaussianNB()      # bag-of-words
classifier_tv = GaussianNB()   # TF-IDF

classifier.fit(x_train, y_train)
classifier_tv.fit(x_tv_train, y_tv_train)

## Predicting the test set results

In [None]:
# Predict on the bag-of-words test set and print predictions (first
# column) next to the true labels (second column).
y_pred = classifier.predict(x_test)
predicted_vs_actual = np.column_stack((y_pred, y_test))
print(predicted_vs_actual)

## Making the Confusion matrix

In [None]:
# Bag-of-words Naive Bayes: confusion matrix and accuracy on the test set.
y_pred = classifier.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
bow_accuracy = accuracy_score(y_test, y_pred)
print(cm)
print(bow_accuracy)

# TF-IDF Naive Bayes: same metrics on the TF-IDF test set.
y_tv_pred = classifier_tv.predict(x_tv_test)
cm_tv = confusion_matrix(y_tv_test, y_tv_pred)
tv_accuracy = accuracy_score(y_tv_test, y_tv_pred)
print(cm_tv)
print(tv_accuracy)

## Reviewing the model

In [None]:
# classification_report matches target_names to the class labels in sorted
# order. The labels are the strings 'negative' and 'positive', so the
# display names must be listed as Negative first, Positive second;
# the previous order attached each name to the wrong row.
report_names = ['Negative', 'Positive']

#Classification report for bag of words
nb_bow_report = classification_report(y_test, y_pred, target_names=report_names)
print(nb_bow_report)

#Classification report for tfidf features
nb_tv_report = classification_report(y_tv_test, y_tv_pred, target_names=report_names)
print(nb_tv_report)

## Training other models: Support Vector Machine (RBF)

In [None]:
from sklearn.svm import SVC

# Shared settings for both RBF-kernel SVMs; probability=True enables
# predict_proba on the fitted models.
# NOTE(review): max_iter=5 caps the solver at five iterations, so the fit
# will likely stop before converging - confirm this trade-off is intended.
svm_options = {'kernel': 'rbf', 'random_state': 0, 'max_iter': 5, 'probability': True}

classifier_svm = SVC(**svm_options)      # bag-of-words
classifier_svm_tv = SVC(**svm_options)   # TF-IDF

classifier_svm.fit(x_train, y_train)
classifier_svm_tv.fit(x_tv_train, y_tv_train)

In [None]:
# Bag-of-words SVM: confusion matrix and accuracy on the test set.
y_pred_svm = classifier_svm.predict(x_test)
cm = confusion_matrix(y_test, y_pred_svm)
print(cm)
print(accuracy_score(y_test, y_pred_svm))

# TF-IDF SVM: predict with classifier_svm_tv. The previous code called
# classifier_tv (the Naive Bayes model), so the "SVM" numbers for TF-IDF
# were really the Naive Bayes results.
y_tv_pred_svm = classifier_svm_tv.predict(x_tv_test)
cm_tv = confusion_matrix(y_tv_test, y_tv_pred_svm)
print(cm_tv)
print(accuracy_score(y_tv_test, y_tv_pred_svm))

In [None]:
# classification_report matches target_names to the class labels in sorted
# order ('negative' before 'positive'), so Negative must come first;
# the previous order attached each name to the wrong row.
report_names = ['Negative', 'Positive']

#Classification report for bag of words
svm_bow_report = classification_report(y_test, y_pred_svm, target_names=report_names)
print(svm_bow_report)

#Classification report for tfidf features
svm_tv_report = classification_report(y_tv_test, y_tv_pred_svm, target_names=report_names)
print(svm_tv_report)

## Making use of both Models

### Coverage of the models over the test data if we used both models in a perfect world

In [None]:
# Oracle coverage: a test review counts as covered when at least one of the
# two TF-IDF models (Naive Bayes, SVM) predicts its label correctly.
# Each model predicts the whole test set in one call instead of one call
# per review, and the TF-IDF labels are used to match the TF-IDF features.
total = len(x_tv_test)

nb_hits = classifier_tv.predict(x_tv_test) == y_tv_test
svm_hits = classifier_svm_tv.predict(x_tv_test) == y_tv_test

# Percentage of reviews that at least one model got right.
accuracy = int(np.count_nonzero(nb_hits | svm_hits)) / total * 100

print(accuracy)

## Training other models: Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Shared settings for both logistic regression models.
lr_options = {'max_iter': 500, 'random_state': 0}

classifier_LR = LogisticRegression(**lr_options)      # bag-of-words
classifier_tv_LR = LogisticRegression(**lr_options)   # TF-IDF

classifier_LR.fit(x_train, y_train)
classifier_tv_LR.fit(x_tv_train, y_tv_train)

In [None]:
# Bag-of-words logistic regression: confusion matrix and accuracy.
y_pred_LR = classifier_LR.predict(x_test)
cm = confusion_matrix(y_test, y_pred_LR)
bow_accuracy_LR = accuracy_score(y_test, y_pred_LR)
print(cm)
print(bow_accuracy_LR)

# TF-IDF logistic regression: same metrics on the TF-IDF test set.
y_tv_pred_LR = classifier_tv_LR.predict(x_tv_test)
cm_tv = confusion_matrix(y_tv_test, y_tv_pred_LR)
tv_accuracy_LR = accuracy_score(y_tv_test, y_tv_pred_LR)
print(cm_tv)
print(tv_accuracy_LR)

In [None]:
# classification_report matches target_names to the class labels in sorted
# order ('negative' before 'positive'), so Negative must come first;
# the previous order attached each name to the wrong row.
report_names = ['Negative', 'Positive']

#Classification report for bag of words
LR_bow_report = classification_report(y_test, y_pred_LR, target_names=report_names)
print(LR_bow_report)

#Classification report for tfidf features
LR_tv_report = classification_report(y_tv_test, y_tv_pred_LR, target_names=report_names)
print(LR_tv_report)

## Training other models: Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the TF-IDF features: 100 trees, entropy split criterion.
rf_options = {'n_estimators': 100, 'criterion': 'entropy', 'random_state': 0}
classifier_tv_rf = RandomForestClassifier(**rf_options)
classifier_tv_rf.fit(x_tv_train, y_tv_train)

In [None]:
# TF-IDF random forest: confusion matrix and accuracy on the test set.
y_tv_pred_rf = classifier_tv_rf.predict(x_tv_test)
cm_tv = confusion_matrix(y_tv_test, y_tv_pred_rf)
tv_accuracy_rf = accuracy_score(y_tv_test, y_tv_pred_rf)
print(cm_tv)
print(tv_accuracy_rf)

## Making use of 4 Models

### Coverage of the models over the test data if we used 4 models in a perfect world

In [None]:
# Oracle coverage: a test review counts as covered when at least one of the
# four TF-IDF models predicts its label correctly. Each model predicts the
# whole test set in one call instead of one call per review.
tfidf_models = (classifier_tv, classifier_svm_tv, classifier_tv_LR, classifier_tv_rf)

total = len(x_tv_test)
covered = np.zeros(total, dtype=bool)
for model in tfidf_models:
  # Mark every review this model classifies correctly.
  covered |= (model.predict(x_tv_test) == y_tv_test)

# Percentage of reviews that at least one model got right.
accuracy = int(np.count_nonzero(covered)) / total * 100

print(accuracy)

### New prediction method making use of the 3 models in an ensemble algorithm

I did not use the Naive Bayes model, as it did not benefit the ensemble method in any way.

The ensemble method, which averages the predicted probabilities of the 3 models, did not improve the prediction accuracy compared to logistic regression alone.


In [None]:
def _negative_proba(model, arr):
  """Return the model's probability of the 'negative' class for each row of arr."""
  # Look the column up through classes_ rather than assuming it is column 0.
  negative_column = list(model.classes_).index('negative')
  return model.predict_proba(arr)[:, negative_column]


def _ensemble_negative_proba(arr):
  """Return, per row of arr, the mean 'negative' probability of the three models."""
  lr = _negative_proba(classifier_tv_LR, arr)
  svm = _negative_proba(classifier_svm_tv, arr)
  rf = _negative_proba(classifier_tv_rf, arr)
  return (lr + svm + rf) / 3


def newPredict(arr):
  """Classify one TF-IDF row with the averaged three-model ensemble.

  arr is a 2-D array holding a single review, e.g. row.reshape(1, -1).
  Returns 'negative' when the mean 'negative' probability of the logistic
  regression, SVM and random forest models is at least 0.5, and
  'positive' otherwise.
  """
  if _ensemble_negative_proba(arr)[0] >= 0.5:
    return 'negative'
  else:
    return 'positive'


# Score the ensemble on the whole TF-IDF test set in one batch rather than
# calling predict_proba three times per review.
total = len(x_tv_test)
ensemble_pred = np.where(_ensemble_negative_proba(x_tv_test) >= 0.5, 'negative', 'positive')

accuracy = int(np.count_nonzero(ensemble_pred == y_tv_test)) / total * 100

print(accuracy)

## Word frequency for positive review words

In [None]:
# x_tv has one row per review and one column per TF-IDF feature.
imdb_data_len, reviewLength = x_tv.shape

# Per-feature running totals for each sentiment class.
positive_list = [0 for _ in range(reviewLength)]
negative_list = [0 for _ in range(reviewLength)]

# Number of reviews seen in each class.
positive_amount = negative_amount = 0

def addVectorizedReviewRow(lst, reviewLst, reviewLength):
  """Add the first reviewLength entries of reviewLst onto lst, element-wise.

  lst is updated in place and also returned.
  """
  for position in range(reviewLength):
    running_total = lst[position]
    addend = reviewLst[position]
    lst[position] = running_total + addend
  return lst

def normalizeVectorizedReviewRow(lst, reviewLength, amount):
  """Divide the first reviewLength entries of lst by amount.

  lst is updated in place and also returned.
  """
  for position in range(reviewLength):
    current = lst[position]
    lst[position] = current / amount
  return lst

# Average TF-IDF weight of every feature within each sentiment class.
# Boolean masks let NumPy sum the rows per class directly, replacing the
# Python loop that touched every matrix element one at a time.
positive_mask = (y == 'positive')
negative_mask = ~positive_mask  # every non-'positive' label counts as negative

positive_amount = int(np.count_nonzero(positive_mask))
negative_amount = int(np.count_nonzero(negative_mask))

# Column sums per class, divided by the class size; kept as plain lists.
positive_list = (x_tv[positive_mask].sum(axis=0) / positive_amount).tolist()
negative_list = (x_tv[negative_mask].sum(axis=0) / negative_amount).tolist()

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; prefer its
# replacement and fall back only on versions that predate it.
if hasattr(tv, 'get_feature_names_out'):
  feature_names = tv.get_feature_names_out()
else:
  feature_names = tv.get_feature_names()

# One row per stemmed word, holding its mean TF-IDF weight in positive
# reviews; keep the 50 highest.
bow = pd.DataFrame(positive_list, index=feature_names)
positive_text = bow.sort_values(by=0, ascending=False).iloc[:50]

font = {'size'   : 20}

plt.rc('font', **font)

fig, ax = plt.subplots(figsize=(20, 10))

# Bar chart: words on the x-axis, their weights (column 0) as bar heights.
ax.bar(positive_text.index.values, positive_text[0].values)
plt.xticks(rotation=75)
plt.xlabel("Stemmed Word")
plt.ylabel("Word_Frequency(Normalized)")
plt.show()


## Word frequency for negative review words

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; prefer its
# replacement and fall back only on versions that predate it.
if hasattr(tv, 'get_feature_names_out'):
  feature_names = tv.get_feature_names_out()
else:
  feature_names = tv.get_feature_names()

# One row per stemmed word, holding its mean TF-IDF weight in negative
# reviews; keep the 50 highest.
bow = pd.DataFrame(negative_list, index=feature_names)
negative_text = bow.sort_values(by=0, ascending=False).iloc[:50]

font = {'size'   : 20}

plt.rc('font', **font)

fig, ax = plt.subplots(figsize=(20, 10))

# Bar chart: words on the x-axis, their weights (column 0) as bar heights.
ax.bar(negative_text.index.values, negative_text[0].values)
plt.xticks(rotation=75)
plt.xlabel("Stemmed Word")
plt.ylabel("Word_Frequency(Normalized)")
plt.show()