# Sentiment Analysis Using NLP

In [48]:
import pandas as pd
import numpy as np
import glob
import re

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import nltk
import time
from bs4 import BeautifulSoup
import unicodedata

from sklearn.utils import shuffle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

import tqdm
from mlxtend.evaluate import paired_ttest_5x2cv
import pickle

import warnings
warnings.filterwarnings('ignore')

# Extract Data

In [49]:
train_positive_docs = glob.glob("aclImdb/train/pos/*.txt")
train_negative_docs = glob.glob("aclImdb/train/neg/*.txt")

train_positive_list = []
for i in train_positive_docs:
    file = open(i, "r")
    str = file.readline()
    clean = re.compile('<.*?>')
    str = re.sub(clean, ' ', str)
    train_positive_list.append(str)
    
train_negative_list = []
for i in train_negative_docs:
    file = open(i, "r")
    str = file.readline()
    clean = re.compile('<.*?>')
    str = re.sub(clean, ' ', str)
    train_negative_list.append(str)

In [50]:
labels = ['id', 'review', 'label']
dataframe_train_positive = pd.DataFrame()
dataframe_train_positive['review'] = train_positive_list
dataframe_train_positive['label'] = 'positive'
print(dataframe_train_positive.shape)
dataframe_train_negative = pd.DataFrame()
dataframe_train_negative['review'] = train_negative_list
dataframe_train_negative['label'] = 'negative'
print(dataframe_train_negative.shape)
dataframe_train = pd.concat([dataframe_train_positive , dataframe_train_negative])
dataframe_train=shuffle(dataframe_train)

(12500, 2)
(12500, 2)


In [51]:
dataframe_train.head()

Unnamed: 0,review,label
5944,This is probably one of the worst French movie...,negative
7732,"Need a lesson in pure, abject failure?? Look n...",negative
3534,"A truly, truly dire Canadian-German co-product...",negative
7958,SEX WISH was actually released (minus ten minu...,negative
3770,Sudden Impact was overall better than The Enfo...,positive


In [52]:
def eliminate_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    refined_text = soup.get_text()
    return refined_text

In [53]:
def eliminate_accent_characters(text):
    refined_text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return refined_text

In [54]:
def eliminate_special_characters(text):
    refined_text = re.sub('[^a-zA-z0-9\s]', '', text)
    return refined_text

In [55]:
def lemmatize_text(text):
    listOfWords = [WordNetLemmatizer().lemmatize(word) for word in text]
    return listOfWords

In [56]:
def eliminate_stopwords(text, is_lower_case=False):
    listOfWords = word_tokenize(text)
    tokens = [word for word in listOfWords if word.isalpha()]
    stopword_list = stopwords.words('english')
    stopword_list.remove('no')
    stopword_list.remove('not')
    if is_lower_case:
        refined_tokens = [token for token in tokens if token not in stopword_list]
    else:
        refined_tokens = [token for token in tokens if token.lower() not in stopword_list] 
    return refined_tokens

In [57]:
def concatenate(text):
    return " ".join(text)

In [58]:
def preprocessing_dataframe_text(dataframe):
    dataframe["review"] = dataframe["review"].apply(eliminate_html_tags)
    dataframe['review'] = dataframe['review'].apply(eliminate_accent_characters)
    dataframe['review'] = dataframe['review'].str.lower()    
    dataframe['review'] = dataframe['review'].apply(eliminate_special_characters)
    dataframe['review'] = dataframe['review'].apply(eliminate_stopwords, True)
    dataframe["review"] = dataframe["review"].apply(lemmatize_text)
    dataframe["review"] = dataframe["review"].apply(concatenate)
    return dataframe

In [59]:
preprocessed_data = preprocessing_dataframe_text(dataframe_train)

In [60]:
preprocessed_data.head()

Unnamed: 0,review,label
5944,probably one worst french movie seen far among...,negative
7732,need lesson pure abject failure look no wizard...,negative
3534,truly truly dire canadiangerman coproduction e...,negative
7958,sex wish actually released minus ten minute ah...,negative
3770,sudden impact overall better enforcer opinion ...,positive


In [61]:
X_train=preprocessed_data['review'][:15000]
y_train=preprocessed_data['label'][:15000]

X_test = preprocessed_data['review'][15000:]
y_test = preprocessed_data['label'][15000:]

print(X_train.shape)
print(y_train.shape)

print(y_train.unique())

print(X_test.shape)
print(y_test.shape)

(15000,)
(15000,)
['negative' 'positive']
(10000,)
(10000,)


# Comparing Different Vectorization Techniques

In [62]:
CountVec = CountVectorizer()
BoW_fv_train=CountVec.fit_transform(X_train)
BoW_fv_test=CountVec.transform(X_test)
BoW_fv = CountVec.fit_transform(preprocessed_data['review'])


In [63]:
hashing = HashingVectorizer(strip_accents='ascii', lowercase=True, preprocessor=None,n_features=BoW_fv.shape[1])
hashing_fv_train=hashing.fit_transform(X_train)
hashing_fv_test=hashing.transform(X_test)


In [64]:
tfidf = TfidfVectorizer(strip_accents='ascii', lowercase=True, preprocessor=None,max_features=BoW_fv.shape[1])
tfidf_fv_train=tfidf.fit_transform(X_train)
tfidf_fv_test=tfidf.transform(X_test)
tfidf_fv_train.shape

(15000, 74836)

In [65]:
results=pd.DataFrame()

## Logistic Regression with BoW

In [66]:
time_beginning_of_training = time.time()

model_lr_bow=LogisticRegression(random_state=0)
train_feature = model_lr_bow.fit(BoW_fv_train, y_train)

test_feature_lr_bow = model_lr_bow.predict(BoW_fv_test)

train_acc=model_lr_bow.score(BoW_fv_train, y_train)
print('Train accuracy {}'.format(train_acc))
test_acc=model_lr_bow.score(BoW_fv_test, y_test)
print('Test accuracy {}'.format(test_acc))

time_end_of_training = time.time()
print('Time to run: {}'.format(time_end_of_training-time_beginning_of_training))

Train accuracy 0.9994
Test accuracy 0.8768
Time to run: 1.703455924987793


In [67]:
df = pd.DataFrame({})
df.reset_index(inplace=True)
df["title"]=["Logistic Regression BoW"]
df["sample_size"]=[25000]
df["train_acc"]=train_acc
df["test_acc"]=test_acc
df.drop(labels="index",axis=1,inplace=True)
print(df)
results=pd.concat([df,results])

                     title  sample_size  train_acc  test_acc
0  Logistic Regression BoW        25000     0.9994    0.8768


In [68]:
print("Classification Report for Logistic Regression with BoW")
print(classification_report(y_test, test_feature_lr_bow))

Classification Report for Logistic Regression with BoW
              precision    recall  f1-score   support

    negative       0.88      0.87      0.88      4973
    positive       0.87      0.88      0.88      5027

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000



## Logistic Regression with Hashing

In [69]:
time_beginning_of_training = time.time()

model_lr_hashing=LogisticRegression(random_state=0)
train_feature = model_lr_hashing.fit(hashing_fv_train, y_train)

test_feature_lr_hashing = model_lr_hashing.predict(hashing_fv_test)

train_acc=model_lr_hashing.score(hashing.transform(X_train), y_train)
print('Train accuracy {}'.format(train_acc))
test_acc=model_lr_hashing.score(hashing.transform(X_test), y_test)
print('Test accuracy {}'.format(test_acc))

time_end_of_training = time.time()
print('Time to run: {}'.format(time_end_of_training-time_beginning_of_training))

Train accuracy 0.9054666666666666
Test accuracy 0.8691
Time to run: 4.513115882873535


In [70]:
df = pd.DataFrame({})
df.reset_index(inplace=True)
df["title"]=["Logistic Regression Hashing"]
df["sample_size"]=[25000]
df["train_acc"]=train_acc
df["test_acc"]=test_acc
df.drop(labels="index",axis=1,inplace=True)
print(df)
results=pd.concat([df,results])

                         title  sample_size  train_acc  test_acc
0  Logistic Regression Hashing        25000   0.905467    0.8691


In [71]:
print("Classification Report for Logistic Regression with hashing")
print(classification_report(y_test, test_feature_lr_hashing))

Classification Report for Logistic Regression with hashing
              precision    recall  f1-score   support

    negative       0.87      0.86      0.87      4973
    positive       0.86      0.88      0.87      5027

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



# Logistic Regression with TfidfVectorizer

In [72]:
time_beginning_of_training = time.time()

model_lr_tfidf=LogisticRegression(random_state=0)
model_lr_tfidf.fit(tfidf_fv_train, y_train)

test_feature_lr_tfidf = model_lr_tfidf.predict(tfidf_fv_test)

train_acc=model_lr_tfidf.score(tfidf.transform(X_train), y_train)
print('Train accuracy {}'.format(train_acc))
test_acc=model_lr_tfidf.score(tfidf.transform(X_test), y_test)
print('Test accuracy {}'.format(test_acc))

time_end_of_training = time.time()
print('Time to run: {}'.format(time_end_of_training-time_beginning_of_training))

Train accuracy 0.9378666666666666
Test accuracy 0.8863
Time to run: 3.5468521118164062


In [73]:
df = pd.DataFrame({})
df.reset_index(inplace=True)
df["title"]=["Logistic Regression Tfidf"]
df["sample_size"]=[25000]
df["train_acc"]=train_acc
df["test_acc"]=test_acc
df.drop(labels="index",axis=1,inplace=True)
print(df)
results=pd.concat([df,results])

                       title  sample_size  train_acc  test_acc
0  Logistic Regression Tfidf        25000   0.937867    0.8863


In [74]:
print("Classification Report for Logistic Regression with Tfidf")
print(classification_report(y_test, test_feature_lr_tfidf))

Classification Report for Logistic Regression with Tfidf
              precision    recall  f1-score   support

    negative       0.90      0.87      0.88      4973
    positive       0.88      0.90      0.89      5027

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



# RandomForestClassifier with Hashing

In [75]:
model_rfc_hashing=RandomForestClassifier(random_state=0)
model_rfc_hashing.fit(hashing.fit_transform(X_train), y_train)

test_model_rfc_hashing = model_rfc_hashing.predict(hashing.transform(X_test))

train_acc=model_rfc_hashing.score(hashing.transform(X_train), y_train)
print('Train accuracy {}'.format(train_acc))
test_acc=model_rfc_hashing.score(hashing.transform(X_test), y_test)
print('Test accuracy {}'.format(test_acc))

Train accuracy 1.0
Test accuracy 0.8502


In [76]:
df = pd.DataFrame({})
df.reset_index(inplace=True)
df["title"]=["RandomForestClassifier Hashing"]
df["sample_size"]=[25000]
df["train_acc"]=train_acc
df["test_acc"]=test_acc
df.drop(labels="index",axis=1,inplace=True)
print(df)
results=pd.concat([df,results])

                            title  sample_size  train_acc  test_acc
0  RandomForestClassifier Hashing        25000        1.0    0.8502


In [77]:
print("Classification Report for RandomForestClassifier with Hashing")
print(classification_report(y_test, test_model_rfc_hashing))

Classification Report for RandomForestClassifier with Hashing
              precision    recall  f1-score   support

    negative       0.84      0.86      0.85      4973
    positive       0.86      0.84      0.85      5027

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



# RandomForestClassifier with Tfidf

In [78]:
model_rfc_tfidf=RandomForestClassifier(random_state=0)
model_rfc_tfidf.fit(tfidf.fit_transform(X_train), y_train)

test_feature_rfc_tfidf = model_rfc_tfidf.predict(tfidf.transform(X_test))

train_acc=model_rfc_tfidf.score(tfidf.transform(X_train), y_train)
print('Train accuracy {}'.format(train_acc))
test_acc=model_rfc_tfidf.score(tfidf.transform(X_test), y_test)
print('Test accuracy {}'.format(test_acc))

Train accuracy 1.0
Test accuracy 0.8413


In [79]:
df = pd.DataFrame({})
df.reset_index(inplace=True)
df["title"]=["RandomForestClassifier Tfidf"]
df["sample_size"]=[25000]
df["train_acc"]=train_acc
df["test_acc"]=test_acc
df.drop(labels="index",axis=1,inplace=True)
print(df)
results=pd.concat([df,results])

                          title  sample_size  train_acc  test_acc
0  RandomForestClassifier Tfidf        25000        1.0    0.8413


In [80]:
print("Classification Report for RandomForestClassifier with Tfidf")
print(classification_report(y_test, test_feature_rfc_tfidf))

Classification Report for RandomForestClassifier with Tfidf
              precision    recall  f1-score   support

    negative       0.83      0.85      0.84      4973
    positive       0.85      0.83      0.84      5027

    accuracy                           0.84     10000
   macro avg       0.84      0.84      0.84     10000
weighted avg       0.84      0.84      0.84     10000



# Linear SVC

## Linear SVC with Hashing

In [81]:
time_beginning_of_training = time.time()

model_linearsvc_hashing =LinearSVC(random_state=0)
model_linearsvc_hashing.fit(hashing_fv_train, y_train)

test_feature_linearsvc_hashing = model_linearsvc_hashing.predict(hashing_fv_test)

train_acc=model_linearsvc_hashing.score(hashing.transform(X_train), y_train)
print('Train accuracy {}'.format(train_acc))
test_acc=model_linearsvc_hashing.score(hashing.transform(X_test), y_test)
print('Test accuracy {}'.format(test_acc))

time_end_of_training = time.time()
print('Time to run: {}'.format(time_end_of_training-time_beginning_of_training))

Train accuracy 0.9738
Test accuracy 0.8841
Time to run: 2.0046639442443848


In [82]:
df = pd.DataFrame({})
df.reset_index(inplace=True)
df["title"]=["LinearSVC Hashing"]
df["sample_size"]=[25000]
df["train_acc"]=train_acc
df["test_acc"]=test_acc
df.drop(labels="index",axis=1,inplace=True)
print(df)
results=pd.concat([df,results])

               title  sample_size  train_acc  test_acc
0  LinearSVC Hashing        25000     0.9738    0.8841


In [83]:
print("Classification Report for LinearSVC with Hashing")
print(classification_report(y_test, test_feature_linearsvc_hashing))

Classification Report for LinearSVC with Hashing
              precision    recall  f1-score   support

    negative       0.89      0.88      0.88      4973
    positive       0.88      0.89      0.89      5027

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000



## Linear SVC with Tfidf

In [84]:
time_beginning_of_training = time.time()

model_linearsvc_tfidf = LinearSVC(random_state=0)
model_linearsvc_tfidf.fit(tfidf.fit_transform(X_train), y_train)

test_model_linearsvc_tfidf = model_linearsvc_tfidf.predict(tfidf.transform(X_test))

train_acc=model_linearsvc_tfidf.score(tfidf.transform(X_train), y_train)
print('Train accuracy {}'.format(train_acc))

test_acc=model_linearsvc_tfidf.score(tfidf.transform(X_test), y_test)
print('Test accuracy {}'.format(test_acc))

time_end_of_training = time.time()
print('Time to run: {}'.format(time_end_of_training-time_beginning_of_training))

Train accuracy 0.9954666666666667
Test accuracy 0.8914
Time to run: 5.162245988845825


In [85]:
df = pd.DataFrame({})
df.reset_index(inplace=True)
df["title"]=["LinearSVC Tfidf"]
df["sample_size"]=[25000]
df["train_acc"]=train_acc
df["test_acc"]=test_acc
df.drop(labels="index",axis=1,inplace=True)
print(df)
results=pd.concat([df,results])

             title  sample_size  train_acc  test_acc
0  LinearSVC Tfidf        25000   0.995467    0.8914


In [86]:
print("Classification Report for LinearSVC with Tfidf")
print(classification_report(y_test, test_model_linearsvc_tfidf))

Classification Report for LinearSVC with Tfidf
              precision    recall  f1-score   support

    negative       0.89      0.89      0.89      4973
    positive       0.89      0.90      0.89      5027

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



# Display Scores

In [87]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

results

Unnamed: 0,title,sample_size,train_acc,test_acc
0,LinearSVC Tfidf,25000,0.995467,0.8914
0,LinearSVC Hashing,25000,0.9738,0.8841
0,RandomForestClassifier Tfidf,25000,1.0,0.8413
0,RandomForestClassifier Hashing,25000,1.0,0.8502
0,Logistic Regression Tfidf,25000,0.937867,0.8863
0,Logistic Regression Hashing,25000,0.905467,0.8691
0,Logistic Regression BoW,25000,0.9994,0.8768


## Null Hypothesis Testing:

In [88]:
t, p = paired_ttest_5x2cv(estimator1=model_lr_tfidf, 
                          estimator2=model_linearsvc_tfidf, 
                          X=tfidf_fv_train, 
                          y=y_train, 
                          scoring='accuracy', 
                          random_seed=1)

print(f'P-value is {p:.3f}')
print(f't-statistics is {t:.3f}')

if p <= 0.05:
    print('Here p<0.05, so we may conclude that performance of this models are significantly different from each other and so rejecting null hypothesis')
else:
    print('Here p>0.05, so we can say that performance of two models are not significatly different and so cannot reject the null hypothesis')

P-value is 0.884
t-statistics is -0.154
Here p>0.05, so we can say that performance of two models are not significatly different and so cannot reject the null hypothesis


In [89]:
t, p = paired_ttest_5x2cv(estimator1=model_linearsvc_hashing, 
                          estimator2=model_rfc_hashing, 
                          X=hashing_fv_train, 
                          y=y_train, 
                          scoring='accuracy', 
                          random_seed=1)

print(f'P-value is {p:.3f}')
print(f't-statistics is {t:.3f}')

if p <= 0.05:
    print('Here p<0.05, so we may conclude that performance of this models are significantly different from each other and so rejecting null hypothesis')
else:
    print('Here p>0.05, so we can say that performance of two models are not significatly different and so cannot reject the null hypothesis')

P-value is 0.039
t-statistics is 2.774
Here p<0.05, so we may conclude that performance of this models are significantly different from each other and so rejecting null hypothesis


In [90]:
review = input("Please enter movie review: ")

Please enter movie review: All I read from these reviews is how it’s “not accurate to the book” that may be true but it’s still a great film, disliking a good movie and rating it low despite the fact of it being a great movie only because of its inaccuracy is just messed up, all though I loved this movie, I will say some of the seal stuff was unrealistic, but besides all that, Michael B Jordan did what he does best and delivered a great character. I understand why people are angry, but in all it was a good film. Also, this could be leading up to a tom Clancy - cinematic - Universe (AKA) TCCU, which I am excited for. If you tell the same story you’ll know what to expect and it’ll deprive the suspense from the film. I can’t wait to see more from this series.


In [91]:
## Negative Review: "Without a doubt the worst movie adaptation of a book I've ever seen.  It's hard to give it one star. The only obvious link to the book is John's change of name from Kelly to Clark at the end.  Clancy wrote a phenomenal page turner that had depth in plot and character development. This movie wasn't more than continuous scenes of gratuitous violence with little, if any meaningful or believable dialogue, and a story line that doesn't really have any relevance to the novel.  I can' believe the Clancy estate, which controls development of  the Tom Clancy related content since his death, would allow this to happen .  Anyone who read the book and saw this film, would easily agree.  Conversely, those who saw the film, without reading the book would walk away without knowledge or appreciation of Clancy's true genius. Amazon has done a much better job with it's Jack Ryan TV series in maintaining the relatable storylines and Clancy style"
## Positive Review: "All I read from these reviews is how it’s “not accurate to the book” that may be true but it’s still a great film, disliking a good movie and rating it low despite the fact of it being a great movie only because of its inaccuracy is just messed up, all though I loved this movie, I will say some of the seal stuff was unrealistic, but besides all that, Michael B Jordan did what he does best and delivered a great character. I understand why people are angry, but in all it was a good film. Also, this could be leading up to a tom Clancy - cinematic - Universe (AKA) TCCU, which I am excited for. If you tell the same story you’ll know what to expect and it’ll deprive the suspense from the film. I can’t wait to see more from this series."

In [92]:
input_vectorizer = TfidfVectorizer(vocabulary=tfidf.vocabulary_)
output_vector = input_vectorizer.fit_transform([review])
predicted_review = model_linearsvc_tfidf.predict(output_vector)
print(predicted_review)

['positive']
