# Import ingredients


In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import random
from tqdm.notebook import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, plot_confusion_matrix
import spacy
import en_core_web_sm
from nltk.stem import WordNetLemmatizer 
import nltk
nltk.download('wordnet')
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer



[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...




## Loading Data

In [None]:
# Load the cleaned IMDB review data from a pickle file (path is relative to the working directory).
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
df = pd.read_pickle('final_project/Imdb_movie_reviews_database_cleaned_v1.pckl')

## Exploring the data - 1

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(50000, 3)

In [None]:
# Preview the loaded frame: raw review text, sentiment label and the pre-cleaned text.
df

Unnamed: 0,reviews,sentiment,review_clean
0,Bromwell High is a cartoon comedy. It ran at t...,1,bromwell high cartoon comedy run time program ...
1,Homelessness (or Houselessness as George Carli...,1,homelessness houselessness george carlin state...
2,Brilliant over-acting by Lesley Ann Warren. Be...,1,brilliant act lesley ann warren best dramatic ...
3,This is easily the most underrated film inn th...,1,easily underrated film inn brook cannon sure f...
4,This is not the typical Mel Brooks film. It wa...,1,typical mel brook film much less slapstick mov...
...,...,...,...
49995,I occasionally let my kids watch this garbage ...,0,occasionally let kid watch garbage understand ...
49996,When all we have anymore is pretty much realit...,0,anymore pretty much reality tv show people mak...
49997,The basic genre is a thriller intercut with an...,0,basic genre thriller intercut uncomfortable me...
49998,Four things intrigued me as to this film - fir...,0,four thing intrigue film firstly star carly po...


In [None]:
# Read the review stored under index label 0 in full, to judge by eye
# whether its text agrees with its sentiment label.
df.loc[0, 'reviews']

'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!'

In [None]:
# Split the raw review text by label (1 = positive, 0 = negative) and show
# the first ten reviews of each class.
pos_reviews = df.loc[df['sentiment'] == 1, 'reviews']
neg_reviews = df.loc[df['sentiment'] == 0, 'reviews']

# sep="\n" puts the Series on its own line; the default separator (a space)
# would indent and misalign the first row of the printed Series.
print("First 10 samples of positive reviews", pos_reviews.head(10), sep="\n")
print("First 10 samples of negative reviews", neg_reviews.head(10), sep="\n")

First 10 samples of positive reviews
 0    Bromwell High is a cartoon comedy. It ran at t...
1    Homelessness (or Houselessness as George Carli...
2    Brilliant over-acting by Lesley Ann Warren. Be...
3    This is easily the most underrated film inn th...
4    This is not the typical Mel Brooks film. It wa...
5    This isn't the comedic Robin Williams, nor is ...
6    Yes its an art... to successfully make a slow ...
7    In this "critically acclaimed psychological th...
8    THE NIGHT LISTENER (2006) **1/2 Robin Williams...
9    You know, Robin Williams, God bless him, is co...
Name: reviews, dtype: object
First 10 samples of negative reviews
 12500    Story of a man who has unnatural feelings for ...
12501    Airport '77 starts as a brand new luxury 747 p...
12502    This film lacked something I couldn't put my f...
12503    Sorry everyone,,, I know this is supposed to b...
12504    When I was little my parents took me along to ...
12505    "It appears that many critics find the id

In [None]:
# The rows are grouped by sentiment label, so shuffle them.
# frac=1 samples every row; random_state=6 makes the permutation reproducible;
# reset_index(drop=True) replaces the old index with a fresh 0..n-1 index.
# Note: re-running this cell shuffles the already-shuffled frame again.
df = df.sample(frac=1, random_state=6).reset_index(drop=True)

In [None]:
# Check the shuffle: the first rows should now mix both sentiment labels.
df.head()

Unnamed: 0,reviews,sentiment,review_clean
0,"Having just finished reading the book ""One of ...",0,finish reading book one kind week ago think wo...
1,WEEE this is still jolly good fun! As with mos...,1,weee still jolly good fun friend see movie hbo...
2,Well I guess I know the answer to that questio...,0,well guess know answer question money bombard ...
3,This is easily one of the best movies of the 1...,1,easily one best movie otto preminger direct fo...
4,The story is about Ankush (Abhay Deol) - who i...,1,story ankush abhay deol professional marriage ...


In [None]:
# Repeat the per-class preview on the shuffled frame: the index labels shown
# for each class should now be interleaved rather than contiguous.
pos_reviews = df.loc[df['sentiment'] == 1, 'reviews']
neg_reviews = df.loc[df['sentiment'] == 0, 'reviews']

# sep="\n" puts the Series on its own line; the default separator (a space)
# would indent and misalign the first row of the printed Series.
print("First 10 samples of positive reviews", pos_reviews.head(10), sep="\n")
print("First 10 samples of negative reviews", neg_reviews.head(10), sep="\n")

First 10 samples of positive reviews
 1     WEEE this is still jolly good fun! As with mos...
3     This is easily one of the best movies of the 1...
4     The story is about Ankush (Abhay Deol) - who i...
12    Gene Tierney and Dana Andrews, who were both s...
13    A message movie, but a rather good one. Outsta...
14    The silent film masterpiece Battleship Potemki...
15    i loved this movie it was one of the years bes...
16    I suppose I always felt that Hotel du Nord was...
18    gone in 60 seconds is a very good action comed...
19    For a long time I did not know weather I liked...
Name: reviews, dtype: object
First 10 samples of negative reviews
 0     Having just finished reading the book "One of ...
2     Well I guess I know the answer to that questio...
5     This is the third parody of the scary movies a...
6     in fact,it's basically the same movie.and they...
7     Comedy Central has a habit of putting on great...
8     My comment is for the Russian version of Space...

In [None]:
# Inspect the last five rows of the shuffled frame as well.
df.tail(5)

Unnamed: 0,reviews,sentiment,review_clean
49995,"Troma founder, Lloyd Kaufman is The Crapkeeper...",0,troma founder lloyd kaufman crapkeeper antholo...
49996,whereas the hard-boiled detective stories of D...,1,whereas hard boil detective story dashiell ham...
49997,"""Undercurrent"" features a top-notch cast of wo...",0,undercurrent feature top notch cast wonderful ...
49998,For those expecting the cover art and story ou...,0,expect cover art story outline indicate anothe...
49999,I came across this movie on TV. I hadn't heard...,1,come across movie tv heard almost change chann...


## Vectorisation Feature Engineering
### Following steps borrowed from Jonathan Oheix "Detecting Bad Customer Reviews with NLP"

In [None]:
# Add sentiment analysis columns: VADER polarity scores of the raw review text.
# SentimentIntensityAnalyzer is already imported in the notebook's first cell.
sid = SentimentIntensityAnalyzer()

# polarity_scores returns one dict per review; building the frame once from the
# list of dicts avoids the very slow row-by-row `.apply(pd.Series)` expansion.
vader_scores = pd.DataFrame(
    [sid.polarity_scores(review) for review in df["reviews"]],
    index=df.index,  # keep rows aligned with df for the concat below
)

# Drop score columns left over from a previous run of this cell so that
# re-running it does not append duplicate columns.
df = pd.concat(
    [df.drop(columns=vader_scores.columns, errors="ignore"), vader_scores],
    axis=1,
)

In [None]:
df

Unnamed: 0,reviews,sentiment,review_clean,neg,neu,pos,compound
0,"Having just finished reading the book ""One of ...",0,finish reading book one kind week ago think wo...,0.071,0.820,0.109,0.9364
1,WEEE this is still jolly good fun! As with mos...,1,weee still jolly good fun friend see movie hbo...,0.079,0.747,0.175,0.7895
2,Well I guess I know the answer to that questio...,0,well guess know answer question money bombard ...,0.137,0.718,0.145,-0.4044
3,This is easily one of the best movies of the 1...,1,easily one best movie otto preminger direct fo...,0.079,0.737,0.184,0.9184
4,The story is about Ankush (Abhay Deol) - who i...,1,story ankush abhay deol professional marriage ...,0.018,0.735,0.247,0.9992
...,...,...,...,...,...,...,...
49995,"Troma founder, Lloyd Kaufman is The Crapkeeper...",0,troma founder lloyd kaufman crapkeeper antholo...,0.045,0.827,0.129,0.8709
49996,whereas the hard-boiled detective stories of D...,1,whereas hard boil detective story dashiell ham...,0.134,0.768,0.098,-0.9811
49997,"""Undercurrent"" features a top-notch cast of wo...",0,undercurrent feature top notch cast wonderful ...,0.096,0.770,0.133,0.4890
49998,For those expecting the cover art and story ou...,0,expect cover art story outline indicate anothe...,0.069,0.772,0.159,0.9929


In [None]:
# Feature: length of each raw review, in characters.
df["nb_char"] = df["reviews"].map(len)

In [None]:
# Check the new nb_char column on the first rows.
df.head()

Unnamed: 0,reviews,sentiment,review_clean,neg,neu,pos,compound,nb_char
0,"Having just finished reading the book ""One of ...",0,finish reading book one kind week ago think wo...,0.071,0.82,0.109,0.9364,1629
1,WEEE this is still jolly good fun! As with mos...,1,weee still jolly good fun friend see movie hbo...,0.079,0.747,0.175,0.7895,418
2,Well I guess I know the answer to that questio...,0,well guess know answer question money bombard ...,0.137,0.718,0.145,-0.4044,1227
3,This is easily one of the best movies of the 1...,1,easily one best movie otto preminger direct fo...,0.079,0.737,0.184,0.9184,628
4,The story is about Ankush (Abhay Deol) - who i...,1,story ankush abhay deol professional marriage ...,0.018,0.735,0.247,0.9992,2628


In [None]:
# Feature: number of space-separated tokens in each raw review.
# The count is (number of " " characters) + 1, i.e. the text is split on single
# spaces only, so consecutive spaces contribute empty tokens to the total.
df["nb_words"] = df["reviews"].map(lambda text: text.count(" ") + 1)

In [None]:
# Preview the frame with the new nb_char and nb_words columns.
df

Unnamed: 0,reviews,sentiment,review_clean,neg,neu,pos,compound,nb_char,nb_words
0,"Having just finished reading the book ""One of ...",0,finish reading book one kind week ago think wo...,0.071,0.820,0.109,0.9364,1629,274
1,WEEE this is still jolly good fun! As with mos...,1,weee still jolly good fun friend see movie hbo...,0.079,0.747,0.175,0.7895,418,84
2,Well I guess I know the answer to that questio...,0,well guess know answer question money bombard ...,0.137,0.718,0.145,-0.4044,1227,218
3,This is easily one of the best movies of the 1...,1,easily one best movie otto preminger direct fo...,0.079,0.737,0.184,0.9184,628,112
4,The story is about Ankush (Abhay Deol) - who i...,1,story ankush abhay deol professional marriage ...,0.018,0.735,0.247,0.9992,2628,482
...,...,...,...,...,...,...,...,...,...
49995,"Troma founder, Lloyd Kaufman is The Crapkeeper...",0,troma founder lloyd kaufman crapkeeper antholo...,0.045,0.827,0.129,0.8709,686,119
49996,whereas the hard-boiled detective stories of D...,1,whereas hard boil detective story dashiell ham...,0.134,0.768,0.098,-0.9811,4896,799
49997,"""Undercurrent"" features a top-notch cast of wo...",0,undercurrent feature top notch cast wonderful ...,0.096,0.770,0.133,0.4890,655,101
49998,For those expecting the cover art and story ou...,0,expect cover art story outline indicate anothe...,0.069,0.772,0.159,0.9929,2709,465


Extract a vector representation for every review. The Gensim module creates a numerical vector for each word in the corpus from the contexts in which it appears (Word2Vec); Doc2Vec applies the same idea to whole documents, giving one vector per review.

In [None]:
# Create Doc2Vec vector columns from the cleaned review text.
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Tokenise each cleaned review once and reuse the tokens for training and inference.
tokenised_reviews = [text.split(" ") for text in df["review_clean"]]

# Tag every review with its position so Doc2Vec can tell the documents apart.
documents = [TaggedDocument(tokens, [tag]) for tag, tokens in enumerate(tokenised_reviews)]

# Train a Doc2Vec model on the tagged reviews (5-dimensional document vectors).
# NOTE(review): no seed is set, so the vectors presumably vary between runs — confirm.
model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)

# Infer one vector per review and build the feature frame in a single step;
# this avoids the slow row-by-row `.apply(pd.Series)` expansion.
doc2vec_df = pd.DataFrame(
    [model.infer_vector(tokens) for tokens in tokenised_reviews],
    index=df.index,  # keep rows aligned with df for the concat below
)
doc2vec_df.columns = ["doc2vec_vector_" + str(x) for x in doc2vec_df.columns]

# Drop vector columns left over from a previous run of this cell so that
# re-running it does not append duplicate columns.
df = pd.concat(
    [df.drop(columns=doc2vec_df.columns, errors="ignore"), doc2vec_df],
    axis=1,
)

In [None]:
# Preview the frame with the five doc2vec_vector_* columns appended.
df

Unnamed: 0,reviews,sentiment,review_clean,neg,neu,pos,compound,nb_char,nb_words,doc2vec_vector_0,doc2vec_vector_1,doc2vec_vector_2,doc2vec_vector_3,doc2vec_vector_4
0,"Having just finished reading the book ""One of ...",0,finish reading book one kind week ago think wo...,0.071,0.820,0.109,0.9364,1629,274,-0.406136,0.168257,-0.180792,0.076184,0.573996
1,WEEE this is still jolly good fun! As with mos...,1,weee still jolly good fun friend see movie hbo...,0.079,0.747,0.175,0.7895,418,84,-0.080176,0.015398,-0.327595,0.015183,0.200012
2,Well I guess I know the answer to that questio...,0,well guess know answer question money bombard ...,0.137,0.718,0.145,-0.4044,1227,218,-0.288821,0.038533,-0.339843,0.126955,0.638754
3,This is easily one of the best movies of the 1...,1,easily one best movie otto preminger direct fo...,0.079,0.737,0.184,0.9184,628,112,0.460414,0.148562,-0.364555,0.304457,0.196561
4,The story is about Ankush (Abhay Deol) - who i...,1,story ankush abhay deol professional marriage ...,0.018,0.735,0.247,0.9992,2628,482,-0.155596,0.596693,-0.176754,0.119736,0.500467
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,"Troma founder, Lloyd Kaufman is The Crapkeeper...",0,troma founder lloyd kaufman crapkeeper antholo...,0.045,0.827,0.129,0.8709,686,119,-0.196917,0.072463,-0.089229,0.297636,0.275315
49996,whereas the hard-boiled detective stories of D...,1,whereas hard boil detective story dashiell ham...,0.134,0.768,0.098,-0.9811,4896,799,0.348139,1.858647,0.282963,-0.526497,-0.842359
49997,"""Undercurrent"" features a top-notch cast of wo...",0,undercurrent feature top notch cast wonderful ...,0.096,0.770,0.133,0.4890,655,101,0.178532,0.607149,0.030661,0.109154,0.217802
49998,For those expecting the cover art and story ou...,0,expect cover art story outline indicate anothe...,0.069,0.772,0.159,0.9929,2709,465,0.674226,1.163354,-0.167812,-0.479264,0.588338


In [None]:
# Save the frame, now holding the VADER, length and Doc2Vec features, as a pickle in the working directory.
df.to_pickle("Imdb_doc2vec_v1.pckl")

In [None]:
# Add TF-IDF columns: set up the vectorizer.
from sklearn.feature_extraction.text import TfidfVectorizer
# min_df=100: per the scikit-learn docs, an integer min_df ignores terms that
# appear in fewer than that many documents, which keeps the vocabulary small.
tfidf = TfidfVectorizer(min_df = 100)

In [None]:
# Fit the vectorizer on the cleaned reviews and compute their TF-IDF matrix.
# .toarray() converts the result into a dense array, so every (review, term)
# cell is held in memory — this is the memory-heavy step of the notebook.
tfidf_result = tfidf.fit_transform(df["review_clean"]).toarray()

In [None]:
# Build a frame with one column per vocabulary term.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older versions.
feature_names = (
    tfidf.get_feature_names_out()
    if hasattr(tfidf, "get_feature_names_out")
    else tfidf.get_feature_names()
)
tfidf_df = pd.DataFrame(tfidf_result, columns=feature_names)

In [None]:
# Prefix every term column with "word_" so the TF-IDF features are easy to
# tell apart from the other feature columns.
# Note: re-running this cell adds the prefix a second time.
tfidf_df.columns = [f"word_{term}" for term in tfidf_df.columns]

In [None]:
# Give tfidf_df the same index as df so the rows line up if the two frames
# are later concatenated column-wise.
tfidf_df.index = df.index

In [None]:
# Preview the TF-IDF feature frame (one row per review, one column per term).
tfidf_df

Unnamed: 0,word_aaron,word_abandon,word_abc,word_ability,word_able,word_aboard,word_abound,word_abraham,word_abrupt,word_abruptly,word_absence,word_absent,word_absolute,word_absolutely,word_absorb,word_absurd,word_absurdity,word_abuse,word_abusive,word_abysmal,word_academy,word_accent,word_accept,word_acceptable,word_acceptance,word_access,word_accessible,word_accident,word_accidentally,word_acclaim,word_accompany,word_accomplish,word_accord,word_account,word_accuracy,word_accurate,word_accurately,word_accuse,word_ace,word_achieve,...,word_worthless,word_worthwhile,word_worthy,word_would,word_wound,word_wow,word_wrap,word_wreck,word_wrench,word_wrestle,word_wretched,word_write,word_writer,word_writing,word_wrong,word_wtf,word_wwii,word_www,word_ya,word_yard,word_yawn,word_yeah,word_year,word_yearn,word_yell,word_yellow,word_yep,word_yes,word_yesterday,word_yet,word_york,word_young,word_youngster,word_youth,word_youthful,word_youtube,word_zero,word_zombie,word_zone,word_zoom
0,0.0,0.0,0.0,0.091224,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.072996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.079872,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.063739,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.110298,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.262566,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.086481,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.063412,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.054113,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.063364,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.092902,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49996,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.054386,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.017113,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.027313,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49997,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.117905,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49998,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.062864,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.036543,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Save the TF-IDF features to their own pickle file in the working directory.
tfidf_df.to_pickle("Imdb_tfidf.pckl")

In [None]:
#df = pd.concat([df, tfidf_df], axis=1)