# Import ingredients


In [7]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import os
import random
from tqdm.notebook import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, plot_confusion_matrix
import spacy
import en_core_web_sm
from nltk.stem import WordNetLemmatizer 
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from bs4 import BeautifulSoup as bs
import string
from nltk import pos_tag
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Loading Data

In [2]:
df = pd.read_pickle('Imdb_movie_reviews_database.pckl')

## Exploring the data - 1

In [75]:
df.shape

(50000, 2)

In [76]:
df

Unnamed: 0,review,sentiment
0,Bromwell High is a cartoon comedy. It ran at t...,1
1,Homelessness (or Houselessness as George Carli...,1
2,Brilliant over-acting by Lesley Ann Warren. Be...,1
3,This is easily the most underrated film inn th...,1
4,This is not the typical Mel Brooks film. It wa...,1
...,...,...
49995,I occasionally let my kids watch this garbage ...,0
49996,When all we have anymore is pretty much realit...,0
49997,The basic genre is a thriller intercut with an...,0
49998,Four things intrigued me as to this film - fir...,0


In [77]:
df.head(30)

Unnamed: 0,review,sentiment
0,Bromwell High is a cartoon comedy. It ran at t...,1
1,Homelessness (or Houselessness as George Carli...,1
2,Brilliant over-acting by Lesley Ann Warren. Be...,1
3,This is easily the most underrated film inn th...,1
4,This is not the typical Mel Brooks film. It wa...,1
5,"This isn't the comedic Robin Williams, nor is ...",1
6,Yes its an art... to successfully make a slow ...,1
7,"In this ""critically acclaimed psychological th...",1
8,THE NIGHT LISTENER (2006) **1/2 Robin Williams...,1
9,"You know, Robin Williams, God bless him, is co...",1


In [3]:
# renamed column
df = df.rename(columns={'review':'reviews'})

In [79]:

df.columns

Index(['reviews', 'sentiment'], dtype='object')

In [80]:
# Check reviews to see if they correspond to sentiment.
df['sentiment'][49000]

0

In [81]:
df.isnull().sum

<bound method DataFrame.sum of        reviews  sentiment
0        False      False
1        False      False
2        False      False
3        False      False
4        False      False
...        ...        ...
49995    False      False
49996    False      False
49997    False      False
49998    False      False
49999    False      False

[50000 rows x 2 columns]>

In [82]:
pos_reviews = df[df['sentiment']==1]['reviews']
neg_reviews = df[df['sentiment']==0]['reviews']
print("First 10 samples of positive reviews\n".format(),pos_reviews[:10])
print("First 10 samples of negative reviews\n".format(),neg_reviews[:10])

First 10 samples of positive reviews
 0    Bromwell High is a cartoon comedy. It ran at t...
1    Homelessness (or Houselessness as George Carli...
2    Brilliant over-acting by Lesley Ann Warren. Be...
3    This is easily the most underrated film inn th...
4    This is not the typical Mel Brooks film. It wa...
5    This isn't the comedic Robin Williams, nor is ...
6    Yes its an art... to successfully make a slow ...
7    In this "critically acclaimed psychological th...
8    THE NIGHT LISTENER (2006) **1/2 Robin Williams...
9    You know, Robin Williams, God bless him, is co...
Name: reviews, dtype: object
First 10 samples of negative reviews
 12500    Story of a man who has unnatural feelings for ...
12501    Airport '77 starts as a brand new luxury 747 p...
12502    This film lacked something I couldn't put my f...
12503    Sorry everyone,,, I know this is supposed to b...
12504    When I was little my parents took me along to ...
12505    "It appears that many critics find the id

In [83]:
df = df.sample(frac = 0.1, replace = False, random_state=42)

In [84]:
df.head()

Unnamed: 0,reviews,sentiment
33553,"When I first saw the ad for this, I was like '...",1
9427,"""A Girl's Folly"" is a sort of half-comedy, hal...",1
199,I started watching the show from the first sea...,1
12447,This is a more interesting than usual porn mov...,1
39489,I suppose for 1961 this film was supposed to b...,0


In [85]:
df = df.reset_index(drop=True)

In [86]:
df.head

<bound method NDFrame.head of                                                 reviews  sentiment
0     When I first saw the ad for this, I was like '...          1
1     "A Girl's Folly" is a sort of half-comedy, hal...          1
2     I started watching the show from the first sea...          1
3     This is a more interesting than usual porn mov...          1
4     I suppose for 1961 this film was supposed to b...          0
...                                                 ...        ...
4995  his has to surely be one of the worst gay-them...          0
4996  I wanted to punch the TV. Watching it was tort...          0
4997  i was enjoying this movie most of the time, bu...          0
4998  Sometimes when a film is panned by the critics...          0
4999  well after watching this i can say that it ain...          0

[5000 rows x 2 columns]>

In [87]:
pos_reviews = df[df['sentiment']==1]['reviews']
neg_reviews = df[df['sentiment']==0]['reviews']
print("First 10 samples of positive reviews\n".format(),pos_reviews[:10])
print("First 10 samples of negative reviews\n".format(),neg_reviews[:10])

First 10 samples of positive reviews
 0     When I first saw the ad for this, I was like '...
1     "A Girl's Folly" is a sort of half-comedy, hal...
2     I started watching the show from the first sea...
3     This is a more interesting than usual porn mov...
6     I saw this movie when it was new. Later I rent...
8     First, what I didn't like. The acting was not ...
9     As spectacle, it's hard to fault Nihon chinbot...
12    It starts slowly, showing the dreary lives of ...
13    A year after losing gorgeous Jane Parker (Maur...
15    After "Attack of the Fifty Foot Woman" with Al...
Name: reviews, dtype: object
First 10 samples of negative reviews
 4     I suppose for 1961 this film was supposed to b...
5     This is a poor film. It certainly belongs in t...
7     This meandering tale of mob revenge is simply ...
10    4 Oscar winners, Karl Malden, Sally Field, Shi...
11    Horror films are a curious thing, sometimes th...
14    'Nobody knows anybody' is a conspiracy theory ...

In [88]:
df['reviews'][8]

'First, what I didn\'t like. The acting was not really up to the Hamlet standard. Branagh was really over-the-top, doing a lot of yelling mostly. In my opinion, those actors who were not big-name celebrities generally did a better job; though I would except Billy Crystal and Robin Williams. (And Charlton Heston, too, but I wasn\'t sure if he was playing at being a hack.) A lot of the ambiguities in the play were clearly resolved one way in the flashbacks.<br /><br />What I think speaks very much in this play\'s favor is that it is accessible. Shakespeare is hard to understand for the vast majority of people nowadays; many people are not even inclined to try, because of its reputation as Serious Literature and its archaic English. If they see this film they will understand clearly at least one man\'s interpretation of the play. They will be seeing it more as Shakespeare\'s audiences saw it: a play with sword fights and battles, and mighty kings and nobles, murder and incest and evil sch

# Natural Language Processing Pipeline 

Cleaning: 
* Tokenize the text (break text down into sentences, words, or other units). 
* Removing punctiation, whitespaces and other noise from sentence.
* Remove words that contain numbers.
* Removing stop words like “if,” “but,” “or,” and so on.
* Part-Of-Speech (POS) tagging: assign a tag to every word to define if it corresponds to a noun, a verb etc. using the WordNet lexical database.
*   Normalizing/Lemmatizing  words by condensing all forms of a word to its root form (e.g. rooms -> room, slept -> sleep).

Feature Engineering:
*   Vectorizing text by turning the text into a numerical representation for consumption by your classifier.



### Cleaning

In [72]:
example_text = df['reviews'][8]
example_text

"THE NIGHT LISTENER (2006) **1/2 Robin Williams, Toni Collette, Bobby Cannavale, Rory Culkin, Joe Morton, Sandra Oh, John Cullum, Lisa Emery, Becky Ann Baker. (Dir: Patrick Stettner) <br /><br />Hitchcockian suspenser gives Williams a stand-out low-key performance.<br /><br />What is it about celebrities and fans? What is the near paranoia one associates with the other and why is it almost the norm? <br /><br />In the latest derange fan scenario, based on true events no less, Williams stars as a talk-radio personality named Gabriel No one, who reads stories he's penned over the airwaves and has accumulated an interesting fan in the form of a young boy named Pete Logand (Culkin) who has submitted a manuscript about the travails of his troubled youth to No one's editor Ashe (Morton) who gives it to No one to read for himself. <br /><br />No one is naturally disturbed but ultimately intrigued about the nightmarish existence of Pete being abducted and sexually abused for years until he was

In [73]:
#make text lower case
lower_case = example_text.lower()
lower_case

"the night listener (2006) **1/2 robin williams, toni collette, bobby cannavale, rory culkin, joe morton, sandra oh, john cullum, lisa emery, becky ann baker. (dir: patrick stettner) <br /><br />hitchcockian suspenser gives williams a stand-out low-key performance.<br /><br />what is it about celebrities and fans? what is the near paranoia one associates with the other and why is it almost the norm? <br /><br />in the latest derange fan scenario, based on true events no less, williams stars as a talk-radio personality named gabriel no one, who reads stories he's penned over the airwaves and has accumulated an interesting fan in the form of a young boy named pete logand (culkin) who has submitted a manuscript about the travails of his troubled youth to no one's editor ashe (morton) who gives it to no one to read for himself. <br /><br />no one is naturally disturbed but ultimately intrigued about the nightmarish existence of pete being abducted and sexually abused for years until he was

In [74]:
#remove html
plain_text = bs(lower_case)
plain_text = plain_text.get_text()
plain_text

"the night listener (2006) **1/2 robin williams, toni collette, bobby cannavale, rory culkin, joe morton, sandra oh, john cullum, lisa emery, becky ann baker. (dir: patrick stettner) hitchcockian suspenser gives williams a stand-out low-key performance.what is it about celebrities and fans? what is the near paranoia one associates with the other and why is it almost the norm? in the latest derange fan scenario, based on true events no less, williams stars as a talk-radio personality named gabriel no one, who reads stories he's penned over the airwaves and has accumulated an interesting fan in the form of a young boy named pete logand (culkin) who has submitted a manuscript about the travails of his troubled youth to no one's editor ashe (morton) who gives it to no one to read for himself. no one is naturally disturbed but ultimately intrigued about the nightmarish existence of pete being abducted and sexually abused for years until he was finally rescued by a nurse named donna (collett

In [75]:
# filter for just words
import re
select_words = re.split(r'\W+',plain_text)
select_words = str(select_words)

In [76]:
select_words

"['the', 'night', 'listener', '2006', '1', '2', 'robin', 'williams', 'toni', 'collette', 'bobby', 'cannavale', 'rory', 'culkin', 'joe', 'morton', 'sandra', 'oh', 'john', 'cullum', 'lisa', 'emery', 'becky', 'ann', 'baker', 'dir', 'patrick', 'stettner', 'hitchcockian', 'suspenser', 'gives', 'williams', 'a', 'stand', 'out', 'low', 'key', 'performance', 'what', 'is', 'it', 'about', 'celebrities', 'and', 'fans', 'what', 'is', 'the', 'near', 'paranoia', 'one', 'associates', 'with', 'the', 'other', 'and', 'why', 'is', 'it', 'almost', 'the', 'norm', 'in', 'the', 'latest', 'derange', 'fan', 'scenario', 'based', 'on', 'true', 'events', 'no', 'less', 'williams', 'stars', 'as', 'a', 'talk', 'radio', 'personality', 'named', 'gabriel', 'no', 'one', 'who', 'reads', 'stories', 'he', 's', 'penned', 'over', 'the', 'airwaves', 'and', 'has', 'accumulated', 'an', 'interesting', 'fan', 'in', 'the', 'form', 'of', 'a', 'young', 'boy', 'named', 'pete', 'logand', 'culkin', 'who', 'has', 'submitted', 'a', 'manus

In [77]:
# remove punctuations and digits from oldtext
punct = string.punctuation + string.digits
punct

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~0123456789'

In [78]:
table = str.maketrans('', '', punct)
text_punct = str(select_words).translate(table)

text_punct

'the night listener    robin williams toni collette bobby cannavale rory culkin joe morton sandra oh john cullum lisa emery becky ann baker dir patrick stettner hitchcockian suspenser gives williams a stand out low key performance what is it about celebrities and fans what is the near paranoia one associates with the other and why is it almost the norm in the latest derange fan scenario based on true events no less williams stars as a talk radio personality named gabriel no one who reads stories he s penned over the airwaves and has accumulated an interesting fan in the form of a young boy named pete logand culkin who has submitted a manuscript about the travails of his troubled youth to no one s editor ashe morton who gives it to no one to read for himself no one is naturally disturbed but ultimately intrigued about the nightmarish existence of pete being abducted and sexually abused for years until he was finally rescued by a nurse named donna collette giving an excellent performance

In [79]:
# tokenise text
tokens = [word for word in text_punct.split(" ")]
tokens

['the',
 'night',
 'listener',
 '',
 '',
 '',
 'robin',
 'williams',
 'toni',
 'collette',
 'bobby',
 'cannavale',
 'rory',
 'culkin',
 'joe',
 'morton',
 'sandra',
 'oh',
 'john',
 'cullum',
 'lisa',
 'emery',
 'becky',
 'ann',
 'baker',
 'dir',
 'patrick',
 'stettner',
 'hitchcockian',
 'suspenser',
 'gives',
 'williams',
 'a',
 'stand',
 'out',
 'low',
 'key',
 'performance',
 'what',
 'is',
 'it',
 'about',
 'celebrities',
 'and',
 'fans',
 'what',
 'is',
 'the',
 'near',
 'paranoia',
 'one',
 'associates',
 'with',
 'the',
 'other',
 'and',
 'why',
 'is',
 'it',
 'almost',
 'the',
 'norm',
 'in',
 'the',
 'latest',
 'derange',
 'fan',
 'scenario',
 'based',
 'on',
 'true',
 'events',
 'no',
 'less',
 'williams',
 'stars',
 'as',
 'a',
 'talk',
 'radio',
 'personality',
 'named',
 'gabriel',
 'no',
 'one',
 'who',
 'reads',
 'stories',
 'he',
 's',
 'penned',
 'over',
 'the',
 'airwaves',
 'and',
 'has',
 'accumulated',
 'an',
 'interesting',
 'fan',
 'in',
 'the',
 'form',
 'of',


In [80]:
# remove stop words
stop = stopwords.words("english")
stop_tokens = []
for token in tokens:
    if token not in stop:
      stop_tokens.append(token)
    

In [82]:
stop

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [81]:
stop_tokens

['night',
 'listener',
 '',
 '',
 '',
 'robin',
 'williams',
 'toni',
 'collette',
 'bobby',
 'cannavale',
 'rory',
 'culkin',
 'joe',
 'morton',
 'sandra',
 'oh',
 'john',
 'cullum',
 'lisa',
 'emery',
 'becky',
 'ann',
 'baker',
 'dir',
 'patrick',
 'stettner',
 'hitchcockian',
 'suspenser',
 'gives',
 'williams',
 'stand',
 'low',
 'key',
 'performance',
 'celebrities',
 'fans',
 'near',
 'paranoia',
 'one',
 'associates',
 'almost',
 'norm',
 'latest',
 'derange',
 'fan',
 'scenario',
 'based',
 'true',
 'events',
 'less',
 'williams',
 'stars',
 'talk',
 'radio',
 'personality',
 'named',
 'gabriel',
 'one',
 'reads',
 'stories',
 'penned',
 'airwaves',
 'accumulated',
 'interesting',
 'fan',
 'form',
 'young',
 'boy',
 'named',
 'pete',
 'logand',
 'culkin',
 'submitted',
 'manuscript',
 'travails',
 'troubled',
 'youth',
 'one',
 'editor',
 'ashe',
 'morton',
 'gives',
 'one',
 'read',
 'one',
 'naturally',
 'disturbed',
 'ultimately',
 'intrigued',
 'nightmarish',
 'existence',

In [83]:
# remove empty tokens
stop_tokens = [token for token in stop_tokens if len(token) > 0]
stop_tokens

['night',
 'listener',
 'robin',
 'williams',
 'toni',
 'collette',
 'bobby',
 'cannavale',
 'rory',
 'culkin',
 'joe',
 'morton',
 'sandra',
 'oh',
 'john',
 'cullum',
 'lisa',
 'emery',
 'becky',
 'ann',
 'baker',
 'dir',
 'patrick',
 'stettner',
 'hitchcockian',
 'suspenser',
 'gives',
 'williams',
 'stand',
 'low',
 'key',
 'performance',
 'celebrities',
 'fans',
 'near',
 'paranoia',
 'one',
 'associates',
 'almost',
 'norm',
 'latest',
 'derange',
 'fan',
 'scenario',
 'based',
 'true',
 'events',
 'less',
 'williams',
 'stars',
 'talk',
 'radio',
 'personality',
 'named',
 'gabriel',
 'one',
 'reads',
 'stories',
 'penned',
 'airwaves',
 'accumulated',
 'interesting',
 'fan',
 'form',
 'young',
 'boy',
 'named',
 'pete',
 'logand',
 'culkin',
 'submitted',
 'manuscript',
 'travails',
 'troubled',
 'youth',
 'one',
 'editor',
 'ashe',
 'morton',
 'gives',
 'one',
 'read',
 'one',
 'naturally',
 'disturbed',
 'ultimately',
 'intrigued',
 'nightmarish',
 'existence',
 'pete',
 'abd

In [84]:
# pos tag text
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [85]:
pos_tags = pos_tag(stop_tokens)
pos_tags

[('night', 'NN'),
 ('listener', 'NN'),
 ('robin', 'NN'),
 ('williams', 'VBZ'),
 ('toni', 'JJ'),
 ('collette', 'NN'),
 ('bobby', 'NN'),
 ('cannavale', 'NN'),
 ('rory', 'NN'),
 ('culkin', 'NN'),
 ('joe', 'NN'),
 ('morton', 'NN'),
 ('sandra', 'NN'),
 ('oh', 'UH'),
 ('john', 'NN'),
 ('cullum', 'NN'),
 ('lisa', 'JJ'),
 ('emery', 'NN'),
 ('becky', 'NN'),
 ('ann', 'NN'),
 ('baker', 'NN'),
 ('dir', 'NN'),
 ('patrick', 'NN'),
 ('stettner', 'NN'),
 ('hitchcockian', 'JJ'),
 ('suspenser', 'NN'),
 ('gives', 'VBZ'),
 ('williams', 'NNS'),
 ('stand', 'VBP'),
 ('low', 'JJ'),
 ('key', 'JJ'),
 ('performance', 'NN'),
 ('celebrities', 'NNS'),
 ('fans', 'NNS'),
 ('near', 'IN'),
 ('paranoia', 'JJ'),
 ('one', 'CD'),
 ('associates', 'VBZ'),
 ('almost', 'RB'),
 ('norm', 'JJ'),
 ('latest', 'JJS'),
 ('derange', 'NN'),
 ('fan', 'NN'),
 ('scenario', 'NN'),
 ('based', 'VBN'),
 ('true', 'JJ'),
 ('events', 'NNS'),
 ('less', 'RBR'),
 ('williams', 'JJ'),
 ('stars', 'NNS'),
 ('talk', 'VBP'),
 ('radio', 'NN'),
 ('personal

In [104]:
# return the wordnet object value corresponding to the POS tag
from nltk.corpus import wordnet

def get_wordnet_pos(pos_tag):
    if pos_tag.startswith('J'):
        return wordnet.ADJ
    elif pos_tag.startswith('V'):
        return wordnet.VERB
    elif pos_tag.startswith('N'):
        return wordnet.NOUN
    elif pos_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [118]:
get_wordnet_pos(token[1])

'n'

In [87]:
wnl = WordNetLemmatizer()
wnl.lemmatize('actors')

'actor'

In [88]:
# lemmatize text
lem_text = []
for token in pos_tags:
   lem_text.append(wnl.lemmatize(token[0], get_wordnet_pos(token[1])))
    

In [89]:
lem_text

['night',
 'listener',
 'robin',
 'williams',
 'toni',
 'collette',
 'bobby',
 'cannavale',
 'rory',
 'culkin',
 'joe',
 'morton',
 'sandra',
 'oh',
 'john',
 'cullum',
 'lisa',
 'emery',
 'becky',
 'ann',
 'baker',
 'dir',
 'patrick',
 'stettner',
 'hitchcockian',
 'suspenser',
 'give',
 'williams',
 'stand',
 'low',
 'key',
 'performance',
 'celebrity',
 'fan',
 'near',
 'paranoia',
 'one',
 'associate',
 'almost',
 'norm',
 'late',
 'derange',
 'fan',
 'scenario',
 'base',
 'true',
 'event',
 'less',
 'williams',
 'star',
 'talk',
 'radio',
 'personality',
 'name',
 'gabriel',
 'one',
 'read',
 'story',
 'pen',
 'airwave',
 'accumulate',
 'interesting',
 'fan',
 'form',
 'young',
 'boy',
 'name',
 'pete',
 'logand',
 'culkin',
 'submit',
 'manuscript',
 'travail',
 'trouble',
 'youth',
 'one',
 'editor',
 'ashe',
 'morton',
 'give',
 'one',
 'read',
 'one',
 'naturally',
 'disturb',
 'ultimately',
 'intrigued',
 'nightmarish',
 'existence',
 'pete',
 'abduct',
 'sexually',
 'abused'

In [90]:
for token in stop_tokens:
  print(f"token: {token}, lemma: {wnl.lemmatize(token)}") 


token: night, lemma: night
token: listener, lemma: listener
token: robin, lemma: robin
token: williams, lemma: williams
token: toni, lemma: toni
token: collette, lemma: collette
token: bobby, lemma: bobby
token: cannavale, lemma: cannavale
token: rory, lemma: rory
token: culkin, lemma: culkin
token: joe, lemma: joe
token: morton, lemma: morton
token: sandra, lemma: sandra
token: oh, lemma: oh
token: john, lemma: john
token: cullum, lemma: cullum
token: lisa, lemma: lisa
token: emery, lemma: emery
token: becky, lemma: becky
token: ann, lemma: ann
token: baker, lemma: baker
token: dir, lemma: dir
token: patrick, lemma: patrick
token: stettner, lemma: stettner
token: hitchcockian, lemma: hitchcockian
token: suspenser, lemma: suspenser
token: gives, lemma: give
token: williams, lemma: williams
token: stand, lemma: stand
token: low, lemma: low
token: key, lemma: key
token: performance, lemma: performance
token: celebrities, lemma: celebrity
token: fans, lemma: fan
token: near, lemma: near
t

In [91]:
 # remove words with only one letter
meaningful_words = []
for token in lem_text:
    if len(token) > 1:
      meaningful_words.append(token)

In [92]:
meaningful_words

['night',
 'listener',
 'robin',
 'williams',
 'toni',
 'collette',
 'bobby',
 'cannavale',
 'rory',
 'culkin',
 'joe',
 'morton',
 'sandra',
 'oh',
 'john',
 'cullum',
 'lisa',
 'emery',
 'becky',
 'ann',
 'baker',
 'dir',
 'patrick',
 'stettner',
 'hitchcockian',
 'suspenser',
 'give',
 'williams',
 'stand',
 'low',
 'key',
 'performance',
 'celebrity',
 'fan',
 'near',
 'paranoia',
 'one',
 'associate',
 'almost',
 'norm',
 'late',
 'derange',
 'fan',
 'scenario',
 'base',
 'true',
 'event',
 'less',
 'williams',
 'star',
 'talk',
 'radio',
 'personality',
 'name',
 'gabriel',
 'one',
 'read',
 'story',
 'pen',
 'airwave',
 'accumulate',
 'interesting',
 'fan',
 'form',
 'young',
 'boy',
 'name',
 'pete',
 'logand',
 'culkin',
 'submit',
 'manuscript',
 'travail',
 'trouble',
 'youth',
 'one',
 'editor',
 'ashe',
 'morton',
 'give',
 'one',
 'read',
 'one',
 'naturally',
 'disturb',
 'ultimately',
 'intrigued',
 'nightmarish',
 'existence',
 'pete',
 'abduct',
 'sexually',
 'abused'

**To be able to replicate this for all reviews I have defined a function below:**

In [93]:
# return the wordnet object value corresponding to the POS tag
from nltk.corpus import wordnet

def get_wordnet_pos(pos_tag):

    '''
    Function to return the corresponding wordnet object value of the 
    'Part Of Speach tag' i.e "thing : NN" corresponds to Noun in wordnet.
    Input: string (pos tag)
    Output: string (a wordnet object)
    '''

    if pos_tag.startswith('J'):
        return wordnet.ADJ
    elif pos_tag.startswith('V'):
        return wordnet.VERB
    elif pos_tag.startswith('N'):
        return wordnet.NOUN
    elif pos_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
    
import string
import re
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer

def review_cleaner(text):

    '''
    Function to convert a review to a tokenised string of words.
    Input: string (a raw movie review)
    Output: string (a preprocessed movie review)
    '''

    # lower text
    text = text.lower()
    # split by just words
    text = re.split(r'\W+',text)
    text = " ".join(text)
    # tokenize text and remove puncutation
    text = [word.strip(string.punctuation) for word in text.split(" ")]
    # remove words that contain numbers
    text = [word for word in text if not any(c.isdigit() for c in word)]
    # remove stop words
    stop = stopwords.words('english')
    text = [x for x in text if x not in stop]
    # remove empty tokens
    text = [t for t in text if len(t) > 0]
    # pos tag text
    pos_tags = pos_tag(text)
    # lemmatize text
    text = [WordNetLemmatizer().lemmatize(t[0], get_wordnet_pos(t[1])) for t in pos_tags]
    # remove words with only one letter
    text = [t for t in text if len(t) > 1]
    # join all
    text = " ".join(text)
    return(text)



In [94]:
example_text_1 = df['reviews'][8]
example_text_1 

"THE NIGHT LISTENER (2006) **1/2 Robin Williams, Toni Collette, Bobby Cannavale, Rory Culkin, Joe Morton, Sandra Oh, John Cullum, Lisa Emery, Becky Ann Baker. (Dir: Patrick Stettner) <br /><br />Hitchcockian suspenser gives Williams a stand-out low-key performance.<br /><br />What is it about celebrities and fans? What is the near paranoia one associates with the other and why is it almost the norm? <br /><br />In the latest derange fan scenario, based on true events no less, Williams stars as a talk-radio personality named Gabriel No one, who reads stories he's penned over the airwaves and has accumulated an interesting fan in the form of a young boy named Pete Logand (Culkin) who has submitted a manuscript about the travails of his troubled youth to No one's editor Ashe (Morton) who gives it to No one to read for himself. <br /><br />No one is naturally disturbed but ultimately intrigued about the nightmarish existence of Pete being abducted and sexually abused for years until he was

In [95]:
review_cleaner(example_text_1)

'night listener robin williams toni collette bobby cannavale rory culkin joe morton sandra oh john cullum lisa emery becky ann baker dir patrick stettner br br hitchcockian suspenser give williams stand low key performance br br celebrity fan near paranoia one associate almost norm br br late derange fan scenario base true event less williams star talk radio personality name gabriel one read story pen airwave accumulate interesting fan form young boy name pete logand culkin submit manuscript travail trouble youth one editor ashe morton give one read br br one naturally disturb ultimately intrigued nightmarish existence pete abduct sexually abused year finally rescue nurse name donna collette give excellent performance adopt boy correspondence one reveal pete die aid naturally one want meet fan suddenly doubt possibly devious ulterior motif seed plant estrange lover jess cannavale whose sudden departure new york city apartment one emotional tailspin grow tempest teacup decides investiga

In [96]:
df

Unnamed: 0,reviews,sentiment,review_clean
0,Bromwell High is a cartoon comedy. It ran at t...,1,bromwell high cartoon comedy run time program ...
1,Homelessness (or Houselessness as George Carli...,1,homelessness houselessness george carlin state...
2,Brilliant over-acting by Lesley Ann Warren. Be...,1,brilliant act lesley ann warren best dramatic ...
3,This is easily the most underrated film inn th...,1,easily underrated film inn brook cannon sure f...
4,This is not the typical Mel Brooks film. It wa...,1,typical mel brook film much less slapstick mov...
...,...,...,...
49995,I occasionally let my kids watch this garbage ...,0,occasionally let kid watch garbage understand ...
49996,When all we have anymore is pretty much realit...,0,anymore pretty much reality tv show people mak...
49997,The basic genre is a thriller intercut with an...,0,basic genre thriller intercut uncomfortable me...
49998,Four things intrigued me as to this film - fir...,0,four thing intrigue film firstly star carly po...


In [97]:
# clean text data
df["review_clean"] = df["reviews"].apply(lambda x: review_cleaner(x))

In [103]:
df

Unnamed: 0,reviews,sentiment,review_clean
0,Bromwell High is a cartoon comedy. It ran at t...,1,bromwell high cartoon comedy run time program ...
1,Homelessness (or Houselessness as George Carli...,1,homelessness houselessness george carlin state...
2,Brilliant over-acting by Lesley Ann Warren. Be...,1,brilliant act lesley ann warren best dramatic ...
3,This is easily the most underrated film inn th...,1,easily underrated film inn brook cannon sure f...
4,This is not the typical Mel Brooks film. It wa...,1,typical mel brook film much less slapstick mov...
...,...,...,...
49995,I occasionally let my kids watch this garbage ...,0,occasionally let kid watch garbage understand ...
49996,When all we have anymore is pretty much realit...,0,anymore pretty much reality tv show people mak...
49997,The basic genre is a thriller intercut with an...,0,basic genre thriller intercut uncomfortable me...
49998,Four things intrigued me as to this film - fir...,0,four thing intrigue film firstly star carly po...


In [67]:
df.to_pickle('Imdb_movie_reviews_database_cleaned_v1.pckl')