In [1]:
import numpy as np #Linear Algebra
import pandas as pd   #Data processing, CSV file I/O (e.g. pd.read_csv)
import nltk  #NLTK Library

In [2]:
#Loading the dataset
data = pd.read_csv("IMDB_Review.csv")
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
data.shape

(50000, 2)

In [4]:
#Checking the information of the datset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [5]:
#Checking the label attribute which is sentiment
data.sentiment.unique()

array(['positive', 'negative'], dtype=object)

In [6]:
data.sentiment.nunique()

2

In [7]:
data.sentiment.value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [8]:
#Label encode sentiment to 1(positive) and 0(negative)
data.sentiment.replace('positive', 1, inplace=True)
data.sentiment.replace('negative', 0, inplace=True)

#Now checking the head part
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [9]:
#Now checking the review column
data.review[0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

## STEPS TO CLEAN THE REVIEWS :

* Remove HTML tags

* Remove special characters

* Convert everything to lowercase

* Remove stopwords

* Stemming

In [10]:
#1. Remove HTML tags
#Regex rule : "<.*?>"

import re #For regex

def clean(text):
    cleaned = re.compile(r"<.*?>")
    return re.sub(cleaned," ",text)     #Substring replace with " "(space)

data.review = data.review.apply(clean)
data.review[0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.  The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.  It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.  I would say the main appeal of the show is due to the fact that it goes where other sh

In [11]:
#2. Remove special characters like punctuation
# We can use string.punctuation or manually

def is_special(text):
    empty = ''
    for i in text:
        if i.isalnum():
            empty = empty + i
        else:
            empty = empty + ' '
    return empty

data.review = data.review.apply(is_special)
data.review[0]

'One of the other reviewers has mentioned that after watching just 1 Oz episode you ll be hooked  They are right  as this is exactly what happened with me   The first thing that struck me about Oz was its brutality and unflinching scenes of violence  which set in right from the word GO  Trust me  this is not a show for the faint hearted or timid  This show pulls no punches with regards to drugs  sex or violence  Its is hardcore  in the classic use of the word   It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary  It focuses mainly on Emerald City  an experimental section of the prison where all the cells have glass fronts and face inwards  so privacy is not high on the agenda  Em City is home to many  Aryans  Muslims  gangstas  Latinos  Christians  Italians  Irish and more    so scuffles  death stares  dodgy dealings and shady agreements are never far away   I would say the main appeal of the show is due to the fact that it goes where other sh

In [12]:
#3. Convert everything to Lowercase
def lower(text):
    return text.lower()

data.review = data.review.apply(lower)
data.review[0]

'one of the other reviewers has mentioned that after watching just 1 oz episode you ll be hooked  they are right  as this is exactly what happened with me   the first thing that struck me about oz was its brutality and unflinching scenes of violence  which set in right from the word go  trust me  this is not a show for the faint hearted or timid  this show pulls no punches with regards to drugs  sex or violence  its is hardcore  in the classic use of the word   it is called oz as that is the nickname given to the oswald maximum security state penitentary  it focuses mainly on emerald city  an experimental section of the prison where all the cells have glass fronts and face inwards  so privacy is not high on the agenda  em city is home to many  aryans  muslims  gangstas  latinos  christians  italians  irish and more    so scuffles  death stares  dodgy dealings and shady agreements are never far away   i would say the main appeal of the show is due to the fact that it goes where other sh

In [13]:
#4. Removing the stopwords
from nltk.corpus import stopwords  #For stopwords
from nltk.tokenize import word_tokenize

def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    return [w for w in words if w not in stop_words]

data.review = data.review.apply(remove_stopwords)
data.review[0]

['one',
 'reviewers',
 'mentioned',
 'watching',
 '1',
 'oz',
 'episode',
 'hooked',
 'right',
 'exactly',
 'happened',
 'first',
 'thing',
 'struck',
 'oz',
 'brutality',
 'unflinching',
 'scenes',
 'violence',
 'set',
 'right',
 'word',
 'go',
 'trust',
 'show',
 'faint',
 'hearted',
 'timid',
 'show',
 'pulls',
 'punches',
 'regards',
 'drugs',
 'sex',
 'violence',
 'hardcore',
 'classic',
 'use',
 'word',
 'called',
 'oz',
 'nickname',
 'given',
 'oswald',
 'maximum',
 'security',
 'state',
 'penitentary',
 'focuses',
 'mainly',
 'emerald',
 'city',
 'experimental',
 'section',
 'prison',
 'cells',
 'glass',
 'fronts',
 'face',
 'inwards',
 'privacy',
 'high',
 'agenda',
 'em',
 'city',
 'home',
 'many',
 'aryans',
 'muslims',
 'gangstas',
 'latinos',
 'christians',
 'italians',
 'irish',
 'scuffles',
 'death',
 'stares',
 'dodgy',
 'dealings',
 'shady',
 'agreements',
 'never',
 'far',
 'away',
 'would',
 'say',
 'main',
 'appeal',
 'show',
 'due',
 'fact',
 'goes',
 'shows',
 'da

In [14]:
#5. Stemming
from nltk.stem import SnowballStemmer

def stem_text(text):
    snowball = SnowballStemmer('english')
    return " ".join([snowball.stem(w) for w in text])

data.review = data.review.apply(stem_text)
data.review[0]

'one review mention watch 1 oz episod hook right exact happen first thing struck oz brutal unflinch scene violenc set right word go trust show faint heart timid show pull punch regard drug sex violenc hardcor classic use word call oz nicknam given oswald maximum secur state penitentari focus main emerald citi experiment section prison cell glass front face inward privaci high agenda em citi home mani aryan muslim gangsta latino christian italian irish scuffl death stare dodgi deal shadi agreement never far away would say main appeal show due fact goe show dare forget pretti pictur paint mainstream audienc forget charm forget romanc oz mess around first episod ever saw struck nasti surreal say readi watch develop tast oz got accustom high level graphic violenc violenc injustic crook guard sold nickel inmat kill order get away well manner middl class inmat turn prison bitch due lack street skill prison experi watch oz may becom comfort uncomfort view that get touch darker side'

## Creating the Model

In [15]:
data.head()

Unnamed: 0,review,sentiment
0,one review mention watch 1 oz episod hook righ...,1
1,wonder littl product film techniqu unassum old...,1
2,thought wonder way spend time hot summer weeke...,1
3,basic famili littl boy jake think zombi closet...,0
4,petter mattei love time money visual stun film...,1


In [16]:
#Seprating the attributes
X = np.array(data.iloc[:,0].values)
Y = np.array(data.sentiment.values)

In [17]:
print(X.shape,Y.shape)

(50000,) (50000,)


In [18]:
#1. Creating Bag of Words (BOW)
#max_featuresint, default=None
# If not None, build a vocabulary that only consider the top max_features ordered by term frequency across the corpus.

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1000)   # We take only thousand features to stored in a dictionary

In [19]:
X = cv.fit_transform(data.review).toarray()
print("X.shape = ",X.shape)
print("Y.shape = ",Y.shape)

X.shape =  (50000, 1000)
Y.shape =  (50000,)


In [20]:
# Let's look at the vocabulary:
# We can inspect how our vectorizer vectorized the text
# This will print out a list of words used, and their index in the vectors
#For this we can see all our thousand features in form of dictionary
#By using this we can understand that if any new data comes in we can check that new data comes in this vocab or not.
print('Vocabulary: ')
print(cv.vocabulary_)
print()

Vocabulary: 
{'one': 612, 'review': 722, 'mention': 555, 'watch': 955, 'episod': 282, 'right': 727, 'exact': 296, 'happen': 403, 'first': 345, 'thing': 882, 'scene': 745, 'violenc': 945, 'set': 769, 'word': 982, 'go': 386, 'show': 779, 'heart': 411, 'pull': 682, 'drug': 256, 'sex': 771, 'classic': 151, 'use': 932, 'call': 116, 'given': 385, 'state': 827, 'focus': 352, 'main': 529, 'citi': 148, 'prison': 672, 'front': 366, 'face': 310, 'high': 415, 'home': 423, 'mani': 536, 'italian': 462, 'death': 214, 'deal': 213, 'never': 588, 'far': 322, 'away': 72, 'would': 988, 'say': 742, 'appeal': 52, 'due': 257, 'fact': 311, 'goe': 388, 'forget': 356, 'pretti': 670, 'pictur': 643, 'audienc': 67, 'charm': 138, 'romanc': 733, 'mess': 556, 'around': 56, 'ever': 291, 'saw': 741, 'develop': 230, 'tast': 869, 'got': 392, 'level': 504, 'kill': 477, 'order': 615, 'get': 380, 'well': 962, 'manner': 537, 'middl': 559, 'class': 150, 'turn': 916, 'lack': 485, 'street': 840, 'skill': 793, 'experi': 304, 'ma

In [21]:
print(X)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 1 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [22]:
# 2. Train test split
from sklearn.model_selection import train_test_split
train_x,test_x,train_y,test_y = train_test_split(X,Y,test_size=0.2,random_state=9)
print("Train shapes : X = {}, y = {}".format(train_x.shape,train_y.shape))
print("Test shapes : X = {}, y = {}".format(test_x.shape,test_y.shape))

Train shapes : X = (40000, 1000), y = (40000,)
Test shapes : X = (10000, 1000), y = (10000,)


In [23]:
# 3.Defining the modeland training them
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

gnb = GaussianNB()
mnb = MultinomialNB(alpha= 1.0,fit_prior= True)
bnb = BernoulliNB()

gnb.fit(train_x,train_y)
mnb.fit(train_x,train_y)
bnb.fit(train_x,train_y)

BernoulliNB()

In [24]:
# 4. Prediction and accuracy metrics to choose best model
from sklearn.metrics import accuracy_score

pred_gnb = gnb.predict(test_x)
pred_mnb = mnb.predict(test_x)
pred_bnb = bnb.predict(test_x)

print("Gaussian = ", accuracy_score(test_y,pred_gnb))
print("Mutlinomail = ", accuracy_score(test_y,pred_mnb))
print("Bernauli = ", accuracy_score(test_y,pred_bnb))

Gaussian =  0.7846
Mutlinomail =  0.8308
Bernauli =  0.8388


### Pickling data

In [25]:
#Pickling the file
#Saving the data
import pickle
f = open('model_1.pkl','wb')      # 'wb' instead 'w' for binary file  and f is our object
pickle.dump(bnb,f,-1)             # -1 specifies highest binary protocol,bnb is our data and after that our file object
f.close()

In [26]:
#Unpickling the file
#Loading the data
#Optional step
#f = open('model_1.pkl', 'rb')   # 'r' for reading; can be omitted
#mydict = pickle.load(f)
#f.close()
#print(mydict)

In [27]:
#Another way:
#By using with file is automatically closed

#with open('model_1.pkl','wb') as f:
#    pickle.dump(bnb,f,-1)

In [28]:
#with open('model_1.pkl','rb') as f:
#    pickle.load(f)

In [29]:
#print(f)

### Testing the model

In [30]:
review =  """Terrible. Complete trash. Brainless tripe. Insulting to anyone who isn't an 8 year old fan boy. Im actually pretty disgusted that this movie is making the money it is - what does it say about the people who brainlessly hand over the hard earned cash to be 'entertained' in this fashion and then come here to leave a positive 8.8 review?? Oh yes, they are morons. Its the only sensible conclusion to draw. How anyone can rate this movie amongst the pantheon of great titles is beyond me.

So trying to find something constructive to say about this title is hard...I enjoyed Iron Man? Tony Stark is an inspirational character in his own movies but here he is a pale shadow of that...About the only 'hook' this movie had into me was wondering when and if Iron Man would knock Captain America out...Oh how I wished he had :( What were these other characters anyways? Useless, bickering idiots who really couldn't organise happy times in a brewery. The film was a chaotic mish mash of action elements and failed 'set pieces'...

I found the villain to be quite amusing.

And now I give up. This movie is not robbing any more of my time but I felt I ought to contribute to restoring the obvious fake rating and reviews this movie has been getting on IMDb."""

print(review)

Terrible. Complete trash. Brainless tripe. Insulting to anyone who isn't an 8 year old fan boy. Im actually pretty disgusted that this movie is making the money it is - what does it say about the people who brainlessly hand over the hard earned cash to be 'entertained' in this fashion and then come here to leave a positive 8.8 review?? Oh yes, they are morons. Its the only sensible conclusion to draw. How anyone can rate this movie amongst the pantheon of great titles is beyond me.

So trying to find something constructive to say about this title is hard...I enjoyed Iron Man? Tony Stark is an inspirational character in his own movies but here he is a pale shadow of that...About the only 'hook' this movie had into me was wondering when and if Iron Man would knock Captain America out...Oh how I wished he had :( What were these other characters anyways? Useless, bickering idiots who really couldn't organise happy times in a brewery. The film was a chaotic mish mash of action elements and 

In [31]:
#CLeaning steps via calling our function

# 1.Removing HTML tags
data_clean = clean(review)

#2.Removing Special Characters
spl_char_removal = is_special(data_clean)

#3.Convert into lower case
lower_case = lower(spl_char_removal)

#4.Stopwords Removal
stopword_removal = remove_stopwords(lower_case)

#5.Stemming
stemming = stem_text(stopword_removal)

In [32]:
print(stemming)

terribl complet trash brainless tripe insult anyon 8 year old fan boy im actual pretti disgust movi make money say peopl brainless hand hard earn cash entertain fashion come leav posit 8 8 review oh yes moron sensibl conclus draw anyon rate movi amongst pantheon great titl beyond tri find someth construct say titl hard enjoy iron man toni stark inspir charact movi pale shadow hook movi wonder iron man would knock captain america oh wish charact anyway useless bicker idiot realli organis happi time breweri film chaotic mish mash action element fail set piec found villain quit amus give movi rob time felt ought contribut restor obvious fake rate review movi get imdb


In [46]:
#Now creating Bag of words
bow = []
words = word_tokenize(stemming)
for word in words:
    bow.append(words.count(word))

In [54]:
# We can inspect how our vectorizer vectorized the text
# This will print out a list of words used, and their index in the vectors
word_dict = cv.vocabulary_
#For this we can see all our thousand features in form of dictionary
print('Vocabulary: ')
print(word_dict)

Vocabulary: 
{'one': 612, 'review': 722, 'mention': 555, 'watch': 955, 'episod': 282, 'right': 727, 'exact': 296, 'happen': 403, 'first': 345, 'thing': 882, 'scene': 745, 'violenc': 945, 'set': 769, 'word': 982, 'go': 386, 'show': 779, 'heart': 411, 'pull': 682, 'drug': 256, 'sex': 771, 'classic': 151, 'use': 932, 'call': 116, 'given': 385, 'state': 827, 'focus': 352, 'main': 529, 'citi': 148, 'prison': 672, 'front': 366, 'face': 310, 'high': 415, 'home': 423, 'mani': 536, 'italian': 462, 'death': 214, 'deal': 213, 'never': 588, 'far': 322, 'away': 72, 'would': 988, 'say': 742, 'appeal': 52, 'due': 257, 'fact': 311, 'goe': 388, 'forget': 356, 'pretti': 670, 'pictur': 643, 'audienc': 67, 'charm': 138, 'romanc': 733, 'mess': 556, 'around': 56, 'ever': 291, 'saw': 741, 'develop': 230, 'tast': 869, 'got': 392, 'level': 504, 'kill': 477, 'order': 615, 'get': 380, 'well': 962, 'manner': 537, 'middl': 559, 'class': 150, 'turn': 916, 'lack': 485, 'street': 840, 'skill': 793, 'experi': 304, 'ma

In [55]:
#Pickling the word_dict
pickle.dump(word_dict,open('bow.pkl','wb'))

In [56]:
#Creating the list to test our data
empty= []
for i in word_dict:
    empty.append(stemming.count(i[0]))

In [64]:
#print(empty)

#Converting the empty list into ndim array 
np.array(empty).reshape(1,1000)

array([[47, 38, 25,  9, 49, 38, 49, 20,  8, 44, 34, 12, 34,  9,  7, 34,
        20, 13, 14, 34, 21, 13, 21,  7, 34,  8, 25, 21, 13,  8,  8, 20,
        20, 25, 54, 14, 14, 40,  8, 51,  9, 34, 51, 14,  8,  7,  8, 13,
        13, 51, 21, 38, 25, 51, 49, 34, 14, 44,  7, 24,  7, 47,  7,  9,
        25, 25, 21, 44, 24, 34, 34, 49, 25, 12, 12, 44, 34,  9, 24, 13,
         8, 47, 44,  8,  7, 34, 34, 49, 13, 51, 49, 25, 12, 44, 34, 49,
        38,  9,  9,  9, 13, 25,  7, 21, 24, 38, 21,  8, 38, 14, 38, 13,
        13, 21,  8, 49, 44, 14, 44,  9, 34, 20, 34, 51, 44, 24, 13, 14,
        21, 49,  7, 14, 38, 25, 13, 34, 21, 34, 13, 24, 24, 12, 54, 25,
        44, 54,  1, 51, 34, 12,  9, 21,  9, 54,  8, 12,  8, 12, 44,  0,
        13,  8, 25, 34, 14, 47, 25, 25, 44, 14, 24, 38, 44, 38, 49, 34,
        54, 34,  0, 14, 34, 54, 25, 12, 34, 25, 47, 20, 38, 34, 44, 13,
        34, 13, 14, 34, 44, 14, 51, 13, 40, 12, 25, 21, 51, 40, 13,  7,
        13, 24, 44, 24,  9, 34, 34, 12, 12, 13,  8, 21, 51,  7, 

In [65]:
#Prediction
y_pred = bnb.predict(np.array(empty).reshape(1,1000))

In [66]:
# 1 == Positive, 0 == Positive
if y_pred == 0:
    print("Movie is not good do not, watch. Reviews are Negative")
else:
    print("You can watch the movie. Reviews are Positive")

Movie is not good do not, watch. Reviews are Negative
