In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
import re
import nltk
%matplotlib inline

In [2]:
# Show up to 50 columns when a DataFrame is displayed.
pd.options.display.max_columns = 50

In [3]:
# Load the IMDB review dataset. The path is relative, so the CSV is expected in
# the notebook's working directory.
df = pd.read_csv("IMDB Dataset.csv")

In [4]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
#shape of the dataset
df.shape

(50000, 2)

In [6]:
#general information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [7]:
#Checking if dataset is balanced/not balanced
df['sentiment'].value_counts()

negative    25000
positive    25000
Name: sentiment, dtype: int64

In [8]:
#Checking for the presence of null values
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [9]:
#Checking for the presence of duplicate rows
df[df.duplicated()]

Unnamed: 0,review,sentiment
3537,Quite what the producers of this appalling ada...,negative
3769,My favourite police series of all time turns t...,positive
4391,"Beautiful film, pure Cassavetes style. Gena Ro...",positive
6352,If you liked the Grinch movie... go watch that...,negative
6479,I want very much to believe that the above quo...,negative
...,...,...
49912,This is an incredible piece of drama and power...,positive
49950,This was a very brief episode that appeared in...,negative
49984,Hello it is I Derrick Cannon and I welcome you...,negative
49986,This movie is a disgrace to the Major League F...,negative


In [10]:
# Show one duplicated review together with its earlier occurrence. The text is
# taken from the first row flagged by df.duplicated() instead of being pasted
# in as a multi-thousand-character literal: a pasted literal is fragile, since
# any hidden character or line break in it breaks the equality match.
first_duplicate_review = df.loc[df.duplicated(), 'review'].iloc[0]
df[df['review'] == first_duplicate_review]

Unnamed: 0,review,sentiment
2013,Quite what the producers of this appalling ada...,negative
3537,Quite what the producers of this appalling ada...,negative


In [11]:
# 418 rows are exact duplicates of an earlier row and have to be removed.
df.duplicated().sum()

418

In [12]:
# Drop rows that are exact copies of an earlier row (same review and same
# sentiment); the first occurrence of each is kept.
df = df.drop_duplicates()

In [13]:
# Renumber the rows 0..n-1 and discard the old index. Dropping duplicates left
# gaps in the index, and later cells look rows up by integer label
# (e.g. df['review'][1]).
df = df.reset_index(drop = True)

In [14]:
df.shape

(49582, 2)

In [15]:
# The reviews contain HTML tags and punctuation marks.
df['review'][1]

'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well d

In [16]:
# Removing the HTML tags (e.g. <br />), keeping only the visible text.
# - The whole column is assigned at once. Writing through chained indexing
#   (df['review'][i] = ...) may modify a temporary copy and triggers
#   SettingWithCopyWarning.
# - The parser is named explicitly so the result does not depend on which
#   optional parsers happen to be installed, and so bs4 does not warn about
#   guessing one.
df['review'] = df['review'].apply(
    lambda raw_html: BeautifulSoup(raw_html, 'html.parser').get_text()
)



In [17]:
# Removing the punctuation marks: every character that is neither a word
# character nor whitespace is replaced by a single space. Using a space rather
# than deleting keeps words like "well-made" from fusing into one token.
# The vectorized .str accessor replaces the row-by-row loop and its
# chained-indexing write (df['review'][i] = ...), which may hit a temporary copy.
df['review'] = df['review'].str.replace(r'[^\w\s]', ' ', regex=True)

In [18]:
# Lowercase every review so that later token comparisons are case-insensitive.
df['review'] = df['review'].str.lower()

In [19]:
from nltk.corpus import stopwords
# English stop words from NLTK, consulted by removal_of_stop_words below.
# NOTE(review): presumably needs the 'stopwords' corpus to be downloaded first
# (nltk.download('stopwords')) — confirm for fresh environments.
sw = stopwords.words('english')

In [20]:
def removal_of_stop_words(s):
    """Return ``s`` with all stop words removed.

    The text is split with ``nltk.word_tokenize``; tokens that appear in the
    notebook-level stop-word collection ``sw`` are dropped and the remaining
    tokens are joined back together with single spaces.

    Parameters
    ----------
    s : str
        Text to filter. Matching against ``sw`` is exact and case-sensitive,
        so the text should already be normalised (e.g. lowercased).

    Returns
    -------
    str
        The surviving tokens separated by single spaces (empty string if
        nothing survives).
    """
    # Testing membership in a list scans it for every token; a set gives
    # constant-time lookups. Building it per call is cheap next to tokenizing.
    stop_words = set(sw)
    return ' '.join(
        token for token in nltk.word_tokenize(s) if token not in stop_words
    )

In [21]:
#Removing the stop words from the sentence    
df['review'] = df['review'].apply(removal_of_stop_words)

In [22]:
#Lemmatization
from nltk.stem import WordNetLemmatizer
wl = WordNetLemmatizer()

In [23]:
def lemmatization(s):
    """Lemmatize every token of ``s`` and return the re-joined text.

    The string is tokenized with ``nltk.word_tokenize``, each token is passed
    through the notebook-level ``WordNetLemmatizer`` instance ``wl``, and the
    results are joined with single spaces.
    """
    lemmas = [wl.lemmatize(token) for token in nltk.word_tokenize(s)]
    return ' '.join(lemmas)

In [24]:
df['review'][2]

'thought wonderful way spend time hot summer weekend sitting air conditioned theater watching light hearted comedy plot simplistic dialogue witty characters likable even well bread suspected serial killer may disappointed realize match point 2 risk addiction thought proof woody allen still fully control style many us grown love laughed one woody comedies years dare say decade never impressed scarlet johanson managed tone sexy image jumped right average spirited young woman may crown jewel career wittier devil wears prada interesting superman great comedy go see friends'

In [25]:
df['review'] = df['review'].apply(lemmatization)

In [73]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for testing. Stratifying on the label keeps the
# positive/negative ratio equal in both splits; the fixed random_state makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['review'],
    df['sentiment'],
    test_size=0.2,
    random_state=60,
    stratify=df['sentiment'],
)

In [74]:
from sklearn.preprocessing import LabelBinarizer

# Encode the string sentiment labels as 0/1.
# The encoder is fitted on the training labels only and then *reused* for the
# test labels. Refitting on the test split (fit_transform) would learn a second,
# independent mapping that is not guaranteed to match the training one.
lb = LabelBinarizer()
# For a two-class problem LabelBinarizer returns an (n_samples, 1) column.
# ravel() turns it into the 1-D vector estimators expect and avoids
# scikit-learn's DataConversionWarning during fit.
y_train = lb.fit_transform(y_train).ravel()
y_test = lb.transform(y_test).ravel()

In [75]:
from sklearn.feature_extraction.text import CountVectorizer

In [76]:
# Bag-of-words encoding. The vocabulary is learned from the training reviews
# only, and the test reviews are encoded with that same vocabulary.
# fit_transform learns the vocabulary and builds the training matrix in one
# pass, instead of tokenizing the training corpus twice (fit, then transform).
vec = CountVectorizer()
X_train = vec.fit_transform(X_train)
X_test = vec.transform(X_test)

In [77]:
print(X_train.shape)

(39665, 85634)


In [78]:
print(X_test.shape)

(9917, 85634)


In [79]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

In [80]:
# Search the smoothing parameter alpha of multinomial naive Bayes over the
# powers of ten from 10**-2 to 10**2, scoring each candidate by 5-fold
# cross-validated accuracy (train scores are recorded as well).
model = MultinomialNB()
parameters = {"alpha": [10**exponent for exponent in range(-2, 3)]}
grid_search = GridSearchCV(
    model,
    param_grid=parameters,
    scoring="accuracy",
    cv=5,
    n_jobs=2,
    return_train_score=True,
)

In [81]:
grid_search.fit(X_train,y_train)

  return f(**kwargs)


GridSearchCV(cv=5, estimator=MultinomialNB(), n_jobs=2,
             param_grid={'alpha': [0.01, 0.1, 1, 10, 100]},
             return_train_score=True, scoring='accuracy')

In [82]:
print(grid_search.best_score_)
print(grid_search.best_estimator_)

0.8567502836253624
MultinomialNB(alpha=1)


In [83]:
# Refit on the whole training split with the hyperparameters the grid search
# selected. Reading them from best_params_ keeps this cell in sync with the
# search; a hard-coded alpha would go stale if the data or the grid changed.
final_model = MultinomialNB(**grid_search.best_params_)
# np.ravel hands the labels over as a 1-D vector; a column-shaped y makes
# scikit-learn emit a DataConversionWarning. It is a no-op for 1-D input.
final_model.fit(X_train, np.ravel(y_train))

  return f(**kwargs)


MultinomialNB()

In [84]:
predictions = final_model.predict(X_test)

In [85]:
from sklearn.metrics import accuracy_score,classification_report
# Accuracy on the held-out test split. accuracy_score expects
# (y_true, y_pred); accuracy is symmetric so the value is unchanged, but the
# documented order avoids mistakes if an asymmetric metric is added here later.
# np.ravel makes the ground-truth labels 1-D (no-op if they already are).
print(accuracy_score(np.ravel(y_test), predictions))

0.8590299485731572


In [86]:
from joblib import dump

In [87]:
# Persist the fitted vectorizer and the trained classifier to the working
# directory so they can be reloaded for inference.
# NOTE(review): the vectorizer was fitted on cleaned text (HTML stripped,
# punctuation removed, lowercased, stop words removed, lemmatized), so new text
# must go through the same steps before vec.transform.
# NOTE(review): the LabelBinarizer `lb` is not saved; presumably 0 = negative
# and 1 = positive — confirm via lb.classes_.
dump(vec,"vector.joblib")
dump(final_model,"model.joblib")

['model.joblib']