In [1]:
# nltk.download('stopwords')

In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('IMDB Dataset.csv')

In [4]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
# TEXT CELANING PROCESS
# 1) sample 10000 rows
# 2) remove html tags
# 3) remove special characters
# 4) converting everything to lower case
# 5) removing stop words
# 6) stemming

In [6]:
df = df.sample(10000)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 1442 to 28876
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     10000 non-null  object
 1   sentiment  10000 non-null  object
dtypes: object(2)
memory usage: 234.4+ KB


In [8]:
df['sentiment'].replace({'positive':1,'negative':0},inplace=True)

In [9]:
# removing html tags

import re
clean = re.compile('<.*?>')
re.sub(clean,'',df.iloc[2].review)

"I Am Curious is really two films in one - half of it is the sexual experimental side of Lena and the other half is her curiosity with political/socialism. Whatever the director's intention, the two don't really mesh together. The director should have just stuck with the romantic side of Lena and made a separate movie for the politics. There is a bizarre mixture of political/war rallies, Dr. King, serious political interviews, flopping breasts, and pubic hair. The film feels more like a fictional documentary than a movie. Other than the interesting sex scenes, you'll be bored dry watching this film. Unlike many other reviewers, I think the nude/sexual scenes are overdone for what it is. If you want to see real porn, I'm sure there are better choices. The pervasive nudity is a major distraction from whatever plot there is. I think the cast did a fine job however. They played their parts believably. There is little of the over-the-topness I'm so used to seeing in the American films durin

In [10]:
def clean_html(text):
    clean = re.compile('<.*?>')
    return re.sub(clean,'',text)

df['review'] = df['review'].apply(clean_html)

In [11]:
# converting everything to lower
def convert_lower(text):
    return text.lower()

df['review'] = df['review'].apply(convert_lower)

In [12]:
# function to remove special characters

def remove_special(text):
    x=''
    for i in text:
        if i.isalnum():
            x=x+i
        else:
            x=x + ' '
    return x

df['review'] = df['review'].apply(remove_special)

In [13]:
df.shape
df.head()
df

Unnamed: 0,review,sentiment
1442,i purchased a dvd of this film for a dollar at...,0
21671,between 1937 and 1939 twentieth century fox m...,0
45370,i am curious is really two films in one half...,0
21300,i ve seen worse which is a backhanded way of ...,0
32221,what gives anthony minghella the right to ruin...,0
...,...,...
34712,i found this movie at a xxx store for 1 on vh...,0
41619,i m sorry to say that there isn t really any w...,0
37783,george and mildred was a spin off from the mid...,0
38078,this is a warm funny film in much the same v...,1


In [14]:
# remove stop words

import nltk
from nltk.corpus import stopwords
def remove_stopword(text):
    x=[]
    for i in text.split():
        if i not in stopwords.words('english'):
            x.append(i)
    y=x[:]
    x.clear()
    return y


df['review'] = df['review'].apply(remove_stopword)


In [15]:
# performing stemming

from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()


In [16]:
y=[]
def stem_word(text):
    
    for i in text:
        y.append(ps.stem(i))
    z=y[:]
    y.clear()
    return z


df['review'] = df['review'].apply(stem_word)



In [17]:
df.head

<bound method NDFrame.head of                                                   review  sentiment
1442   [purchas, dvd, film, dollar, big, dept, store,...          0
21671  [1937, 1939, twentieth, centuri, fox, made, to...          0
45370  [curiou, realli, two, film, one, half, sexual,...          0
21300  [seen, wors, backhand, way, say, crummi, film,...          0
32221  [give, anthoni, minghella, right, ruin, two, e...          0
...                                                  ...        ...
34712  [found, movi, xxx, store, 1, vh, interest, thi...          0
41619  [sorri, say, realli, way, opinion, enzo, would...          0
37783  [georg, mildr, spin, mid, 1970, sit, com, man,...          0
38078  [warm, funni, film, much, vein, work, almodova...          1
28876  [love, movi, focus, issu, realiti, fantasi, re...          1

[10000 rows x 2 columns]>

In [18]:
# join back

def join_back(list_input):
    return " ".join(list_input)

df['review'] = df['review'].apply(join_back)

In [19]:
df.head()

Unnamed: 0,review,sentiment
1442,purchas dvd film dollar big dept store probabl...,0
21671,1937 1939 twentieth centuri fox made ton mr mo...,0
45370,curiou realli two film one half sexual experim...,0
21300,seen wors backhand way say crummi film plot ri...,0
32221,give anthoni minghella right ruin two extraord...,0


In [20]:
x = df.iloc[:,0:1].values

In [21]:
y= df.iloc[:,-1].values

In [22]:
# Vectorization
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1000) # max_features = 1000
x = cv.fit_transform(df['review']).toarray()
x.shape

(10000, 1000)

In [23]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

In [24]:
# trainig model

from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.metrics import accuracy_score
clf1 = GaussianNB()
clf2 = MultinomialNB()
clf3 = BernoulliNB()

clf1.fit(x_train,y_train)
clf2.fit(x_train,y_train)
clf3.fit(x_train,y_train)

y_pred1 = clf1.predict(x_test)
y_pred2 = clf2.predict(x_test)
y_pred3 = clf3.predict(x_test)

print("GaussianNB = ",accuracy_score(y_test,y_pred1))
print("MultinomialNB = ",accuracy_score(y_test,y_pred2))
print("BernoulliNB = ",accuracy_score(y_test,y_pred3))

GaussianNB =  0.808
MultinomialNB =  0.8495
BernoulliNB =  0.8605
