# IMDB movie reviews sentiment analysis
### 1. Loading the dataset

In [1]:
#importing libraries
import numpy as np
import pandas as pd

In [2]:
#reading the data
df = pd.read_csv("IMDB Dataset.csv")

In [3]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [4]:
df.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


### 2. Cleaning the dataset

In [5]:
df.shape

(50000, 2)

In [6]:
#sampling limited data
#df = df.sample(20000)

In [7]:
#checking for null values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [8]:
#replacing positive and negative with 1 and 0
# Assign the result back to the column instead of calling
# replace(..., inplace=True) on the selection df['sentiment']: that in-place
# call on an intermediate object is chained assignment, which warns on recent
# pandas and leaves df unchanged when Copy-on-Write is active.
# replace() keeps values that are already 0/1, so re-running this cell is
# harmless; astype(int) pins a numeric dtype for the label column.
df['sentiment'] = df['sentiment'].replace({'positive': 1, 'negative': 0}).astype(int)
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
...,...,...
49995,I thought this movie did a down right good job...,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0
49997,I am a Catholic taught in parochial elementary...,0
49998,I'm going to have to disagree with the previou...,0


In [9]:
import re
def clean_html(text):
    """Return ``text`` with every ``<...>`` tag removed.

    A tag is the shortest run that starts at ``<`` and ends at the next
    ``>`` without crossing a newline; each such run is replaced by the
    empty string and everything else is left untouched.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', text)

In [10]:
df['review']=df['review'].apply(clean_html)


In [11]:
#converting everything to lowercase
def convert_lower(text):
    """Return a lowercase copy of ``text``."""
    lowered = text.lower()
    return lowered

In [12]:
df['review']=df['review'].apply(convert_lower)

In [13]:
#removing special characters
def remove_special(text):
    """Replace every non-alphanumeric character of ``text`` with a space.

    Characters for which ``str.isalnum()`` is true are kept as they are;
    every other character (punctuation, whitespace, symbols) becomes a
    single space, so the result always has the same length as the input.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Build the result with one join instead of repeated ``x = x + i``
    # concatenation, which copies the growing string on every step.
    return ''.join(ch if ch.isalnum() else ' ' for ch in text)

In [14]:
df['review']=df['review'].apply(remove_special)

In [15]:
# removing stop words
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gaura\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
swlist = stopwords.words('english')

In [17]:
def remove_stopwords(text):
    """Split ``text`` on whitespace and drop every token found in ``swlist``.

    Args:
        text: The string to tokenize.

    Returns:
        A new list with the remaining tokens, in their original order.
    """
    # Look tokens up in a frozenset rather than scanning the swlist list once
    # per token. The original also ended with a defensive copy followed by
    # ``x.clear`` (missing its call parentheses, so it did nothing); returning
    # the freshly built list directly makes both unnecessary.
    stop_words = frozenset(swlist)
    return [token for token in text.split() if token not in stop_words]

In [18]:
df['review']=df['review'].apply(remove_stopwords)
df

Unnamed: 0,review,sentiment
0,"[one, reviewers, mentioned, watching, 1, oz, e...",1
1,"[wonderful, little, production, filming, techn...",1
2,"[thought, wonderful, way, spend, time, hot, su...",1
3,"[basically, family, little, boy, jake, thinks,...",0
4,"[petter, mattei, love, time, money, visually, ...",1
...,...,...
49995,"[thought, movie, right, good, job, creative, o...",1
49996,"[bad, plot, bad, dialogue, bad, acting, idioti...",0
49997,"[catholic, taught, parochial, elementary, scho...",0
49998,"[going, disagree, previous, comment, side, mal...",0


In [19]:
#stemming
from nltk.stem.porter import PorterStemmer

In [20]:
def stem_words(text):
    """Return the Porter stem of every word in ``text``.

    Args:
        text: An iterable of word strings, e.g. ``['loved', 'loving']``.

    Returns:
        A new list holding the stemmed words in the same order.
    """
    # The result is built in a local list. The earlier version accumulated
    # into a module-level ``y``, which this notebook later rebinds to the
    # label array; re-running the stemming cell after that point would fail.
    # One stemmer instance is reused for all words instead of creating a new
    # PorterStemmer per word.
    stemmer = PorterStemmer()
    return [stemmer.stem(word) for word in text]

In [21]:
#example of stemming
stem_words(['loved','loving','me','I'])

['love', 'love', 'me', 'i']

In [22]:
df

Unnamed: 0,review,sentiment
0,"[one, reviewers, mentioned, watching, 1, oz, e...",1
1,"[wonderful, little, production, filming, techn...",1
2,"[thought, wonderful, way, spend, time, hot, su...",1
3,"[basically, family, little, boy, jake, thinks,...",0
4,"[petter, mattei, love, time, money, visually, ...",1
...,...,...
49995,"[thought, movie, right, good, job, creative, o...",1
49996,"[bad, plot, bad, dialogue, bad, acting, idioti...",0
49997,"[catholic, taught, parochial, elementary, scho...",0
49998,"[going, disagree, previous, comment, side, mal...",0


In [23]:
df['review']=df['review'].apply(stem_words)

In [24]:
#join back

def join_back(list_input):
    """Concatenate the strings in ``list_input``, separated by single spaces."""
    separator = " "
    return separator.join(list_input)

In [25]:
df['review']=df['review'].apply(join_back)

In [26]:
df['review']

0        one review mention watch 1 oz episod hook righ...
1        wonder littl product film techniqu unassum old...
2        thought wonder way spend time hot summer weeke...
3        basic famili littl boy jake think zombi closet...
4        petter mattei love time money visual stun film...
                               ...                        
49995    thought movi right good job creativ origin fir...
49996    bad plot bad dialogu bad act idiot direct anno...
49997    cathol taught parochi elementari school nun ta...
49998    go disagre previou comment side maltin one sec...
49999    one expect star trek movi high art fan expect ...
Name: review, Length: 50000, dtype: object

In [27]:
X = df.iloc[:,0:1].values

In [28]:
X.shape

(50000, 1)

In [49]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)

In [50]:
X=cv.fit_transform(df['review']).toarray()

In [51]:
X.shape

(50000, 1500)

In [52]:
y=df.iloc[:,-1].values

In [53]:
y.shape

(50000,)

In [54]:
#splitting the dataset
from sklearn.model_selection import train_test_split
# Fix the seed so the 80/20 split, and therefore the accuracy figures
# reported below, are the same on every Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [55]:
x_train.shape

(40000, 1500)

In [56]:
x_test.shape

(10000, 1500)

In [57]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB

In [58]:
clf1= GaussianNB()
clf2= MultinomialNB()
clf3= BernoulliNB()

In [59]:
clf1.fit(x_train,y_train)

GaussianNB()

In [60]:
clf2.fit(x_train,y_train)

MultinomialNB()

In [61]:
clf3.fit(x_train,y_train)

BernoulliNB()

In [62]:
y_pred1  = clf1.predict(x_test)

In [63]:
y_pred2  = clf2.predict(x_test)

In [64]:
y_pred3  = clf3.predict(x_test)

In [65]:
y_test.shape

(10000,)

In [66]:
y_pred1.shape

(10000,)

In [67]:
from sklearn.metrics import accuracy_score

In [68]:
# Report test-set accuracy for each Naive Bayes variant, one line per model.
for label, predictions in (
    ('Gaussian', y_pred1),
    ('Multinomial', y_pred2),
    ('Bernoulli', y_pred3),
):
    print(label, accuracy_score(y_test, predictions))

Gaussian 0.7634
Multinomial 0.8395
Bernoulli 0.8449
