### Import Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import LabelEncoder


### Load Dataset

In [50]:
df = pd.read_csv('IMDB Dataset.csv')

In [51]:
df.shape

(50000, 2)

In [52]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [53]:
print(type(df))

<class 'pandas.core.frame.DataFrame'>


In [54]:
data = df.values

In [55]:
X= data[: ,0] # All pixcels
Y= data[ :,1] # labels 


In [8]:
print(X.shape, Y.shape)

(50000,) (50000,)


In [9]:
print(type(Y))

<class 'numpy.ndarray'>


### Train- Test Split

In [10]:
from sklearn.model_selection import train_test_split

XT,Xt,YT,Yt = train_test_split(X, Y, test_size=0.2, random_state=5) # random state shuffles same every time 
print(XT.shape, YT.shape) 
print(Xt.shape, Yt.shape)

(40000,) (40000,)
(10000,) (10000,)


### Cleaning

In [11]:
YT

array(['negative', 'positive', 'negative', ..., 'positive', 'negative',
       'positive'], dtype=object)

In [12]:
le = LabelEncoder()
YT = le.fit_transform(YT)

In [13]:
YT

array([0, 1, 0, ..., 1, 0, 1])

In [14]:
Yt

array(['positive', 'positive', 'positive', ..., 'negative', 'positive',
       'positive'], dtype=object)

In [15]:
le = LabelEncoder()
Yt = le.fit_transform(Yt)

In [16]:
Yt

array([1, 1, 1, ..., 0, 1, 1])

In [17]:
XT= XT.tolist()

In [18]:
Xt=Xt.tolist()

In [19]:
type(Xt)

list

In [20]:
print(len(Xt), len(XT))

10000 40000


In [21]:
import clean_text as ct

In [22]:
XT_clean = [ct.getCleanReview(i) for i in XT] #List Comprehension
Xt_clean = [ct.getCleanReview(i) for i in Xt]

In [23]:
print(XT_clean[2345])
print(len(XT_clean))

dead husband somewhat silli comedi bunch wive conspir bump other husband mean embarrassingli bad like comedi could mention never fufil potenti imagin good could farrelli brother direct ben stiller role carter elson oh carter base jerri springer curiou catch phrase dr elson show look keep talk
40000


### Vectorisation

In [24]:
from sklearn.feature_extraction.text import CountVectorizer

In [79]:
cv = CountVectorizer()

x_vec = cv.fit_transform(XT_clean).toarray()
print(x_vec)
print(x_vec.shape)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
(40000, 64877)


In [26]:
print(cv.get_feature_names())

['00', '000', '0000000000001', '00000001', '00001', '00015', '000dm', '001', '003830', '006', '0069', '007', '0079', '0080', '0083', '0093638', '00am', '00o', '00pm', '00schneider', '01', '0126', '0148', '02', '020410', '029', '03', '04', '044', '05', '050', '06', '06th', '07', '07b', '08', '087', '089', '08th', '09', '0f', '0r', '0s', '0tt', '10', '100', '1000', '10000', '1000000', '10000000000', '10000000000000', '10000th', '1000lb', '1001', '1004', '100b', '100bt', '100ft', '100ib', '100k', '100m', '100mile', '100min', '100mph', '100th', '100time', '100x', '101', '101st', '102', '102nd', '103', '104', '1040', '1040a', '105', '1050', '106', '106min', '107', '108', '1080p', '109', '10am', '10ish', '10k', '10line', '10mil', '10min', '10minut', '10pm', '10th', '10x', '10yo', '10yr', '11', '110', '1100', '11001001', '1100ad', '110min', '110mph', '111', '112', '113', '1138', '113min', '113minut', '114', '115', '116', '116minut', '117', '11706', '118', '119', '11f', '11m', '11th', '11yr', 

In [27]:
## Vectorization on the test set
xt_vec = cv.transform(Xt_clean).toarray()
print(xt_vec)
cv.get_feature_names()
print(xt_vec.shape)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
(10000, 64877)


###  Multinomial Naive Bayes Model

In [64]:
from sklearn.naive_bayes import MultinomialNB

In [65]:
mnb = MultinomialNB()
print(mnb)


MultinomialNB()


In [66]:
# Training 
mnb.fit(x_vec,YT)

MultinomialNB()

In [67]:
#Predictions
pred=mnb.predict(xt_vec)

In [68]:
type(pred)

numpy.ndarray

In [69]:
pred

array([1, 1, 1, ..., 0, 1, 1])

In [70]:
Yt

array([1, 1, 1, ..., 0, 1, 1])

#### Accuracy

In [71]:
from sklearn import metrics
print(metrics.accuracy_score(Yt, pred))

0.8589


### Bernoulli Naive Bayes Model

In [29]:
from sklearn.naive_bayes import BernoulliNB

In [30]:
bnb = BernoulliNB()
print(bnb)


BernoulliNB()


In [31]:
# Training 
bnb.fit(x_vec,YT)

BernoulliNB()

In [32]:
#Predictions
pred=bnb.predict(xt_vec)

In [33]:
type(pred)

numpy.ndarray

In [34]:
pred

array([1, 1, 1, ..., 0, 1, 1])

In [35]:
Yt

array([1, 1, 1, ..., 0, 1, 1])

In [36]:
from sklearn import metrics
print(metrics.accuracy_score(Yt, pred))

0.8547
