In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\megha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [25]:
# Load the IMDB reviews CSV from the current working directory.
# The frame has two columns: 'review' (raw text) and 'sentiment' (label).
df=pd.read_csv('IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [26]:
df.shape

(50000, 2)

In [27]:
# Keep only the first 10,000 reviews so that the classical ML models below train quickly.
# .copy() detaches the subset from the full 50,000-row frame: the later in-place edits
# (drop_duplicates, reassigning df['review']) then act on an independent frame instead of
# on a slice of the original, which is the situation that triggers SettingWithCopyWarning.
df=df.iloc[:10000].copy()
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [28]:
df['sentiment'].value_counts() # Count how many reviews carry each sentiment label (class-balance check).

sentiment
positive    5028
negative    4972
Name: count, dtype: int64

In [29]:
df.isnull().sum() # Checks for any null values present in the data

review       0
sentiment    0
dtype: int64

In [30]:
df.duplicated().sum() # Checks for any duplicate values in the data

17

In [31]:
# Drop exact duplicate rows and rebind df to the de-duplicated frame.
# The surviving rows keep their original index labels.
df=df.drop_duplicates()

In [32]:
df.duplicated().sum()

0

In [33]:
# Basic Preprocessing
# 1. Remove HTML tags
# 2. Convert to lower case
# 3. Remove stopwords
# NOTE(review): remove_stopwords is defined further down but is never applied to
# df['review'] in the cells shown here, so step 3 does not actually run.

In [34]:
# Remove HTML tags
def remove_html_tags(text):
    """Return `text` with every `<...>` span deleted.

    The match is non-greedy, so each tag is removed on its own, and nothing
    is inserted in place of a removed tag.
    """
    return re.sub('<.*?>', r'', text)

In [35]:
# Strip markup such as the <br /> tags visible in the head() output from every review.
df['review']=df['review'].apply(remove_html_tags)

In [36]:
# Lower-case every review through the vectorised string accessor
df['review']=df['review'].str.lower()

In [37]:
# Inspect NLTK's English stopword list (this cell only displays it; nothing is removed here)
stopwords.words('english')

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [38]:
# Stopword sets keyed by language, filled on first use so that
# stopwords.words() is called once per language instead of once per token.
_STOPWORD_CACHE={}

def _load_stopwords(language='english'):
    """Return NLTK's stopword list for `language` as a cached frozenset."""
    if language not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[language]=frozenset(stopwords.words(language))
    return _STOPWORD_CACHE[language]

def remove_stopwords(text, stop_words=None):
    """Remove stopwords from a whitespace-tokenised string.

    Parameters
    ----------
    text : str
        Input text. It is split on whitespace and each token is compared
        as-is (case-sensitive, punctuation left attached).
    stop_words : collection of str, optional
        Tokens to drop. When omitted, NLTK's English stopword list is used.

    Returns
    -------
    str
        The surviving tokens joined by single spaces. Dropped tokens leave
        no placeholder behind, so the result contains no runs of spaces.
    """
    if stop_words is None:
        stop_words=_load_stopwords('english')
    # Set membership is O(1) per token, unlike scanning a list each time.
    return " ".join(token for token in text.split() if token not in stop_words)

In [39]:
x=df[['review']] # It will return a dataframe
y=df['sentiment'] # It will return a series

In [40]:
x.head()

Unnamed: 0,review
0,one of the other reviewers has mentioned that ...
1,a wonderful little production. the filming tec...
2,i thought this was a wonderful way to spend ti...
3,basically there's a family where a little boy ...
4,"petter mattei's ""love in the time of money"" is..."


In [41]:
y.head()

0    positive
1    positive
2    positive
3    negative
4    positive
Name: sentiment, dtype: object

In [42]:
# Label Encoding
# Convert the string sentiment labels into integer class ids so the estimators below can consume them.
# In the outputs of this notebook 'negative' ends up as 0 and 'positive' as 1.
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()
y=encoder.fit_transform(y)

In [43]:
y

array([1, 1, 1, ..., 0, 0, 1])

In [45]:
# Hold out 20% of the rows as a test set; random_state=1 fixes the shuffle so the split is repeatable.
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=1)

In [46]:
x_train.shape

(7986, 1)

In [47]:
x_test.shape

(1997, 1)

In [76]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=1000) # Keep only the 1000 most frequent tokens as bag-of-words features (columns, not rows)
x_train_bow=cv.fit_transform(x_train['review']).toarray() # learn the vocabulary from the training reviews only
x_test_bow=cv.transform(x_test['review']).toarray() # encode the test reviews with that same vocabulary

In [77]:
x_train_bow

array([[0, 0, 0, ..., 0, 2, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0]], dtype=int64)

In [78]:
# Model Building
# Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB
gnb=GaussianNB()

gnb.fit(x_train_bow,y_train)

In [79]:
y_pred=gnb.predict(x_test_bow)

from sklearn.metrics import accuracy_score, confusion_matrix
accuracy_score(y_test,y_pred)

0.8062093139709564

In [80]:
confusion_matrix(y_test,y_pred)

array([[717, 235],
       [152, 893]], dtype=int64)

In [81]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
# Seed the forest: it bootstraps rows and samples candidate features at random, so
# without random_state the accuracy and confusion matrix change on every run.
rfc=RandomForestClassifier(random_state=1)

rfc.fit(x_train_bow,y_train)

In [82]:
y_pred=rfc.predict(x_test_bow)

accuracy_score(y_test,y_pred)

0.8267401101652478

In [83]:
confusion_matrix(y_test,y_pred)

array([[780, 172],
       [174, 871]], dtype=int64)

In [84]:
# Bag-of-words with a larger vocabulary: the 3000 most frequent tokens instead of 1000
cv=CountVectorizer(max_features=3000)

x_train_bow=cv.fit_transform(x_train['review']).toarray()
x_test_bow=cv.transform(x_test['review']).toarray()

# random_state pins the forest's randomness so this score can be reproduced
# and compared fairly with the other vectoriser settings.
rfc=RandomForestClassifier(random_state=1)
rfc.fit(x_train_bow,y_train)
y_pred=rfc.predict(x_test_bow)

accuracy_score(y_test,y_pred)

0.828743114672008

In [85]:
confusion_matrix(y_test,y_pred)

array([[787, 165],
       [177, 868]], dtype=int64)

In [86]:
# Count Vectorizer with n-grams and max_features:
# ngram_range=(2,2) builds features from bigrams only, and max_features caps the
# number of features taken into consideration while building the model at 3000.
cv=CountVectorizer(ngram_range=(2,2),max_features=3000)

x_train_bow=cv.fit_transform(x_train['review']).toarray()
x_test_bow=cv.transform(x_test['review']).toarray()

# random_state pins the forest's randomness so this score can be reproduced
# and compared fairly with the unigram runs.
rfc=RandomForestClassifier(random_state=1)
rfc.fit(x_train_bow,y_train)
y_pred=rfc.predict(x_test_bow)

accuracy_score(y_test,y_pred)

0.7931897846770155

In [87]:
confusion_matrix(y_test,y_pred)

array([[775, 177],
       [236, 809]], dtype=int64)

In [88]:
# Use of Tfidf Vectorizer for data preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer(max_features=3000)

In [89]:
x_train_tfidf=tfidf.fit_transform(x_train['review'])
x_test_tfidf=tfidf.transform(x_test['review'])

In [90]:
# Train the forest on the TF-IDF features.
# random_state pins the forest's randomness so this score can be reproduced
# and compared fairly with the bag-of-words runs.
rfc=RandomForestClassifier(random_state=1)
rfc.fit(x_train_tfidf,y_train)
y_pred=rfc.predict(x_test_tfidf)

accuracy_score(y_test,y_pred)

0.8417626439659489

In [75]:
# NOTE(review): the output recorded for this cell carries execution count 75, i.e. it predates
# the latest run of the cells above. Its diagonal gives (791+869)/1997, about 0.831, which does
# not match the 0.8418 accuracy reported for the TF-IDF model. Re-run this cell to refresh it.
confusion_matrix(y_test,y_pred)

array([[791, 161],
       [176, 869]], dtype=int64)