In [1]:
import os
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split as tts
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv(os.path.join('Data', 'IMDB Dataset.csv'))

Data Exploration

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [4]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
df.tail()

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative
49999,No one expects the Star Trek movies to be high...,negative


In [6]:
# No. of positive reviews
sum(df['sentiment'] == 'positive')

25000

In [7]:
# No. of negative sentiments
sum(df['sentiment'] == 'negative')

25000

Data Cleaning 

In [8]:
# Removing noisy text (html tags and special charecters)
def remove_noise(data):
    cleaned_reviews = []
    
    for review in data:
        soup = BeautifulSoup(review, 'html.parser')
        text = soup.get_text()
        cleaned_text = ''
        for word in text.split(' '):
            str_ = ''
            for char in word:
                if char.isalnum(): str_+=char
            cleaned_text+=f' {str_}'
        cleaned_text = cleaned_text.strip().lower()
        cleaned_reviews.append(cleaned_text)
    
    return cleaned_reviews

In [9]:
df['cleaned_review'] = pd.DataFrame(remove_noise(df['review']))

In [10]:
# Replacing labels with 0s and 1s
df['sentiment'][df['sentiment'] == 'positive'] = 1
df['sentiment'][df['sentiment'] == 'negative'] = 0

In [11]:
df.head()

Unnamed: 0,review,sentiment,cleaned_review
0,One of the other reviewers has mentioned that ...,1,one of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...,1,a wonderful little production the filming tech...
2,I thought this was a wonderful way to spend ti...,1,i thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...,0,basically theres a family where a little boy j...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,petter matteis love in the time of money is a ...


Shuffling Dataset

In [12]:
X = df['cleaned_review']
y = df['sentiment']

X_train, X_test, y_train, y_test = tts(X, y, random_state=1, test_size=0.3, shuffle=True) # ((35000,), (15000,), (35000,), (15000,))

Vectorize Reviews using Count Vectorizer

In [13]:
# intializing vectorizer
vectorizer = CountVectorizer(binary=True, stop_words='english')

# learn vocabulary dictionary
vectorizer.fit(list(X_train) + list(X_test))

# transform documents into document-term-matrix
X_train_ = vectorizer.transform(X_train)
X_test_ = vectorizer.transform(X_test)

Model Building

In [14]:
y = y_train.astype('int')

classifier = LinearSVC(max_iter=3000)
classifier.fit(X_train_, y)

y_test_pred = classifier.predict(X_test_)

Accuracy Score by SVM

In [15]:
y2 = y_test.astype('int')
print(f'Accuracy Score: {accuracy_score(y2, y_test_pred) * 100}')

Accuracy Score: 86.18666666666667
