# Sentiment Analysis Using IMDB dataset

In [14]:
import nltk

from sklearn.preprocessing import LabelEncoder

import pandas as pd

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jakir\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Loading dataset

In [6]:
df = pd.read_csv('https://raw.githubusercontent.com/Ataullha/CSE476-Machine-Learning-Lab/main/IMDB%20Dataset.csv').head(100)

In [7]:
df.shape

(100, 2)

In [8]:
df.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [9]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [10]:
df.describe()

Unnamed: 0,review,sentiment
count,100,100
unique,100,2
top,NO SPOILERS!!<br /><br />After Hitchcock's suc...,negative
freq,1,58


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     100 non-null    object
 1   sentiment  100 non-null    object
dtypes: object(2)
memory usage: 1.7+ KB


In [12]:
df['sentiment'].unique()

array(['positive', 'negative'], dtype=object)

In [13]:
df['sentiment'].value_counts()

negative    58
positive    42
Name: sentiment, dtype: int64

# Label Encoding

In [15]:
le = LabelEncoder()
df['sentiment'] = le.fit_transform(df['sentiment'])

In [16]:
df.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [19]:
X = df['review']
y = df['sentiment']

In [20]:
X[:5]

0    One of the other reviewers has mentioned that ...
1    A wonderful little production. <br /><br />The...
2    I thought this was a wonderful way to spend ti...
3    Basically there's a family where a little boy ...
4    Petter Mattei's "Love in the Time of Money" is...
Name: review, dtype: object

In [21]:
y[:5]

0    1
1    1
2    1
3    0
4    1
Name: sentiment, dtype: int32

# Cleaning reviews

In [22]:
# For styling only
from tqdm import tqdm

In [18]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re

In [23]:
ps = PorterStemmer()
corpus = []

for i in tqdm(range(len(X))):
    review = re.sub("[^a-zA-Z]", " ", X[i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in set(stopwords.words("english"))]
    review = " ".join(review)
    corpus.append(review)

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:14<00:00,  7.02it/s]


In [24]:
print(len(corpus))

100


In [28]:
corpus[:1]

['one review mention watch oz episod hook right exactli happen br br first thing struck oz brutal unflinch scene violenc set right word go trust show faint heart timid show pull punch regard drug sex violenc hardcor classic use word br br call oz nicknam given oswald maximum secur state penitentari focus mainli emerald citi experiment section prison cell glass front face inward privaci high agenda em citi home mani aryan muslim gangsta latino christian italian irish scuffl death stare dodgi deal shadi agreement never far away br br would say main appeal show due fact goe show dare forget pretti pictur paint mainstream audienc forget charm forget romanc oz mess around first episod ever saw struck nasti surreal say readi watch develop tast oz got accustom high level graphic violenc violenc injustic crook guard sold nickel inmat kill order get away well manner middl class inmat turn prison bitch due lack street skill prison experi watch oz may becom comfort uncomfort view that get touch d

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

cv = TfidfVectorizer(max_features=500)
X = cv.fit_transform(corpus).toarray()

In [30]:
X[:5]

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.11142743, 0.        ,
        0.12091984],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.06559698, 0.10159369, ..., 0.        , 0.        ,
        0.        ]])

# Model

In [31]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=57)

In [32]:
from sklearn.svm import SVC

model = SVC(
    C=1.0,
    kernel='linear',
    random_state=57
)

In [33]:
model.fit(X_train, y_train)

SVC(kernel='linear', random_state=57)

In [34]:
model.score(X_test, y_test)

1.0

In [35]:
y_pred = model.predict(X_test)

In [37]:
def test_model(sentence):
    sen = cv.transform([sentence]).toarray()
    res = model.predict(sen)[0]
    if res == 1:
        print('positive')
    else:
        print('negative')

In [38]:
sentence = 'i love amber heard'
test_model(sentence)

positive


In [39]:
sentence = 'i hate gigi hadid'
test_model(sentence)

negative


In [41]:
from sklearn.metrics import classification_report

report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       1.00      1.00      1.00         5

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20

