In [17]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report,accuracy_score


nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [18]:

df = pd.read_csv('IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [19]:

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [20]:

df['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


<span style="font-size:20px">Tokenizing</span>

In [21]:

df['token'] = df['review'].apply(word_tokenize)
df.head()

Unnamed: 0,review,sentiment,token
0,One of the other reviewers has mentioned that ...,positive,"[One, of, the, other, reviewers, has, mentione..."
1,A wonderful little production. <br /><br />The...,positive,"[A, wonderful, little, production, ., <, br, /..."
2,I thought this was a wonderful way to spend ti...,positive,"[I, thought, this, was, a, wonderful, way, to,..."
3,Basically there's a family where a little boy ...,negative,"[Basically, there, 's, a, family, where, a, li..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"[Petter, Mattei, 's, ``, Love, in, the, Time, ..."


<span style="font-size:20px">Remove stopwords</span>

In [23]:
stop_words = set(stopwords.words('english'))

df['cleaned_review'] = df['review'].apply(lambda x:' '.join([word for word in  word_tokenize(x.lower())if word not in stop_words and word.isalpha()]))
df[['review','cleaned_review']].head()

Unnamed: 0,review,cleaned_review
0,One of the other reviewers has mentioned that ...,one reviewers mentioned watching oz episode ho...
1,A wonderful little production. <br /><br />The...,wonderful little production br br filming tech...
2,I thought this was a wonderful way to spend ti...,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,basically family little boy jake thinks zombie...
4,"Petter Mattei's ""Love in the Time of Money"" is...",petter mattei love time money visually stunnin...


In [24]:
df['tokens'] = df['cleaned_review'].apply(word_tokenize)
df.head()

Unnamed: 0,review,sentiment,token,cleaned_review,tokens
0,One of the other reviewers has mentioned that ...,positive,"[One, of, the, other, reviewers, has, mentione...",one reviewers mentioned watching oz episode ho...,"[one, reviewers, mentioned, watching, oz, epis..."
1,A wonderful little production. <br /><br />The...,positive,"[A, wonderful, little, production, ., <, br, /...",wonderful little production br br filming tech...,"[wonderful, little, production, br, br, filmin..."
2,I thought this was a wonderful way to spend ti...,positive,"[I, thought, this, was, a, wonderful, way, to,...",thought wonderful way spend time hot summer we...,"[thought, wonderful, way, spend, time, hot, su..."
3,Basically there's a family where a little boy ...,negative,"[Basically, there, 's, a, family, where, a, li...",basically family little boy jake thinks zombie...,"[basically, family, little, boy, jake, thinks,..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"[Petter, Mattei, 's, ``, Love, in, the, Time, ...",petter mattei love time money visually stunnin...,"[petter, mattei, love, time, money, visually, ..."


<span style="font-size:20px">Lemmatizer</span>

In [25]:

lemmatizer = WordNetLemmatizer()
df['cleaned_review'] = df['cleaned_review'].apply(lambda x:' '.join([ lemmatizer.lemmatize(word) for word in  word_tokenize(x.lower()) if word not in stop_words and word.isalpha()]))
df[['review','cleaned_review']].head()

Unnamed: 0,review,cleaned_review
0,One of the other reviewers has mentioned that ...,one reviewer mentioned watching oz episode hoo...
1,A wonderful little production. <br /><br />The...,wonderful little production br br filming tech...
2,I thought this was a wonderful way to spend ti...,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,basically family little boy jake think zombie ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",petter mattei love time money visually stunnin...


In [26]:

df  = df.drop(columns = 'review')
df.head()

Unnamed: 0,sentiment,token,cleaned_review,tokens
0,positive,"[One, of, the, other, reviewers, has, mentione...",one reviewer mentioned watching oz episode hoo...,"[one, reviewers, mentioned, watching, oz, epis..."
1,positive,"[A, wonderful, little, production, ., <, br, /...",wonderful little production br br filming tech...,"[wonderful, little, production, br, br, filmin..."
2,positive,"[I, thought, this, was, a, wonderful, way, to,...",thought wonderful way spend time hot summer we...,"[thought, wonderful, way, spend, time, hot, su..."
3,negative,"[Basically, there, 's, a, family, where, a, li...",basically family little boy jake think zombie ...,"[basically, family, little, boy, jake, thinks,..."
4,positive,"[Petter, Mattei, 's, ``, Love, in, the, Time, ...",petter mattei love time money visually stunnin...,"[petter, mattei, love, time, money, visually, ..."


In [27]:

df  = df.drop(columns = 'token')
df.head()

Unnamed: 0,sentiment,cleaned_review,tokens
0,positive,one reviewer mentioned watching oz episode hoo...,"[one, reviewers, mentioned, watching, oz, epis..."
1,positive,wonderful little production br br filming tech...,"[wonderful, little, production, br, br, filmin..."
2,positive,thought wonderful way spend time hot summer we...,"[thought, wonderful, way, spend, time, hot, su..."
3,negative,basically family little boy jake think zombie ...,"[basically, family, little, boy, jake, thinks,..."
4,positive,petter mattei love time money visually stunnin...,"[petter, mattei, love, time, money, visually, ..."


<span style="font-size:20px">Using TF-IDF</span>

In [28]:

vec = TfidfVectorizer(max_features=5000)
X = vec.fit_transform(df['cleaned_review'])
X

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 3853291 stored elements and shape (50000, 5000)>

In [29]:

df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})
Y = df['sentiment']
Y

Unnamed: 0,sentiment
0,1
1,1
2,1
3,0
4,1
...,...
49995,1
49996,0
49997,0
49998,0


In [30]:

df.tail()

Unnamed: 0,sentiment,cleaned_review,tokens
49995,1,thought movie right good job creative original...,"[thought, movie, right, good, job, creative, o..."
49996,0,bad plot bad dialogue bad acting idiotic direc...,"[bad, plot, bad, dialogue, bad, acting, idioti..."
49997,0,catholic taught parochial elementary school nu...,"[catholic, taught, parochial, elementary, scho..."
49998,0,going disagree previous comment side maltin on...,"[going, disagree, previous, comment, side, mal..."
49999,0,one expects star trek movie high art fan expec...,"[one, expects, star, trek, movies, high, art, ..."


<span style="font-size:20px">Training the Model</span>

In [31]:

X_train ,X_test, Y_train, Y_test = train_test_split(X,Y,test_size =0.2,random_state=42)

In [32]:

model = LogisticRegression(max_iter=1000)
model.fit(X_train, Y_train)


<span style="font-size:20px">Checking the accuracy of training data</span>

In [33]:

X_train_prediction = model.predict(X_train)
Accuracy_Train = accuracy_score(X_train_prediction,Y_train)
Accuracy_Train

0.90945

<span style="font-size:20px">Checking the accuracy of test data</span>

In [34]:

X_test_prediction = model.predict(X_test)
Accuracy_Test = accuracy_score(X_test_prediction , Y_test)
Accuracy_Test

0.8862

<span style="font-size:20px">Predicting X values</span>

In [35]:

Y_pred = model.predict(X_test)
Y_pred

array([0, 1, 0, ..., 1, 0, 1])

<span style="font-size:20px">Classification Report</span>


In [36]:

from sklearn.metrics import classification_report, accuracy_score

print("Accuracy:", accuracy_score(Y_test, Y_pred))
print("\nClassification Report:\n")
print(classification_report(Y_test, Y_pred))


Accuracy: 0.8862

Classification Report:

              precision    recall  f1-score   support

           0       0.90      0.87      0.88      4961
           1       0.88      0.90      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



# Testing Interface

In [None]:

print("--------TESTING-INTERFACE--------")
review = input("\nEnter ypur review either its is postive or negative?\n")
review_vector = vec.transform([review])

prediction = model.predict(review_vector)

if prediction==[1]:
    print("Positive-review.......")
else:
    print("Negative-review.......")


--------TESTING-INTERFACE--------
