In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB 

In [2]:
data=pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")

In [3]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
import nltk
import re
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [6]:
stop_words=set(stopwords.words('english'))

In [7]:
def clean_review(text):
    text=text.lower()
    text=re.sub(r"[^a-zA-Z\s+]",'',text) #convert numbers
    text=re.sub(r"http\s+|www\s+",'',text) #remove URLS
    words=word_tokenize(text)
    words=[word for word in words if word not in stop_words]
    return " ".join(words)
    

In [8]:
data['cleaned_review']=data['review'].apply(clean_review)

In [9]:
data.head()

Unnamed: 0,review,sentiment,cleaned_review
0,One of the other reviewers has mentioned that ...,positive,one reviewers mentioned watching oz episode yo...
1,A wonderful little production. <br /><br />The...,positive,wonderful little production br br filming tech...
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,negative,basically theres family little boy jake thinks...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter matteis love time money visually stunni...


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer #TfidfVectorizer

In [11]:
tfidfv=TfidfVectorizer(max_features=5000)
tfidv_data=tfidfv.fit_transform(data['cleaned_review'])

In [12]:
data["sentiment"]=data["sentiment"].map({"positive": 1,'negative' :0})

In [13]:
data.head()

Unnamed: 0,review,sentiment,cleaned_review
0,One of the other reviewers has mentioned that ...,1,one reviewers mentioned watching oz episode yo...
1,A wonderful little production. <br /><br />The...,1,wonderful little production br br filming tech...
2,I thought this was a wonderful way to spend ti...,1,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,0,basically theres family little boy jake thinks...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,petter matteis love time money visually stunni...


In [14]:
x=tfidv_data
y=data['sentiment']

In [15]:
lr_model=LogisticRegression(penalty='l2',solver='saga',max_iter=1000)

In [16]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=42)

In [17]:
lr_model.fit(x_train,y_train)

In [18]:
y_pred=lr_model.predict(x_test)

In [19]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.90      0.87      0.89      6157
           1       0.88      0.90      0.89      6343

    accuracy                           0.89     12500
   macro avg       0.89      0.89      0.89     12500
weighted avg       0.89      0.89      0.89     12500



In [20]:
nb_model=MultinomialNB()

In [21]:
nb_model.fit(x_train,y_train) #train the model

In [22]:
y_pred_nb=nb_model.predict(x_test)

In [23]:
print(classification_report(y_pred_nb,y_pred))

              precision    recall  f1-score   support

           0       0.90      0.88      0.89      6166
           1       0.89      0.91      0.90      6334

    accuracy                           0.90     12500
   macro avg       0.90      0.90      0.90     12500
weighted avg       0.90      0.90      0.90     12500

