In [63]:
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


In [64]:
df= pd.read_csv('IMDB_Dataset.csv')
df.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [65]:
df.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [66]:
df.info()
# There is no null values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [67]:
# Two variables - explained variable (sentiment) and the explanatory variable (review)
X = df['review']
y = df['sentiment']


In [68]:
# Cleaning the data
# Removing numbers, links, special signs and stop words.
def preprocessing_data( review ):
    review = re.sub(r"http\S+|https\S+|www\S+", '', review, flags=re.MULTILINE)
    review = re.sub(r"\@w+|\#", '', review)
    review = re.sub(r"[^a-zA-Z]", " ", review)
    review = review.lower()
    review = review.split() 

    stop_words = set(stopwords.words('english'))
    text = [word for word in review if word not in stop_words]
    return " ".join(review)

df['clean_review'] = df['review'].apply(preprocessing_data)
df.head(10)

Unnamed: 0,review,sentiment,clean_review
0,One of the other reviewers has mentioned that ...,positive,one of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...,positive,a wonderful little production br br the filmin...
2,I thought this was a wonderful way to spend ti...,positive,i thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...,negative,basically there s a family where a little boy ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei s love in the time of money is a...
5,"Probably my all-time favorite movie, a story o...",positive,probably my all time favorite movie a story of...
6,I sure would like to see a resurrection of a u...,positive,i sure would like to see a resurrection of a u...
7,"This show was an amazing, fresh & innovative i...",negative,this show was an amazing fresh innovative idea...
8,Encouraged by the positive comments about this...,negative,encouraged by the positive comments about this...
9,If you like original gut wrenching laughter yo...,positive,if you like original gut wrenching laughter yo...


In [69]:
# Text vectoring
vectorizer = TfidfVectorizer(max_features=5000)
X_vectorized = vectorizer.fit_transform(df['clean_review'].astype(str))
X_vectorized


<50000x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 5845715 stored elements in Compressed Sparse Row format>

In [71]:
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)

In [72]:
model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_pred

array(['negative', 'positive', 'negative', ..., 'positive', 'negative',
       'positive'], dtype=object)

In [73]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.8944
Classification Report:
               precision    recall  f1-score   support

    negative       0.90      0.88      0.89      4961
    positive       0.89      0.91      0.90      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [56]:
#New text
new_text = ["I absolutely loved this movie, it was fantastic!"]

# cleaning and vectirizing text
cleaned_text = [preprocessing_data(review) for review in new_text]
vectorized_text = vectorizer.transform(cleaned_text)

# predictions
prediction = model.predict(vectorized_text)
print("Predicted Sentiment:", prediction)


Predicted Sentiment: ['positive']


In [76]:
# Another example
new_text2 = ["I didn't really like this movie"]
cleaned_text2 = [preprocessing_data(review) for review in new_text2]
vectorized_text2 = vectorizer.transform(cleaned_text2)

# predictions
prediction2 = model.predict(vectorized_text2)
print("Predicted Sentiment:", prediction2) 

Predicted Sentiment: ['negative']
