In [60]:
# Step 1: Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

In [59]:
# Step 2: Load the dataset
df = pd.read_csv('/content/IMDB Dataset.csv', engine='python')

In [61]:
# To see all records
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [44]:
# To display the first 5 rows
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [45]:
# To display the last 5 rows
df.tail()

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative
49999,No one expects the Star Trek movies to be high...,negative


In [46]:
df.shape

(50000, 2)

In [47]:
df.isnull().sum()

Unnamed: 0,0
review,0
sentiment,0


In [48]:
df.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [50]:
# Split data into features (X) and labels (y)
X = df['review']
y = df['sentiment']

In [51]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [52]:
# Convert text data into numerical features using CountVectorizer
vectorizer = TfidfVectorizer(stop_words='english')
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

In [53]:
# Display the vocabulary (words) learned by TF-IDF Vectorizer
print("Vocabulary without stop words:")
print(vectorizer.get_feature_names_out())

Vocabulary without stop words:
['00' '000' '00000000000' ... 'żmijewski' 'יגאל' 'כרמון']


In [54]:
# Initialize the model (Logistic Regression)
model = LogisticRegression()

In [55]:
# Train the Model
model.fit(X_train_vect, y_train)

In [56]:
# Make predictions
y_pred = model.predict(X_test_vect)


In [57]:
# Evaluate the model
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Classification Report: \n{classification_report(y_test, y_pred)}')

Accuracy: 0.8942
Classification Report: 
              precision    recall  f1-score   support

    negative       0.91      0.88      0.89      4961
    positive       0.88      0.91      0.90      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [58]:
# Test a new review
new_review = ["The movie was absolutely fantastic! I loved it."]
new_review_vect = vectorizer.transform(new_review)
prediction = model.predict(new_review_vect)
print(f'Sentiment of the new review: {prediction[0]}')

Sentiment of the new review: positive
