In [2]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
import re
import nltk

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Download stopwords
nltk.download('stopwords')
from nltk.corpus import stopwords

# ========================
# Step 2: Upload Dataset from local
# ========================
from google.colab import files
uploaded = files.upload()   # Select "IMDB Dataset.csv" file

# Load dataset
data = pd.read_csv("IMDB Dataset.csv")
print("Dataset shape:", data.shape)
print(data.head())

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Saving IMDB Dataset.csv to IMDB Dataset.csv
Dataset shape: (50000, 2)
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [3]:
# Step 3: Preprocessing
# ========================
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'<.*?>', '', text)        # remove HTML tags
    text = re.sub(r'[^a-z\s]', '', text)     # remove special characters & numbers
    text = " ".join([word for word in text.split() if word not in STOPWORDS])  # remove stopwords
    return text

data['cleaned'] = data['review'].apply(clean_text)

print("\nSample cleaned reviews:")
print(data['cleaned'].head())


Sample cleaned reviews:
0    one reviewers mentioned watching oz episode yo...
1    wonderful little production filming technique ...
2    thought wonderful way spend time hot summer we...
3    basically theres family little boy jake thinks...
4    petter matteis love time money visually stunni...
Name: cleaned, dtype: object


In [4]:
# Step 4: Feature Engineering
# ========================
vectorizer = CountVectorizer(max_features=5000)  # Bag of Words
X = vectorizer.fit_transform(data['cleaned']).toarray()
y = data['sentiment']

In [5]:
# Step 5: Train-Test Split
# ========================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [6]:
# Step 6: Train Model
# ========================
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [7]:
# Step 7: Evaluate Model
# ========================
y_pred = model.predict(X_test)
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.8735

Classification Report:
               precision    recall  f1-score   support

    negative       0.88      0.87      0.87      4961
    positive       0.87      0.88      0.88      5039

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



In [8]:
# Step 8: Predict New Review
# ========================
def predict_sentiment(review):
    review = clean_text(review)
    review_vec = vectorizer.transform([review]).toarray()
    return model.predict(review_vec)[0]

print("\nExamples:")
print("Positive review:", predict_sentiment("This movie was fantastic! I loved the story and acting."))
print("Negative review:", predict_sentiment("Worst film ever. Completely boring and a waste of time."))


Examples:
Positive review: positive
Negative review: negative
