In [6]:
!pip -q install datasets

import re
import numpy as np
import pandas as pd

from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [7]:
dataset = load_dataset("imdb")


def _shuffled_subset(split_name, n_rows):
    """Return the first ``n_rows`` rows of a shuffled copy of one ``dataset`` split.

    The split's "text" and "label" columns are placed in a DataFrame, shuffled
    with a fixed seed (42) so the truncated subset contains both classes, and
    re-indexed from 0 before truncation.
    """
    split = dataset[split_name]
    frame = pd.DataFrame({"text": split["text"], "label": split["label"]})
    shuffled = frame.sample(frac=1, random_state=42).reset_index(drop=True)
    return shuffled.head(n_rows)


# Smaller subsets keep the rest of the notebook fast.
train_df = _shuffled_subset("train", 5000)
test_df = _shuffled_subset("test", 2000)

# Per-class row counts (index = label value) for each subset.
for caption, frame in (("Train class counts:", train_df), ("Test class counts:", test_df)):
    print(caption, np.bincount(frame["label"]))

Train class counts: [2515 2485]
Test class counts: [1040  960]


In [8]:
# Patterns are applied to already-lowercased text, in the order listed here.
_BR_TAG = re.compile(r"<br\s*/?>")
_NON_LETTER = re.compile(r"[^a-z\s]")
_WHITESPACE_RUN = re.compile(r"\s+")


def clean_text(text):
    """Normalise a raw review string for vectorisation.

    Steps, in order: lowercase the text, replace ``<br>``-style tags with a
    space, replace every character that is neither ``a``-``z`` nor whitespace
    with a space, then collapse whitespace runs to single spaces and trim
    both ends.

    Args:
        text: The raw review text (a ``str``).

    Returns:
        The cleaned string; it may be empty if nothing survives the filters.
    """
    lowered = text.lower()
    for pattern in (_BR_TAG, _NON_LETTER):
        lowered = pattern.sub(" ", lowered)
    return _WHITESPACE_RUN.sub(" ", lowered).strip()

# Run every review through clean_text, collecting the results as plain lists.
train_texts = [clean_text(review) for review in train_df["text"]]
test_texts = [clean_text(review) for review in test_df["text"]]

# Labels as NumPy arrays, in the same row order as the text lists above.
y_train, y_test = (frame["label"].to_numpy() for frame in (train_df, test_df))

# Show the first 300 characters of one cleaned review as a sanity check.
print("Example cleaned review:\n", train_texts[0][:300])

Example cleaned review:
 dumb is as dumb does in this thoroughly uninteresting supposed black comedy essentially what starts out as chris klein trying to maintain a low profile eventually morphs into an uninspired version of the three amigos only without any laughs in order for black comedy to work it must be outrageous whi


In [9]:
# TF-IDF features with English stop words removed and max_features=20000.
vectorizer = TfidfVectorizer(max_features=20000, stop_words="english")

# Fit on the training texts only; the test texts are transformed with the
# already-fitted vectorizer.
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

for caption, matrix in (("X_train shape:", X_train), ("X_test shape:", X_test)):
    print(caption, matrix.shape)

X_train shape: (5000, 20000)
X_test shape: (2000, 20000)


In [10]:
# fit() returns the fitted estimator, so construction and training are chained.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print("Model trained successfully!")

Model trained successfully!


In [12]:
def predict_sentiment(text):
    """Classify one raw review using the fitted ``vectorizer`` and ``model``.

    The text is cleaned with ``clean_text`` and vectorised before prediction.

    Args:
        text: The raw review text.

    Returns:
        A ``(label, prob)`` tuple. ``label`` is ``"Positive"`` when the
        predicted class equals 1 and ``"Negative"`` otherwise; ``prob`` is the
        largest value returned by ``model.predict_proba`` for this review.
    """
    features = vectorizer.transform([clean_text(text)])
    predicted_class = model.predict(features)[0]
    confidence = model.predict_proba(features).max()
    if predicted_class == 1:
        return "Positive", confidence
    return "Negative", confidence

# Hand-written reviews: one positive, one negative, one mixed.
examples = [
    "This movie was fantastic, I really loved it!",
    "Worst movie ever. Very boring and waste of time.",
    "It was okay, not bad but not great either.",
]

# map() is lazy, so each review is scored immediately before it is printed.
for e, (label, prob) in zip(examples, map(predict_sentiment, examples)):
    print(f"Review: {e}\nPrediction: {label} (confidence {prob:.2f})\n")

Review: This movie was fantastic, I really loved it!
Prediction: Positive (confidence 0.93)

Review: Worst movie ever. Very boring and waste of time.
Prediction: Negative (confidence 1.00)

Review: It was okay, not bad but not great either.
Prediction: Negative (confidence 0.69)



In [13]:
# Predict labels for the held-out test features.
y_pred = model.predict(X_test)

# Compute each metric once, then report them in order.
test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)  # rows: true label, columns: predicted
test_report = classification_report(
    y_test,
    y_pred,
    target_names=["Negative", "Positive"],
)

print("Accuracy:", test_accuracy)
print("\nConfusion Matrix:\n", test_confusion)
print("\nReport:\n", test_report)

Accuracy: 0.8515

Confusion Matrix:
 [[866 174]
 [123 837]]

Report:
               precision    recall  f1-score   support

    Negative       0.88      0.83      0.85      1040
    Positive       0.83      0.87      0.85       960

    accuracy                           0.85      2000
   macro avg       0.85      0.85      0.85      2000
weighted avg       0.85      0.85      0.85      2000



### Insights

- The IMDb dataset contains movie reviews labeled as Negative (0) and Positive (1).
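- Text preprocessing was performed: lowercasing, replacing `<br>` line-break tags with spaces (other HTML tags are not specifically handled), replacing every character other than a–z and whitespace (punctuation, digits, accented letters) with a space, and collapsing extra whitespace.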
- TF‑IDF (with English stop words removed and the vocabulary capped at 20,000 terms) was used to convert text into numeric features representing the important words in each review.
- A Logistic Regression model was trained on the TF‑IDF vectors to classify reviews as positive or negative.
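- The evaluation results (accuracy, confusion matrix, and classification report) show the model's performance on the 2,000-review test subset, which was not used for training: accuracy is 0.8515, precision and recall fall between 0.83 and 0.88 for both classes (F1 of 0.85 each), and the confusion matrix shows 174 negative reviews misclassified as positive and 123 positive reviews misclassified as negative.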
- This approach can be used for real‑world review monitoring to understand customer feedback automatically.