In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the data
df = pd.read_csv("IMDB Dataset.csv")

# Look at the first 5 rows
print(df.head())

# Check how many Positive vs Negative reviews we have
print("\nClass Distribution:")
print(df['sentiment'].value_counts())

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive

Class Distribution:
sentiment
positive    25000
negative    25000
Name: count, dtype: int64


In [2]:
import re

# Function to clean text
def clean_text(text):
    # 1. Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # 2. Remove punctuation and numbers (keep only letters)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # 3. Lowercase
    text = text.lower()
    return text

# Apply this function to the entire 'review' column
df['cleaned_review'] = df['review'].apply(clean_text)

# Let's see the difference
print("Original:", df['review'][0][:100])
print("Cleaned: ", df['cleaned_review'][0][:100])

Original: One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. The
Cleaned:  one of the other reviewers has mentioned that after watching just  oz episode youll be hooked they a


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# 1. Split Data
X = df['cleaned_review']
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 2. Create a Pipeline
# Step A: Turn text into numbers (remove 'english' stop words like 'the', 'a')
# Step B: Train a Logistic Regression model
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('model', LogisticRegression())
])

# 3. Train the Pipeline
pipeline.fit(X_train, y_train)

# 4. Predict
predictions = pipeline.predict(X_test)

# 5. Score
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

    negative       0.90      0.87      0.89      4961
    positive       0.88      0.91      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [4]:
# Create some new reviews
my_reviews = [
    "This movie was absolute trash. I hated every second of it.",  # Should be Negative
    "An absolute masterpiece! The acting was brilliant.",          # Should be Positive
    "I fell asleep halfway through. Boring plot.",                 # Should be Negative
    "It was okay, not great but not terrible either."              # The Tricky one
]

# Predict
results = pipeline.predict(my_reviews)
probs = pipeline.predict_proba(my_reviews)

for review, result, prob in zip(my_reviews, results, probs):
    # prob[1] is the probability of being Positive
    print(f"Review: {review}")
    print(f"Prediction: {result.upper()} (Confidence: {prob.max()*100:.2f}%)")
    print("-" * 30)

Review: This movie was absolute trash. I hated every second of it.
Prediction: NEGATIVE (Confidence: 83.03%)
------------------------------
Review: An absolute masterpiece! The acting was brilliant.
Prediction: POSITIVE (Confidence: 96.04%)
------------------------------
Review: I fell asleep halfway through. Boring plot.
Prediction: NEGATIVE (Confidence: 99.25%)
------------------------------
Review: It was okay, not great but not terrible either.
Prediction: NEGATIVE (Confidence: 87.17%)
------------------------------
