In [None]:
# Sentiment Analysis on IMDB Movie Reviews using ML Models

# Step 1: Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix, roc_curve, auc
import time

# Step 2: Load Dataset
# The CSV is expected to provide the 'review' and 'sentiment' columns that the
# following steps read.
dataset_path = "data/IMDB Dataset.csv"
df = pd.read_csv(dataset_path)
df.head()

# Step 3: Preprocess Text
# Compiled once at import time so that applying clean_text to every row does
# not repeat the pattern lookup / punctuation-table construction.
_HTML_TAG_RE = re.compile(r"<.*?>")
_DIGIT_TOKEN_RE = re.compile(r"\w*\d\w*")
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def clean_text(text):
    """Normalize a raw review string before vectorization.

    Steps, in order: lowercase, replace HTML-like tags with a space, delete
    ASCII punctuation, delete every token that contains a digit, and collapse
    runs of whitespace into single spaces.

    Args:
        text: The review text as a ``str``.

    Returns:
        The cleaned text, with no leading/trailing whitespace.
    """
    text = text.lower()
    # Tags become a space rather than an empty string: deleting them outright
    # glues the neighbouring words together ("good.<br /><br />bad" would end
    # up as the single token "goodbad").
    text = _HTML_TAG_RE.sub(" ", text)
    text = text.translate(_PUNCTUATION_TABLE)
    text = _DIGIT_TOKEN_RE.sub("", text)
    # Removing tags and digit tokens leaves stray gaps; normalize them.
    return " ".join(text.split())

# Clean every review in place, then encode the text labels as integers
# (1 = positive, 0 = negative). Labels missing from the mapping become NaN.
sentiment_codes = {'positive': 1, 'negative': 0}

cleaned_reviews = df['review'].apply(clean_text)
df['review'] = cleaned_reviews

encoded_sentiment = df['sentiment'].map(sentiment_codes)
df['sentiment'] = encoded_sentiment

df.head()

# Step 4 / Step 5: Train/Test Split and Feature Extraction
# The raw reviews are split BEFORE the vectorizer is fitted so that the
# vocabulary and IDF weights are learned from the training reviews only.
# Fitting on the whole corpus first would leak test-set statistics into the
# features and inflate the reported scores. test_size and random_state are
# unchanged.
y = df['sentiment'].values
reviews_train, reviews_test, y_train, y_test = train_test_split(
    df['review'], y, test_size=0.2, random_state=42
)

vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# The matrices are kept sparse (no .toarray()): with up to 5000 float64
# columns a dense row costs 40 KB per review, while the estimators trained
# below all accept SciPy sparse input.
X_train = vectorizer.fit_transform(reviews_train)
X_test = vectorizer.transform(reviews_test)

# Full-corpus features are still exposed under the original name `X` for any
# later code that reads it; they are produced by the train-fitted vectorizer.
X = vectorizer.transform(df['review'])

# Step 6: Train Models and Evaluate
# Each estimator is fitted on the training split and scored on the test split.
# Accuracy, F1 and fit time are collected in parallel lists that follow the
# insertion order of `models`.
models = {
    "Linear Regression": LinearRegression(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": MultinomialNB(),
    "KNN": KNeighborsClassifier(n_neighbors=5)
}

accuracies = []
f1_scores = []
training_times = []

for name, model in models.items():
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # durations; time.time() follows the wall clock and can jump if the system
    # time is adjusted while a model is fitting.
    start = time.perf_counter()
    model.fit(X_train, y_train)
    end = time.perf_counter()

    preds = model.predict(X_test)
    if isinstance(model, LinearRegression):
        # The regressor outputs continuous values; threshold them at 0.5 to
        # obtain 0/1 class labels comparable with y_test.
        preds = np.where(preds >= 0.5, 1, 0)

    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds)

    accuracies.append(acc)
    f1_scores.append(f1)
    training_times.append(end - start)

    print(f"\n{name} Results:")
    print("Accuracy:", acc)
    print("F1 Score:", f1)
    # The fit time was previously collected but never reported anywhere.
    print(f"Training Time: {end - start:.2f} s")
    print("Classification Report:\n", classification_report(y_test, preds))

# Step 7: Accuracy and F1 Score Plot
# Both charts share the same layout, so they are described as data and drawn
# by a single loop: (title, y-axis label, per-model scores, seaborn palette).
model_names = list(models.keys())
metric_charts = (
    ("Model Accuracy Comparison", "Accuracy", accuracies, "viridis"),
    ("Model F1 Score Comparison", "F1 Score", f1_scores, "magma"),
)

for chart_title, axis_label, scores, palette_name in metric_charts:
    plt.figure(figsize=(10, 4))
    sns.barplot(x=model_names, y=scores, palette=palette_name)
    plt.title(chart_title)
    # Fix the y-axis to the 0..1 range so the two charts are comparable.
    plt.ylim(0, 1)
    plt.ylabel(axis_label)
    plt.show()

# Step 8: Confusion Matrices
# One heatmap per fitted model; the y-axis is labelled 'Actual' and the
# x-axis 'Predicted'.
class_names = ['Negative', 'Positive']

for model_name, fitted_model in models.items():
    test_preds = fitted_model.predict(X_test)
    if model_name == "Linear Regression":
        # Continuous regression output -> 0/1 labels via a 0.5 threshold.
        test_preds = np.where(test_preds >= 0.5, 1, 0)

    matrix = confusion_matrix(y_test, test_preds)
    sns.heatmap(
        matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.title(f'Confusion Matrix: {model_name}')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

# Step 9: ROC Curve (Logistic Regression)
# Use the predicted probability of column 1 of predict_proba as the ranking
# score for the ROC curve.
log_reg = models["Logistic Regression"]
y_score = log_reg.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)

roc_fig, roc_ax = plt.subplots(figsize=(8, 5))
roc_ax.plot(
    fpr,
    tpr,
    color='darkorange',
    lw=2,
    label=f'Logistic Regression (AUC = {roc_auc:.2f})',
)
# Dashed diagonal drawn for reference.
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve')
roc_ax.legend(loc="lower right")
roc_ax.grid(True)
plt.show()

# Step 10: Sentiment Distribution
# Labels, colors and counts are all listed in one explicit class order.
# value_counts() sorts its result by frequency, so pairing it with a
# hard-coded ['Positive', 'Negative'] list can attach each label (and color)
# to the wrong wedge whenever negatives come first in that ordering.
class_codes = [1, 0]  # encoded sentiment values, in the same order as `labels`
labels = ['Positive', 'Negative']
colors = ['#66b3ff', '#ff9999']

# reindex() forces the counts into class_codes order; a class that does not
# occur in the data gets a count of 0 instead of being dropped.
sizes = df['sentiment'].value_counts().reindex(class_codes, fill_value=0)

plt.figure(figsize=(6, 6))
plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=140)
plt.title("Sentiment Distribution")
# Equal aspect ratio keeps the pie circular.
plt.axis('equal')
plt.show()
