In [2]:
pip install wordcloud 

Defaulting to user installation because normal site-packages is not writeable
Collecting wordcloud
  Downloading wordcloud-1.9.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Downloading wordcloud-1.9.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (513 kB)
Installing collected packages: wordcloud
[0mSuccessfully installed wordcloud-1.9.4
Note: you may need to restart the kernel to use updated packages.


In [4]:
# Movie Review Sentiment Analysis - Rotten vs Fresh Prediction
# Models: Logistic Regression + Decision Tree + Naive Bayes
# Threshold: 60% (>=60 = Fresh, <60 = Rotten)

# ==========================================
# STEP (1) Problem Definition, Scoping & Framing
# ==========================================

# 1.1) Load Libraries
import pickle
import re
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from textblob import TextBlob
# Silence every library warning for the rest of the run. This keeps the output
# short, but it also hides convergence and deprecation notices.
warnings.filterwarnings('ignore')

# Plot appearance.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Project banner, one item per output line.
print(
    "=" * 60,
    "MOVIE REVIEW SENTIMENT ANALYSIS PROJECT",
    "Models: Logistic Regression + Decision Tree + Naive Bayes",
    "Threshold: >=60% = Fresh, <60% = Rotten",
    "=" * 60,
    sep="\n",
)

# Load the data set (path is relative to the working directory).
df = pd.read_csv('rotten_tomatoes_movies.csv')
print("\nDataset Shape:", df.shape)
print("\nFirst 5 rows:", df.head(), sep="\n")

# ==========================================
# STEP (2) Data Exploration & Understanding
# ==========================================

print("\n" + "=" * 50)
print("STEP 2: DATA EXPLORATION & UNDERSTANDING")
print("=" * 50)

print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() appends a stray "None" line.
df.info()

print("\nMissing Values:")
print(df.isnull().sum())

print("\nTomatometer Status Distribution:")
print(df['tomatometer_status'].value_counts())

print("\nTomatometer Rating Statistics:")
print(df['tomatometer_rating'].describe())

# ==========================================
# STEP (3) Data Preparation & Feature Engineering
# ==========================================

print("\n" + "=" * 50)
print("STEP 3: DATA PREPARATION & FEATURE ENGINEERING")
print("=" * 50)

def clean_text(text):
    """Normalise a raw review into lowercase, letters-only, single-spaced text.

    Args:
        text: Raw review value. Missing values (None, NaN, pd.NA) are accepted.
            Non-string scalars (for example a number that ended up in the
            column) are converted with ``str()`` before cleaning instead of
            raising ``AttributeError`` on ``.lower()``.

    Returns:
        str: The cleaned text. Every character that is not an ASCII letter or
        whitespace is removed (so digits and punctuation vanish without leaving
        a space), and runs of whitespace collapse to a single space. Missing
        input yields the empty string.
    """
    if not isinstance(text, str):
        if pd.isna(text):
            return ""
        # Tolerate stray non-string cells rather than crashing the whole apply().
        text = str(text)

    lowered = text.lower()
    letters_only = re.sub(r'[^a-zA-Z\s]', '', lowered)
    # split()/join trims the ends and collapses internal whitespace runs.
    return ' '.join(letters_only.split())

# Clean every review in one pass; the trailing fillna('') guarantees the column
# holds no missing values afterwards.
df['review_content_clean'] = df['review_content'].apply(clean_text).fillna('')

print("Text cleaning completed!")
first_clean_review = df['review_content_clean'].iloc[0]
print(f"Sample cleaned text: {first_clean_review[:100]}...")

def extract_text_features(df):
    """Add length, word, sentence and sentiment features for each review.

    The frame is modified in place and the same object is returned.

    Columns added:
        review_length: number of characters in the cleaned review.
        word_count: number of whitespace-separated words in the cleaned review.
        sentence_count: number of periods plus one.
        avg_word_length: mean word length of the cleaned review (0 if no words).
        sentiment_polarity / sentiment_subjectivity: TextBlob sentiment of the
            cleaned review (0 for empty text).

    Args:
        df: DataFrame with a string column 'review_content_clean'. When a raw
            'review_content' column is also present it is used for sentence
            counting.

    Returns:
        The DataFrame that was passed in, with the columns above added.
    """
    clean = df['review_content_clean']

    df['review_length'] = clean.str.len()
    df['word_count'] = clean.str.split().str.len()

    # Periods must be counted on the raw review: clean_text strips every
    # non-letter character, so counting them on the cleaned text always gives 0
    # and the feature degenerates to a constant 1. Fall back to the cleaned
    # column only when the raw one is unavailable.
    if 'review_content' in df.columns:
        sentence_source = df['review_content'].fillna('').astype(str)
    else:
        sentence_source = clean
    df['sentence_count'] = sentence_source.str.count(r'\.') + 1

    def _mean_word_length(text):
        # Guard on the word list (not the string) so whitespace-only text
        # yields 0 instead of the NaN that np.mean([]) produces.
        words = text.split()
        return np.mean([len(word) for word in words]) if words else 0

    df['avg_word_length'] = clean.apply(_mean_word_length)

    def _sentiment(text):
        # Build the TextBlob once per review and read both scores from it.
        if not text:
            return 0, 0
        sentiment = TextBlob(text).sentiment
        return sentiment.polarity, sentiment.subjectivity

    sentiments = [_sentiment(text) for text in clean]
    df['sentiment_polarity'] = [polarity for polarity, _ in sentiments]
    df['sentiment_subjectivity'] = [subjectivity for _, subjectivity in sentiments]
    return df

# Append the engineered feature columns; the helper adds them to df in place and
# returns the same frame.
df = extract_text_features(df)

def create_target_variable(df, threshold=60):
    """Label each row Fresh or Rotten from its tomatometer rating.

    Adds two columns to ``df`` in place and returns the same frame:
        target: 1 when ``tomatometer_rating >= threshold``, otherwise 0.
        target_label: 'Fresh' for 1, 'Rotten' for 0.

    A missing rating compares as False, so such rows receive 0 / 'Rotten'.

    Args:
        df: DataFrame with a numeric 'tomatometer_rating' column.
        threshold: Minimum rating that counts as Fresh (default 60).

    Returns:
        The DataFrame that was passed in, with the two columns added.
    """
    is_fresh = df['tomatometer_rating'].ge(threshold)
    df['target'] = is_fresh.astype(int)

    label_for = {1: 'Fresh', 0: 'Rotten'}
    df['target_label'] = df['target'].map(label_for)
    return df

# A missing rating is not evidence of a Rotten film, but NaN >= 60 evaluates to
# False, so unrated rows would silently be labelled 0. Remove them before
# labelling; the original row labels are kept so dropped rows stay traceable.
rated_mask = df['tomatometer_rating'].notna()
n_unrated = int((~rated_mask).sum())
df = create_target_variable(df.loc[rated_mask].copy())

print(f"\nTarget variable created with threshold: 60%")
print(f"Rows dropped (missing tomatometer_rating): {n_unrated}")
print("Target distribution:")
print(df['target'].value_counts())

# ==========================================
# STEP (4) ML Model Selection & Evaluation
# ==========================================

print("\n" + "=" * 50)
print("STEP 4: ML MODEL SELECTION & EVALUATION")
print("=" * 50)

vectorizer = TfidfVectorizer(
    max_features=1000,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=1,
    max_df=0.95
)

y = df['target'].values

# Decide train/test membership once, on row positions, so the combined and the
# text-only feature sets are guaranteed to share exactly the same rows.
row_positions = np.arange(len(df))
train_idx, test_idx = train_test_split(
    row_positions, test_size=0.3, random_state=42, stratify=y
)

# Learn the vocabulary and IDF weights from the training reviews only.
# Fitting on the full data set would let test-set statistics leak into the
# features and inflate the reported scores.
reviews = df['review_content_clean']
vectorizer.fit(reviews.iloc[train_idx])
X_text = vectorizer.transform(reviews).toarray()

numerical_features = ['review_length', 'word_count', 'sentence_count',
                     'avg_word_length', 'sentiment_polarity', 'sentiment_subjectivity']
X_numerical = df[numerical_features].fillna(0).values

# Combine features for LR and DT
X_combined = np.hstack([X_text, X_numerical])

# Split data for combined features (LR, DT)
X_train_comb, X_test_comb = X_combined[train_idx], X_combined[test_idx]
y_train_comb, y_test_comb = y[train_idx], y[test_idx]

# Split data for text features only (NB)
X_train_text, X_test_text = X_text[train_idx], X_text[test_idx]
y_train_text, y_test_text = y[train_idx], y[test_idx]

print(f"Training set size (combined features): {X_train_comb.shape}")
print(f"Test set size (combined features): {X_test_comb.shape}")
print(f"Training set size (text features only): {X_train_text.shape}")
print(f"Test set size (text features only): {X_test_text.shape}")

def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """Fit ``model``, score it on the test split and print a short report.

    Prints the accuracy, the AUC-ROC (only when the model exposes
    ``predict_proba``) and a classification report labelled Rotten / Fresh.

    Args:
        model: Estimator with ``fit`` and ``predict``; it is fitted in place.
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels.
        model_name: Name shown in the printed report header.

    Returns:
        tuple: (the fitted model, accuracy on the test split).
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Column 1 of predict_proba is the score of the positive class.
    positive_scores = None
    if hasattr(model, 'predict_proba'):
        positive_scores = model.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, predictions)

    print(f"\n{model_name} Results:")
    print(f"Accuracy: {accuracy:.4f}")
    if positive_scores is not None:
        print(f"AUC-ROC: {roc_auc_score(y_test, positive_scores):.4f}")
    print("Classification Report:")
    print(classification_report(y_test, predictions, target_names=['Rotten', 'Fresh']))
    return model, accuracy

# Baseline estimators, keyed by the name used in the printed reports.
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42, max_depth=10),
    'Naive Bayes': MultinomialNB()
}

model_results, trained_models = {}, {}

# (X_train, X_test, y_train, y_test) bundles, unpacked into evaluate_model below.
combined_split = (X_train_comb, X_test_comb, y_train_comb, y_test_comb)
text_split = (X_train_text, X_test_text, y_train_text, y_test_text)

# Logistic Regression and Decision Tree use combined features
for name in ('Logistic Regression', 'Decision Tree'):
    trained_model, accuracy = evaluate_model(models[name], *combined_split, name)
    model_results[name] = accuracy
    trained_models[name] = trained_model

# Naive Bayes uses text features only
trained_model_nb, accuracy_nb = evaluate_model(models['Naive Bayes'], *text_split, 'Naive Bayes')
model_results['Naive Bayes'] = accuracy_nb
trained_models['Naive Bayes'] = trained_model_nb

# ==========================================
# STEP (5) Performance Tuning & Optimization
# ==========================================

print("\n" + "=" * 50)
print("STEP 5: PERFORMANCE TUNING & OPTIMIZATION")
print("=" * 50)

print("Performing hyperparameter tuning...")

# Search spaces for the three estimators.
lr_params = dict(
    C=[0.1, 1, 10, 100],
    penalty=['l1', 'l2'],
    solver=['liblinear', 'saga'],
)
dt_params = dict(
    max_depth=[3, 5, 7, 10, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    criterion=['gini', 'entropy'],
)
nb_params = dict(alpha=[0.1, 0.5, 1.0, 2.0])

# 3-fold grid searches scored on accuracy.
lr_grid = GridSearchCV(LogisticRegression(random_state=42, max_iter=1000), lr_params, cv=3, scoring='accuracy')
dt_grid = GridSearchCV(DecisionTreeClassifier(random_state=42), dt_params, cv=3, scoring='accuracy')
nb_grid = GridSearchCV(MultinomialNB(), nb_params, cv=3, scoring='accuracy')

# Logistic Regression and Decision Tree search on the combined features,
# Naive Bayes on the text features only.
lr_grid.fit(X_train_comb, y_train_comb)
dt_grid.fit(X_train_comb, y_train_comb)
nb_grid.fit(X_train_text, y_train_text)

print(f"\nBest Logistic Regression Params: {lr_grid.best_params_}")
print(f"Best Logistic Regression CV Accuracy: {lr_grid.best_score_:.4f}")

print(f"\nBest Decision Tree Params: {dt_grid.best_params_}")
print(f"Best Decision Tree CV Accuracy: {dt_grid.best_score_:.4f}")

print(f"\nBest Naive Bayes Params: {nb_grid.best_params_}")
print(f"Best Naive Bayes CV Accuracy: {nb_grid.best_score_:.4f}")

# Evaluate tuned models on test set
best_lr, best_dt, best_nb = (grid.best_estimator_ for grid in (lr_grid, dt_grid, nb_grid))

print("\nEvaluating tuned models on test data...")

tuned_runs = (
    (best_lr, (X_train_comb, X_test_comb, y_train_comb, y_test_comb), "Tuned Logistic Regression"),
    (best_dt, (X_train_comb, X_test_comb, y_train_comb, y_test_comb), "Tuned Decision Tree"),
    (best_nb, (X_train_text, X_test_text, y_train_text, y_test_text), "Tuned Naive Bayes"),
)
for tuned_model, tuned_split, tuned_title in tuned_runs:
    # evaluate_model refits the estimator on the training split before scoring.
    evaluate_model(tuned_model, *tuned_split, tuned_title)

# ==========================================
# Optional: Save the best Logistic Regression model
# NOTE: the pickle holds only the classifier. Scoring new reviews also needs the
# fitted `vectorizer` and the same numerical feature columns, neither of which
# is saved here.
# Only unpickle this file from a trusted location: loading a pickle can execute
# arbitrary code.
with open('best_logistic_model.pkl', 'wb') as f:
    pickle.dump(best_lr, f)

print("\nBest Logistic Regression model saved as 'best_logistic_model.pkl'")

# ==========================================
# End of Script
# ==========================================



MOVIE REVIEW SENTIMENT ANALYSIS PROJECT
Models: Logistic Regression + Decision Tree + Naive Bayes
Threshold: >=60% = Fresh, <60% = Rotten

Dataset Shape: (17711, 23)

First 5 rows:
                    rotten_tomatoes_link  \
0                              m/0814255   
1                              m/0878835   
2                                   m/10   
3                 m/1000013-12_angry_men   
4  m/1000079-20000_leagues_under_the_sea   

                                         movie_title  \
0  Percy Jackson & the Olympians: The Lightning T...   
1                                        Please Give   
2                                                 10   
3                    12 Angry Men (Twelve Angry Men)   
4                       20,000 Leagues Under The Sea   

                                          movie_info  \
0  Always trouble-prone, the life of teenager Per...   
1  Kate (Catherine Keener) and her husband Alex (...   
2  A successful, middle-aged Hollywood songwriter