In [35]:
"""
! Question 2: IMDB Top 1000 Regression

* This notebook will:
1. Drop unneeded columns from the imdb_top_1000_New.csv dataset.
2. Predict IMDB_Rating using Released_Year, Runtime2, No_of_Votes, Gross, Genre with train/test split and 5-fold CV, then evaluate RMSE and reliability.
3. Repeat prediction to predict Gross as the target.
4. Treat Released_Year as a regression target and repeat modeling, then report RMSE.
"""

'\n! Question 2: IMDB Top 1000 Regression\n\n* This notebook will:\n1. Drop unneeded columns form the imdb_top_1000_New.csv dataset.\n2. Predict IMDB_Rating using Released_Year, Runtime2, No_of_Votes, Gross, Genre with train/test split and 5-fold CV, then evaluate RMSE and reliability.\n3. Repeat prediction to predict Gross as the target.\n4. Treat Released_Year as a regression target and repeat modeling, then report RMSE,\n'

In [None]:
"""
! Part A: Data Preparation
"""

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# * Read the CSV and discard the columns that are not used by any model below
# ? Built as one chain (no in-place mutation), so `df` is always derived
# ? straight from the file whenever this cell is re-run
df = (
    pd.read_csv('./res/imdb_top_1000_New.csv')
    .drop(columns=['Series_Title', 'Certificate', 'Runtime'])
)

# * Show which columns remain
print(df.columns)

Index(['Released_Year', 'Runtime2', 'Genre', 'IMDB_Rating', 'No_of_Votes',
       'Gross'],
      dtype='object')


In [None]:
"""
! Part B: Predict IMDB_Rating

? Based on the overall RMSE, do you think the model is reliable?
* The model is reliable because the overall test RMSE is low (0.20, about 2.5% of the mean rating); it explains about 48% of the variance in the test-set ratings (R² ≈ 0.48).
"""

# * Response column and the predictor columns used to model it
target = 'IMDB_Rating'
features = ['Released_Year', 'Runtime2', 'No_of_Votes', 'Gross', 'Genre']

# * Response vector first, then the encoded design matrix
y = df[target]
# ? get_dummies expands non-numeric predictors into indicator columns
# ? (presumably only Genre here - confirm with df.dtypes); drop_first=True
# ? omits one level per encoded column
X = pd.get_dummies(df[features], drop_first=True)

# ? Train-test split: 20% of the rows are held out, seeded so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# * 5-Fold CV RMSE
def cv_rmse(model, X=None, y=None, n_splits=5, random_state=42):
    """Return the mean cross-validated RMSE of ``model``.

    Args:
        model: scikit-learn estimator handed to ``cross_val_score``.
        X: Feature matrix to cross-validate on. When omitted, the
            notebook-level ``X_train`` that is current at call time is used.
        y: Target values aligned with ``X``. When omitted, the notebook-level
            ``y_train`` that is current at call time is used.
        n_splits: Number of shuffled K-fold splits (default 5).
        random_state: Seed for the fold shuffling (default 42).

    Returns:
        The mean of the per-fold RMSE values, as a positive number.
    """
    # ? Fall back to the notebook's current training split so that existing
    # ? `cv_rmse(model)` calls keep working; passing X / y explicitly avoids
    # ? depending on whichever cell last rebound those globals.
    if X is None:
        X = X_train
    if y is None:
        y = y_train

    folds = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    scores = cross_val_score(model, X, y, cv=folds, scoring='neg_root_mean_squared_error')
    # * The scorer reports negated RMSE, so flip the sign back
    return -scores.mean()

# * Candidate regressors, keyed by the label used in the printed reports
# ? Insertion order is the order in which the evaluation loops visit them
models = {}
models['Linear Regression'] = LinearRegression()
models['Decision Tree'] = DecisionTreeRegressor(random_state=42)
models['Random Forest'] = RandomForestRegressor(random_state=42)

# * Evaluate models
# ? For every candidate: 5-fold CV RMSE on the training split, then refit on
# ? the full training split and score RMSE on the held-out test split
results = {}
for name, model in models.items():
    cv_score = cv_rmse(model)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    results[name] = {'CV_RMSE': cv_score, 'Test_RMSE': rmse}
    print(f"{name}: CV RMSE = {cv_score:.2f}, Test RMSE = {rmse:.2f}")

# ! Overall RMSE
# ? Choose the reported model explicitly (lowest CV RMSE) rather than reusing
# ? the `preds` / `model` loop variables, which only ever describe whichever
# ? estimator happened to be last in `models`. Selecting on the CV score keeps
# ? the test split out of the model choice.
best_name = min(results, key=lambda label: results[label]['CV_RMSE'])
best_model = models[best_name]  # already fitted on X_train inside the loop above
overall_rmse = results[best_name]['Test_RMSE']
print(f"Best model by CV RMSE: {best_name}")
print(f"Overall RMSE: {overall_rmse:.2f}")

# ? Reliability
# NOTE(review): the cut-off of 1 is expressed in the target's own units
# (rating points), so it is only meaningful for this target - confirm before reuse.
if overall_rmse < 1:
    print("The model is reliable.")
    # Test RMSE as a percentage of the mean test target
    print(f"Overall Reliability: {(overall_rmse / np.mean(y_test)) * 100:.2f}%")
    # `score` on the selected model, shown as a percentage (R^2 for scikit-learn regressors)
    print(f"Variance Explained: {best_model.score(X_test, y_test) * 100:.2f}%")
else:
    print("The model is not reliable.")

# * Convert results to DataFrame for better visualization
pd.DataFrame(results).T

Linear Regression: CV RMSE = 0.22, Test RMSE = 0.20
Decision Tree: CV RMSE = 0.27, Test RMSE = 0.25
Random Forest: CV RMSE = 0.20, Test RMSE = 0.20
Overall RMSE: 0.20
The model is reliable.
Overall Reliability: 2.47%
Variance Explained: 48.28%


Unnamed: 0,CV_RMSE,Test_RMSE
Linear Regression,0.217869,0.19683
Decision Tree,0.265647,0.251405
Random Forest,0.199967,0.195727


In [None]:
"""
! Part C: Predict Gross

* An RMSE of 63 million on a mean Gross of ~80 million (i.e. ~78% error) is far too large, so none of these regression models is reliable for predicting Gross in its raw scale.
"""

# * Gross is now the response, so it is left out of the predictor list
target = 'Gross'
features_gross = ['Released_Year', 'Runtime2', 'No_of_Votes', 'Genre']

# * Response vector, then the indicator-encoded design matrix
y = df[target]
X = pd.get_dummies(df[features_gross], drop_first=True)

# ? Train-test split (same 80/20 ratio and seed as before)
# ? Rebinding X_train / y_train here is what points cv_rmse(model) at the new target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# * Evaluate models
# ? For every candidate: 5-fold CV RMSE on the training split, then refit on
# ? the full training split and score RMSE on the held-out test split
results2 = {}
for name, model in models.items():
    cv_score = cv_rmse(model)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    results2[name] = {'CV_RMSE': cv_score, 'Test_RMSE': rmse}
    print(f"{name}: CV RMSE = {cv_score:.2f}, Test RMSE = {rmse:.2f}")

# ! Overall RMSE
# ? Report the model with the lowest CV RMSE explicitly instead of reusing the
# ? leaked `preds` loop variable, which only reflects whichever estimator
# ? happened to be last in `models`.
best_name = min(results2, key=lambda label: results2[label]['CV_RMSE'])
overall_rmse = results2[best_name]['Test_RMSE']
print(f"Best model by CV RMSE: {best_name}")
print(f"Overall RMSE: {overall_rmse:.2f}")

# * Convert results to DataFrame for better visualization
pd.DataFrame(results2).T



Linear Regression: CV RMSE = 85408312.83, Test RMSE = 72263191.92
Decision Tree: CV RMSE = 102112298.90, Test RMSE = 93899967.58
Random Forest: CV RMSE = 81774353.15, Test RMSE = 63295143.28
Overall RMSE: 63295143.28
The model is not reliable.


Unnamed: 0,CV_RMSE,Test_RMSE
Linear Regression,85408310.0,72263190.0
Decision Tree,102112300.0,93899970.0
Random Forest,81774350.0,63295140.0


In [64]:
"""Part D: Predict Released_Year as Regression"""

# * Released_Year is now the response; the remaining columns are the predictors
target = 'Released_Year'
y = df[target]
X = pd.get_dummies(
    df[['Runtime2', 'No_of_Votes', 'Gross', 'Genre']],
    drop_first=True,
)

# ? Train-test split (80/20, seeded)
# ? Rebinding X_train / y_train here is what points cv_rmse(model) at the new target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# * Score every candidate: 5-fold CV RMSE on the training split, then RMSE on
# * the held-out split after refitting on the full training split
results3 = {}
for name in models:
    model = models[name]
    cv_score = cv_rmse(model)
    # fit() hands back the estimator itself, so the prediction can be chained
    preds = model.fit(X_train, y_train).predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    results3[name] = dict(CV_RMSE=cv_score, Test_RMSE=rmse)
    print(f"{name}: CV RMSE = {cv_score:.2f}, Test RMSE = {rmse:.2f}")

# * One row per model, one column per metric
pd.DataFrame.from_dict(results3, orient='index')

Linear Regression: CV RMSE = 21.12, Test RMSE = 20.65
Decision Tree: CV RMSE = 26.86, Test RMSE = 25.14
Random Forest: CV RMSE = 19.98, Test RMSE = 19.75


Unnamed: 0,CV_RMSE,Test_RMSE
Linear Regression,21.120425,20.647685
Decision Tree,26.858014,25.13983
Random Forest,19.977993,19.751346
