In [77]:
# Imports
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MultiLabelBinarizer, FunctionTransformer

In [78]:
# Switch the working directory to the project root (located by pyprojroot's `here()`)
# to ensure consistency across environments for project-specific imports and for the
# relative "./data/..." path used when loading the dataset.
from pyprojroot import here
os.chdir(here())

# Project-specific imports (run after the chdir above)
from src.utils import save_dataframe

In [79]:
# Load the cleaned dataset. The path is relative, so it is resolved against the
# current working directory (set to the project root by the os.chdir call earlier).
data = pd.read_csv("./data/01_clean_data.csv")

In [80]:
# Keep only rows that have a revenue value and whose budget is not flagged as missing
has_revenue = data["revenue"].notna()
budget_known = data["budget_missing"] == 0
data = data[has_revenue & budget_known]

In [81]:
# Inspect column dtypes and non-null counts of the filtered frame
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3349 entries, 16 to 5996
Data columns (total 37 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   metascore                       3349 non-null   float64
 1   rotten_tomatoes_rating          3349 non-null   float64
 2   meta_critic_rating              3349 non-null   float64
 3   budget                          3349 non-null   float64
 4   metascore_missing               3349 non-null   int64  
 5   rotten_tomatoes_rating_missing  3349 non-null   int64  
 6   meta_critic_rating_missing      3349 non-null   int64  
 7   budget_missing                  3349 non-null   int64  
 8   total_wins                      3349 non-null   int64  
 9   total_noms                      3349 non-null   int64  
 10  oscar_wins                      3349 non-null   int64  
 11  oscar_noms                      3349 non-null   int64  
 12  bafta_wins                      3349 n

## Multilabel Binarizer

In [82]:
def apply_mlb_for_feature(df: pd.DataFrame, feature: str, delimiter: str = ",") -> pd.DataFrame:
    """
    Multi-hot encode a single delimited multi-label column.

    Each cell of ``df[feature]`` is split on ``delimiter``, the pieces are stripped of
    surrounding whitespace, and a ``MultiLabelBinarizer`` is fitted on the resulting
    label lists. The original column is replaced by one indicator column per label,
    named ``f"{feature}_{label}"``.

    Note: the binarizer is fitted on every call, so the set of output columns depends
    on the labels present in ``df``.

    Args:
        df (pd.DataFrame): The DataFrame containing the data. It is not modified.
        feature (str): The name of the column to encode.
        delimiter (str): The delimiter separating values inside a cell (default: ",").

    Returns:
        pd.DataFrame: A new DataFrame in which ``feature`` has been removed and the
        indicator columns have been appended at the end.
    """
    def _split_labels(cell) -> list:
        # Missing cells carry no labels.
        if pd.isnull(cell):
            return []
        # Drop empty pieces (blank cells, doubled or trailing delimiters) so they do
        # not turn into a meaningless "<feature>_" indicator column.
        pieces = (piece.strip() for piece in cell.split(delimiter))
        return [label for label in pieces if label]

    # Keep the label lists in a standalone Series instead of a temporary
    # "<feature>_list" column, so an existing column of that name is never clobbered.
    label_lists = df[feature].apply(_split_labels)

    # Fit the binarizer and build the indicator frame on the original index
    mlb = MultiLabelBinarizer()
    dummies = pd.DataFrame(
        mlb.fit_transform(label_lists),
        columns=[f"{feature}_{cls}" for cls in mlb.classes_],
        index=df.index,
    )

    # Replace the raw column with its indicator columns
    return pd.concat([df.drop(columns=[feature]), dummies], axis=1)

# Function to apply MultiLabelBinarizer for multiple features
def apply_mlb_for_features(df: pd.DataFrame, features: list, delimiter: str = ",") -> pd.DataFrame:
    """
    Encode several multi-label columns by calling ``apply_mlb_for_feature`` on each in turn.

    Args:
        df (pd.DataFrame): Input frame.
        features (list): Names of the columns to encode, processed in the given order.
        delimiter (str): Delimiter forwarded to every per-column call (default: ",").

    Returns:
        pd.DataFrame: The result of the last per-column call, or ``df`` itself when
        ``features`` is empty.
    """
    encoded = df
    for column_name in features:
        encoded = apply_mlb_for_feature(encoded, column_name, delimiter)
    return encoded

# List of multi-label features
mlb_features = ['genre_names', 'production_country_name', 'spoken_languages']


class MultiLabelColumnsEncoder(TransformerMixin, BaseEstimator):
    """
    Stateful multi-hot encoder for delimited multi-label columns.

    ``fit`` learns the label vocabulary of every column listed in ``features``;
    ``transform`` then always emits exactly those indicator columns (named
    ``f"{feature}_{label}"``), whatever labels the transformed frame contains.
    This keeps the output columns identical between the frame used for fitting
    and any frame transformed afterwards. Labels that were not seen during
    ``fit`` are ignored by the underlying ``MultiLabelBinarizer`` at transform time.

    Args:
        features (list): Names of the columns to encode.
        delimiter (str): Delimiter separating values inside a cell (default: ",").
    """

    def __init__(self, features, delimiter=","):
        # Parameters are stored unmodified so scikit-learn can clone the estimator.
        self.features = features
        self.delimiter = delimiter

    def _to_label_lists(self, column: pd.Series) -> pd.Series:
        """Split every cell into a list of stripped labels; missing cells become []."""
        return column.apply(
            lambda cell: [part.strip() for part in cell.split(self.delimiter)] if pd.notnull(cell) else []
        )

    def fit(self, X: pd.DataFrame, y=None):
        """Learn one label vocabulary per configured column. ``y`` is ignored."""
        self.binarizers_ = {
            feature: MultiLabelBinarizer().fit(self._to_label_lists(X[feature]))
            for feature in self.features
        }
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``X`` with each configured column replaced by its indicator columns."""
        encoded = X.copy()
        for feature in self.features:
            binarizer = self.binarizers_[feature]
            dummies = pd.DataFrame(
                binarizer.transform(self._to_label_lists(encoded[feature])),
                columns=[f"{feature}_{cls}" for cls in binarizer.classes_],
                index=encoded.index,
            )
            # Drop the raw column and append its indicators at the end
            encoded = pd.concat([encoded.drop(columns=[feature]), dummies], axis=1)
        return encoded


# Pipeline step that multi-hot encodes the multi-label features
mlb_transformer = MultiLabelColumnsEncoder(features=mlb_features)

In [83]:
# Names of all numeric columns in the filtered dataset
numeric_selector = make_column_selector(dtype_include=['number'])
num_cols = numeric_selector(data)

for col in num_cols:
    print(col)

metascore
rotten_tomatoes_rating
meta_critic_rating
budget
metascore_missing
rotten_tomatoes_rating_missing
meta_critic_rating_missing
budget_missing
total_wins
total_noms
oscar_wins
oscar_noms
bafta_wins
bafta_noms
release_year
release_month
release_day
is_weekend
is_holiday_season
movie_age
imdb_rating
imdb_votes
tmdb_vote_count
tmdb_vote_average
runtime_in_min
tmdb_popularity
revenue


In [84]:
# Names of all object-dtype columns in the filtered dataset
object_selector = make_column_selector(dtype_include=['object'])
cat_cols = object_selector(data)

for col in cat_cols:
    print(col)

release_date
production_country_name
spoken_languages
title
genre_names
production_company_name
director
writer
actors
age_rating


In [85]:
# ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
# ohe.set_output(transform="default")

# scaler = StandardScaler()

In [86]:
# Numeric features handed to the StandardScaler
features_to_scale = [
    'metascore', 'rotten_tomatoes_rating', 'meta_critic_rating', 'budget',
    'tmdb_vote_count', 'tmdb_vote_average', 'runtime_in_min', 'tmdb_popularity',
    'imdb_rating', 'imdb_votes',
    'total_wins', 'total_noms', 'oscar_wins', 'oscar_noms', 'bafta_wins', 'bafta_noms',
    'release_year', 'release_month', 'release_day', 'movie_age',
    # 'roi', 'revenue'
]

# Single-label categorical features handed to the OneHotEncoder
features_to_one_hot = ['age_rating']

# Every column not listed above is passed through unchanged
modeling_transformer = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(), features_to_scale),
        ('OneHotEncoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False), features_to_one_hot),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
)

# Set output to pandas dataframe
modeling_transformer.set_output(transform='pandas')

# transformed_data = modeling_transformer.fit_transform(data)
# transformed_data.head()

In [87]:
# Define a function transformer to drop unwanted columns.
def drop_unwanted_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    """
    Return ``df`` without the listed columns.

    Args:
        df (pd.DataFrame): Input frame; it is not modified.
        columns (list): Column names to remove. Names that are not present in
            ``df`` are skipped silently.

    Returns:
        pd.DataFrame: A new frame containing the remaining columns.
    """
    return df.drop(labels=columns, axis="columns", errors="ignore")

# Object columns that are removed from the modelling data instead of being encoded
columns_to_drop = [
    'production_company_name',
    'director',
    'writer',
    'actors',
    'title',
    'release_date',
]

# Pipeline step that applies drop_unwanted_columns with the list above
dropper = FunctionTransformer(func=drop_unwanted_columns, kw_args={'columns': columns_to_drop})

In [88]:
# Step order: scale / one-hot encode, expand the multi-label columns, drop unused columns
pipeline_steps = [
    ('modeling_transformer', modeling_transformer),
    ('mlb', mlb_transformer),
    ('dropper', dropper),
]
scaling_pipeline = Pipeline(steps=pipeline_steps)

# Fit on the full filtered dataset and inspect the transformed frame
scaled_data = scaling_pipeline.fit_transform(data)
scaled_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3349 entries, 16 to 5996
Data columns (total 71 columns):
 #   Column                                            Non-Null Count  Dtype  
---  ------                                            --------------  -----  
 0   metascore                                         3349 non-null   float64
 1   rotten_tomatoes_rating                            3349 non-null   float64
 2   meta_critic_rating                                3349 non-null   float64
 3   budget                                            3349 non-null   float64
 4   tmdb_vote_count                                   3349 non-null   float64
 5   tmdb_vote_average                                 3349 non-null   float64
 6   runtime_in_min                                    3349 non-null   float64
 7   tmdb_popularity                                   3349 non-null   float64
 8   imdb_rating                                       3349 non-null   float64
 9   imdb_votes             

In [89]:
# Preview the first rows of the transformed dataset
scaled_data.head()

Unnamed: 0,metascore,rotten_tomatoes_rating,meta_critic_rating,budget,tmdb_vote_count,tmdb_vote_average,runtime_in_min,tmdb_popularity,imdb_rating,imdb_votes,...,production_country_name_Germany,production_country_name_Others,production_country_name_United Kingdom,production_country_name_United States of America,spoken_languages_English,spoken_languages_French,spoken_languages_German,spoken_languages_Italian,spoken_languages_Others,spoken_languages_Spanish
16,0.99074,0.798408,0.990654,-0.701953,-0.531173,0.695176,1.310312,-0.214489,0.701028,-0.436332,...,0,1,0,0,1,1,0,0,1,0
18,1.327854,1.314516,1.327706,-0.776704,-0.617641,0.962158,-0.857842,-0.277085,1.44932,-0.436151,...,0,0,0,1,1,0,0,0,0,0
23,-1.890711,-0.775757,-1.900586,2.568426,-0.643044,-0.500904,0.497254,23.584259,-2.105011,-0.622615,...,0,0,0,1,1,0,0,0,1,1
25,-0.116917,-0.307537,-0.1168,-0.328195,-0.673332,-0.105771,-0.370007,7.040518,-0.688659,-0.635159,...,0,0,0,1,1,0,0,0,0,0
27,1.376013,0.650949,1.375856,-0.421635,-0.649883,-1.114964,0.280439,2.874054,0.166533,-0.660403,...,0,1,0,1,1,0,0,0,0,0


In [90]:
# Save the transformed dataset (output of scaling_pipeline on the full filtered data) for future use
save_dataframe(scaled_data, "03_modeling_data.csv")

✅ Data successfully saved to ./data\03_modeling_data.csv with separator ','


In [91]:
# Separate the regression target from the predictor columns
target = 'revenue'
y = data[target]
X = data.drop(columns=target)

In [92]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [93]:
# Fit the pipeline on the training data only
X_train_scaled = scaling_pipeline.fit_transform(X_train, y_train)

# Transform the test data with the pipeline fitted above
X_test_scaled = scaling_pipeline.transform(X_test)

# Make the test matrix carry exactly the training columns, in the same order:
# columns missing from the test output are added and filled with 0, and columns
# that the training output does not have are dropped.
X_test_scaled = X_test_scaled.reindex(columns=X_train_scaled.columns, fill_value=0)

In [94]:
# Preview the first rows of the transformed training features
X_train_scaled.head()

Unnamed: 0,metascore,rotten_tomatoes_rating,meta_critic_rating,budget,tmdb_vote_count,tmdb_vote_average,runtime_in_min,tmdb_popularity,imdb_rating,imdb_votes,...,production_country_name_Germany,production_country_name_Others,production_country_name_United Kingdom,production_country_name_United States of America,spoken_languages_English,spoken_languages_French,spoken_languages_German,spoken_languages_Italian,spoken_languages_Others,spoken_languages_Spanish
2491,-0.060052,-0.531683,-0.059958,-0.417848,-0.273025,-1.164131,-0.487636,-0.149054,-1.324467,-0.410711,...,0,0,0,1,1,0,0,0,0,0
4545,-0.83163,-1.346512,-0.831412,-0.417848,-0.190626,-0.138577,-0.216469,-0.163389,-0.473689,-0.257585,...,0,0,0,1,1,0,0,0,0,0
635,1.097316,1.0239,1.097223,-0.08616,-0.68673,0.562838,-0.433403,-0.199604,0.589782,-0.573836,...,0,0,0,1,1,0,0,0,0,0
111,0.61508,0.579447,0.615064,2.069807,-0.220701,0.429994,0.705502,0.575164,0.058046,0.029376,...,0,0,0,1,1,0,0,0,0,0
1939,1.097316,1.135013,1.097223,-0.712681,-0.337819,2.156963,0.868202,-0.168752,2.078642,-0.149776,...,0,1,1,1,0,0,0,0,1,0


In [95]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_squared_error
import numpy as np

def model_evaluation(models: dict, X_train, y_train, w_train=None):
    """
    Evaluate regression models with 5-fold cross-validation.

    Every model is cross-validated once and scored with both R² and RMSE on each
    fold. The per-fold scores and their means are printed, and the means are
    collected in the returned report.

    Args:
        models (dict): Maps a display name to a scikit-learn compatible regressor.
        X_train: Training features.
        y_train: Training target.
        w_train: Not used by this function; kept so callers that pass it keep working.

    Returns:
        dict: ``{model_name: {"R² Mean": float, "RMSE Mean": float}}``.
    """
    evaluation_report = {}

    for model_name, model in models.items():
        # One cross-validation run yields both metrics, so each model is fitted
        # only once per fold. The built-in RMSE scorer is used because the
        # `squared=False` argument of mean_squared_error is gone from recent
        # scikit-learn releases.
        cv_results = cross_validate(
            model,
            X_train,
            y_train,
            cv=5,
            scoring={"r2": "r2", "rmse": "neg_root_mean_squared_error"},
            n_jobs=-1,
        )

        r2_scores = cv_results["test_r2"]
        # Scorers follow "greater is better", so the RMSE comes back negated
        rmse_scores = -cv_results["test_rmse"]

        # Store results
        evaluation_report[model_name] = {
            "R² Mean": np.mean(r2_scores),
            "RMSE Mean": np.mean(rmse_scores)
        }

        # Print results
        print(f"{model_name}:")
        print(f"- CV R² scores: {' | '.join([f'{r * 100:.1f}%' for r in r2_scores])}")
        print(f"- Mean R²: {np.mean(r2_scores) * 100:.1f}%")
        print("- -----------------------------------------------------------------------------------------------------------")
        print(f"- CV RMSE scores: {' | '.join([f'{r:,.0f}$' for r in rmse_scores])}")
        print(f"- Mean RMSE: {np.mean(rmse_scores):,.0f}$")
        print()

    return evaluation_report

# Models for preliminary testing
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.neighbors import KNeighborsRegressor

# Seed shared by every model that accepts a random_state
MODEL_SEED = 42

# Candidate regressors, keyed by the display name used in the evaluation printout
models = {
    # "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=1.0),
    "Random Forest Regressor": RandomForestRegressor(n_estimators=100, random_state=MODEL_SEED),
    "Gradient Boosting Regressor": GradientBoostingRegressor(random_state=MODEL_SEED),
    "XGBoost Regressor": XGBRegressor(random_state=MODEL_SEED),
    "LightGBM Regressor": LGBMRegressor(random_state=MODEL_SEED),
    "KNeighbors Regressor": KNeighborsRegressor(),
}


In [96]:
# Keep the returned report under a descriptive name instead of discarding it into `_`
# (assigning to `_` also shadows IPython's last-output variable). The assignment
# still suppresses the cell's rich output, so only the printed summary is shown.
evaluation_report = model_evaluation(models, X_train_scaled, y_train)

Ridge Regression:
- CV R² scores: 73.2% | 74.3% | 74.5% | 69.3% | 74.3%
- Mean R²: 73.1%
- -----------------------------------------------------------------------------------------------------------
- CV RMSE scores: 96,047,171$ | 114,078,706$ | 127,020,676$ | 131,416,581$ | 104,741,217$
- Mean RMSE: 114,660,870$

Random Forest Regressor:
- CV R² scores: 77.8% | 73.9% | 81.4% | 82.9% | 79.0%
- Mean R²: 79.0%
- -----------------------------------------------------------------------------------------------------------
- CV RMSE scores: 87,332,508$ | 114,886,568$ | 108,588,489$ | 98,217,126$ | 94,772,088$
- Mean RMSE: 100,759,356$

Gradient Boosting Regressor:
- CV R² scores: 77.5% | 76.9% | 82.8% | 81.7% | 79.7%
- Mean R²: 79.7%
- -----------------------------------------------------------------------------------------------------------
- CV RMSE scores: 88,012,775$ | 108,159,672$ | 104,444,165$ | 101,428,685$ | 93,100,271$
- Mean RMSE: 99,029,114$

XGBoost Regressor:
- CV R² scores: 78.