In [51]:
# Imports
import os
import re
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt

from sklearn.compose import make_column_selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MultiLabelBinarizer, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [52]:
# Switch the working directory to the project root to ensure that relative paths and
# project-specific imports resolve consistently across environments
from pyprojroot import here
os.chdir(here())

# Project-specific imports (kept after the chdir above, which exists to make them resolve)
from src.utils import save_dataframe

In [53]:
# Load the cleaned dataset; the relative path is resolved against the current working directory
data = pd.read_csv("./data/01_clean_data.csv")

In [54]:
# Show column names, dtypes and non-null counts of the loaded dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5997 entries, 0 to 5996
Data columns (total 39 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   metascore                       5997 non-null   float64
 1   rotten_tomatoes_rating          5997 non-null   float64
 2   meta_critic_rating              5997 non-null   float64
 3   budget                          5997 non-null   float64
 4   revenue                         5997 non-null   float64
 5   metascore_missing               5997 non-null   int64  
 6   rotten_tomatoes_rating_missing  5997 non-null   int64  
 7   meta_critic_rating_missing      5997 non-null   int64  
 8   budget_missing                  5997 non-null   int64  
 9   revenue_missing                 5997 non-null   int64  
 10  total_wins                      5997 non-null   int64  
 11  total_noms                      5997 non-null   int64  
 12  oscar_wins                      59

## Multilabel Binarizer

In [55]:
def apply_mlb_for_feature(df: pd.DataFrame, feature: str, delimiter: str = ",") -> pd.DataFrame:
    """
    Replace one delimiter-separated multi-label column with 0/1 indicator columns.

    Every non-null cell of ``df[feature]`` is split on ``delimiter``; each piece is
    stripped of surrounding whitespace and empty pieces (caused by blank cells or by
    leading, trailing or repeated delimiters) are discarded. A MultiLabelBinarizer is
    then fitted on the resulting label lists and its output is appended to the frame
    as columns named ``f"{feature}_{label}"``.

    Args:
        df (pd.DataFrame): The DataFrame containing the data. It is not modified.
        feature (str): The name of the column to encode.
        delimiter (str): The delimiter used in the column to separate values (default: ",").

    Returns:
        pd.DataFrame: A new DataFrame in which ``feature`` has been removed and the
        indicator columns have been appended after the remaining original columns.

    Note:
        A new MultiLabelBinarizer is fitted on every call, so the set of indicator
        columns depends on the labels present in ``df``.
    """
    def _split_labels(value) -> list:
        # Missing cells carry no labels at all.
        if not pd.notnull(value):
            return []
        stripped = (piece.strip() for piece in value.split(delimiter))
        # Skip empty pieces so that no meaningless "<feature>_" column is created.
        return [piece for piece in stripped if piece]

    # Keep the label lists in a standalone Series instead of a temporary
    # "<feature>_list" column: a real column (or a dummy for a label called "list")
    # with that name would otherwise be overwritten and dropped.
    label_lists = df[feature].apply(_split_labels)

    # Fit the binarizer and build the indicator columns, aligned on the original index
    mlb = MultiLabelBinarizer()
    dummies = pd.DataFrame(
        mlb.fit_transform(label_lists),
        columns=[f"{feature}_{cls}" for cls in mlb.classes_],
        index=df.index
    )

    # Drop the raw column first, then append the indicators; both calls return new objects
    return pd.concat([df.drop(columns=[feature]), dummies], axis=1)

# Function to apply MultiLabelBinarizer for multiple features
def apply_mlb_for_features(df: pd.DataFrame, features: list, delimiter: str = ",") -> pd.DataFrame:
    """
    Encode several multi-label columns by calling ``apply_mlb_for_feature`` on each in turn.

    Args:
        df (pd.DataFrame): The DataFrame containing the data.
        features (list): Names of the columns to encode, processed in the given order.
        delimiter (str): Delimiter forwarded to every per-column call (default: ",").

    Returns:
        pd.DataFrame: The result of the last per-column call, or ``df`` itself when
        ``features`` is empty.
    """
    encoded = df
    for column_name in features:
        # Each call consumes the output of the previous one
        encoded = apply_mlb_for_feature(encoded, column_name, delimiter)
    return encoded

# Columns that hold several delimiter-separated labels per row
mlb_features = [
    'genre_names',
    'production_country_name',
    'spoken_languages',
]

# Wrap the multi-column encoder so it can be used as a pipeline step.
# NOTE(review): the wrapped function fits a new MultiLabelBinarizer on every call, so
# calling transform() on different data may yield a different set of dummy columns —
# confirm this pipeline is only ever applied to the full dataset.
mlb_transformer = FunctionTransformer(
    func=apply_mlb_for_features,
    kw_args={'features': mlb_features},
)

In [56]:
# Build the numeric-dtype selector and resolve it against `data` in one step,
# so `num_cols` only ever holds the list of column names
num_cols = make_column_selector(dtype_include=['number'])(data)

# List the numeric columns, one per line
for col in num_cols:
    print(col)

metascore
rotten_tomatoes_rating
meta_critic_rating
budget
revenue
metascore_missing
rotten_tomatoes_rating_missing
meta_critic_rating_missing
budget_missing
revenue_missing
total_wins
total_noms
oscar_wins
oscar_noms
bafta_wins
bafta_noms
release_year
release_month
release_day
is_weekend
is_holiday_season
movie_age
imdb_rating
imdb_votes
tmdb_vote_count
tmdb_vote_average
runtime_in_min
tmdb_popularity
roi


In [57]:
# Build the object-dtype selector and resolve it against `data` in one step,
# so `cat_cols` only ever holds the list of column names
cat_cols = make_column_selector(dtype_include=['object'])(data)

# List the object-typed columns, one per line
for col in cat_cols:
    print(col)

release_date
production_country_name
spoken_languages
title
genre_names
production_company_name
director
writer
actors
age_rating


In [58]:
# ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
# ohe.set_output(transform="default")

# scaler = StandardScaler()

In [59]:
modeling_transformer = ColumnTransformer(
    transformers=[
        # Standardize the listed numeric features
        (
            'scaler',
            StandardScaler(),
            [
                'metascore',
                'rotten_tomatoes_rating',
                'meta_critic_rating',
                'budget',
                'revenue',
                'tmdb_vote_count',
                'tmdb_vote_average',
                'runtime_in_min',
                'tmdb_popularity',
                'imdb_rating',
                'imdb_votes',
                'total_wins',
                'total_noms',
                'oscar_wins',
                'oscar_noms',
                'bafta_wins',
                'bafta_noms',
                'release_year',
                'release_month',
                'release_day',
                'movie_age',
                'roi',
            ],
        ),
        # One-hot encode the age rating as dense output (required for the pandas output below)
        (
            'OneHotEncoder',
            OneHotEncoder(handle_unknown='ignore', sparse_output=False),
            ['age_rating'],
        ),
    ],
    # Every column not listed above is passed through unchanged
    remainder='passthrough',
    # Keep the plain column names instead of prefixing them with the transformer name
    verbose_feature_names_out=False,
)

# Return a pandas DataFrame (with column names) instead of a NumPy array
modeling_transformer.set_output(transform='pandas')

# transformed_data = modeling_transformer.fit_transform(data)
# transformed_data.head()

In [60]:
# Define a function transformer to drop unwanted columns.
def drop_unwanted_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    """
    Return a copy of ``df`` without the given columns.

    Args:
        df (pd.DataFrame): The DataFrame to trim. It is not modified.
        columns (list): Column names to remove; names that are not present are ignored.

    Returns:
        pd.DataFrame: A new DataFrame without the listed columns.
    """
    trimmed = df.drop(labels=columns, axis=1, errors='ignore')
    return trimmed

# Columns removed from the modeling data without being encoded.
# NOTE(review): `release_date` is an object column that is neither scaled, encoded nor
# listed here, so it appears to pass through to the output as raw text — confirm that
# this is intended.
columns_to_drop = [
    'production_company_name',
    'director',
    'writer',
    'actors',
    'title',
]

dropper = FunctionTransformer(
    func=drop_unwanted_columns,
    kw_args={'columns': columns_to_drop},
)

In [61]:
# Step order matters: scale/encode first (pandas output), then expand the multi-label
# columns that were passed through, and finally remove the columns in `columns_to_drop`.
scaling_pipeline = Pipeline(
    steps=[
        ('modeling_transformer', modeling_transformer),
        ('mlb', mlb_transformer),
        ('dropper', dropper),
    ]
)

# NOTE(review): every step is fitted on the full `data` frame here — confirm that no
# train/test split is expected to happen before scaling.
scaled_data = scaling_pipeline.fit_transform(data)
scaled_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5997 entries, 0 to 5996
Data columns (total 78 columns):
 #   Column                                            Non-Null Count  Dtype  
---  ------                                            --------------  -----  
 0   metascore                                         5997 non-null   float64
 1   rotten_tomatoes_rating                            5997 non-null   float64
 2   meta_critic_rating                                5997 non-null   float64
 3   budget                                            5997 non-null   float64
 4   revenue                                           5997 non-null   float64
 5   tmdb_vote_count                                   5997 non-null   float64
 6   tmdb_vote_average                                 5997 non-null   float64
 7   runtime_in_min                                    5997 non-null   float64
 8   tmdb_popularity                                   5997 non-null   float64
 9   imdb_rating        

In [62]:
# Preview the first rows of the transformed data
scaled_data.head()

Unnamed: 0,metascore,rotten_tomatoes_rating,meta_critic_rating,budget,revenue,tmdb_vote_count,tmdb_vote_average,runtime_in_min,tmdb_popularity,imdb_rating,...,production_country_name_Germany,production_country_name_Others,production_country_name_United Kingdom,production_country_name_United States of America,spoken_languages_English,spoken_languages_French,spoken_languages_German,spoken_languages_Italian,spoken_languages_Others,spoken_languages_Spanish
0,-1.384054,-1.146597,-1.384434,-0.321319,-0.32392,-0.515205,-0.728048,1.273265,0.010508,-1.305179,...,0,1,0,0,1,0,0,0,1,0
1,-1.321501,1.286749,-1.328722,-0.475382,-0.391516,-0.480231,1.631306,0.106625,-0.137201,0.98221,...,0,1,0,0,0,0,0,0,1,0
2,0.085001,-0.234011,0.085235,-0.366579,-0.354981,-0.426263,0.039484,-0.220035,-0.217086,-0.057512,...,0,0,0,1,1,1,0,0,0,0
3,-1.347472,0.759854,-1.348417,-0.448828,-0.381871,-0.523949,0.920533,0.15329,-0.281684,0.878238,...,0,0,0,0,0,1,0,0,0,0
4,-1.370788,-0.766353,-1.362506,-0.538551,-0.405179,-0.532692,1.335903,-1.853332,-0.248635,2.021932,...,0,0,0,1,1,0,0,0,0,0


In [63]:
# Save the scaled and encoded modeling dataset for future use
save_dataframe(scaled_data, "03_modeling_data.csv")

✅ Data successfully saved to ./data\03_modeling_data.csv with separator ','
