In [1]:
# Imports
import os
import re
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt

from sqlalchemy import text

In [2]:
# Set the working directory to the project root to ensure project-specific imports resolve consistently across environments
from pyprojroot import here
os.chdir(here())

# Project specific imports
from src.utils import save_dataframe
from database.db_utils import init_db
from config.config_loader import load_config
from database.queries import prepped_data_query

## Data Querying and Loading

In [3]:
# Initialize local PostgreSQL session
Session = init_db(load_config("DB_URL"))
session = Session()

try:
    # Execute the query and materialize every row while the session is still open,
    # so the fetch never depends on a connection that has already been released.
    result = session.execute(text(prepped_data_query))

    # Convert to DataFrame (column names come from the result's keys)
    data = pd.DataFrame(result.fetchall(), columns=result.keys())
finally:
    # Close the session even when the query or the fetch raises
    session.close()

In [None]:
# Saving cleaned dataset for future use
# save_dataframe(data, "00_base_data.csv")

✅ Data successfully saved to ./data\00_base_data.csv with separator ','


In [5]:
# Load dataset
# data = pd.read_csv("./data/00_base_data.csv")

## Initial Data Overview

In [6]:
# data["missing_budget"] = (data["budget"] == 0).astype(int)
# data["missing_revenue"] = (data["revenue"] == 0).astype(int)

In [7]:
# data["budget"] = data["budget"].replace(0, np.nan)
# data["revenue"] = data["revenue"].replace(0, np.nan)

In [8]:
# data = data[(data["budget"] != 0) & (data["revenue"] != 0)]

In [9]:
# data = data[(data["budget"] != 0) | (data["revenue"] != 0)]

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3003 entries, 0 to 3002
Data columns (total 22 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   title                    3003 non-null   object 
 1   release_date             3003 non-null   object 
 2   tmdb_vote_count          3003 non-null   int64  
 3   tmdb_vote_average        3003 non-null   float64
 4   genre_names              3003 non-null   object 
 5   budget                   1594 non-null   float64
 6   revenue                  1816 non-null   float64
 7   runtime_in_min           3001 non-null   float64
 8   tmdb_popularity          3003 non-null   float64
 9   production_company_name  2990 non-null   object 
 10  production_country_name  2998 non-null   object 
 11  spoken_languages         3001 non-null   object 
 12  director                 3001 non-null   object 
 13  writer                   2972 non-null   object 
 14  actors                  

In [11]:
# Count missing values
data.isnull().sum()

title                         0
release_date                  0
tmdb_vote_count               0
tmdb_vote_average             0
genre_names                   0
budget                     1409
revenue                    1187
runtime_in_min                2
tmdb_popularity               0
production_company_name      13
production_country_name       5
spoken_languages              2
director                      2
writer                       31
actors                        5
imdb_rating                   8
imdb_votes                    5
metascore                   781
age_rating                  269
awards                      529
rotten_tomatoes_rating      467
meta_critic_rating          781
dtype: int64

In [14]:
data.head(5)

Unnamed: 0,title,release_date,tmdb_vote_count,tmdb_vote_average,genre_names,budget,revenue,runtime_in_min,tmdb_popularity,production_company_name,...,director,writer,actors,imdb_rating,imdb_votes,metascore,age_rating,awards,rotten_tomatoes_rating,meta_critic_rating
0,The Gorge,2025-02-13,670,7.9,"Action, Horror, Romance, Science Fiction, Thri...",0,0,127,463.558,"Skydance Media, Crooked Highway, Apple Studios...",...,Scott Derrickson,Zach Dean,"Anya Taylor-Joy, Sigourney Weaver, Miles Teller",,,,,,,
1,Captain America: Brave New World,2025-02-12,500,6.204,"Action, Thriller, Science Fiction",180000000,192400000,119,3181.462,"Marvel Studios, Kevin Feige Productions",...,Julius Onah,"Rob Edwards, Malcolm Spellman, Dalan Musson","Harrison Ford, Liv Tyler, Rosa Salazar",,,,PG-13,,,
2,Back in Action,2025-01-15,985,6.5,"Action, Comedy",0,0,114,1040.048,"Chernin Entertainment, Exhibit A, Good One",...,Seth Gordon,"Seth Gordon, Brendan O'Brien","Jamie Foxx, Cameron Diaz, McKenna Roberts",,,,PG-13,,28.0,
3,Wolf Man,2025-01-15,376,6.5,"Horror, Thriller",25000000,34355215,103,1120.683,"Universal Pictures, Cloak & Co., Blumhouse Pro...",...,Leigh Whannell,"Leigh Whannell, Corbett Tuck","Christopher Abbott, Julia Garner, Matilda Firth",5.8,11007.0,50.0,R,,51.0,50.0
4,Your Fault,2024-12-26,1056,7.136,"Romance, Drama",0,0,118,481.299,"Pokeepsie Films, Amazon MGM Studios",...,Domingo González,"Mercedes Ron, Domingo González, Sofía Cuenca","Nicole Wallace, Gabriel Guevara, Marta Hazas",5.3,2511.0,,,,20.0,


## Multilabel Categorical Features

There are a number of multilabel categorical features that need to be examined.

In [None]:
def count_unique_values_for_feature(df: pd.DataFrame, feature: str, delimiter: str = ",") -> int:
    """
    Count the distinct labels stored in a delimiter-separated multi-label column.

    Null cells are ignored. Each remaining cell is split on the delimiter, the
    resulting labels are stripped of surrounding whitespace, and empty labels
    (e.g. produced by a trailing delimiter) are discarded before counting.

    Args:
        df (pd.DataFrame): The DataFrame containing the data.
        feature (str): The name of the column to process.
        delimiter (str): The literal delimiter used to separate multiple values in the column.

    Returns:
        int: The number of unique values.
    """
    # re.escape keeps regex metacharacters in the delimiter (e.g. "|" or ".") literal;
    # the trailing \s* swallows whitespace that follows each delimiter. The pattern is
    # always longer than one character, so pandas treats it as a regular expression.
    pattern = rf"{re.escape(delimiter)}\s*"
    labels = df[feature].dropna().astype(str).str.split(pattern).explode().str.strip()
    return int(labels[labels != ""].nunique())

# Multi-label columns whose cardinality we want to inspect.
features = [
    "genre_names",
    "production_company_name",
    "production_country_name",
    "spoken_languages",
    "director",
    "writer",
    "actors",
]

# Map every column name to its number of distinct labels.
unique_counts = dict(
    (name, count_unique_values_for_feature(data, name)) for name in features
)

# Report one line per column.
for feature, count in unique_counts.items():
    print(f"{feature}: {count} unique values")

production_company_name: 21 unique values
production_country_name: 11 unique values
spoken_languages: 11 unique values
director: 21 unique values
writer: 21 unique values
actors: 21 unique values


In [None]:
def print_top_categories(df: pd.DataFrame, column: str, top_n: int, delimiter: str = ",", others_label: str = "Others") -> None:
    """
    Prints the top_n most frequent values of a multi-label column and the total count of
    the values that fall outside the top_n (which would be grouped as 'Others').

    Null cells are ignored; labels are stripped of surrounding whitespace and empty
    labels are discarded before counting.

    Args:
        df (pd.DataFrame): The DataFrame containing your data.
        column (str): The name of the multi-label column.
        top_n (int): The number of top categories to display.
        delimiter (str): The literal delimiter separating multiple values (default is a comma).
        others_label (str): The label used for less frequent values (only used in the printed message).
    """
    # Escape the delimiter so characters such as "|" are matched literally, then
    # allow optional whitespace after it.
    pattern = rf"{re.escape(delimiter)}\s*"
    exploded = df[column].dropna().astype(str).str.split(pattern).explode().str.strip()
    counts = exploded[exploded != ""].value_counts()

    # value_counts() is sorted by frequency, so everything after the head is "the rest".
    top_categories = counts.head(top_n)
    others_count = counts.iloc[len(top_categories):].sum()

    print("--------------------------------------------------||")
    print(f"Top {top_n} unique values for '{column}':")
    print(top_categories)
    print(f"Total count of all other values (will be grouped as '{others_label}'): {others_count}")
    print("--------------------------------------------------||\n")


In [None]:
# How many of the most frequent labels to list for each multi-label column.
top_values = dict(
    genre_names=20,
    production_company_name=20,
    production_country_name=10,
    spoken_languages=10,
    director=20,
    writer=20,
    actors=20,
)

# Print the frequency summary for every configured column.
for feature, top_n in top_values.items():
    print_top_categories(data, feature, top_n)

--------------------------------------------------||
Top 20 unique values for 'production_company_name':
production_company_name
Universal Pictures          68
Columbia Pictures           56
Lionsgate                   52
Paramount Pictures          51
Walt Disney Pictures        49
Warner Bros. Pictures       47
Bron Studios                38
Canal+                      35
StudioCanal                 35
Blumhouse Productions       34
France 2 Cinéma             33
Ingenious Media             31
FilmNation Entertainment    30
STXfilms                    29
TSG Entertainment           28
Film4 Productions           27
A24                         26
Voltage Pictures            26
Focus Features              26
Warner Bros. Animation      26
Name: count, dtype: int64
Total count of all other values (will be grouped as 'Others'): 6663
--------------------------------------------------||

--------------------------------------------------||
Top 10 unique values for 'production_country_name'

In [49]:
def group_top_categories(df: pd.DataFrame, column: str, top_n: int, delimiter: str = ",", others_label: str = "Others") -> pd.Series:
    """
    Processes a multi-label column by keeping only the top_n categories (based on frequency)
    and replacing any other category with the 'others_label'.

    Null cells are returned unchanged. Within a cell, labels are stripped, empty labels are
    dropped, duplicates (including repeated 'others_label') are collapsed while keeping
    first-seen order, and the result is re-joined with the bare delimiter (no added space).

    Args:
        df (pd.DataFrame): The DataFrame containing your data.
        column (str): The name of the multi-label column.
        top_n (int): The number of top categories to keep.
        delimiter (str): The literal delimiter separating multiple values in the column.
        others_label (str): The label to use for all categories not in the top_n.

    Returns:
        pd.Series: A new Series (same index as df[column]) with the modified values.
    """
    # Escape the delimiter so regex metacharacters such as "|" are matched literally.
    pattern = rf"{re.escape(delimiter)}\s*"
    exploded = df[column].dropna().astype(str).str.split(pattern).explode().str.strip()
    counts = exploded[exploded != ""].value_counts()
    # A set gives constant-time membership checks inside the per-cell mapping.
    top_categories = set(counts.head(top_n).index)

    def map_categories(cell):
        if pd.isna(cell):
            return cell
        cats = [cat.strip() for cat in str(cell).split(delimiter)]
        # Replace any category not in top_categories with others_label; skip empty labels.
        new_cats = [cat if cat in top_categories else others_label for cat in cats if cat]
        # dict.fromkeys removes duplicates while preserving first-seen order.
        return delimiter.join(dict.fromkeys(new_cats))

    return df[column].apply(map_categories)


In [None]:
# Number of most-frequent labels to keep per multi-label column; every other label
# is replaced by group_top_categories' default others_label ("Others").
# Entries that are commented out are intentionally left ungrouped for the reason noted.
top_values = {
    # "genre_names": 20,  # Keeping all genres
    # "production_company_name": 20,  # Too granular
    "production_country_name": 5,
    "spoken_languages": 5,
    # "director": 20,  # Too granular
    # "writer": 20,  # Too granular
    # "actors": 20  # Too granular
}

# Overwrites each listed column of `data` in place with its grouped version.
for feature, top_n in top_values.items():
    data[feature] = group_top_categories(data, feature, top_n)

In [51]:
data.head()

Unnamed: 0,title,release_date,tmdb_vote_count,tmdb_vote_average,budget,revenue,runtime_in_min,tmdb_popularity,production_company_name,production_country_name,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,The Gorge,2025-02-13,509,7.864,0,0,127,463.558,Others,United States of America,...,0,1,0,0,1,1,0,1,0,0
1,Captain America: Brave New World,2025-02-12,430,6.2,180000000,192400000,119,3181.462,Others,United States of America,...,0,0,0,0,0,1,0,1,0,0
2,Back in Action,2025-01-15,976,6.485,0,0,114,1040.048,Others,United States of America,...,0,0,0,0,0,0,0,0,0,0
3,Wolf Man,2025-01-15,369,6.5,25000000,34355215,103,1120.683,"Universal Pictures,Others,Blumhouse Productions",United States of America,...,0,1,0,0,0,0,0,1,0,0
4,Your Fault,2024-12-26,1050,7.141,0,0,118,481.299,Others,"Spain,United States of America",...,0,0,0,0,1,0,0,0,0,0


In [None]:
# Split the GENRE column on commas and explode it into individual rows.
# The pattern is a raw string: "\s" inside a normal string literal is an invalid
# escape sequence and triggers a warning on recent Python versions.
unique_genres = data["genre_names"].str.split(r",\s*").explode()

# Count the occurrences of each unique genre.
genre_counts = unique_genres.value_counts()
print(genre_counts)

# Number of unique genres (which equals the number of columns after one-hot encoding).
num_unique_genres = genre_counts.shape[0]
print(f"One-hot encoding would create {num_unique_genres} new columns.")


genre_names
Drama              838
Comedy             649
Thriller           576
Action             502
Horror             359
Adventure          333
Romance            277
Fantasy            272
Science Fiction    258
Crime              255
Family             218
Animation          212
Mystery            204
History            131
Music               66
War                 56
Documentary         39
Western             19
TV Movie            19
Name: count, dtype: int64
One-hot encoding would create 19 new columns.


In [10]:
from sklearn.preprocessing import MultiLabelBinarizer

# Turn each comma-separated genre string into a list of genre labels.
# Splitting on r",\s*" matches the split used when counting genres and tolerates
# both "a, b" and "a,b".
genre_lists = data["genre_names"].str.split(r",\s*")

# Use MultiLabelBinarizer to create one-hot encoded columns
mlb = MultiLabelBinarizer()
genre_dummies = pd.DataFrame(
    mlb.fit_transform(genre_lists),
    columns=mlb.classes_,
    # Reuse data's index so the column-wise concat below aligns row-for-row even
    # when `data` does not carry a default RangeIndex (e.g. after filtering rows).
    index=data.index,
)

# Concatenate with original DataFrame and drop the source column.
# "genre_list" is dropped too, if present, in case it lingers from an earlier run.
data = pd.concat([data, genre_dummies], axis=1).drop(
    columns=["genre_names", "genre_list"], errors="ignore"
)

In [11]:
data.head() # Check the transformed data

Unnamed: 0,title,release_date,tmdb_vote_count,tmdb_vote_average,budget,revenue,runtime_in_min,tmdb_popularity,production_company_name,production_country_name,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,The Gorge,2025-02-13,509,7.864,0,0,127,463.558,"Skydance Media, Crooked Highway, Apple Studios...",United States of America,...,0,1,0,0,1,1,0,1,0,0
1,Captain America: Brave New World,2025-02-12,430,6.2,180000000,192400000,119,3181.462,"Marvel Studios, Kevin Feige Productions",United States of America,...,0,0,0,0,0,1,0,1,0,0
2,Back in Action,2025-01-15,976,6.485,0,0,114,1040.048,"Chernin Entertainment, Exhibit A, Good One",United States of America,...,0,0,0,0,0,0,0,0,0,0
3,Wolf Man,2025-01-15,369,6.5,25000000,34355215,103,1120.683,"Universal Pictures, Cloak & Co., Blumhouse Pro...",United States of America,...,0,1,0,0,0,0,0,1,0,0
4,Your Fault,2024-12-26,1050,7.141,0,0,118,481.299,"Pokeepsie Films, Amazon MGM Studios","Spain, United States of America",...,0,0,0,0,1,0,0,0,0,0


## Handling Nulls

### Simple Imputer

#### Categorical Imputer

In [None]:
from sklearn.impute import SimpleImputer
# Create a transformer for categorical features: missing values are replaced
# by the constant string "Unknown".
cat_imputer = SimpleImputer(strategy="constant", fill_value="Unknown")

# List of your categorical columns where you want to replace nulls with "Unknown"
cat_cols = ['production_company_name', 'production_country_name', 'spoken_languages', 'director', 'writer', 'actors', 'age_rating']

In [None]:
# Imputer that replaces missing values with the constant 0, and the columns it is meant for.
# NOTE(review): a filled-in 0 is indistinguishable from a genuine 0 rating/vote count — confirm this is intended.
zero_imputer = SimpleImputer(strategy='constant', fill_value=0)
zero_cols = ['imdb_rating', 'imdb_votes']

In [None]:
# Imputer that replaces missing values with the column median.
# No columns are assigned to it yet (empty list).
median_imputer = SimpleImputer(strategy='median')
median_cols = []

In [None]:
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import RandomForestRegressor


# Initialize the IterativeImputer with a RandomForestRegressor as the estimator
# (the BayesianRidge alternative is left commented out below).
iter_imputer = IterativeImputer(
    # estimator=BayesianRidge(),
    estimator=RandomForestRegressor(n_estimators=100, random_state=42),
    max_iter=10,
    initial_strategy='median',
    n_nearest_features=None,
    imputation_order='ascending',
    random_state=42
    )

# Score columns intended for iterative imputation.
iter_cols = ['metascore', 'rotten_tomatoes_rating', 'meta_critic_rating']


In [None]:
from sklearn.compose import ColumnTransformer

# NOTE(review): this looks like a template. `numeric_pipeline`, `numeric_features`,
# `categorical_pipeline` and `categorical_features` are not defined anywhere in the
# visible notebook, so this cell presumably raises NameError on a fresh run — confirm
# and remove or complete it. `preprocessor` is reassigned in a later cell.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numeric_features),
        ('cat', categorical_pipeline, categorical_features),
        ('drop_cols', 'drop', ['id', 'unnecessary_column'])
    ],
    remainder="passthrough"  # Unlisted columns are kept unchanged (if any)
)

In [None]:
columns_to_drop = []

In [None]:
# First, build a ColumnTransformer that imputes the groups that do not use IterativeImputer.
# Each entry is (name, transformer, columns): constant "Unknown" for cat_cols,
# constant 0 for zero_cols, median for median_cols, and 'drop' for columns_to_drop.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_imputer, cat_cols),
        ('zero', zero_imputer, zero_cols),
        ('median', median_imputer, median_cols),
        ('drop_cols', 'drop', columns_to_drop)
    ],
    remainder='passthrough'  # Keep other features (including those for iterative imputer) as is.
)

In [None]:
# Imported here because this cell runs before the notebook's later
# `from sklearn.pipeline import Pipeline`; without it a fresh Restart & Run All
# fails with NameError on `Pipeline`.
from sklearn.pipeline import Pipeline

# Then, create a pipeline that applies the preprocessor and then the iterative imputer to the remaining columns.
# NOTE(review): this step builds a fresh default IterativeImputer rather than reusing the
# configured `iter_imputer` — confirm which one is intended.
# NOTE(review): the preprocessor passes untouched columns through (remainder='passthrough');
# presumably text columns such as `title` must be dropped first, since IterativeImputer
# expects numeric input — verify before fitting.
pipeline = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('iter_impute', IterativeImputer(random_state=42))
    # ...followed by scaling and model training as needed.
])

In [None]:


# Work on a copy so this imputation experiment never alters `data` itself.
df = data.copy()

# Score columns to impute (lower-case, matching the column names in `data`).
score_cols = ["meta_critic_rating", "rotten_tomatoes_rating", "metascore"]

# Create missing indicator columns for each score
for col in score_cols:
    df[col + "_missing"] = df[col].isnull().astype(int)

# Use SimpleImputer with strategy='median'
median_imputer = SimpleImputer(strategy='median')
df[score_cols] = median_imputer.fit_transform(df[score_cols])

print(df.head())


## Iterative Imputer

In [34]:
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import RandomForestRegressor


# Sample data with missing values.
# Stored under its own name so the movie DataFrame `data` (saved at the end of the
# notebook) is not overwritten by this toy dictionary.
sample_data = {
    'feature1': [1.0, 2.0, None, 4.0],
    'feature2': [None, 5.0, 6.0, 7.0],
    'feature3': [10.0, None, None, 13.0]
}
df = pd.DataFrame(sample_data)
print("Original DataFrame:")
print(df)

# Initialize the IterativeImputer with a RandomForestRegressor as the estimator
# (the BayesianRidge alternative is left commented out below).
imputer = IterativeImputer(
    # estimator=BayesianRidge(),
    estimator=RandomForestRegressor(n_estimators=100, random_state=42),
    max_iter=10,
    initial_strategy='median',
    n_nearest_features=None,
    imputation_order='ascending',
    random_state=42
    )

# Fit the imputer and transform the data
imputed_array = imputer.fit_transform(df)
imputed_df = pd.DataFrame(imputed_array, columns=df.columns)

print("\nDataFrame after Iterative Imputation:")
print(imputed_df)


Original DataFrame:
   feature1  feature2  feature3
0       1.0       NaN      10.0
1       2.0       5.0       NaN
2       NaN       6.0       NaN
3       4.0       7.0      13.0

DataFrame after Iterative Imputation:
   feature1  feature2  feature3
0      1.00      5.75     10.00
1      2.00      5.00     10.66
2      1.57      6.00     10.66
3      4.00      7.00     13.00


## Pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler  # Optional

# NOTE(review): template section. The 'featureN' column names are placeholders, and the
# assignments below rebind `median_cols`, `zero_cols`, `median_imputer`, `zero_imputer`
# and `preprocessor`, which earlier cells of this notebook define for the real data —
# confirm this cell is meant to stay in the notebook.

# Define lists of columns for each imputation strategy:
iterative_cols = ['feature1', 'feature2']
median_cols = ['feature3', 'feature4']
zero_cols = ['feature5', 'feature6']

# Create the imputer transformers:
iterative_imputer = IterativeImputer(random_state=42)
median_imputer = SimpleImputer(strategy='median')
zero_imputer = SimpleImputer(strategy='constant', fill_value=0)

# Optionally, you can also scale the numeric features after imputation.
# For example, define a numeric transformer pipeline that applies imputation then scaling:
iterative_pipeline = Pipeline(steps=[
    ('imputer', iterative_imputer),
    ('scaler', StandardScaler())
])
median_pipeline = Pipeline(steps=[
    ('imputer', median_imputer),
    ('scaler', StandardScaler())
])
zero_pipeline = Pipeline(steps=[
    ('imputer', zero_imputer),
    ('scaler', StandardScaler())
])

# Combine all these using a ColumnTransformer:
# each column group gets its own impute-then-scale pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('iterative', iterative_pipeline, iterative_cols),
        ('median', median_pipeline, median_cols),
        ('zero', zero_pipeline, zero_cols)
    ],
    remainder='passthrough'  # Include the rest of the columns as they are
)

# Define a complete pipeline with a model at the end
# pipeline = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('classifier', LogisticRegression())
# ])

# Split the data into training and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Now, you can fit the pipeline on your training data:
# pipeline.fit(X_train, y_train)
# And transform the data:
# X_train_processed = pipeline.transform(X_train)


### Custom Top N Transformer

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted

class TopCategoriesTransformer(BaseEstimator, TransformerMixin):
    """
    A custom transformer that groups multi-label categorical features by keeping the top N categories,
    replacing any other category with a specified label (e.g., "Others").

    Parameters:
        column (str): Name of the multi-label column to process.
        top_n (int): Number of most frequent categories to keep.
        delimiter (str): Literal delimiter separating the labels inside a cell.
        others_label (str): Replacement label for every category outside the top N.

    Attributes:
        top_categories_ (list): Categories kept as-is; learned by ``fit`` and only
            available afterwards.
    """
    def __init__(self, column: str, top_n: int, delimiter: str = ",", others_label: str = "Others"):
        # Only constructor parameters are stored here; the learned attribute
        # `top_categories_` is created in fit(), following scikit-learn convention.
        self.column = column
        self.top_n = top_n
        self.delimiter = delimiter
        self.others_label = others_label

    def fit(self, X, y=None):
        """Learn the top_n most frequent categories of X[self.column]; returns self."""
        # Escape the delimiter so regex metacharacters such as "|" are matched literally.
        pattern = rf"{re.escape(self.delimiter)}\s*"
        # Split and explode the column to get individual categories.
        exploded = X[self.column].dropna().astype(str).str.split(pattern).explode().str.strip()
        counts = exploded[exploded != ""].value_counts()
        self.top_categories_ = counts.head(self.top_n).index.tolist()
        return self

    def transform(self, X):
        """Return a copy of X whose self.column has non-top categories replaced by others_label."""
        # Fail with a clear NotFittedError instead of an obscure error when fit() was skipped.
        check_is_fitted(self, "top_categories_")
        # A set gives constant-time membership checks inside the per-cell mapping.
        top_categories = set(self.top_categories_)

        def map_categories(cell):
            if pd.isna(cell):
                return cell
            cats = [cat.strip() for cat in str(cell).split(self.delimiter)]
            # Replace categories not in top_categories_ with others_label; skip empty labels.
            new_cats = [cat if cat in top_categories else self.others_label for cat in cats if cat]
            # dict.fromkeys removes duplicates while preserving first-seen order.
            return self.delimiter.join(dict.fromkeys(new_cats))

        X_transformed = X.copy()
        X_transformed[self.column] = X_transformed[self.column].apply(map_categories)
        return X_transformed

# Usage
# genre_transformer = Pipeline(steps=[
#     ('top_cat', TopCategoriesTransformer(column='genre_names', top_n=20, delimiter=","))
# ])

## Save cleaned data

In [12]:
# Saving cleaned dataset for future use
save_dataframe(data, "01_clean_data.csv")

✅ Data successfully saved to ./data\01_clean_data.csv with separator ','
