In [1]:
# Imports
import os
import re
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt

In [2]:
# Directory config to project root to insure consistency across environments for project specific imports
from pyprojroot import here
os.chdir(here())

# Project specific imports


## Data Quering and Loading

In [3]:
# DB related imports
# from src.utils import save_dataframe
# from database.db_utils import init_db
# from config.config_loader import load_config
# from database.queries import prepped_data_query
# from sqlalchemy import text

# # Initialize local PostgreSQL session
# Session = init_db(load_config("DB_URL"))
# session = Session()

# # Execute and fetch results
# data = session.execute(text(prepped_data_query))

# # Close the session
# session.close()

# # Convert to DataFrame
# data = pd.DataFrame(data.fetchall(), columns=data.keys())

In [4]:
# Saving cleaned dataset for future use
# save_dataframe(data, "00_base_data.csv")

In [5]:
# Load dataset
data = pd.read_csv("./data/00_base_data.csv")

## Initial Data Overview

In [158]:
# data["missing_budget"] = (data["budget"] == 0).astype(int)
# data["missing_revenue"] = (data["revenue"] == 0).astype(int)

In [159]:
# data["budget"] = data["budget"].replace(0, np.nan)
# data["revenue"] = data["revenue"].replace(0, np.nan)

In [160]:
# data = data[(data["budget"] != 0) & (data["revenue"] != 0)]

In [161]:
# data = data[(data["budget"] != 0) | (data["revenue"] != 0)]

In [162]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3003 entries, 0 to 3002
Data columns (total 22 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   title                    3003 non-null   object 
 1   release_date             3003 non-null   object 
 2   tmdb_vote_count          3003 non-null   int64  
 3   tmdb_vote_average        3003 non-null   float64
 4   genre_names              3003 non-null   object 
 5   budget                   1594 non-null   float64
 6   revenue                  1816 non-null   float64
 7   runtime_in_min           3001 non-null   float64
 8   tmdb_popularity          3003 non-null   float64
 9   production_company_name  2990 non-null   object 
 10  production_country_name  2998 non-null   object 
 11  spoken_languages         3001 non-null   object 
 12  director                 3001 non-null   object 
 13  writer                   2972 non-null   object 
 14  actors                  

In [163]:
# Count missing values
data.isnull().sum()

title                         0
release_date                  0
tmdb_vote_count               0
tmdb_vote_average             0
genre_names                   0
budget                     1409
revenue                    1187
runtime_in_min                2
tmdb_popularity               0
production_company_name      13
production_country_name       5
spoken_languages              2
director                      2
writer                       31
actors                        6
imdb_rating                   8
imdb_votes                    6
metascore                   781
age_rating                  276
awards                      564
rotten_tomatoes_rating      467
meta_critic_rating          781
dtype: int64

In [164]:
data.head(5)

Unnamed: 0,title,release_date,tmdb_vote_count,tmdb_vote_average,genre_names,budget,revenue,runtime_in_min,tmdb_popularity,production_company_name,...,director,writer,actors,imdb_rating,imdb_votes,metascore,age_rating,awards,rotten_tomatoes_rating,meta_critic_rating
0,Carter,2022-08-05,378,6.0,"Action, Thriller, Crime",,,132.0,27.043,Apeitda,...,Jung Byung-gil,"Jung Byung-gil, Byeong-sik Jung","Joo Won, Kim Bo-min, Lee Sung-jae",5.1,10297,,TV-MA,,32.0,
1,The Shadow in My Eye,2021-10-28,494,7.829,"War, Drama, History",,,107.0,16.008,Miso Film,...,Ole Bornedal,Ole Bornedal,"Bertram Bisgaard Enevoldsen, Ester Birch, Ella...",7.3,17277,,TV-MA,2 wins & 13 nominations,,
2,Noelle,2019-11-12,673,6.595,"Family, Comedy, Fantasy",,,100.0,10.04,Walt Disney Pictures,...,Marc Lawrence,Marc Lawrence,"Anna Kendrick, Shirley MacLaine, Bill Hader",6.3,28616,48.0,G,1 win total,56.0,48.0
3,Play,2019-09-04,349,7.278,Comedy,,,108.0,5.214,"Chapter 2, Moonshaker",...,Anthony Marciano,"Max Boublil, Anthony Marciano","Max Boublil, Alice Isaaz, Malik Zidi",7.2,1755,,,1 nomination,,
4,Dave Chappelle: Sticks & Stones,2019-08-26,320,7.6,Comedy,,,65.0,7.683,,...,Stan Lathan,Dave Chappelle,Dave Chappelle,8.3,28457,,TV-MA,Won 3 Primetime Emmys. 4 wins & 5 nominations ...,42.0,


## Multilable Categorical Features

There are a nuymber of multilable categorical features that will need to be looked into.

In [165]:
def count_unique_values_for_feature(df: pd.DataFrame, feature: str, delimiter: str = ",") -> int:
    """
    Splits the specified feature column by the delimiter and returns the number of unique values.

    Args:
        df (pd.DataFrame): The DataFrame containing the data.
        feature (str): The name of the column to process.
        delimiter (str): The delimiter used to separate multiple values in the column.

    Returns:
        int: The number of unique values.
    """
    return len(df[feature].dropna().str.split(rf"{delimiter}\s*").explode().unique())

# List of features you want to analyze:
features = [
    "genre_names", 
    "production_company_name", "production_country_name", 
            "spoken_languages", "director", "writer", "actors"]

# Create a dictionary with the counts for each feature:
unique_counts = {feature: count_unique_values_for_feature(data, feature) for feature in features}

# Display the results:
for feature, count in unique_counts.items():
    print(f"{feature}: {count} unique values")

genre_names: 19 unique values
production_company_name: 4358 unique values
production_country_name: 83 unique values
spoken_languages: 89 unique values
director: 2251 unique values
writer: 4593 unique values
actors: 4961 unique values


In [166]:
def print_top_categories(df: pd.DataFrame, column: str, top_n: int, delimiter: str = ",", others_label: str = "Others") -> None:
    """
    Prints the top_n unique values from a multi-label column and the total count of values 
    that fall outside the top_n (which would be grouped as 'Others').

    Args:
        df (pd.DataFrame): The DataFrame containing your data.
        column (str): The name of the multi-label column.
        top_n (int): The number of top categories to display.
        delimiter (str): The delimiter separating multiple values (default is a comma).
        others_label (str): The label used for less frequent values.
    """
    # Split the column into individual values and count frequencies
    exploded = df[column].dropna().str.split(rf"{delimiter}\s*").explode().str.strip()
    counts = exploded.value_counts()
    
    # Get the top N categories and the sum for the rest
    top_categories = counts.head(top_n)
    others_count = counts[counts.index.difference(top_categories.index)].sum()
    
    print("--------------------------------------------------||")
    print(f"Top {top_n} unique values for '{column}':")
    print(top_categories)
    print(f"Total count of all other values (will be grouped as '{others_label}'): {others_count}")
    print("--------------------------------------------------||\n")


top_values = {
    "genre_names": 20,
    "production_company_name": 20,
    "production_country_name": 10,
    "spoken_languages": 10,
    "director": 20,
    "writer": 20,
    "actors": 20
}

for feature, top_n in top_values.items():
    print_top_categories(data, feature, top_n)

--------------------------------------------------||


Top 20 unique values for 'genre_names':
genre_names
Drama              1292
Comedy              976
Thriller            864
Action              735
Horror              514
Adventure           487
Romance             401
Crime               393
Science Fiction     388
Fantasy             371
Family              301
Mystery             300
Animation           294
History             189
War                  92
Music                89
Documentary          59
TV Movie             29
Western              27
Name: count, dtype: int64
Total count of all other values (will be grouped as 'Others'): 0
--------------------------------------------------||

--------------------------------------------------||
Top 20 unique values for 'production_company_name':
production_company_name
Universal Pictures          109
Lionsgate                    91
Columbia Pictures            81
Paramount Pictures           77
Warner Bros. Pictures        76
Walt Disney Pictures         63
Canal+                   

In [167]:
def group_top_categories(df: pd.DataFrame, column: str, top_n: int, delimiter: str = ",", others_label: str = "Others") -> pd.Series:
    """
    Processes a multi-label column by keeping only the top_n categories (based on frequency) 
    and replacing any other category with the 'others_label'.

    Args:
        df (pd.DataFrame): The DataFrame containing your data.
        column (str): The name of the multi-label column.
        top_n (int): The number of top categories to keep.
        delimiter (str): The delimiter separating multiple values in the column.
        others_label (str): The label to use for all categories not in the top_n.

    Returns:
        pd.Series: A new Series with the modified values.
    """
    # Split the column and count frequencies
    exploded = df[column].dropna().str.split(rf"{delimiter}\s*").explode().str.strip()
    counts = exploded.value_counts()
    top_categories = counts.head(top_n).index.tolist()
    
    def map_categories(cell):
        if pd.isna(cell):
            return cell
        cats = [cat.strip() for cat in cell.split(delimiter)]
        # Replace any category not in top_categories with others_label
        new_cats = [cat if cat in top_categories else others_label for cat in cats]
        # Remove duplicates while preserving order
        seen = set()
        new_cats = [x for x in new_cats if x not in seen and not seen.add(x)]
        return delimiter.join(new_cats)
    
    return df[column].apply(map_categories)

top_values = {
    # "genre_names": 20,  # Keeping all genres
    # "production_company_name": 20,  # Too granular
    "production_country_name": 5,
    "spoken_languages": 5,
    # "director": 20,  # Too granular
    # "writer": 20,  # Too granular
    # "actors": 20  # Too granular
}

for feature, top_n in top_values.items():
    data[feature] = group_top_categories(data, feature, top_n)


In [168]:
data.head()

Unnamed: 0,title,release_date,tmdb_vote_count,tmdb_vote_average,genre_names,budget,revenue,runtime_in_min,tmdb_popularity,production_company_name,...,director,writer,actors,imdb_rating,imdb_votes,metascore,age_rating,awards,rotten_tomatoes_rating,meta_critic_rating
0,Carter,2022-08-05,378,6.0,"Action, Thriller, Crime",,,132.0,27.043,Apeitda,...,Jung Byung-gil,"Jung Byung-gil, Byeong-sik Jung","Joo Won, Kim Bo-min, Lee Sung-jae",5.1,10297,,TV-MA,,32.0,
1,The Shadow in My Eye,2021-10-28,494,7.829,"War, Drama, History",,,107.0,16.008,Miso Film,...,Ole Bornedal,Ole Bornedal,"Bertram Bisgaard Enevoldsen, Ester Birch, Ella...",7.3,17277,,TV-MA,2 wins & 13 nominations,,
2,Noelle,2019-11-12,673,6.595,"Family, Comedy, Fantasy",,,100.0,10.04,Walt Disney Pictures,...,Marc Lawrence,Marc Lawrence,"Anna Kendrick, Shirley MacLaine, Bill Hader",6.3,28616,48.0,G,1 win total,56.0,48.0
3,Play,2019-09-04,349,7.278,Comedy,,,108.0,5.214,"Chapter 2, Moonshaker",...,Anthony Marciano,"Max Boublil, Anthony Marciano","Max Boublil, Alice Isaaz, Malik Zidi",7.2,1755,,,1 nomination,,
4,Dave Chappelle: Sticks & Stones,2019-08-26,320,7.6,Comedy,,,65.0,7.683,,...,Stan Lathan,Dave Chappelle,Dave Chappelle,8.3,28457,,TV-MA,Won 3 Primetime Emmys. 4 wins & 5 nominations ...,42.0,


## Handling Nulls

### Simple Imputer

#### Categorical Imputer

In [6]:
from sklearn.impute import SimpleImputer
# Create a transformer for categorical features
cat_imputer = SimpleImputer(strategy="constant", fill_value="Unknown")

# List of your categorical columns where you want to replace nulls with "Unknown"
cat_cols = ['production_company_name', 'production_country_name', 'spoken_languages', 'director', 'writer', 'actors', 'age_rating']

In [7]:
# Apply imputer
# data[cat_cols] = cat_imputer.fit_transform(data[cat_cols])
# data.head(30)

In [8]:
zero_imputer = SimpleImputer(strategy='constant', fill_value=0)
zero_cols = ['imdb_rating', 'imdb_votes']

In [9]:
median_imputer = SimpleImputer(strategy='median')
median_cols = []

In [10]:
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import RandomForestRegressor


# Initialize the IterativeImputer (using BayesianRidge as the default estimator)
iter_imputer = IterativeImputer(
    # estimator=BayesianRidge(),
    estimator=RandomForestRegressor(n_estimators=100, random_state=42),
    max_iter=10,
    initial_strategy='median',
    n_nearest_features=None,
    imputation_order='ascending',
    random_state=42
    )

iter_cols = ['metascore', 'rotten_tomatoes_rating', 'meta_critic_rating']

In [11]:
# Create missing indicator columns for each score column
for col in iter_cols:
    data[col + "_missing"] = data[col].isnull().astype(int)

# data.head(20)

In [12]:
from sklearn.compose import ColumnTransformer
iter_impute = ColumnTransformer(
    transformers=[
        ('iter_impute', iter_imputer, iter_cols)
    ],
    remainder="passthrough"  # Unlisted columns are kept unchanged (if any)
)

In [13]:
# from sklearn.compose import ColumnTransformer

# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', numeric_pipeline, numeric_features),
#         ('cat', categorical_pipeline, categorical_features),
#         ('drop_cols', 'drop', ['id', 'unnecessary_column'])
#     ],
#     remainder="passthrough"  # Unlisted columns are kept unchanged (if any)
# )

In [14]:
columns_to_drop = []

In [15]:
# First, build a ColumnTransformer that imputes the groups that do not use IterativeImputer.
process_nulls = ColumnTransformer(
    transformers=[
        ('cat', cat_imputer, cat_cols),
        ('zero', zero_imputer, zero_cols),
        # ('median', median_imputer, median_cols),
        # ('drop_cols', 'drop', columns_to_drop)
    ],
    remainder='passthrough'  # Keep other features (including those for iterative imputer) as is.
)

In [16]:
from sklearn.pipeline import Pipeline

# Then, create a pipeline that applies the preprocessor and then the iterative imputer to the remaining columns.
pipeline = Pipeline(steps=[
    ('preprocess', process_nulls),
    ('iter_impute', iter_impute)
    # ...followed by scaling and model training as needed.
])

In [None]:
# Now, you can fit the pipeline on your training data:
pipeline.fit(data)
# And transform the data:
data = pipeline.transform(data)
data.head()

ValueError: Passing extra keyword arguments to ColumnTransformer.transform is only supported if enable_metadata_routing=True, which you can set using `sklearn.set_config`. See the User Guide <https://scikit-learn.org/stable/metadata_routing.html> for more details. Extra parameters passed are: {'copy'}

In [None]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.preprocessing import FunctionTransformer
# from sklearn.preprocessing import StandardScaler  # Uncomment if needed later
# from sklearn.preprocessing import OneHotEncoder   # Uncomment if needed later

# Define your column groups
# zero_cols = ['some_numeric_col1', 'some_numeric_col2']  # Impute missing values with 0
# median_cols = ['budget', 'revenue', 'runtime_in_min']    # Impute missing values with the median
# cat_cols = ['production_company_name', 'production_country_name', 'spoken_languages', 
#             'director', 'writer', 'actors']              # Impute missing categorical values with "Unknown"
            
# Columns you want to drop after imputation (for example, identifiers or irrelevant features)
# drop_cols = ['unnecessary_column1', 'unnecessary_column2']

# Create individual pipelines for each group:

zero_pipeline = Pipeline(steps=[
    ('zero_imputer', SimpleImputer(strategy='constant', fill_value=0))
    # ('scaler', StandardScaler())  # Uncomment to scale these columns
])

median_pipeline = Pipeline(steps=[
    ('median_imputer', SimpleImputer(strategy='median'))
    # ('scaler', StandardScaler())  # Uncomment to scale these columns
])

cat_pipeline = Pipeline(steps=[
    ('cat_imputer', SimpleImputer(strategy='constant', fill_value='Unknown'))
    # ('onehot', OneHotEncoder(handle_unknown='ignore'))  # Uncomment if you want one-hot encoding here
])

# impute_pipeline = Pipeline(steps=[
#     ('iter_impute', iter_imputer),
# ])

# Combine them in a ColumnTransformer.
# Any columns not mentioned in the transformers will be left unchanged due to remainder='passthrough'.
preprocessor = ColumnTransformer(
    transformers=[
        ('zero', zero_pipeline, zero_cols),
        ('median', median_pipeline, median_cols),
        ('cat', cat_pipeline, cat_cols)
    ],
    remainder='passthrough'
)

# Set output to pandas dataframe
preprocessor.set_output(transform='pandas')

# Define a function transformer to drop unwanted columns.
def drop_unwanted_columns(X):
    # Assuming X is a pandas DataFrame; errors='ignore' ensures that missing columns are skipped.
    return X.drop(columns=columns_to_drop, errors='ignore')

dropper = FunctionTransformer(drop_unwanted_columns)


iter_cols = ['remainder__metascore', 'remainder__rotten_tomatoes_rating', 'remainder__meta_critic_rating']

# IterativeImputer applied only to selected columns
iter_imputer = ColumnTransformer(
    transformers=[
        ('iter', iter_imputer, iter_cols)
    ],
    remainder='passthrough'
)

# Now, build the full pipeline.
# Note: The IterativeImputer should be applied after the initial imputations, so it comes after the ColumnTransformer and dropper.
full_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('dropper', dropper),
    # ('iter_imputer', iter_imputer),
    # ('scaler', StandardScaler()),  # Optionally, scale after imputation.
    # ('encoder', OneHotEncoder(handle_unknown='ignore'))  # Optionally, if you need additional encoding.
])

# Example usage:
# Assume 'data' is your DataFrame with the full set of features.
# You can then fit_transform the pipeline on your training data:
transformed_data = full_pipeline.fit_transform(data)

# Column adjustment
# new_columns = [col.split("__", 1)[-1] for col in transformed_data.columns]
# transformed_data.columns = new_columns
transformed_data.head()

# If you need the result back as a DataFrame, you can do:
# (Here, we assume that the preprocessor returns a DataFrame; if it returns an array, you'll have to set column names accordingly.)
# transformed_df = pd.DataFrame(transformed_data)
# transformed_df.head()


ValueError: A given column is not a column of the dataframe

## Iterative Imputer

In [None]:
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import RandomForestRegressor


# Sample data with missing values
data = {
    'feature1': [1.0, 2.0, None, 4.0],
    'feature2': [None, 5.0, 6.0, 7.0],
    'feature3': [10.0, None, None, 13.0]
}
df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)

# Initialize the IterativeImputer (using BayesianRidge as the default estimator)
imputer = IterativeImputer(
    # estimator=BayesianRidge(),
    estimator=RandomForestRegressor(n_estimators=100, random_state=42),
    max_iter=10,
    initial_strategy='median',
    n_nearest_features=None,
    imputation_order='ascending',
    random_state=42
    )

# Fit the imputer and transform the data
imputed_array = imputer.fit_transform(df)
imputed_df = pd.DataFrame(imputed_array, columns=df.columns)

print("\nDataFrame after Iterative Imputation:")
print(imputed_df)


Original DataFrame:
   feature1  feature2  feature3
0       1.0       NaN      10.0
1       2.0       5.0       NaN
2       NaN       6.0       NaN
3       4.0       7.0      13.0

DataFrame after Iterative Imputation:
   feature1  feature2  feature3
0      1.00      5.75     10.00
1      2.00      5.00     10.66
2      1.57      6.00     10.66
3      4.00      7.00     13.00


## Pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler  # Optional

# Define lists of columns for each imputation strategy:
iterative_cols = ['feature1', 'feature2']
median_cols = ['feature3', 'feature4']
zero_cols = ['feature5', 'feature6']

# Create the imputer transformers:
iterative_imputer = IterativeImputer(random_state=42)
median_imputer = SimpleImputer(strategy='median')
zero_imputer = SimpleImputer(strategy='constant', fill_value=0)

# Optionally, you can also scale the numeric features after imputation.
# For example, define a numeric transformer pipeline that applies imputation then scaling:
iterative_pipeline = Pipeline(steps=[
    ('imputer', iterative_imputer),
    ('scaler', StandardScaler())
])
median_pipeline = Pipeline(steps=[
    ('imputer', median_imputer),
    ('scaler', StandardScaler())
])
zero_pipeline = Pipeline(steps=[
    ('imputer', zero_imputer),
    ('scaler', StandardScaler())
])

# Combine all these using a ColumnTransformer:
preprocessor = ColumnTransformer(
    transformers=[
        ('iterative', iterative_pipeline, iterative_cols),
        ('median', median_pipeline, median_cols),
        ('zero', zero_pipeline, zero_cols)
    ],
    remainder='passthrough'  # Include the rest of the columns as they are
)

# Define a complete pipeline with a model at the end
# pipeline = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('classifier', LogisticRegression())
# ])

# Split the data into training and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Now, you can fit the pipeline on your training data:
# pipeline.fit(X_train, y_train)
# And transform the data:
# X_train_processed = pipeline.transform(X_train)


### Custom Top N Transformer

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class TopCategoriesTransformer(BaseEstimator, TransformerMixin):
    """
    A custom transformer that groups multi-label categorical features by keeping the top N categories,
    replacing any other category with a specified label (e.g., "Others").
    """
    def __init__(self, column: str, top_n: int, delimiter: str = ",", others_label: str = "Others"):
        self.column = column
        self.top_n = top_n
        self.delimiter = delimiter
        self.others_label = others_label
        self.top_categories_ = None

    def fit(self, X, y=None):
        # Split and explode the column to get individual categories.
        exploded = X[self.column].dropna().str.split(rf"{self.delimiter}\s*").explode().str.strip()
        counts = exploded.value_counts()
        self.top_categories_ = counts.head(self.top_n).index.tolist()
        return self

    def transform(self, X):
        def map_categories(cell):
            if pd.isna(cell):
                return cell
            cats = [cat.strip() for cat in cell.split(self.delimiter)]
            # Replace categories not in top_categories_ with others_label.
            new_cats = [cat if cat in self.top_categories_ else self.others_label for cat in cats]
            # Remove duplicates while preserving order.
            seen = set()
            new_cats = [x for x in new_cats if x not in seen and not seen.add(x)]
            return self.delimiter.join(new_cats)
        
        X_transformed = X.copy()
        X_transformed[self.column] = X_transformed[self.column].apply(map_categories)
        return X_transformed

# Usage
# genre_transformer = Pipeline(steps=[
#     ('top_cat', TopCategoriesTransformer(column='genre_names', top_n=20, delimiter=","))
# ])

## Save cleanded data

In [None]:
# Saving cleaned dataset for future use
save_dataframe(data, "01_clean_data.csv")

✅ Data successfully saved to ./data\01_clean_data.csv with separator ','
