In [None]:
# Imports
import os
import re
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt

In [None]:
# Change the working directory to the project root to ensure that relative paths
# and project-specific imports behave consistently across environments
from pyprojroot import here
os.chdir(here())

# Project specific imports


In [None]:
# Read the cleaned dataset from disk into a DataFrame
data = pd.read_csv(filepath_or_buffer="./data/01_clean_data.csv")

In [None]:
# Print a concise summary of the loaded DataFrame (columns, dtypes, non-null counts)
data.info()

## MultiLabel Binarizer

In [None]:
# from sklearn.preprocessing import MultiLabelBinarizer

# # Ensure genre_names is a list of genres
# data["genre_list"] = data["genre_names"].apply(lambda x: x.split(", "))

# # Use MultiLabelBinarizer to create one-hot encoded columns
# mlb = MultiLabelBinarizer()
# genre_dummies = pd.DataFrame(mlb.fit_transform(data["genre_list"]), columns=mlb.classes_)

# # Concatenate with original DataFrame and drop unnecessary columns
# data = pd.concat([data, genre_dummies], axis=1).drop(["genre_names", "genre_list"], axis=1)

In [None]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

def apply_mlb_for_feature(df: pd.DataFrame, feature: str, delimiter: str = ",") -> pd.DataFrame:
    """
    One-hot encode a delimited multi-label column using MultiLabelBinarizer.

    Each cell of ``feature`` is split on ``delimiter`` and every piece is stripped
    of surrounding whitespace. One 0/1 column named ``"<feature>_<label>"`` is
    created per distinct label, and the original column is dropped from the result.
    Missing cells and empty pieces (e.g. from a trailing delimiter) contribute no labels.

    Args:
        df (pd.DataFrame): The DataFrame containing the data. It is not modified.
        feature (str): The name of the column to encode.
        delimiter (str): The delimiter used in the column to separate values
            (default: ","). Whitespace around each value is stripped, so
            ", "-separated data is handled by the default as well.

    Returns:
        pd.DataFrame: A new DataFrame with the specified feature replaced by
        one-hot encoded columns appended after the remaining original columns.

    Raises:
        KeyError: If ``feature`` is not a column of ``df``.
    """
    if feature not in df.columns:
        raise KeyError(f"Column '{feature}' not found in DataFrame")

    def _split_labels(value) -> list:
        # Missing cells carry no labels at all
        if pd.isnull(value):
            return []
        # str() lets non-string cells (e.g. numeric codes) be split like text;
        # empty pieces are skipped so they never become a "<feature>_" column
        pieces = (piece.strip() for piece in str(value).split(delimiter))
        return [piece for piece in pieces if piece]

    # Keep the label lists in a standalone Series rather than a temporary
    # "<feature>_list" column, which would overwrite (and then drop) any
    # existing column that happens to have that name
    labels = df[feature].apply(_split_labels)

    # Fit the binarizer and build the dummy columns, aligned on the input index
    mlb = MultiLabelBinarizer()
    dummies = pd.DataFrame(
        mlb.fit_transform(labels),
        columns=[f"{feature}_{cls}" for cls in mlb.classes_],
        index=df.index,
    )

    # drop() and concat() both return new objects, so the caller's frame is untouched
    return pd.concat([df.drop(columns=[feature]), dummies], axis=1)

# Encode every multi-label feature in turn.
# The guard keeps this cell safe to re-run: a feature whose source column is gone
# but whose "<feature>_" dummy columns are present was already encoded, and calling
# the encoder again would raise a KeyError on the missing column. A feature that is
# missing without any dummy columns is still passed through so the error surfaces.
features_to_encode = ['genre_names', 'production_country_name', 'spoken_languages']
for feat in features_to_encode:
    already_encoded = feat not in data.columns and any(
        str(col).startswith(f"{feat}_") for col in data.columns
    )
    if not already_encoded:
        data = apply_mlb_for_feature(data, feat)


In [None]:
# Preview the first rows to check the transformed data
data.head()