In [1]:
# Imports
import os
import re
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt

from sqlalchemy import text

In [2]:
# Set the working directory to the project root to ensure project-specific imports
# resolve consistently across environments
from pyprojroot import here
os.chdir(here())

# Project specific imports
from src.utils import save_dataframe
from database.db_utils import init_db
from config.config_loader import load_config
from database.queries import prepped_data_query

In [14]:
# Initialize local PostgreSQL session
Session = init_db(load_config("DB_URL"))
session = Session()

try:
    # Execute the query and materialize rows and column names while the session is
    # still open; reading from a result after its session has been closed relies on
    # driver-side buffering and is not guaranteed to work.
    result = session.execute(text(prepped_data_query))
    rows = result.fetchall()
    columns = list(result.keys())
finally:
    # Always release the connection, even if the query fails
    session.close()

# Convert to DataFrame (kept under the name `data`, which the following cells use)
data = pd.DataFrame(rows, columns=columns)

In [4]:
# Alternative: load the dataset from a local CSV file instead of querying the database
# data = pd.read_csv("./data/01_clean_data.csv")

In [5]:
# Summarize the loaded DataFrame: column names, non-null counts, and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2004 entries, 0 to 2003
Data columns (total 23 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   title                    2004 non-null   object 
 1   release_date             2004 non-null   object 
 2   tmdb_vote_count          2004 non-null   int64  
 3   tmdb_vote_average        2004 non-null   float64
 4   genre_names              2004 non-null   object 
 5   budget                   2004 non-null   int64  
 6   revenue                  2004 non-null   int64  
 7   runtime_in_min           2004 non-null   int64  
 8   tmdb_popularity          2004 non-null   float64
 9   production_company_name  1996 non-null   object 
 10  production_country_name  2002 non-null   object 
 11  spoken_languages         2002 non-null   object 
 12  director                 2002 non-null   object 
 13  writer                   1981 non-null   object 
 14  actors                  

In [9]:
def count_unique_values_for_feature(df: pd.DataFrame, feature: str, delimiter: str = ",") -> int:
    """
    Count the distinct tokens in a delimiter-separated, multi-valued column.

    Each non-null cell is split on ``delimiter`` (followed by optional whitespace),
    the resulting tokens are stripped of surrounding whitespace, and empty tokens
    (e.g. produced by a trailing delimiter) are ignored.

    Args:
        df (pd.DataFrame): The DataFrame containing the data.
        feature (str): The name of the column to process.
        delimiter (str): The literal delimiter separating multiple values in a cell.
            It is treated as plain text, so regex metacharacters such as "|" or "."
            are safe to use.

    Returns:
        int: The number of unique non-empty tokens; 0 if the column has no
        non-null values.
    """
    values = df[feature].dropna()
    if values.empty:
        # An all-null column may not have a string dtype, so avoid touching `.str`
        return 0

    # Escape the delimiter so it is matched literally inside the regex
    pattern = rf"{re.escape(delimiter)}\s*"
    tokens = (
        values.str.split(pattern, regex=True)
        .explode(ignore_index=True)
        .str.strip()
    )
    return int(tokens[tokens != ""].nunique())

# Multi-valued (comma-separated) columns to profile:
features = [
    "genre_names",
    "production_company_name",
    "production_country_name",
    "spoken_languages",
    "director",
    "writer",
    "actors",
]

# Map each column name to its number of distinct tokens, preserving the order above:
unique_counts = dict(
    zip(features, (count_unique_values_for_feature(data, column) for column in features))
)

# Report one line per column:
for feature, count in unique_counts.items():
    print(f"{feature}: {count} unique values")

genre_names: 19 unique values
production_company_name: 3237 unique values
production_country_name: 77 unique values
spoken_languages: 83 unique values
director: 1711 unique values
writer: 3353 unique values
actors: 3775 unique values


In [None]:
# Split the genre_names column on commas (plus any following whitespace) and explode it
# so that each row holds a single genre. A raw string is used for the pattern because
# "\s" is not a valid escape sequence in a regular string literal.
unique_genres = data["genre_names"].str.split(r",\s*", regex=True).explode()

# Count the occurrences of each genre.
genre_counts = unique_genres.value_counts()
print(genre_counts)

# Number of distinct genres (which equals the number of columns after one-hot encoding).
num_unique_genres = genre_counts.shape[0]
print(f"One-hot encoding would create {num_unique_genres} new columns.")


genre_names
Drama              838
Comedy             649
Thriller           576
Action             502
Horror             359
Adventure          333
Romance            277
Fantasy            272
Science Fiction    258
Crime              255
Family             218
Animation          212
Mystery            204
History            131
Music               66
War                 56
Documentary         39
Western             19
TV Movie            19
Name: count, dtype: int64
One-hot encoding would create 19 new columns.


In [10]:
from sklearn.preprocessing import MultiLabelBinarizer

# Turn each genre_names string into a list of genres. The split tolerates any amount of
# whitespace after the comma, matching how genres are tokenized when they are counted.
# NOTE: genre_names is expected to contain no nulls (see data.info()); a null would
# not produce a list and the binarizer would fail on it.
genre_lists = data["genre_names"].str.split(r",\s*", regex=True)

# Use MultiLabelBinarizer to create one indicator column per genre. The original index
# is reused so the concat below aligns row-for-row even if `data` is not 0..n-1 indexed.
mlb = MultiLabelBinarizer()
genre_dummies = pd.DataFrame(
    mlb.fit_transform(genre_lists),
    columns=mlb.classes_,
    index=data.index,
)

# Replace the raw genre_names column with the indicator columns.
# This cell consumes genre_names, so re-running it requires reloading `data` first.
data = pd.concat([data.drop(columns=["genre_names"]), genre_dummies], axis=1)

In [11]:
# Preview the first rows to confirm the genre indicator columns were appended
data.head() # Check the transformed data

Unnamed: 0,title,release_date,tmdb_vote_count,tmdb_vote_average,budget,revenue,runtime_in_min,tmdb_popularity,production_company_name,production_country_name,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,The Gorge,2025-02-13,509,7.864,0,0,127,463.558,"Skydance Media, Crooked Highway, Apple Studios...",United States of America,...,0,1,0,0,1,1,0,1,0,0
1,Captain America: Brave New World,2025-02-12,430,6.2,180000000,192400000,119,3181.462,"Marvel Studios, Kevin Feige Productions",United States of America,...,0,0,0,0,0,1,0,1,0,0
2,Back in Action,2025-01-15,976,6.485,0,0,114,1040.048,"Chernin Entertainment, Exhibit A, Good One",United States of America,...,0,0,0,0,0,0,0,0,0,0
3,Wolf Man,2025-01-15,369,6.5,25000000,34355215,103,1120.683,"Universal Pictures, Cloak & Co., Blumhouse Pro...",United States of America,...,0,1,0,0,0,0,0,1,0,0
4,Your Fault,2024-12-26,1050,7.141,0,0,118,481.299,"Pokeepsie Films, Amazon MGM Studios","Spain, United States of America",...,0,0,0,0,1,0,0,0,0,0


## Simple Imputer

In [None]:
from sklearn.impute import SimpleImputer

# NOTE(review): `df` is not assigned in any cell above this one — confirm which
# DataFrame this is meant to operate on before running.
# NOTE(review): these column names are upper-case, whereas the columns listed by
# data.info() are lower-case — TODO confirm the names exist in the target frame.
score_cols = ["META_CRITIC_RATING", "ROTTEN_TOMATOES_RATING", "METASCORE"]

# Record which values were originally missing before they are overwritten below
for col in score_cols:
    was_missing = df[col].isna()
    df[f"{col}_missing"] = was_missing.astype(int)

# Fill the gaps in every score column with that column's median
imputer = SimpleImputer(strategy="median")
df[score_cols] = imputer.fit_transform(df[score_cols])

print(df.head())


## Iterative Imputer

In [34]:
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import RandomForestRegressor


# Toy data with missing values. It is deliberately NOT named `data`, so that this demo
# does not overwrite the cleaned movie DataFrame that the save cell writes to disk.
sample_data = {
    'feature1': [1.0, 2.0, None, 4.0],
    'feature2': [None, 5.0, 6.0, 7.0],
    'feature3': [10.0, None, None, 13.0]
}
df = pd.DataFrame(sample_data)
print("Original DataFrame:")
print(df)

# Initialize the IterativeImputer with a RandomForestRegressor as the per-feature
# estimator (BayesianRidge is left commented out as an alternative)
imputer = IterativeImputer(
    # estimator=BayesianRidge(),
    estimator=RandomForestRegressor(n_estimators=100, random_state=42),
    max_iter=10,
    initial_strategy='median',
    n_nearest_features=None,
    imputation_order='ascending',
    random_state=42
    )

# Fit the imputer and transform the data; the result is a plain array, so the
# column labels are restored when wrapping it back into a DataFrame
imputed_array = imputer.fit_transform(df)
imputed_df = pd.DataFrame(imputed_array, columns=df.columns)

print("\nDataFrame after Iterative Imputation:")
print(imputed_df)


Original DataFrame:
   feature1  feature2  feature3
0       1.0       NaN      10.0
1       2.0       5.0       NaN
2       NaN       6.0       NaN
3       4.0       7.0      13.0

DataFrame after Iterative Imputation:
   feature1  feature2  feature3
0      1.00      5.75     10.00
1      2.00      5.00     10.66
2      1.57      6.00     10.66
3      4.00      7.00     13.00


## Save cleaned data

In [12]:
# Save the cleaned dataset for future use.
# NOTE(review): `data` must still refer to the cleaned DataFrame when this cell runs —
# confirm that no earlier cell has rebound the name to something else.
save_dataframe(data, "01_clean_data.csv")

✅ Data successfully saved to ./data\01_clean_data.csv with separator ','
