In [215]:
# Imports
import os
import re
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer, FunctionTransformer

from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import RandomForestRegressor

In [216]:
# Directory config to project root to insure consistency across environments for project specific imports
from pyprojroot import here
os.chdir(here())

# Project specific imports
from src.utils import save_dataframe


## Data Quering and Loading

In [217]:
# DB related imports
from database.db_utils import init_db
from config.config_loader import load_config
from database.queries import prepped_data_query
from sqlalchemy import text

# Initialize local PostgreSQL session
Session = init_db(load_config("DB_URL"))
session = Session()

# Execute and fetch results
data = session.execute(text(prepped_data_query))

# Close the session
session.close()

# Convert to DataFrame
data = pd.DataFrame(data.fetchall(), columns=data.keys())

In [218]:
# Saving cleaned dataset for future use
save_dataframe(data, "00_base_data.csv")

✅ Data successfully saved to ./data\00_base_data.csv with separator ','


In [219]:
# Load dataset
data = pd.read_csv("./data/00_base_data.csv")

## Initial Data Overview

In [220]:
# data["missing_budget"] = (data["budget"] == 0).astype(int)
# data["missing_revenue"] = (data["revenue"] == 0).astype(int)

In [221]:
# data["budget"] = data["budget"].replace(0, np.nan)
# data["revenue"] = data["revenue"].replace(0, np.nan)

In [222]:
# data = data[(data["budget"] != 0) & (data["revenue"] != 0)]

In [223]:
# data = data[(data["budget"] != 0) | (data["revenue"] != 0)]

In [224]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4999 entries, 0 to 4998
Data columns (total 22 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   title                    4999 non-null   object 
 1   release_date             4999 non-null   object 
 2   tmdb_vote_count          4999 non-null   int64  
 3   tmdb_vote_average        4999 non-null   float64
 4   genre_names              4999 non-null   object 
 5   budget                   2955 non-null   float64
 6   revenue                  3347 non-null   float64
 7   runtime_in_min           4997 non-null   float64
 8   tmdb_popularity          4999 non-null   float64
 9   production_company_name  4981 non-null   object 
 10  production_country_name  4994 non-null   object 
 11  spoken_languages         4997 non-null   object 
 12  director                 4996 non-null   object 
 13  writer                   4947 non-null   object 
 14  actors                  

In [225]:
# Count missing values
data.isnull().sum()

title                         0
release_date                  0
tmdb_vote_count               0
tmdb_vote_average             0
genre_names                   0
budget                     2044
revenue                    1652
runtime_in_min                2
tmdb_popularity               0
production_company_name      18
production_country_name       5
spoken_languages              2
director                      3
writer                       52
actors                        8
imdb_rating                   9
imdb_votes                    7
metascore                  1244
age_rating                  397
awards                      845
rotten_tomatoes_rating      770
meta_critic_rating         1244
dtype: int64

In [226]:
data.head(5)

Unnamed: 0,title,release_date,tmdb_vote_count,tmdb_vote_average,genre_names,budget,revenue,runtime_in_min,tmdb_popularity,production_company_name,...,director,writer,actors,imdb_rating,imdb_votes,metascore,age_rating,awards,rotten_tomatoes_rating,meta_critic_rating
0,Carter,2022-08-05,378,6.0,"Action, Thriller, Crime",,,132.0,27.043,Apeitda,...,Jung Byung-gil,"Jung Byung-gil, Byeong-sik Jung","Joo Won, Kim Bo-min, Lee Sung-jae",5.1,10297,,TV-MA,,32.0,
1,The Shadow in My Eye,2021-10-28,494,7.829,"War, Drama, History",,,107.0,16.008,Miso Film,...,Ole Bornedal,Ole Bornedal,"Bertram Bisgaard Enevoldsen, Ester Birch, Ella...",7.3,17277,,TV-MA,2 wins & 13 nominations,,
2,Noelle,2019-11-12,673,6.595,"Family, Comedy, Fantasy",,,100.0,10.04,Walt Disney Pictures,...,Marc Lawrence,Marc Lawrence,"Anna Kendrick, Shirley MacLaine, Bill Hader",6.3,28616,48.0,G,1 win total,56.0,48.0
3,Play,2019-09-04,349,7.278,Comedy,,,108.0,5.214,"Chapter 2, Moonshaker",...,Anthony Marciano,"Max Boublil, Anthony Marciano","Max Boublil, Alice Isaaz, Malik Zidi",7.2,1755,,,1 nomination,,
4,Dave Chappelle: Sticks & Stones,2019-08-26,320,7.6,Comedy,,,65.0,7.683,,...,Stan Lathan,Dave Chappelle,Dave Chappelle,8.3,28457,,TV-MA,Won 3 Primetime Emmys. 4 wins & 5 nominations ...,42.0,


## Multilable Categorical Features

There are a nuymber of multilable categorical features that will need to be looked into.

In [227]:
def count_unique_values_for_feature(df: pd.DataFrame, feature: str, delimiter: str = ",") -> int:
    """
    Splits the specified feature column by the delimiter and returns the number of unique values.

    Args:
        df (pd.DataFrame): The DataFrame containing the data.
        feature (str): The name of the column to process.
        delimiter (str): The delimiter used to separate multiple values in the column.

    Returns:
        int: The number of unique values.
    """
    return len(df[feature].dropna().str.split(rf"{delimiter}\s*").explode().unique())

# List of features you want to analyze:
features = [
    "genre_names", 
    "production_company_name", "production_country_name", 
            "spoken_languages", "director", "writer", "actors"]

# Create a dictionary with the counts for each feature:
unique_counts = {feature: count_unique_values_for_feature(data, feature) for feature in features}

# Display the results:
for feature, count in unique_counts.items():
    print(f"{feature}: {count} unique values")

genre_names: 19 unique values
production_company_name: 6322 unique values
production_country_name: 92 unique values
spoken_languages: 95 unique values
director: 3094 unique values
writer: 6793 unique values
actors: 6948 unique values


In [228]:
def print_top_categories(df: pd.DataFrame, column: str, top_n: int, delimiter: str = ",", others_label: str = "Others") -> None:
    """
    Prints the top_n unique values from a multi-label column and the total count of values 
    that fall outside the top_n (which would be grouped as 'Others').

    Args:
        df (pd.DataFrame): The DataFrame containing your data.
        column (str): The name of the multi-label column.
        top_n (int): The number of top categories to display.
        delimiter (str): The delimiter separating multiple values (default is a comma).
        others_label (str): The label used for less frequent values.
    """
    # Split the column into individual values and count frequencies
    exploded = df[column].dropna().str.split(rf"{delimiter}\s*").explode().str.strip()
    counts = exploded.value_counts()
    
    # Get the top N categories and the sum for the rest
    top_categories = counts.head(top_n)
    others_count = counts[counts.index.difference(top_categories.index)].sum()
    
    print("--------------------------------------------------||")
    print(f"Top {top_n} unique values for '{column}':")
    print(top_categories)
    print(f"Total count of all other values (will be grouped as '{others_label}'): {others_count}")
    print("--------------------------------------------------||\n")


top_values = {
    "genre_names": 20,
    "production_company_name": 20,
    "production_country_name": 10,
    "spoken_languages": 10,
    "director": 20,
    "writer": 20,
    "actors": 20
}

for feature, top_n in top_values.items():
    print_top_categories(data, feature, top_n)

--------------------------------------------------||
Top 20 unique values for 'genre_names':
genre_names
Drama              2178
Comedy             1690
Thriller           1413
Action             1189
Horror              798
Adventure           788
Romance             702
Crime               641
Science Fiction     615
Fantasy             554
Family              507
Animation           490
Mystery             459
History             263
Music               138
War                 130
Documentary          99
TV Movie             67
Western              50
Name: count, dtype: int64
Total count of all other values (will be grouped as 'Others'): 0
--------------------------------------------------||

--------------------------------------------------||
Top 20 unique values for 'production_company_name':
production_company_name
Universal Pictures       191
Warner Bros. Pictures    156
Lionsgate                151
Columbia Pictures        142
Paramount Pictures       125
Canal+              

In [229]:
def group_top_categories(df: pd.DataFrame, column: str, top_n: int, delimiter: str = ",", others_label: str = "Others") -> pd.Series:
    """
    Processes a multi-label column by keeping only the top_n categories (based on frequency) 
    and replacing any other category with the 'others_label'.

    Args:
        df (pd.DataFrame): The DataFrame containing your data.
        column (str): The name of the multi-label column.
        top_n (int): The number of top categories to keep.
        delimiter (str): The delimiter separating multiple values in the column.
        others_label (str): The label to use for all categories not in the top_n.

    Returns:
        pd.Series: A new Series with the modified values.
    """
    # Split the column and count frequencies
    exploded = df[column].dropna().str.split(rf"{delimiter}\s*").explode().str.strip()
    counts = exploded.value_counts()
    top_categories = counts.head(top_n).index.tolist()
    
    def map_categories(cell):
        if pd.isna(cell):
            return cell
        cats = [cat.strip() for cat in cell.split(delimiter)]
        # Replace any category not in top_categories with others_label
        new_cats = [cat if cat in top_categories else others_label for cat in cats]
        # Remove duplicates while preserving order
        seen = set()
        new_cats = [x for x in new_cats if x not in seen and not seen.add(x)]
        return delimiter.join(new_cats)
    
    return df[column].apply(map_categories)

top_values = {
    # "genre_names": 20,  # Keeping all genres
    # "production_company_name": 20,  # Too granular
    "production_country_name": 5,
    "spoken_languages": 5,
    # "director": 20,  # Too granular
    # "writer": 20,  # Too granular
    # "actors": 20  # Too granular
}

# for feature, top_n in top_values.items():
#     data[feature] = group_top_categories(data, feature, top_n)

# data.head()


## Handling Nulls

### Simple Imputer

#### Categorical Imputer

In [230]:
from sklearn.compose import make_column_selector

num_cols = make_column_selector(dtype_include=['number'])
num_cols = num_cols(data)
print(num_cols)

['tmdb_vote_count', 'tmdb_vote_average', 'budget', 'revenue', 'runtime_in_min', 'tmdb_popularity', 'imdb_rating', 'metascore', 'rotten_tomatoes_rating', 'meta_critic_rating']


In [231]:
cat_cols = make_column_selector(dtype_include=['object'])
cat_cols = cat_cols(data)
print(cat_cols)

['title', 'release_date', 'genre_names', 'production_company_name', 'production_country_name', 'spoken_languages', 'director', 'writer', 'actors', 'imdb_votes', 'age_rating', 'awards']


In [None]:
# Imputer cols
cat_cols = ['production_company_name', 'production_country_name', 'spoken_languages', 'director', 'writer', 'actors', 'age_rating']
zero_cols = ['imdb_rating', 'imdb_votes']
median_cols = ['runtime_in_min']
columns_to_drop = [
    # 'title', 
    # 'awards',
    'release_date'
    ]

In [233]:
# Iter columns setup
iter_cols = [
    'metascore', 'rotten_tomatoes_rating', 'meta_critic_rating',
    'budget', 'revenue'
    ]

predictor_cols = [
    'tmdb_vote_count', 'tmdb_vote_average', 'runtime_in_min', 'tmdb_popularity', 'imdb_rating', 
    'imdb_votes', 'total_wins', 'total_noms', 'oscar_wins', 'oscar_noms', 'bafta_wins', 'bafta_noms',
    'metascore_missing', 'rotten_tomatoes_rating_missing', 'meta_critic_rating_missing',
    ]

full_iter_cols = iter_cols + predictor_cols

In [234]:
# Categorical columns imputer
cat_imputer = SimpleImputer(strategy="constant", fill_value="Unknown")

cat_pipeline = Pipeline(steps=[
    ('cat_imputer', cat_imputer)
    # ('onehot', OneHotEncoder(handle_unknown='ignore'))
])


# Numerical to zero columns imputer
zero_imputer = SimpleImputer(strategy='constant', fill_value=0)

zero_pipeline = Pipeline(steps=[
    ('zero_imputer', SimpleImputer(strategy='constant', fill_value=0))
    # ('scaler', StandardScaler())
])


# Numerical to median columns imputer
median_imputer = SimpleImputer(strategy='median')

median_pipeline = Pipeline(steps=[
    ('median_imputer', median_imputer)
    # ('scaler', StandardScaler())
])


# Iterative imputer
iter_imputer = IterativeImputer(
    estimator=BayesianRidge(),
    # estimator=RandomForestRegressor(n_estimators=100, random_state=42),
    max_iter=10,
    initial_strategy='median',
    n_nearest_features=None,
    imputation_order='ascending',
    random_state=42
    )

iter_pipeline = Pipeline(steps=[
    ('cat_imputer', iter_imputer)
    # ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

### Custom Transformers

In [235]:
# Define a function to add missing indicators for certain columns.
def add_missing_indicators(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    for col in df.columns:
        df[col + "_missing"] = df[col].isnull().astype(int)
    return df

missing_indicator_transformer = FunctionTransformer(add_missing_indicators, validate=False)


# Define a function transformer to drop unwanted columns.
def drop_unwanted_columns(df: pd.DataFrame) -> pd.DataFrame:
    return df.drop(columns=columns_to_drop, errors='ignore')

dropper = FunctionTransformer(drop_unwanted_columns)


# To numeric
def convert_to_numeric(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    for col in df.columns:
        # Convert to string, remove commas, then convert to numeric
        df[col] = pd.to_numeric(df[col].astype(str).str.replace(',', ''), errors='coerce')
    return df

to_numeric = FunctionTransformer(convert_to_numeric, validate=False)


# def convert_to_numeric_cols(df: pd.DataFrame, cols: list) -> pd.DataFrame:
#     df = df.copy()
#     for col in cols:
#         # Remove commas and convert to numeric (coercing errors to NaN)
#         df[col] = pd.to_numeric(df[col].astype(str).str.replace(',', ''), errors='coerce')
#     return df

# to_numeric_transformer = FunctionTransformer(lambda X: convert_to_numeric_cols(X, cols=['imdb_votes']),
#                                              validate=False)


In [236]:
def extract_awards_info(awards_str):
    """
    Extracts numerical awards information from a text string.

    Parameters
    ----------
    awards_str : str
        The awards description string.

    Returns
    -------
    pd.Series
        A Series with the following index:
        ["total_wins", "total_noms", "oscar_wins", "oscar_noms", "bafta_wins", "bafta_noms"]
    """
    # Handle missing or "N/A" values.
    if pd.isna(awards_str) or awards_str.strip() in ["N/A", ""]:
        return pd.Series([0, 0, 0, 0, 0, 0],
                         index=["total_wins", "total_noms", "oscar_wins", "oscar_noms", "bafta_wins", "bafta_noms"])
    
    # Extract overall totals.
    # Look for a pattern like "56 wins" (we use negative lookahead to avoid picking up Oscar wins)
    total_wins_match = re.search(r'(\d+)\s+wins?(?!.*Oscars)', awards_str, flags=re.IGNORECASE)
    total_noms_match = re.search(r'(\d+)\s+nominations', awards_str, flags=re.IGNORECASE)
    total_wins = int(total_wins_match.group(1)) if total_wins_match else 0
    total_noms = int(total_noms_match.group(1)) if total_noms_match else 0

    # Oscar-specific extraction:
    oscar_noms_match = re.search(r'Nominated for\s+(\d+)\s+Oscars?', awards_str, flags=re.IGNORECASE)
    oscar_noms = int(oscar_noms_match.group(1)) if oscar_noms_match else 0
    # Look for something like "Oscars. 56 wins" or "Oscars 56 wins" (using non-digit separator)
    oscar_wins_match = re.search(r'Oscars?[\W_]+(\d+)\s+wins?', awards_str, flags=re.IGNORECASE)
    oscar_wins = int(oscar_wins_match.group(1)) if oscar_wins_match else 0

    # BAFTA-specific extraction:
    # For nominations, sometimes the text might run together (e.g. "BAFTA Award28 nominations total")
    bafta_noms_match = re.search(r'Nominated for\s+(\d+)\s*BAFTA', awards_str, flags=re.IGNORECASE)
    bafta_noms = int(bafta_noms_match.group(1)) if bafta_noms_match else 0
    # For wins, allow an optional "Award" word after BAFTA.
    bafta_wins_match = re.search(r'BAFTA(?:\s+Award)?[\D_]+(\d+)\s+wins?', awards_str, flags=re.IGNORECASE)
    bafta_wins = int(bafta_wins_match.group(1)) if bafta_wins_match else 0

    return pd.Series([total_wins, total_noms, oscar_wins, oscar_noms, bafta_wins, bafta_noms],
                     index=["total_wins", "total_noms", "oscar_wins", "oscar_noms", "bafta_wins", "bafta_noms"])


def transform_awards(X):
    """
    Expects X to be a DataFrame with a single column (e.g., 'awards').
    Applies extract_awards_info row-wise and returns a DataFrame.
    """
    # Apply the function to the first (and only) column
    return X.iloc[:, 0].apply(extract_awards_info)

# Wrap the function in a FunctionTransformer
awards_transformer = FunctionTransformer(transform_awards, validate=False)

In [237]:
from functools import partial

def transform_top_categories(X, column, top_n, delimiter=",", others_label="Others"):
    """
    Transforms a multi-label column by keeping only the top_n categories (based on frequency)
    and replacing all other categories with a generic label.
    
    Parameters:
        X (pd.DataFrame): Input DataFrame.
        column (str): The name of the multi-label column to process.
        top_n (int): Number of top categories to keep.
        delimiter (str): Delimiter separating the values.
        others_label (str): Label to assign to categories not among the top_n.
    
    Returns:
        pd.DataFrame: A DataFrame with one column (the processed column).
    """
    X = X.copy()
    # Split the column values, explode, and count frequencies.
    exploded = X[column].dropna().str.split(rf"{delimiter}\s*").explode().str.strip()
    counts = exploded.value_counts()
    top_categories = counts.head(top_n).index.tolist()
    
    def map_categories(cell):
        if pd.isna(cell):
            return cell
        # Split and strip each value.
        cats = [cat.strip() for cat in cell.split(delimiter)]
        # Replace values not in top_categories with others_label.
        new_cats = [cat if cat in top_categories else others_label for cat in cats]
        # Remove duplicates while preserving order.
        seen = set()
        new_cats = [x for x in new_cats if x not in seen and not seen.add(x)]
        return delimiter.join(new_cats)
    
    X[column] = X[column].apply(map_categories)
    # Return a DataFrame with just the transformed column.
    return X[[column]]

# Now, to create a FunctionTransformer for, say, the 'production_country_name' column with top_n=5:
transformer_prod_country = FunctionTransformer(
    func=partial(transform_top_categories, column="production_country_name", top_n=5, delimiter=",", others_label="Others"),
    validate=False
)

# Similarly, for 'spoken_languages' column with top_n=5:
transformer_spoken_lang = FunctionTransformer(
    func=partial(transform_top_categories, column="spoken_languages", top_n=5, delimiter=",", others_label="Others"),
    validate=False
)

In [238]:
def add_date_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df['release_date'] = pd.to_datetime(df['release_date'])
    df['release_year'] = df['release_date'].dt.year
    df['release_month'] = df['release_date'].dt.month
    df['release_day'] = df['release_date'].dt.day
    df['is_weekend'] = (df['release_date'].dt.weekday >= 4).astype(int)
    df['is_holiday_season'] = df['release_month'].isin([6, 7, 11, 12]).astype(int)
    df['movie_age'] = 2025 - df['release_year']
    return df

# Wrap the function as a transformer
date_features_transformer = FunctionTransformer(add_date_features, validate=False)

### Apply Transformations

In [239]:
drop_transformer = ColumnTransformer(
    transformers=[
        ('dropper', 'drop', columns_to_drop)
    ],
    remainder='passthrough', 
    verbose_feature_names_out=False
)
# Set output to pandas dataframe
drop_transformer.set_output(transform='pandas')

In [240]:
numeric_transformer = ColumnTransformer(
    transformers=[
        ('to_numeric', to_numeric, zero_cols)
    ],
    remainder='passthrough', 
    verbose_feature_names_out=False
)
# Set output to pandas dataframe
numeric_transformer.set_output(transform='pandas')

In [241]:
main_transformer = ColumnTransformer(
    transformers=[
        ('zero', zero_pipeline, zero_cols),
        ('median', median_pipeline, median_cols),
        ('cat', cat_pipeline, cat_cols),
        ('awards', awards_transformer, ['awards']),
        ('date_features', date_features_transformer, ['release_date']),
        ('dropper', dropper, columns_to_drop)
    ],
    remainder='passthrough', 
    verbose_feature_names_out=False
)

# Set output to pandas dataframe
main_transformer.set_output(transform='pandas')

# Apply the preprocessor to the data
# clean_data = main_transformer.fit_transform(data)
# clean_data.head()

In [242]:
top_n_transformer = ColumnTransformer(
    transformers=[
        ('prod_country', transformer_prod_country, ['production_country_name']),
        ('spoken_lang', transformer_spoken_lang, ['spoken_languages'])
    ],
    remainder='passthrough', 
    verbose_feature_names_out=False
)

# Set output to pandas dataframe
top_n_transformer.set_output(transform='pandas')

In [243]:
missing_indicator_transformer = ColumnTransformer(
    transformers=[
        ('missing_indicator', missing_indicator_transformer, iter_cols),
    ],
    remainder='passthrough', 
    verbose_feature_names_out=False
)

# Set output to pandas dataframe
missing_indicator_transformer.set_output(transform='pandas')

In [244]:
iter_transformer = ColumnTransformer(
    transformers=[
        # ('iter', iter_pipeline, iter_cols),
        ('iter', iter_pipeline, full_iter_cols),
    ],
    remainder='passthrough', 
    verbose_feature_names_out=False
)

# Set output to pandas dataframe
iter_transformer.set_output(transform='pandas')

In [245]:
import pandas as pd
from sklearn.preprocessing import FunctionTransformer




In [246]:
full_pipeline = Pipeline(steps=[
    ('numeric_transformer', numeric_transformer),
    ('main_transformer', main_transformer),
    ('missing_indicator_transformer', missing_indicator_transformer),
    ('top_n_transformer', top_n_transformer),
    ('iter_preprocessor', iter_transformer),
    # ('date_feature_engineering', date_features_transformer),
    ('drop_transformer', drop_transformer)
])

# Set output to pandas dataframe
full_pipeline.set_output(transform='pandas')

clean_data = full_pipeline.fit_transform(data)
clean_data.info()

ValueError: A given column is not a column of the dataframe

In [None]:
clean_data.head()

Unnamed: 0,metascore,rotten_tomatoes_rating,meta_critic_rating,budget,revenue,tmdb_vote_count,tmdb_vote_average,runtime_in_min,tmdb_popularity,imdb_rating,...,age_rating,release_date,release_year,release_month,release_day,is_weekend,is_holiday_season,movie_age,title,genre_names
0,57.0,32.0,57.0,14353110.0,-31131910.0,378.0,6.0,132.0,27.043,5.1,...,TV-MA,2022-08-05,2022,8,5,1,0,3,Carter,"Action, Thriller, Crime"
1,57.0,72.813826,57.0,23364750.0,20024500.0,494.0,7.829,107.0,16.008,7.3,...,TV-MA,2021-10-28,2021,10,28,0,0,4,The Shadow in My Eye,"War, Drama, History"
2,48.0,56.0,48.0,23012370.0,18022680.0,673.0,6.595,100.0,10.04,6.3,...,G,2019-11-12,2019,11,12,0,1,6,Noelle,"Family, Comedy, Fantasy"
3,57.0,70.523715,57.0,21812250.0,11213340.0,349.0,7.278,108.0,5.214,7.2,...,Unknown,2019-09-04,2019,9,4,0,0,6,Play,Comedy
4,57.0,42.0,57.0,17784960.0,-11652320.0,320.0,7.6,65.0,7.683,8.3,...,TV-MA,2019-08-26,2019,8,26,0,0,6,Dave Chappelle: Sticks & Stones,Comedy


## Feature Engineering

In [None]:
# clean_data['release_date'] = pd.to_datetime(clean_data['release_date'])
# clean_data['release_year'] = clean_data['release_date'].dt.year
# clean_data['release_month'] = clean_data['release_date'].dt.month
# clean_data['release_day'] = clean_data['release_date'].dt.day

# clean_data['is_weekend'] = (clean_data['release_date'].dt.weekday >= 4).astype(int)  # 1 if Fri-Sun
# clean_data['is_holiday_season'] = clean_data['release_month'].isin([6, 7, 11, 12]).astype(int)  # Summer & winter holidays
# clean_data['movie_age'] = 2025 - clean_data['release_year']  # If predicting in 2025

# clean_data.drop(columns=['release_date'], inplace=True)


In [None]:
clean_data['roi'] = (clean_data['revenue'] - clean_data['budget']) / clean_data['budget']

In [None]:
clean_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4999 entries, 0 to 4998
Data columns (total 39 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   metascore                       4999 non-null   float64       
 1   rotten_tomatoes_rating          4999 non-null   float64       
 2   meta_critic_rating              4999 non-null   float64       
 3   budget                          4999 non-null   float64       
 4   revenue                         4999 non-null   float64       
 5   tmdb_vote_count                 4999 non-null   float64       
 6   tmdb_vote_average               4999 non-null   float64       
 7   runtime_in_min                  4999 non-null   float64       
 8   tmdb_popularity                 4999 non-null   float64       
 9   imdb_rating                     4999 non-null   float64       
 10  imdb_votes                      4999 non-null   float64       
 11  tota

## Save cleanded data

In [None]:
# Saving cleaned dataset for future use
save_dataframe(clean_data, "01_clean_data.csv")

✅ Data successfully saved to ./data\01_clean_data.csv with separator ','
