In [1]:
# Imports
import os
import re
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt

from sqlalchemy import text

In [2]:
# Set the working directory to the project root to ensure consistency across
# environments for the project-specific imports below.
from pyprojroot import here
os.chdir(here())

# Project-specific imports (kept after the chdir above on purpose;
# NOTE(review): presumably they only resolve from the project root — confirm)
from database.db_utils import init_db
from config.config_loader import load_config
from database.queries import prepped_data_query

In [4]:
# Initialize local PostgreSQL session
Session = init_db(load_config("DB_URL"))
session = Session()

try:
    # Execute the query and materialize rows + column names while the session
    # is still open, so nothing is read from the result after the connection
    # has been released.
    result = session.execute(text(prepped_data_query))
    rows = result.fetchall()
    column_names = list(result.keys())
finally:
    # Always close the session, even if the query raises
    session.close()

# Convert to DataFrame
data = pd.DataFrame(rows, columns=column_names)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151 entries, 0 to 150
Data columns (total 23 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   title                    151 non-null    object 
 1   release_date             151 non-null    object 
 2   tmdb_vote_count          151 non-null    int64  
 3   tmdb_vote_average        151 non-null    float64
 4   genre_names              151 non-null    object 
 5   budget                   151 non-null    int64  
 6   revenue                  151 non-null    int64  
 7   runtime_in_min           151 non-null    int64  
 8   tmdb_popularity          151 non-null    float64
 9   production_company_name  151 non-null    object 
 10  production_country_name  151 non-null    object 
 11  spoken_languages         151 non-null    object 
 12  director                 151 non-null    object 
 13  writer                   151 non-null    object 
 14  actors                   1

In [5]:
# Extract numerical award counts from a free-text awards summary
def extract_awards_info(awards_str):
    """Parse an awards summary string into six integer counts.

    The patterns assume OMDb-style summaries such as
    ``"Won 2 Oscars. 15 wins & 30 nominations total"`` or
    ``"Nominated for 1 Oscar. 6 wins & 15 nominations total"``.
    NOTE(review): confirm this matches the values stored in the ``awards`` column.

    Parameters
    ----------
    awards_str : str or missing value
        Raw awards text. Missing values (anything ``pd.isna`` reports as NA)
        and the literal ``"N/A"`` produce all zeros.

    Returns
    -------
    pd.Series
        Integer counts indexed by ``total_wins``, ``total_nominations``,
        ``oscar_wins``, ``oscar_noms``, ``bafta_wins``, ``bafta_noms``.
        ``oscar_noms`` / ``bafta_noms`` only count explicit
        "Nominated for N ..." mentions.
    """
    columns = ["total_wins", "total_nominations", "oscar_wins", "oscar_noms", "bafta_wins", "bafta_noms"]

    if pd.isna(awards_str) or awards_str == "N/A":
        return pd.Series([0] * len(columns), index=columns)

    def _sum_matches(pattern):
        # Sum every captured number for the given pattern (0 when nothing matches)
        return sum(int(number) for number in re.findall(pattern, awards_str))

    # Overall wins and nominations ("15 wins", "1 nomination", ...)
    wins = _sum_matches(r'(\d+) win')
    nominations = _sum_matches(r'(\d+) nomination')

    # Oscar-specific counts. Wins come from "Won N Oscar(s)"; the number that
    # follows "Nominated for N Oscars." is the overall wins tally, not Oscar
    # wins, so it must not be used here.
    oscar_wins = _sum_matches(r'Won (\d+) Oscars?')
    oscar_noms = _sum_matches(r'Nominated for (\d+) Oscars?')

    # BAFTA-specific counts, same reasoning as for the Oscars
    bafta_wins = _sum_matches(r'Won (\d+) BAFTA')
    bafta_noms = _sum_matches(r'Nominated for (\d+) BAFTA')

    return pd.Series([wins, nominations, oscar_wins, oscar_noms, bafta_wins, bafta_noms],
                     index=columns)

In [6]:
# Expand the raw awards text into six numeric columns, then retire the source column
award_columns = ["total_wins", "total_nominations", "oscar_wins", "oscar_noms", "bafta_wins", "bafta_noms"]
award_counts = data["awards"].apply(extract_awards_info)
data[award_columns] = award_counts
data.drop(columns=["awards"], inplace=True)

In [7]:
# Re-inspect the schema now that the award-count columns have replaced `awards`
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151 entries, 0 to 150
Data columns (total 28 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   title                    151 non-null    object 
 1   release_date             151 non-null    object 
 2   tmdb_vote_count          151 non-null    int64  
 3   tmdb_vote_average        151 non-null    float64
 4   genre_names              151 non-null    object 
 5   budget                   151 non-null    int64  
 6   revenue                  151 non-null    int64  
 7   runtime_in_min           151 non-null    int64  
 8   tmdb_popularity          151 non-null    float64
 9   production_company_name  151 non-null    object 
 10  production_country_name  151 non-null    object 
 11  spoken_languages         151 non-null    object 
 12  director                 151 non-null    object 
 13  writer                   151 non-null    object 
 14  actors                   1

In [8]:
# Preview the first rows, including the derived award-count columns
data.head()

Unnamed: 0,title,release_date,tmdb_vote_count,tmdb_vote_average,genre_names,budget,revenue,runtime_in_min,tmdb_popularity,production_company_name,...,box_office,age_rating,rotten_tomatoes_rating,meta_critic_rating,total_wins,total_nominations,oscar_wins,oscar_noms,bafta_wins,bafta_noms
0,Back in Action,2025-01-15,909,6.556,"Action, Comedy",0,0,114,857.578,"Chernin Entertainment, Exhibit A, Good One",...,,PG-13,28.0,,0,0,0,0,0,0
1,Wolf Man,2025-01-15,323,6.4,"Horror, Thriller",25000000,34222643,103,848.052,"Universal Pictures, Cloak & Co., Blumhouse Pro...",...,20027975.0,R,51.0,50.0,0,0,0,0,0,0
2,Ad Vitam,2025-01-09,291,6.294,"Thriller, Drama, Crime, Action",0,0,97,73.082,Les Films du Cap,...,,,38.0,,0,0,0,0,0,0
3,Your Fault,2024-12-26,1037,7.141,"Romance, Drama",0,0,118,393.38,"Pokeepsie Films, Amazon MGM Studios",...,,,20.0,,0,0,0,0,0,0
4,Babygirl,2024-12-25,446,5.7,Drama,20000000,47756311,115,401.385,"A24, 2AM, Man Up Film",...,8406060.0,R,77.0,81.0,6,15,0,0,0,0


In [9]:
from sklearn.preprocessing import MultiLabelBinarizer

# Split the comma-separated `genre_names` string into a list of genres per row
genre_lists = data["genre_names"].apply(lambda names: names.split(", "))

# Use MultiLabelBinarizer to create one-hot encoded columns (one per genre).
# Reuse data's index so the concat below lines rows up by label even if `data`
# does not have a default RangeIndex; otherwise rows would misalign and NaNs
# would be introduced.
mlb = MultiLabelBinarizer()
genre_dummies = pd.DataFrame(
    mlb.fit_transform(genre_lists),
    columns=mlb.classes_,
    index=data.index,
)

# Replace the raw genre string with the one-hot genre columns
data = pd.concat([data.drop(columns=["genre_names"]), genre_dummies], axis=1)

In [10]:
data.head() # Check the transformed data: one-hot genre columns replace `genre_names`

Unnamed: 0,title,release_date,tmdb_vote_count,tmdb_vote_average,budget,revenue,runtime_in_min,tmdb_popularity,production_company_name,production_country_name,...,Fantasy,History,Horror,Music,Mystery,Romance,Science Fiction,Thriller,War,Western
0,Back in Action,2025-01-15,909,6.556,0,0,114,857.578,"Chernin Entertainment, Exhibit A, Good One",United States of America,...,0,0,0,0,0,0,0,0,0,0
1,Wolf Man,2025-01-15,323,6.4,25000000,34222643,103,848.052,"Universal Pictures, Cloak & Co., Blumhouse Pro...",United States of America,...,0,0,1,0,0,0,0,1,0,0
2,Ad Vitam,2025-01-09,291,6.294,0,0,97,73.082,Les Films du Cap,France,...,0,0,0,0,0,0,0,1,0,0
3,Your Fault,2024-12-26,1037,7.141,0,0,118,393.38,"Pokeepsie Films, Amazon MGM Studios","Spain, United States of America",...,0,0,0,0,0,1,0,0,0,0
4,Babygirl,2024-12-25,446,5.7,20000000,47756311,115,401.385,"A24, 2AM, Man Up Film","United States of America, Netherlands",...,0,0,0,0,0,0,0,0,0,0
