In [None]:
# Imports
import os
import re
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt

from sqlalchemy import text

In [None]:
# Set the working directory to the project root to ensure consistency across environments.
# This must run before the project-specific imports, and relative paths such as
# "./data/..." are resolved against this directory from here on.
from pyprojroot import here
os.chdir(here())

# Project specific imports
from src.utils import save_dataframe
from database.db_utils import init_db
from config.config_loader import load_config
from database.queries import prepped_data_query

In [None]:
# Read the cleaned-data CSV into a DataFrame (the relative path resolves against the
# current working directory)
data = pd.read_csv(filepath_or_buffer="./data/01_clean_data.csv")

In [None]:
# Sample function to extract numerical values
def extract_awards_info(awards_str):
    """Parse an awards summary string into numeric win/nomination counts.

    Parameters
    ----------
    awards_str : str or NaN
        Free-text awards summary. Missing values and the literal "N/A" are
        treated as "no awards".

    Returns
    -------
    pd.Series
        Integer counts indexed by "total_wins", "total_noms", "oscar_wins",
        "oscar_noms", "bafta_wins" and "bafta_noms".

    Notes
    -----
    Award-specific wins are read from a "Won N Oscar(s)" / "Won N BAFTA ..."
    phrase. The previous patterns captured the number in front of "wins" that
    follows a "Nominated for N ..." sentence, which is the same number counted
    in total_wins, so nominated titles got their overall win tally reported as
    Oscar/BAFTA wins and actual winners got 0.
    NOTE(review): assumes the data phrases award wins as "Won N Oscars." --
    confirm against the dataset.
    oscar_noms / bafta_noms only count explicit "Nominated for N ..." phrases;
    a title described as "Won N ..." contributes nothing to them.
    """
    columns = ["total_wins", "total_noms", "oscar_wins", "oscar_noms", "bafta_wins", "bafta_noms"]

    if pd.isna(awards_str) or awards_str == "N/A":
        return pd.Series([0] * len(columns), index=columns)

    def _total(pattern):
        # Sum every number captured by the pattern; no match yields 0.
        return sum(map(int, re.findall(pattern, awards_str)))

    # Overall tallies, e.g. "50 wins & 80 nominations"
    wins = _total(r'(\d+) win')
    nominations = _total(r'(\d+) nomination')

    # Oscar-specific wins & nominations
    oscar_wins = _total(r'Won (\d+) Oscars?')
    oscar_noms = _total(r'Nominated for (\d+) Oscars?')

    # BAFTA-specific wins & nominations
    bafta_wins = _total(r'Won (\d+) BAFTA')
    bafta_noms = _total(r'Nominated for (\d+) BAFTA')

    return pd.Series([wins, nominations, oscar_wins, oscar_noms, bafta_wins, bafta_noms],
                     index=columns)

# Expand the free-text awards column into one numeric column per award statistic
award_columns = ["total_wins", "total_noms", "oscar_wins", "oscar_noms", "bafta_wins", "bafta_noms"]
data[award_columns] = data["awards"].apply(extract_awards_info)

# The raw text column is removed in place once its numbers have been extracted
data.drop("awards", axis=1, inplace=True)