## 3. Data Enrichment - Understanding the "Why"

In [1]:
import os
from getpass import getpass
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
from sqlalchemy import create_engine

### 3.1. Load Datasets

In [2]:

# Connection settings are read from the environment so that no secret is stored
# in the notebook. The non-secret settings default to the local development
# values; the password has no default and is prompted for when DB_PASSWORD is
# not set.
db_user = os.environ.get('DB_USER', 'root')
db_password = os.environ.get('DB_PASSWORD') or getpass('MySQL password: ')
db_host = os.environ.get('DB_HOST', 'localhost')
db_name = os.environ.get('DB_NAME', 'banking_analysis')

# Percent-encode the credentials: characters such as '@', ':' or '/' would
# otherwise be interpreted as URL delimiters when the connection string is parsed.
connection_str = (
    f'mysql+mysqlconnector://{quote_plus(db_user)}:{quote_plus(db_password)}'
    f'@{db_host}/{db_name}'
)
engine = create_engine(connection_str)

# Banking losses come from the MySQL table of the same name.
df_losses = pd.read_sql('SELECT * FROM banking_losses', engine)
print("Banking losses data loaded from MySQL.")

# State-level reference datasets are read from CSV files, decoded as latin-1.
df_population = pd.read_csv('../data/states_by_population_2019.csv', encoding='latin-1')
df_gsdp = pd.read_csv('../data/states_by_GSDP_rs_billions_2018-23.csv', encoding='latin-1')
df_branches = pd.read_csv('../data/states_distribution_bankBranches_2023.csv', encoding='latin-1')
df_literacy = pd.read_csv('../data/states_by_literacy_percent_2023-24.csv', encoding='latin-1')
print("External datasets loaded.")

Banking losses data loaded from MySQL.
External datasets loaded.


### 3.2. Data Cleaning and Standardization

In [3]:

def normalize_state(names: pd.Series) -> pd.Series:
    """Return state names in one canonical spelling.

    The names are upper-cased, every ampersand (with or without surrounding
    spaces) becomes the word ``AND`` separated by single spaces, runs of
    whitespace are collapsed to one space, and leading/trailing whitespace is
    removed. For example ``'Jammu & Kashmir'`` and ``'JAMMU&KASHMIR'`` both
    become ``'JAMMU AND KASHMIR'``.

    Args:
        names: Series of state names; must support the ``.str`` accessor.

    Returns:
        A new Series with the normalized names (the input is not modified).
    """
    return (
        names.str.upper()
        # regex is passed explicitly: its default differs between pandas versions.
        .str.replace(r'\s*&\s*', ' AND ', regex=True)
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
    )


# Give every dataset consistent, descriptive column names. Each rename returns a
# new frame, so re-running this cell does not depend on earlier in-place edits.
df_population = df_population.rename(columns={
    'total_population': 'Population_Total',
    'population_male': 'Population_Male',
    'population_female': 'Population_Female',
})
df_gsdp = df_gsdp.rename(columns={
    'State/Union Territory': 'State',
    '2018-19': 'GSDP_2018-19',
    '2019-20': 'GSDP_2019-20',
    '2020-21': 'GSDP_2020-21',
    '2021-22': 'GSDP_2021-22',
    '2022-23': 'GSDP_2022-23',
})
df_branches = df_branches.rename(columns={'Branches_2023': 'Bank_Branches_Total'})
df_literacy = df_literacy.rename(columns={
    'State/UT': 'State',
    'Literacy Rate': 'Literacy_Rate_Percent',
})

# Normalize the 'State' column of every frame that has one so the same state is
# spelled identically in each table. Frames without a 'State' column are left
# untouched.
all_dfs = [df_losses, df_population, df_gsdp, df_branches, df_literacy]
for df in all_dfs:
    if 'State' in df.columns:
        df['State'] = normalize_state(df['State'])

### 3.3. Loading into MySQL

In [None]:
# Swap +/-infinity for NaN in the losses frame (in place) before it is written.
df_losses.replace([np.inf, -np.inf], np.nan, inplace=True)

# Destination table name -> frame to store there, listed in write order.
tables_to_write = {
    'banking_losses': df_losses,
    'state_population': df_population,
    'state_gsdp': df_gsdp,
    'state_bank_branches': df_branches,
    'state_literacy': df_literacy,
}

# Each existing table is replaced; the DataFrame index is not stored.
for table_name, frame in tables_to_write.items():
    frame.to_sql(table_name, con=engine, if_exists='replace', index=False)

print("All tables loaded into 'banking_analysis' database.")