In [135]:
# Imported the standard-library and third-party modules used in this notebook
import json
import pandas as pd
import os

# Added the project's scripts directory to the import path so its utility module can be imported
import sys
sys.path.append('/files/me204-2025-project-jayden/scripts')

# Imported util to reduce redundancy
from utils import indicators

In [136]:
# Loaded the raw JSON data in
# Decode explicitly as UTF-8: without an encoding argument open() falls back
# to the platform's locale encoding, which can garble non-ASCII text in the file.
with open('/files/me204-2025-project-jayden/data/raw/world_bank_raw_data.json', 'r', encoding='utf-8') as f:
    raw_json_data = json.load(f)

# Defined a function that parsed the JSON structure into a list of records (one per country-year-indicator)
# Each record represented a single data point for a specific country and year
# The function extracted the country name, ISO3 code, year, and the indicator value
# It also handled missing values by assigning None when the indicator value was not available
def parse_json_to_records(json_data, code):
    """Flatten one indicator's JSON payload into a list of row dictionaries.

    Args:
        json_data: Indexable payload whose element at index 1 is an iterable
            of entry dicts, or ``None`` when there are no entries. Each entry
            is read for ``entry['country']['value']``,
            ``entry['country']['id']``, ``entry['date']`` and
            ``entry['value']``.
        code: Indicator code; used as the key under which the numeric value
            is stored in every record.

    Returns:
        A list with one dict per entry, keyed by ``'country'``, ``'iso3'``,
        ``'year'`` and ``code``. The value under ``code`` is a float, or
        ``None`` when the entry's value is missing. An empty list is returned
        when the payload's entry list is ``None``.

    Raises:
        IndexError: If ``json_data`` has fewer than two elements.
        KeyError: If an entry lacks one of the fields listed above.
        ValueError: If ``entry['date']`` cannot be converted to ``int`` or
            ``entry['value']`` cannot be converted to ``float``.
    """
    entries = json_data[1]
    if entries is None:
        # A payload with no data points carries None instead of a list
        # (presumably how the API reports "no results" — TODO confirm);
        # iterating over it would raise TypeError, so report zero records.
        return []

    return [
        {
            # Full country name
            'country': entry['country']['value'],
            # Country identifier taken from the payload's country 'id' field.
            # NOTE(review): the key is named 'iso3', but the notebook output
            # shows two-letter values such as 'DK' — confirm whether this
            # field really holds ISO3 codes before relying on the name.
            'iso3': entry['country']['id'],
            # The year arrives as a string; store it as an integer
            'year': int(entry['date']),
            # Missing observations stay None rather than being coerced
            code: float(entry['value']) if entry['value'] is not None else None
        }
        for entry in entries
    ]


In [147]:
# Created a list of DataFrames (one for each indicator) using a list comprehension
# Each DataFrame was renamed to use an easily readable label instead of the raw indicator code
dfs = [
    pd.DataFrame(
        parse_json_to_records(raw_json_data[indicator_code], indicator_code)
    ).rename(columns={indicator_code: readable_label})
    for indicator_code, readable_label in indicators.items()
]

# Fold the per-indicator frames into one wide table: start from the first
# frame and outer-join each remaining frame on the shared key columns, so a
# country-year present for only some indicators is still kept.
df_full = dfs[0]
for df in dfs[1:]:
    df_full = df_full.merge(df, how='outer', on=['country', 'iso3', 'year'])

# Order rows by country code and then by year, and replace the index left
# over from the merges with a fresh 0..n-1 range.
df_full = df_full.sort_values(by=['iso3', 'year'])
df_full = df_full.reset_index(drop=True)

# Last expression of the cell: render the combined table
df_full

Unnamed: 0,country,iso3,year,GDP_Current_USD,Health_Expenditure_%_GDP,Education_Expenditure_%_GDP,Unemployment_Rate_%,Inflation_Annual_%,FDI_Net_Inflows_%_GDP,Population_Total,Life_Expectancy_Years,Govt_Effectiveness,Rule_of_Law,Control_of_Corruption,Political_Stability,Voice_and_Accountability
0,Denmark,DK,2018,355293400000.0,10.097642,7.29691,5.131,0.813609,2.466701,5793636.0,80.953659,1.810602,1.773226,2.156996,0.931982,1.574022
1,Denmark,DK,2019,345401500000.0,10.152435,7.25505,5.018,0.758132,-1.100671,5814422.0,81.45122,1.873268,1.833377,2.121649,0.967599,1.543434
2,Denmark,DK,2020,355631000000.0,10.718546,7.38354,5.637,0.420712,0.363875,5831404.0,81.602439,1.840067,1.806794,2.236486,0.920234,1.514785
3,Denmark,DK,2021,408378200000.0,10.747286,6.99993,5.043,1.853045,4.162143,5856733.0,81.404878,1.959734,1.895721,2.333371,0.928663,1.544976
4,Denmark,DK,2022,401945600000.0,9.475862,5.296434,4.434,7.696567,7.667934,5903037.0,81.304878,1.990135,1.899581,2.402638,0.868918,1.591254
5,Denmark,DK,2023,407091900000.0,9.417846,,5.094,3.305178,1.124286,5946952.0,81.853659,2.015649,1.908936,2.376053,0.850848,1.664699
6,Denmark,DK,2024,429457400000.0,,,5.587,1.3722,4.214609,5976992.0,,,,,,
7,Finland,FI,2018,273869300000.0,9.045142,6.27558,7.361,1.083821,-3.844567,5515525.0,81.734146,2.013878,2.034399,2.171759,0.89189,1.589352
8,Finland,FI,2019,267014800000.0,9.171427,6.41695,6.695,1.024094,6.097077,5521606.0,81.982927,1.972805,2.012629,2.113933,0.835259,1.562998
9,Finland,FI,2020,270000300000.0,9.632496,6.63494,7.759,0.290555,-0.939114,5529543.0,81.931707,1.901004,2.018592,2.17044,0.981825,1.609512
