# Team Assignment 1 - HeeYoung KWON

In [None]:
# Necessary imports (following Isa's imports)
import pandas as pd
import requests
import io
import os
import seaborn as sns
import matplotlib.pyplot as plt 

##### My data were not available through the API; however, they were available as CSV files
###### I have 2 raw files in the raw branch
###### gfi.csv -> Global Financial Inclusion Data from World Bank Group Data Bank https://databank.worldbank.org/reports.aspx?source=1228
###### wdi.csv -> World Development Indicators from  World Bank Group Data Bank https://databank.worldbank.org/source/world-development-indicators

###### The code below is the initial API code that I used (unfortunately, it didn't work)

In [None]:
# API code that I initially wrote (if you spot the reason why it fails, please let me know)
import pandas as pd
import requests

def download_worldbank(indicator, countries, date_start=None, date_end=None):
    """Download one indicator from the World Bank Indicators API (v2).

    The request is sent against database id 28 (the source the original code
    was aimed at) through the ``source`` query parameter.

    Args:
        indicator: Indicator code, e.g. ``'account.t.d'``.
        countries: Iterable of country codes; they are joined with ``;``.
        date_start: First year of the requested range (optional).
        date_end: Last year of the requested range (optional). The ``date``
            filter is only added when both bounds are given.

    Returns:
        A DataFrame built with ``pd.json_normalize`` from the second element
        of the JSON response, plus an ``indicator`` column holding the
        requested code.

    Raises:
        RuntimeError: If the HTTP status code is not 200.
        ValueError: If the response contains no observations.
    """
    # The Indicators API path is /v2/country/<codes>/indicator/<code>; the
    # database is selected with ?source=<id>. The earlier
    # /v2/sources/28/country/.../indicator/... path mixed this with the
    # separate "/sources/..." endpoint family, so the request could not be
    # resolved.
    url_base = 'https://api.worldbank.org/v2/'
    country_codes = ';'.join(countries)  # e.g. "BGD;IND;PAK"
    url = (
        f"{url_base}country/{country_codes}/indicator/{indicator}"
        "?source=28&format=json&per_page=50000"
    )

    if date_start and date_end:
        url += f"&date={date_start}:{date_end}"

    # A timeout keeps a stalled connection from hanging the whole download.
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        raise RuntimeError(f"Failed to fetch data: {response.status_code}")

    data = response.json()

    # The expected payload is a two-element list: [metadata, rows]. Anything
    # shorter, or a null/empty rows element, means there is nothing to load.
    if not isinstance(data, list) or len(data) < 2 or not data[1]:
        raise ValueError(f"No data available for indicator {indicator}")

    df = pd.json_normalize(data[1])  # only the data rows, not the metadata
    df['indicator'] = indicator  # keep track of which indicator each row belongs to
    return df

def download_multiple_indicators(indicator_list, countries, date_start=None, date_end=None, ignore_errors=True):
    """Download several indicators and stack them into one DataFrame.

    Each indicator is fetched with ``download_worldbank``. A failed download
    is reported on stdout and, by default, skipped.

    Args:
        indicator_list: Iterable of indicator codes.
        countries: Iterable of country codes, passed through unchanged.
        date_start: First year of the requested range (optional).
        date_end: Last year of the requested range (optional).
        ignore_errors: When True (default) a failing indicator is skipped;
            when False the original exception is re-raised.

    Returns:
        The row-wise concatenation of all successfully downloaded frames,
        with a fresh index.

    Raises:
        RuntimeError: If no indicator could be downloaded at all.
    """
    all_dfs = []
    for indicator in indicator_list:
        print(f"📥 Downloading: {indicator}")
        try:
            df = download_worldbank(indicator, countries, date_start, date_end)
        except Exception as e:
            # Best-effort by design: report the failure and move on unless the
            # caller asked for errors to propagate.
            print(f"❌ Failed to download {indicator}: {e}")
            if not ignore_errors:
                raise  # bare raise keeps the original traceback intact
        else:
            all_dfs.append(df)

    if not all_dfs:
        raise RuntimeError("No indicators were successfully downloaded.")

    return pd.concat(all_dfs, ignore_index=True)

# Three-letter country codes to request: South Asia on the first row,
# Southeast Asia on the second.
countries = [
    'BGD', 'IND', 'PAK', 'LKA', 'NPL', 'BTN',
    'MMR', 'BRN', 'IDN', 'KHM', 'LAO', 'THA', 'VNM', 'SGP', 'MYS', 'PHL',
]

# Indicator codes to download; the label each code stands for is noted beside it.
indicators = [
    'account.t.d',     # Account (% age 15+)
    'account.t.d.1',   # Account, female (% age 15+)
    'account.t.d.2',   # Account, male (% age 15+)
    'account.t.d.9',   # Account, rural (% age 15+)
    'account.t.d.10',  # Account, urban (% age 15+)
    'fin1.t.d',        # Financial institution account (% age 15+)
]

# Fetch every indicator for every country for the 2011-2021 window.
merged_data = download_multiple_indicators(
    indicators,
    countries,
    date_start='2011',
    date_end='2021',
)

# Quick look at the first rows of the combined result.
print(merged_data.head())

##### Cleaning the datasets (the code is inside src/cleaning)
###### I have cleaned the data into 2 categories
###### a. financial inclusion (you can find the executed code in the file named clean_code.py)
###### b. digital transaction (you can find the executed code in the file named clean_code2.py)

In [None]:
# [Step 1] Import the required libraries
import pandas as pd
import os

# [Step 2] Set the file paths
# project_root is the parent of the folder that contains this Python file.
# NOTE: __file__ only exists when this code runs as a script; inside a
# notebook cell it is undefined, so set project_root by hand there.
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# Path to the raw file.
# The file used here is "gfi.csv"; change the name when applying this to another file.
raw_data_path = os.path.join(project_root, 'data', 'raw', 'gfi.csv')

# [Step 3] Load the data
df = pd.read_csv(raw_data_path)

# [Step 4] List the indicators to keep
# These are matched against the values of the "Series Name" column, so adapt
# them to the series present in your data.
target_columns = [
    "Account (% age 15+)",
    "Account, female (% age 15+)",
    "Account, male (% age 15+)",
    "Account, rural (% age 15+)",
    "Account, urban (% age 15+)",
    "Financial institution account (% age 15+)",
    "Financial institution account, female (% age 15+)",
    "Financial institution account, male (% age 15+)",
    "Financial institution account, rural (% age 15+)",
    "Financial institution account, urban (% age 15+)",
    "Mobile money account (% age 15+)",
    "Mobile money account, female (% age 15+)",
    "Mobile money account, male (% age 15+)",
    "Mobile money account, rural (% age 15+)",
    "Mobile money account, urban (% age 15+)",
    "Own a mobile phone (% age 15+)",
    "Owns a debit or credit card (% age 15+)"
]

# [Step 5] Keep only the rows of the selected indicators
# The selection is a row filter on "Series Name"; every column of the raw
# file is carried over unchanged.
# NOTE(review): id_columns and columns_to_keep are not used by the filter
# below; they are kept only in case other code refers to them.
id_columns = ["Country Name", "Country Code", "Series Name", "Series Code"]
columns_to_keep = id_columns + target_columns
df_selected = df[df['Series Name'].isin(target_columns)]

# [Step 6] Build the path of the processed file
# Change 'gfi_financial_inclusion.csv' to your desired file name.
processed_data_path = os.path.join(project_root, 'data', 'processed', 'gfi_financial_inclusion.csv')

# Make sure the target folder exists; to_csv does not create missing folders.
os.makedirs(os.path.dirname(processed_data_path), exist_ok=True)

# [Step 7] Save as a new file
df_selected.to_csv(processed_data_path, index=False)

print(f"✅ Data was saved successfully: {processed_data_path}")

##### Making a pivot table (the code is inside src/pivot)
###### I have pivoted the data of both categories
###### a. financial inclusion (you can find the executed code in the file named pivot.py)
###### b. digital transaction (you can find the executed code in the file named pivot2.py)

In [None]:
import pandas as pd
import os

# --- Paths ---------------------------------------------------------------
# project_root is the parent of the folder that contains this file.
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
file_path = os.path.join(project_root, 'data', 'processed', 'gfi_financial_inclusion.csv')

# --- Load the cleaned data -------------------------------------------------
df = pd.read_csv(file_path)

# --- Restrict to Southeast Asian and South Asian countries -----------------
southeast_asia_list = ['VNM','LAO','THA','KHM','MYS','SGP','MMR','PHL','BRN','IDN']
south_asia_list = ['BGD','IND','PAK','NPL','LKA','BTN']
target_countries = southeast_asia_list + south_asia_list
df = df.loc[df["Country Code"].isin(target_countries)]

# --- Wide -> long: every non-identifier column becomes a (year, value) row --
df_long = df.melt(
    id_vars=["Country Name", "Country Code", "Series Name", "Series Code"],
    var_name="year",
    value_name="value",
)

# Column headers look like "2011 [YR2011]"; keep the four-digit year as an int.
df_long["year"] = df_long["year"].str.extract(r"(\d{4})", expand=False).astype(int)

# Anything that is not a number becomes NaN.
df_long["value"] = pd.to_numeric(df_long["value"], errors="coerce")

# --- Long -> pivot: one row per country and year, one column per indicator --
df_pivot = pd.pivot_table(
    df_long,
    index=["Country Code", "year"],
    columns="Series Name",
    values="value",
).reset_index()

# --- Preview and save --------------------------------------------------------
print(df_pivot.head())

save_path = os.path.join(project_root, 'data', 'processed', 'gfi_inclusion_pivot_table.csv')
df_pivot.to_csv(save_path, index=False)
print(f"✅ Save complete: {save_path}")