In [8]:
import pandas as pd
import numpy as np

pd.set_option("future.no_silent_downcasting", True)

from wsi.indicators.education import build_education_df
from wsi.indicators.employment import build_employment_df
from wsi.indicators.parliament import build_parliamentary_df
from wsi.indicators.poverty import build_poverty_df
from wsi.indicators.legal import build_legal_df
from wsi.indicators.son_bias import build_son_bias_df
from wsi.indicators.maternal_mortality import build_maternal_mortality_df
from wsi.indicators.attitude_violence import build_attitudes_violence_df
from wsi.indicators.child_marriage import build_child_marriage_df
from wsi.indicators.water_sanitation import build_water_sanitation_df
from wsi.indicators.electricity import build_access_electricity_df
from wsi.indicators.financial_inclusion import build_financial_inclusion_df
from wsi.indicators.phone_use import build_cell_phone_use_df

from wsi.config import INDICATORS, EXCLUDE_ISO # we exclude certain ISO's from the index calculation due to lack of data
from wsi.map_iso_name import ALL_ISO_NAME
from wsi.map_iso_income import CODE_INCOME
from wsi.map_iso_region import CODE_SUBREGION

# Parameters
years = list(range(1995, 2025))

indicator_columns = [k for k, v in INDICATORS.items()]
invert_columns = [k for k in indicator_columns if INDICATORS[k].get("invert")]

indicator_builders = {
    'Education': build_education_df,
    'Employment': build_employment_df,
    'Parliamentary Representation': build_parliamentary_df,
    'Poverty': build_poverty_df,
    'Legal Protection Index': build_legal_df,
    'Son Bias': build_son_bias_df,
    'Maternal Mortality': build_maternal_mortality_df,
    'Attitudes Towards Violence': build_attitudes_violence_df,
    'Child Marriage': build_child_marriage_df,
    'Access Water Sanitation': build_water_sanitation_df,
    'Access Electricity': build_access_electricity_df,
    'Financial Inclusion': build_financial_inclusion_df,
    "Cell Phone Use": build_cell_phone_use_df,
}


"""
Indicator Processing Pipeline:

For each indicator:
1. Create a full DataFrame of all ISO codes and years.
2. Merge the indicator data into this full structure and sort by ISO and Year.
3. Fill missing values per country using:
   - Linear interpolation
   - Forward and backward fill
   - Track all filled values with a corresponding source column.
4. Compute region and income group averages from the filled data.
5. For countries with entirely missing data for an indicator:
   - Fill using the appropriate average (region or income), based on config.
   - Mark these with the appropriate source tag (e.g., 'AVG_REG' or 'AVG_INC').
"""


In [None]:
# TODO: don't actually compute averages for those that are not included in index calculation

# Initialise main DataFrame
df = pd.DataFrame(
    [(iso, year) for iso in ALL_ISO_NAME for year in years],
    columns=["ISO_code", "Year"]
)

# Merge each indicator and handle filling
for ind in indicator_columns:
    indicator_df = indicator_builders[ind]()
    df = df.merge(indicator_df, on=['ISO_code', 'Year'], how='left')
    df[f"{ind} (source)"] = ''
    df.loc[df[ind].notna(), f"{ind} (source)"] = 'ORI'

# Fill missing values per country with interpolation + ffill/bfill
def fill_missing(group):
    for ind in indicator_columns:
        group[ind] = pd.to_numeric(group[ind], errors='coerce')
        before = group[ind].isnull()
        group[ind] = group[ind].interpolate(limit_direction='both')
        after_interp = before & group[ind].notnull()
        group.loc[after_interp, f"{ind} (source)"] = 'TSI'
        group[ind] = group[ind].ffill().bfill()
        after_fill = before & group[ind].notnull()
        group.loc[after_fill, f"{ind} (source)"] = 'TSI'
    return group

df = df.groupby('ISO_code').apply(fill_missing, include_groups=False).reset_index(drop=True)


In [None]:
df['Region'] = df['ISO_code'].map(CODE_SUBREGION)
df['Income'] = df['ISO_code'].map(CODE_INCOME)

# Region and income averages
region_avgs = df.groupby(['Region', 'Year'])[indicator_columns].mean()
income_avgs = df.groupby(['Income', 'Year'])[indicator_columns].mean()

# Fill entirely missing countries with region or income averages
for ind in indicator_columns:
    missing = df.groupby('ISO_code')[ind].apply(lambda x: x.isna().all())
    fill_strategy = INDICATORS[ind]["fill"]

    for iso in missing[missing].index:
        if fill_strategy == "income_avg":
            group = CODE_INCOME.get(iso)
            if group and (group, years[0]) in income_avgs.index:
                fill_vals = income_avgs.loc[group]
                df.loc[df['ISO_code'] == iso, ind] = fill_vals[ind].values
                df.loc[df['ISO_code'] == iso, f"{ind} (source)"] = 'AVG_INC'

        elif fill_strategy == "region_avg":
            group = CODE_SUBREGION.get(iso)
            if group and (group, years[0]) in region_avgs.index:
                fill_vals = region_avgs.loc[group]
                df.loc[df['ISO_code'] == iso, ind] = fill_vals[ind].values
                df.loc[df['ISO_code'] == iso, f"{ind} (source)"] = 'AVG_REG'

In [5]:
df

Unnamed: 0,ISO_code,Year,Education,Education (source),Employment,Employment (source),Parliamentary Representation,Parliamentary Representation (source),Poverty,Poverty (source),...,Access Water Sanitation,Access Water Sanitation (source),Access Electricity,Access Electricity (source),Cell Phone Use,Cell Phone Use (source),Financial Inclusion,Financial Inclusion (source),Region,Income
0,AFG,1995,0.217986,TSI,48.240352,TSI,25.925926,TSI,,AVG_INC,...,23.106328,TSI,4.4,TSI,0.000000,ORI,0.026162,TSI,Southern Asia,Low income
1,AFG,1996,0.217986,TSI,48.240352,TSI,25.925926,TSI,,AVG_INC,...,23.106328,TSI,4.4,TSI,0.000000,ORI,0.026162,TSI,Southern Asia,Low income
2,AFG,1997,0.217986,TSI,48.240352,TSI,25.925926,TSI,,AVG_INC,...,23.106328,TSI,4.4,TSI,0.000000,ORI,0.026162,TSI,Southern Asia,Low income
3,AFG,1998,0.217986,TSI,48.240352,TSI,25.925926,TSI,,AVG_INC,...,23.106328,TSI,4.4,TSI,0.000000,ORI,0.026162,TSI,Southern Asia,Low income
4,AFG,1999,0.217986,TSI,48.240352,TSI,25.925926,TSI,,AVG_INC,...,23.106328,TSI,4.4,TSI,0.000000,ORI,0.026162,TSI,Southern Asia,Low income
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1435,WSM,2020,1.068139,TSI,40.121009,TSI,10.000000,ORI,1.2,TSI,...,97.268063,ORI,100.0,ORI,32.103625,ORI,0.416811,AVG_INC,Polynesia,Lower middle income
1436,WSM,2021,1.068139,TSI,38.895073,TSI,10.000000,ORI,1.2,TSI,...,97.798956,ORI,98.3,ORI,31.540839,ORI,0.435345,AVG_INC,Polynesia,Lower middle income
1437,WSM,2022,1.068139,TSI,37.669138,ORI,7.843137,ORI,1.2,TSI,...,98.294082,ORI,98.3,ORI,60.400572,ORI,0.439529,AVG_INC,Polynesia,Lower middle income
1438,WSM,2023,1.068139,TSI,37.669138,TSI,12.962963,ORI,1.2,TSI,...,98.294082,TSI,98.3,TSI,60.400572,TSI,0.439529,AVG_INC,Polynesia,Lower middle income


In [None]:
# check missingness

indicator_columns = [col for col in df.columns if col in INDICATORS]  # or manually list indicators if needed

for col in indicator_columns:
    missing_rows = df[df[col].isnull()][['ISO_code', 'Year']].copy()
    if not missing_rows.empty:
        missing_rows['Indicator'] = col
        print(missing_rows)



     ISO_code  Year  Indicator
1170      TKL  1995  Education
1171      TKL  1996  Education
1172      TKL  1997  Education
1173      TKL  1998  Education
1174      TKL  1999  Education
1175      TKL  2000  Education
1176      TKL  2001  Education
1177      TKL  2002  Education
1178      TKL  2003  Education
1179      TKL  2004  Education
1180      TKL  2005  Education
1181      TKL  2006  Education
1182      TKL  2007  Education
1183      TKL  2008  Education
1184      TKL  2009  Education
1185      TKL  2010  Education
1186      TKL  2011  Education
1187      TKL  2012  Education
1188      TKL  2013  Education
1189      TKL  2014  Education
1190      TKL  2015  Education
1191      TKL  2016  Education
1192      TKL  2017  Education
1193      TKL  2018  Education
1194      TKL  2019  Education
1195      TKL  2020  Education
1196      TKL  2021  Education
1197      TKL  2022  Education
1198      TKL  2023  Education
1199      TKL  2024  Education
1380      WLF  1995  Education
1381    

In [None]:
"""
use dimension mapping
normalise and aggregate, invert and get Dimension
calc WSI baseline (some are excluded)
"""



In [None]:
# check graph