In [None]:
import pandas as pd
import numpy as np

pd.set_option("future.no_silent_downcasting", True)

from wsi.indicators.education import build_education_df
from wsi.indicators.employment import build_employment_df
from wsi.indicators.parliament import build_parliamentary_df
from wsi.indicators.poverty import build_poverty_df
from wsi.indicators.legal import build_legal_df
from wsi.indicators.son_bias import build_son_bias_df
from wsi.indicators.maternal_mortality import build_maternal_mortality_df
from wsi.indicators.attitude_violence import build_attitudes_violence_df
from wsi.indicators.child_marriage import build_child_marriage_df
from wsi.indicators.water_sanitation import build_water_sanitation_df
from wsi.indicators.electricity import build_access_electricity_df
from wsi.indicators.financial_inclusion import build_financial_inclusion_df
from wsi.indicators.phone_use import build_cell_phone_use_df

from wsi.config import INDICATORS, EXCLUDE_ISO # we exclude certain ISO's from the index calculation due to lack of data
from wsi.map_iso_name import ALL_ISO_NAME
from wsi.map_iso_income import CODE_INCOME
from wsi.map_iso_region import CODE_SUBREGION

# Parameters
# Year span covered by the panel: 1995 through 2024 inclusive.
years = [*range(1995, 2025)]

# Indicator names in config order, and the subset whose config entry has a
# truthy "invert" flag.
indicator_columns = list(INDICATORS)
invert_columns = [name for name, spec in INDICATORS.items() if spec.get("invert")]

# Indicator name -> zero-argument function that builds that indicator's DataFrame.
indicator_builders = {
    "Education": build_education_df,
    "Employment": build_employment_df,
    "Parliamentary Representation": build_parliamentary_df,
    "Poverty": build_poverty_df,
    "Legal Protection Index": build_legal_df,
    "Son Bias": build_son_bias_df,
    "Maternal Mortality": build_maternal_mortality_df,
    "Attitudes Towards Violence": build_attitudes_violence_df,
    "Child Marriage": build_child_marriage_df,
    "Access Water Sanitation": build_water_sanitation_df,
    "Access Electricity": build_access_electricity_df,
    "Financial Inclusion": build_financial_inclusion_df,
    "Cell Phone Use": build_cell_phone_use_df,
}


"""
Indicator Processing Pipeline:

For each indicator:
1. Create a full DataFrame of all ISO codes and years.
2. Merge the indicator data into this full structure and sort by ISO and Year.
3. Fill missing values per country using:
   - Linear interpolation
   - Forward and backward fill
   - Track all filled values with a corresponding source column.
4. Compute region and income group averages from the filled data.
5. For countries with entirely missing data for an indicator:
   - Fill using the appropriate average (region or income), based on config.
   - Mark these with the appropriate source tag (e.g., 'AVG_REG' or 'AVG_INC').
"""


In [None]:
from wsi.utils import processed_data_path

In [None]:

# Skeleton frame with one row per (ISO code, year) pair; it establishes the
# row order for the rest of the notebook.
iso_year_pairs = [(iso, year) for iso in ALL_ISO_NAME for year in years]
df_raw = pd.DataFrame(iso_year_pairs, columns=["ISO_code", "Year"])

# Left-join every indicator onto the skeleton, so (ISO, year) pairs without
# data stay NaN. No gap filling happens in this cell.
for ind in indicator_columns:
    build_indicator = indicator_builders[ind]
    indicator_df = build_indicator()
    df_raw = df_raw.merge(indicator_df, how='left', on=['ISO_code', 'Year'])


# Persist the unfilled table.
output_path = processed_data_path("raw_baseline_indicators.csv")
df_raw.to_csv(output_path, index=False)


In [None]:
# For every country, list the indicators that have no observation in any year.
missing_summary = []

for iso, group in df_raw.groupby('ISO_code'):
    missing_indicators = [col for col in indicator_columns if group[col].isna().all()]
    if missing_indicators:
        missing_summary.append({
            'ISO_code': iso,
            'Country': ALL_ISO_NAME.get(iso, "Unknown"),
            'Missing_Count': len(missing_indicators),
            'Indicators': ', '.join(missing_indicators)
        })

# Name the columns explicitly: when no country has a fully missing indicator
# the record list is empty, and a column-less frame would make the sort below
# raise KeyError('Missing_Count').
summary_columns = ['ISO_code', 'Country', 'Missing_Count', 'Indicators']
summary = pd.DataFrame(missing_summary, columns=summary_columns)

# Countries with the most fully missing indicators come first.
summary = summary.sort_values(by='Missing_Count', ascending=False).reset_index(drop=True)

# Save the summary to CSV.
output_path = processed_data_path("missing_indicators_summary.csv")
summary.to_csv(output_path, index=False)

In [None]:
# Split the raw table by ISO code: `df` keeps the countries that enter the
# index calculation, `df_excluded` the ones listed in EXCLUDE_ISO.
is_excluded = df_raw["ISO_code"].isin(EXCLUDE_ISO)
df = df_raw.loc[~is_excluded].copy()
df_excluded = df_raw.loc[is_excluded].copy()

# One provenance column per indicator: 'ORI' where a raw observation exists,
# an empty string everywhere else.
for ind in indicator_columns:
    source_col = f"{ind} (source)"
    has_observation = df[ind].notna()
    df[source_col] = ''
    df.loc[has_observation, source_col] = 'ORI'
# Fill missing values per country with interpolation + ffill/bfill
def fill_missing(group, columns=None):
    """Fill gaps in one country's indicator series and tag the filled cells.

    Each indicator column is coerced to numeric (unparseable values become
    NaN), linearly interpolated in both directions, then forward- and
    backward-filled. Every cell that was missing before and holds a value
    afterwards gets 'TSI' written to its "<indicator> (source)" column.
    Columns with no valid value at all stay NaN and keep their source tag.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single country. Must contain every indicator column and
        its matching "<indicator> (source)" column.
    columns : iterable of str, optional
        Indicator columns to fill. Defaults to the notebook-level
        ``indicator_columns``.

    Returns
    -------
    pandas.DataFrame
        A filled copy of ``group``; the frame passed in is not modified.
    """
    if columns is None:
        columns = indicator_columns

    # Work on a copy so the caller's frame (or a groupby slice) is never mutated.
    group = group.copy()

    for ind in columns:
        values = pd.to_numeric(group[ind], errors='coerce')
        was_missing = values.isna()
        values = values.interpolate(limit_direction='both').ffill().bfill()
        group[ind] = values
        # Interpolated and ffill/bfill cells share the same tag, so a single
        # pass over everything that went from missing to present is enough.
        group.loc[was_missing & values.notna(), f"{ind} (source)"] = 'TSI'
    return group

# Fill each country's series independently. Iterating over the groups keeps
# the 'ISO_code' column in every group on any pandas version, whereas
# GroupBy.apply operating on the grouping column is deprecated since pandas 2.2.
# groupby yields groups in sorted key order, so the resulting row order
# matches the former apply(...).reset_index(drop=True).
filled_groups = [
    fill_missing(country_rows.copy())
    for _, country_rows in df.groupby('ISO_code')
]
if filled_groups:  # pd.concat raises on an empty list
    df = pd.concat(filled_groups, ignore_index=True)

In [None]:
# Attach each country's sub-region and income group.
df['Region'] = df['ISO_code'].map(CODE_SUBREGION)
df['Income'] = df['ISO_code'].map(CODE_INCOME)

# Per-(group, year) indicator means, computed before any average-based fill
# so the averages only reflect observed / time-series-filled values.
region_avgs = df.groupby(['Region', 'Year'])[indicator_columns].mean()
income_avgs = df.groupby(['Income', 'Year'])[indicator_columns].mean()

# Fill strategy -> (ISO-to-group mapping, group/year averages, source tag).
average_fillers = {
    "income_avg": (CODE_INCOME, income_avgs, 'AVG_INC'),
    "region_avg": (CODE_SUBREGION, region_avgs, 'AVG_REG'),
}

# Fill countries whose indicator is entirely missing with the average of
# their income group or region, as configured per indicator.
for ind in indicator_columns:
    fill_strategy = INDICATORS[ind]["fill"]
    if fill_strategy not in average_fillers:
        continue  # any other strategy leaves the indicator untouched
    group_of, group_avgs, source_tag = average_fillers[fill_strategy]

    missing = df[ind].isna().groupby(df['ISO_code']).all()
    for iso in missing[missing].index:
        group = group_of.get(iso)
        if not (group and (group, years[0]) in group_avgs.index):
            continue

        rows = df['ISO_code'] == iso
        # Align the averages on Year instead of assigning positionally, so the
        # fill does not depend on row order or on every year being present.
        fill_vals = df.loc[rows, 'Year'].map(group_avgs[ind].loc[group])
        df.loc[rows, ind] = fill_vals
        # Tag only cells that actually received a value; a group average can
        # itself be NaN when no country in the group has data.
        df.loc[rows & df[ind].notna(), f"{ind} (source)"] = source_tag

In [None]:
# Display the filled indicator table for visual inspection.
df

In [None]:
# TODO: make special exceptions for this missingness

# Indicator columns that are present in df, in df's column order.
indicator_columns = [name for name in df.columns if name in INDICATORS]

# Collect, per indicator, the ISO codes of the rows that are still null.
missing_records = []
for col in indicator_columns:
    still_null = df[col].isnull()
    if still_null.any():
        missing_rows = df.loc[still_null, ['ISO_code']].copy()
        missing_rows['Indicator'] = col
        missing_records.append(missing_rows)

# Report each (ISO code, indicator) pair once.
if not missing_records:
    print("No missing values found.")
else:
    all_missing = pd.concat(missing_records)
    unique_missing = all_missing.drop_duplicates()
    print(unique_missing)


In [None]:
"""
use dimension mapping
normalise and aggregate, invert and get dimension score
calc WSI baseline (some are excluded)
"""


In [None]:
# replicate graph

In [None]:
# shocks metric (refactor; check 2024 values)

# recreate all graphs

In [None]:
# website graphs for 'explore by ...', see https://flourish.studio/ for example