In [1]:
import pandas as pd
import numpy as np

pd.set_option("future.no_silent_downcasting", True)

from wsi.indicators.education import build_education_df
from wsi.indicators.employment import build_employment_df
from wsi.indicators.parliament import build_parliamentary_df
from wsi.indicators.poverty import build_poverty_df
from wsi.indicators.legal import build_legal_df
from wsi.indicators.son_bias import build_son_bias_df
from wsi.indicators.maternal_mortality import build_maternal_mortality_df
from wsi.indicators.attitude_violence import build_attitudes_violence_df
from wsi.indicators.child_marriage import build_child_marriage_df
from wsi.indicators.water_sanitation import build_water_sanitation_df
from wsi.indicators.electricity import build_access_electricity_df
from wsi.indicators.financial_inclusion import build_financial_inclusion_df
from wsi.indicators.phone_use import build_cell_phone_use_df

from wsi.config import INDICATORS, EXCLUDE_ISO # we exclude certain ISO's from the index calculation due to lack of data
from wsi.map_iso_name import ALL_ISO_NAME
from wsi.map_iso_income import CODE_INCOME
from wsi.map_iso_region import CODE_SUBREGION
from wsi.utils import processed_data_path

from wsi.utils import imgs_path
import matplotlib.pyplot as plt
import seaborn as sns

# Parameters
years = [*range(1995, 2025)]

# Indicator names, in the order the config declares them
indicator_columns = list(INDICATORS)

# Dimension name -> names of the indicators that belong to it
dimension_indicators = {}
for ind, meta in INDICATORS.items():
    dim = meta['dimension']
    if dim not in dimension_indicators:
        dimension_indicators[dim] = []
    dimension_indicators[dim].append(ind)
dimensions = list(dimension_indicators)

# Indicator name -> zero-argument function that builds its DataFrame
indicator_builders = {
    "Education": build_education_df,
    "Employment": build_employment_df,
    "Parliamentary Representation": build_parliamentary_df,
    "Poverty": build_poverty_df,
    "Legal Protection Index": build_legal_df,
    "Son Bias": build_son_bias_df,
    "Maternal Mortality": build_maternal_mortality_df,
    "Attitudes Towards Violence": build_attitudes_violence_df,
    "Child Marriage": build_child_marriage_df,
    "Access Water Sanitation": build_water_sanitation_df,
    "Access Electricity": build_access_electricity_df,
    "Financial Inclusion": build_financial_inclusion_df,
    "Cell Phone Use": build_cell_phone_use_df,
}


"""
Indicator Processing Pipeline:

For each indicator:
1. Create a full DataFrame of all ISO codes and years.
2. Merge the indicator data into this full structure and sort by ISO and Year.
3. Fill missing values per country using:
   - Linear interpolation
   - Forward and backward fill
   The origin of every value is recorded in a matching '<indicator> (source)'
   column ('ORI' for original data, 'TSI' for values filled in this step).
4. Compute region and income group averages from the filled data.
5. For countries with entirely missing data for an indicator:
   - Fill using the appropriate average (region or income), based on config.
   - Mark these with the appropriate source tag (e.g., 'AVG_REG' or 'AVG_INC').
"""


In [2]:

# Skeleton frame with one row per (ISO code, year); this fixes the row order
iso_year_rows = [(iso, year) for iso in ALL_ISO_NAME for year in years]
df_raw = pd.DataFrame(iso_year_rows, columns=["ISO_code", "Year"])

# Left-join every indicator onto the skeleton (no gap filling happens here)
for ind in indicator_columns:
    indicator_df = indicator_builders[ind]()
    df_raw = pd.merge(df_raw, indicator_df, how='left', on=['ISO_code', 'Year'])

# Persist the unfilled indicators
output_path = processed_data_path("raw_baseline_indicators.csv")
df_raw.to_csv(output_path, index=False)


In [3]:
# Collect, per country, the indicators that have no observation in any year
missing_summary = []

for iso, group in df_raw.groupby('ISO_code'):
    missing_indicators = [col for col in indicator_columns if group[col].isna().all()]
    if missing_indicators:
        missing_summary.append({
            'ISO_code': iso,
            'Country': ALL_ISO_NAME.get(iso, "Unknown"),
            'Missing_Count': len(missing_indicators),
            'Indicators': ', '.join(missing_indicators)
        })

# Build the summary with explicit columns: with an empty record list the frame
# would otherwise have no 'Missing_Count' column and the sort below would
# raise KeyError.
summary = pd.DataFrame(
    missing_summary,
    columns=['ISO_code', 'Country', 'Missing_Count', 'Indicators'],
)

# Countries with the most entirely-missing indicators first
summary = summary.sort_values(by='Missing_Count', ascending=False).reset_index(drop=True)

# Save to CSV
output_path = processed_data_path("missing_indicators_summary.csv")
summary.to_csv(output_path, index=False)

In [4]:
# Split the raw frame by ISO code: economies listed in EXCLUDE_ISO are kept
# apart from the ones that enter the index calculation
is_excluded = df_raw["ISO_code"].isin(EXCLUDE_ISO)
df = df_raw[~is_excluded].copy()
df_excluded = df_raw[is_excluded].copy()

# One "<indicator> (source)" column per indicator: 'ORI' where a value is
# present, empty string where it is missing
for ind in indicator_columns:
    df[f"{ind} (source)"] = df[ind].notna().map({True: 'ORI', False: ''})
# Fill missing values per country with interpolation + ffill/bfill
def fill_missing(group):
    """Fill the gaps of every indicator column within one group of rows.

    Each column named in the module-level ``indicator_columns`` is coerced to
    numeric (unparseable values become NaN), interpolated in both directions
    and then forward/backward filled. Rows that were missing and received a
    value get 'TSI' written to the matching "<indicator> (source)" column.
    Columns with no value at all stay NaN and keep their source entries.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to fill (one country when used per ISO code), already in the
        order the interpolation should follow. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A filled copy of ``group`` with the same index and columns.
    """
    # Work on a copy: functions applied to groupby groups must not mutate
    # the object they are handed.
    filled = group.copy()
    for ind in indicator_columns:
        values = pd.to_numeric(filled[ind], errors='coerce')
        was_missing = values.isna()
        # ffill/bfill is kept as a fallback after the interpolation
        values = values.interpolate(limit_direction='both').ffill().bfill()
        filled[ind] = values
        filled.loc[was_missing & values.notna(), f"{ind} (source)"] = 'TSI'
    return filled

# Fill each country separately. The groups are iterated explicitly instead of
# going through DataFrameGroupBy.apply: apply() operating on the grouping
# column is deprecated, and once the grouping column is excluded 'ISO_code'
# would be missing from the result. Groups are visited in sorted ISO order and
# the index is renumbered, matching apply(...).reset_index(drop=True).
# Each group is copied so the helper never writes into a slice of `df`.
df = pd.concat(
    [fill_missing(country_rows.copy()) for _, country_rows in df.groupby('ISO_code')],
    ignore_index=True,
)

  df = df.groupby('ISO_code').apply(fill_missing).reset_index(drop=True)


In [5]:
# Attach the grouping keys used for the fallback averages
df['Region'] = df['ISO_code'].map(CODE_SUBREGION)
df['Income'] = df['ISO_code'].map(CODE_INCOME)

# Region and income averages per year, computed before any fallback fill
region_avgs = df.groupby(['Region', 'Year'])[indicator_columns].mean()
income_avgs = df.groupby(['Income', 'Year'])[indicator_columns].mean()

# Configured fill strategy -> (ISO-to-group lookup, group averages, source tag)
fallback_fills = {
    "income_avg": (CODE_INCOME, income_avgs, 'AVG_INC'),
    "region_avg": (CODE_SUBREGION, region_avgs, 'AVG_REG'),
}

# Fill countries that have no value at all for an indicator with the average
# of their income group or region, as configured per indicator
for ind in indicator_columns:
    fill_strategy = INDICATORS[ind]["fill"]
    if fill_strategy not in fallback_fills:
        continue  # any other strategy leaves the indicator untouched
    iso_to_group, group_avgs, source_tag = fallback_fills[fill_strategy]

    missing = df.groupby('ISO_code')[ind].apply(lambda x: x.isna().all())
    for iso in missing[missing].index:
        group = iso_to_group.get(iso)
        if not group or (group, years[0]) not in group_avgs.index:
            continue
        iso_rows = df['ISO_code'] == iso
        # Look the average up by Year instead of assigning positionally, so
        # the fill does not depend on row order or on matching row counts
        fill_vals = df.loc[iso_rows, 'Year'].map(group_avgs[ind].loc[group])
        df.loc[iso_rows, ind] = fill_vals.to_numpy()
        # Tag only rows that actually received a value: when the whole group
        # has no data its average is NaN and the rows are still missing
        df.loc[iso_rows & df[ind].notna(), f"{ind} (source)"] = source_tag

In [6]:
# Inspect the frame after the time-series and group-average fills
df

Unnamed: 0,ISO_code,Year,Education,Employment,Parliamentary Representation,Poverty,Legal Protection Index,Son Bias,Maternal Mortality,Attitudes Towards Violence,...,Son Bias (source),Maternal Mortality (source),Attitudes Towards Violence (source),Child Marriage (source),Access Water Sanitation (source),Access Electricity (source),Cell Phone Use (source),Financial Inclusion (source),Region,Income
0,AFG,1995,0.217986,48.240352,25.925926,55.268182,0.197917,105.8,1346.14410,7.485015,...,ORI,TSI,AVG_REG,TSI,TSI,TSI,ORI,TSI,Southern Asia,Low income
1,AFG,1996,0.217986,48.240352,25.925926,55.268182,0.197917,106.0,1346.14410,7.485015,...,ORI,TSI,AVG_REG,TSI,TSI,TSI,ORI,TSI,Southern Asia,Low income
2,AFG,1997,0.217986,48.240352,25.925926,55.268182,0.197917,106.2,1346.14410,7.485015,...,ORI,TSI,AVG_REG,TSI,TSI,TSI,ORI,TSI,Southern Asia,Low income
3,AFG,1998,0.217986,48.240352,25.925926,55.268182,0.197917,106.4,1346.14410,7.485015,...,ORI,TSI,AVG_REG,TSI,TSI,TSI,ORI,TSI,Southern Asia,Low income
4,AFG,1999,0.217986,48.240352,25.925926,55.268182,0.197917,106.5,1346.14410,7.485015,...,ORI,TSI,AVG_REG,TSI,TSI,TSI,ORI,TSI,Southern Asia,Low income
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5695,ZWE,2020,0.852830,66.377519,34.571429,39.800000,0.875000,102.4,356.75892,8.828383,...,ORI,ORI,ORI,TSI,ORI,ORI,ORI,TSI,Eastern Africa,Lower middle income
5696,ZWE,2021,0.852830,64.499665,34.571429,39.800000,0.875000,102.4,356.75892,8.828383,...,ORI,TSI,TSI,TSI,ORI,ORI,ORI,ORI,Eastern Africa,Lower middle income
5697,ZWE,2022,0.852830,63.862897,33.625731,39.800000,0.875000,102.4,356.75892,8.828383,...,ORI,TSI,TSI,TSI,ORI,ORI,ORI,TSI,Eastern Africa,Lower middle income
5698,ZWE,2023,0.852830,65.737269,33.625731,39.800000,0.875000,102.5,356.75892,8.828383,...,ORI,TSI,TSI,TSI,TSI,TSI,TSI,TSI,Eastern Africa,Lower middle income


In [7]:
# TODO: make special exceptions for this missingness

# Indicators present in the frame, in column order
indicator_columns = [col for col in df.columns if col in INDICATORS]

# One (ISO_code, Indicator) frame per indicator that still has gaps
missing_records = []
for col in indicator_columns:
    missing_rows = df.loc[df[col].isnull(), ['ISO_code']].copy()
    if missing_rows.empty:
        continue
    missing_rows['Indicator'] = col
    missing_records.append(missing_rows)

# Report every (country, indicator) pair once
if not missing_records:
    print("No missing values found.")
else:
    all_missing = pd.concat(missing_records)
    unique_missing = all_missing.drop_duplicates()
    print(unique_missing)


     ISO_code                   Indicator
30        AGO  Attitudes Towards Violence
810       CAF  Attitudes Towards Violence
990       CMR  Attitudes Towards Violence
1020      COD  Attitudes Towards Violence
1050      COG  Attitudes Towards Violence
1650      FJI  Attitudes Towards Violence
1710      FSM  Attitudes Towards Violence
1740      GAB  Attitudes Towards Violence
1950      GNQ  Attitudes Towards Violence
2700      KIR  Attitudes Towards Violence
3240      MHL  Attitudes Towards Violence
3810      NRU  Attitudes Towards Violence
4020      PLW  Attitudes Towards Violence
4050      PNG  Attitudes Towards Violence
4470      SLB  Attitudes Towards Violence
4680      STP  Attitudes Towards Violence
4920      TCD  Attitudes Towards Violence
5100      TON  Attitudes Towards Violence
5220      TUV  Attitudes Towards Violence
5520      VUT  Attitudes Towards Violence
5550      WSM  Attitudes Towards Violence
210       AUS              Child Marriage
840       CAN              Child M

In [8]:
from scipy.stats import gmean
from wsi.config import INDICATORS
import pandas as pd

# --- Normalization helper ---
def normalize_column(column: pd.Series) -> pd.Series:
    """Min-max normalize a pandas Series to the range 0..1.

    Missing values stay missing. A column whose minimum equals its maximum
    cannot be scaled, so its non-missing entries are set to 0.5.

    Parameters
    ----------
    column : pandas.Series
        Numeric values to normalize.

    Returns
    -------
    pandas.Series
        Normalized values on the same index as ``column``.
    """
    lowest, highest = column.min(), column.max()
    if highest == lowest:
        # Return a Series (not a bare 0.5) so that rows with a missing value
        # are not given a score when the result is assigned to a column.
        midpoint = pd.Series(0.5, index=column.index, name=column.name)
        return midpoint.where(column.notna())
    return (column - lowest) / (highest - lowest)

# --- Apply normalization and scoring ---
def apply_indicator_scoring(df: pd.DataFrame) -> pd.DataFrame:
    """Add indicator scores, dimension scores and the composite index to ``df``.

    For every indicator in ``INDICATORS`` a "<indicator> (score)" column is
    written with the normalized value, flipped to ``1 - score`` when the
    indicator's config has a truthy "invert". Each dimension column is the
    row-wise mean of its indicators' scores. "WSI (Baseline)" is the geometric
    mean of the non-missing 'Equity', 'Protection' and 'Resilience' values of
    the row, or NaN when all three are missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with one column per indicator. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the added columns.
    """
    # Normalize (and possibly invert) each indicator, collecting the
    # indicators of every dimension along the way
    dimension_groups = {}
    for indicator, cfg in INDICATORS.items():
        dimension_groups.setdefault(cfg["dimension"], []).append(indicator)
        col_score = f"{indicator} (score)"
        df[col_score] = normalize_column(df[indicator])
        if cfg["invert"]:
            df[col_score] = 1 - df[col_score]

    # Dimension score = mean of the dimension's indicator scores
    for dimension, indicators in dimension_groups.items():
        score_cols = [f"{indicator} (score)" for indicator in indicators]
        df[dimension] = df[score_cols].mean(axis=1)

    # Composite index: geometric mean of the available dimensions
    score_columns = ['Equity', 'Protection', 'Resilience']

    def _row_gmean(row):
        available = row.dropna()
        # np.nan rather than pd.NA: pd.NA would turn the column into object dtype
        return gmean(available) if len(available) else np.nan

    df["WSI (Baseline)"] = df[score_columns].apply(_row_gmean, axis=1)

    return df


In [9]:
# Score the filled frame. apply_indicator_scoring writes its columns into `df`
# and returns that same frame, so `df_scored` and `df` are the same object.
df_scored = apply_indicator_scoring(df)
# Economy name looked up from the ISO code
df_scored["Economy"] = df_scored["ISO_code"].map(ALL_ISO_NAME)
df_scored

Unnamed: 0,ISO_code,Year,Education,Employment,Parliamentary Representation,Poverty,Legal Protection Index,Son Bias,Maternal Mortality,Attitudes Towards Violence,...,Child Marriage (score),Access Water Sanitation (score),Access Electricity (score),Cell Phone Use (score),Financial Inclusion (score),Equity,Protection,Resilience,WSI (Baseline),Economy
0,AFG,1995,0.217986,48.240352,25.925926,55.268182,0.197917,105.8,1346.14410,7.485015,...,0.545692,0.163862,0.036290,0.000000,0.018369,0.248064,0.565406,0.054630,0.197145,Afghanistan
1,AFG,1996,0.217986,48.240352,25.925926,55.268182,0.197917,106.0,1346.14410,7.485015,...,0.545692,0.163862,0.036290,0.000000,0.018369,0.248064,0.562549,0.054630,0.196813,Afghanistan
2,AFG,1997,0.217986,48.240352,25.925926,55.268182,0.197917,106.2,1346.14410,7.485015,...,0.545692,0.163862,0.036290,0.000000,0.018369,0.248064,0.559692,0.054630,0.196479,Afghanistan
3,AFG,1998,0.217986,48.240352,25.925926,55.268182,0.197917,106.4,1346.14410,7.485015,...,0.545692,0.163862,0.036290,0.000000,0.018369,0.248064,0.556835,0.054630,0.196144,Afghanistan
4,AFG,1999,0.217986,48.240352,25.925926,55.268182,0.197917,106.5,1346.14410,7.485015,...,0.545692,0.163862,0.036290,0.000000,0.018369,0.248064,0.555406,0.054630,0.195976,Afghanistan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5695,ZWE,2020,0.852830,66.377519,34.571429,39.800000,0.875000,102.4,356.75892,8.828383,...,0.560052,0.393873,0.523185,0.263555,0.231507,0.586013,0.756398,0.353030,0.538877,Zimbabwe
5696,ZWE,2021,0.852830,64.499665,34.571429,39.800000,0.875000,102.4,356.75892,8.828383,...,0.560052,0.388146,0.485887,0.279082,0.232920,0.582040,0.756398,0.346509,0.534325,Zimbabwe
5697,ZWE,2022,0.852830,63.862897,33.625731,39.800000,0.875000,102.4,356.75892,8.828383,...,0.560052,0.388352,0.496976,0.274318,0.232920,0.578887,0.756398,0.348142,0.534195,Zimbabwe
5698,ZWE,2023,0.852830,65.737269,33.625731,39.800000,0.875000,102.5,356.75892,8.828383,...,0.560052,0.388352,0.496976,0.274318,0.232920,0.582853,0.754969,0.348142,0.535075,Zimbabwe


In [20]:
import re

def plot_country_index(merged_df):
    """Save one multi-panel PNG per country with index, dimensions and indicator scores.

    For every ISO code in ``merged_df`` the figure holds, top to bottom: the
    "WSI (Baseline)" line, one line per dimension, and one panel per dimension
    with the "<indicator> (score)" lines of its indicators. Values whose
    "<indicator> (source)" is 'ORI' are additionally drawn as dots. Each figure
    is written to ``imgs_path("baseline_indicators", <file name>)`` and the
    path is printed.

    Uses the module-level ``dimensions``, ``dimension_indicators`` and
    ``indicator_columns``.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Needs 'ISO_code', 'Year', 'Economy' and "WSI (Baseline)" columns.
        Dimension, score and source columns are plotted when present.
    """
    # Colour per indicator. The dimensions stay in the list so the indicator
    # colours keep their position in the palette.
    all_indicators = dimensions + indicator_columns
    color_palette = sns.color_palette("Set1", len(all_indicators))
    indicator_colors = dict(zip(all_indicators, color_palette))
    dimension_colors = ['green', 'orange', 'purple']

    # One panel for the index, one for the dimensions, one per dimension
    n_panels = 2 + len(dimensions)
    y_limits = (-0.05, 1.05)

    # --- Plot per country ---
    for iso_code in merged_df['ISO_code'].unique():
        country_data = merged_df[merged_df['ISO_code'] == iso_code]
        country_name = country_data['Economy'].iloc[0]
        if pd.isna(country_name):
            # No mapped name: fall back to the code so title and file name still work
            country_name = iso_code

        fig, axes = plt.subplots(n_panels, 1, figsize=(10, 3 * n_panels), sharex=True)
        fig.suptitle(f"WSI (Baseline), Dimensions & Indicators for {country_name} ({iso_code})", fontsize=18)

        # --- Plot 1: WSI (Baseline) ---
        sns.lineplot(x='Year', y="WSI (Baseline)", data=country_data, color='blue', label="WSI (Baseline)", ax=axes[0])
        axes[0].set_title("WSI (Baseline)", fontsize=14)
        axes[0].set_ylabel("Index Value")
        axes[0].legend(title="WSI", bbox_to_anchor=(1.05, 1), loc='upper left')

        # --- Plot 2: Dimensions ---
        for pos, dim in enumerate(dimensions):
            if dim in country_data.columns:
                # Cycle the fixed colours so extra dimensions are still drawn
                color = dimension_colors[pos % len(dimension_colors)]
                sns.lineplot(x='Year', y=dim, data=country_data, label=dim, color=color, ax=axes[1])
        axes[1].set_title("Dimensions (Equity, Protection, Resilience)", fontsize=14)
        axes[1].set_ylabel("Dimension Value")
        axes[1].legend(title="Dimension", bbox_to_anchor=(1.05, 1), loc='upper left')

        # --- Remaining plots: Indicator Scores by Dimension ---
        for idx, dim in enumerate(dimensions, start=2):
            for ind in dimension_indicators[dim]:
                score_col = f"{ind} (score)"
                source_col = f"{ind} (source)"
                if score_col not in country_data:
                    continue
                ind_color = indicator_colors.get(ind, 'grey')
                sns.lineplot(x='Year', y=score_col, data=country_data,
                             label=ind, color=ind_color, ax=axes[idx])
                if source_col in country_data:
                    # Mark the observations that come from the original data
                    ori_data = country_data[country_data[source_col] == 'ORI']
                    if not ori_data.empty:
                        sns.scatterplot(x='Year', y=score_col, data=ori_data,
                                        s=50, edgecolor='white',
                                        color=ind_color, ax=axes[idx])
            axes[idx].set_title(f"{dim} Indicators (Scores)", fontsize=14)
            axes[idx].set_ylabel("Indicator Value")
            axes[idx].legend(title="Indicator", bbox_to_anchor=(1.05, 1), loc='upper left')

        # Same padded 0..1 range on every panel
        for ax in axes:
            ax.set_ylim(*y_limits)

        # --- Layout and Save ---
        fig.tight_layout()

        safe_country = re.sub(r"[^\w\s-]", "", country_name).replace(" ", "_")
        filename = f"{safe_country}_baseline_indicators.png"
        filepath = imgs_path("baseline_indicators", filename)
        print(filepath)
        fig.savefig(filepath, dpi=300)
        # Close this figure explicitly so figures do not pile up across the loop
        plt.close(fig)


In [21]:
# Render and save one figure per country; the path of each saved file is printed
plot_country_index(df_scored)

C:\Users\kbuc0011\Documents\WSI\imgs\baseline_indicators\Afghanistan_baseline_indicators.png
C:\Users\kbuc0011\Documents\WSI\imgs\baseline_indicators\Angola_baseline_indicators.png
C:\Users\kbuc0011\Documents\WSI\imgs\baseline_indicators\Albania_baseline_indicators.png
C:\Users\kbuc0011\Documents\WSI\imgs\baseline_indicators\United_Arab_Emirates_baseline_indicators.png
C:\Users\kbuc0011\Documents\WSI\imgs\baseline_indicators\Argentina_baseline_indicators.png
C:\Users\kbuc0011\Documents\WSI\imgs\baseline_indicators\Armenia_baseline_indicators.png
C:\Users\kbuc0011\Documents\WSI\imgs\baseline_indicators\Antigua_and_Barbuda_baseline_indicators.png
C:\Users\kbuc0011\Documents\WSI\imgs\baseline_indicators\Australia_baseline_indicators.png
C:\Users\kbuc0011\Documents\WSI\imgs\baseline_indicators\Austria_baseline_indicators.png
C:\Users\kbuc0011\Documents\WSI\imgs\baseline_indicators\Azerbaijan_baseline_indicators.png
C:\Users\kbuc0011\Documents\WSI\imgs\baseline_indicators\Burundi_baseline_i

In [None]:
# shocks metric (refactor; check 2024 values)

In [None]:
# website graphs for 'explore by ...', see https://flourish.studio/ for example