In [1]:
import os

# Resolve the repository root from the REPO_DIR environment variable. Fail with
# an actionable message when it is unset; otherwise os.path.join(None, ...)
# raises an opaque TypeError.
repo_dir = os.environ.get("REPO_DIR")
if repo_dir is None:
    raise RuntimeError(
        "The REPO_DIR environment variable is not set; point it at the "
        "repository root before running this notebook."
    )

# The trailing slashes matter: later cells build paths as `data_dir + "..."`.
code_dir = os.path.join(repo_dir, "code/")
data_dir = os.path.join(repo_dir, "data/")

# Work from the code directory (presumably so the local `prediction_utils`
# module imported below can be found).
os.chdir(code_dir)

import matplotlib.pyplot as plt
import numpy as np
import scipy.linalg
import pickle
import sklearn 
import sys
import pandas as pd
from importlib import reload

from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
import seaborn as sns

from mosaiks.utils.imports import *

# Key prediction functions are here
from prediction_utils import (X_matrix_to_demeaned_X, df_to_demeaned_y_vars,
                              get_within_perf_from_kfold_dict,cv_solve, solver_kwargs,
                            get_test_idxs,predict_y_from_kfold_dict)
                              
from prediction_utils import make_train_pred_scatterplot as make_scatterplot

## Evaluate on test set

On Feb 24, 2022, we decided to evaluate on the test set. We implement all the same procedures as `code/analysis/hdi_and_iwi_model_training.ipynb` and produce the test set results table with the same structure.


In July, 2023 we updated our label data from the Global Data Lab. Additional provincial observations were added to the training and test sets. The 35 test countries that were included in the test set originally have not changed.

In Dec, 2023 we changed our NL data source to VIIRS from DMSP. After making this decision based on reviewer feedback, we ran this notebook again to report updated test set results.


****

In order to execute this notebook, you will need saved model information which is included in the GitHub repo.

### Read in Xs and Ys

In [2]:
# MOSAIKS features for the ADM1 polygons; the GDLCODE column is dropped after loading.
mosaiks_feature_path = (data_dir + "features/mosaiks_features/"
                        "GDL_ADM1_polygon_X_creation_GHS_POP.p")
adm1_X = pd.read_pickle(mosaiks_feature_path)
adm1_X = adm1_X.drop(columns="GDLCODE")

# NL (VIIRS) features, selected and ordered by the index of the MOSAIKS features.
filepath = (data_dir+"features/nl_features/GDL_HDI_polygons/"
            "viirs_percentile_binned_feats_GHS_pop_weighted_rasterio_method.p")
nl_all_rows = pd.read_pickle(filepath)
nl = nl_all_rows.loc[adm1_X.index]

In [3]:
# Cleaned HDI indicator labels, selected and ordered by the feature-matrix index.
hdi_label_path = (data_dir + "int/GDL_HDI/"
                  "HDI_indicators_and_indices_clean.p")
raw = pd.read_pickle(hdi_label_path).loc[adm1_X.index]

In [4]:
# IWI labels. Per the authors' note, this particular file uses nearest neighbor
# linear interpolation for a max of 7 years, which is consistent with treating
# the DHS cluster observations as a cross section.
iwi_path = (data_dir + "raw/GDL_IWI/"
"GDL-Mean-International-Wealth-Index-(IWI)-score-of-region-data_linear_interp=True_nearest_val=8.csv")
iwi = pd.read_csv(iwi_path)

# A few countries have no national ("Total") row (Argentina subnationals).
# They are dropped because the data quality is unreliable anyway.
g = iwi.groupby("ISO_Code")
lacks_total_row = g["Region"].agg(lambda regions: "Total" not in list(regions))
no_country = g.first()[lacks_total_row].index

# Drop those countries (ARG), then index by GDLCODE and expose the 2019 column
# under the name "iwi".
iwi = (
    iwi.set_index("ISO_Code")
    .drop(no_country)
    .reset_index()
    .set_index("GDLCODE")
    .rename(columns={"2019": "iwi"})
)


In [5]:
# Attach the IWI labels to the label frame. Any "iwi" column left over from a
# previous run of this cell is dropped first, so re-running the cell does not
# make `join` fail on overlapping column names.
raw = raw.drop(columns="iwi", errors="ignore").join(iwi[["iwi"]], how="left")

### Subset to test set

In [6]:
# Values returned by get_test_idxs() are matched against the ISO_Code column,
# giving a boolean row mask over `raw`.
test_idxs = get_test_idxs()
test_locs = raw["ISO_Code"].isin(test_idxs)

# Independent copy of the test rows, so later edits do not touch `raw`.
test_df = raw[test_locs].copy()

# Feature matrices selected and ordered by the test-label index.
X_test, X_nl = adm1_X.loc[test_df.index], nl.loc[test_df.index]

In [7]:
# Outcome columns evaluated throughout the notebook, in reporting order.
tasks = [
    'Sub-national HDI',
    "Life expectancy",
    "Mean years schooling",
    "Expected years schooling",
    "GNI per capita in thousands of US$ (2011 PPP)",
    "iwi",
]

# Whether each task is evaluated in logs: only GNI per capita is.
log_task = dict.fromkeys(tasks, False)
log_task["GNI per capita in thousands of US$ (2011 PPP)"] = True

# Directory holding the saved model pickles loaded in the cells below.
model_directory = data_dir + "model_data/"

# Make sure every outcome column is numeric before computing metrics.
test_df[tasks] = test_df[tasks].astype(float)

In [8]:
# Demeaned versions of the MOSAIKS and NL test feature matrices, used by the
# within-country models further down.
X_demeaned, X_demeaned_nl = (
    X_matrix_to_demeaned_X(features) for features in (X_test, X_nl)
)

### Cross-country RCF

In [9]:
# Saved cross-country RCF model information.
path = (model_directory+
           "cross_country_kfold_solve_all_outcomes_country_fold_DENSE_pop_weight=GHS_POP.pkl")

# Open via a context manager so the file handle is closed after loading.
# NOTE: pickle executes arbitrary code on load; only load files shipped with the repo.
with open(path, "rb") as model_file:
    kfold_dict = pickle.load(model_file)


In [10]:
# Score the cross-country RCF model on the test set, one task at a time.
cc_rcf_perf_dict, cc_train_n = {}, {}

for task in tasks:
    preds = predict_y_from_kfold_dict(X_test, kfold_dict, task)

    # Tasks flagged in `log_task` are compared in logs.
    truth = np.log(test_df[task]) if log_task[task] else test_df[task]

    # Only IWI is masked for missing labels; all other tasks keep every row.
    keep = ~truth.isnull() if task == "iwi" else np.ones(len(truth), dtype=bool)

    cc_rcf_perf_dict[task] = get_within_perf_from_kfold_dict(
        None, task, metric="ALL", truth=truth[keep], preds=preds[keep]
    )

    # Length of the concatenated "locations_test" arrays stored with the model.
    stacked_locations = np.hstack(kfold_dict[task]["locations_test"])
    cc_train_n[task] = len(stacked_locations)


### Cross-country NL model

In [11]:
# Saved cross-country NL model information.
path = (model_directory+
           "cross_country_nl_solve_all_outcomes_country_fold_VIIRS_hist_bins_GHS_pop_weighted.pkl")

# Open via a context manager so the file handle is closed after loading.
# NOTE: pickle executes arbitrary code on load; only load files shipped with the repo.
with open(path, "rb") as model_file:
    nl_kfold_dict = pickle.load(model_file)

In [12]:
# Score the cross-country NL model on the test set, one task at a time.
cc_nl_perf_dict = {}

for task in tasks:
    preds = predict_y_from_kfold_dict(X_nl, nl_kfold_dict, task)

    # Tasks flagged in `log_task` are compared in logs.
    truth = np.log(test_df[task]) if log_task[task] else test_df[task]

    # Only IWI is masked for missing labels; all other tasks keep every row.
    keep = ~truth.isnull() if task == "iwi" else np.ones(len(truth), dtype=bool)

    cc_nl_perf_dict[task] = get_within_perf_from_kfold_dict(
        None, task, metric="ALL", truth=truth[keep], preds=preds[keep]
    )

### Cross-country RCF + NL

In [13]:
# Saved cross-country RCF + NL model information.
path = (model_directory+
           "cross_country_rcf_and_nl_solve_all_"
           "outcomes_country_fold_DENSE_pop_weight=GHS_VIIRS_hist_bins_GHS_pop_weighted.pkl")

# Open via a context manager so the file handle is closed after loading.
# NOTE: pickle executes arbitrary code on load; only load files shipped with the repo.
with open(path, "rb") as model_file:
    nl_and_rcf_kfold_dict = pickle.load(model_file)

In [14]:
# Score the cross-country RCF + NL model on the test set, one task at a time.
cc_nl_and_rcf_perf_dict = {}

for task in tasks:
    # The NL features are passed alongside the MOSAIKS features.
    preds = predict_y_from_kfold_dict(X_test, nl_and_rcf_kfold_dict, task, X_nl)

    # Tasks flagged in `log_task` are compared in logs.
    truth = np.log(test_df[task]) if log_task[task] else test_df[task]

    # Only IWI is masked for missing labels; all other tasks keep every row.
    keep = ~truth.isnull() if task == "iwi" else np.ones(len(truth), dtype=bool)

    cc_nl_and_rcf_perf_dict[task] = get_within_perf_from_kfold_dict(
        None, task, metric="ALL", truth=truth[keep], preds=preds[keep]
    )

## Now we repeat for the ADM0 models

### Country-level (ADM0) RCF

In [15]:
# Saved ADM0-level RCF model information.
path = (model_directory+
           "kfold_solve_adm0_model_full"
           "GHS_pop_weighted_feats_DENSE.pkl")

# Open via a context manager so the file handle is closed after loading.
# NOTE: pickle executes arbitrary code on load; only load files shipped with the repo.
with open(path, "rb") as model_file:
    adm0_kfold_dict = pickle.load(model_file)

In [16]:
# Score the ADM0-trained RCF model on the ADM1 test observations, and record
# the per-task sample sizes.
adm0_pred_adm1_rcf_perf_dict, adm0_train_n, adm1_test_n = {}, {}, {}

for task in tasks:
    preds = predict_y_from_kfold_dict(X_test, adm0_kfold_dict, task)

    # Tasks flagged in `log_task` are compared in logs.
    truth = np.log(test_df[task]) if log_task[task] else test_df[task]

    # Only IWI is masked for missing labels; all other tasks keep every row.
    keep = ~truth.isnull() if task == "iwi" else np.ones(len(truth), dtype=bool)

    adm0_pred_adm1_rcf_perf_dict[task] = get_within_perf_from_kfold_dict(
        None, task, metric="ALL", truth=truth[keep], preds=preds[keep]
    )

    # Length of the concatenated "locations_test" arrays stored with the model.
    stacked_locations = np.hstack(adm0_kfold_dict[task]["locations_test"])
    adm0_train_n[task] = len(stacked_locations)

    # Number of test rows actually scored for this task.
    adm1_test_n[task] = keep.sum()

### Country-level (ADM0) NL

In [17]:
# Saved NL model information used for the ADM0-level NL evaluation below.
path = (model_directory +
           "VIIRS_hist_bins_GHS_pop_weighted.pkl")

# Open via a context manager so the file handle is closed after loading.
# NOTE: pickle executes arbitrary code on load; only load files shipped with the repo.
with open(path, "rb") as model_file:
    adm0_nl_kfold_dict = pickle.load(model_file)


In [18]:
# Score the ADM0-trained NL model on the ADM1 test observations.
adm0_pred_adm1_nl_perf_dict = {}

for task in tasks:
    preds = predict_y_from_kfold_dict(X_nl, adm0_nl_kfold_dict, task)

    # Tasks flagged in `log_task` are compared in logs.
    truth = np.log(test_df[task]) if log_task[task] else test_df[task]

    # Only IWI is masked for missing labels; all other tasks keep every row.
    keep = ~truth.isnull() if task == "iwi" else np.ones(len(truth), dtype=bool)

    adm0_pred_adm1_nl_perf_dict[task] = get_within_perf_from_kfold_dict(
        None, task, metric="ALL", truth=truth[keep], preds=preds[keep]
    )

### Country-level (ADM0) RCF+NL

In [19]:
# Saved ADM0-level RCF + NL model information.
path = (model_directory+
           "kfold_solve_adm0_level_GHS_pop_weighted_feats_rcf_nl_VIIRS_hist_bins_GHS_pop_weighted.pkl")

# Open via a context manager so the file handle is closed after loading.
# NOTE: pickle executes arbitrary code on load; only load files shipped with the repo.
with open(path, "rb") as model_file:
    adm0_rcf_nl_kfold_dict = pickle.load(model_file)



In [20]:
# This cell used to read `task` left over from the preceding evaluation loop.
# Pin it to the last task, which is the value it has under Restart & Run All,
# so the result no longer depends on cell execution order.
# NOTE(review): confirm the last task (rather than HDI) is the intended one.
adm0_preds = predict_y_from_kfold_dict(X_test, adm0_rcf_nl_kfold_dict, tasks[-1], X_nl)

In [21]:
# This cell used to read `task` left over from the preceding evaluation loop.
# Pin it to the last task, which is the value it has under Restart & Run All,
# so the result no longer depends on cell execution order.
# NOTE(review): confirm the last task (rather than HDI) is the intended one.
adm1_preds = predict_y_from_kfold_dict(X_test, nl_and_rcf_kfold_dict, tasks[-1], X_nl)

In [22]:
# Score the ADM0-trained RCF + NL model on the ADM1 test observations,
# printing the R^2 of each task along the way.
adm0_pred_adm1_rcf_nl_perf_dict = {}

for task in tasks:
    preds = predict_y_from_kfold_dict(X_test, adm0_rcf_nl_kfold_dict, task, X_nl)

    # Tasks flagged in `log_task` are compared in logs.
    truth = np.log(test_df[task]) if log_task[task] else test_df[task]

    # Only IWI is masked for missing labels; all other tasks keep every row.
    keep = ~truth.isnull() if task == "iwi" else np.ones(len(truth), dtype=bool)

    # Task name on one line, its R^2 on the next.
    task_r2 = sklearn.metrics.r2_score(truth[keep], preds[keep])
    print(task, task_r2, sep="\n")

    adm0_pred_adm1_rcf_nl_perf_dict[task] = get_within_perf_from_kfold_dict(
        None, task, metric="ALL", truth=truth[keep], preds=preds[keep]
    )

Sub-national HDI
0.7916197010898719
Life expectancy
0.6517522803774589
Mean years schooling
0.6348670879171606
Expected years schooling
0.6996837574317191
GNI per capita in thousands of US$ (2011 PPP)
0.7612026273589092
iwi
0.5052681782969255


## Now we repeat for the demeaned models

### Within-country RCF

In [23]:
def get_idxs_without_adm0_observations(df, country_col = "ISO_Code", count_col = None):
    """
    Return the index labels of the rows of `df` that are not ADM0-only observations.

    For country-demeaned outcomes, we want to remove all the ADM0 level observations.
    In other words, if a country contributes a single observation there is no
    within-country variation, so that row should not be kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Label frame containing `country_col` and `count_col`.
    country_col : str
        Column holding the country identifier used for grouping.
    count_col : str, optional
        Column whose non-null values are counted per country. Defaults to the
        first entry of the notebook-level `tasks` list.

    Returns
    -------
    pandas.Index
        Index labels (in the original row order) of the rows whose country does
        not have exactly one non-null `count_col` value.

    Raises
    ------
    AssertionError
        If the number of rows removed differs from the number of single-count countries.
    """
    if count_col is None:
        count_col = tasks[0]

    # Number of non-null observations per country.
    country_counts = df.groupby(country_col)[count_col].count()

    # Countries with exactly one observation are treated as ADM0-only.
    adm0_idxs = country_counts[country_counts == 1].index

    # Keep every row whose country is not ADM0-only; the index name is preserved.
    in_adm0_only_country = df[country_col].isin(adm0_idxs).to_numpy()
    adm1_idxs = df.index[~in_adm0_only_country]

    assert len(adm1_idxs) == len(df)-len(adm0_idxs)

    return adm1_idxs

In [24]:
# Saved within-country (demeaned) RCF model information.
path = (model_directory+
           "within_country_demeaned_kfold_solve_all_outcomes_country_fold_DENSE_pop_weight=GHS_POP.pkl")

# Open via a context manager so the file handle is closed after loading.
# NOTE: pickle executes arbitrary code on load; only load files shipped with the repo.
with open(path, "rb") as model_file:
    demeaned_kfold_dict = pickle.load(model_file)

In [25]:
# Score the within-country (demeaned) RCF model on the test set and record
# the per-task sample sizes.
demean_rcf_perf_dict = {}

demean_n_train = {}

demean_test_n = {}

# The rows to keep depend only on `test_df`, not on the task, so compute them
# once instead of on every loop iteration.
within = get_idxs_without_adm0_observations(test_df)

for task in tasks:
    print(task)
    preds = predict_y_from_kfold_dict(X_demeaned,demeaned_kfold_dict, task)

    truth = df_to_demeaned_y_vars(task, test_df, log_before_diff = log_task[task])

    # Drop the ADM0-only observations from both predictions and labels.
    preds = preds.loc[within]
    truth = truth.loc[within]

    # Only IWI is masked for missing labels; all other tasks keep every row.
    if task == "iwi":
        keep = ~truth.isnull()
    else:
        keep = np.full(len(truth), True)

    demean_rcf_perf_dict[task] = get_within_perf_from_kfold_dict(None,
    task, metric="ALL", truth=truth[keep], preds=preds[keep],
                                                                demeaned_input=True, not_demeaned_df=test_df)

    # Length of the concatenated "locations_test" arrays stored with the model.
    demean_n_train[task] = len(np.hstack(demeaned_kfold_dict[task]["locations_test"]))

    # Number of test rows actually scored for this task.
    demean_test_n[task] = keep.sum()
    

Sub-national HDI
Life expectancy
Mean years schooling
Expected years schooling
GNI per capita in thousands of US$ (2011 PPP)
iwi


### Within-country NL

In [26]:
# Saved within-country (demeaned) NL model information.
path = (model_directory+
           "within_country_nl_demeaned_solve_all_outcomes_country_fold_VIIRS_hist_bins_GHS_pop_weighted.pkl")

# Open via a context manager so the file handle is closed after loading.
# NOTE: pickle executes arbitrary code on load; only load files shipped with the repo.
with open(path, "rb") as model_file:
    nl_demeaned_kfold_dict = pickle.load(model_file)


In [27]:
# Score the within-country (demeaned) NL model on the test set.
demean_nl_perf_dict = {}

# The rows to keep depend only on `test_df`, not on the task, so compute them
# once instead of on every loop iteration.
within = get_idxs_without_adm0_observations(test_df)

for task in tasks:
    preds = predict_y_from_kfold_dict(X_demeaned_nl,nl_demeaned_kfold_dict, task)

    truth = df_to_demeaned_y_vars(task, test_df, log_before_diff = log_task[task])

    # Drop the ADM0-only observations from both predictions and labels.
    preds = preds.loc[within]
    truth = truth.loc[within]

    # Only IWI is masked for missing labels; all other tasks keep every row.
    if task == "iwi":
        keep = ~truth.isnull()
    else:
        keep = np.full(len(truth), True)

    demean_nl_perf_dict[task] = get_within_perf_from_kfold_dict(None,
    task, metric="ALL", truth=truth[keep], preds=preds[keep],
                                                                demeaned_input=True, not_demeaned_df=test_df)



### Within-country RCF+NL

In [28]:
# Saved within-country (demeaned) RCF + NL model information.
path = (model_directory+
           "within_country_rcf_and_nl_demeaned_solve_all_outcomes_country_fold"
           "_DENSE_pop_weight=GHS_VIIRS_hist_bins_GHS_pop_weighted.pkl")

# Open via a context manager so the file handle is closed after loading.
# NOTE: pickle executes arbitrary code on load; only load files shipped with the repo.
with open(path, "rb") as model_file:
    nl_and_rcf_demeaned_kfold_dict = pickle.load(model_file)


In [29]:
# Score the within-country (demeaned) RCF + NL model on the test set and save
# the HDI predictions for later use.
demeaned_nl_and_rcf_demeaned_perf_dict = {}

# The rows to keep depend only on `test_df`, not on the task, so compute them
# once instead of on every loop iteration.
within = get_idxs_without_adm0_observations(test_df)

for task in tasks:
    preds = predict_y_from_kfold_dict(X_demeaned,nl_and_rcf_demeaned_kfold_dict, task,X_demeaned_nl)

    truth = df_to_demeaned_y_vars(task, test_df, log_before_diff = log_task[task])

    # Drop the ADM0-only observations from both predictions and labels.
    preds = preds.loc[within]
    truth = truth.loc[within]

    # Only IWI is masked for missing labels; all other tasks keep every row.
    if task == "iwi":
        keep = ~truth.isnull()
    else:
        keep = np.full(len(truth), True)

    demeaned_nl_and_rcf_demeaned_perf_dict[task] = get_within_perf_from_kfold_dict(None,
    task, metric="ALL", truth=truth[keep], preds=preds[keep],
                                                                                   demeaned_input=True, not_demeaned_df=test_df)

    # Save HDI preds from demeaned (primary model). This is needed for maps
    if task == tasks[0]:
        path = data_dir + "preds/hdi_test_set_preds_df_demean_rcf_nl.p"
        # Same call as above, but with return_df=True to get a frame to pickle.
        df = get_within_perf_from_kfold_dict(None, task, metric="ALL",
                                     truth=truth[keep],
                                     preds=preds[keep],
                                     return_df=True,
                                    demeaned_input=True,
                                     not_demeaned_df=test_df)
        df.to_pickle(path)
        
        
    
    


### Make test set tables

In [30]:
# Assemble the HDI test-set results table.
# All LaTeX markup is written as raw strings: in a normal string literal the
# "\t" of "\textbf" is a tab, the "\r" of "\rho" is a carriage return, and
# "\e" is an invalid escape sequence.
task = tasks[0]

# One performance dict per model, in the row order used below:
# rows 0-2 ADM0-trained, rows 3-5 cross-country, rows 6-8 within-country.
outcomes_dicts =[

adm0_pred_adm1_rcf_perf_dict,
adm0_pred_adm1_nl_perf_dict,
adm0_pred_adm1_rcf_nl_perf_dict,

cc_rcf_perf_dict,
cc_nl_perf_dict,
cc_nl_and_rcf_perf_dict,

demean_rcf_perf_dict,
demean_nl_perf_dict,
demeaned_nl_and_rcf_demeaned_perf_dict

]

outcomes = [outcome[task] for outcome in outcomes_dicts]

table = pd.DataFrame(outcomes).round(2)

# Negative metrics are reported as "< 0" rather than as a number.
table[table < 0] = "$< 0$"


rename_dict = {"pearson": r"$\rho^{2}$", "spearman":"Spearman r", "r2":r"$R^{2}$",
               "within_adm0_pearson": r"$\rho^{2}$", "within_adm0_spearman":"Spearman r", "within_adm0_r2":r"$R^{2}$",

}

table = table.rename(columns = rename_dict)

# Row labels: training level and feature set.
table.loc[0:2,"HDI"] = r"\textbf{Country level}"
table.loc[3:5,"HDI"] = r"\textbf{Province level}"
table.loc[6:8,"HDI"] = r"\textbf{Within-country}"

table.loc[0:2,""] = [r"\textbf{MOSAIKS}",r"\textbf{NL}",r"\textbf{MOSAIKS+NL}"]
table.loc[3:5,""] = [r"\textbf{MOSAIKS}",r"\textbf{NL}",r"\textbf{MOSAIKS+NL}"]
table.loc[6:8,""] = [r"\textbf{MOSAIKS}",r"\textbf{NL}",r"\textbf{MOSAIKS+NL}"]

# Rows 9-11 hold header text; they become column header levels further down.
table.loc[9] = r"\textbf{Predicted at ADM1 level} " + "(n={:,})".format(adm1_test_n[task])

table.loc[10] = r"\emph{Full variation performance}"
# Positions 3-5 are presumably the within_adm0_* metric columns - TODO confirm.
table.iloc[10,3:6] = r"\emph{Within-country performance}"

table.drop(columns = "Spearman r", inplace=True)

# Column numbering "(1)", "(2)", ...
table.loc[11] = ("(" + pd.Series(np.arange(1,table.shape[1]+1)).astype(str) +")").to_numpy()

# Move the header rows (9, 10, metric name, 11) into a column MultiIndex.
table = table.T.reset_index().set_index([9,10,"index",11])

tab = table.T

tab.columns.names = ([None, None, None, None])

# The last two columns (training level, feature set) become the row index.
table = tab.set_index([tab.columns[-2],tab.columns[-1]])

table.index.names =[r"\emph{\textbf{HDI trained at:}}", r"\emph{\textbf{Features}}"]

# Reorder rows: within-country, province level, country level, with MOSAIKS+NL
# first inside each group.
table = table.iloc[[8,6,7,5,3,4,2,0,1]]

In [31]:
# Display the assembled HDI test-set results table.
table

Unnamed: 0_level_0,Unnamed: 1_level_0,\textbf{Predicted at ADM1 level} (n=378),\textbf{Predicted at ADM1 level} (n=378),\textbf{Predicted at ADM1 level} (n=378),\textbf{Predicted at ADM1 level} (n=378)
Unnamed: 0_level_1,Unnamed: 1_level_1,\emph{Full variation performance},\emph{Full variation performance},\emph{Within-country performance},\emph{Within-country performance}
Unnamed: 0_level_2,Unnamed: 1_level_2,$\rho^{2}$,$R^{2}$,$\rho^{2}$,$R^{2}$
Unnamed: 0_level_3,Unnamed: 1_level_3,(1),(2),(3),(4)
\emph{	extbf{HDI trained at:}},\emph{	extbf{Features}},Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4
\textbf{Within-country},\textbf{MOSAIKS+NL},0.97,0.97,0.43,0.42
\textbf{Within-country},\textbf{MOSAIKS},0.96,0.96,0.25,0.21
\textbf{Within-country},\textbf{NL},0.97,0.97,0.45,0.45
\textbf{Province level},\textbf{MOSAIKS+NL},0.87,0.87,0.4,0.09
\textbf{Province level},\textbf{MOSAIKS},0.8,0.8,0.19,$< 0$
\textbf{Province level},\textbf{NL},0.64,0.62,0.42,$< 0$
\textbf{Country level},\textbf{MOSAIKS+NL},0.79,0.79,0.29,$< 0$
\textbf{Country level},\textbf{MOSAIKS},0.66,0.62,0.13,$< 0$
\textbf{Country level},\textbf{NL},0.6,0.56,0.38,$< 0$


In [36]:
# Render the results table as LaTeX. Escaping is disabled so the LaTeX markup
# stored in the labels is emitted as-is.
latex_table = table.to_latex(
    bold_rows=False,
    column_format="ll||cc|cc",
    escape=False,
    multicolumn_format="c",
)
print(latex_table)

\begin{tabular}{ll||cc|cc}
\toprule
                       &             & \multicolumn{4}{c}{\textbf{Predicted at ADM1 level} (n=378)} \\
                       &             & \multicolumn{2}{c}{\emph{Full variation performance}} & \multicolumn{2}{c}{\emph{Within-country performance}} \\
                       &             &                               $\rho^{2}$ & $R^{2}$ &                        $\rho^{2}$ & $R^{2}$ \\
                       &             &                                      (1) &     (2) &                               (3) &     (4) \\
\emph{\textbf{HDI trained at:}} & \emph{\textbf{Features}} &                                          &         &                                   &         \\
\midrule
\textbf{Within-country} & \textbf{MOSAIKS+NL} &                                     0.97 &    0.97 &                              0.43 &    0.42 \\
                       & \textbf{MOSAIKS} &                                     0.96 &    0.96 &                 