In [1]:
import os

# Locate the repository through the REPO_DIR environment variable; every path used
# in this notebook is derived from it.
repo_dir = os.environ.get("REPO_DIR")
if repo_dir is None:
    # Fail with an actionable message instead of the opaque TypeError that
    # os.path.join(None, ...) would raise on the next line.
    raise RuntimeError(
        "The REPO_DIR environment variable is not set; point it at the repository root."
    )
code_dir = os.path.join(repo_dir, "code/")
data_dir = os.path.join(repo_dir, "data/")  # note: keeps a trailing "/"

# Work from the code directory (presumably so the project-local imports below resolve).
os.chdir(code_dir)

import matplotlib.pyplot as plt
import numpy as np
import scipy.linalg
import pickle
import sklearn 
import sys
import pandas as pd
from importlib import reload

from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
import seaborn as sns

from scipy.stats import spearmanr

import geopandas as gpd

import warnings

from mosaiks.utils.imports import *

# Key prediction functions are here
from analysis.prediction_utils import (X_matrix_to_demeaned_X,df_to_demeaned_y_vars,
make_train_pred_scatterplot as make_scatterplot, cv_solve, solver_kwargs, get_truth_preds_from_kfold_results,
                             predict_y_from_kfold_dict, generalized_demean)

### Generate ADM2 preds of HDI

Methodologically, we follow the same approach used in the NL and IWI downscaling experiments.

In [2]:
# Outcome column that every prediction step in this notebook is run for.
task = "Sub-national HDI"

# Directory holding the pickled k-fold model results.
model_directory = f"{data_dir}/model_data/"

# Population frame for the ADM2 polygons; merged onto the predictions further down.
pop_df = pd.read_pickle(f"{data_dir}/int/GHS_pop/pop_count_sums_for_ADM2_polygons.p")

In [3]:
# Primary model: per the file name, the within-country (demeaned) solve that uses
# both the RCF and the night-lights (NL) features.
path = (model_directory+
           "within_country_rcf_and_nl_demeaned_solve_all_outcomes_country_fold"
           "_DENSE_pop_weight=GHS_VIIRS_hist_bins_GHS_pop_weighted.pkl")

# Read through a context manager so the file handle is closed as soon as the
# pickle has been loaded. (Only unpickle files produced by this project.)
with open(path, "rb") as pkl_file:
    nl_and_rcf_demeaned_kfold_dict = pickle.load(pkl_file)

In [4]:
# Load the pre-computed MOSAIKS feature frames: one at ADM2 level and one that,
# per its file name, is the same feature set aggregated to ADM1.
mosaiks_features_direc = data_dir + "/features/mosaiks_features/"
X_adm2 = pd.read_pickle(mosaiks_features_direc + "ADM_2_regions_RCF_global_dense_GHS_POP_pop_weight=True.p").drop(columns = "shapeID")
X_adm1 = pd.read_pickle(mosaiks_features_direc + "ADM_2_regions_RCF_global_dense_aggregated_to_ADM1_GHS_POP_pop_weight=True.p")


# return_mean_frame=True asks the helper for the frame of group means rather than
# the demeaned features — presumably country (ADM0) means; confirm in prediction_utils.
X_adm0_not_weighted =X_matrix_to_demeaned_X(X_adm1, return_mean_frame = True)

# Temporarily tag each ADM2 row with the first three characters of its index label
# (e.g. "BRN"); this column name is what generalized_demean is told to group on.
X_adm2["shapeGroup"] = pd.Series(X_adm2.index).apply(lambda x : x[:3]).to_numpy()
X_adm2_demeaned = generalized_demean(X_adm2, X_adm0_not_weighted, "shapeGroup")

# Remove the helper column again so X_adm2 can be fed to the non-demeaned models below.
X_adm2.drop(columns = "shapeGroup", inplace=True)


In [5]:
# Night-lights (NL) feature directory. No leading "/" is needed here because
# data_dir already ends with one.
nl_features_direc = data_dir + "features/nl_features/"

In [6]:
# Show which sub-directories of night-lights features are available.
os.listdir(nl_features_direc)

['GDL_ADM0_polygons', 'geoBoundaries_ADM2', 'GDL_HDI_polygons', 'DHS_polygons']

In [7]:
# Night-lights feature frames: one read from the GDL HDI polygon directory (used as
# the ADM1-level frame) and one from the geoBoundaries ADM2 directory.
nl_adm1 = pd.read_pickle(nl_features_direc +
                         "GDL_HDI_polygons/viirs_percentile_binned_feats_GHS_pop_weighted_rasterio_method.p")
nl_adm2 = pd.read_pickle(nl_features_direc + 
                         "geoBoundaries_ADM2/viirs_geoBoundaries_ADM2_percentile_binned_feats_GHS_pop_weighted_rasterio_method.p")


# Tag each ADM2 row with the first three characters of its index label; the next
# cell passes this column name to generalized_demean.
nl_adm2["shapeGroup"] = pd.Series(nl_adm2.index).apply(lambda x : x[:3]).to_numpy()

In [8]:
## Make demeaned NL feats at ADM2
### (same construction as was used for X_adm2_demeaned)

# Frame of group means derived from the ADM1-level NL features
# (presumably country means — TODO confirm in prediction_utils).
nl_adm0_not_weighted = X_matrix_to_demeaned_X(nl_adm1, return_mean_frame = True)

nl_adm2_demean = generalized_demean(nl_adm2, nl_adm0_not_weighted, "shapeGroup")

# Select/reorder rows by the index of the demeaned RCF frame so that the two
# feature blocks line up row for row.
nl_adm2_demean = nl_adm2_demean.loc[X_adm2_demeaned.index]


# Drop the helper column, then align the raw NL features with the raw RCF features.
nl_adm2.drop(columns = "shapeGroup", inplace=True)
nl_adm2 = nl_adm2.loc[X_adm2.index]

## Now we can generate predictions

In [9]:
# Primary-model predictions for every ADM2 polygon, from the demeaned RCF features
# with the demeaned NL features passed as the fourth argument. As the variable name
# says, these are treated as deviations from the ADM0 mean; they are recentred on
# the observed ADM1 values further down.
predicted_deviations_from_adm0_mean = predict_y_from_kfold_dict(
    X_adm2_demeaned, nl_and_rcf_demeaned_kfold_dict, task, nl_adm2_demean
)

### We pull from the notebook that hierarchically links ADM1 and ADM2 shapefiles

In [10]:
# ADM2 polygons that have already been linked to a GDL ADM1 region.
path = f"{data_dir}/int/ADM2_to_GDL_link/adm2_polygons_linked_to_GDL_adm1.p"
adm2_shp = pd.read_pickle(path)

# Cleaned ADM1 HDI shapefile.
gpdf = pd.read_pickle(f"{data_dir}/int/GDL_HDI/HDI_ADM1_shapefile_clean.p")

# ADM1 indicators and indices; the `task` column of this frame supplies the
# ADM1 value that the predictions are recentred on below.
raw = pd.read_pickle(f"{data_dir}/int/GDL_HDI/HDI_indicators_and_indices_clean.p")

In [11]:
#compare size of polygons here

#adm2.to_crs({'init': 'epsg:6933'})["geometry"].area.mean()/1e6
# gpdf.crs = {"init": "EPSG:4326"}
# gpdf.to_crs({'init': 'epsg:6933'})["geometry"].area.mean()/1e6

In [12]:
# gpdf.crs = {"init": "EPSG:4326"}
# gpdf.to_crs({'init': 'epsg:6933'})["geometry"].area.mean()/1e6

In [13]:
# Attach the primary-model prediction to each ADM2 polygon (matched on shapeID).
adm2_shp = adm2_shp.merge(
    predicted_deviations_from_adm0_mean.rename("predicted_dev_from_adm0"),
    how="left",
    left_on="shapeID",
    right_index=True,
)

# Attach the ADM1 value of the task, looked up through the GDL ADM1 code.
adm2_shp = adm2_shp.merge(
    raw[task].rename("adm1_mean"),
    how="left",
    left_on="GDL_ADM1",
    right_index=True,
)

# Unweighted mean of the ADM2 predictions inside each ADM1 region, broadcast back to the rows.
adm1_pred_means = (
    adm2_shp.groupby("GDL_ADM1")["predicted_dev_from_adm0"]
    .mean()
    .rename("mean_of_pred_adm2_obs")
)
adm2_shp = adm2_shp.merge(
    adm1_pred_means,
    how="left",
    left_on="GDL_ADM1",
    right_index=True,
)

# Shift every prediction by its region's gap between the ADM1 value and the mean
# prediction, so that the ADM2 predictions average to the ADM1 value.
adm2_shp["adj_factor"] = adm2_shp["adm1_mean"] - adm2_shp["mean_of_pred_adm2_obs"]
adm2_shp["adjusted_preds"] = adm2_shp["adj_factor"] + adm2_shp["predicted_dev_from_adm0"]

#Clip to [0,1] for HDI. Can be outside range after re-centering.
adm2_shp["adjusted_preds"] = adm2_shp["adjusted_preds"].clip(lower=0, upper=1)

# From here on the frame is indexed by shapeID.
adm2_shp = adm2_shp.set_index("shapeID")

In [14]:
# Preview the first rows, including the new prediction and adjustment columns.
adm2_shp.head()

Unnamed: 0_level_0,shapeName,shapeISO,shapeGroup,shapeType,ADM1_shapeID,ADM0_shapeID,ADMHIERARCHY,geometry,GDL_ADM1,percent_overlap_GDL_ADM1,predicted_dev_from_adm0,adm1_mean,mean_of_pred_adm2_obs,adj_factor,adjusted_preds
shapeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
BRN-ADM2-3_0_0-B1,Kota Batu,,BRN,ADM2,BRN-ADM1-3_0_0-B3,BRN-ADM0-3_0_0-B1,BRN-ADM2-3_0_0-B1|BRN-ADM1-3_0_0-B3|BRN-ADM0-3...,"POLYGON ((115.01400 4.93203, 114.99643 4.94485...",BRNt,87.0,-0.030022,0.83,-0.039444,0.869444,0.839422
BRN-ADM2-3_0_0-B2,Kilanas,,BRN,ADM2,BRN-ADM1-3_0_0-B3,BRN-ADM0-3_0_0-B1,BRN-ADM2-3_0_0-B2|BRN-ADM1-3_0_0-B3|BRN-ADM0-3...,"POLYGON ((114.90051 4.87061, 114.87530 4.86444...",BRNt,100.0,0.01898,0.83,-0.039444,0.869444,0.888425
BRN-ADM2-3_0_0-B3,Tanjong Maya,,BRN,ADM2,BRN-ADM1-3_0_0-B4,BRN-ADM0-3_0_0-B1,BRN-ADM2-3_0_0-B3|BRN-ADM1-3_0_0-B4|BRN-ADM0-3...,"POLYGON ((114.70729 4.75307, 114.66990 4.78399...",BRNt,100.0,-0.074511,0.83,-0.039444,0.869444,0.794934
BRN-ADM2-3_0_0-B4,Serasa,,BRN,ADM2,BRN-ADM1-3_0_0-B3,BRN-ADM0-3_0_0-B1,BRN-ADM2-3_0_0-B4|BRN-ADM1-3_0_0-B3|BRN-ADM0-3...,"POLYGON ((115.04237 4.96770, 115.04136 4.97294...",BRNt,96.12,0.024899,0.83,-0.039444,0.869444,0.894344
BRN-ADM2-3_0_0-B5,Mentiri,,BRN,ADM2,BRN-ADM1-3_0_0-B3,BRN-ADM0-3_0_0-B1,BRN-ADM2-3_0_0-B5|BRN-ADM1-3_0_0-B3|BRN-ADM0-3...,"POLYGON ((115.01400 4.93203, 115.01859 4.93775...",BRNt,99.23,0.018028,0.83,-0.039444,0.869444,0.887473


#### Add population totals

In [15]:
# Left-join the population frame on the index (adm2_shp is indexed by shapeID at
# this point), so every polygon row is kept.
adm2_shp = adm2_shp.merge(pop_df, how="left", left_index=True,right_index=True)

#### Replace Ireland estimates with NaNs. These cannot be verified.

In [16]:
# Blank out the recentred predictions for rows whose shapeGroup is "IRL";
# the rows themselves stay in the frame.
adm2_shp.loc[adm2_shp["shapeGroup"] == "IRL","adjusted_preds"] = np.nan

In [17]:
## Drop Ireland
# Keep only the rows that do not belong to Ireland, then persist that subset.
adm2_shp_drop_irl = adm2_shp[adm2_shp["shapeGroup"].ne("IRL")]
adm2_shp_drop_irl.to_pickle(f"{data_dir}/preds/hdi_preds_at_adm2.p")

## Save a clean CSV version -- This is what we will release publicly

In [18]:
# Columns that go into the public file, in output order.
release_columns = [
    "shapeName",
    "shapeGroup",
    "ADM1_shapeID",
    "GDL_ADM1",
    "percent_overlap_GDL_ADM1",
    "adm1_mean",
    "total_pop",
    "adjusted_preds",
]

# Public-facing names for the internal column names.
release_names = {
    "adm1_mean": "adm1_HDI_Smits",
    "adjusted_preds": "predicted_adm2_HDI",
    "total_pop": "est_total_pop",
}

# NOTE: this is built from adm2_shp rather than adm2_shp_drop_irl, so Ireland rows
# are written too, with the predictions that were set to NaN earlier.
adm2_shp[release_columns].rename(columns=release_names).to_csv(
    data_dir + "/preds/hdi_preds_at_adm2.csv"
)


In [19]:
## Double check centering. True without truncating, False when truncating to [0,1] HDI range
#all(adm2_shp_drop_irl.groupby("GDL_ADM1")["adjusted_preds"].mean().round(8) == adm2_shp_drop_irl.groupby("GDL_ADM1")["adm1_mean"].first().round(8))

## Now produce HDI preds from the other, non-primary models

In [20]:
# Same expression as the model_directory assignment near the top of the notebook;
# re-assigned here unchanged.
model_directory = data_dir + "/model_data/"

### Cross country models

In [21]:
# Each pickle is read through a context manager so the file handle is closed
# right after loading. (Only unpickle files produced by this project.)

# Cross-country model results loaded into kfold_dict (used below with X_adm2 only).
path = (model_directory+
           "cross_country_kfold_solve_all_outcomes_country_fold_DENSE_pop_weight=GHS_POP.pkl")
with open(path, "rb") as pkl_file:
    kfold_dict = pickle.load(pkl_file)

# Cross-country model on the night-lights features only.
path = (model_directory+
           "cross_country_nl_solve_all_outcomes_country_fold_VIIRS_hist_bins_GHS_pop_weighted.pkl")
with open(path, "rb") as pkl_file:
    nl_kfold_dict = pickle.load(pkl_file)

# Cross-country model on the RCF and night-lights features together.
path = (model_directory+
           "cross_country_rcf_and_nl_solve_all_outcomes_country_fold_DENSE_pop_weight=GHS_VIIRS_hist_bins_GHS_pop_weighted.pkl")
with open(path, "rb") as pkl_file:
    nl_and_rcf_kfold_dict = pickle.load(pkl_file)


In [22]:
# Cross-country predictions at ADM2 from the raw (non-demeaned) features. The fourth
# argument carries the NL feature block for the joint model and is None otherwise.

# RCF + NL
rcf_nl_cc_preds = predict_y_from_kfold_dict(
    X_adm2, nl_and_rcf_kfold_dict, task, nl_adm2
).rename("rcf_and_nl_cc_preds")

# RCF only
rcf_cc_preds = predict_y_from_kfold_dict(
    X_adm2, kfold_dict, task, None
).rename("rcf_cc_preds")

# NL only
nl_cc_preds = predict_y_from_kfold_dict(
    nl_adm2, nl_kfold_dict, task, None
).rename("nl_cc_preds")


In [23]:
## adm0 models
# Each pickle is read through a context manager so the file handle is closed
# right after loading. (Only unpickle files produced by this project.)

# ADM0-level model on the RCF and night-lights features together.
path = (model_directory+
           "kfold_solve_adm0_level_GHS_pop_weighted_feats_rcf_nl_VIIRS_hist_bins_GHS_pop_weighted.pkl")
with open(path, "rb") as pkl_file:
    adm0_rcf_nl_kfold_dict = pickle.load(pkl_file)

# NOTE(review): the two literals below are joined with no separator, which yields
# "kfold_solve_adm0_model_fullGHS_pop_weighted_feats_DENSE.pkl" — confirm that this
# is the actual file name on disk.
path = (model_directory+
           "kfold_solve_adm0_model_full"
           "GHS_pop_weighted_feats_DENSE.pkl")
with open(path, "rb") as pkl_file:
    adm0_kfold_dict = pickle.load(pkl_file)

# NOTE(review): unlike the other model files, this name has no descriptive prefix —
# confirm it is the intended ADM0 night-lights model file.
path = (model_directory +
           "VIIRS_hist_bins_GHS_pop_weighted.pkl")
with open(path, "rb") as pkl_file:
    adm0_nl_kfold_dict = pickle.load(pkl_file)

In [24]:
# ADM0-model predictions at ADM2 from the raw (non-demeaned) features. The fourth
# argument carries the NL feature block for the joint model and is None otherwise.

# RCF + NL
adm0_rcf_nl_preds = predict_y_from_kfold_dict(
    X_adm2, adm0_rcf_nl_kfold_dict, task, nl_adm2
).rename("adm0_rcf_and_nl_preds")

# RCF only
adm0_rcf_preds = predict_y_from_kfold_dict(
    X_adm2, adm0_kfold_dict, task, None
).rename("adm0_rcf_preds")

# NL only
adm0_nl_preds = predict_y_from_kfold_dict(
    nl_adm2, adm0_nl_kfold_dict, task, None
).rename("adm0_nl_preds")


In [25]:
## Within country model
# Each pickle is read through a context manager so the file handle is closed
# right after loading. (Only unpickle files produced by this project.)

# Within-country (demeaned) model results used below with the demeaned RCF features only.
path = (model_directory+
           "within_country_demeaned_kfold_solve_all_outcomes_country_fold_DENSE_pop_weight=GHS_POP.pkl")
with open(path, "rb") as pkl_file:
    demeaned_kfold_dict = pickle.load(pkl_file)

# Within-country (demeaned) model on the night-lights features only.
path = (model_directory+
           "within_country_nl_demeaned_solve_all_outcomes_country_fold_VIIRS_hist_bins_GHS_pop_weighted.pkl")
with open(path, "rb") as pkl_file:
    nl_demeaned_kfold_dict = pickle.load(pkl_file)

In [26]:
# Within-country predictions at ADM2 from the demeaned features. The fourth argument
# carries the demeaned NL feature block for the joint model and is None otherwise.

# RCF + NL (same inputs as the primary model above)
within_rcf_and_nl_preds = predict_y_from_kfold_dict(
    X_adm2_demeaned, nl_and_rcf_demeaned_kfold_dict, task, nl_adm2_demean
).rename("within_rcf_and_nl_preds")

# RCF only
within_rcf_preds = predict_y_from_kfold_dict(
    X_adm2_demeaned, demeaned_kfold_dict, task, None
).rename("within_rcf_preds")

# NL only
within_nl_preds = predict_y_from_kfold_dict(
    nl_adm2_demean, nl_demeaned_kfold_dict, task, None
).rename("within_nl_preds")


In [27]:
def recenter_adm1(pred_series, adm2_shp=adm2_shp, clip_bounds=(0, 1)):
    """Shift predictions so their mean within each ADM1 region equals that region's value.

    Parameters
    ----------
    pred_series : pandas.Series
        Predictions indexed by the same labels as ``adm2_shp``. Its ``name`` is used
        as the working column label and is kept as the name of the result.
    adm2_shp : pandas.DataFrame, optional
        Frame supplying the ``"adm1_mean"`` and ``"GDL_ADM1"`` columns. Defaults to
        the notebook-level ``adm2_shp`` as it exists when this cell is executed.
    clip_bounds : sequence of two numbers, optional
        ``(lower, upper)`` bounds applied after recentring. Defaults to ``(0, 1)``.

    Returns
    -------
    pandas.Series
        The recentred predictions, clipped to ``clip_bounds``. Rows for which no
        per-region adjustment exists are absent, because the adjustment is merged
        with pandas' default inner join.

    Raises
    ------
    AssertionError
        If, before clipping, any region's mean prediction differs from its
        ``adm1_mean`` by more than 1e-6 (or is NaN).
    """
    col = pred_series.name
    merged = pred_series.to_frame().merge(
        adm2_shp[["adm1_mean", "GDL_ADM1"]],
        how="left",
        left_index=True,
        right_index=True,
    )

    # Per-region shift: the ADM1 value minus the mean prediction inside that region.
    by_region = merged.groupby("GDL_ADM1")
    adj = by_region["adm1_mean"].first() - by_region[col].mean()

    merged = merged.merge(adj.rename("adj"), left_on="GDL_ADM1", right_index=True)
    merged[col] = merged["adj"] + merged[col]

    # Sanity check with an absolute tolerance. Comparing values rounded to six
    # decimals can fail spuriously when two nearly identical floats fall on
    # opposite sides of a rounding boundary.
    recentred = merged.groupby("GDL_ADM1")
    assert np.allclose(
        recentred["adm1_mean"].mean(), recentred[col].mean(), rtol=0, atol=1e-6
    ), "re-centred ADM2 predictions do not average to the ADM1 value"

    return np.clip(merged[col], *clip_bounds)
    
    

In [28]:
# One column per model. The cross-country and ADM0 predictions are used as produced;
# the within-country predictions are recentred with recenter_adm1 first.
joint_preds = pd.concat(
    [
        # cross-country models
        rcf_nl_cc_preds,
        rcf_cc_preds,
        nl_cc_preds,
        # ADM0 models
        adm0_rcf_nl_preds,
        adm0_rcf_preds,
        adm0_nl_preds,
        # within-country models
        recenter_adm1(within_rcf_and_nl_preds),
        recenter_adm1(within_rcf_preds),
        recenter_adm1(within_nl_preds),
    ],
    axis="columns",
)

In [29]:
# Drop Ireland: keep only the rows whose index label does not start with "IRL".
joint_preds = joint_preds.loc[~joint_preds.index.str.startswith("IRL")]

In [30]:
# Index labels of adm2_shp that are also present in joint_preds, in adm2_shp's order.
idxs = adm2_shp.index[adm2_shp.index.isin(joint_preds.index)]

In [31]:
## Check that these are identical

In [32]:
# Sanity check: on the shared rows, the recentred within-country RCF+NL column should
# equal the adjusted_preds column computed earlier (both rounded to 6 decimals).
all(joint_preds.loc[idxs, "within_rcf_and_nl_preds"].round(6) == adm2_shp.loc[idxs,"adjusted_preds"].round(6))

True

In [33]:
# Persist the predictions from all models. No leading "/" is needed in the relative
# part because data_dir already ends with one.
joint_preds.to_pickle(data_dir + "preds/hdi_preds_from_all_models_at_adm2.p")