In [30]:
import os

# Resolve project directories from the REPO_DIR environment variable.
# Fail early with an explicit message when it is unset; otherwise
# os.path.join(None, ...) raises an opaque TypeError.
repo_dir = os.environ.get("REPO_DIR")
if repo_dir is None:
    raise RuntimeError(
        "The REPO_DIR environment variable is not set; "
        "point it at the root of the repository before running this notebook."
    )
code_dir = os.path.join(repo_dir, "code/")
data_dir = os.path.join(repo_dir, "data/")

# Switch the working directory to the code folder before the imports below run.
os.chdir(code_dir)

import matplotlib.pyplot as plt
import numpy as np
import scipy.linalg
import pickle
import sklearn 
import sys
import pandas as pd
from importlib import reload

from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
import seaborn as sns

from scipy.stats import spearmanr

import geopandas as gpd

import warnings

from mosaiks.utils.imports import *

# Key prediction functions are here
from analysis.prediction_utils import (X_matrix_to_demeaned_X,df_to_demeaned_y_vars,
make_train_pred_scatterplot as make_scatterplot, cv_solve, solver_kwargs, get_truth_preds_from_kfold_results,
                             predict_y_from_kfold_dict, generalized_demean)

## Generate ADM2 predictions of HDI over time


### Read in the cleaned data

In [31]:
# Label for the outcome variable; used below as the new name of the "shdi"
# column and as the `values` column when pivoting the HDI table.
task = "Sub-national HDI"

In [32]:
# Load the pickled predictions from preds/hdi_preds_at_adm2.p and drop three
# columns (adm1_mean, adj_factor, adjusted_preds); adj_factor is rebuilt for
# each year later in the notebook.
# NOTE(review): pd.read_pickle can execute arbitrary code on load -- only read
# pickle files produced by a trusted pipeline.
# os.path.join is used because data_dir already ends with "/", so the former
# `data_dir + "/preds/..."` concatenation produced a doubled path separator.
pred_df = pd.read_pickle(os.path.join(data_dir, "preds/hdi_preds_at_adm2.p"))
pred_df = pred_df.drop(columns=["adm1_mean", "adj_factor", "adjusted_preds"])

### Reformat the HDI-over-time data

In [33]:
# Read the SHDI CSV, keep only the HDI value, year and region-code columns,
# and rename the HDI column to the task label.
hdi_csv_path = data_dir + "raw/GDL_HDI/SHDI-SGDI-Total 7.0.csv"
t_df = (
    pd.read_csv(hdi_csv_path, low_memory=False)
    .loc[:, ["shdi", "year", "GDLCODE"]]
    .rename(columns={"shdi": task})
)

In [34]:
# Keep rows from 2012 onward, then reshape to wide form: one row per GDLCODE,
# one column per year, with the task values in the cells.
from_2012 = t_df["year"] >= 2012
t_df = t_df.loc[from_2012].pivot(index="GDLCODE", columns="year", values=task)

In [35]:
# Prefix every year column with "hdi_adm1_" and coerce each column to a
# numeric dtype; values that cannot be parsed become NaN.
t_df = t_df.add_prefix("hdi_adm1_").apply(pd.to_numeric, errors="coerce")

In [36]:
# Preview the first rows of the pivoted table (rows: GDLCODE, columns: one per year).
t_df.head()

year,hdi_adm1_2012,hdi_adm1_2013,hdi_adm1_2014,hdi_adm1_2015,hdi_adm1_2016,hdi_adm1_2017,hdi_adm1_2018,hdi_adm1_2019,hdi_adm1_2020,hdi_adm1_2021
GDLCODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AFGr101,0.548,0.552,0.553,0.548,0.551,0.553,0.555,0.561,0.556,0.55
AFGr102,0.48,0.483,0.483,0.477,0.479,0.479,0.48,0.484,0.479,0.472
AFGr103,0.468,0.469,0.466,0.459,0.461,0.463,0.464,0.469,0.464,0.459
AFGr104,0.466,0.48,0.492,0.497,0.5,0.501,0.502,0.507,0.502,0.497
AFGr105,0.448,0.451,0.451,0.445,0.448,0.449,0.449,0.454,0.449,0.444


### Centering predictions

In [37]:
# Left-join the yearly hdi_adm1_* columns onto the predictions, matching each
# row's GDL_ADM1 value against the index of t_df.
pred_df = pd.merge(
    pred_df,
    t_df,
    how="left",
    left_on="GDL_ADM1",
    right_index=True,
)

In [38]:
# Years 2012 through 2021, as strings, matching the hdi_adm1_<year> column suffixes.
years = np.arange(2012, 2022).astype(str)

for year in years:
    print(year)
    adm1_col = f"hdi_adm1_{year}"
    out_col = f"predicted_adm2_HDI_{year}"

    # Offset between this year's ADM1 value and the mean of the ADM2 predictions.
    pred_df["adj_factor"] = pred_df[adm1_col] - pred_df["mean_of_pred_adm2_obs"]

    # Shift the predicted deviations by that offset and bound the result to [0, 1].
    shifted = pred_df["predicted_dev_from_adm0"] + pred_df["adj_factor"]
    pred_df[out_col] = shifted.clip(lower=0, upper=1)

# The merged hdi_adm1_* columns are no longer needed once every year is computed.
pred_df = pred_df.drop(columns=t_df.columns.tolist())

2012
2013
2014
2015
2016
2017
2018
2019
2020
2021


In [39]:
# Retain only the per-year prediction columns, in the order given by `years`.
prediction_cols = [f"predicted_adm2_HDI_{year}" for year in years]
pred_df = pred_df.loc[:, prediction_cols]

In [40]:
# Column-wise mean of each year's prediction column.
pred_df.mean()

predicted_adm2_HDI_2012    0.784705
predicted_adm2_HDI_2013    0.792151
predicted_adm2_HDI_2014    0.795631
predicted_adm2_HDI_2015    0.798203
predicted_adm2_HDI_2016    0.801281
predicted_adm2_HDI_2017    0.804627
predicted_adm2_HDI_2018    0.807489
predicted_adm2_HDI_2019    0.811507
predicted_adm2_HDI_2020    0.803456
predicted_adm2_HDI_2021    0.803869
dtype: float64

In [41]:
# Write the per-year prediction columns (plus the index) to CSV. The output
# directory is created first so the export does not fail when
# preds/time_series/ does not exist yet.
out_path = data_dir + "preds/time_series/hdi_adm2_predictions_2012-2021.csv"
os.makedirs(os.path.dirname(out_path), exist_ok=True)
pred_df.to_csv(out_path)