In [None]:
%load_ext pyinstrument

In [None]:
# # reload evaluation module
# import importlib
# import custom.evaluation
# importlib.reload(custom.evaluation)

In [None]:
import importlib

from pathlib import Path

import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import GroupShuffleSplit, GroupKFold #train_test_split, RepeatedKFold
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.ensemble import RandomForestRegressor

from custom.preprocessing import load_and_merge_data, preserve_geometry, _add_shrid_to_mosaiks, _merge_mosaiks_and_secc
from custom.shrug_data import lengthen_shapefile_ID_names, load_shrug_shapefiles, load_shrug_secc
from custom.evaluation import show_results, plot_prediction_maps
from custom.mosaiks_data import load_mosaiks_data
from custom.utils import load_gdf

## Load data

In [None]:
# %%pyinstrument
# (uncomment the line above to profile this cell; a cell magic has to stay on the first line of the cell)
# Load the merged dataset through the project helper. The cleaning, feature
# selection and modelling cells below all derive from `gdf`.
# NOTE(review): presumably MOSAIKS features joined to SHRUG/SECC data, judging
# by the imports — confirm in custom.preprocessing.
gdf = load_and_merge_data()
# takes ~ 2.5mins on EC2 t2.2xlarge

## Pre-process data

In [None]:
# Select target
# Name of the column to predict. Below it is used to drop rows with a missing
# target, to keep the column out of the feature matrix, and it is passed to the
# plotting helper as the "actual" column.
y_name = "secc_pov_rate_rural"

In [None]:
# Drop rows whose target is missing; `gdf` itself is left untouched so this
# cell can be re-run safely.
# `subset` is passed as a list: older pandas releases only handle a list-like
# here and do not treat a bare string as a single column label.
gdf_clean = gdf.dropna(subset=[y_name])

In [None]:
# select list of columns to drop from the data so only features and target are left

# identifier and geometry columns
shrug_key_cols = [
    "pc11_state_id",
    "pc11_district_id",
    "pc11_subdistrict_id",
    "pc11_village_id",
    "tv_name",
    "shrid",
    "pc11_v_uid",
    "geometry",
    "geometry_village"
]
# location columns; also reused later to build the map dataframe
geo_cols = ["Lat", "Lon"] + shrug_key_cols

shrug_secc_cols = [
    "shrid",
    "secc_inc_cultiv_share",
    "nco2d_cultiv_share",
    "secc_cons_pc_rural",
    "secc_cons_pc_urban",
    "secc_pov_rate_rural",
    "secc_pov_rate_urban",
    "secc_pov_rate_tend_rural",
    "secc_pov_rate_tend_urban",
    "num_members_mean_rural",
    "num_members_mean_urban",
]
# Keep every listed column except the chosen target. Filtering (rather than
# list.remove) does not raise when `y_name` is set to a column that is not
# in the list above.
shrug_secc_cols = [col for col in shrug_secc_cols if col != y_name]

# "shrid" is in both lists; dict.fromkeys drops the duplicate while keeping order
cols_to_drop = list(dict.fromkeys(shrug_secc_cols + geo_cols))

In [None]:
# Features: whatever is left once the non-feature columns and the target are removed.
non_feature_cols = [*cols_to_drop, y_name]
X = gdf_clean.drop(columns=non_feature_cols)

# Target: the single column named by `y_name`.
y = gdf_clean[y_name]

Grouped train-test split: making sure that datapoints from the same village don't get split across train and test datasets (to avoid leakage).

The grouping variable can be changed to a coarser level if needed (e.g. unique subdistrict IDs).

In [None]:
# Group-aware hold-out split: all rows sharing a `grouping_var` value end up on
# the same side, with 20% of the groups held out for testing (fixed seed).
grouping_var = "pc11_v_uid"
splitter = GroupShuffleSplit(n_splits=1, test_size=0.20, random_state=0)
split = splitter.split(gdf_clean, groups=gdf_clean[grouping_var])

# n_splits=1, so the first pair of index arrays is the only one
train_index, test_index = next(split)

# the indices are positional, hence .iloc
X_train, y_train = X.iloc[train_index], y.iloc[train_index]
X_test, y_test = X.iloc[test_index], y.iloc[test_index]

## Model A
Each datapoint is a single lat/long point.

### Train model

In [None]:
# Baseline: ridge regression with scikit-learn's default settings, fitted on
# the training split only. The fit is unweighted; the trailing comment keeps an
# optional sample_weight argument for reference.
model = Ridge()
model.fit(X_train, y_train) #, sample_weight=y_train

TODO:
- check the target distribution
- optimise the model

In [None]:
# # or with parameter search (grouped k-fold)

# NOTE(review): if this is re-enabled, the folds below are generated from the
# full X / gdf_clean while the model is fitted on X_train, so the positional
# fold indices would not line up with the training rows. Build the folds from
# X_train, y_train and the matching rows of gdf_clean[grouping_var] instead.

# group_kfold = GroupKFold(n_splits=5)
# cv_grouped = group_kfold.split(X, y, gdf_clean[grouping_var])

# model = RidgeCV(alphas=[0.01, 1, 10], cv=cv_grouped)
# model.fit(X_train, y_train, sample_weight=y_train)

# # summarize chosen configuration
# print('alpha: %f' % model.alpha_)

### Test

Next, we use the trained model to make predictions in the test set.

In [None]:
# predictions for the held-out rows, in the same row order as X_test / y_test
y_pred = model.predict(X_test)

In [None]:
# Compare held-out targets with the raw predictions via the project helper.
# NOTE(review): presumably reports metrics / a plot and uses file_name for the
# saved output — confirm in custom.evaluation.
show_results(y_test, y_pred, file_name="results")

In [None]:
# Linearly rescale the predictions so that 0.15 maps to 0 and 0.7 maps to 1.
# NOTE(review): both bounds are hard-coded; presumably the observed range of
# the raw predictions — confirm, or derive them from the data.
scale_low, scale_high = 0.15, 0.7
y_pred_scaled = (y_pred - scale_low) / (scale_high - scale_low)

show_results(
    y_test,
    y_pred_scaled,
    file_name="results_scaled",
    line=False,
    title="Scaled",
)

Maps

In [None]:
# select dataframe with only test target and location data
# .copy() makes this an independent frame, so adding columns below does not
# trigger pandas' SettingWithCopyWarning on a chained selection
gdf_clean_test_y = gdf_clean.iloc[test_index][geo_cols + [y_name]].copy()

# add predicted values (arrays are in the same row order as test_index)
gdf_clean_test_y["predicted"] = y_pred
gdf_clean_test_y["predicted_scaled"] = y_pred_scaled

In [None]:
# Map the target column against the "predicted" column for the individual test points.
# NOTE(review): the positional arguments False, 0, 1 are not self-explanatory —
# presumably a flag plus colour-scale limits; confirm against
# plot_prediction_maps and consider passing them by keyword.
plot_prediction_maps(gdf_clean_test_y, y_name, "predicted", False, 0, 1, "points_prediction_map")

### Aggregate to `Subdistricts`

In [None]:
# number of held-out rows before any aggregation
n_test_points = len(gdf_clean_test_y)
print("Total test datapoints: ", n_test_points)

In [None]:
# Subdistrict-level shapes from the project loader.
sd_shapes = load_shrug_shapefiles(level="subdistrict")
# Duplicate the shape geometry under a second name: the spatial join below
# keeps the left frame's geometry, and the later cell reads
# "geometry_subdistrict" to switch the joined frame over to subdistrict shapes.
sd_shapes["geometry_subdistrict"] = sd_shapes["geometry"]

In [None]:
# Attach to every test point the columns of the subdistrict shape it intersects.
# how / predicate are spelled out for readability; they equal the geopandas defaults.
gdf_clean_test_y_subdistricts = gdf_clean_test_y.sjoin(
    sd_shapes,
    how="inner",
    predicate="intersects",
)

# change default geometry to subdistricts (for plotting):
# keep the point geometry under its own name, then overwrite "geometry"
point_geometry = gdf_clean_test_y_subdistricts["geometry"]
gdf_clean_test_y_subdistricts["geometry_point"] = point_geometry
gdf_clean_test_y_subdistricts["geometry"] = gdf_clean_test_y_subdistricts["geometry_subdistrict"]

In [None]:
# Average the actual and predicted values over all test points that share the
# same state / district / subdistrict IDs.
subdistrict_keys = ["pc11_state_id", "pc11_district_id", "pc11_subdistrict_id"]
value_cols = [y_name, "predicted"]
subdistrict_ys = (
    gdf_clean_test_y_subdistricts
    .groupby(subdistrict_keys, as_index=False)[value_cols]
    .mean()
)
print("Datapoints with unique subdistricts: ", len(subdistrict_ys))

# evaluate on the aggregated (one row per subdistrict) values
show_results(
    subdistrict_ys[y_name],
    subdistrict_ys["predicted"],
    file_name="results_subdistricts",
)

In [None]:
# Same map call as for the points, now on the joined frame whose "geometry"
# column holds the subdistrict shapes.
# NOTE(review): this frame still has one row per test point (not the
# aggregated subdistrict_ys means) — confirm that is the intended input.
plot_prediction_maps(gdf_clean_test_y_subdistricts, y_name, "predicted", False, 0, 1, "subdistricts_prediction_map")

#### Binarise poverty levels using quantiles

In [None]:
# binarise target using quantiles
gdf_clean_test_y_subdistricts.loc[:, "target_binarised"] = pd.qcut(gdf_clean_test_y_subdistricts[y_name], q=2, labels=False)

In [None]:
# Flag rows in the top quintile (bin code 4 of 5) of the actual and of the
# predicted values; multiplying the boolean mask by 1 stores it as 0 / 1.
for source_col, flag_col in (
    (y_name, y_name+"top_20_perc"),
    ("predicted", "predicted_top_20_perc"),
):
    quintile = pd.qcut(gdf_clean_test_y_subdistricts[source_col], q=5, labels=False)
    gdf_clean_test_y_subdistricts[flag_col] = (quintile == 4) * 1

In [None]:
# Map the 0/1 top-quintile flags: actual flag column vs. predicted flag column.
# The actual column name is y_name with "top_20_perc" appended directly (no
# separator), matching how the column was created in the previous cell.
plot_prediction_maps(gdf_clean_test_y_subdistricts, y_name+"top_20_perc", "predicted_top_20_perc", False, 0, 1, "subdistricts_prediction_map_top20")

4 quadrant metrics