In [1]:
import gc

from pathlib import Path

import numpy as np
import pandas as pd
import geopandas as gpd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import GroupShuffleSplit, GroupKFold #train_test_split, RepeatedKFold
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix, precision_score, recall_score

from custom.preprocessing import load_and_merge_data, preserve_geometry, _add_shrid_to_mosaiks, _merge_mosaiks_and_secc
from custom.shrug_data import lengthen_shapefile_ID_names, load_shrug_shapefiles, load_shrug_secc
from custom.evaluation import show_results, plot_prediction_maps
from custom.mosaiks_data import load_mosaiks_data
from custom.utils import load_gdf, latlon_df_to_gdf

In [2]:
# Root of the project data directory (absolute path).
DATA_ROOT = "/home/jovyan/ds_nudge_up/data/"
# Folder holding the pre-processed MOSAIKS feature files. The trailing "/" of
# DATA_ROOT is stripped before joining so the result never contains "//",
# whether or not DATA_ROOT is written with a trailing separator.
mosaiks_path = DATA_ROOT.rstrip("/") + "/01_preprocessed/mosaiks_features/"

## Load data

MOSAIKS features (with lat/lon coordinates)

In [3]:
# Feature set to use. `data_label` is appended to output file names further
# down; `filename` is the matching parquet file inside `mosaiks_path`.

# Option 1 (disabled):
# data_label = "_1km_v1024_L8"
# filename = "mosaiks_2013_L8_0.005b_all_points.parquet.gzip"

# Option 2 (active):
data_label = "_3km_v1024_L8"
filename = "mosaiks_2013_L8_0.015b_all_points.parquet.gzip"

In [4]:
# Read the selected MOSAIKS feature table from parquet.
mosaiks_features = pd.read_parquet(mosaiks_path+filename)
# Convert to a GeoDataFrame through the project helper
# (presumably builds point geometries from the lat/lon columns -- TODO confirm).
mosaiks_features_gdf = latlon_df_to_gdf(mosaiks_features)
# Show (rows, columns) as a quick sanity check.
mosaiks_features_gdf.shape

(94392, 1027)

In [5]:
# Drop every row that has at least one missing value.
# NOTE: the name is rebound, so the unfiltered frame is no longer reachable
# from this variable after the cell has run.
mosaiks_features_gdf = mosaiks_features_gdf.dropna()
# Show the shape again to see how many rows were removed.
mosaiks_features_gdf.shape

(86382, 1027)

SHRUG geometries

In [None]:
# Load the SHRUG key shapefile through the project loader.
# NOTE(review): the two arguments look like a folder and a file name that the
# loader resolves itself -- confirm against custom.utils.load_gdf.
shrug_key_geoms = load_gdf(
    "01_preprocessed/SHRUG/shrug_pc11r_key_with_shapes",
    "shrug_pc11r_key_with_shapes.shp",
)
# Presumably expands the abbreviated shapefile ID column names -- TODO confirm.
shrug_key_geoms = lengthen_shapefile_ID_names(shrug_key_geoms)
# Presumably stores the village-level geometry under its own column so it
# survives later joins -- TODO confirm against custom.preprocessing.
shrug_key_geoms = preserve_geometry(shrug_key_geoms, level="village")

SHRUG SECC

In [None]:
# Load the SHRUG SECC table via the project loader; it takes no arguments,
# so the source location is decided inside custom.shrug_data.
shrug_secc = load_shrug_secc()

Merge MOSAIKS features and the SECC target via village shapes

In [None]:
# Attach SHRUG identifiers to the MOSAIKS points using the village shapes.
# NOTE(review): this calls a private (underscore-prefixed) helper; presumably a
# spatial join that adds `shrid` -- confirm against custom.preprocessing.
# The input name is rebound, so re-running this cell alone feeds the already
# joined frame back into the helper.
mosaiks_features_gdf = _add_shrid_to_mosaiks(mosaiks_features_gdf, shrug_key_geoms)

In [None]:
# Combine the MOSAIKS features with the SECC table into the working frame `gdf`.
# NOTE(review): private helper; the join key is presumably `shrid` -- TODO confirm.
gdf = _merge_mosaiks_and_secc(mosaiks_features_gdf, shrug_secc)

In [None]:
## old
# Previous single-call loader, kept for reference only
# (presumably superseded by the step-by-step cells above -- TODO confirm, then delete).
# gdf = load_and_merge_data()

In [None]:
# The merged frame `gdf` is all that is needed from here on: drop the large
# intermediates and run a garbage-collection pass to release their memory.
del shrug_key_geoms, shrug_secc, mosaiks_features_gdf
gc.collect()

## Pre-process data

In [None]:
# Select target: name of the column in `gdf` that the model is trained to predict.
y_name = "secc_pov_rate_rural"

In [None]:
# Drop rows with an unavailable target; missing values in other columns are
# not considered here.
# `subset` is given as a list: a list of labels is accepted by every pandas
# version, whereas a bare string label is only supported by newer releases.
gdf_clean = gdf.dropna(subset=[y_name])

In [None]:
# Build the list of non-feature columns, so that dropping them (plus the
# target) leaves only the model features.

# Identifier and geometry columns.
shrug_key_cols = [
    "pc11_state_id",
    "pc11_district_id",
    "pc11_subdistrict_id",
    "pc11_village_id",
    "tv_name",
    "shrid",
    "pc11_v_uid",
    "geometry",
    "geometry_village",
]
# Location columns = point coordinates followed by the key columns above.
geo_cols = ["lat", "lon", *shrug_key_cols]

# SECC columns (candidate targets and related variables).
shrug_secc_cols = [
    "shrid",
    "secc_inc_cultiv_share",
    "nco2d_cultiv_share",
    "secc_cons_pc_rural",
    "secc_cons_pc_urban",
    "secc_pov_rate_rural",
    "secc_pov_rate_urban",
    "secc_pov_rate_tend_rural",
    "secc_pov_rate_tend_urban",
    "num_members_mean_rural",
    "num_members_mean_urban",
]
# Take the chosen target out of the list; list.remove raises ValueError when
# y_name is not one of the entries above.
shrug_secc_cols.remove(y_name)

# "shrid" is in both lists and therefore appears twice in the result.
cols_to_drop = [*shrug_secc_cols, *geo_cols]

In [None]:
# Features: every column that is neither an ID/geometry/SECC column nor the target.
X = gdf_clean.drop(columns=cols_to_drop + [y_name])
# Target: the selected SECC column.
y = gdf_clean[y_name]

Grouped train-test split: making sure that datapoints from the same village don't get split across train and test datasets (to avoid leakage).

The grouping variable can be changed to a coarser level if needed (e.g. unique subdistrict IDs).

In [None]:
# Grouped hold-out split: 20% of the groups (not of the rows) go to the test
# set, with a fixed seed so the split is reproducible.
grouping_var = "pc11_v_uid"
splitter = GroupShuffleSplit(test_size=0.20, n_splits=1, random_state=0)
split = splitter.split(gdf_clean, groups=gdf_clean[grouping_var])
# n_splits=1, so the generator yields exactly one (train, test) pair of
# positional index arrays.
train_index, test_index = next(iter(split))

# Apply the positional indices to features and target.
X_train, y_train = X.iloc[train_index], y.iloc[train_index]
X_test, y_test = X.iloc[test_index], y.iloc[test_index]

## Model A
Each datapoint is a single lat/lon point.

### Train model

In [None]:
# Ridge regression with scikit-learn's default hyper-parameters.
model = Ridge()
# Fit on the training split. Weighting samples by the target is left disabled
# (see the trailing comment).
model.fit(X_train, y_train) #, sample_weight=y_train

- target distribution
- optimise model

In [None]:
# # or with parameter search (grouped k-fold)
# NOTE(review): if this is re-enabled, build the folds from the training split
# (X_train, y_train and the matching rows of gdf_clean[grouping_var]). The
# folds below are built from the full X, so their positional indices do not
# line up with the X_train passed to fit().

# group_kfold = GroupKFold(n_splits=5)
# cv_grouped = group_kfold.split(X, y, gdf_clean[grouping_var])

# model = RidgeCV(alphas=[0.01, 1, 10], cv=cv_grouped)
# model.fit(X_train, y_train, sample_weight=y_train)

# # summarize chosen configuration
# print('alpha: %f' % model.alpha_)

### Test

Next, we use the trained model to make predictions in the test set.

In [None]:
# Predict the target for the held-out test rows.
y_pred = model.predict(X_test)

In [None]:
# Report observed vs. predicted values on the test set via the project helper.
# `file_name` carries the feature-set label (presumably used to save the
# figure -- TODO confirm against custom.evaluation.show_results).
show_results(y_test, y_pred, file_name="scatter"+data_label)

In [None]:
# Linearly rescale the predictions so that the interval [0.1, 0.6] maps onto
# [0, 1]; values outside that interval end up outside [0, 1] (no clipping).
# The bounds get their own names: binding them to `min` / `max` would shadow
# the Python builtins for every cell executed afterwards in this kernel.
pred_lower, pred_upper = 0.1, 0.6
y_pred_scaled = (y_pred - pred_lower) / (pred_upper - pred_lower)
show_results(y_test, y_pred_scaled, file_name="scatter_scaled"+data_label, line=False, title="Scaled")

Maps

In [None]:
# select dataframe with only test target and location data
# .copy() makes this an independent frame, so the column assignments below
# write to it unambiguously (no SettingWithCopyWarning, and gdf_clean is
# guaranteed to stay untouched).
gdf_clean_test_y = gdf_clean.iloc[test_index][geo_cols + [y_name]].copy()

# add predicted values; the rows are in the same order as X_test because both
# were selected with the same positional test_index
gdf_clean_test_y["predicted"] = y_pred
gdf_clean_test_y["predicted_scaled"] = y_pred_scaled

In [None]:
# Map the observed target next to the prediction, one point per test row.
# NOTE(review): the positional arguments False, 0, 1 are opaque from here --
# confirm their meaning against custom.evaluation.plot_prediction_maps.
plot_prediction_maps(gdf_clean_test_y, y_name, "predicted", False, 0, 1, "maps_points"+data_label)

### Aggregate to `Subdistricts`

In [None]:
# Number of point-level test rows, before any aggregation.
print("Total test datapoints: ", gdf_clean_test_y.shape[0])

In [None]:
# Load the subdistrict-level shapes via the project helper.
sd_shapes = load_shrug_shapefiles(level="subdistrict")
# Keep the polygons under a second, explicit column name; the spatial join
# that follows reads this column to swap the active geometry.
sd_shapes["geometry_subdistrict"] = sd_shapes["geometry"]

In [None]:
# Spatially join each test point to the subdistrict shapes (geopandas
# defaults for join type and predicate).
gdf_clean_test_y_subdistricts = gdf_clean_test_y.sjoin(sd_shapes)

# change default geometry to subdistricts (for plotting)
# Order matters: the point geometry is saved first, then overwritten by the polygon.
gdf_clean_test_y_subdistricts["geometry_point"] = gdf_clean_test_y_subdistricts["geometry"]
gdf_clean_test_y_subdistricts["geometry"] = gdf_clean_test_y_subdistricts["geometry_subdistrict"]

In [None]:
# Average the observed target and the prediction over all test points that
# fall in the same subdistrict (identified by state + district + subdistrict ID).
subdistrict_keys = ["pc11_state_id", "pc11_district_id", "pc11_subdistrict_id"]
subdistrict_ys = (
    gdf_clean_test_y_subdistricts
    .groupby(subdistrict_keys, as_index=False)[[y_name, "predicted"]]
    .mean()
)
print("Datapoints with unique subdistricts: ", len(subdistrict_ys))

# Same evaluation as for the points, now on the subdistrict means.
show_results(subdistrict_ys[y_name], subdistrict_ys["predicted"], file_name="scatter_subdistricts"+data_label)

In [None]:
# Same map as before, drawn on the frame whose active geometry is now the
# subdistrict polygon.
# NOTE(review): this frame still has one row per test point, not one per
# subdistrict -- confirm how plot_prediction_maps handles overlapping polygons.
plot_prediction_maps(gdf_clean_test_y_subdistricts, y_name, "predicted", False, 0, 1, "maps_sd"+data_label)

#### Top 20% poorest subdistricts

In [None]:
# Flag the top quintile of the observed and of the predicted values:
# pd.qcut with q=5 and labels=False yields bin codes 0..4, so code 4 is the
# highest fifth. Multiplying the boolean mask by 1 gives a 0/1 integer flag.
# NOTE(review): quintiles are computed over the point-level rows of this frame,
# not over the subdistrict means in `subdistrict_ys` -- confirm this is intended.
observed_quintile = pd.qcut(gdf_clean_test_y_subdistricts[y_name], q=5, labels=False)
gdf_clean_test_y_subdistricts[y_name+"top_20_perc"] = (observed_quintile == 4) * 1

predicted_quintile = pd.qcut(gdf_clean_test_y_subdistricts["predicted"], q=5, labels=False)
gdf_clean_test_y_subdistricts["predicted_top_20_perc"] = (predicted_quintile == 4) * 1

In [None]:
# Map the observed vs. predicted top-quintile flags (0/1 columns created in the previous cell).
plot_prediction_maps(gdf_clean_test_y_subdistricts, y_name+"top_20_perc", "predicted_top_20_perc", False, 0, 1, "maps_sd_top20"+data_label)

## Under construction: 4 quadrant metrics

In [None]:
# 80th percentile of the predictions = lower bound of the predicted top quintile.
pred_top_quintile = np.percentile(y_pred, 80)
# One boolean per test row: is the prediction strictly above that bound?
y_pred_is_top_quintile = list(y_pred > pred_top_quintile)

In [None]:
# 80th percentile of the observed target = lower bound of the observed top quintile.
test_top_quintile = np.percentile(y_test, 80)
# One boolean per test row: is the observed value strictly above that bound?
y_test_is_top_quintile = list(y_test > test_top_quintile)

In [None]:
# Predicted (x) vs. observed (y) values for the test rows; low alpha so dense
# regions stay readable.
sns.scatterplot(x=y_pred, y=y_test, alpha=0.2)
plt.xlim(-0.1, 1.1)
# The two threshold lines split the plane into the four quadrants
# (vertical = predicted 80th percentile, horizontal = observed 80th percentile).
plt.axvline(pred_top_quintile, c="darkred")
plt.axhline(test_top_quintile, c="darkred")

In [None]:
# Element-wise negation of the two top-quintile flag lists.
y_pred_is_NOT_top_quintile = [not flag for flag in y_pred_is_top_quintile]
y_test_is_NOT_top_quintile = [not flag for flag in y_test_is_top_quintile]

In [None]:
# Count the four confusion-matrix quadrants ("positive" = top quintile).
# The flags must be combined element-wise: Python's `and` between two lists
# does not do that -- it returns the second list whenever the first is
# non-empty, so sum(a and b) is just sum(b).
TP = int(np.logical_and(y_pred_is_top_quintile, y_test_is_top_quintile).sum())          # predicted top, observed top
FP = int(np.logical_and(y_pred_is_top_quintile, y_test_is_NOT_top_quintile).sum())      # predicted top, observed not top
TN = int(np.logical_and(y_pred_is_NOT_top_quintile, y_test_is_NOT_top_quintile).sum())  # predicted not top, observed not top
FN = int(np.logical_and(y_pred_is_NOT_top_quintile, y_test_is_top_quintile).sum())      # predicted not top, observed top

In [None]:
# Display the TP count computed in the previous cell.
TP

In [None]:
# Display the FP count computed above.
FP

In [None]:
# Display the TN count computed above.
TN

In [None]:
# Display the FN count computed above.
FN

In [None]:
# Confusion matrix of the top-quintile flags via scikit-learn
# (rows = observed class, columns = predicted class).
cm_array = confusion_matrix(y_true=y_test_is_top_quintile, y_pred=y_pred_is_top_quintile)
# Label both axes 0 (not top quintile) / 1 (top quintile) for the heatmap.
df_cm = pd.DataFrame(cm_array, index=[0, 1], columns=[0, 1])

# Precision and recall of the "top quintile" class, rounded for the plot title.
precision = round(
    precision_score(y_true=y_test_is_top_quintile, y_pred=y_pred_is_top_quintile), 3
)
recall = round(
    recall_score(y_true=y_test_is_top_quintile, y_pred=y_pred_is_top_quintile), 3
)

In [None]:
# Sanity check: the matrix entries add up to the number of rows that were classified.
cm_array.sum()

In [None]:
# Small annotated heatmap of the confusion matrix, titled with precision and recall.
plt.figure(figsize=(3,3))
sns.heatmap(df_cm, annot=True, fmt='.5g', cbar=False)
plt.title(f"Precision: {precision}\nRecall: {recall}")
# Save into the model-output folder. The trailing "/" of DATA_ROOT is stripped
# before joining so the path never contains "//".
plt.savefig(DATA_ROOT.rstrip("/") + "/04_modeloutput/" + "confusion_matrix" + data_label + ".png")