In [None]:
%load_ext pyinstrument

from pathlib import Path

import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import train_test_split, RepeatedKFold
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

from custom.mosaiks_data import load_mosaiks_data
from custom.shrug_data import lengthen_shapefile_ID_names
from custom.utils import load_gdf
from custom.evaluation import show_results

## Load MOSAIKS features

In [None]:
# Load the MOSAIKS feature table through the project loader (custom.mosaiks_data)
# and preview the first rows.
# NOTE(review): presumably a GeoDataFrame of point features, since `.sjoin` is
# called on it later in the notebook — confirm in load_mosaiks_data.
mosaiks_features_gdf = load_mosaiks_data("INDIA_SHRUG_Mosaiks_features.csv")
mosaiks_features_gdf.head()

In [None]:
# (rows, columns) of the loaded MOSAIKS feature table
mosaiks_features_gdf.shape

## Load preprocessed SHRUG keys with shapes

In [None]:
# Read the preprocessed SHRUG key shapefile and run it through the project's
# ID-name helper in one step, then preview the result.
# NOTE(review): lengthen_shapefile_ID_names presumably restores the full ID column
# names shortened by the shapefile format — confirm in custom.shrug_data.
shrug_key_geoms = lengthen_shapefile_ID_names(
    load_gdf(
        "01_preprocessed/SHRUG/shrug_pc11r_key_with_shapes",
        "shrug_pc11r_key_with_shapes.shp",
    )
)
shrug_key_geoms.head()

## Use SHRUG shapes to add `shrid`s to MOSAIKS features

In [None]:
# Add the shrid (and the other SHRUG key columns) to each MOSAIKS point by spatially
# joining the points against the shrid polygons.
# The polygon geometry is duplicated into a regular column first so that it is still
# available on the joined frame (it is listed in `geo_cols` further down).
# NOTE(review): presumably needed because the join result keeps only the left-hand
# (point) geometry — confirm against the geopandas sjoin documentation.
# NOTE(review): no `predicate` is passed, so the join uses geopandas' default
# predicate — confirm that this matches the intended "point within shrid" test.
shrug_key_geoms['shrug_key_geoms'] = shrug_key_geoms['geometry'].copy()
mosaiks_features_df = mosaiks_features_gdf.sjoin(shrug_key_geoms).drop(columns=["index_right"])
mosaiks_features_df.head()

## Import SHRUG SECC (target)

In [None]:
# The raw SECC table lives under ../data relative to the current working directory.
file_path = Path.cwd().parent.joinpath(
    "data",
    "00_raw",
    "SHRUG",
    "shrug-v1.5.samosa-secc-csv",
    "shrug_secc.csv",
)
shrug_secc = pd.read_csv(file_path)
shrug_secc.head()

In [None]:
# (rows, columns) of the raw SECC table
shrug_secc.shape

In [None]:
# Target column to predict
y_name = "secc_pov_rate_rural"
# Keep only the join key and the target; rows missing either value are discarded.
shrug_y = shrug_secc[["shrid", y_name]].dropna()
shrug_y

## Match target to features using `shrid`s

In [None]:
# add MOSAIKS features to the SECC data
df = pd.merge(shrug_y, mosaiks_features_df, on="shrid")
df.head()

### Select X and y

In [None]:
# Identifier / location columns that are not model features. They are dropped from X
# and kept aside for mapping and for the aggregated evaluation.
geo_cols = (
    # join key, point coordinates and point geometry
    ["shrid", "Lat", "Lon", "geometry"]
    # administrative-level IDs (used as group-by keys in the aggregated evaluation)
    + ["pc11_state_id", "pc11_district_id", "pc11_subdistrict_id", "pc11_village_id"]
    # remaining columns brought in from the SHRUG key shapes
    + ["tv_name", "shrug_key_geoms"]
)

In [None]:
# Feature matrix: everything except the identifier/location columns and the target.
X = df.drop(columns=geo_cols + [y_name])

In [None]:
# Target vector, indexed like `df` (and therefore like X)
y = df[y_name]

## Model A
Datapoints = latlong points

### Train

In [None]:
# Random 80/20 split over individual rows (lat/lon points), seeded for reproducibility.
# NOTE(review): the target is attached per shrid (merge on "shrid"), so points that
# share a shrid carry the same y and can land on both sides of this split. Consider a
# group-aware split by shrid if that leakage is not intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=1,)

In [None]:
# Cross-validation scheme used by RidgeCV to choose alpha: a single 2-fold split, seeded.
cv = RepeatedKFold(n_splits=2, n_repeats=1, random_state=1)
# Ridge regression that picks its regularization strength from `alphas` using `cv`.
model = RidgeCV(alphas=[1, 10], cv=cv) # candidate alpha (lambda) values to try; 0.001 and 0.01 are currently left out

# fit on the training split only
model.fit(X_train, y_train)
# report the alpha chosen by cross-validation
print('alpha: %f' % model.alpha_)

In [None]:
# # Alternative to the RidgeCV model above: gradient-boosted trees (uncomment to use)
# model = XGBRegressor(n_estimators=1000, max_depth=7, eta=0.1, subsample=0.7, colsample_bytree=0.8, n_jobs=-1)
# # fit model
# model.fit(X_train, y_train)

### Make predictions and evaluate performance

Next, we use the trained model to make predictions on the test set.

In [None]:
# Predict the target for the held-out test rows
y_pred = model.predict(X_test)

In [None]:
# Point-level evaluation: observed vs. predicted, via the project helper in custom.evaluation
show_results(y_test, y_pred)

### Spatial visualization

In [None]:
# Pull the identifier/location columns for the test rows only (same row order as
# y_test), then attach the true values (aligned on index) and the model predictions
# (positional, one per test row).
y_gdf = df.loc[y_test.index.tolist(), geo_cols].assign(
    observed=y_test,
    predicted=y_pred,
)
# y_gdf.head()

In [None]:
# Side-by-side scatter maps of observed vs. predicted values for the test points.
# Uses `plt` (matplotlib.pyplot) from the notebook's import cell.
f, axes = plt.subplots(1, 2, sharey=True, figsize=(10, 5))

# One colour scale for both panels so the two maps are directly comparable.
vmax = max(y_test.max(), y_pred.max())
vmin = min(y_test.min(), y_pred.min())

# Both panels share every setting; only the colour column and the title differ.
for ax, column in zip(axes, ["observed", "predicted"]):
    y_gdf.plot(
        kind="scatter",
        x="Lon",
        y="Lat",
        c=column,
        colorbar=False,
        alpha=0.5,
        vmin=vmin,
        vmax=vmax,
        ax=ax,
    )
    # "observed" -> "Observed", "predicted" -> "Predicted"
    ax.set_title(column.capitalize())

plt.tight_layout()

### Evaluate predictions at the aggregated levels

In [None]:
# Number of point-level test rows, as a baseline for the aggregated counts below
print("Total test datapoints: ", y_gdf.shape[0])

#### shrid

In [None]:
# Average observed and predicted values over all test points that share a shrid,
# giving one row per shrid (indexed by shrid).
shrid_ys = y_gdf.groupby(["shrid"])[["observed", "predicted"]].mean()
print("Datapoints with unique shrids: ", shrid_ys.shape[0])

# evaluate at the shrid level with the project helper from custom.evaluation
show_results(shrid_ys["observed"], shrid_ys["predicted"])

#### village

In [None]:
# A village is identified by the full chain of administrative IDs.
village_keys = [
    "pc11_state_id",
    "pc11_district_id",
    "pc11_subdistrict_id",
    "pc11_village_id",
]
# Mean observed / predicted value per village; the keys stay as regular columns.
village_ys = (
    y_gdf.groupby(village_keys, as_index=False)[["observed", "predicted"]]
    .mean()
)
print("Datapoints with unique villages: ", village_ys.shape[0])

show_results(village_ys["observed"], village_ys["predicted"])

#### subdistrict

In [None]:
# A subdistrict is identified by state, district and subdistrict IDs together.
subdistrict_keys = ["pc11_state_id", "pc11_district_id", "pc11_subdistrict_id"]
# Mean observed / predicted value per subdistrict; the keys stay as regular columns.
subdistrict_ys = (
    y_gdf.groupby(subdistrict_keys, as_index=False)[["observed", "predicted"]]
    .mean()
)
print("Datapoints with unique subdistricts: ", subdistrict_ys.shape[0])

show_results(subdistrict_ys["observed"], subdistrict_ys["predicted"])

In [None]:
# Same subdistrict grouping as the previous cell, but aggregating with max instead of
# mean: the largest observed and the largest predicted value per subdistrict, each
# taken independently (they need not come from the same test point).
# NOTE(review): this reassigns `subdistrict_ys`, replacing the mean-aggregated frame
# from the previous cell, and prints the same label — consider a distinct name and
# label if both results are needed later.
subdistrict_ys = y_gdf.groupby(
    [
        "pc11_state_id",
        "pc11_district_id",
        "pc11_subdistrict_id"
    ],
    as_index=False,
)[["observed", "predicted"]].max()
print("Datapoints with unique subdistricts: ", subdistrict_ys.shape[0])

show_results(subdistrict_ys["observed"], subdistrict_ys["predicted"])

Map of subdistricts

4 quadrant evaluation stats