In [None]:
import geopandas as gpd
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, RepeatedKFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn.linear_model import Ridge, RidgeCV
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

import matplotlib.pyplot as plt
import matplotlib.cm
import seaborn as sns

There are no shapefiles for SHRUG...

"The SHRUG does not include geographic data in the form of polygons or shapefiles because
we have not yet found a sufficiently accurate data source with open sharing privileges. We are
continuing to investigate sources of geographic data and may include shapefiles in a future version
of the SHRUG. Users interested in obtaining geocodes or polygons for SHRUG units are advised
to examine the open village maps offered by NASA-SEDAC at Columbia University. These can
be directly merged to the 2001 Population Census SHRUG keys in shrug_pc01r_key.dta and
shrug_pc01u_key.dta. Our own aggregate data was based on 2011 village polygons which we
believe are slightly more accurate but are not made available with an open data license."

## Load shapefiles (NASA-SEDAC)

In [None]:
# Read the NASA-SEDAC 2001 village-level file for AP and reproject it to
# EPSG:4326 so that all coordinates are expressed as latitude/longitude.
ap_shapefile_path = "../data/NASA/india-india-village-level-geospatial-socio-econ-1991-2001-ap-2001-shp/india-village-census-2001-AP.dbf"
ap_dbf = gpd.read_file(ap_shapefile_path).to_crs(epsg=4326)
ap_dbf.head()

In [None]:
# Quick-look map of the reprojected village geometries
ap_dbf.plot(figsize=(5, 5))

### Create unique ID columns to match SHRUG keys

In [None]:
# NOTE: number of rows that have a TOWN_VILL value but no VILL_CODE
has_town_vill = ap_dbf["TOWN_VILL"].notna()
ap_dbf.loc[has_town_vill, "VILL_CODE"].isna().sum()

In [None]:
# Identifier components, in the order they appear in the combined key
sedac_id_cols = ["SID", "DID", "TID", "VILL_CODE"]

# keep only rows where every identifier component is present
ap_dbf_clean = ap_dbf.dropna(subset=sedac_id_cols).copy()

# strip the zero-padding from each component
for id_col in sedac_id_cols:
    ap_dbf_clean[id_col] = ap_dbf_clean[id_col].str.lstrip("0")

# combined key of the form "SID-DID-TID-VILL_CODE"
ap_dbf_clean["ID"] = ap_dbf_clean["SID"].str.cat(
    [ap_dbf_clean[id_col] for id_col in sedac_id_cols[1:]], sep="-"
)
ap_dbf_clean = ap_dbf_clean.sort_values(by=["ID"])

print("Number of unique IDs: ", ap_dbf_clean["ID"].nunique())

In [None]:
# Map of the rows that kept a complete set of identifiers
ap_dbf_clean.plot(figsize=(5, 5))

## Import SHRUG keys

In [None]:
### RURAL
# Key table linking 2001 census identifiers (pc01_*) to shrids
rural_key_csv = "../data/SHRUG/shrug-v1.5.samosa-keys-csv/shrug_pc01r_key.csv"
shrug_pc01r_key = pd.read_csv(rural_key_csv)
shrug_pc01r_key.head()

### URBAN (currently not loaded)
# shrug_pc01u_key = pd.read_csv(
#     "../data/SHRUG/shrug-v1.5.samosa-keys-csv/shrug_pc01u_key.csv"
# )
# shrug_pc01u_key.head()

### Create ID column to match IDs in NASA-SEDAC

In [None]:
def _census_id_part(part):
    """Render one census identifier column as strings without decimals or zero-padding.

    Numeric columns are cast through int first, so a float column (which is what
    read_csv produces when blanks are present) yields "7" rather than "7.0".
    Any other column is treated as text and has its leading zeros stripped, the
    same normalisation that is applied to the NASA-SEDAC identifiers.
    """
    if pd.api.types.is_numeric_dtype(part):
        return part.astype(int).astype(str)
    return part.astype(str).str.lstrip("0")


shrug_id_cols = ["pc01_state_id", "pc01_district_id", "pc01_subdistrict_id", "pc01_village_id"]

# drop rows that are missing any identifier component
shrug_pc01r_key_clean = shrug_pc01r_key.dropna(subset=shrug_id_cols).copy()

# Build the "state-district-subdistrict-village" key. Every component, including
# the village id, goes through the same normalisation so the key is comparable
# with the one built for the shapefile rows.
shrug_id_parts = [_census_id_part(shrug_pc01r_key_clean[col]) for col in shrug_id_cols]
shrug_pc01r_key_clean["ID"] = shrug_id_parts[0].str.cat(shrug_id_parts[1:], sep="-")

## Match shapes to SHRUG

In [None]:
# Inner join on the combined key: keep only village polygons whose ID also
# appears in the cleaned SHRUG rural key table.
village_shapes = ap_dbf_clean[["geometry", "ID"]]
shrid_geom_df = village_shapes.merge(shrug_pc01r_key_clean, how="inner", on="ID")
shrid_geom_df

Note: multiple villages can be inside the same shrid ID - we can merge the shape of these villages.

In [None]:
# example: all rows that belong to one particular shrid
example_shrid = "11-28-803020"
shrid_geom_df[shrid_geom_df["shrid"].eq(example_shrid)]

In [None]:
# Merge all village polygons that share a shrid into a single geometry.
# The remaining attribute columns are identifier codes, not quantities, so keep
# the first value per shrid instead of summing the codes across villages.
shrid_geom_df = (
    shrid_geom_df
    .dissolve(by="shrid", aggfunc="first")
    .reset_index()
    # The village-level "ID" key has no meaning once villages are merged; drop it
    # explicitly (errors="ignore" keeps the cell re-runnable after it has run once).
    .drop(columns=["ID"], errors="ignore")
)

In [None]:
# Map of the per-shrid geometries
shrid_geom_df.plot(figsize=(5, 5))

## Create list of coords to fetch from MOSAIKS

In [None]:
# Per-geometry bounding boxes (minx, miny, maxx, maxy)
bounds = shrid_geom_df.bounds

# Overall extent of all geometries, rounded to 2 decimal places
lower_corner = bounds[["minx", "miny"]].min()
upper_corner = bounds[["maxx", "maxy"]].max()
min_long, min_lat = lower_corner["minx"].round(2), lower_corner["miny"].round(2)
max_long, max_lat = upper_corner["maxx"].round(2), upper_corner["maxy"].round(2)

print("LAT min:", min_lat, "max:", max_lat)
print("LONG min:", min_long, "max:", max_long)

In [None]:
# Create a regular lat/long grid covering the bounding box
grid_step = 0.05  # grid spacing in degrees

# np.arange excludes its stop value and, with a float step, is unreliable about
# whether the last point is produced. Extending the stop by half a step makes
# sure the far edge of the bounding box is sampled as well.
lat_list = np.arange(min_lat, max_lat + grid_step / 2, grid_step).round(2)
long_list = np.arange(min_long, max_long + grid_step / 2, grid_step).round(2)

# All (lat, long) combinations as an (n, 2) array, latitude varying slowest
# (same ordering as a nested "for lat ... for long ..." loop).
lat_grid, long_grid = np.meshgrid(lat_list, long_list, indexing="ij")
coords_list = np.column_stack([lat_grid.ravel(), long_grid.ravel()])
print("Number of coord pairs in grid:", len(coords_list))

In [None]:
# Turn each (lat, long) pair into a point geometry (x = longitude, y = latitude)
geometry = gpd.points_from_xy(x=coords_list[:, 1], y=coords_list[:, 0])
coords_gdf = gpd.GeoDataFrame(geometry=geometry, crs="EPSG:4326")

# Spatial inner join: keep only the grid points matched to a geometry in
# shrid_geom_df, carrying that geometry's attribute columns along.
selected_coords_gdf = (
    coords_gdf
    .sjoin(shrid_geom_df, how="inner")
    .drop("index_right", axis=1)
    .sort_values(by=["shrid"])
)
selected_coords_gdf.head()

In [None]:
# Colour each selected grid point by the subdistrict id attached to it
subdistrict_labels = selected_coords_gdf["pc01_subdistrict_id"].astype(str)
selected_coords_gdf.plot(column=subdistrict_labels, figsize=(10, 10), markersize=2)

### Export list of coordinates...

...so that the MOSAIKS features for these points can be downloaded through File Query

In [None]:
# Two-column table (Latitude, Longitude) of the selected points, saved without the index
point_geoms = selected_coords_gdf.geometry
selected_coords = pd.DataFrame(
    {
        "Latitude": point_geoms.y,
        "Longitude": point_geoms.x,
    }
)
selected_coords.to_csv("../data/MOSAIKS/coords_request_AP.csv", index=False)

## Load MOSAIKS (features)

Load resulting MOSAIKS features download...

In [None]:
# Feature table returned by the MOSAIKS download
mosaiks_features_csv = "../data/MOSAIKS/Mosaiks_features.csv"
mosaiks_features = pd.read_csv(mosaiks_features_csv)
mosaiks_features

In [None]:
# Attach a point geometry (x = Lon, y = Lat) to every feature row
mosaiks_coords_points = gpd.points_from_xy(
    x=mosaiks_features["Lon"],
    y=mosaiks_features["Lat"],
)
mosaiks_features_gdf = gpd.GeoDataFrame(
    mosaiks_features,
    geometry=mosaiks_coords_points,
    crs="EPSG:4326",
)

mosaiks_features_gdf.plot(figsize=(10, 10), markersize=2)

In [None]:
# Add the shrid (and the other shrid_geom_df columns) to each MOSAIKS row by
# spatially joining the MOSAIKS points to the shrid geometries.
mosaiks_features_df = (
    mosaiks_features_gdf
    .sjoin(shrid_geom_df)
    .drop(columns=["index_right"])
)
mosaiks_features_df

## Import SHRUG SECC (target)

In [None]:
# SECC table from SHRUG; the target variable is taken from here
shrug_secc_csv = "../data/SHRUG/shrug-v1.5.samosa-secc-csv/shrug_secc.csv"
shrug_secc = pd.read_csv(shrug_secc_csv)
shrug_secc

In [None]:
# Keep only the shrid and the rural poverty rate, and discard rows where either is missing
target_cols = ["shrid", "secc_pov_rate_tend_rural"]
shrug_secc_pov_r = shrug_secc[target_cols].dropna()
shrug_secc_pov_r

## Match target to features

In [None]:
# add MOSAIKS features to the SECC data
df = pd.merge(shrug_secc_pov_r, mosaiks_features_df, on="shrid")
df.head()

### Select X and y

In [None]:
# Positional slice: skip the first 4 and the last 6 columns of the merged frame.
# NOTE(review): presumably this leaves exactly the MOSAIKS feature columns, but it
# depends on the column order produced by the merges above — confirm after any
# upstream change.
X = df.iloc[:, 4:-6]
# X = X.sample(1000, axis=1, random_state=42)
# Put the coordinates in front (final order: Lon, Lat, then the sliced columns) so
# they go through the train/test split; they are dropped again before fitting.
X.insert(0, "Lat", df["Lat"])
X.insert(0, "Lon", df["Lon"])

In [None]:
# Target: rural poverty rate column taken from the SECC table
y = df["secc_pov_rate_tend_rural"]

# Modelling borrowed from MOSAIKS Demo

## 3. Run a ridge regression of label on the MOSAIKS features

This step learns how the visual information in the imagery, as captured by the features, relates to the labels. 

Before performing a ridge regression, we first split our data into train (80%) and test (20%). We will estimate the models on the train set and then evaluate predictions in the test set. This separation of train and test set is important to address issues related to overfitting. 

In [None]:
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=1,)

#save test set lat/lons for later
plotting_coords = X_test.loc[:, ["Lat", "Lon"]]

#remove lat/lons columns
X_train = X_train.drop(columns=["Lat", "Lon"])
X_test = X_test.drop(columns=["Lat", "Lon"])

In [None]:
# Candidate regularisation strengths (the "lambda" values) for the ridge penalty
candidate_alphas = [0.001, 0.01, 1, 10]

# 3-fold cross-validation, repeated twice, with a fixed seed
cv = RepeatedKFold(n_splits=3, n_repeats=2, random_state=1)

# Fit a ridge regression, letting cross-validation choose among the candidates
model = RidgeCV(alphas=candidate_alphas, cv=cv).fit(X_train, y_train)

# report the chosen regularisation strength
print(f"alpha: {model.alpha_:f}")

In [None]:
# # or
# model = XGBRegressor(n_estimators=1000, max_depth=7, eta=0.1, subsample=0.7, colsample_bytree=0.8, n_jobs=-1)
# # fit model
# model.fit(X_train, y_train)

## 4. Make predictions and evaluate performance

Next, we use the trained model to make predictions in the test set.

In [None]:
# Predict the target for the held-out test features
y_pred = model.predict(X_test)

We can then evaluate performance in the test set by comparing predictions to the label data.

In [None]:
# R^2 between the observed and predicted test-set values
test_r2 = r2_score(y_test, y_pred)
print(f"r2: {test_r2:f}")

We can also make a scatter plot of labeled data against predictions.

In [None]:
# Clip negative predictions to 0 in place: the target here is a poverty rate,
# which cannot be below zero.
np.clip(y_pred, 0, None, out=y_pred)

# Scatter plot of predictions (x) against observations (y), both axes limited to [0, 1]
ax = sns.scatterplot(x=y_pred, y=y_test)
ax.set(xlabel="Predicted", ylabel="Observed")
ax.set_ylim(0, 1)
ax.set_xlim(0, 1)

Finally, we can compare the spatial distribution of label (i.e. observed) values with predicted values.

In [None]:
# Combine the test-set lat/lons with the predicted and observed values.
# assign() returns a new frame, so plotting_coords itself is never modified and
# no SettingWithCopyWarning is triggered. `predicted` is placed positionally
# (y_pred is an array in test-set order); `observed` is aligned on the index.
map_plot = plotting_coords.assign(predicted=y_pred, observed=y_test)
map_plot.head()

In [None]:
f, axes = plt.subplots(1, 2, sharey=True, figsize=(10, 5))

# Common colour limits so the two panels are directly comparable
vmax = max(y_test.max(), y_pred.max())
vmin = min(y_test.min(), y_pred.min())

# One panel per value column: observed on the left, predicted on the right
panel_specs = [("observed", "Observed"), ("predicted", "Predicted")]
for panel, (value_col, panel_title) in zip(axes, panel_specs):
    map_plot.plot(
        kind="scatter",
        x="Lon",
        y="Lat",
        c=value_col,
        colorbar=False,
        alpha=0.5,
        vmin=vmin,
        vmax=vmax,
        ax=panel,
    )
    panel.set_title(panel_title)

plt.tight_layout()