In [None]:
import pandas as pd
import geopandas as gpd
from pathlib import Path

SHRUG does not ship with shapefiles. From the SHRUG documentation:

"The SHRUG does not include geographic data in the form of polygons or shapefiles because
we have not yet found a sufficiently accurate data source with open sharing privileges. We are
continuing to investigate sources of geographic data and may include shapefiles in a future version
of the SHRUG. Users interested in obtaining geocodes or polygons for SHRUG units are advised
to examine the open village maps offered by NASA-SEDAC at Columbia University. These can
be directly merged to the 2001 Population Census SHRUG keys in `shrug_pc01r_key.dta` and
`shrug_pc01u_key.dta`. Our own aggregate data was based on 2011 village polygons which we
believe are slightly more accurate but are not made available with an open data license."

In [None]:
from custom.utils import import_shapefiles, get_bounds, create_coords_list, coords_list_to_gdf

## Load shapefiles (NASA-SEDAC)

In [None]:
# Load the shapefile table through the project helper from custom.utils.
# NOTE(review): presumably a GeoDataFrame (it is plotted below and its
# "geometry" column is used in the merge) — confirm against import_shapefiles.
dbf = import_shapefiles()

In [None]:
# Quick visual sanity check of the raw polygons. The title lets the figure
# stand on its own; the trailing ';' hides the text repr of the last call.
ax = dbf.plot(figsize=(5, 5))
ax.set_title("Village polygons (raw shapefile)");

In [None]:
# List the columns available in the shapefile table.
dbf.columns

In [None]:
# Distinct values of SID, the first component of the combined ID built below.
dbf["SID"].unique()

### Create unique ID columns to match SHRUG keys

In [None]:
# Total number of rows in the table (.shape also counts rows where VILL_CODE is missing).
dbf["VILL_CODE"].shape

In [None]:
# Count the rows that carry a TOWN_VILL value but have no VILL_CODE.
has_town_vill = dbf["TOWN_VILL"].notna()
lacks_vill_code = dbf["VILL_CODE"].isna()
(has_town_vill & lacks_vill_code).sum()

In [None]:
# Identifier columns that together make up the combined ID.
id_cols = ["SID", "DID", "TID", "VILL_CODE"]

# Drop rows that are missing any identifier; copy so `dbf` itself stays untouched.
dbf_clean = dbf.dropna(subset=id_cols).copy()

# Remove the leading 0s so the codes line up with integer-formatted keys.
# A code made only of zeros would be stripped down to "", whereas converting
# the integer 0 to text gives "0" — keep "0" so such codes can still match.
for col in id_cols:
    dbf_clean[col] = dbf_clean[col].str.lstrip("0").replace("", "0")

# Combined ID column: SID-DID-TID-VILL_CODE
dbf_clean["ID"] = (
    dbf_clean["SID"] + "-"
    + dbf_clean["DID"] + "-"
    + dbf_clean["TID"] + "-"
    + dbf_clean["VILL_CODE"]
)

# Re-assign instead of sorting with inplace=True.
dbf_clean = dbf_clean.sort_values(by="ID")

# Printing the row count next to the unique count makes duplicate IDs visible.
print("Number of rows: ", len(dbf_clean))
print("Number of unique IDs: ", dbf_clean["ID"].nunique())

In [None]:
# Visual check of the polygons that remain after dropping rows with missing
# identifiers. The trailing ';' hides the text repr of the last call.
ax = dbf_clean.plot(figsize=(5, 5))
ax.set_title("Village polygons with complete identifiers");

## Import SHRUG keys

In [None]:
# --- Rural key ---
# CSV with the SHRUG rural keys (pc01r); path is relative to this notebook.
rural_key_csv = "../data/SHRUG/shrug-v1.5.samosa-keys-csv/shrug_pc01r_key.csv"
shrug_pc01r_key = pd.read_csv(rural_key_csv)
shrug_pc01r_key.head()

# --- Urban key (currently disabled) ---
# shrug_pc01u_key = pd.read_csv(
#     "../data/SHRUG/shrug-v1.5.samosa-keys-csv/shrug_pc01u_key.csv"
# )
# shrug_pc01u_key.head()

### Create ID column to match IDs in NASA-SEDAC

In [None]:
# Key columns that must all be present to build the combined ID.
key_cols = [
    "pc01_state_id",
    "pc01_district_id",
    "pc01_subdistrict_id",
    "pc01_village_id",
]
shrug_pc01r_key_clean = shrug_pc01r_key.dropna(subset=key_cols).copy()

# pandas stores a numeric column that contains missing values as float, and a
# float formatted as text looks like "123.0", which can never equal a plain
# code such as "123". Cast to int first in that case; integer and string
# columns are formatted exactly as before.
village_id = shrug_pc01r_key_clean["pc01_village_id"]
if pd.api.types.is_float_dtype(village_id):
    village_id = village_id.astype(int)

# Combined ID: state-district-subdistrict-village, without leading zeros.
shrug_pc01r_key_clean["ID"] = (
    shrug_pc01r_key_clean["pc01_state_id"].astype(int).astype(str) + "-"
    + shrug_pc01r_key_clean["pc01_district_id"].astype(int).astype(str) + "-"
    + shrug_pc01r_key_clean["pc01_subdistrict_id"].astype(int).astype(str) + "-"
    + village_id.astype(str)
)

## Match shapes to SHRUG

In [None]:
# Inner join on the combined ID: only polygons that have a matching SHRUG key
# row are kept. The polygon table goes on the left-hand side of the merge.
village_shapes = dbf_clean[["geometry", "ID"]]
shrid_geom_df = pd.merge(
    left=village_shapes,
    right=shrug_pc01r_key_clean,
    how="inner",
    on="ID",
)
shrid_geom_df

Note: several villages can share the same shrid, so we merge (dissolve) the shapes of those villages into a single geometry per shrid.

In [None]:
# Example: all rows that belong to one particular shrid.
example_shrid = "11-28-803020"
shrid_geom_df.loc[shrid_geom_df["shrid"].eq(example_shrid)]

In [None]:
# Merge the polygons of all villages that share a shrid into one geometry per shrid.
# The non-geometry columns built in this notebook are identifiers, so adding
# them up with aggfunc='sum' yields meaningless codes (and, depending on the
# pandas version, concatenates or drops text columns such as "ID").
# 'first' keeps one real value per column instead.
# NOTE(review): assumes no column of the key file is meant to be summed — confirm.
shrid_geom_df = shrid_geom_df.dissolve(by="shrid", aggfunc="first").reset_index()
# Note: village-level columns (e.g. "ID", "pc01_village_id") now hold the value
# of the first village in each shrid only.

In [None]:
# Plot the matched polygons held in shrid_geom_df. The trailing ';' hides the
# text repr of the last call.
ax = shrid_geom_df.plot(figsize=(5, 5))
ax.set_title("Village polygons matched to SHRUG (by shrid)");