### Setup

In [None]:
# set libraries to refresh
%load_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path
from tqdm.notebook import tqdm
import pandas as pd
import geopandas as gpd

# import fiona and mark its KML driver as supported for both reading and
# writing ('rw') in fiona's driver table
# NOTE(review): no cell visible in this notebook reads or writes KML — confirm
# this registration is still needed
import fiona
fiona.drvsupport.supported_drivers['KML'] = 'rw'

In [None]:
from clustering.utils import get_cluster_pivot_gdf
from utils import plot_weights_vs_radii, save_shapefiles

### Custom function for this notebook

In [None]:
def load_and_process_barangay(selected_barangay_filepath) -> gpd.GeoDataFrame:
    """Read one barangay parquet file and add a ``unique_cluster_id`` column.

    The new column is built as ``"B_" + PSGC + "_" + cluster_id``, with
    ``PSGC`` cast to ``str`` first.

    Args:
        selected_barangay_filepath: Location of a parquet file that
            ``geopandas.read_parquet`` can open. The file must provide the
            ``PSGC`` and ``cluster_id`` columns.

    Returns:
        The loaded GeoDataFrame with ``unique_cluster_id`` inserted at column
        position 8 (so the file needs at least 8 columns).
    """
    barangay_gdf = gpd.read_parquet(selected_barangay_filepath)

    # PSGC is cast to text; cluster_id is concatenated as-is.
    # NOTE(review): presumably cluster_id is already stored as strings — confirm
    psgc_prefix = "B_" + barangay_gdf["PSGC"].astype(str) + "_"
    barangay_gdf.insert(
        loc=8,
        column="unique_cluster_id",
        value=psgc_prefix + barangay_gdf["cluster_id"],
    )
    return barangay_gdf

### Load data

In [None]:
# data locations, expressed relative to the parent of the working directory
ROOT_DIR = Path("..")
INPUT_DATA_DIR = ROOT_DIR.joinpath("data", "output", "lambda_processed")
OUTPUT_DATA_DIR = ROOT_DIR.joinpath("data", "output")
# OUTPUT_DATA_DIR = ROOT_DIR / "data_hpls" / "02_modeloutputs" / "online"

In [None]:
# list the per-barangay input files found directly under INPUT_DATA_DIR.
# Path.glob yields entries in arbitrary, filesystem-dependent order, so the
# listing is sorted to make the subset below identical on every run/machine.
filepaths = sorted(INPUT_DATA_DIR.glob("*"))
# keep at most the first 100 files
filepaths = filepaths[:100]

In [None]:
# load and process all barangays: one GeoDataFrame per input file,
# with tqdm showing progress over the file list
grid_gdf_list = [load_and_process_barangay(path) for path in tqdm(filepaths)]

### Combine barangays

In [None]:
# stack the per-file frames into a single table; each frame keeps its own
# row index at this point (no ignore_index)
grid_gdf = pd.concat(grid_gdf_list)
# show the combined table
grid_gdf

In [None]:
# order rows by grid_id, then discard the old index in favour of a fresh
# 0..n-1 range
grid_gdf = grid_gdf.sort_values(by=["grid_id"]).reset_index(drop=True)

In [None]:
# show the table after sorting by grid_id and resetting the index
grid_gdf

### Save combined data

In [None]:
# folder that receives every file written by the cells below
COMBINED_DIR = OUTPUT_DATA_DIR.joinpath("test")

In [None]:
# save grid-level dataset under COMBINED_DIR with the name "grids_w_clusters",
# requesting both parquet and csv output
# NOTE(review): exact file names/layout are decided inside save_shapefiles
# (project helper, not visible here) — confirm
save_shapefiles(grid_gdf, COMBINED_DIR, "grids_w_clusters", formats=["parquet", "csv"])

### Pivot to cluster-level and save

In [None]:
# build the cluster-level table from the grid-level one, keyed on
# unique_cluster_id and weighted by population
# NOTE(review): the aggregation itself lives in
# clustering.utils.get_cluster_pivot_gdf (not visible here); presumably it
# returns one row per cluster — confirm
cluster_gdf = get_cluster_pivot_gdf(
    gdf_w_clusters=grid_gdf,
    cluster_id_col="unique_cluster_id",  # column added by load_and_process_barangay
    weight_col="population",
    # extra columns carried over to the cluster-level output
    cols_to_keep=[
        "PSGC",
        "urban",
        "dense_area_guess",
    ],
    with_stats=True,
    # NOTE(review): presumably the projected CRS used for metric computations — confirm
    epsg=3121,
)

In [None]:
# save cluster-level dataset under COMBINED_DIR with the name "clusters",
# requesting both parquet and csv output
# NOTE(review): exact file names/layout are decided inside save_shapefiles
# (project helper, not visible here) — confirm
save_shapefiles(cluster_gdf, COMBINED_DIR, "clusters", formats=["parquet", "csv"])

In [None]:
# plot the cluster-level table with plot_weights_vs_radii, passing
# COMBINED_DIR / "cluster_weights_vs_radii.png" as the output path
# NOTE(review): presumably the helper writes the figure to that path and
# returns the plot object kept in `jointplot` — confirm against utils
jointplot = plot_weights_vs_radii(
    cluster_df=cluster_gdf,
    output_filepath=COMBINED_DIR / "cluster_weights_vs_radii.png",
)