## Setup

In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell executes, so edits to local packages (e.g. `clustering`,
# `utils`) take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path
import geopandas as gpd

In [None]:
from clustering.kmeans import custom_kmeans, get_oversized_clusters, run_optuna_kmeans_study, kmeans_secondpass #, parallel_kmeans_secondpass
from utils import plot_weights_vs_radii

## Load data

In [None]:
# Project directory layout, expressed relative to the notebook's working directory.
ROOT_DIR = Path("..")
DATA_DIR = ROOT_DIR.joinpath("data")
INPUT_DATA_DIR, OUTPUT_DATA_DIR = (
    DATA_DIR.joinpath(subdir) for subdir in ("input", "output")
)

In [None]:
# Load the input grids (stored as Parquet) that the clustering step operates on.
barangay_grids = gpd.read_parquet(INPUT_DATA_DIR.joinpath("data.parquet"))

## Clustering

In [None]:
# --- Column names and projection shared by every clustering call ---
id_col, lat_col, lon_col = "grid_id", "Lat", "Lon"
weight_col = "population"
epsg = 3121  # projected CRS used for the Philippines

# --- Settings applied to both the first and the second pass ---
weight_importance_factor, n_jobs = 1, 2

# --- First pass: target cluster weight and number of tuning trials ---
desired_weight, firstpass_n_trials = 240, 96

# --- Second pass: clusters heavier than the cutoff are re-clustered ---
secondpass_cutoff_weight, secondpass_n_trials = 300, 24


def cluster_data(gdf_for_cluster):
    """Assign every grid in ``gdf_for_cluster`` to a weight-balanced cluster.

    The procedure is:

    1. If the total weight is zero, skip clustering and put every row in a
       single placeholder cluster (``"CLUSTER_0"``).
    2. First pass: run a tuning study (``run_optuna_kmeans_study``) and take
       its best ``n_clusters``.
    3. Re-run ``custom_kmeans`` with that ``n_clusters`` and merge the result
       back onto the input on ``id_col``.
    4. Second pass: clusters reported by ``get_oversized_clusters`` (cutoff
       ``secondpass_cutoff_weight``) are flagged in ``dense_area_guess`` and
       re-clustered with ``kmeans_secondpass``.

    The function reads the module-level settings ``id_col``, ``lat_col``,
    ``lon_col``, ``weight_col``, ``epsg``, ``weight_importance_factor``,
    ``n_jobs``, ``desired_weight``, ``firstpass_n_trials``,
    ``secondpass_cutoff_weight`` and ``secondpass_n_trials``.

    Args:
        gdf_for_cluster: Frame holding the grids to cluster. It must contain
            the ``weight_col`` column and an ``"urban"`` column; only the
            ``"urban"`` value of the FIRST row is consulted, so the frame is
            presumably expected to cover a single, homogeneous area
            (NOTE(review): confirm with callers).

    Returns:
        A copy of the input with a ``cluster_id`` column and a
        ``dense_area_guess`` flag (1 where ``cluster_weight`` exceeded the
        second-pass cutoff after the first pass, else 0). Outside the
        zero-weight shortcut the result is sorted by ``cluster_id``.
    """
    # Nothing to balance: keep all rows together in one placeholder cluster.
    if gdf_for_cluster[weight_col].sum() == 0:
        gdf_w_clusters = gdf_for_cluster.copy()
        gdf_w_clusters.loc[:, "cluster_id"] = "CLUSTER_0"
        gdf_w_clusters.loc[:, "cluster_weight"] = 0.0
        gdf_w_clusters.loc[:, "dense_area_guess"] = 0
        return gdf_w_clusters

    # Dynamic radius parameter: urban areas get the tighter target radius.
    desired_radius = 1000 if gdf_for_cluster["urban"].iloc[0] else 2000
    print(f"Grids in barangay: {len(gdf_for_cluster)}")

    # First pass: search for the best number of clusters.
    study_firstpass = run_optuna_kmeans_study(
        gdf=gdf_for_cluster,
        desired_cluster_weight=desired_weight,
        desired_cluster_radius=desired_radius,
        id_col=id_col,
        lat_col=lat_col,
        lon_col=lon_col,
        weight_col=weight_col,
        # Fixed: this previously passed `desired_weight`, which silently
        # ignored the configured factor and disagreed with the second pass.
        weight_importance_factor=weight_importance_factor,
        epsg=epsg,
        n_trials=firstpass_n_trials,
        n_jobs=n_jobs,
    )

    # Proper run with the best n_clusters found by the study.
    # NOTE(review): custom_kmeans is called without weight_importance_factor;
    # confirm its default matches the value used during the study.
    clusters = custom_kmeans(
        df=gdf_for_cluster,
        n_clusters=study_firstpass.best_params["n_clusters"],
        id_col=id_col,
        lat_col=lat_col,
        lon_col=lon_col,
        weight_col=weight_col,
    )
    gdf_w_clusters = gdf_for_cluster.merge(clusters, on=id_col)
    gdf_w_clusters = gdf_w_clusters.sort_values(by="cluster_id")

    # Second pass: find clusters that are still too heavy.
    oversized_cluster_ids = get_oversized_clusters(
        gdf_w_clusters=gdf_w_clusters, cutoff_weight=secondpass_cutoff_weight
    )
    n_oversized = len(oversized_cluster_ids)
    print(f"Oversized clusters: {n_oversized}")

    # Add the dense_area_guess column: 1 for rows whose first-pass cluster
    # exceeded the cutoff weight, 0 otherwise. This is recorded before the
    # re-clustering below changes the cluster assignment.
    gdf_w_clusters.loc[:, "dense_area_guess"] = 0
    gdf_w_clusters.loc[
        gdf_w_clusters["cluster_weight"] > secondpass_cutoff_weight,
        "dense_area_guess",
    ] = 1

    if n_oversized > 0:
        # Re-cluster only the oversized clusters.
        gdf_w_clusters = kmeans_secondpass(
            gdf_w_clusters=gdf_w_clusters,
            oversized_cluster_ids=oversized_cluster_ids,
            desired_cluster_weight=desired_weight,
            desired_cluster_radius=desired_radius,
            id_col=id_col,
            lat_col=lat_col,
            lon_col=lon_col,
            weight_col=weight_col,
            weight_importance_factor=weight_importance_factor,
            epsg=epsg,
            n_trials=secondpass_n_trials,
            n_jobs=n_jobs,
        )
        gdf_w_clusters = gdf_w_clusters.sort_values(by="cluster_id")

    return gdf_w_clusters

In [None]:
# Run the two-pass clustering on the loaded grids.
gdf_w_clusters = cluster_data(gdf_for_cluster=barangay_grids)

In [None]:
# Diagnostic plot of the clustering result (weights against radii).
plot_weights_vs_radii(
    point_gdf_w_cluster=gdf_w_clusters,
    point_weight_col=weight_col,
    point_projected_epsg=epsg,
)

In [None]:
# Map of the grids, colored by their assigned cluster.
gdf_w_clusters.plot(
    column="cluster_id",
    figsize=(5, 5),
)