## Setup

In [None]:
# Enable IPython's autoreload extension so that edits to imported modules
# (e.g. the local `utils` and `clustering` packages) are picked up live,
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path
import geopandas as gpd

In [None]:
from utils import plot_weights_vs_radii
from clustering.kmeans import TunedClustering

## Load data

In [None]:
# Directory layout, expressed relative to the kernel's working directory:
#   ../data/input   -> source parquet files
#   ../data/output  -> generated artifacts (e.g. saved figures)
ROOT_DIR = Path("..")
DATA_DIR = ROOT_DIR.joinpath("data")
INPUT_DATA_DIR = DATA_DIR.joinpath("input")
OUTPUT_DATA_DIR = DATA_DIR.joinpath("output")

In [None]:
# Run settings (all forwarded to TunedClustering further down).
# NOTE(review): n_jobs=-1 presumably means "use all available cores"
# (joblib convention) - confirm against TunedClustering.
n_jobs = -1
initial_max_trials = 100
max_passes = 100
subsequent_max_trials = 20

# Data-specific settings: choose which input dataset to cluster.
# Supported values: "rooftops", "grid_weights".
data_type = "rooftops"
# data_type = "grid_weights"

if data_type == "rooftops":
    gdf_for_cluster = gpd.read_parquet(INPUT_DATA_DIR / "rooftops.parquet")
    # every rooftop counts equally, so give each row a unit weight
    gdf_for_cluster.loc[:, "weight"] = 1
    # admin variables
    id_col = "rooftop_id"
    lat_col = "Lat_centroid"
    lon_col = "Lon_centroid"
    weight_col = "weight"
    projected_epsg = 26191  # morocco
    # clustering variables
    desired_cluster_radius = 550
    desired_cluster_weight = 30
    max_cluster_weight = 50
    weight_importance_factor = 1

elif data_type == "grid_weights":
    gdf_for_cluster = gpd.read_parquet(INPUT_DATA_DIR / "grids.parquet")
    # admin variables
    id_col = "grid_id"
    lat_col = "Lat"
    lon_col = "Lon"
    weight_col = "population"
    projected_epsg = 3121  # philippines
    # clustering variables
    desired_cluster_radius = 1000
    desired_cluster_weight = 240
    max_cluster_weight = 300
    weight_importance_factor = 1

else:
    # Fail loudly on a typo instead of silently falling back to the grid
    # dataset with the wrong CRS and clustering targets.
    raise ValueError(
        f"Unsupported data_type {data_type!r}; "
        "expected 'rooftops' or 'grid_weights'."
    )

gdf_for_cluster

## Clustering

In [None]:
# Build the clustering object from the notebook's configuration values.
tuned_clustering = TunedClustering(
    # per-cluster targets
    desired_cluster_radius=desired_cluster_radius,
    desired_cluster_weight=desired_cluster_weight,
    max_cluster_weight=max_cluster_weight,
    weight_importance_factor=weight_importance_factor,
    # trial / pass limits
    initial_max_trials=initial_max_trials,
    subsequent_max_trials=subsequent_max_trials,
    max_passes=max_passes,
    # execution options
    n_jobs=n_jobs,
    show_progress_bar=True,
)

In [None]:
# Run the clustering on the loaded points; the result is requested as a
# GeoDataFrame (return_type="geodataframe").
gdf_w_clusters = tuned_clustering.run(
    return_type="geodataframe",
    projected_epsg=projected_epsg,
    weight_col=weight_col,
    gdf=gdf_for_cluster,
)

In [None]:
# Visualise the clustering result with the project's plot_weights_vs_radii
# helper. Uncomment `output_filepath` to also write the figure to disk.
plot_weights_vs_radii(
    y_human_readable=data_type,
    point_projected_epsg=projected_epsg,
    point_weight_col=weight_col,
    point_gdf_w_cluster=gdf_w_clusters,
    # output_filepath=OUTPUT_DATA_DIR / f"{data_type}_minibatch_init1_reassignment0.05_PROJECTED.png",
)

In [None]:
# 1m 44s for n_init=1 (classic)

# changing MiniBatch reassignment_ratio
# 1m 41s for n_init=1 (minibatch, reassignment_ratio=0)
# 1m 38s for n_init=1 (minibatch, reassignment_ratio=0.01)
# 1m 31s for n_init=1 (minibatch, reassignment_ratio=0.05)
# 1m 28s for n_init=1 (minibatch, reassignment_ratio=0.10)

# changing n_init
# 3m 15s for n_init=5 (classic)
# 2m 42s for n_init=5 (minibatch)