In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import geohash2
import os
from haversine import haversine, Unit

In [None]:
# Input for this notebook: one cluster CSV located under ./clustered_data.
# Later cells expect it to provide LATITUDE and LONGITUDE columns.
input_csv = './clustered_data/geohash_75cjt_cluster_0.csv'
data = pd.read_csv(filepath_or_buffer=input_csv)

In [None]:
def apply_geohash(lat, lon, precision=5):
    """Encode a latitude/longitude pair with ``geohash2.encode``.

    Args:
        lat: Latitude of the point.
        lon: Longitude of the point.
        precision: Forwarded unchanged to ``geohash2.encode`` (defaults to 5).

    Returns:
        Whatever ``geohash2.encode`` returns for the given position.
    """
    encoded = geohash2.encode(lat, lon, precision=precision)
    return encoded

In [None]:
# Tag every row with the geohash of its coordinates (apply_geohash's default
# precision). Zipping the two columns avoids DataFrame.apply(axis=1), which
# materialises a Series per row; it also yields a plain empty column when
# `data` has no rows instead of relying on apply's empty-frame behaviour.
data['geohash'] = [
    apply_geohash(lat, lon)
    for lat, lon in zip(data['LATITUDE'], data['LONGITUDE'])
]

In [None]:
# Bucket the rows by geohash cell; the export loop further down iterates over
# these (geohash, rows) pairs and clusters each bucket independently.
grouped = data.groupby(by='geohash')

In [None]:
# Directory that receives the per-cluster CSVs and their distance matrices.
output_dir = 'clustered_data'
# exist_ok=True keeps this cell re-runnable when the directory already exists.
os.makedirs(name=output_dir, exist_ok=True)

In [None]:
# Upper bound on the number of KMeans clusters fitted per geohash cell.
MAX_CLUSTERS_PER_GEOHASH = 5
# Fixed seed for KMeans centroid initialisation, so cluster labels (and hence
# the files written below) are the same on every Restart & Run All.
KMEANS_SEED = 42


def _pairwise_haversine_km(coords):
    """Return the symmetric n x n matrix of haversine distances in kilometres.

    Args:
        coords: Sequence of (latitude, longitude) pairs, one per point.

    Returns:
        A numpy array where entry [i, j] is the haversine distance between
        point i and point j; the diagonal is zero.
    """
    n = len(coords)
    matrix = np.zeros((n, n))
    for i in range(n):
        for j in range(i + 1, n):
            # The distance is symmetric: compute each pair once and mirror it.
            matrix[i, j] = matrix[j, i] = haversine(
                coords[i], coords[j], unit=Unit.KILOMETERS
            )
    return matrix


for geohash, group in grouped:
    # KMeans cannot be asked for more clusters than there are points.
    num_clusters = min(len(group), MAX_CLUSTERS_PER_GEOHASH)
    if num_clusters > 1:
        kmeans = KMeans(n_clusters=num_clusters, random_state=KMEANS_SEED)
        kmeans.fit(group[['LATITUDE', 'LONGITUDE']])
        cluster_labels = kmeans.labels_
    else:
        # A single row (or none) forms one trivial cluster.
        cluster_labels = 0

    # assign() builds a new frame instead of writing into the slice handed out
    # by groupby, which can trigger pandas' SettingWithCopyWarning.
    group = group.assign(cluster_label=cluster_labels)

    clusters = group.groupby('cluster_label')
    for label, cluster in clusters:
        # NOTE(review): the notebook's input file
        # (./clustered_data/geohash_75cjt_cluster_0.csv) matches this naming
        # pattern in the same directory, so a run whose rows hash to 75cjt
        # replaces its own input here -- confirm that this is intended.
        filename = f"{output_dir}/geohash_{geohash}_cluster_{label}.csv"
        cluster.to_csv(filename, index=False)
        print(f"Output written to {filename}")

        coords = cluster[['LATITUDE', 'LONGITUDE']].to_numpy()
        distance_matrix = _pairwise_haversine_km(coords)

        distance_filename = f"{output_dir}/geohash_{geohash}_cluster_{label}_distances.csv"
        np.savetxt(distance_filename, distance_matrix, delimiter=',')
        print(f"Distance matrix written to {distance_filename}")