# K-means

In [None]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import random
import import_ipynb
import performance_calc as pc
import pickle

In [None]:

def k_means(delivery_points, k):
    """Cluster delivery points with K-Means on standardized coordinates.

    Args:
        delivery_points: DataFrame with 'lat', 'lon' and 'location_id'
            columns. It is modified in place: a 'cluster' column holding
            the 0-based K-Means label of each row is added (or overwritten).
        k: Number of clusters to build.

    Returns:
        A tuple ``(cluster_dict, list_of_clusters)`` where ``cluster_dict``
        maps the 1-based cluster number to the list of 'location_id' values
        assigned to it, and ``list_of_clusters`` holds the same lists in
        cluster order.
    """
    # Standardize lat/lon so both coordinates weigh equally in the distance.
    scaler = StandardScaler()
    coords_scaled = scaler.fit_transform(delivery_points[['lat', 'lon']])

    # Use the requested number of clusters (it used to be hard-coded to 50,
    # silently ignoring `k`). The seed is drawn at random on every call, so
    # repeated runs may yield different partitions.
    kmeans = KMeans(n_clusters=k, random_state=random.randint(0, 100000), n_init=10)
    labels = kmeans.fit_predict(coords_scaled)

    # Attach the labels to the caller's DataFrame.
    delivery_points['cluster'] = labels

    # Map 1-based cluster number -> location ids belonging to that cluster.
    cluster_dict = {
        label + 1: delivery_points.loc[delivery_points['cluster'] == label, 'location_id'].tolist()
        for label in range(k)
    }

    # This is the list meant to be fed to Google OR-Tools.
    list_of_clusters = list(cluster_dict.values())

    # Print the final result (loop variable no longer shadows `k`).
    print("\nARRAY FINALE DEI CLUSTER:")
    for cluster_id, ids in cluster_dict.items():
        print(f"Cluster {cluster_id}: {ids}")

    return cluster_dict, list_of_clusters


In [None]:
# -----------------------------
# 7. Visualizza i cluster su grafico
# -----------------------------
import matplotlib.pyplot as plt
import numpy as np

def plot_clusters(cluster_df, k):
    """Scatter-plot clustered points, one colour and one number per cluster.

    Args:
        cluster_df: DataFrame with 'lon', 'lat' and 'cluster' columns, where
            'cluster' holds 0-based labels.
        k: Number of clusters; labels 0..k-1 are drawn and shown in the
            legend as "Cluster 1".."Cluster k".

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    plt.figure(figsize=(25, 20))

    # 'hsv' is a cyclic colormap (both ends are red). Excluding the endpoint
    # keeps the first and the last cluster from sharing the same colour.
    colors = plt.cm.hsv(np.linspace(0, 1, k, endpoint=False))

    for c in range(k):
        subset = cluster_df[cluster_df['cluster'] == c]
        if subset.empty:
            # No points carry this label: skip it, otherwise the centroid
            # would be NaN and the legend would gain an empty entry.
            continue

        plt.scatter(subset['lon'], subset['lat'], s=20, c=[colors[c]], label=f'Cluster {c+1}',
                    edgecolors='k', linewidths=0.5)

        # Mean position of the cluster's points, used to place its number.
        center_lon = subset['lon'].mean()
        center_lat = subset['lat'].mean()

        # Write the 1-based cluster number at that position.
        plt.text(center_lon, center_lat, str(c+1), fontsize=12, fontweight='bold', ha='center', va='center',
                 color='black', bbox=dict(facecolor='white', alpha=0.7, edgecolor='none', boxstyle='round,pad=0.3'))

    plt.xlabel('Longitudine')
    plt.ylabel('Latitudine')
    plt.title(f'Cluster KMeans con k={k}')
    plt.legend(loc='best', fontsize='small', ncol=2)
    plt.grid(True)
    plt.show()

# Routing

In [None]:

# Cluster the AS delivery points, compute per-cluster statistics and persist
# both results. cluster_dict is for plotting - list_of_clusters is for routing.
import os

# Number of clusters, used for both the k_means call and the output file
# names so the two cannot drift apart.
N_CLUSTERS = 50

cluster_dict, list_of_clusters = k_means(pc.delivery_points_AS, N_CLUSTERS)
clusters_performances = pc.calc_clusters_stats_AS(list_of_clusters)

# Neither to_csv nor open() creates missing directories: make sure they exist.
os.makedirs('clustering_methods_performances', exist_ok=True)
clusters_performances.to_csv(f'clustering_methods_performances/k-means_performances_v3(k={N_CLUSTERS})_AS.csv')

# Save cluster_dict as a pickle file.
os.makedirs('cluster_dicts', exist_ok=True)
with open(f'cluster_dicts/cluster_dict_k-means_v3(k={N_CLUSTERS})_AS.pkl', 'wb') as f:
    pickle.dump(cluster_dict, f)

In [None]:
# Cluster the ON delivery points, compute per-cluster statistics and persist
# both results. cluster_dict is for plotting - list_of_clusters is for routing.
import os

# Number of clusters, used for both the k_means call and the output file
# names so the two cannot drift apart.
N_CLUSTERS = 50

cluster_dict, list_of_clusters = k_means(pc.delivery_points_ON, N_CLUSTERS)
clusters_performances = pc.calc_clusters_stats_ON(list_of_clusters)

# Neither to_csv nor open() creates missing directories: make sure they exist.
os.makedirs('clustering_methods_performances', exist_ok=True)
clusters_performances.to_csv(f'clustering_methods_performances/k-means_performances_v3(k={N_CLUSTERS})_ON.csv')

# Save cluster_dict as a pickle file.
os.makedirs('cluster_dicts', exist_ok=True)
with open(f'cluster_dicts/cluster_dict_k-means_v3(k={N_CLUSTERS})_ON.pkl', 'wb') as f:
    pickle.dump(cluster_dict, f)