# Versione 5

Split: max(mean_minutes) > 450 e almeno 5 punti, split in ceil(max/450) parti (nome "s_x_y").

Cluster buoni: max(mean_minutes) tra 350 e 450 incluso, vengono salvati.

Merge: merge solo se somma per ogni giorno di mean_minutes < 450 (nome "m_x_y").

Cluster troppo piccoli: quelli con 1 o 2 punti (min_cluster_size=3), sempre mergeabili.

Routing: ricalcolato solo per cluster effettivamente cambiati (split/merge).

Stop: max iterazioni, tempo massimo, oppure 10 iterazioni consecutive senza miglioramenti.

## full

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from concurrent.futures import ThreadPoolExecutor, as_completed
import multiprocessing as mp
import time
import os
import pickle
import math
from itertools import combinations

import import_ipynb
import performance_calc as pc

class AdaptivePerformanceClustering:
    """Adaptive split/accept/merge clustering driven by routing performance.

    An initial k-means partition of the delivery points is refined in a loop:
    clusters whose largest per-weekday ``mean_minutes`` exceeds the upper
    target are split, clusters inside the target window are accepted as final,
    and the remaining ones are merged pairwise when the per-weekday sums stay
    below the upper target. Routing statistics are obtained from
    ``pc.calc_clusters_stats``.
    """

    # Target window (minutes) for the largest per-weekday mean_minutes.
    MIN_TARGET_MINUTES = 350
    MAX_TARGET_MINUTES = 450
    # A cluster must contain more than this many points to be split.
    MAX_UNSPLIT_SIZE = 4
    # Consecutive iterations without improvement tolerated before stopping.
    NO_IMPROVEMENT_LIMIT = 10

    def __init__(self,
                 n_cores: int = None,
                 cache_dir: str = "./cluster_cache",
                 max_iterations: int = 15,
                 max_execution_time_min: int = 500):
        """Store the run parameters and create the cache directory.

        Args:
            n_cores: Worker count passed to the performance calculation;
                defaults to ``cpu_count() - 1`` (at least 1).
            cache_dir: Directory created on construction if it is missing.
            max_iterations: Upper bound on refinement iterations.
            max_execution_time_min: Time budget in minutes, checked at the
                start of every iteration.
        """
        self.n_cores = n_cores or max(1, mp.cpu_count() - 1)
        self.cache_dir = cache_dir
        self.max_iterations = max_iterations
        self.max_execution_time_min = max_execution_time_min
        self.final_clusters = {}
        self.cluster_performances = {}
        # Clusters with 1 or 2 points are considered "too small".
        self.min_cluster_size = 3
        os.makedirs(cache_dir, exist_ok=True)
        print(f"üöÄ AdaptivePerformanceClustering avviato con {self.n_cores} core e timeout {max_execution_time_min} min")

    def _cache_key(self, location_ids):
        """Return an order-independent hash of a collection of location ids."""
        return hash(tuple(sorted(location_ids)))

    def _compute_batch_performances(self, cluster_dict, verbose=True):
        """Compute routing performance for the given clusters in one batch.

        Results are written to ``self.cluster_performances`` keyed by cluster
        id. Clusters below ``self.min_cluster_size`` are not routed: they get
        a placeholder entry flagged ``too_small``.

        Args:
            cluster_dict: Mapping ``cluster_id -> list of location ids``.
            verbose: Print progress messages when true.
        """
        if verbose:
            print(f"üîé Calcolo performance in batch ({len(cluster_dict)} cluster)...")

        routable = {
            cid: locs for cid, locs in cluster_dict.items()
            if len(locs) >= self.min_cluster_size
        }
        if routable:
            performance_df = pc.calc_clusters_stats(
                clusters=list(routable.values()),
                time_limit=3,
                parallel=True,
                max_workers=self.n_cores,
                verbose=False
            )
            # Rows are looked up by the positional label 'Cluster <i>' (1-based),
            # in the same order the clusters were submitted.
            for position, cluster_id in enumerate(routable, start=1):
                cluster_data = performance_df[performance_df['cluster'] == f'Cluster {position}']
                if cluster_data.empty:
                    # No statistics returned: mark the cluster as invalid.
                    self.cluster_performances[cluster_id] = {
                        'dataframe': None,
                        'max_mean_minutes': float('inf'),
                        'is_valid': False
                    }
                else:
                    # Largest mean_minutes over all rows of this cluster.
                    self.cluster_performances[cluster_id] = {
                        'dataframe': cluster_data,
                        'max_mean_minutes': cluster_data['mean_minutes'].max(),
                        'is_valid': True
                    }

        # Too-small clusters: valid placeholder so they never block a merge.
        for cluster_id, locs in cluster_dict.items():
            if len(locs) < self.min_cluster_size:
                self.cluster_performances[cluster_id] = {
                    'dataframe': None,
                    'max_mean_minutes': 0,
                    'is_valid': True,
                    'too_small': True
                }
        if verbose:
            print(f"    ‚úÖ Performance salvate in self.cluster_performances")
        return

    def _save_good_clusters(self, cluster_dict, verbose=True):
        """Move clusters inside the target window into ``self.final_clusters``.

        A cluster is accepted when it is not too small, its performance entry
        is valid, and its largest ``mean_minutes`` lies between
        ``MIN_TARGET_MINUTES`` and ``MAX_TARGET_MINUTES`` (both inclusive).

        Args:
            cluster_dict: Mapping ``cluster_id -> list of location ids``.
            verbose: Print one line per accepted cluster and a summary.

        Returns:
            dict: The clusters that were not accepted, in their input order.
        """
        saved_count = 0
        remaining_clusters = {}
        for cluster_id, location_ids in cluster_dict.items():
            perf = self.cluster_performances.get(cluster_id)
            accepted = (
                len(location_ids) >= self.min_cluster_size
                and bool(perf)
                and perf['is_valid']
                and self.MIN_TARGET_MINUTES <= perf['max_mean_minutes'] <= self.MAX_TARGET_MINUTES
            )
            if not accepted:
                # Keep it for the next round.
                remaining_clusters[cluster_id] = location_ids
                continue
            max_mean = perf['max_mean_minutes']
            # Final ids are sequential, starting at 1.
            self.final_clusters[len(self.final_clusters) + 1] = location_ids
            saved_count += 1
            if verbose:
                print(f"  ‚úÖ Accettato cluster {cluster_id}: max(mean_minutes)={max_mean:.1f} min, punti={len(location_ids)}")
        if verbose:
            print(f"  üèÅ Salvati {saved_count} cluster ideali ({self.MIN_TARGET_MINUTES}‚â§max‚â§{self.MAX_TARGET_MINUTES} min)")
        return remaining_clusters

    def _split_oversized_clusters(self, cluster_dict, verbose=True):
        """Split clusters whose largest ``mean_minutes`` exceeds the upper target.

        A cluster with more than ``MAX_UNSPLIT_SIZE`` points and a finite
        ``max_mean_minutes`` above ``MAX_TARGET_MINUTES`` is divided into
        ``ceil(max_mean_minutes / MAX_TARGET_MINUTES)`` consecutive parts
        (capped at the number of points) named ``s_<id>_<k>``. Part sizes
        differ by at most one, so no part is ever empty. Clusters without a
        performance entry, or with a non-finite ``max_mean_minutes`` (the
        value stored for invalid clusters), are passed through unchanged
        because no split count can be derived for them.

        Args:
            cluster_dict: Mapping ``cluster_id -> list of location ids``.
            verbose: Print one line per split and a recalculation notice.

        Returns:
            dict: The clusters after splitting; performance is recomputed for
            the newly created parts only.
        """
        new_clusters = {}
        clusters_to_recalc = {}
        for cluster_id, location_ids in cluster_dict.items():
            perf = self.cluster_performances.get(cluster_id)
            max_mean = perf['max_mean_minutes'] if perf else None
            size = len(location_ids)
            splittable = (
                max_mean is not None
                and math.isfinite(max_mean)
                and size > self.MAX_UNSPLIT_SIZE
                and max_mean > self.MAX_TARGET_MINUTES
            )
            if not splittable:
                new_clusters[cluster_id] = location_ids
                continue

            # Never ask for more parts than there are points.
            n_splits = min(size, int(math.ceil(max_mean / self.MAX_TARGET_MINUTES)))
            if verbose:
                print(f"  ‚úÇÔ∏è Cluster {cluster_id}: {size} punti, max(mean_minutes)={max_mean:.1f} split in {n_splits}")
            # Balanced partition: the first `extra` parts get one more point.
            base, extra = divmod(size, n_splits)
            start = 0
            for i in range(n_splits):
                end = start + base + (1 if i < extra else 0)
                chunk = location_ids[start:end]
                new_id = f"s_{cluster_id}_{i+1}"
                new_clusters[new_id] = chunk
                clusters_to_recalc[new_id] = chunk
                start = end

        # Recompute routing only for the freshly split clusters.
        if clusters_to_recalc:
            if verbose:
                print(f"    üîÑ Ricalcolo routing per {len(clusters_to_recalc)} nuovi cluster splittati")
            self._compute_batch_performances(clusters_to_recalc, verbose=False)
        return new_clusters

    def _can_merge_clusters(self, cluster_id1, cluster_id2):
        """Tell whether two clusters may be merged.

        A merge is always allowed when either cluster is flagged ``too_small``.
        Otherwise both entries must be valid, share at least one weekday, and
        the sum of their ``mean_minutes`` must stay strictly below
        ``MAX_TARGET_MINUTES`` on every shared weekday.

        Returns:
            bool: True when the merge is allowed.
        """
        perf1 = self.cluster_performances.get(cluster_id1)
        perf2 = self.cluster_performances.get(cluster_id2)
        if (perf1 and perf1.get('too_small')) or (perf2 and perf2.get('too_small')):
            return True
        if not (perf1 and perf2 and perf1['is_valid'] and perf2['is_valid']):
            return False
        df1, df2 = perf1['dataframe'], perf2['dataframe']
        if df1 is None or df2 is None:
            # Nothing to compare against.
            return False
        common_days = set(df1['weekday']) & set(df2['weekday'])
        if not common_days:
            return False
        for day in common_days:
            # First row for that weekday in each cluster's statistics.
            m1 = df1[df1['weekday'] == day]['mean_minutes'].iloc[0]
            m2 = df2[df2['weekday'] == day]['mean_minutes'].iloc[0]
            if m1 + m2 >= self.MAX_TARGET_MINUTES:
                return False
        return True

    def _divide_space_into_sectors(self, delivery_points):
        """Assign every cluster to one of four sectors (NE, NW, SE, SW).

        The sector is chosen by comparing the cluster centroid (mean lat/lon of
        its points) with the median latitude and longitude of all points.

        Args:
            delivery_points: DataFrame with ``lat``, ``lon`` and ``cluster``
                columns.

        Returns:
            dict: ``sector name -> list of cluster ids``.
        """
        lat_median = delivery_points['lat'].median()
        lon_median = delivery_points['lon'].median()

        sectors = {'NE': [], 'NW': [], 'SE': [], 'SW': []}

        for cluster_id in delivery_points['cluster'].unique():
            cluster_points = delivery_points[delivery_points['cluster'] == cluster_id]
            if len(cluster_points) == 0:
                continue

            center_lat = cluster_points['lat'].mean()
            center_lon = cluster_points['lon'].mean()

            # Quadrant relative to the medians.
            if center_lat >= lat_median and center_lon >= lon_median:
                sector = 'NE'
            elif center_lat >= lat_median and center_lon < lon_median:
                sector = 'NW'
            elif center_lat < lat_median and center_lon >= lon_median:
                sector = 'SE'
            else:
                sector = 'SW'

            sectors[sector].append(cluster_id)

        return sectors

    def _merge_clusters_by_sector(self, cluster_dict, delivery_points, use_sectors=True, verbose=True):
        """Merge compatible clusters, sector by sector or globally.

        Within each sector, too-small clusters are first merged with the first
        compatible normal cluster, then normal clusters are merged pairwise.
        Each cluster takes part in at most one merge per call. Merged clusters
        are named ``m_<id1>_<id2>`` and routing is recomputed only for them.

        Args:
            cluster_dict: Mapping ``cluster_id -> list of location ids``.
            delivery_points: DataFrame used to derive sectors when
                ``use_sectors`` is true.
            use_sectors: When false, all clusters form a single group.
            verbose: Print progress messages when true.

        Returns:
            dict: Merged clusters first, followed by the untouched ones.
        """
        if verbose:
            print("üîó Tentativo di merge cluster tra settori" if use_sectors else "üîó Merge globale")

        if use_sectors:
            sectors = self._divide_space_into_sectors(delivery_points)
        else:
            sectors = {'ALL': list(cluster_dict.keys())}

        merged_clusters = {}
        consumed = set()  # ids already used in a merge during this call
        clusters_for_recalc = {}

        def merge_pair(first, second):
            # Record the merged cluster and mark both inputs as consumed.
            merged_id = f"m_{first}_{second}"
            merged_clusters[merged_id] = cluster_dict[first] + cluster_dict[second]
            clusters_for_recalc[merged_id] = merged_clusters[merged_id]
            consumed.update([first, second])
            if verbose:
                print(f"  üîó Merge {first} + {second} ‚ûî {merged_id}")

        small_clusters = {k for k, v in cluster_dict.items() if len(v) < self.min_cluster_size}
        for cluster_list in sectors.values():
            # Sector ids may be stale; keep only ids still present and unused.
            avail = [c for c in cluster_list if c in cluster_dict and c not in consumed]
            sc = [c for c in avail if c in small_clusters]
            nc = [c for c in avail if c not in small_clusters]

            # 1. Attach each too-small cluster to a normal one.
            for s in sc:
                if s in consumed:
                    continue
                for n in nc:
                    if n in consumed:
                        continue
                    if self._can_merge_clusters(s, n):
                        merge_pair(s, n)
                        break

            # 2. Pairwise merges among the remaining normal clusters.
            avail_norm = [c for c in nc if c not in consumed]
            for i, c1 in enumerate(avail_norm):
                if c1 in consumed:
                    continue
                for c2 in avail_norm[i+1:]:
                    if c2 in consumed:
                        continue
                    if self._can_merge_clusters(c1, c2):
                        merge_pair(c1, c2)
                        break

        # Carry over everything that was not merged.
        for k, v in cluster_dict.items():
            if k not in consumed:
                merged_clusters[k] = v

        # Recompute routing only for the newly merged clusters.
        if clusters_for_recalc:
            if verbose:
                print(f"    üîÅ Ricalcolo routing per {len(clusters_for_recalc)} nuovi cluster merged")
            self._compute_batch_performances(clusters_for_recalc, verbose=False)
        return merged_clusters

    def run_adaptive_clustering(self, delivery_points: pd.DataFrame, initial_k=50, verbose=True):
        """Run the full loop: k-means, then split / accept / merge iterations.

        The loop stops when ``max_iterations`` is reached, when the time budget
        is exceeded, after ``NO_IMPROVEMENT_LIMIT`` consecutive iterations in
        which the number of pending clusters did not decrease, or when no
        pending cluster is left. Clusters still pending at the end are appended
        to the final clusters as well.

        NOTE: ``delivery_points`` is modified in place: a ``cluster`` column is
        added/overwritten on the caller's DataFrame.

        Args:
            delivery_points: DataFrame with ``lat``, ``lon`` and
                ``location_id`` columns.
            initial_k: Number of k-means clusters used as the starting point.
            verbose: Forwarded to the individual steps.

        Returns:
            tuple: ``(final_clusters, perf_df)`` where ``final_clusters`` maps
            sequential ids to lists of location ids and ``perf_df`` is the
            result of ``pc.calc_clusters_stats`` on them.
        """
        start_time = time.time()
        print(f"üéØ START: AdaptivePerformanceClustering su {len(delivery_points)} punti (k={initial_k})")
        scaler = StandardScaler()
        points_scaled = scaler.fit_transform(delivery_points[['lat', 'lon']])
        kmeans = KMeans(n_clusters=initial_k, init='k-means++', n_init=1, random_state=42)
        labels = kmeans.fit_predict(points_scaled)
        delivery_points['cluster'] = labels

        # Initial cluster dictionary: label -> location ids (empty labels skipped).
        cluster_dict = {}
        for c in range(initial_k):
            locations = delivery_points.loc[delivery_points['cluster'] == c, 'location_id'].tolist()
            if locations:
                cluster_dict[c] = locations

        self._compute_batch_performances(cluster_dict, verbose=verbose)
        no_improv = 0
        best_remaining = len(cluster_dict)
        for iteration in range(self.max_iterations):
            elapsed = (time.time() - start_time) / 60
            if elapsed > self.max_execution_time_min:
                print(f"‚è∞ STOP: superato tempo massimo ({self.max_execution_time_min} minuti)")
                break
            print(f"\nüîÑ Iterazione {iteration+1}/{self.max_iterations} ({round(elapsed,2)} min)")

            # 1. Split clusters above the upper target.
            cluster_dict = self._split_oversized_clusters(cluster_dict, verbose=verbose)
            # 2. Accept the clusters inside the target window.
            cluster_dict = self._save_good_clusters(cluster_dict, verbose=verbose)
            # 3. Sector-wise merge.
            cluster_dict = self._merge_clusters_by_sector(cluster_dict, delivery_points, use_sectors=True, verbose=verbose)
            # 4. Global merge, kept only if it reduced the cluster count.
            merged2 = self._merge_clusters_by_sector(cluster_dict, delivery_points, use_sectors=False, verbose=verbose)
            if len(merged2) < len(cluster_dict):
                cluster_dict = merged2

            # Improvement means fewer pending clusters than the best seen so far.
            remaining = len(cluster_dict)
            print(f"   ‚ÑπÔ∏è  Cluster ancora da processare: {remaining}")
            if remaining < best_remaining:
                best_remaining = remaining
                no_improv = 0
            else:
                no_improv += 1
            if no_improv >= self.NO_IMPROVEMENT_LIMIT:
                print(f"üü° STOP: {self.NO_IMPROVEMENT_LIMIT} iterazioni senza miglioramento")
                break
            if not cluster_dict:
                print("‚úÖ STOP: tutti i cluster sono buoni")
                break

            # Refresh the cluster column; points of accepted clusters map to NaN.
            mapping = {loc: cid for cid, locs in cluster_dict.items() for loc in locs}
            delivery_points['cluster'] = delivery_points['location_id'].map(mapping)

        print(f"\nüèÅ Concluso. Cluster finali: {len(self.final_clusters)}")
        sizes = [len(v) for v in self.final_clusters.values()]
        if sizes:
            print(f"   - min: {min(sizes)}  max: {max(sizes)}  media: {np.mean(sizes):.1f}")

        print(f"üìä Sono rimasti {len(cluster_dict)} cluster non accettati, primo cluster non accettato con ID = {len(self.final_clusters)+1}")

        # Also include the clusters that were never accepted.
        for locs in cluster_dict.values():
            self.final_clusters[len(self.final_clusters) + 1] = locs

        # Final performance over every output cluster.
        perf_df = pc.calc_clusters_stats(list(self.final_clusters.values()), time_limit=3, parallel=True, max_workers=self.n_cores, verbose=False)
        return self.final_clusters, perf_df

def run_adaptive_performance_clustering(delivery_points, initial_k=50, max_iterations=15, n_cores=None, max_execution_time_min=500):
    """Create an AdaptivePerformanceClustering instance and run it verbosely.

    Args:
        delivery_points: DataFrame handed to ``run_adaptive_clustering``.
        initial_k: Number of initial k-means clusters.
        max_iterations: Iteration cap for the refinement loop.
        n_cores: Worker count (``None`` lets the class choose).
        max_execution_time_min: Time budget in minutes.

    Returns:
        Whatever ``run_adaptive_clustering`` returns.
    """
    settings = {
        'max_iterations': max_iterations,
        'max_execution_time_min': max_execution_time_min,
        'n_cores': n_cores,
    }
    engine = AdaptivePerformanceClustering(**settings)
    return engine.run_adaptive_clustering(delivery_points=delivery_points, initial_k=initial_k, verbose=True)


import time
# Wall-clock timing of the whole adaptive clustering run.
start = time.time()

# Run on the delivery points exposed by performance_calc; the iteration cap is raised to 100 here.
final_clusters, performance_df = run_adaptive_performance_clustering(
    delivery_points=pc.delivery_points,
    initial_k=50,
    max_iterations=100)

end = time.time()
# Elapsed time is reported in minutes.
print(f"Tempo di esecuzione algoritmo: {(end - start)/60:.2f} min")

### Save

In [None]:
# Persist the run outputs. The target folders are created first so that a
# missing directory cannot discard the results of a long run.
os.makedirs('clustering_methods_performances', exist_ok=True)
performance_df.to_csv('clustering_methods_performances/k-means_euristics_5.csv')

os.makedirs('cluster_dicts', exist_ok=True)
with open('cluster_dicts/cluster_dict_k-means_euristics_5.pkl', 'wb') as f:
    pickle.dump(final_clusters, f)

# run AS

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from concurrent.futures import ThreadPoolExecutor, as_completed
import multiprocessing as mp
import time
import os
import pickle
import math
from itertools import combinations

import import_ipynb
import performance_calc as pc

class AdaptivePerformanceClustering:
    """Adaptive split/accept/merge clustering driven by routing performance (AS run).

    An initial k-means partition of the delivery points is refined in a loop:
    clusters whose largest per-weekday ``mean_minutes`` exceeds the upper
    target are split, clusters inside the target window are accepted as final,
    and the remaining ones are merged pairwise when the per-weekday sums stay
    below the upper target. Routing statistics are obtained from
    ``pc.calc_clusters_stats_AS``.
    """

    # Target window (minutes) for the largest per-weekday mean_minutes.
    MIN_TARGET_MINUTES = 350
    MAX_TARGET_MINUTES = 450
    # A cluster must contain more than this many points to be split.
    MAX_UNSPLIT_SIZE = 4
    # Consecutive iterations without improvement tolerated before stopping.
    NO_IMPROVEMENT_LIMIT = 10

    def __init__(self,
                 n_cores: int = None,
                 cache_dir: str = "./cluster_cache",
                 max_iterations: int = 15,
                 max_execution_time_min: int = 500):
        """Store the run parameters and create the cache directory.

        Args:
            n_cores: Worker count passed to the performance calculation;
                defaults to ``cpu_count() - 1`` (at least 1).
            cache_dir: Directory created on construction if it is missing.
            max_iterations: Upper bound on refinement iterations.
            max_execution_time_min: Time budget in minutes, checked at the
                start of every iteration.
        """
        self.n_cores = n_cores or max(1, mp.cpu_count() - 1)
        self.cache_dir = cache_dir
        self.max_iterations = max_iterations
        self.max_execution_time_min = max_execution_time_min
        self.final_clusters = {}
        self.cluster_performances = {}
        # Clusters with 1 or 2 points are considered "too small".
        self.min_cluster_size = 3
        os.makedirs(cache_dir, exist_ok=True)
        print(f"üöÄ AdaptivePerformanceClustering avviato con {self.n_cores} core e timeout {max_execution_time_min} min")

    def _cache_key(self, location_ids):
        """Return an order-independent hash of a collection of location ids."""
        return hash(tuple(sorted(location_ids)))

    def _compute_batch_performances(self, cluster_dict, verbose=True):
        """Compute routing performance for the given clusters in one batch.

        Results are written to ``self.cluster_performances`` keyed by cluster
        id. Clusters below ``self.min_cluster_size`` are not routed: they get
        a placeholder entry flagged ``too_small``.

        Args:
            cluster_dict: Mapping ``cluster_id -> list of location ids``.
            verbose: Print progress messages when true.
        """
        if verbose:
            print(f"üîé Calcolo performance in batch ({len(cluster_dict)} cluster)...")

        routable = {
            cid: locs for cid, locs in cluster_dict.items()
            if len(locs) >= self.min_cluster_size
        }
        if routable:
            performance_df = pc.calc_clusters_stats_AS(
                clusters=list(routable.values()),
                time_limit=3,
                parallel=True,
                max_workers=self.n_cores,
                verbose=False
            )
            # Rows are looked up by the positional label 'Cluster <i>' (1-based),
            # in the same order the clusters were submitted.
            for position, cluster_id in enumerate(routable, start=1):
                cluster_data = performance_df[performance_df['cluster'] == f'Cluster {position}']
                if cluster_data.empty:
                    # No statistics returned: mark the cluster as invalid.
                    self.cluster_performances[cluster_id] = {
                        'dataframe': None,
                        'max_mean_minutes': float('inf'),
                        'is_valid': False
                    }
                else:
                    # Largest mean_minutes over all rows of this cluster.
                    self.cluster_performances[cluster_id] = {
                        'dataframe': cluster_data,
                        'max_mean_minutes': cluster_data['mean_minutes'].max(),
                        'is_valid': True
                    }

        # Too-small clusters: valid placeholder so they never block a merge.
        for cluster_id, locs in cluster_dict.items():
            if len(locs) < self.min_cluster_size:
                self.cluster_performances[cluster_id] = {
                    'dataframe': None,
                    'max_mean_minutes': 0,
                    'is_valid': True,
                    'too_small': True
                }
        if verbose:
            print(f"    ‚úÖ Performance salvate in self.cluster_performances")
        return

    def _save_good_clusters(self, cluster_dict, verbose=True):
        """Move clusters inside the target window into ``self.final_clusters``.

        A cluster is accepted when it is not too small, its performance entry
        is valid, and its largest ``mean_minutes`` lies between
        ``MIN_TARGET_MINUTES`` and ``MAX_TARGET_MINUTES`` (both inclusive).

        Args:
            cluster_dict: Mapping ``cluster_id -> list of location ids``.
            verbose: Print one line per accepted cluster and a summary.

        Returns:
            dict: The clusters that were not accepted, in their input order.
        """
        saved_count = 0
        remaining_clusters = {}
        for cluster_id, location_ids in cluster_dict.items():
            perf = self.cluster_performances.get(cluster_id)
            accepted = (
                len(location_ids) >= self.min_cluster_size
                and bool(perf)
                and perf['is_valid']
                and self.MIN_TARGET_MINUTES <= perf['max_mean_minutes'] <= self.MAX_TARGET_MINUTES
            )
            if not accepted:
                # Keep it for the next round.
                remaining_clusters[cluster_id] = location_ids
                continue
            max_mean = perf['max_mean_minutes']
            # Final ids are sequential, starting at 1.
            self.final_clusters[len(self.final_clusters) + 1] = location_ids
            saved_count += 1
            if verbose:
                print(f"  ‚úÖ Accettato cluster {cluster_id}: max(mean_minutes)={max_mean:.1f} min, punti={len(location_ids)}")
        if verbose:
            print(f"  üèÅ Salvati {saved_count} cluster ideali ({self.MIN_TARGET_MINUTES}‚â§max‚â§{self.MAX_TARGET_MINUTES} min)")
        return remaining_clusters

    def _split_oversized_clusters(self, cluster_dict, verbose=True):
        """Split clusters whose largest ``mean_minutes`` exceeds the upper target.

        A cluster with more than ``MAX_UNSPLIT_SIZE`` points and a finite
        ``max_mean_minutes`` above ``MAX_TARGET_MINUTES`` is divided into
        ``ceil(max_mean_minutes / MAX_TARGET_MINUTES)`` consecutive parts
        (capped at the number of points) named ``s_<id>_<k>``. Part sizes
        differ by at most one, so no part is ever empty. Clusters without a
        performance entry, or with a non-finite ``max_mean_minutes`` (the
        value stored for invalid clusters), are passed through unchanged
        because no split count can be derived for them.

        Args:
            cluster_dict: Mapping ``cluster_id -> list of location ids``.
            verbose: Print one line per split and a recalculation notice.

        Returns:
            dict: The clusters after splitting; performance is recomputed for
            the newly created parts only.
        """
        new_clusters = {}
        clusters_to_recalc = {}
        for cluster_id, location_ids in cluster_dict.items():
            perf = self.cluster_performances.get(cluster_id)
            max_mean = perf['max_mean_minutes'] if perf else None
            size = len(location_ids)
            splittable = (
                max_mean is not None
                and math.isfinite(max_mean)
                and size > self.MAX_UNSPLIT_SIZE
                and max_mean > self.MAX_TARGET_MINUTES
            )
            if not splittable:
                new_clusters[cluster_id] = location_ids
                continue

            # Never ask for more parts than there are points.
            n_splits = min(size, int(math.ceil(max_mean / self.MAX_TARGET_MINUTES)))
            if verbose:
                print(f"  ‚úÇÔ∏è Cluster {cluster_id}: {size} punti, max(mean_minutes)={max_mean:.1f} split in {n_splits}")
            # Balanced partition: the first `extra` parts get one more point.
            base, extra = divmod(size, n_splits)
            start = 0
            for i in range(n_splits):
                end = start + base + (1 if i < extra else 0)
                chunk = location_ids[start:end]
                new_id = f"s_{cluster_id}_{i+1}"
                new_clusters[new_id] = chunk
                clusters_to_recalc[new_id] = chunk
                start = end

        # Recompute routing only for the freshly split clusters.
        if clusters_to_recalc:
            if verbose:
                print(f"    üîÑ Ricalcolo routing per {len(clusters_to_recalc)} nuovi cluster splittati")
            self._compute_batch_performances(clusters_to_recalc, verbose=False)
        return new_clusters

    def _can_merge_clusters(self, cluster_id1, cluster_id2):
        """Tell whether two clusters may be merged.

        A merge is always allowed when either cluster is flagged ``too_small``.
        Otherwise both entries must be valid, share at least one weekday, and
        the sum of their ``mean_minutes`` must stay strictly below
        ``MAX_TARGET_MINUTES`` on every shared weekday.

        Returns:
            bool: True when the merge is allowed.
        """
        perf1 = self.cluster_performances.get(cluster_id1)
        perf2 = self.cluster_performances.get(cluster_id2)
        if (perf1 and perf1.get('too_small')) or (perf2 and perf2.get('too_small')):
            return True
        if not (perf1 and perf2 and perf1['is_valid'] and perf2['is_valid']):
            return False
        df1, df2 = perf1['dataframe'], perf2['dataframe']
        if df1 is None or df2 is None:
            # Nothing to compare against.
            return False
        common_days = set(df1['weekday']) & set(df2['weekday'])
        if not common_days:
            return False
        for day in common_days:
            # First row for that weekday in each cluster's statistics.
            m1 = df1[df1['weekday'] == day]['mean_minutes'].iloc[0]
            m2 = df2[df2['weekday'] == day]['mean_minutes'].iloc[0]
            if m1 + m2 >= self.MAX_TARGET_MINUTES:
                return False
        return True

    def _divide_space_into_sectors(self, delivery_points):
        """Assign every cluster to one of four sectors (NE, NW, SE, SW).

        The sector is chosen by comparing the cluster centroid (mean lat/lon of
        its points) with the median latitude and longitude of all points.

        Args:
            delivery_points: DataFrame with ``lat``, ``lon`` and ``cluster``
                columns.

        Returns:
            dict: ``sector name -> list of cluster ids``.
        """
        lat_median = delivery_points['lat'].median()
        lon_median = delivery_points['lon'].median()

        sectors = {'NE': [], 'NW': [], 'SE': [], 'SW': []}

        for cluster_id in delivery_points['cluster'].unique():
            cluster_points = delivery_points[delivery_points['cluster'] == cluster_id]
            if len(cluster_points) == 0:
                continue

            center_lat = cluster_points['lat'].mean()
            center_lon = cluster_points['lon'].mean()

            # Quadrant relative to the medians.
            if center_lat >= lat_median and center_lon >= lon_median:
                sector = 'NE'
            elif center_lat >= lat_median and center_lon < lon_median:
                sector = 'NW'
            elif center_lat < lat_median and center_lon >= lon_median:
                sector = 'SE'
            else:
                sector = 'SW'

            sectors[sector].append(cluster_id)

        return sectors

    def _merge_clusters_by_sector(self, cluster_dict, delivery_points, use_sectors=True, verbose=True):
        """Merge compatible clusters, sector by sector or globally.

        Within each sector, too-small clusters are first merged with the first
        compatible normal cluster, then normal clusters are merged pairwise.
        Each cluster takes part in at most one merge per call. Merged clusters
        are named ``m_<id1>_<id2>`` and routing is recomputed only for them.

        Args:
            cluster_dict: Mapping ``cluster_id -> list of location ids``.
            delivery_points: DataFrame used to derive sectors when
                ``use_sectors`` is true.
            use_sectors: When false, all clusters form a single group.
            verbose: Print progress messages when true.

        Returns:
            dict: Merged clusters first, followed by the untouched ones.
        """
        if verbose:
            print("üîó Tentativo di merge cluster tra settori" if use_sectors else "üîó Merge globale")

        if use_sectors:
            sectors = self._divide_space_into_sectors(delivery_points)
        else:
            sectors = {'ALL': list(cluster_dict.keys())}

        merged_clusters = {}
        consumed = set()  # ids already used in a merge during this call
        clusters_for_recalc = {}

        def merge_pair(first, second):
            # Record the merged cluster and mark both inputs as consumed.
            merged_id = f"m_{first}_{second}"
            merged_clusters[merged_id] = cluster_dict[first] + cluster_dict[second]
            clusters_for_recalc[merged_id] = merged_clusters[merged_id]
            consumed.update([first, second])
            if verbose:
                print(f"  üîó Merge {first} + {second} ‚ûî {merged_id}")

        small_clusters = {k for k, v in cluster_dict.items() if len(v) < self.min_cluster_size}
        for cluster_list in sectors.values():
            # Sector ids may be stale; keep only ids still present and unused.
            avail = [c for c in cluster_list if c in cluster_dict and c not in consumed]
            sc = [c for c in avail if c in small_clusters]
            nc = [c for c in avail if c not in small_clusters]

            # 1. Attach each too-small cluster to a normal one.
            for s in sc:
                if s in consumed:
                    continue
                for n in nc:
                    if n in consumed:
                        continue
                    if self._can_merge_clusters(s, n):
                        merge_pair(s, n)
                        break

            # 2. Pairwise merges among the remaining normal clusters.
            avail_norm = [c for c in nc if c not in consumed]
            for i, c1 in enumerate(avail_norm):
                if c1 in consumed:
                    continue
                for c2 in avail_norm[i+1:]:
                    if c2 in consumed:
                        continue
                    if self._can_merge_clusters(c1, c2):
                        merge_pair(c1, c2)
                        break

        # Carry over everything that was not merged.
        for k, v in cluster_dict.items():
            if k not in consumed:
                merged_clusters[k] = v

        # Recompute routing only for the newly merged clusters.
        if clusters_for_recalc:
            if verbose:
                print(f"    üîÅ Ricalcolo routing per {len(clusters_for_recalc)} nuovi cluster merged")
            self._compute_batch_performances(clusters_for_recalc, verbose=False)
        return merged_clusters

    def run_adaptive_clustering(self, delivery_points: pd.DataFrame, initial_k=50, verbose=True):
        """Run the full loop: k-means, then split / accept / merge iterations.

        The loop stops when ``max_iterations`` is reached, when the time budget
        is exceeded, after ``NO_IMPROVEMENT_LIMIT`` consecutive iterations in
        which the number of pending clusters did not decrease, or when no
        pending cluster is left. Clusters still pending at the end are appended
        to the final clusters as well.

        NOTE: ``delivery_points`` is modified in place: a ``cluster`` column is
        added/overwritten on the caller's DataFrame.

        Args:
            delivery_points: DataFrame with ``lat``, ``lon`` and
                ``location_id`` columns.
            initial_k: Number of k-means clusters used as the starting point.
            verbose: Forwarded to the individual steps.

        Returns:
            tuple: ``(final_clusters, perf_df)`` where ``final_clusters`` maps
            sequential ids to lists of location ids and ``perf_df`` is the
            result of ``pc.calc_clusters_stats_AS`` on them.
        """
        start_time = time.time()
        print(f"üéØ START: AdaptivePerformanceClustering su {len(delivery_points)} punti (k={initial_k})")
        scaler = StandardScaler()
        points_scaled = scaler.fit_transform(delivery_points[['lat', 'lon']])
        kmeans = KMeans(n_clusters=initial_k, init='k-means++', n_init=1, random_state=42)
        labels = kmeans.fit_predict(points_scaled)
        delivery_points['cluster'] = labels

        # Initial cluster dictionary: label -> location ids (empty labels skipped).
        cluster_dict = {}
        for c in range(initial_k):
            locations = delivery_points.loc[delivery_points['cluster'] == c, 'location_id'].tolist()
            if locations:
                cluster_dict[c] = locations

        self._compute_batch_performances(cluster_dict, verbose=verbose)
        no_improv = 0
        best_remaining = len(cluster_dict)
        for iteration in range(self.max_iterations):
            elapsed = (time.time() - start_time) / 60
            if elapsed > self.max_execution_time_min:
                print(f"‚è∞ STOP: superato tempo massimo ({self.max_execution_time_min} minuti)")
                break
            print(f"\nüîÑ Iterazione {iteration+1}/{self.max_iterations} ({round(elapsed,2)} min)")

            # 1. Split clusters above the upper target.
            cluster_dict = self._split_oversized_clusters(cluster_dict, verbose=verbose)
            # 2. Accept the clusters inside the target window.
            cluster_dict = self._save_good_clusters(cluster_dict, verbose=verbose)
            # 3. Sector-wise merge.
            cluster_dict = self._merge_clusters_by_sector(cluster_dict, delivery_points, use_sectors=True, verbose=verbose)
            # 4. Global merge, kept only if it reduced the cluster count.
            merged2 = self._merge_clusters_by_sector(cluster_dict, delivery_points, use_sectors=False, verbose=verbose)
            if len(merged2) < len(cluster_dict):
                cluster_dict = merged2

            # Improvement means fewer pending clusters than the best seen so far.
            remaining = len(cluster_dict)
            print(f"   ‚ÑπÔ∏è  Cluster ancora da processare: {remaining}")
            if remaining < best_remaining:
                best_remaining = remaining
                no_improv = 0
            else:
                no_improv += 1
            if no_improv >= self.NO_IMPROVEMENT_LIMIT:
                print(f"üü° STOP: {self.NO_IMPROVEMENT_LIMIT} iterazioni senza miglioramento")
                break
            if not cluster_dict:
                print("‚úÖ STOP: tutti i cluster sono buoni")
                break

            # Refresh the cluster column; points of accepted clusters map to NaN.
            mapping = {loc: cid for cid, locs in cluster_dict.items() for loc in locs}
            delivery_points['cluster'] = delivery_points['location_id'].map(mapping)

        print(f"\nüèÅ Concluso. Cluster finali: {len(self.final_clusters)}")
        sizes = [len(v) for v in self.final_clusters.values()]
        if sizes:
            print(f"   - min: {min(sizes)}  max: {max(sizes)}  media: {np.mean(sizes):.1f}")

        print(f"üìä Sono rimasti {len(cluster_dict)} cluster non accettati, primo cluster non accettato con ID = {len(self.final_clusters)+1}")

        # Also include the clusters that were never accepted.
        for locs in cluster_dict.values():
            self.final_clusters[len(self.final_clusters) + 1] = locs

        # Final performance over every output cluster.
        perf_df = pc.calc_clusters_stats_AS(list(self.final_clusters.values()), time_limit=3, parallel=True, max_workers=self.n_cores, verbose=False)
        return self.final_clusters, perf_df

def run_adaptive_performance_clustering(delivery_points, initial_k=50, max_iterations=15, n_cores=None, max_execution_time_min=500):
    """Build an AdaptivePerformanceClustering and run it once.

    Args:
        delivery_points: Passed unchanged to ``run_adaptive_clustering``.
        initial_k: Passed unchanged to ``run_adaptive_clustering``.
        max_iterations: Forwarded to the clusterer constructor.
        n_cores: Forwarded to the clusterer constructor.
        max_execution_time_min: Forwarded to the clusterer constructor.

    Returns:
        The value returned by ``run_adaptive_clustering`` (called with
        ``verbose=True``).
    """
    settings = {
        'max_iterations': max_iterations,
        'max_execution_time_min': max_execution_time_min,
        'n_cores': n_cores,
    }
    engine = AdaptivePerformanceClustering(**settings)
    outcome = engine.run_adaptive_clustering(
        delivery_points=delivery_points,
        initial_k=initial_k,
        verbose=True,
    )
    return outcome


import time
# Wall-clock timestamp taken right before the run, used for the timing report below.
start = time.time()

# Run the adaptive clustering on `pc.delivery_points_AS`, starting from 50
# k-means clusters and allowing up to 100 refinement iterations.
# The two results are kept as notebook globals and saved by the next cell.
final_clusters, performance_df = run_adaptive_performance_clustering(
    delivery_points=pc.delivery_points_AS,
    initial_k=50,
    max_iterations=100)


end = time.time()
# Report the elapsed time in minutes.
print(f"Tempo di esecuzione algoritmo: {(end - start)/60:.2f} min")

In [None]:
# Persist the performance table of the AS run as CSV.
# NOTE(review): the target directories are assumed to exist already — confirm.
performance_df.to_csv('clustering_methods_performances/k-means_euristics_AS_5.csv')

# Pickle the final cluster dictionary of the AS run.
with open('cluster_dicts/cluster_dict_k-means_euristics_AS_5.pkl', 'wb') as f:
    pickle.dump(final_clusters, f)

# run ON

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from concurrent.futures import ThreadPoolExecutor, as_completed
import multiprocessing as mp
import time
import os
import pickle
import math
from itertools import combinations

import import_ipynb
import performance_calc as pc

class AdaptivePerformanceClustering:
    def __init__(self, 
                 n_cores: int = None,
                 cache_dir: str = "./cluster_cache",
                 max_iterations: int = 15,
                 max_execution_time_min: int = 500):
        # Inizializzazione parametri e strutture dati
        self.n_cores = n_cores or max(1, mp.cpu_count() - 1)
        self.cache_dir = cache_dir
        self.max_iterations = max_iterations
        self.max_execution_time_min = max_execution_time_min
        self.final_clusters = {}
        self.cluster_performances = {}
        self.min_cluster_size = 3 # i troppo piccoli sono quelli con 1 o 2 punti
        os.makedirs(cache_dir, exist_ok=True)
        print(f"üöÄ AdaptivePerformanceClustering avviato con {self.n_cores} core e timeout {max_execution_time_min} min")
    
    def _cache_key(self, location_ids):
        return hash(tuple(sorted(location_ids)))
    
    def _compute_batch_performances(self, cluster_dict, verbose=True):
        """
        Calcola le performance di routing per tutti i cluster usando pi√π thread,
        e salva tutto in self.cluster_performances.
        """
        if verbose:
            print(f"üîé Calcolo performance in batch ({len(cluster_dict)} cluster)...")
        valid_clusters = { k:v for k,v in cluster_dict.items() if len(v) >= self.min_cluster_size }
        small_clusters = { k:v for k,v in cluster_dict.items() if len(v) < self.min_cluster_size }
        clusters_list = list(valid_clusters.values())
        cluster_ids = list(valid_clusters.keys())
        # Chiamata batch parallela
        if valid_clusters:
            performance_df = pc.calc_clusters_stats_ON(
                clusters=clusters_list,
                time_limit=3,
                parallel=True,
                max_workers=self.n_cores,
                verbose=False
            )
            for i, cluster_id in enumerate(cluster_ids):
                name = f'Cluster {i+1}'
                cluster_data = performance_df[performance_df['cluster'] == name]
                if not cluster_data.empty:
                    # Calcolo il massimo dei mean_minutes su tutti i giorni della settimana
                    max_mean = cluster_data['mean_minutes'].max()
                    self.cluster_performances[cluster_id] = {
                        'dataframe': cluster_data,
                        'max_mean_minutes': max_mean,
                        'is_valid': True
                    }
                else:
                    self.cluster_performances[cluster_id] = {
                        'dataframe': None,
                        'max_mean_minutes': float('inf'),
                        'is_valid': False
                    }
        # Gestione cluster troppo piccoli
        for cluster_id in small_clusters.keys():
            self.cluster_performances[cluster_id] = {
                'dataframe': None,
                'max_mean_minutes': 0,
                'is_valid': True,   # true cos√¨ non blocca i merge 
                'too_small': True
            }
        if verbose:
            print(f"    ‚úÖ Performance salvate in self.cluster_performances")
        return
    
    def _save_good_clusters(self, cluster_dict, verbose=True):
        """
        Trova i cluster PERFETTI: solo se il valore massimo dei mean_minutes su tutti i giorni della settimana
        √® COMPRESO tra 350 e 450 (inclusi). Aggiunge questi cluster a quelli finali e li rimuove dai temporanei.
        """
        saved_count = 0
        remaining_clusters = {}
        for cluster_id, location_ids in cluster_dict.items():
            perf = self.cluster_performances.get(cluster_id)
            # Considera solo cluster che NON sono troppo piccoli
            if len(location_ids) < self.min_cluster_size:
                remaining_clusters[cluster_id] = location_ids
                continue
            if perf and perf['is_valid']:
                max_mean = perf['max_mean_minutes']
                # criterio: il pi√π grande dei mean_minutes per quel cluster e settimana
                if 350 <= max_mean <= 450:
                    # Salva come cluster perfetto
                    final_id = len(self.final_clusters)+1
                    self.final_clusters[final_id] = location_ids
                    saved_count += 1
                    if verbose:
                        print(f"  ‚úÖ Accettato cluster {cluster_id}: max(mean_minutes)={max_mean:.1f} min, punti={len(location_ids)}")
                    continue
            # Se non perfetto, tienilo per round successivo
            remaining_clusters[cluster_id] = location_ids
        if verbose:
            print(f"  üèÅ Salvati {saved_count} cluster ideali ({350}‚â§max‚â§450 min)")
        return remaining_clusters

    def _split_oversized_clusters(self, cluster_dict, verbose=True):
        """
        Divide ogni cluster che ha max dei mean_minutes > 450 in N cluster,
        dove N = ceil(max_mean_minutes / 450). (solo se pi√π di 4 punti)
        """
        new_clusters = {}
        clusters_to_recalc = {}
        for cluster_id, location_ids in cluster_dict.items():
            perf = self.cluster_performances.get(cluster_id)
            max_mean = perf['max_mean_minutes']
            size = len(location_ids)
            # Split solo se supera la soglia e se pi√π di 4 punti
            if size > 4 and max_mean > 450:
                n_splits = int(np.ceil(max_mean / 450))
                if verbose:
                    print(f"  ‚úÇÔ∏è Cluster {cluster_id}: {size} punti, max(mean_minutes)={max_mean:.1f} split in {n_splits}")
                chunk_size = int(np.ceil(size / n_splits))
                for i in range(n_splits):
                    start = i * chunk_size
                    end = min(start + chunk_size, size)
                    chunk = location_ids[start:end]
                    new_id = f"s_{cluster_id}_{i+1}"
                    new_clusters[new_id] = chunk
                    clusters_to_recalc[new_id] = chunk
            else:
                new_clusters[cluster_id] = location_ids
        # Ricalcola solo per i nuovi cluster splittati
        if clusters_to_recalc:
            if verbose:
                print(f"    üîÑ Ricalcolo routing per {len(clusters_to_recalc)} nuovi cluster splittati")
            self._compute_batch_performances(clusters_to_recalc, verbose=False)
        return new_clusters

    def _can_merge_clusters(self, cluster_id1, cluster_id2):
        """
        Unisce solo se la somma dei mean_minutes per OGNI giorno della settimana √® <450.
        Cluster troppo piccoli: merge sempre permesso.
        """
        perf1 = self.cluster_performances.get(cluster_id1)
        perf2 = self.cluster_performances.get(cluster_id2)
        if (perf1 and perf1.get('too_small')) or (perf2 and perf2.get('too_small')):
            return True
        if not (perf1 and perf2 and perf1['is_valid'] and perf2['is_valid']):
            return False
        df1, df2 = perf1['dataframe'], perf2['dataframe']
        days1, days2 = set(df1['weekday']), set(df2['weekday'])
        common_days = days1 & days2
        if not common_days:
            return False
        for day in common_days:
            m1 = df1[df1['weekday'] == day]['mean_minutes'].iloc[0]
            m2 = df2[df2['weekday'] == day]['mean_minutes'].iloc[0]
            if m1 + m2 >= 450:
                return False
        return True
    
    def _divide_space_into_sectors(self, delivery_points):
        """
        Divide lo spazio geografico in 4 settori (NE, NW, SE, SW) usando le mediane
        di latitudine e longitudine. Questo aiuta a mergeare cluster geograficamente vicini.
        """
        lat_median = delivery_points['lat'].median()
        lon_median = delivery_points['lon'].median()
        
        sectors = {'NE': [], 'NW': [], 'SE': [], 'SW': []}
        
        # Per ogni cluster trova il centroide e assegnalo al settore corrispondente
        for cluster_id in delivery_points['cluster'].unique():
            cluster_points = delivery_points[delivery_points['cluster'] == cluster_id]
            if len(cluster_points) == 0:
                continue
                
            center_lat = cluster_points['lat'].mean()
            center_lon = cluster_points['lon'].mean()
            
            # Determina il settore basato su mediane
            if center_lat >= lat_median and center_lon >= lon_median:
                sector = 'NE'
            elif center_lat >= lat_median and center_lon < lon_median:
                sector = 'NW' 
            elif center_lat < lat_median and center_lon >= lon_median:
                sector = 'SE'
            else:
                sector = 'SW'
                
            sectors[sector].append(cluster_id)
        
        return sectors


    def _merge_clusters_by_sector(self, cluster_dict, delivery_points, use_sectors=True, verbose=True):
        """
        Tenta merge di cluster piccoli e poi merge compatibili sempre secondo criterio < 450 min/giorno.
        Crea i nuovi id 'm_x_y'. Ricalcola il routing solo per i merge eseguiti.
        """
        if verbose:
            print("üîó Tentativo di merge cluster tra settori" if use_sectors else "üîó Merge globale")

        if use_sectors:
            sectors = self._divide_space_into_sectors(delivery_points)
        else:
            sectors = {'ALL': list(cluster_dict.keys())}
        merged_clusters, merged_pairs = {}, set()
        clusters_for_recalc = {}

        small_clusters = set([k for k, v in cluster_dict.items() if len(v) < self.min_cluster_size])
        for sector, cluster_list in sectors.items():
            avail = [c for c in cluster_list if c in cluster_dict and c not in merged_pairs]
            sc = [c for c in avail if c in small_clusters]
            nc = [c for c in avail if c not in small_clusters]
            # 1. Merge cluster piccoli con altri
            for s in sc:
                if s in merged_pairs: continue
                for n in nc:
                    if n in merged_pairs: continue
                    merged_id = f"m_{s}_{n}"
                    can_merge = self._can_merge_clusters(s, n)
                    if can_merge:
                        merged_clusters[merged_id] = cluster_dict[s] + cluster_dict[n]
                        clusters_for_recalc[merged_id] = merged_clusters[merged_id]
                        merged_pairs.update([s, n])
                        if verbose: print(f"  üîó Merge {s} + {n} ‚ûî {merged_id}")
                        break
            # 2. Merge tra cluster normali
            avail_norm = [c for c in nc if c not in merged_pairs]
            for i, c1 in enumerate(avail_norm):
                if c1 in merged_pairs: continue
                for c2 in avail_norm[i+1:]:
                    if c2 in merged_pairs: continue
                    merged_id = f"m_{c1}_{c2}"
                    if self._can_merge_clusters(c1, c2):
                        merged_clusters[merged_id] = cluster_dict[c1] + cluster_dict[c2]
                        clusters_for_recalc[merged_id] = merged_clusters[merged_id]
                        merged_pairs.update([c1,c2])
                        if verbose: print(f"  üîó Merge {c1} + {c2} ‚ûî {merged_id}")
                        break
        # Tieni quelli che non sono stati uniti
        for k, v in cluster_dict.items():
            if k not in merged_pairs:
                merged_clusters[k] = v
        # Ricalcola solo per i nuovi mergiati
        if clusters_for_recalc:
            if verbose: print(f"    üîÅ Ricalcolo routing per {len(clusters_for_recalc)} nuovi cluster merged")
            self._compute_batch_performances(clusters_for_recalc, verbose=False)
        return merged_clusters

    def run_adaptive_clustering(self, delivery_points: pd.DataFrame, initial_k=50, verbose=True):
        """
        Run the whole procedure: initial k-means, then an iterative loop that
        splits clusters with max(mean_minutes) > 450, saves the good ones and
        merges the rest (first per sector, then globally).

        The loop stops when one of the following happens:
          * self.max_iterations iterations have been executed;
          * the elapsed time, checked at the start of an iteration, exceeds
            self.max_execution_time_min minutes;
          * 10 consecutive iterations did not reach a new minimum in the
            number of clusters still to process;
          * no cluster is left to process.

        Side effects: a 'cluster' column is written into delivery_points in
        place, and self.final_clusters / self.cluster_performances are filled.
        Clusters that were never accepted are appended to self.final_clusters
        at the end, after the accepted ones.

        Args:
            delivery_points: DataFrame with 'lat', 'lon' and 'location_id' columns.
            initial_k: Number of clusters for the initial k-means.
            verbose: Forwarded to the helper steps.

        Returns:
            Tuple (self.final_clusters, perf_df), where perf_df is the result
            of pc.calc_clusters_stats_ON on the final clusters.
        """
        start_time = time.time()
        print(f"üéØ START: AdaptivePerformanceClustering su {len(delivery_points)} punti (k={initial_k})")
        # Initial partition: k-means on standardized lat/lon (fixed seed).
        scaler = StandardScaler()
        points_scaled = scaler.fit_transform(delivery_points[['lat', 'lon']])
        kmeans = KMeans(n_clusters=initial_k, init='k-means++', n_init=1, random_state=42)
        labels = kmeans.fit_predict(points_scaled)
        # NOTE: this writes into the caller's DataFrame.
        delivery_points['cluster'] = labels

        # Initial cluster dictionary: label -> list of location_id (empty labels skipped)
        cluster_dict = {}
        for c in range(initial_k):
            locations = delivery_points.loc[delivery_points['cluster'] == c, 'location_id'].tolist()
            if locations:
                cluster_dict[c] = locations

        self._compute_batch_performances(cluster_dict, verbose=verbose)
        # Consecutive iterations without a new minimum of remaining clusters.
        no_improv = 0
        best_remaining = len(cluster_dict)
        for iteration in range(self.max_iterations):
            # Time budget is checked only here, at the start of each iteration.
            elapsed = (time.time() - start_time) / 60
            if elapsed > self.max_execution_time_min:
                print(f"‚è∞ STOP: superato tempo massimo ({self.max_execution_time_min} minuti)")
                break
            print(f"\nüîÑ Iterazione {iteration+1}/{self.max_iterations} ({round(elapsed,2)} min)")

            # 1. Split clusters above the threshold
            cluster_dict = self._split_oversized_clusters(cluster_dict, verbose=verbose)
            # 2. Save the "good" (final) clusters
            cluster_dict = self._save_good_clusters(cluster_dict, verbose=verbose)
            # 3. Sector-based merge
            cluster_dict = self._merge_clusters_by_sector(cluster_dict, delivery_points, use_sectors=True, verbose=verbose)
            # 4. Global merge; its result is adopted only if it reduced the cluster count
            merged2 = self._merge_clusters_by_sector(cluster_dict, delivery_points, use_sectors=False, verbose=verbose)
            if len(merged2) < len(cluster_dict):
                cluster_dict = merged2

            # Check whether the solution improved (fewer clusters left than ever before)
            remaining = len(cluster_dict)
            print(f"   ‚ÑπÔ∏è  Cluster ancora da processare: {remaining}")
            if remaining < best_remaining:
                best_remaining = remaining
                no_improv = 0
            else:
                no_improv += 1
            if no_improv >= 10:
                print("üü° STOP: 10 iterazioni senza miglioramento")
                break
            if not cluster_dict:
                print("‚úÖ STOP: tutti i cluster sono buoni")
                break

            # Refresh the 'cluster' column of the DataFrame. This happens only
            # here, at the end of an iteration; locations that belong to no
            # remaining cluster are not in the mapping.
            mapping = {loc: cid for cid, locs in cluster_dict.items() for loc in locs}
            delivery_points['cluster'] = delivery_points['location_id'].map(mapping)

        print(f"\nüèÅ Concluso. Cluster finali: {len(self.final_clusters)}")
        sizes = [len(v) for v in self.final_clusters.values()]
        if sizes:
            print(f"   - min: {min(sizes)}  max: {max(sizes)}  media: {np.mean(sizes):.1f}")

        print(f"üìä Sono rimasti {len(cluster_dict)} cluster non accettati, primo cluster non accettato con ID = {len(self.final_clusters)+1}")

        # Also include in the output the clusters that were never accepted
        for cid, locs in cluster_dict.items():
            final_id = len(self.final_clusters) + 1
            self.final_clusters[final_id] = locs

        # Compute the final performance table over all returned clusters
        perf_df = pc.calc_clusters_stats_ON(list(self.final_clusters.values()), time_limit=3, parallel=True, max_workers=self.n_cores, verbose=False)
        return self.final_clusters, perf_df

def run_adaptive_performance_clustering(delivery_points, initial_k=50, max_iterations=15, n_cores=None, max_execution_time_min=500):
    """Build an AdaptivePerformanceClustering and run it once.

    Args:
        delivery_points: Passed unchanged to ``run_adaptive_clustering``.
        initial_k: Passed unchanged to ``run_adaptive_clustering``.
        max_iterations: Forwarded to the clusterer constructor.
        n_cores: Forwarded to the clusterer constructor.
        max_execution_time_min: Forwarded to the clusterer constructor.

    Returns:
        The ``(final_clusters, perf_df)`` pair returned by
        ``run_adaptive_clustering`` (called with ``verbose=True``).
    """
    settings = {
        'max_iterations': max_iterations,
        'max_execution_time_min': max_execution_time_min,
        'n_cores': n_cores,
    }
    engine = AdaptivePerformanceClustering(**settings)
    outcome = engine.run_adaptive_clustering(
        delivery_points=delivery_points,
        initial_k=initial_k,
        verbose=True,
    )
    return outcome


import time
# Wall-clock timestamp taken right before the run, used for the timing report below.
start = time.time()

# Run the adaptive clustering on `pc.delivery_points_ON`, starting from 50
# k-means clusters and allowing up to 100 refinement iterations.
# The two results are kept as notebook globals and saved by the next cell.
final_clusters, performance_df = run_adaptive_performance_clustering(
    delivery_points=pc.delivery_points_ON,
    initial_k=50,
    max_iterations=100)


end = time.time()
# Report the elapsed time in minutes.
print(f"Tempo di esecuzione algoritmo: {(end - start)/60:.2f} min")

In [None]:
# Persist the performance table of the ON run as CSV.
# NOTE(review): the target directories are assumed to exist already — confirm.
performance_df.to_csv('clustering_methods_performances/k-means_euristics_ON_5.csv')

# Pickle the final cluster dictionary of the ON run.
with open('cluster_dicts/cluster_dict_k-means_euristics_ON_5.pkl', 'wb') as f:
    pickle.dump(final_clusters, f)