# Versione 5 - Con Gruppi Pre-Aggregati

Adatta l'algoritmo euristico V5 per lavorare con gruppi pre-aggregati da `clusters_output_punti_simili_ON.pkl`.

**Criteri V5:**
- Split: max(mean_minutes) > 450 e almeno 5 GRUPPI, split in ceil(max/450) parti
- Cluster buoni: max(mean_minutes) tra 350 e 450 incluso
- Merge: somma mean_minutes < 450 per ogni giorno
- Cluster troppo piccoli: quelli con meno di 3 GRUPPI (min_cluster_size=3)

**Logica gruppi:**
- I gruppi contano come singoli punti nel K-means (usando centroidi)
- Per la stima dei tempi: ogni punto nel gruppo = 10 min di unloading
- Per il routing: si espandono i gruppi in punti reali

## Import e setup

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from concurrent.futures import ThreadPoolExecutor, as_completed
import multiprocessing as mp
import time
import os
import pickle
import math
from itertools import combinations

import import_ipynb
import performance_calc as pc

## Caricamento gruppi pre-aggregati

In [None]:
# Load the dictionary of pre-aggregated groups.
# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only open pickle files produced by a trusted pipeline.
with open('clusters_output_punti_simili_ON.pkl', 'rb') as f:
    aggregated_groups = pickle.load(f)

print(f"Gruppi caricati: {len(aggregated_groups)}")
# Show one sample entry. Skipped when the dictionary is empty, where taking the
# first item would raise; next(iter(...)) avoids copying every item into a list.
if aggregated_groups:
    print(f"Esempio: {next(iter(aggregated_groups.items()))}")

## Funzioni di supporto per gruppi

In [None]:
def prepare_aggregated_data(aggregated_groups, delivery_points):
    """
    Build the structures needed to cluster on group centroids.

    Args:
        aggregated_groups: dict whose values are ``(centroid_id, location_ids)``
            pairs. The dict keys are not used.
        delivery_points: DataFrame with a ``location_id`` column.

    Returns:
        A tuple ``(aggregated_points_df, group_mapping, group_sizes)`` where:
        - aggregated_points_df: copy of the rows of ``delivery_points`` whose
          ``location_id`` is one of the centroid ids;
        - group_mapping: centroid_id -> location ids belonging to the group;
        - group_sizes: centroid_id -> number of location ids in the group.
    """
    group_mapping = {}
    group_sizes = {}
    centroid_ids = []

    # Each value carries its own centroid id, so only the values are iterated.
    for centroid_id, location_ids in aggregated_groups.values():
        group_mapping[centroid_id] = location_ids
        group_sizes[centroid_id] = len(location_ids)
        centroid_ids.append(centroid_id)

    # Keep only the rows that describe a group centroid.
    aggregated_points_df = delivery_points[
        delivery_points['location_id'].isin(centroid_ids)
    ].copy()

    total_points = sum(group_sizes.values())
    # Guard the average: with no groups the division would raise ZeroDivisionError.
    mean_points = total_points / len(group_sizes) if group_sizes else 0.0

    print(f"üì¶ Gruppi totali: {len(group_mapping)}")
    print(f"üìç Punti reali totali: {total_points}")
    print(f"üìä Media punti per gruppo: {mean_points:.2f}")

    return aggregated_points_df, group_mapping, group_sizes


def expand_cluster_with_groups(cluster_centroid_ids, group_mapping):
    """
    Flatten a cluster of centroid ids into the location ids of its groups.

    The groups are concatenated in the order of ``cluster_centroid_ids``.
    A centroid id missing from ``group_mapping`` raises KeyError.
    """
    return [
        location_id
        for centroid_id in cluster_centroid_ids
        for location_id in group_mapping[centroid_id]
    ]

## Classe AdaptivePerformanceClustering con Gruppi

In [None]:
class AdaptivePerformanceClusteringAggregated:
    def __init__(self, 
                 aggregated_groups: dict,
                 delivery_points: pd.DataFrame,
                 n_cores: int = None,
                 cache_dir: str = "./cluster_cache",
                 max_iterations: int = 15,
                 max_execution_time_min: int = 500):
        """
        Set up the clusterer state from the pre-aggregated groups.

        Args:
            aggregated_groups: groups passed on to ``prepare_aggregated_data``.
            delivery_points: DataFrame passed on to ``prepare_aggregated_data``.
            n_cores: number of workers; when falsy, all CPUs but one are used
                (at least one).
            cache_dir: directory created at construction time if missing.
            max_iterations: upper bound on the refinement iterations.
            max_execution_time_min: time budget of a run, in minutes.
        """
        # Execution settings; a falsy n_cores falls back to (CPU count - 1), min 1.
        if n_cores:
            self.n_cores = n_cores
        else:
            self.n_cores = max(1, mp.cpu_count() - 1)
        self.cache_dir = cache_dir
        self.max_iterations = max_iterations
        self.max_execution_time_min = max_execution_time_min

        # Centroid-level view of the data plus the centroid -> members lookups.
        prepared = prepare_aggregated_data(aggregated_groups, delivery_points)
        self.aggregated_points_df, self.group_mapping, self.group_sizes = prepared

        # Accepted clusters (lists of centroid ids) and per-cluster performance records.
        self.final_clusters = {}
        self.cluster_performances = {}
        # A cluster needs at least this many GROUPS to be routed and accepted.
        self.min_cluster_size = 3

        os.makedirs(cache_dir, exist_ok=True)
        print(f"üöÄ AdaptivePerformanceClusteringAggregated avviato con {self.n_cores} core")
    
    def _cache_key(self, centroid_ids):
        return hash(tuple(sorted(centroid_ids)))
    
    def _compute_batch_performances(self, cluster_dict, verbose=True):
        """
        Calcola performance per i cluster.
        cluster_dict contiene CENTROIDI, ma il routing viene fatto sui PUNTI REALI espansi.
        """
        if verbose:
            print(f"üîé Calcolo performance in batch ({len(cluster_dict)} cluster)...")
        
        # Separa cluster validi da quelli troppo piccoli
        valid_clusters = {k: v for k, v in cluster_dict.items() if len(v) >= self.min_cluster_size}
        small_clusters = {k: v for k, v in cluster_dict.items() if len(v) < self.min_cluster_size}
        
        # ESPANDI i centroidi in punti reali per il calcolo delle performance
        if valid_clusters:
            clusters_expanded = [
                expand_cluster_with_groups(centroid_ids, self.group_mapping)
                for centroid_ids in valid_clusters.values()
            ]
            cluster_ids = list(valid_clusters.keys())
            
            # Chiamata al calcolo performance sui PUNTI REALI
            performance_df = pc.calc_clusters_stats_ON(
                clusters=clusters_expanded,
                time_limit=3,
                parallel=True,
                max_workers=self.n_cores,
                verbose=False
            )
            
            # Salva performance
            for i, cluster_id in enumerate(cluster_ids):
                name = f'Cluster {i+1}'
                cluster_data = performance_df[performance_df['cluster'] == name]
                
                if not cluster_data.empty:
                    max_mean = cluster_data['mean_minutes'].max()
                    self.cluster_performances[cluster_id] = {
                        'dataframe': cluster_data,
                        'max_mean_minutes': max_mean,
                        'is_valid': True
                    }
                else:
                    self.cluster_performances[cluster_id] = {
                        'dataframe': None,
                        'max_mean_minutes': float('inf'),
                        'is_valid': False
                    }
        
        # Gestione cluster troppo piccoli (sempre mergeabili)
        for cluster_id in small_clusters.keys():
            self.cluster_performances[cluster_id] = {
                'dataframe': None,
                'max_mean_minutes': 0,
                'is_valid': True,
                'too_small': True
            }
        
        if verbose:
            print(f" ‚úÖ Performance salvate in self.cluster_performances")
        return
    
    def _save_good_clusters(self, cluster_dict, verbose=True):
        """
        CRITERIO V5: Salva cluster con 350 <= max(mean_minutes) <= 450.
        Cluster con meno di min_cluster_size GRUPPI NON sono salvabili.
        """
        saved_count = 0
        remaining_clusters = {}
        
        for cluster_id, centroid_ids in cluster_dict.items():
            perf = self.cluster_performances.get(cluster_id)
            
            # I cluster troppo piccoli NON possono essere salvati
            if len(centroid_ids) < self.min_cluster_size:
                remaining_clusters[cluster_id] = centroid_ids
                continue
            
            if perf and perf['is_valid']:
                max_mean = perf['max_mean_minutes']
                
                # CRITERIO V5: 350 <= max(mean_minutes) <= 450
                if 350 <= max_mean <= 450:
                    final_id = len(self.final_clusters) + 1
                    self.final_clusters[final_id] = centroid_ids
                    saved_count += 1
                    
                    if verbose:
                        total_points = sum(self.group_sizes[cid] for cid in centroid_ids)
                        print(f" ‚úÖ Accettato cluster {cluster_id}: max(mean_minutes)={max_mean:.1f} min, "
                              f"{len(centroid_ids)} gruppi ({total_points} punti)")
                    continue
            
            remaining_clusters[cluster_id] = centroid_ids
        
        if verbose:
            print(f" üèÅ Salvati {saved_count} cluster ideali (350‚â§max‚â§450 min)")
        
        return remaining_clusters
    
    def _split_oversized_clusters(self, cluster_dict, verbose=True):
        """
        CRITERIO V5: Divide cluster con max(mean_minutes) > 450 in N parti,
        dove N = ceil(max_mean_minutes / 450).
        Solo se il cluster ha pi√π di 4 GRUPPI.
        """
        new_clusters = {}
        clusters_to_recalc = {}
        
        for cluster_id, centroid_ids in cluster_dict.items():
            perf = self.cluster_performances.get(cluster_id)
            max_mean = perf['max_mean_minutes'] if perf else float('inf')
            size = len(centroid_ids)  # Numero di GRUPPI
            
            # CRITERIO V5: split solo se > 450 min E pi√π di 4 gruppi
            if size > 4 and max_mean > 450:
                n_splits = int(np.ceil(max_mean / 450))
                
                if verbose:
                    total_points = sum(self.group_sizes[cid] for cid in centroid_ids)
                    print(f" ‚úÇÔ∏è Cluster {cluster_id}: {size} gruppi ({total_points} punti), "
                          f"max(mean_minutes)={max_mean:.1f} ‚Üí split in {n_splits}")
                
                chunk_size = int(np.ceil(size / n_splits))
                for i in range(n_splits):
                    start = i * chunk_size
                    end = min(start + chunk_size, size)
                    chunk = centroid_ids[start:end]
                    new_id = f"s_{cluster_id}_{i+1}"
                    new_clusters[new_id] = chunk
                    clusters_to_recalc[new_id] = chunk
            else:
                new_clusters[cluster_id] = centroid_ids
        
        # Ricalcola performance solo per i cluster splittati
        if clusters_to_recalc:
            if verbose:
                print(f" üîÑ Ricalcolo routing per {len(clusters_to_recalc)} nuovi cluster splittati")
            self._compute_batch_performances(clusters_to_recalc, verbose=False)
        
        return new_clusters
    
    def _can_merge_clusters(self, cluster_id1, cluster_id2):
        """
        CRITERIO V5: Merge solo se somma mean_minutes < 450 per OGNI giorno.
        Cluster troppo piccoli: sempre mergeabili.
        """
        perf1 = self.cluster_performances.get(cluster_id1)
        perf2 = self.cluster_performances.get(cluster_id2)
        
        # Cluster troppo piccoli: sempre mergeabili
        if (perf1 and perf1.get('too_small')) or (perf2 and perf2.get('too_small')):
            return True
        
        if not (perf1 and perf2 and perf1['is_valid'] and perf2['is_valid']):
            return False
        
        df1 = perf1['dataframe']
        df2 = perf2['dataframe']
        
        days1 = set(df1['weekday'])
        days2 = set(df2['weekday'])
        common_days = days1 & days2
        
        if not common_days:
            return False
        
        # CRITERIO V5: somma < 450 per OGNI giorno
        for day in common_days:
            m1 = df1[df1['weekday'] == day]['mean_minutes'].iloc[0]
            m2 = df2[df2['weekday'] == day]['mean_minutes'].iloc[0]
            if m1 + m2 >= 450:
                return False
        
        return True
    
    def _divide_space_into_sectors(self, delivery_points):
        """
        Divide lo spazio in 4 settori (NE, NW, SE, SW) usando mediane.
        """
        lat_median = delivery_points['lat'].median()
        lon_median = delivery_points['lon'].median()
        
        sectors = {'NE': [], 'NW': [], 'SE': [], 'SW': []}
        
        for cluster_id in delivery_points['cluster'].unique():
            cluster_points = delivery_points[delivery_points['cluster'] == cluster_id]
            if len(cluster_points) == 0:
                continue
            
            center_lat = cluster_points['lat'].mean()
            center_lon = cluster_points['lon'].mean()
            
            if center_lat >= lat_median and center_lon >= lon_median:
                sector = 'NE'
            elif center_lat >= lat_median and center_lon < lon_median:
                sector = 'NW'
            elif center_lat < lat_median and center_lon >= lon_median:
                sector = 'SE'
            else:
                sector = 'SW'
            
            sectors[sector].append(cluster_id)
        
        return sectors
    
    def _merge_clusters_by_sector(self, cluster_dict, delivery_points, use_sectors=True, verbose=True):
        """
        CRITERIO V5: Merge cluster secondo la regola somma < 450 per ogni giorno.
        Prima priorit√† ai cluster piccoli, poi agli altri.
        """
        if verbose:
            print("üîó Tentativo di merge cluster tra settori" if use_sectors else "üîó Merge globale")
        
        if use_sectors:
            sectors = self._divide_space_into_sectors(delivery_points)
        else:
            sectors = {'ALL': list(cluster_dict.keys())}
        
        merged_clusters = {}
        merged_pairs = set()
        clusters_for_recalc = {}
        
        small_clusters = set([k for k, v in cluster_dict.items() if len(v) < self.min_cluster_size])
        
        for sector, cluster_list in sectors.items():
            avail = [c for c in cluster_list if c in cluster_dict and c not in merged_pairs]
            sc = [c for c in avail if c in small_clusters]
            nc = [c for c in avail if c not in small_clusters]
            
            # 1. Merge cluster piccoli con altri
            for s in sc:
                if s in merged_pairs:
                    continue
                for n in nc:
                    if n in merged_pairs:
                        continue
                    
                    if self._can_merge_clusters(s, n):
                        merged_id = f"m_{s}_{n}"
                        merged_clusters[merged_id] = cluster_dict[s] + cluster_dict[n]
                        clusters_for_recalc[merged_id] = merged_clusters[merged_id]
                        merged_pairs.update([s, n])
                        
                        if verbose:
                            print(f" üîó Merge {s} + {n} ‚Üí {merged_id}")
                        break
            
            # 2. Merge tra cluster normali
            avail_norm = [c for c in nc if c not in merged_pairs]
            for i, c1 in enumerate(avail_norm):
                if c1 in merged_pairs:
                    continue
                for c2 in avail_norm[i+1:]:
                    if c2 in merged_pairs:
                        continue
                    
                    if self._can_merge_clusters(c1, c2):
                        merged_id = f"m_{c1}_{c2}"
                        merged_clusters[merged_id] = cluster_dict[c1] + cluster_dict[c2]
                        clusters_for_recalc[merged_id] = merged_clusters[merged_id]
                        merged_pairs.update([c1, c2])
                        
                        if verbose:
                            print(f" üîó Merge {c1} + {c2} ‚Üí {merged_id}")
                        break
        
        # Tieni quelli non mergiati
        for k, v in cluster_dict.items():
            if k not in merged_pairs:
                merged_clusters[k] = v
        
        # Ricalcola performance solo per i cluster mergiati
        if clusters_for_recalc:
            if verbose:
                print(f" üîÅ Ricalcolo routing per {len(clusters_for_recalc)} nuovi cluster merged")
            self._compute_batch_performances(clusters_for_recalc, verbose=False)
        
        return merged_clusters
    
    def run_adaptive_clustering(self, initial_k=50, verbose=True):
        """
        Run the full pipeline: K-means seeding, then iterative split / save / merge.

        The loop stops at ``self.max_iterations``, when the elapsed time exceeds
        ``self.max_execution_time_min``, after 10 iterations without a reduction
        in the number of pending clusters, or when no pending cluster is left.
        Clusters still pending at the end are appended to the accepted ones.

        Args:
            initial_k: number of K-means clusters used as starting point; capped
                at the number of available groups.
            verbose: forwarded to the split / save / merge steps.

        Returns:
            A tuple ``(final_clusters_expanded, perf_df)``: a dict
            final cluster id -> list of real location ids, and the result of
            ``pc.calc_clusters_stats_ON`` computed on those expanded clusters.

        Raises:
            ValueError: if there is no group to cluster.
        """
        start_time = time.time()

        n_groups = len(self.aggregated_points_df)
        if n_groups == 0:
            raise ValueError("Nessun gruppo disponibile per il clustering")

        # Start from a clean state so that a second call on the same instance
        # does not inherit clusters and performance records of a previous run.
        self.final_clusters = {}
        self.cluster_performances = {}

        print(f"üéØ START: AdaptivePerformanceClusteringAggregated")
        print(f"   ‚Ä¢ Gruppi: {n_groups}")
        print(f"   ‚Ä¢ Punti reali totali: {sum(self.group_sizes.values())}")
        print(f"   ‚Ä¢ K iniziale: {initial_k}")

        # K-means cannot build more clusters than there are samples.
        n_clusters = min(initial_k, n_groups)
        if n_clusters < initial_k:
            print(f"   - K ridotto a {n_clusters}: sono disponibili solo {n_groups} gruppi")

        # K-means on the group CENTROIDS (standardised lat/lon).
        scaler = StandardScaler()
        points_scaled = scaler.fit_transform(self.aggregated_points_df[['lat', 'lon']])
        kmeans = KMeans(n_clusters=n_clusters, init='k-means++', n_init=1, random_state=42)
        labels = kmeans.fit_predict(points_scaled)
        self.aggregated_points_df['cluster'] = labels

        # Build the cluster dictionary: label -> list of centroid ids.
        cluster_dict = {}
        for c in range(n_clusters):
            locations = self.aggregated_points_df.loc[
                self.aggregated_points_df['cluster'] == c, 'location_id'
            ].tolist()
            if locations:
                cluster_dict[c] = locations

        print(f"‚úÖ K-means completato: {len(cluster_dict)} cluster iniziali")

        # Initial performance of every cluster.
        self._compute_batch_performances(cluster_dict, verbose=verbose)

        no_improv = 0
        best_remaining = len(cluster_dict)

        for iteration in range(self.max_iterations):
            elapsed = (time.time() - start_time) / 60

            if elapsed > self.max_execution_time_min:
                print(f"‚è∞ STOP: superato tempo massimo ({self.max_execution_time_min} minuti)")
                break

            print(f"\nüîÑ Iterazione {iteration+1}/{self.max_iterations} ({round(elapsed, 2)} min)")

            # 1. Split the clusters above the threshold.
            cluster_dict = self._split_oversized_clusters(cluster_dict, verbose=verbose)

            # 2. Save the good clusters.
            cluster_dict = self._save_good_clusters(cluster_dict, verbose=verbose)

            # The sector assignment reads the 'cluster' column, so it must reflect
            # the ids produced by the split/save steps above; otherwise freshly
            # split clusters are invisible to the sector merge. Centroids of
            # already accepted clusters map to NaN and are ignored there.
            mapping = {loc: cid for cid, locs in cluster_dict.items() for loc in locs}
            self.aggregated_points_df['cluster'] = self.aggregated_points_df['location_id'].map(mapping)

            # 3. Sector-wise merge.
            cluster_dict = self._merge_clusters_by_sector(
                cluster_dict, self.aggregated_points_df, use_sectors=True, verbose=verbose
            )

            # 4. Global merge (the 'cluster' column is not used in this mode).
            merged2 = self._merge_clusters_by_sector(
                cluster_dict, self.aggregated_points_df, use_sectors=False, verbose=verbose
            )
            if len(merged2) < len(cluster_dict):
                cluster_dict = merged2

            # Track progress: fewer pending clusters counts as an improvement.
            remaining = len(cluster_dict)
            print(f" ‚ÑπÔ∏è Cluster ancora da processare: {remaining}")

            if remaining < best_remaining:
                best_remaining = remaining
                no_improv = 0
            else:
                no_improv += 1
                if no_improv >= 10:
                    print("üü° STOP: 10 iterazioni senza miglioramento")
                    break

            if not cluster_dict:
                print("‚úÖ STOP: tutti i cluster sono buoni")
                break

        # Wrap-up.
        print(f"\nüèÅ Concluso. Cluster finali: {len(self.final_clusters)}")

        # Append the clusters that were never accepted; empty ones carry no
        # points and are dropped rather than emitted as final clusters.
        print(f"üìä Sono rimasti {len(cluster_dict)} cluster non accettati")
        for locs in cluster_dict.values():
            if not locs:
                continue
            final_id = len(self.final_clusters) + 1
            self.final_clusters[final_id] = locs

        # Expand every final cluster from centroids to real points.
        print(f"üìä Espansione {len(self.final_clusters)} cluster da centroidi a punti reali...")
        final_clusters_expanded = {
            cluster_id: expand_cluster_with_groups(centroid_ids, self.group_mapping)
            for cluster_id, centroid_ids in self.final_clusters.items()
        }

        sizes = [len(v) for v in final_clusters_expanded.values()]
        if sizes:
            print(f"üìè Distribuzione cluster (punti reali):")
            print(f" - min: {min(sizes)} max: {max(sizes)} media: {np.mean(sizes):.1f}")
            print(f" - totale punti: {sum(sizes)}")

        # Final performance computed on the real points.
        print(f"üìä Calcolo performance finali sui punti reali...")
        perf_df = pc.calc_clusters_stats_ON(
            list(final_clusters_expanded.values()),
            time_limit=3,
            parallel=True,
            max_workers=self.n_cores,
            verbose=False
        )

        return final_clusters_expanded, perf_df


def run_adaptive_performance_clustering_aggregated(
    aggregated_groups, 
    delivery_points, 
    initial_k=50, 
    max_iterations=15, 
    n_cores=None, 
    max_execution_time_min=500
):
    """
    Convenience wrapper: build the clusterer and run it in verbose mode.

    Returns whatever ``run_adaptive_clustering`` returns, i.e. the expanded
    final clusters and their performance table.
    """
    init_options = {
        'aggregated_groups': aggregated_groups,
        'delivery_points': delivery_points,
        'max_iterations': max_iterations,
        'max_execution_time_min': max_execution_time_min,
        'n_cores': n_cores,
    }
    return AdaptivePerformanceClusteringAggregated(**init_options).run_adaptive_clustering(
        initial_k=initial_k,
        verbose=True
    )

## Esecuzione

In [None]:
# Run the adaptive clustering on the pre-aggregated groups:
# K-means seeded with 50 clusters, up to 100 refinement iterations, 8 workers.
final_clusters, performance_df = run_adaptive_performance_clustering_aggregated(
    aggregated_groups,
    pc.delivery_points_ON,
    n_cores=8,
    max_iterations=100,
    initial_k=50,
)

## Salvataggio risultati

In [None]:
# Save the results: performance table as CSV, cluster dictionary as pickle.
perf_csv_path = 'clustering_methods_performances/k-means_euristics_aggregated_ON_v5.csv'
clusters_pkl_path = 'cluster_dicts/cluster_dict_k-means_euristics_aggregated_ON_v5.pkl'

# Create the output folders first: a missing directory would otherwise make the
# write fail and lose the outcome of a long-running computation.
for output_path in (perf_csv_path, clusters_pkl_path):
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

performance_df.to_csv(perf_csv_path)

with open(clusters_pkl_path, 'wb') as f:
    pickle.dump(final_clusters, f)

print("‚úÖ Risultati salvati con successo!")