In [3]:
import import_ipynb
import performance_calc as pc
import pandas as pd
import pickle


## Per-cluster delivery statistics, including the number of days that exceeded the parcel capacity

In [None]:
def analyze_cluster_deliveries(
    delivery_data: pd.DataFrame,
    cluster_dict_path: str,
    threshold: float = 2100,
) -> pd.DataFrame:
    """
    Compute per-cluster daily delivery statistics.

    Each location is assigned to a cluster, quantities are summed per
    (cluster, delivery_date), and those daily totals are summarised per
    cluster. Only dates that appear in ``delivery_data`` for a cluster
    contribute to the statistics.

    Args:
        delivery_data: DataFrame with columns location_id, delivery_date,
            quantity. It is NOT modified; the cluster assignment is done on
            a copy.
        cluster_dict_path: Path to the pickle file containing the cluster
            dictionary, a mapping of cluster_id -> iterable of location_ids.
        threshold: Daily totals strictly greater than this value count as a
            day over the threshold (default: 2100).

    Returns:
        DataFrame with one row per cluster and the columns cluster_id,
        mean_packages (rounded to 2 decimals), max_packages, min_packages
        and days_over_threshold.
    """
    # NOTE(security): unpickling can execute arbitrary code; only load
    # cluster dictionaries from trusted sources.
    with open(cluster_dict_path, 'rb') as f:
        cluster_dict = pickle.load(f)

    # Invert cluster_id -> [location_id, ...] into location_id -> cluster_id.
    # If a location is listed in several clusters, the last one wins.
    location_to_cluster = {
        location_id: cluster_id
        for cluster_id, location_ids in cluster_dict.items()
        for location_id in location_ids
    }

    # ``assign`` returns a new frame, so the caller's DataFrame does not gain
    # (or have overwritten) a 'cluster_id' column as a side effect.
    # Rows whose location_id belongs to no cluster map to NaN and are dropped.
    clustered = delivery_data.assign(
        cluster_id=delivery_data['location_id'].map(location_to_cluster)
    ).dropna(subset=['cluster_id'])

    # Total packages per cluster per day.
    daily_totals = clustered.groupby(
        ['cluster_id', 'delivery_date']
    )['quantity'].sum().reset_index()

    # Flag the days over the threshold; summing the flags per cluster yields
    # the day count directly (0 for clusters that never exceed it).
    daily_totals['over_threshold'] = daily_totals['quantity'] > threshold

    cluster_stats = daily_totals.groupby('cluster_id').agg(
        mean_packages=('quantity', 'mean'),
        max_packages=('quantity', 'max'),
        min_packages=('quantity', 'min'),
        days_over_threshold=('over_threshold', 'sum'),
    ).reset_index()

    cluster_stats['days_over_threshold'] = (
        cluster_stats['days_over_threshold'].astype(int)
    )

    # Round the mean to 2 decimal places for readability.
    cluster_stats['mean_packages'] = cluster_stats['mean_packages'].round(2)

    return cluster_stats


### Run

In [None]:

# Pickled cluster dictionary to analyse (path is relative to the working directory).
cluster_dict_path = 'cluster_dicts/rectangular_base_case.pkl'
# Uses the default threshold of analyze_cluster_deliveries for the days-over-threshold count.
result = analyze_cluster_deliveries(pc.delivery_data, cluster_dict_path)
# Print the per-cluster statistics table.
print(result)


   cluster_id  mean_packages  max_packages  min_packages  days_over_threshold
0      (0, 3)         233.17           483            42                    0
1      (0, 4)          82.53           231             7                    0
2      (0, 5)          19.65            42             7                    0
3      (1, 2)         130.29           420             7                    0
4      (1, 3)         137.18           385            21                    0
5      (1, 4)         211.74           602             7                    0
6      (1, 5)         283.08           749             7                    0
7      (1, 6)          32.08            91             7                    0
8      (2, 2)          64.58           161            14                    0
9      (2, 3)         239.11           805            14                    0
10     (2, 4)         605.62          1141           112                    0
11     (2, 5)         602.63          1239            91        