In [1]:
import torch
import torch_geometric
import uproot  # For loading ROOT files
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score  # For purity evaluation, purity_score can be custom-defined

from torch_geometric.data import DataLoader 
from train import CCV1

from collections import defaultdict

In [2]:
# Open the ROOT file
testpath = "/vols/cms/mm1221/Data/2pi/test/"  # Replace with your ROOT file path
data_test = CCV1(testpath, max_events=12000, inp = 'test')

test_loader = DataLoader(data_test, batch_size=1, shuffle=False, follow_batch=['x_lc'])

### Loading data


100%|█████████████████████████████████████████████| 1/1 [01:35<00:00, 95.96s/it]


In [3]:
print(data_test)

CCV1(3530)


In [4]:
# Initialize model and load state dictionary
from train import Net
model = Net(hidden_dim=128, k_value=24, contrastive_dim=8)
checkpoint= torch.load('/vols/cms/mm1221/hgcal/CLpi/Hyper/results/Layer6Complex/results_lr0.005_bs32_hd128_k24_temp0.1_cd8/best_model.pt', map_location=torch.device('cpu'))
model.load_state_dict(checkpoint)  # Directly use the loaded checkpoint
model.eval()  # Set model to evaluation mode

Net(
  (lc_encode): Sequential(
    (0): Linear(in_features=8, out_features=128, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=128, out_features=128, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=128, out_features=128, bias=True)
  )
  (conv1): DynamicEdgeConv(nn=Sequential(
    (0): Linear(in_features=256, out_features=128, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=128, out_features=128, bias=True)
    (3): ELU(alpha=1.0)
  ), k=24)
  (norm1): LayerNorm(128, affine=True, mode=graph)
  (conv2): DynamicEdgeConv(nn=Sequential(
    (0): Linear(in_features=256, out_features=128, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=128, out_features=128, bias=True)
    (3): ELU(alpha=1.0)
  ), k=24)
  (norm2): LayerNorm(128, affine=True, mode=graph)
  (conv3): DynamicEdgeConv(nn=Sequential(
    (0): Linear(in_features=256, out_features=128, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=128, out_features=128, b

In [None]:
import awkward as ak

# Ground truth data
sim_vertices = data_test.stsCP_vertices_indexes
sim_x = data_test.stsCP_vertices_x
sim_y = data_test.stsCP_vertices_y
sim_z = data_test.stsCP_vertices_z
sim_energy = data_test.stsCP_vertices_energy
sim_centre_x = data_test.stsCP_barycenter_x
sim_centre_y = data_test.stsCP_barycenter_y
sim_centre_z = data_test.stsCP_barycenter_z


# TICL data
v_path = '/vols/cms/mm1221/Data/2pi/test/raw/test.root'
v_file = uproot.open(v_path)
track = v_file['tracksters']
track_ind = track['vertices_indexes'].array()
track_en = track['vertices_energy'].array()
print(track_ind)

# Statistics of TICL data
num_tracksters_per_event = ak.num(track_ind, axis=1)
print(f"Number of tracksters per event: {num_tracksters_per_event}")
print(f"Total number of events: {len(track_ind)}")

# alternative definitions of ground truth data
CP = v_file['simtrackstersCP']
CP_ind = CP['vertices_indexes'].array()
CP_energy = CP['vertices_energy'].array()

[[[14, 16, 18, 23, 24, 28, 29, 30, 32, 33, ... 173, 177, 182, 183], [165, 175, 179]]]
Number of tracksters per event: [5, 11, 11, 14, 17, 5, 11, 14, 5, 15, 10, ... 14, 11, 16, 3, 13, 0, 22, 17, 8, 13, 4]
Total number of events: 4000


In [58]:
print(len(CP_ind[130]))

2


In [59]:
skim_mask =[]
for e in CP_ind:
    if len(e) == 2:
        skim_mask.append(True)
    else:
        skim_mask.append(False)

In [61]:
print(len(skim_mask))

4000


In [62]:
CP_ind = CP_ind[skim_mask]
CP_energy = CP_energy[skim_mask]

track_ind = track_ind[skim_mask]
track_en = track_en[skim_mask]

In [65]:
# Model data - convert into array objects provided by the awkward array package - demonstration that model_ind works
i=0
all_predictions = []
all_model_energy = []


for event_index, data in enumerate(test_loader):
    print(data)
    predictions_tuple = model(data.x_lc, data.x_lc_batch) 
    predictions = predictions_tuple[0]  
    predictions = predictions.detach().cpu().numpy()  
    n_clusters = len(sim_vertices[event_index])  # Number of simulated clusters in this event
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit(predictions)
    cluster_labels = kmeans.labels_
    [1000100111]
    predicted_clusters = defaultdict(list)
    for idx, label in enumerate(cluster_labels):
        predicted_clusters[label].append(idx)
    
    # Convert the reconstructed_clusters dictionary values into a list of lists of lists
    predicted_clusters_list = [predicted_clusters[i] for i in range(n_clusters)]
    all_predictions.append(predicted_clusters_list)
    

    # Convert the list of lists directly to an Awkward Array
    model_ind = ak.from_iter(all_predictions)
    
    i+=1

    
print(f"Model indices:\n {model_ind}")

    
    

DataBatch(x=[308, 8], edge_index=[2, 0], y=[308], x_lc=[308, 8], x_lc_batch=[308], x_lc_ptr=[2], x_pe=[308, 2], x_ne=[308, 2], x_counts=[1], batch=[308], ptr=[2])
DataBatch(x=[225, 8], edge_index=[2, 0], y=[225], x_lc=[225, 8], x_lc_batch=[225], x_lc_ptr=[2], x_pe=[225, 2], x_ne=[225, 2], x_counts=[1], batch=[225], ptr=[2])
DataBatch(x=[564, 8], edge_index=[2, 0], y=[564], x_lc=[564, 8], x_lc_batch=[564], x_lc_ptr=[2], x_pe=[564, 2], x_ne=[564, 2], x_counts=[1], batch=[564], ptr=[2])
DataBatch(x=[248, 8], edge_index=[2, 0], y=[248], x_lc=[248, 8], x_lc_batch=[248], x_lc_ptr=[2], x_pe=[248, 2], x_ne=[248, 2], x_counts=[1], batch=[248], ptr=[2])
DataBatch(x=[454, 8], edge_index=[2, 0], y=[454], x_lc=[454, 8], x_lc_batch=[454], x_lc_ptr=[2], x_pe=[454, 2], x_ne=[454, 2], x_counts=[1], batch=[454], ptr=[2])
DataBatch(x=[404, 8], edge_index=[2, 0], y=[404], x_lc=[404, 8], x_lc_batch=[404], x_lc_ptr=[2], x_pe=[404, 2], x_ne=[404, 2], x_counts=[1], batch=[404], ptr=[2])
DataBatch(x=[440, 8], 

DataBatch(x=[160, 8], edge_index=[2, 0], y=[160], x_lc=[160, 8], x_lc_batch=[160], x_lc_ptr=[2], x_pe=[160, 2], x_ne=[160, 2], x_counts=[1], batch=[160], ptr=[2])
DataBatch(x=[516, 8], edge_index=[2, 0], y=[516], x_lc=[516, 8], x_lc_batch=[516], x_lc_ptr=[2], x_pe=[516, 2], x_ne=[516, 2], x_counts=[1], batch=[516], ptr=[2])
DataBatch(x=[686, 8], edge_index=[2, 0], y=[686], x_lc=[686, 8], x_lc_batch=[686], x_lc_ptr=[2], x_pe=[686, 2], x_ne=[686, 2], x_counts=[1], batch=[686], ptr=[2])
DataBatch(x=[146, 8], edge_index=[2, 0], y=[146], x_lc=[146, 8], x_lc_batch=[146], x_lc_ptr=[2], x_pe=[146, 2], x_ne=[146, 2], x_counts=[1], batch=[146], ptr=[2])
DataBatch(x=[232, 8], edge_index=[2, 0], y=[232], x_lc=[232, 8], x_lc_batch=[232], x_lc_ptr=[2], x_pe=[232, 2], x_ne=[232, 2], x_counts=[1], batch=[232], ptr=[2])
DataBatch(x=[473, 8], edge_index=[2, 0], y=[473], x_lc=[473, 8], x_lc_batch=[473], x_lc_ptr=[2], x_pe=[473, 2], x_ne=[473, 2], x_counts=[1], batch=[473], ptr=[2])
DataBatch(x=[654, 8], 

DataBatch(x=[462, 8], edge_index=[2, 0], y=[462], x_lc=[462, 8], x_lc_batch=[462], x_lc_ptr=[2], x_pe=[462, 2], x_ne=[462, 2], x_counts=[1], batch=[462], ptr=[2])
DataBatch(x=[287, 8], edge_index=[2, 0], y=[287], x_lc=[287, 8], x_lc_batch=[287], x_lc_ptr=[2], x_pe=[287, 2], x_ne=[287, 2], x_counts=[1], batch=[287], ptr=[2])
DataBatch(x=[742, 8], edge_index=[2, 0], y=[742], x_lc=[742, 8], x_lc_batch=[742], x_lc_ptr=[2], x_pe=[742, 2], x_ne=[742, 2], x_counts=[1], batch=[742], ptr=[2])
DataBatch(x=[318, 8], edge_index=[2, 0], y=[318], x_lc=[318, 8], x_lc_batch=[318], x_lc_ptr=[2], x_pe=[318, 2], x_ne=[318, 2], x_counts=[1], batch=[318], ptr=[2])
DataBatch(x=[298, 8], edge_index=[2, 0], y=[298], x_lc=[298, 8], x_lc_batch=[298], x_lc_ptr=[2], x_pe=[298, 2], x_ne=[298, 2], x_counts=[1], batch=[298], ptr=[2])


InvalidParameterError: The 'n_clusters' parameter of KMeans must be an int in the range [1, inf). Got 0 instead.

In [66]:
print(len(all_model_energy))
print(len(all_predictions))

0
107


In [None]:
# Model data - convert into array objects provided by the awkward array package - demonstration that model_ind and model_energy works
# used really inefficient way of searching to create model_energy - definitely room for improvement


all_predictions = []
all_model_energy = []

for event_index, data in enumerate(test_loader):
    predictions_tuple = model(data.x_lc, data.x_lc_batch) 
    predictions = predictions_tuple[0]  
    predictions = predictions.detach().cpu().numpy()  
    n_clusters = len(sim_vertices[event_index])  # Number of simulated clusters in this event
    kmeans = KMeans(n_clusters=2, n_init=10, random_state=0).fit(predictions)
    cluster_labels = kmeans.labels_
    predicted_clusters = defaultdict(list)
    for idx, label in enumerate(cluster_labels):
        predicted_clusters[label].append(idx)
    
    # Convert the reconstructed_clusters dictionary values into a list of lists of lists
    predicted_clusters_list = [predicted_clusters[i] for i in range(n_clusters)]
    all_predictions.append(predicted_clusters_list)
    
    cluster_energy_list = []
    for cluster in predicted_clusters_list:
        # For a given cluster list, find corresponding energies
        energies_for_cluster = []
        for idx in cluster:
            energy_found = False

            # Search through the CaloParticles to find `idx` in CP_ind[event_index]
            for calo_particle_indices, calo_particle_energies in zip(CP_ind[event_index], CP_energy[event_index]):
                if idx in calo_particle_indices:
                    energy_idx = np.where(calo_particle_indices == idx)[0][0]
                    energies_for_cluster.append(calo_particle_energies[energy_idx])
                    energy_found = True
                    break  # Stop searching once a match is found

            if not energy_found:
                energies_for_cluster.append(0.0)  # Placeholder if no match is found
                print("The Layer cluster from the data was not found in simtrackstersCP's vertices_indexes")
                
        cluster_energy_list.append(energies_for_cluster)


    all_model_energy.append(cluster_energy_list)

    # Convert the list of lists directly to an Awkward Array
    model_ind = ak.from_iter(all_predictions)
    model_energy = ak.from_iter(all_model_energy)

    
print(f"Model indices:\n {model_ind}")
print(f"Corresponding energies:\n {model_energy}")

The Layer cluster from the data was not found in simtrackstersCP's vertices_indexes
The Layer cluster from the data was not found in simtrackstersCP's vertices_indexes
The Layer cluster from the data was not found in simtrackstersCP's vertices_indexes
The Layer cluster from the data was not found in simtrackstersCP's vertices_indexes
The Layer cluster from the data was not found in simtrackstersCP's vertices_indexes
The Layer cluster from the data was not found in simtrackstersCP's vertices_indexes
The Layer cluster from the data was not found in simtrackstersCP's vertices_indexes
The Layer cluster from the data was not found in simtrackstersCP's vertices_indexes
The Layer cluster from the data was not found in simtrackstersCP's vertices_indexes
The Layer cluster from the data was not found in simtrackstersCP's vertices_indexes
The Layer cluster from the data was not found in simtrackstersCP's vertices_indexes
The Layer cluster from the data was not found in simtrackstersCP's vertices_

In [8]:
#------ defining functions for sim_to_reco and reco_to_sim score - definitely worth defining another function here to avoid
#------ repetition

from collections import defaultdict
import numpy as np

def sim_to_reco(event_index, sim_vertices, sim_energy, track_vertices, track_energy):
    simulated_clusters_vert = defaultdict(list)
    simulated_clusters_energy = defaultdict(list)
    
    for i, vertices in enumerate(sim_vertices[event_index]):
        simulated_clusters_vert[i] = list(vertices)
        simulated_clusters_energy[i] = list(sim_energy[event_index][i])
        
    reconstructed_clusters_vert = defaultdict(list)
    reconstructed_clusters_energy = defaultdict(list)
    
    for j, vertices in enumerate(track_vertices[event_index]):
        reconstructed_clusters_vert[j] = list(vertices)
        reconstructed_clusters_energy[j] = list(track_energy[event_index][j])

    # Determine the size of the matrix
    max_calo_id = max(simulated_clusters_vert.keys())
    max_trackster_id = max(reconstructed_clusters_vert.keys())
    
    # Initialize the scores matrix
    scores_matrix = np.zeros((max_calo_id + 1, max_trackster_id + 1))

    # Calculate the scores for each CaloParticle-Trackster pair
    for calo_id, calo_layer_indexes in simulated_clusters_vert.items():
        calo_layer_energies = simulated_clusters_energy[calo_id]

        for trackster_id, trackster_layer_indexes in reconstructed_clusters_vert.items():
            trackster_layer_energies = reconstructed_clusters_energy[trackster_id]

            # Initialize variables for the numerator and denominator
            numerator = 0.0
            denominator = 0.0

            # Step 1: Calculate the numerator
            for k, layer_index in enumerate(calo_layer_indexes):
                if layer_index in trackster_layer_indexes:
                    # Find the index of this layer in the Trackster
                    trackster_index = trackster_layer_indexes.index(layer_index)

                    # Calculate the fractions
                    fr_SC_MC_k = calo_layer_energies[k] / sum(calo_layer_energies)
                    fr_TST_reco_k = trackster_layer_energies[trackster_index] / sum(trackster_layer_energies)

                    # Add to the numerator
                    numerator += ((fr_TST_reco_k - fr_SC_MC_k) ** 2) * (calo_layer_energies[k] ** 2)

            # Step 2: Calculate the denominator
            for h, layer_index in enumerate(calo_layer_indexes):
                fr_SC_MC_h = calo_layer_energies[h] / sum(calo_layer_energies)
                denominator += (fr_SC_MC_h * calo_layer_energies[h]) ** 2

            # Step 3: Compute the final score
            if denominator > 0:  # To avoid division by zero
                score_3D = numerator / denominator
            else:
                score_3D = 0.0

            # Store the score in the matrix
            scores_matrix[calo_id][trackster_id] = score_3D

    return scores_matrix

print(sim_to_reco(0,CP_ind,CP_energy,track_ind,track_en))

def reco_to_sim(event_index, sim_vertices, sim_energy, track_vertices, track_energy):
    simulated_clusters_vert = defaultdict(list)
    simulated_clusters_energy = defaultdict(list)
    
    for i, vertices in enumerate(sim_vertices[event_index]):
        simulated_clusters_vert[i] = list(vertices)
        simulated_clusters_energy[i] = list(sim_energy[event_index][i])
        
    reconstructed_clusters_vert = defaultdict(list)
    reconstructed_clusters_energy = defaultdict(list)
    
    for j, vertices in enumerate(track_vertices[event_index]):
        reconstructed_clusters_vert[j] = list(vertices)
        reconstructed_clusters_energy[j] = list(track_energy[event_index][j])

    # Determine the size of the matrix
    max_calo_id = max(simulated_clusters_vert.keys())
    max_trackster_id = max(reconstructed_clusters_vert.keys())
    
    # Initialize the scores matrix
    scores_matrix = np.zeros((max_calo_id + 1, max_trackster_id + 1))

    # Calculate the scores for each Trackster-CaloParticle pair
    for trackster_id, trackster_layer_indexes in reconstructed_clusters_vert.items():
        trackster_layer_energies = reconstructed_clusters_energy[trackster_id]

        for calo_id, calo_layer_indexes in simulated_clusters_vert.items():
            calo_layer_energies = simulated_clusters_energy[calo_id]

            # Initialize variables for the numerator and denominator
            numerator = 0.0
            denominator = 0.0

            # Step 1: Calculate the numerator
            for k, layer_index in enumerate(trackster_layer_indexes):
                if layer_index in calo_layer_indexes:
                    # Find the index of this layer in the CaloParticle
                    calo_index = calo_layer_indexes.index(layer_index)

                    # Calculate the fractions
                    fr_TST_reco_k = trackster_layer_energies[k] / sum(trackster_layer_energies)
                    fr_SC_MC_k = calo_layer_energies[calo_index] / sum(calo_layer_energies)

                    # Add to the numerator
                    numerator += ((fr_TST_reco_k - fr_SC_MC_k) ** 2) * (trackster_layer_energies[k] ** 2)

            # Step 2: Calculate the denominator
            for h, layer_index in enumerate(trackster_layer_indexes):
                fr_TST_reco_h = trackster_layer_energies[h] / sum(trackster_layer_energies)
                denominator += (fr_TST_reco_h * trackster_layer_energies[h]) ** 2

            # Step 3: Compute the final score
            if denominator > 0:  # To avoid division by zero
                score_3D = numerator / denominator
            else:
                score_3D = 0.0

            # Store the score in the matrix
            scores_matrix[calo_id][trackster_id] = score_3D

    return scores_matrix

print(reco_to_sim(0,CP_ind,CP_energy,track_ind,track_en))

[[0.         0.19740443 0.         0.34145134 0.31136226]
 [0.39041839 0.         0.14397996 0.         0.        ]]
[[0.         0.97447955 0.         0.13632692 0.96710424]
 [0.15149494 0.         0.91058102 0.         0.        ]]


In [12]:
#------Loop over all events and determine overall single value for efficiency and fake rate-------# (STOLEN FROM TICL)

# Initialize variables
efficiency_numerator = 0
efficiency_denominator = 0
purity_numerator = 0
purity_denominator = 0

# Loop over each event to calculate efficiency and fake rate
for event_idx, data in enumerate(test_loader):
    # Calculate the scores matrix for the current event
    scores_matrix_eff = sim_to_reco(event_idx, CP_ind, CP_energy, track_ind, track_en)
    scores_matrix_purity = reco_to_sim(event_idx, CP_ind, CP_energy, track_ind, track_en)

    # Efficiency Calculation
    total_calo_particles = scores_matrix_eff.shape[0]
    efficiency_denominator += total_calo_particles  # Add to efficiency denominator

    # Count matched calo particles for efficiency
    for row in scores_matrix_eff:
        if any(score < 0.2 for score in row):  # Match condition
            efficiency_numerator += 1

    # Purity Calculation
    total_reconstructed_tracksters = scores_matrix_purity.shape[1]
    purity_denominator += total_reconstructed_tracksters  # Add to purity denominator

    # Count matched tracksters for purity
    for column in scores_matrix_purity.T:  # Transpose matrix to loop over columns (tracksters)
        if any(score < 0.2 for score in column):  # Match condition
            purity_numerator += 1

# Calculate metrics
efficiency = efficiency_numerator / efficiency_denominator if efficiency_denominator > 0 else 0
purity = purity_numerator / purity_denominator if purity_denominator > 0 else 0
fake_rate = 1 - purity

# Results
print(f"Efficiency: {efficiency:.2f}")
print(f"Fake Rate: {fake_rate:.2f}")




0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49


ValueError: max() arg is an empty sequence

In [41]:
#--------plotting efficiency and fake rate over distance for both model and TICL----------------#
from collections import defaultdict
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

def calculate_distance(sim_centre_x, sim_centre_y, sim_centre_z):
    """
    Calculate pairwise distances between simulated cluster centers.
    """
    distances = []
    for event_idx in range(len(sim_centre_x)):
        x1, x2 = sim_centre_x[event_idx]
        y1, y2 = sim_centre_y[event_idx]
        z1, z2 = sim_centre_z[event_idx]
        distance = np.sqrt((x1 - x2)**2 + (y1 - y2)**2 + (z1 - z2)**2)
        distances.append(distance)
    return distances

def calculate_efficiency_vs_distance(
    CP_ind, CP_energy, track_ind, track_en, 
    model_ind, model_energy,  # New inputs for model-generated tracksters
    sim_centre_x, sim_centre_y, sim_centre_z, 
    distance_bins, threshold=0.2
):
    """
    Calculate efficiency as a function of distance between simulated clusters,
    comparing TICL-generated and model-generated tracksters.
    """
    # Calculate distances
    distances = calculate_distance(sim_centre_x, sim_centre_y, sim_centre_z)

    # Initialize results
    efficiency_per_bin_ticl = []
    efficiency_per_bin_model = []
    error_per_bin_ticl = []
    error_per_bin_model = []
    bin_centers = []
    event_counts = []

    for i in range(len(distance_bins) - 1):
        bin_min, bin_max = distance_bins[i], distance_bins[i + 1]
        bin_events = [idx for idx, d in enumerate(distances) if bin_min <= d < bin_max]

        # Skip bins with no events
        if not bin_events:
            continue

        # Initialize numerator and denominator for efficiency in this bin
        numerator_ticl = 0
        numerator_model = 0
        denominator_ticl = 0
        denominator_model = 0

        for event_idx in bin_events:
            # For TICL-generated tracksters
            scores_matrix_ticl = sim_to_reco(event_idx, CP_ind, CP_energy, track_ind, track_en)
            total_calo_particles_ticl = scores_matrix_ticl.shape[0]
            denominator_ticl += total_calo_particles_ticl

            for row in scores_matrix_ticl:
                if any(score < threshold for score in row):
                    numerator_ticl += 1

            # For model-generated tracksters
            scores_matrix_model = sim_to_reco(event_idx, CP_ind, CP_energy, model_ind, model_energy)
            total_calo_particles_model = scores_matrix_model.shape[0]
            denominator_model += total_calo_particles_model

            for row in scores_matrix_model:
                if any(score < threshold for score in row):
                    numerator_model += 1

        # Calculate efficiency and error for TICL-generated tracksters in this bin
        if denominator_ticl > 0:
            efficiency_ticl = numerator_ticl / denominator_ticl
            error_ticl = np.sqrt(efficiency_ticl * (1 - efficiency_ticl) / denominator_ticl)
        else:
            efficiency_ticl = 0
            error_ticl = 0

        # Calculate efficiency and error for model-generated tracksters in this bin
        if denominator_model > 0:
            efficiency_model = numerator_model / denominator_model
            error_model = np.sqrt(efficiency_model * (1 - efficiency_model) / denominator_model)
        else:
            efficiency_model = 0
            error_model = 0

        efficiency_per_bin_ticl.append(efficiency_ticl)
        error_per_bin_ticl.append(error_ticl)

        efficiency_per_bin_model.append(efficiency_model)
        error_per_bin_model.append(error_model)

        bin_centers.append((bin_min + bin_max) / 2)
        event_counts.append(len(bin_events))

    # Print event counts for each bin
    print("Distance Bins and Event Counts (Non-Zero Bins Only):")
    for center, count in zip(bin_centers, event_counts):
        print(f"Bin {center - 2.5}-{center + 2.5} cm: {count} events")

    # Plot the efficiency vs distance for both TICL-generated and model-generated tracksters
    plt.figure(figsize=(8, 6))

    plt.errorbar(bin_centers, efficiency_per_bin_ticl, yerr=error_per_bin_ticl, fmt='o', capsize=3, color='b', label='TICL Efficiency')
    plt.errorbar(bin_centers, efficiency_per_bin_model, yerr=error_per_bin_model, fmt='o', capsize=3, color='r', label='Model Efficiency')
    plt.plot(bin_centers, efficiency_per_bin_ticl, linestyle='-', color='b', alpha=0.7)
    plt.plot(bin_centers, efficiency_per_bin_model, linestyle='-', color='r', alpha=0.7)

    plt.xlabel('Distance [cm]')
    plt.ylim(0, 1.2)
    plt.xlim(min(bin_centers) - 5, max(bin_centers) + 5)
    plt.ylabel('Efficiency')
    plt.title('Efficiency vs Distance')
    plt.grid()
    plt.legend()
    plt.show()

    # Return results for further use
    return {
        "bin_centers": bin_centers,
        "efficiency_per_bin_ticl": efficiency_per_bin_ticl,
        "error_per_bin_ticl": error_per_bin_ticl,
        "efficiency_per_bin_model": efficiency_per_bin_model,
        "error_per_bin_model": error_per_bin_model,
        "event_counts": event_counts
    }


# Call the function with sample inputs
results = calculate_efficiency_vs_distance(
    CP_ind, CP_energy, track_ind, track_en, 
    model_ind, model_energy,
    sim_centre_x, sim_centre_y, sim_centre_z, 
    distance_bins=np.arange(0, 160, 5), threshold=0.2
)

In [51]:
def calculate_fake_rate_vs_distance(
    CP_ind, CP_energy, track_ind, track_en, 
    model_ind, model_energy,  # New inputs for model-generated tracksters
    sim_centre_x, sim_centre_y, sim_centre_z, 
    distance_bins, threshold=0.2
):
    """
    Calculate fake rate as a function of distance between simulated clusters,
    comparing TICL-generated and model-generated tracksters.
    """
    # Calculate distances
    distances = calculate_distance(sim_centre_x, sim_centre_y, sim_centre_z)
    
    # Initialize results
    fake_rate_per_bin_ticl = []
    fake_rate_per_bin_model = []
    error_per_bin_ticl = []
    error_per_bin_model = []
    bin_centers = []
    event_counts = []

    for i in range(len(distance_bins) - 1):
        bin_min, bin_max = distance_bins[i], distance_bins[i + 1]
        bin_events = [idx for idx, d in enumerate(distances) if bin_min <= d < bin_max]

        # Skip bins with no events
        if not bin_events:
            continue

        # Initialize numerator and denominator for fake rate in this bin
        numerator_ticl = 0
        numerator_model = 0
        denominator_ticl = 0
        denominator_model = 0

        for event_idx in bin_events:
            # For TICL-generated tracksters
            scores_matrix_ticl = reco_to_sim(event_idx, CP_ind, CP_energy, track_ind, track_en)
            total_reconstructed_tracksters_ticl = scores_matrix_ticl.shape[1]
            denominator_ticl += total_reconstructed_tracksters_ticl

            for column in scores_matrix_ticl.T:  # Iterate over columns (tracksters)
                if any(score < threshold for score in column):  # Match condition
                    numerator_ticl += 1

            # For model-generated tracksters
            scores_matrix_model = reco_to_sim(event_idx, CP_ind, CP_energy, model_ind, model_energy)
            total_reconstructed_tracksters_model = scores_matrix_model.shape[1]
            denominator_model += total_reconstructed_tracksters_model

            for column in scores_matrix_model.T:
                if any(score < threshold for score in column):  # Match condition
                    numerator_model += 1

        # Calculate fake rate and error for TICL-generated tracksters in this bin
        if denominator_ticl > 0:
            purity_ticl = numerator_ticl / denominator_ticl
            fake_rate_ticl = 1 - purity_ticl
            error_ticl = np.sqrt(purity_ticl * (1 - purity_ticl) / denominator_ticl)
        else:
            fake_rate_ticl = 0
            error_ticl = 0

        # Calculate fake rate and error for model-generated tracksters in this bin
        if denominator_model > 0:
            purity_model = numerator_model / denominator_model
            fake_rate_model = 1 - purity_model
            error_model = np.sqrt(purity_model * (1 - purity_model) / denominator_model)
        else:
            fake_rate_model = 0
            error_model = 0

        fake_rate_per_bin_ticl.append(fake_rate_ticl)
        error_per_bin_ticl.append(error_ticl)

        fake_rate_per_bin_model.append(fake_rate_model)
        error_per_bin_model.append(error_model)

        bin_centers.append((bin_min + bin_max) / 2)
        event_counts.append(len(bin_events))

    # Print event counts for each bin
    print("Distance Bins and Event Counts (Non-Zero Bins Only):")
    for center, count in zip(bin_centers, event_counts):
        print(f"Bin {center - 2.5}-{center + 2.5} cm: {count} events")

    # Plot the fake rate vs distance for both TICL-generated and model-generated tracksters
    plt.figure(figsize=(8, 6))

    plt.errorbar(bin_centers, fake_rate_per_bin_ticl, yerr=error_per_bin_ticl, fmt='o', capsize=3, color='b', label='TICL Fake Rate')
    plt.errorbar(bin_centers, fake_rate_per_bin_model, yerr=error_per_bin_model, fmt='o', capsize=3, color='r', label='Model Fake Rate')
    plt.plot(bin_centers, fake_rate_per_bin_ticl, linestyle='-', color='b', alpha=0.7)
    plt.plot(bin_centers, fake_rate_per_bin_model, linestyle='-', color='r', alpha=0.7)

    plt.xlabel('Distance [cm]')
    plt.ylim(-0.01, 0.6)
    plt.xlim(min(bin_centers) - 5, max(bin_centers) + 5)
    plt.ylabel('Fake Rate')
    plt.title('Fake Rate vs Distance')
    plt.grid()
    plt.legend()
    plt.show()

    # Return results for further use
    return {
        "bin_centers": bin_centers,
        "fake_rate_per_bin_ticl": fake_rate_per_bin_ticl,
        "error_per_bin_ticl": error_per_bin_ticl,
        "fake_rate_per_bin_model": fake_rate_per_bin_model,
        "error_per_bin_model": error_per_bin_model,
        "event_counts": event_counts
    }


# Call the function with sample inputs
distance_bins = np.arange(0, 160, 5)
results_fake_rate = calculate_fake_rate_vs_distance(
    CP_ind, CP_energy, track_ind, track_en, 
    model_ind, model_energy,
    sim_centre_x, sim_centre_y, sim_centre_z, 
    distance_bins, threshold=0.2
)


0.056074766355140186
