In [None]:
#0: imports

import uproot 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch

from imports.data import CCV1
from torch_geometric.data import DataLoader 
from imports.models import Net_SEC, Net_GAT, Net_Trans
from torch_geometric.nn import knn_graph

import numpy as np
import awkward as ak
import time
from imports.Agglomerative import Aggloremative

import warnings
warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)

Ppath = '/vols/cms/mm1221/Data/100k/5pi/test/'
Epath = '/vols/cms/mm1221/Data/100k/5e/test/'
# Load test data
data_P = CCV1(Ppath, max_events=500)
P_loader = DataLoader(data_P, batch_size=1, shuffle=False, follow_batch=['x'])

data_E = CCV1(Epath, max_events=500)
E_loader = DataLoader(data_E, batch_size=1, shuffle=False, follow_batch=['x'])


import uproot

# Define the file paths
data_Ep_path = '/vols/cms/mm1221/Data/100k/5e/test/raw/test.root'  # electron file
data_Pp_path = '/vols/cms/mm1221/Data/100k/5pi/test/raw/test.root'  # pion file

# Open the ROOT files using uproot
file_Ep = uproot.open(data_Ep_path)
file_Pp = uproot.open(data_Pp_path)

# Load the branches for electrons (data_Ep)
GT_ind_Ep   = file_Ep['simtrackstersCP']['vertices_indexes'].array()
GT_mult_Ep  = file_Ep['simtrackstersCP']['vertices_multiplicity'].array()
energies_Ep = file_Ep['clusters']['energy'].array()
LC_x_Ep     = file_Ep['clusters']['position_x'].array()
Track_ind_Ep = file_Ep['tracksters']['vertices_indexes'].array()
bc_x_Ep = file_Ep['tracksters']['barycenter_x'].array()

# Load the branches for pions (data_Pp)
GT_ind_Pp   = file_Pp['simtrackstersCP']['vertices_indexes'].array()
GT_mult_Pp  = file_Pp['simtrackstersCP']['vertices_multiplicity'].array()
energies_Pp = file_Pp['clusters']['energy'].array()
Track_ind_Pp = file_Pp['tracksters']['vertices_indexes'].array()
LC_x_Pp     = file_Pp['clusters']['position_x'].array()
bc_x_Pp = file_Pp['tracksters']['barycenter_x'].array()

# Create a skim mask to filter out events with 0 calorimeter particles (for electrons)
skim_mask_Ep = [len(e) >= 1 for e in bc_x_Ep]

# Apply the skim mask to filter the electron arrays
GT_ind_Ep   = GT_ind_Ep[skim_mask_Ep]
GT_mult_Ep  = GT_mult_Ep[skim_mask_Ep]
energies_Ep = energies_Ep[skim_mask_Ep]
Track_ind_Ep = Track_ind_Ep[skim_mask_Ep]

# Create a skim mask for pions
skim_mask = []
for e in bc_x_Pp:
    if len(e) == 0:
        skim_mask.append(False)
    else:
        skim_mask.append(True)

# Apply the skim mask to filter the pion arrays
GT_ind_Pp   = GT_ind_Pp[skim_mask]
GT_mult_Pp  = GT_mult_Pp[skim_mask]
energies_Pp = energies_Pp[skim_mask]
Track_ind_Pp = Track_ind_Pp[skim_mask]

print(len(Track_ind_Pp))

import awkward as ak

# ----- For pions -----


# Initialize lists to store filtered results for pions
filtered_GT_ind_P = []
filtered_GT_mult_P = []

# Loop over events for pions
for event_idx, track_indices in enumerate(Track_ind_Pp):
    # Flatten the current event's track indices and convert to a set
    track_flat = set(ak.flatten(track_indices).tolist())
    
    # Get the current event's GT arrays
    event_GT_ind = GT_ind_Pp[event_idx]
    event_GT_mult = GT_mult_Pp[event_idx]
    
    # Initialize lists for the filtered sublists in the current event
    filtered_event_GT_ind = []
    filtered_event_GT_mult = []
    
    # Loop over the sublists and filter using the track_flat set
    for sublist_ind, sublist_mult in zip(event_GT_ind, event_GT_mult):
        filtered_sublist_ind = [idx for idx in sublist_ind if idx in track_flat]
        filtered_sublist_mult = [mult for idx, mult in zip(sublist_ind, sublist_mult) if idx in track_flat]
        filtered_event_GT_ind.append(filtered_sublist_ind)
        filtered_event_GT_mult.append(filtered_sublist_mult)
    
    # Append the filtered event arrays to the output lists for pions
    filtered_GT_ind_P.append(filtered_event_GT_ind)
    filtered_GT_mult_P.append(filtered_event_GT_mult)

# Convert the filtered results back into awkward Arrays for pions
GT_ind_filt_P = ak.Array(filtered_GT_ind_P)
GT_mult_filt_P = ak.Array(filtered_GT_mult_P)

# ----- For electrons -----

# Initialize lists to store filtered results for electrons
filtered_GT_ind_E = []
filtered_GT_mult_E = []

# Loop over events for electrons
for event_idx, track_indices in enumerate(Track_ind_Ep):

    # Flatten the current event's track indices and convert to a set
    track_flat = set(ak.flatten(track_indices).tolist())
    
    # Get the current event's GT arrays
    event_GT_ind = GT_ind_Ep[event_idx]
    event_GT_mult = GT_mult_Ep[event_idx]
    
    # Initialize lists for the filtered sublists in the current event
    filtered_event_GT_ind = []
    filtered_event_GT_mult = []
    
    # Loop over the sublists and filter using the track_flat set
    for sublist_ind, sublist_mult in zip(event_GT_ind, event_GT_mult):
        filtered_sublist_ind = [idx for idx in sublist_ind if idx in track_flat]
        filtered_sublist_mult = [mult for idx, mult in zip(sublist_ind, sublist_mult) if idx in track_flat]
        filtered_event_GT_ind.append(filtered_sublist_ind)
        filtered_event_GT_mult.append(filtered_sublist_mult)
    
    # Append the filtered event arrays to the output lists for electrons
    filtered_GT_ind_E.append(filtered_event_GT_ind)
    filtered_GT_mult_E.append(filtered_event_GT_mult)

# Convert the filtered results back into awkward Arrays for electrons
GT_ind_filt_E = ak.Array(filtered_GT_ind_E)
GT_mult_filt_E = ak.Array(filtered_GT_mult_E)

model = Net_GAT(128,3, dropout=0.3, contrastive_dim=512, heads=16)
checkpoint= torch.load('/vols/cms/mm1221/hgcal/Mixed/Track/NegativeMining/runs/GATNEW/hd128nl3cd512k64h16/epoch-100.pt',  map_location=torch.device('cpu'))
#checkpoint= torch.load('/vols/cms/er421/hgcal/code/code/Mixed/LC/Full/results/hd128nl3cd16k64/epoch-100.pt',  map_location=torch.device('cpu'))
model.load_state_dict(checkpoint['model'])  
model.eval()  

import time
import numpy as np
import matplotlib.pyplot as plt

def calculate_metrics(df, model_name):
    # ----- Efficiency Calculation -----
    # Step 1: Filter out rows where 'cp_id' is NaN
    cp_valid = df.dropna(subset=['cp_id']).copy()

    # Step 2: Group by 'event_index' and 'cp_id' to proess each CaloParticle individually
    cp_grouped = cp_valid.groupby(['event_index', 'cp_id'])

    # Step 3: For each CaloParticle, check if any 'shared_energy' >= 50% of 'cp_raw_energy'
    def is_cp_associated(group):
        cp_raw_energy = group['cp_energy'].iloc[0]  # Assuming 'cp_raw_energy' is consistent within the group
        threshold = 0.8 * cp_raw_energy
        return (group['shared_energy'] >= threshold).any()

    # Apply the association function to each group
    cp_associated = cp_grouped.apply(is_cp_associated)

    # Step 4: Calculate the number of associated CaloParticles and total CaloParticles
    num_associated_cp = cp_associated.sum()
    total_cp = cp_associated.count()
    efficiency = num_associated_cp / total_cp if total_cp > 0 else 0

    # ----- Purity Calculation -----
    tst_valid = df.dropna(subset=['trackster_id']).copy()
    tst_grouped = tst_valid.groupby(['event_index', 'trackster_id'])
    tst_associated = tst_grouped['reco_to_sim_score'].min() < 0.2
    num_associated_tst = tst_associated.sum()
    total_tst = tst_associated.count()
    purity = num_associated_tst / total_tst if total_tst > 0 else 0
    
    # ----- Ratio between num of tracksters to caloparticles Calculation -----
    num_tracksters_ratio = total_tst / total_cp if total_cp > 0 else 0
    
    # Print results for the model
    print(f"\nModel: {model_name}")
    print(f"Efficiency: {efficiency:.4f} ({num_associated_cp} associated CPs out of {total_cp} total CPs)")
    print(f"FR: {1 - purity:.4f} ({num_associated_tst} associated Tracksters out of {total_tst} total Tracksters)")
    print(f"Num tracksters ratio: {num_tracksters_ratio}")

    return {
        'efficiency': efficiency,
        'purity': purity,
        'Num_tracksters_ratio': num_tracksters_ratio,
    }

from tqdm import tqdm
def calculate_reco_to_sim_score_and_sharedE(ReconstructedTrackster, energies_indices, CaloParticle, calo_mult):
    """
    Calculate the reco-to-sim score for a given ReconstructedTrackster and CaloParticle.

    Parameters:
    - ReconstructedTrackster: array of DetIds in the ReconstructedTrackster.
    - energies_indices: array of energies associated with all DetIds (indexed by DetId).
    - CaloParticle: array of DetIds in the CaloParticle.

    Returns:
    - reco_to_sim_score: the calculated reco-to-sim score.
    """
    numerator = 0.0
    denominator = 0.0
    sharedEnergy = 0.0

    # Iterate over all DetIds in the ReconstructedTrackster
    for i, det_id in enumerate(ReconstructedTrackster):
        energy_k = energies_indices[det_id]  # Energy for the current DetId in the Trackster
        
        # Fraction of energy in the Trackster (fr_k^TST)
        fr_tst_k = 1 

        #Fraction of energy in the caloparticle
        if det_id in CaloParticle:
            index = np.where(CaloParticle == det_id)[0][0]
            fr_sc_k = 1 / calo_mult[index]
            
        else:
            fr_sc_k = 0 # binary function also for CaloParticle
            
        # Update numerator using the min function
        numerator += min(
            (fr_tst_k - fr_sc_k) ** 2,  # First term in the min function
            fr_tst_k ** 2               # Second term in the min function
        ) * (energy_k ** 2)

        # Update denominator
        denominator += (fr_tst_k ** 2) * (energy_k ** 2)
        
        #shared_energy calculation
        recosharedEnergy = energy_k * fr_tst_k
        simsharedEnergy = energy_k * fr_sc_k
        sharedEnergy += min(simsharedEnergy,recosharedEnergy)
        
        

    # Calculate score
    reco_to_sim_score = numerator / denominator if denominator != 0 else 1.0
    return reco_to_sim_score, sharedEnergy


def calculate_all_event_scores(GT_ind, GT_mult, energies, recon_ind,  num_events = 100):
    """
    Calculate sim-to-reco and reco-to-sim scores for all CaloParticle and ReconstructedTrackster combinations across all events.

    Parameters:
    - GT_ind: List of CaloParticle indices for all events.
    - energies: List of energy arrays for all events.
    - recon_ind: List of ReconstructedTrackster indices for all events.
    - LC_x, LC_y, LC_z, LC_eta: Lists of x, y, z positions and eta values for all DetIds across events.

    Returns:
    - DataFrame containing scores and additional features for each CaloParticle-Trackster combination across all events.
    """
    # Initialize an empty list to store results
    all_results = []

    # Loop over all events with a progress bar
    for event_index in tqdm(range(num_events)):
        caloparticles = GT_ind[event_index]  # Indices for all CaloParticles in the event
        tracksters = recon_ind[event_index]  # Indices for all ReconstructedTracksters in the event
        event_energies = energies[event_index]  # Energies for this event
        event_GT_mult = GT_mult[event_index]
        
        # Loop over all CaloParticles
        for calo_idx, caloparticle in enumerate(caloparticles):
            calo_mult = event_GT_mult[calo_idx]
            cp_raw_energy_lc = event_energies[caloparticle] / calo_mult
            cp_raw_energy = np.sum(cp_raw_energy_lc)

            
            for trackster_idx, trackster in enumerate(tracksters):

                
                # Calculate reco-to-sim score
                reco_to_sim_score, shared_energy = calculate_reco_to_sim_score_and_sharedE(trackster, event_energies, caloparticle,calo_mult)
                # Calculate trackster energy

                # Append results
                all_results.append({
                    "event_index": event_index,
                    "cp_id": calo_idx,
                    "trackster_id": trackster_idx,
                    "reco_to_sim_score": reco_to_sim_score,
                    "cp_energy": cp_raw_energy,
                    "shared_energy": shared_energy,
                })

    # Convert results to a DataFrame
    df = pd.DataFrame(all_results)
    return df

import numpy as np
import awkward as ak
import pandas as pd

# ---------------------------------------------------------
# 1) Define your threshold scan with more data points:
#    - From 0 to 0.4: every 0.005
#    - From 0.4 to 1: every 0.02
# ---------------------------------------------------------
first_segment = np.arange(0, 0.3, 0.002)
second_segment = np.arange(0.3, 1.01, 0.01)  # 1.02 to ensure inclusion of 1.0
threshold_values = np.concatenate((first_segment, second_segment))

# Optionally, you can save these DataFrames to CSV files:


all_predictions_P = []
for i, data in enumerate(P_loader):  # P_loader => DataLoader for pion data
    edge_index = knn_graph(data.x[:, :3], k=64, batch=data.x_batch)
    pred = model(data.x,edge_index, data.x_batch)
    all_predictions_P.append(pred[0].detach().cpu().numpy())

efficiencies_P, fakerates_P, ratio_P = [], [], []
# ---------------------------------------------------------
# 5) Loop over thresholds and compute metrics for pions
# ---------------------------------------------------------




### Loading tracksters data


  0%|                                                                                                                     | 0/1 [00:00<?, ?it/s]

/vols/cms/mm1221/Data/100k/5pi/test/raw/test.root


  0%|                                                                                                                     | 0/1 [00:16<?, ?it/s]


Reached 500 events!
### Loading tracksters data


  0%|                                                                                                                     | 0/1 [00:00<?, ?it/s]

/vols/cms/mm1221/Data/100k/5e/test/raw/test.root


  0%|                                                                                                                     | 0/1 [00:11<?, ?it/s]

Reached 500 events!





9763


In [11]:
for t in threshold_values:
    print(f"\n[Pions] Threshold = {t:.3f}")
    
    # --- 4.1 Cluster using your Aggloremative function (or other) ---
    all_cluster_labels = Aggloremative(all_predictions_P, threshold=t)
    print(len(all_cluster_labels))
    # --- 4.2 Reconstruct tracksters ---
    # Build "recon_ind_E" by grouping cluster indices for each event
    recon_ind = []

    for event_idx, labels in enumerate(all_cluster_labels):

        event_clusters = {} 

        for cluster_idx, cluster_label in enumerate(labels):
            if cluster_label not in event_clusters:
                event_clusters[cluster_label] = []
            event_clusters[cluster_label].extend(Track_ind_Pp[event_idx][cluster_idx])

        recon_ind.append([event_clusters[label] for label in sorted(event_clusters.keys())])
    

    # --- 5.3 Calculate event scores ---
    # Use your *filtered* GT arrays and energies for pions
    df_CL_temp_P = calculate_all_event_scores(
        GT_ind_filt_P, 
        GT_mult_filt_P, 
        energies_Pp, 
        recon_ind, 
        num_events=300  # using 50 events
    )
    
    # --- 5.4 Compute metrics and store in lists ---
    metrics_P = calculate_metrics(df_CL_temp_P, f"Threshold {t:.3f}")
    efficiencies_P.append(metrics_P['efficiency'])
    fakerates_P.append(1.0 - metrics_P['purity'])  
    ratio_P.append(metrics_P['Num_tracksters_ratio'])

# ---------------------------------------------------------
# 6) Build separate DataFrames for electrons and pions
# ---------------------------------------------------------


df_pions = pd.DataFrame({
    'threshold': threshold_values,
    'efficiency': efficiencies_P,
    'fake_rate': fakerates_P,
    'num_tracksters_ratio': ratio_P,
})

df_pions.to_csv("pion_GAT.csv", index=False)


[Pions] Threshold = 0.000
9763


ValueError: in ListOffsetArray64 attempting to get 0, index out of range

(https://github.com/scikit-hep/awkward-1.0/blob/1.10.3/src/libawkward/array/ListOffsetArray.cpp#L682)

In [10]:
print(len(Track_ind_Pp))

9837


In [7]:
print(Track_ind_Pp[0])

[[100, 102, 151, 180, 203, 234, 235, 266, ... 1181, 1193, 1216, 1217], [1194, 1215]]
