In [20]:
"""
This code is crucial for calculating the class weights (f1, f2, f3) 
that  will  be  used  in the loss function to account for the class 
imbalance in the training data. Since my training dataset contains
varying numbers of photons (pid = 22), neutrons (pid = 2112), and
other particle types, applying inverse frequency weights keeps the
model from being biased toward the more frequent classes.
Without these weights, the model would prioritize minimizing the loss 
for the most common particles, ignoring the minority classes, which 
would lead to poor generalization and performance on underrepresented 
particles.  By computing N1, N2, and N3 across the desired number of 
files and deriving class weights based on their inverse frequency, I 
can effectively balance the contribution of each class to the overall 
loss. This will help the model learn better representations for all 
particle types, regardless of their frequency in the training sample, 
which is essential for a more robust model.
"""

import sys
sys.path.append("../../src")
from ECALDataReader import ECALDataReader
import glob
import numpy as np
from tqdm import tqdm

# Configuration: project directory and the maximum number of files to read
project_dir = "../../projects/flightb741.09.24.2024.20.53/"
Nfiles = 25

# pid values of the two classes that are counted individually;
# every other pid is lumped into the "others" class.
PHOTON_PID = 22
NEUTRON_PID = 2112

# Collect all files in the dst directory. glob returns matches in arbitrary,
# filesystem-dependent order, so sort them to make the selected subset (and
# therefore the computed weights) reproducible from run to run.
files = sorted(glob.glob(f"{project_dir}/dst/*"))
if not files:
    # Fail loudly: with no input every count stays 0 and the fallback below
    # would silently report all three weights as 1.0.
    raise FileNotFoundError(f"No files found in {project_dir}/dst/")

# Read at most Nfiles files (fewer if the directory holds fewer)
Nfiles = min(Nfiles, len(files))
files = files[:Nfiles]

# Running particle counts per class, accumulated over all events of all files
N1, N2, N3 = 0, 0, 0  # N1: photons (pid = 22), N2: neutrons (pid = 2112), N3: others

# Loop over each file
for file in tqdm(files, desc="Processing files"):
    # Initialize the data reader for the current file
    reader = ECALDataReader(file)

    # Iterating reader.file steps through the events. The event object itself
    # is not used below; presumably get_dict() reads the bank of the reader's
    # current event -- TODO confirm against ECALDataReader.
    for event in reader.file:
        # Extract Monte Carlo particle data
        mc_parts = reader.get_dict("MC::Particle")

        # Count occurrences of each particle class in this event
        pids = mc_parts.pid
        N1 += np.sum(pids == PHOTON_PID)     # photons
        N2 += np.sum(pids == NEUTRON_PID)    # neutrons
        N3 += np.sum((pids != PHOTON_PID) & (pids != NEUTRON_PID))  # all other particles

# Total number of particles counted (summed over all classes, not events)
N_total = N1 + N2 + N3

# Inverse frequency weight per class: N_total / N_class.
# A class that never occurs falls back to a weight of 1.0 to avoid dividing by zero.
f1 = N_total / N1 if N1 > 0 else 1.0
f2 = N_total / N2 if N2 > 0 else 1.0
f3 = N_total / N3 if N3 > 0 else 1.0

# Display class counts and the computed inverse frequency weights
print(f"Total photon count (pid=22): {N1}")
print(f"Total neutron count (pid=2112): {N2}")
print(f"Total other particles count: {N3}")
print(f"Inverse frequency weights: f1={f1:.2f}, f2={f2:.2f}, f3={f3:.2f}")


Processing files: 100%|████████████████████████| 25/25 [00:37<00:00,  1.49s/it]

Total photon count (pid=22): 65459
Total neutron count (pid=2112): 11033
Total other particles count: 102677
Inverse frequency weights: f1=2.74, f2=16.24, f3=1.74





In [18]:
# Display the MC::Particle table currently bound to `mc_parts`, presumably the
# value left behind by the last event of the counting loop.
# NOTE(review): this cell relies on kernel state from another cell, and its
# execution count (18) precedes the counting cell's (20), so the table shown
# may come from an earlier run -- re-run top to bottom to confirm.
mc_parts

Unnamed: 0,pid,px,py,pz,vx,vy,vz,vt
0,11,-0.7339,1.1613,6.0915,-0.0022,-0.0019,0.0,124.0
1,2212,0.4007,-0.3131,1.2699,-0.0022,-0.0019,0.0,124.0
2,22,-0.0043,-0.0255,0.1302,-0.0022,-0.0019,0.0,124.0
3,22,0.4042,-0.6407,1.9643,-0.0022,-0.0019,0.0,124.0
4,-211,-0.0734,0.1009,0.4665,-0.0022,-0.0019,0.0,124.0
5,211,-0.0316,0.049,0.0316,-0.0022,-0.0019,0.0,124.0
6,22,0.0402,-0.0357,0.0468,-0.0022,-0.0019,0.0,124.0
7,22,-0.002,-0.2964,0.5992,-0.0022,-0.0019,0.0,124.0
