In [1]:
import numpy as np
import h5py
import glob
import re
from pathlib import Path
import json

# Deep learning imports
import tensorflow as tf

# Scikit-learn for preprocessing
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, PowerTransformer

# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

2025-04-30 00:40:25.557844: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-04-30 00:40:25.856954: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-04-30 00:40:26.021259: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745998826.132700 3708235 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745998826.275788 3708235 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1745998826.936433 3708235 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linkin

In [2]:
class SignalDataReader:
    """Handles reading and preprocessing of signal physics event data."""
    
    def __init__(self, data_dir, scaler_path, job_id_to_model_map=None):
        self.data_dir = Path(data_dir)
        self.scaler_path = Path(scaler_path)
        self.job_id_to_model_map = job_id_to_model_map or {}

        # Initialize storage for signal models
        self.signal_models = []

        # Load feature lists and scaler parameters
        with open(self.scaler_path, 'r') as f:
            self.scaler_params = json.load(f)

        self.file_electron_features_list = [
            'electron_E', 'electron_pt', 'electron_eta', 'electron_phi',
            'electron_time',
            'electron_d0', 'electron_z0', 'electron_dpt',
            'electron_nPIX', 'electron_nMissingLayers',
            'electron_chi2', 'electron_numberDoF',  # Keep original features
            'electron_f1', 'electron_f3', 'electron_z'
        ]
        
        # Define feature lists based on README
        self.electron_features_list = [
            'electron_E', 'electron_pt', 'electron_eta', 'electron_phi',
            'electron_time',
            'electron_d0', 'electron_z0', 'electron_dpt',
            'electron_nPIX', 'electron_nMissingLayers',
            'electron_chi2', 'electron_numberDoF',
            'electron_f1', 'electron_f3', 'electron_z'
        ]
        
        self.photon_features_list = [
            'photon_E', 'photon_pt', 'photon_eta', 'photon_phi',
            'photon_time',
            'photon_maxEcell_E',
            'photon_f1', 'photon_f3', 'photon_r1', 'photon_r2',
            'photon_etas1', 'photon_phis1', 'photon_z'
        ]
        
        # Initialize and load scalers
        self._initialize_and_load_scalers()

        # Load and preprocess all data
        self.load_all_data()

    def _extract_job_id(self, filename):
        """Extract the job ID from a filename."""
        # Pattern: signal_JOBID.other_parts
        match = re.search(r'signal_(\d+)\.', str(filename))
        if match:
            return match.group(1)
        return None

    def _get_signal_model(self, job_id):
        """Get the signal model name from the job ID."""
        if not job_id:
            return "unknown"
        return self.job_id_to_model_map.get(job_id, f"job_{job_id}")

    def _initialize_and_load_scalers(self):
        """Initialize specialized scalers for each feature group based on saved parameters."""
        print(f"Loading specialized scalers from {self.scaler_path}...")
        
        # Initialize electron scalers and feature groups
        self.electron_scalers = {}
        self.electron_feature_groups = {}
        
        for group_name, params in self.scaler_params['electron'].items():
            # Get feature indices
            self.electron_feature_groups[group_name] = params.get('feature_indices', [])
            
            # Create the appropriate scaler type based on the saved parameters
            scaler_type = params.get('type', 'StandardScaler')
            
            # Initialize the scaler based on type and load its parameters
            self._initialize_scaler(self.electron_scalers, group_name, scaler_type, params)
        
        # Initialize photon scalers and feature groups
        self.photon_scalers = {}
        self.photon_feature_groups = {}
        
        for group_name, params in self.scaler_params['photon'].items():
            # Get feature indices
            self.photon_feature_groups[group_name] = params.get('feature_indices', [])
            
            # Create the appropriate scaler type based on the saved parameters
            scaler_type = params.get('type', 'StandardScaler')
            
            # Initialize the scaler based on type and load its parameters
            self._initialize_scaler(self.photon_scalers, group_name, scaler_type, params)
        
        # Initialize vertex scaler (simple case)
        vertex_params = self.scaler_params['vertex']
        self.vertex_scaler = StandardScaler()
        if 'mean' in vertex_params and 'scale' in vertex_params:
            self.vertex_scaler.mean_ = np.array(vertex_params['mean'])
            self.vertex_scaler.scale_ = np.array(vertex_params['scale'])
            self.vertex_scaler.var_ = np.square(self.vertex_scaler.scale_)
        
        print("Scalers loaded successfully.")

    def _initialize_scaler(self, scalers_dict, group_name, scaler_type, params):
        """Initialize a specific scaler based on its type and parameters."""
        if scaler_type == 'StandardScaler':
            scaler = StandardScaler()
            if 'mean' in params and 'scale' in params:
                scaler.mean_ = np.array(params['mean'])
                scaler.scale_ = np.array(params['scale'])
                scaler.var_ = np.square(scaler.scale_)
        
        elif scaler_type == 'RobustScaler':
            scaler = RobustScaler()
            if 'center' in params and 'scale' in params:
                scaler.center_ = np.array(params['center'])
                scaler.scale_ = np.array(params['scale'])
        
        elif scaler_type == 'MinMaxScaler':
            scaler = MinMaxScaler(feature_range=(-1, 1))
            scaler.min_ = np.array(params["min_"])
            scaler.scale_ = np.array(params["scale_"])
        
        elif scaler_type == 'PowerTransformer':
            method = params.get('method', 'yeo-johnson')
            standardize = params.get('standardize', True)
            scaler = PowerTransformer(method=method, standardize=standardize)
            scaler.lambdas_ = np.array(params['lambdas'])

            # Restore internal StandardScaler for standardization
            if standardize:
                scaler._scaler = StandardScaler()
                scaler._scaler.mean_ = np.array(params['mean'])
                scaler._scaler.scale_ = np.array(params['scale'])
            
            # Restore internal scaler manually if standardization was used
            if standardize and 'mean' in params and 'scale' in params:
                scaler._scaler = StandardScaler()
                scaler._scaler.mean_ = np.array(params['mean'])
                scaler._scaler.scale_ = np.array(params['scale'])
        
        else:
            # Default to StandardScaler if unknown type
            print(f"Warning: Unknown scaler type '{scaler_type}' for {group_name}, using StandardScaler")
            scaler = StandardScaler()
        
        # Store the initialized scaler
        scalers_dict[group_name] = scaler

    def load_all_data(self):
        """Load and preprocess signal data from HDF5 files."""
        print("Loading all signal data...")
        
        # Initialize as None for first file
        self.electron_features = None
        self.photon_features = None
        self.vertex_features = None
        
        file_count = 0
        
        for file_path in self.data_dir.glob("*.h5"):
            # Extract job ID and signal model from filename
            job_id = self._extract_job_id(file_path.name)
            signal_model = self._get_signal_model(job_id)
            
            with h5py.File(file_path, 'r', rdcc_nbytes=10*1024*1024) as f:
                n_events = len(f['events/PV_x'])
                print(f"Processing {file_path.name}: {n_events} events, Signal model: {signal_model}")
                
                # Load all data at once
                electrons = {feat: f[f'events/electrons/{feat}'][:] for feat in self.file_electron_features_list}
                photons = {feat: f[f'events/photons/{feat}'][:] for feat in self.photon_features_list}
                vertices = np.stack([
                    f['events/PV_x'][:],
                    f['events/PV_y'][:],
                    f['events/PV_z'][:]
                ], axis=1)
                
                # Process all events at once - for signal, only filter based on energy
                # e_mask = (electrons['electron_E'] > 0)
                
                # Initialize arrays for all events
                e_feats = np.zeros((n_events, 4, len(self.electron_features_list)))
                p_feats = np.zeros((n_events, 4, len(self.photon_features_list)))
                
                # Process all events at once
                for feat_idx, feat in enumerate(self.electron_features_list):
                    if feat == 'electron_chi2_over_nDoF':
                        # Calculate chi2/ndof ratio
                        chi2 = electrons['electron_chi2']
                        ndof = electrons['electron_numberDoF']
                        
                        # Initialize ratio with zeros
                        ratio = np.zeros_like(chi2)
                        
                        # Only calculate ratio where ndof > 0
                        valid_mask = ndof > 0
                        ratio[valid_mask] = chi2[valid_mask] / ndof[valid_mask]
                        e_feats[..., feat_idx] = ratio
                    else:
                        e_feats[..., feat_idx] = electrons[feat]
                    
                    # e_feats[..., feat_idx] = np.where(e_mask, e_feats[..., feat_idx], 0)  # Zero out electrons failing selection

                for feat_idx, feat in enumerate(self.photon_features_list):
                    p_feats[..., feat_idx] = photons[feat]

                # Apply event filtering: Require at least two objects
                electron_count = np.sum(e_feats[:, :, 0] > 0, axis=1)
                photon_count = np.sum(p_feats[:, :, 0] > 0, axis=1)
                total_count = electron_count + photon_count

                # Create mask for events with at least 2 objects
                valid_events = total_count >= 2
                
                # Apply the filter
                e_feats = e_feats[valid_events]
                p_feats = p_feats[valid_events]
                vertices = vertices[valid_events]
                
                # Create signal model labels for all events that passed filtering
                models = np.array([signal_model] * len(e_feats))
                job_ids = np.array([job_id] * len(e_feats))

                # Add to main arrays
                if self.electron_features is None:
                    self.electron_features = e_feats
                    self.photon_features = p_feats
                    self.vertex_features = vertices
                    self.signal_models = models
                    self.job_ids = job_ids
                else:
                    self.electron_features = np.concatenate([self.electron_features, e_feats])
                    self.photon_features = np.concatenate([self.photon_features, p_feats])
                    self.vertex_features = np.concatenate([self.vertex_features, vertices])
                    self.signal_models = np.concatenate([self.signal_models, models])
                    self.job_ids = np.concatenate([self.job_ids, job_ids])
                
                file_count += 1
                print(f"Processed {file_count} files, total events: {len(self.electron_features):,}")
        
        print(f"\nFinal dataset:")
        print(f"Total files processed: {file_count}")
        print(f"Total events: {len(self.electron_features):,}")
        print(f"Shapes: electrons {self.electron_features.shape}, photons {self.photon_features.shape}, vertices {self.vertex_features.shape}")
        print(f"Signal models: {len(np.unique(self.signal_models))} unique models")
        
        # Apply saved scalers
        print("\nApplying saved scalers to signal data...")
        # self._transform_features()
        
        print(f"Final dataset: {len(self.electron_features):,} events")
        print(f"Shapes: electrons {self.electron_features.shape}, photons {self.photon_features.shape}, vertices {self.vertex_features.shape}")    
    
    def _transform_features(self):
        """Transform features using loaded scalers."""
        # Create working copies
        e_feats_transformed = self.electron_features.copy()
        p_feats_transformed = self.photon_features.copy()
        
        # Process electron features by group
        for group_name, feature_indices in self.electron_feature_groups.items():
            # Get all feature data for this group at once
            group_values = np.column_stack([
                self.electron_features[:, :, idx].reshape(-1, 1) 
                for idx in feature_indices
            ])
            
            # Transform all features in the group together
            transformed_values = self.electron_scalers[group_name].transform(group_values)
            
            # Split back into individual features and update
            for i, feat_idx in enumerate(feature_indices):
                feat_transformed = transformed_values[:, i].reshape(
                    self.electron_features.shape[0], self.electron_features.shape[1]
                )
                e_feats_transformed[:, :, feat_idx] = feat_transformed
        
        # Process photon features by group
        for group_name, feature_indices in self.photon_feature_groups.items():
            # Get all feature data for this group at once
            group_values = np.column_stack([
                self.photon_features[:, :, idx].reshape(-1, 1) 
                for idx in feature_indices
            ])
            
            # Transform all features in the group together
            transformed_values = self.photon_scalers[group_name].transform(group_values)
            
            # Split back into individual features and update
            for i, feat_idx in enumerate(feature_indices):
                feat_transformed = transformed_values[:, i].reshape(
                    self.photon_features.shape[0], self.photon_features.shape[1]
                )
                p_feats_transformed[:, :, feat_idx] = feat_transformed
        
        # For vertices, simple standard scaling
        self.vertex_features = self.vertex_scaler.transform(self.vertex_features)
        
        # Update features with transformed versions
        self.electron_features = e_feats_transformed
        self.photon_features = p_feats_transformed 
    
    def transform_new_data(self, electron_features, photon_features, vertex_features):
        """Transform new data using loaded scalers."""
        # Create working copies
        # e_feats_transformed = electron_features.copy()
        # p_feats_transformed = photon_features.copy()
        
        # # Process electron features by group
        # for group_name, feature_indices in self.electron_feature_groups.items():
        #     # Get all feature data for this group at once
        #     group_values = np.column_stack([
        #         electron_features[:, :, idx].reshape(-1, 1) 
        #         for idx in feature_indices
        #     ])
            
        #     # Transform all features in the group together
        #     transformed_values = self.electron_scalers[group_name].transform(group_values)
            
        #     # Split back into individual features and update
        #     for i, feat_idx in enumerate(feature_indices):
        #         feat_transformed = transformed_values[:, i].reshape(
        #             electron_features.shape[0], electron_features.shape[1]
        #         )
        #         e_feats_transformed[:, :, feat_idx] = feat_transformed
        
        # # Process photon features by group
        # for group_name, feature_indices in self.photon_feature_groups.items():
        #     # Get all feature data for this group at once
        #     group_values = np.column_stack([
        #         photon_features[:, :, idx].reshape(-1, 1) 
        #         for idx in feature_indices
        #     ])
            
        #     # Transform all features in the group together
        #     transformed_values = self.photon_scalers[group_name].transform(group_values)
            
        #     # Split back into individual features and update
        #     for i, feat_idx in enumerate(feature_indices):
        #         feat_transformed = transformed_values[:, i].reshape(
        #             photon_features.shape[0], photon_features.shape[1]
        #         )
        #         p_feats_transformed[:, :, feat_idx] = feat_transformed
        
        # # For vertices, simple standard scaling
        # v_feats_transformed = self.vertex_scaler.transform(vertex_features)
        
        return electron_features, photon_features, vertex_features

    def get_all(self):
        """Return all processed data as a tuple."""
        all_data = (
            self.electron_features,
            self.photon_features,
            self.vertex_features,
            self.signal_models,
            self.job_ids
        )
        return all_data
    
    def save_processed_data(self, output_dir):
        """Save processed data and model information to files."""
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        
        # Save features
        np.save(output_dir / 'signal_data_e_v2_preserve.npy', self.electron_features)
        np.save(output_dir / 'signal_data_p_v2_preserve.npy', self.photon_features)
        np.save(output_dir / 'signal_data_v_v2_preserve.npy', self.vertex_features)
        
        # Save model information
        np.save(output_dir / 'signal_models_v2_preserve.npy', self.signal_models)
        np.save(output_dir / 'signal_job_ids_preserve.npy', self.job_ids)
        
        # Create a summary of the dataset
        model_summary = {}
        for model, job_id in zip(np.unique(self.signal_models), np.unique(self.job_ids)):
            mask = self.signal_models == model
            count = np.sum(mask)
            model_summary[model] = {
                'job_id': job_id,
                'event_count': int(count),
                'percentage': float(count / len(self.signal_models) * 100)
            }
        
        # Save summary as JSON
        with open(output_dir / 'signal_summary_preserve.json', 'w') as f:
            json.dump(model_summary, f, indent=2)
            
        print(f"Data saved to {output_dir}")
        print(f"Total events saved: {len(self.electron_features)}")
        print(f"Model summary saved to {output_dir / 'signal_summary_preserve.json'}")

In [3]:
job_id_to_model_map = {
  "543783": "110_30_0p1ns",
  "543784": "110_30_0p5ns",
  "543785": "110_30_10ns",
  "543786": "110_30_2ns",
  "543787": "200_10_0p1ns",
  "543789": "200_10_10ns",
  "543790": "200_10_2ns",
  "543792": "200_15_0p5ns",
  "543793": "200_15_10ns",
  "543794": "200_15_2ns",
  "543796": "200_50_0p5ns",
  "543797": "200_50_10ns",
  "543798": "200_50_2ns",
  "543799": "200_90_0p1ns",
  "543800": "200_90_0p5ns",
  "543801": "200_90_10ns",
  "543802": "200_90_2ns",
  "543803": "400_100_0p1ns",
  "543804": "400_100_0p5ns",
  "543805": "400_100_10ns",
  "543806": "400_100_2ns",
  "543807": "400_10_0p1ns",
  "543808": "400_10_0p5ns",
  "543809": "400_10_10ns",
  "543811": "400_15_0p1ns",
  "543812": "400_15_0p5ns",
  "543813": "400_15_10ns",
  "543814": "400_15_2ns",
  "543815": "400_190_0p1ns",
  "543816": "400_190_0p5ns",
  "543817": "400_190_10ns",
  "543819": "600_10_0p1ns",
  "543821": "600_10_10ns",
  "543822": "600_10_2ns",
  "543823": "600_150_0p1ns",
  "543824": "600_150_0p5ns",
  "543825": "600_150_10ns",
  "543826": "600_150_2ns",
  "543827": "600_15_0p1ns",
  "543828": "600_15_0p5ns",
  "543829": "600_15_10ns",
  "543830": "600_15_2ns",
  "543831": "600_290_0p1ns",
  "543832": "600_290_0p5ns",
  "543833": "600_290_10ns",
  "543834": "600_290_2ns",
  "543836": "60_25_0p5ns",
  "543837": "60_25_10ns",
  "543838": "60_25_2ns"
}

In [4]:
# Test data loading and preprocessing
data_dir = "/fs/ddn/sdf/group/atlas/d/hjia625/VLL-DP/VLL_classifier/hdf5_signal_output"
scalar_path = "/fs/ddn/sdf/group/atlas/d/hjia625/VLL-DP/VLL_classifier/src/output/scaler_params_v2.json"
reader = SignalDataReader(data_dir, scalar_path, job_id_to_model_map=job_id_to_model_map)

Loading specialized scalers from /fs/ddn/sdf/group/atlas/d/hjia625/VLL-DP/VLL_classifier/src/output/scaler_params_v2.json...
Scalers loaded successfully.
Loading all signal data...
Processing signal_543784.e8564_e8528_s4277_s4114_r15530_r15514_p6069.43297402._000001.trees.h5: 50000 events, Signal model: 110_30_0p5ns


Processed 1 files, total events: 38,720
Processing signal_543785.e8564_e8528_s4277_s4114_r15530_r15514_p6069.43297406._000001.trees.h5: 50000 events, Signal model: 110_30_10ns
Processed 2 files, total events: 46,987
Processing signal_543828.e8564_e8528_s4277_s4114_r15530_r15514_p6069.43297651._000001.trees.h5: 50000 events, Signal model: 600_15_0p5ns
Processed 3 files, total events: 59,845
Processing signal_543822.e8564_e8528_s4277_s4114_r15530_r15514_p6069.43297639._000001.trees.h5: 50000 events, Signal model: 600_10_2ns
Processed 4 files, total events: 61,561
Processing signal_543832.e8564_e8528_s4237_s4114_r15540_r15516_p6069.43297658._000001.trees.h5: 50000 events, Signal model: 600_290_0p5ns
Processed 5 files, total events: 110,277
Processing signal_543790.e8564_e8528_s4277_s4114_r15530_r15514_p6069.43297432._000001.trees.h5: 50000 events, Signal model: 200_10_2ns
Processed 6 files, total events: 118,060
Processing signal_543815.e8564_e8528_s4237_s4114_r15540_r15516_p6069.43297624

In [5]:
# Process and get data
data = reader.get_all()
electron_features, photon_features, vertex_features, signal_models, job_ids = data

# Or save everything to files
reader.save_processed_data("output_model_directory")

Data saved to output_model_directory
Total events saved: 2080280
Model summary saved to output_model_directory/signal_summary_preserve.json
