In [1]:
import numpy as np
import matplotlib.pyplot as plt
import h5py
from sklearn.model_selection import train_test_split
import tensorflow as tf
import tensorflow.math as tfmath
import tensorflow.keras as keras
from tensorflow.keras import layers, Model
import tensorflow.keras.backend as K
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.models import load_model
from sklearn.metrics import roc_curve, auc
import sklearn.metrics as sk

import os
import glob

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Flatten, Concatenate, Dense, Conv2D, LeakyReLU, ReLU, Dropout, BatchNormalization, Activation
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.metrics import Precision
from tensorflow.keras.regularizers import l1_l2

2025-07-13 12:40:24.211551: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
events = None  # Set to None to use all events, or specify a number to limit
test_size = 0.2  # 20% of the background sample left after the discovery split goes to the test set
val_size = 0.2   # 20% of the training set for validation (16% of the post-discovery sample, ~14.4% of all events)
input_shape = -1  # second argument to reshape(): -1 flattens each event into a single row

# Create output directory if it doesn't exist
os.makedirs('../data/processed_data/', exist_ok=True)

# Find all h5 files in the data directory.
# glob returns matches in arbitrary (filesystem-dependent) order, so sort them:
# the background section below picks bkg_files[0], and that choice must not
# change from one machine or run to the next.
data_files = sorted(glob.glob('../data/*.h5'))

# Separate background and signal files.
# A file counts as background when its basename starts with 'background'
# (case-insensitive); every other .h5 file is treated as signal.
bkg_files = [f for f in data_files if os.path.basename(f).lower().startswith('background')]
signal_files = [f for f in data_files if not os.path.basename(f).lower().startswith('background')]

# Process BACKGROUND data
# Fixed seed so the shuffle and every split below give the same datasets after
# a kernel restart. Change the value to draw a different (still repeatable) split.
split_seed = 42

if bkg_files:
    # Only one background file is expected; the first match is used
    bkg_file = bkg_files[0]
    if len(bkg_files) > 1:
        print(f'Warning: found {len(bkg_files)} background files, only using {bkg_file}')

    with h5py.File(bkg_file, 'r') as file:
        # Load 'Particles' into memory without the last entry of the final axis
        full_data = file['Particles'][:,:,:-1]

    # Shuffle events (axis 0) in place with a local generator, so the global
    # NumPy random state seen by other cells is left untouched.
    np.random.default_rng(split_seed).shuffle(full_data)
    if events: full_data = full_data[:events,:,:]

    # Split off 10% for discovery testing: with test_size=0.9 the first array
    # returned is the 10% complement, the second is the remaining 90%.
    discovery_data, remaining_data = train_test_split(
        full_data, test_size=0.9, shuffle=True, random_state=split_seed)

    # define training, test and validation datasets from the remaining 90%
    X_train, X_test = train_test_split(
        remaining_data, test_size=test_size, shuffle=True, random_state=split_seed)
    X_train, X_val = train_test_split(
        X_train, test_size=val_size, random_state=split_seed)

    # The split outputs are all that is needed from here on
    del full_data
    del remaining_data

    # flatten the data for model input (one row per event)
    X_train = X_train.reshape(X_train.shape[0], input_shape)
    X_test = X_test.reshape(X_test.shape[0], input_shape)
    X_val = X_val.reshape(X_val.shape[0], input_shape)
    discovery_data = discovery_data.reshape(discovery_data.shape[0], input_shape)

    # Save background dataset as '<background file name>_dataset.h5'
    bkg_basename = os.path.splitext(os.path.basename(bkg_file))[0]
    with h5py.File(f'../data/processed_data/{bkg_basename}_dataset.h5', 'w') as h5f:
        h5f.create_dataset('X_train', data = X_train)
        h5f.create_dataset('X_test', data = X_test)
        h5f.create_dataset('X_val', data = X_val)
        h5f.create_dataset('X_discovery', data = discovery_data)
else:
    # Without a background file none of X_train / X_test / X_val / discovery_data
    # are created, so say so here instead of skipping silently.
    print("Warning: no background file found in '../data/'; background datasets were not created")

# Process SIGNAL files
# Every non-background file is flattened to one row per event and written to
# its own '<name>_dataset.h5' file under the 'Data' key.
for signal_file in signal_files:
    signal_basename = os.path.splitext(os.path.basename(signal_file))[0]
    signal_out_path = f'../data/processed_data/{signal_basename}_dataset.h5'

    # Read 'Particles' into memory, leaving out the last entry of the final axis
    with h5py.File(signal_file, 'r') as src:
        signal_data = src['Particles'][:,:,:-1]

    # Keep the event axis and collapse the rest according to input_shape
    n_signal_events = signal_data.shape[0]
    signal_data = signal_data.reshape(n_signal_events, input_shape)

    # Save signal dataset
    with h5py.File(signal_out_path, 'w') as dst:
        dst.create_dataset('Data', data = signal_data)