In [2]:
import pandas as pd
import numpy as np
import h5py

_DEFAULT_BASE_DIR = '/gpfs/slac/atlas/fs1/d/hjia625/Smart_Pixel'
_INPUT_WIDTH = 8 * 13 * 21  # columns per row after flattening the input CSV
_TARGET_WIDTH = 13          # columns per row after flattening the labels CSV


def _load_balanced_split(charge_type, number, base_dir):
    """Read one labels/input CSV pair and return a balanced signal/background split.

    Rows whose ``|pt|`` (from the labels CSV) is >= 2 are treated as signal,
    all other rows (including rows where ``pt`` is NaN) as background.  The
    background is randomly undersampled, without replacement, to at most the
    number of signal rows.

    Returns:
        Tuple ``(sig_input, sig_target, bkg_input, bkg_target)`` of float16
        arrays with ``_INPUT_WIDTH`` / ``_TARGET_WIDTH`` columns respectively.
    """
    # Read the target data
    target_df = pd.read_csv(f'{base_dir}/{charge_type}/labels_d{number}.csv')
    # Read the input data
    input_df = pd.read_csv(f'{base_dir}/{charge_type}/recon8t_d{number}.csv')

    # The mask is built from the labels and applied to both frames, so both
    # CSVs are expected to share the same row index -- TODO confirm upstream.
    sig_mask = target_df['pt'].abs() >= 2
    bkg_mask = ~sig_mask

    sig_input = input_df[sig_mask].to_numpy().reshape(-1, _INPUT_WIDTH).astype(np.float16)
    sig_target = target_df[sig_mask].to_numpy().reshape(-1, _TARGET_WIDTH).astype(np.float16)
    bkg_input = input_df[bkg_mask].to_numpy().reshape(-1, _INPUT_WIDTH).astype(np.float16)
    bkg_target = target_df[bkg_mask].to_numpy().reshape(-1, _TARGET_WIDTH).astype(np.float16)

    # Sampling without replacement cannot draw more rows than exist; cap the
    # request so a file with fewer background than signal rows keeps all of
    # its background instead of raising ValueError.
    num_bkg_samples = bkg_input.shape[0]
    num_keep = min(sig_input.shape[0], num_bkg_samples)
    random_indices = np.random.choice(num_bkg_samples, num_keep, replace=False)

    return sig_input, sig_target, bkg_input[random_indices], bkg_target[random_indices]


def process_datasets(numbers, *, base_dir=_DEFAULT_BASE_DIR, output_path=None):
    """Build a balanced signal/background HDF5 file from per-dataset CSV pairs.

    For each charge type (``positive-charge``, ``negative-charge``) and each
    dataset number, reads ``labels_d<number>.csv`` and ``recon8t_d<number>.csv``
    from ``<base_dir>/<charge_type>/``, splits rows into signal and background
    on ``|pt| >= 2``, undersamples the background, and stacks the results.
    Four datasets per charge type are written to the output file:
    ``<charge_type>_sig_input``, ``_sig_target``, ``_bkg_input``, ``_bkg_target``.

    Args:
        numbers: Iterable of dataset numbers used to form the CSV file names.
            It is materialised once, so a generator is accepted.
        base_dir: Directory containing the per-charge-type sub-directories.
        output_path: Destination HDF5 file.  Defaults to
            ``<base_dir>/data.hdf5``.  The file is overwritten.

    Raises:
        ValueError: If ``numbers`` is empty.
    """
    # Materialise so the sequence can be traversed once per charge type, and
    # validate *before* opening the output: mode 'w' truncates an existing file.
    numbers = list(numbers)
    if not numbers:
        raise ValueError("numbers must contain at least one dataset number")
    if output_path is None:
        output_path = f'{base_dir}/data.hdf5'

    with h5py.File(output_path, 'w') as h5f:
        for charge_type in ['positive-charge', 'negative-charge']:
            per_file_splits = []
            for number in numbers:
                print("Processing ", charge_type, number)
                per_file_splits.append(_load_balanced_split(charge_type, number, base_dir))

            # Transpose the list of 4-tuples into four lists and stack each.
            sig_input, sig_target, bkg_input, bkg_target = (
                np.vstack(group) for group in zip(*per_file_splits)
            )

            # Save data in HDF5 format
            h5f.create_dataset(f'{charge_type}_sig_input', data=sig_input)
            h5f.create_dataset(f'{charge_type}_sig_target', data=sig_target)
            h5f.create_dataset(f'{charge_type}_bkg_input', data=bkg_input)
            h5f.create_dataset(f'{charge_type}_bkg_target', data=bkg_target)

# Dataset numbers 16501 through 16650; the range's upper bound is exclusive.
number_list = [*range(16501, 16651)]
process_datasets(number_list)

Processing  positive-charge 16501
Processing  positive-charge 16502
Processing  positive-charge 16503
Processing  positive-charge 16504
Processing  positive-charge 16505
Processing  positive-charge 16506
Processing  positive-charge 16507
Processing  positive-charge 16508
Processing  positive-charge 16509
Processing  positive-charge 16510
Processing  positive-charge 16511
Processing  positive-charge 16512
Processing  positive-charge 16513
Processing  positive-charge 16514
Processing  positive-charge 16515
Processing  positive-charge 16516
Processing  positive-charge 16517
Processing  positive-charge 16518
Processing  positive-charge 16519
Processing  positive-charge 16520
Processing  positive-charge 16521
Processing  positive-charge 16522
Processing  positive-charge 16523
Processing  positive-charge 16524
Processing  positive-charge 16525
Processing  positive-charge 16526
Processing  positive-charge 16527
Processing  positive-charge 16528
Processing  positive-charge 16529
Processing  po