In [None]:
import pandas as pd
import numpy as np
import h5py

def process_datasets(numbers, seed=None):
    """Convert per-dataset CSV files into one class-balanced float16 HDF5 file.

    For each charge type ('positive-charge', 'negative-charge') and each entry
    of ``numbers``, the label CSV (``labels_d{number}.csv``) and the input CSV
    (``recon8t_d{number}.csv``) are read. Rows with ``|pt| >= 2`` are treated
    as signal, all remaining rows as background. Background rows are randomly
    undersampled, without replacement, down to the signal count of the same
    file (or all background rows, if there are fewer of them than signal rows).
    The per-file pieces are stacked and stored as four datasets per charge
    type: ``{charge_type}_sig_input``, ``{charge_type}_sig_target``,
    ``{charge_type}_bkg_input`` and ``{charge_type}_bkg_target``.

    Args:
        numbers: Iterable of dataset numbers used to build the CSV file names.
        seed: Optional seed for the background undersampling. ``None`` (the
            default) draws from NumPy's global random state.

    Raises:
        ValueError: If ``numbers`` is empty. Raised before the output file is
            opened, so an existing output file is not truncated.

    Note:
        The output file is opened in ``'w'`` mode, which replaces any existing
        file at that path. All values are cast to float16; anything outside
        the float16 range would become inf -- TODO confirm the label columns fit.
    """
    # `numbers` is walked once per charge type, so a one-shot iterator must be
    # materialised or the second charge type would see no datasets at all.
    numbers = list(numbers)
    if not numbers:
        raise ValueError("process_datasets needs at least one dataset number")

    n_input_features = 8 * 13 * 21  # flattened width of one input row
    n_target_features = 13          # width of one label row

    # seed=None keeps drawing from NumPy's global random state.
    choose = np.random.choice if seed is None else np.random.default_rng(seed).choice

    with h5py.File('/gpfs/slac/atlas/fs1/d/hjia625/Smart_Pixel/data.hdf5', 'w') as h5f:
        for charge_type in ['positive-charge', 'negative-charge']:
            sig_input_list = []
            sig_target_list = []
            bkg_input_list = []
            bkg_target_list = []

            for number in numbers:
                print("Processing ", charge_type, number)

                # Read the target data
                target_file = f'/gpfs/slac/atlas/fs1/d/hjia625/Smart_Pixel/{charge_type}/labels_d{number}.csv'
                target_df = pd.read_csv(target_file)

                # Read the input data
                input_file = f'/gpfs/slac/atlas/fs1/d/hjia625/Smart_Pixel/{charge_type}/recon8t_d{number}.csv'
                input_df = pd.read_csv(input_file)

                # Boolean row masks: signal is |pt| >= 2, background is the rest.
                sig_indices = target_df['pt'].abs() >= 2
                bkg_indices = ~sig_indices

                # Signal rows are kept in full.
                sig_input_reshaped = input_df[sig_indices].to_numpy().reshape(-1, n_input_features).astype(np.float16)
                sig_target_reshaped = target_df[sig_indices].to_numpy().reshape(-1, n_target_features).astype(np.float16)
                sig_input_list.append(sig_input_reshaped)
                sig_target_list.append(sig_target_reshaped)

                # Background rows, before undersampling.
                bkg_input_reshaped = input_df[bkg_indices].to_numpy().reshape(-1, n_input_features).astype(np.float16)
                bkg_target_reshaped = target_df[bkg_indices].to_numpy().reshape(-1, n_target_features).astype(np.float16)

                # Random undersampling of the background. The sample size is
                # capped at the available background rows: sampling without
                # replacement raises ValueError when asked for more than exist.
                num_bkg_rows = bkg_input_reshaped.shape[0]
                num_bkg_samples = min(sig_input_reshaped.shape[0], num_bkg_rows)
                random_indices = choose(num_bkg_rows, num_bkg_samples, replace=False)

                # The same indices are applied to inputs and targets so the rows stay paired.
                bkg_input_list.append(bkg_input_reshaped[random_indices])
                bkg_target_list.append(bkg_target_reshaped[random_indices])

            # Stack the per-file pieces into one array per dataset.
            sig_input_combined = np.vstack(sig_input_list)
            sig_target_combined = np.vstack(sig_target_list)
            bkg_input_combined = np.vstack(bkg_input_list)
            bkg_target_combined = np.vstack(bkg_target_list)

            # Save data in HDF5 format
            h5f.create_dataset(f'{charge_type}_sig_input', data=sig_input_combined)
            h5f.create_dataset(f'{charge_type}_sig_target', data=sig_target_combined)
            h5f.create_dataset(f'{charge_type}_bkg_input', data=bkg_input_combined)
            h5f.create_dataset(f'{charge_type}_bkg_target', data=bkg_target_combined)

# Dataset ids 16501 through 16650 (range() excludes its stop value).
number_list = [*range(16501, 16651)]
process_datasets(number_list)

In [1]:
import pandas as pd
import numpy as np
import h5py

def process_datasets(numbers, seed=None):
    """Convert per-dataset CSV files into one class-balanced float32 HDF5 file.

    For each charge type ('positive-charge', 'negative-charge') and each entry
    of ``numbers``, the label CSV (``labels_d{number}.csv``) and the input CSV
    (``recon8t_d{number}.csv``) are read. Rows with ``|pt| >= 2`` are treated
    as signal, all remaining rows as background. Background rows are randomly
    undersampled, without replacement, down to the signal count of the same
    file (or all background rows, if there are fewer of them than signal rows).
    The per-file pieces are stacked and stored as four datasets per charge
    type: ``{charge_type}_sig_input``, ``{charge_type}_sig_target``,
    ``{charge_type}_bkg_input`` and ``{charge_type}_bkg_target``.

    Args:
        numbers: Iterable of dataset numbers used to build the CSV file names.
        seed: Optional seed for the background undersampling. ``None`` (the
            default) draws from NumPy's global random state.

    Raises:
        ValueError: If ``numbers`` is empty. Raised before the output file is
            opened, so an existing output file is not truncated.

    Note:
        The output file is opened in ``'w'`` mode, which replaces any existing
        file at that path.
    """
    # `numbers` is walked once per charge type, so a one-shot iterator must be
    # materialised or the second charge type would see no datasets at all.
    numbers = list(numbers)
    if not numbers:
        raise ValueError("process_datasets needs at least one dataset number")

    n_input_features = 8 * 13 * 21  # flattened width of one input row
    n_target_features = 13          # width of one label row
    out_dtype = np.float32

    # seed=None keeps drawing from NumPy's global random state.
    choose = np.random.choice if seed is None else np.random.default_rng(seed).choice

    with h5py.File('/gpfs/slac/atlas/fs1/d/hjia625/Smart_Pixel/fl32_data_1.hdf5', 'w') as h5f:
        for charge_type in ['positive-charge', 'negative-charge']:
            sig_input_list = []
            sig_target_list = []
            bkg_input_list = []
            bkg_target_list = []

            for number in numbers:
                print("Processing ", charge_type, number)

                # Read the target data
                target_file = f'/gpfs/slac/atlas/fs1/d/hjia625/Smart_Pixel/{charge_type}/labels_d{number}.csv'
                target_df = pd.read_csv(target_file)

                # Read the input data
                input_file = f'/gpfs/slac/atlas/fs1/d/hjia625/Smart_Pixel/{charge_type}/recon8t_d{number}.csv'
                input_df = pd.read_csv(input_file)

                # Boolean row masks: signal is |pt| >= 2, background is the rest.
                sig_indices = target_df['pt'].abs() >= 2
                bkg_indices = ~sig_indices

                # Signal rows are kept in full.
                sig_input_reshaped = input_df[sig_indices].to_numpy().reshape(-1, n_input_features).astype(out_dtype)
                sig_target_reshaped = target_df[sig_indices].to_numpy().reshape(-1, n_target_features).astype(out_dtype)
                sig_input_list.append(sig_input_reshaped)
                sig_target_list.append(sig_target_reshaped)

                # Background rows, before undersampling.
                bkg_input_reshaped = input_df[bkg_indices].to_numpy().reshape(-1, n_input_features).astype(out_dtype)
                bkg_target_reshaped = target_df[bkg_indices].to_numpy().reshape(-1, n_target_features).astype(out_dtype)

                # Random undersampling of the background. The sample size is
                # capped at the available background rows: sampling without
                # replacement raises ValueError when asked for more than exist.
                num_bkg_rows = bkg_input_reshaped.shape[0]
                num_bkg_samples = min(sig_input_reshaped.shape[0], num_bkg_rows)
                random_indices = choose(num_bkg_rows, num_bkg_samples, replace=False)

                # The same indices are applied to inputs and targets so the rows stay paired.
                bkg_input_list.append(bkg_input_reshaped[random_indices])
                bkg_target_list.append(bkg_target_reshaped[random_indices])

            # Stack the per-file pieces into one array per dataset.
            sig_input_combined = np.vstack(sig_input_list)
            sig_target_combined = np.vstack(sig_target_list)
            bkg_input_combined = np.vstack(bkg_input_list)
            bkg_target_combined = np.vstack(bkg_target_list)

            # Save data in HDF5 format
            h5f.create_dataset(f'{charge_type}_sig_input', data=sig_input_combined)
            h5f.create_dataset(f'{charge_type}_sig_target', data=sig_target_combined)
            h5f.create_dataset(f'{charge_type}_bkg_input', data=bkg_input_combined)
            h5f.create_dataset(f'{charge_type}_bkg_target', data=bkg_target_combined)

# Alternative range, kept for reference:
#number_list = list(range(16501,16626))
# NOTE(review): id 16625 is the last id of the commented-out range above and
# the first id of the range below -- confirm the overlap is intended.
number_list = [*range(16625, 16726)]
process_datasets(number_list)

Processing  positive-charge 16625
Processing  positive-charge 16626
Processing  positive-charge 16627
Processing  positive-charge 16628
Processing  positive-charge 16629
Processing  positive-charge 16630
Processing  positive-charge 16631
Processing  positive-charge 16632
Processing  positive-charge 16633
Processing  positive-charge 16634
Processing  positive-charge 16635
Processing  positive-charge 16636
Processing  positive-charge 16637
Processing  positive-charge 16638
Processing  positive-charge 16639
Processing  positive-charge 16640
Processing  positive-charge 16641
Processing  positive-charge 16642
Processing  positive-charge 16643
Processing  positive-charge 16644
Processing  positive-charge 16645
Processing  positive-charge 16646
Processing  positive-charge 16647
Processing  positive-charge 16648
Processing  positive-charge 16649
Processing  positive-charge 16650
Processing  positive-charge 16651
Processing  positive-charge 16652
Processing  positive-charge 16653
Processing  po

In [1]:
import pandas as pd
import numpy as np
import h5py

def process_datasets(numbers, seed=None):
    """Convert per-dataset Parquet files into one class-balanced float32 HDF5 file.

    For each entry of ``numbers`` the label file (``labels_d{number}.parquet``)
    and the input file (``recon3D_d{number}.parquet``) are read. A dataset whose
    label or input file cannot be read is reported on stdout and skipped. Rows
    with ``|pt| >= 2`` are treated as signal, all remaining rows as background.
    Background rows are randomly undersampled, without replacement, down to the
    signal count of the same file (or all background rows, if there are fewer
    of them than signal rows). The per-file pieces are stacked and stored as
    the datasets ``sig_input``, ``sig_target``, ``bkg_input`` and ``bkg_target``.

    Args:
        numbers: Iterable of dataset numbers used to build the Parquet file names.
        seed: Optional seed for the background undersampling. ``None`` (the
            default) draws from NumPy's global random state.

    Raises:
        ValueError: If no dataset could be loaded, either because ``numbers``
            is empty or because every file was skipped.

    Note:
        The output file is opened in ``'w'`` mode before any input is read,
        which replaces any existing file at that path.
    """
    n_input_features = 20 * 13 * 21  # flattened width of one input row
    n_target_features = 13           # width of one label row
    out_dtype = np.float32

    # seed=None keeps drawing from NumPy's global random state.
    choose = np.random.choice if seed is None else np.random.default_rng(seed).choice

    with h5py.File('/fs/ddn/sdf/group/atlas/d/hjia625/Smart_Pixel/fl32_data_v2.hdf5', 'w') as h5f:
        sig_input_list = []
        sig_target_list = []
        bkg_input_list = []
        bkg_target_list = []

        for number in numbers:

            print("Processing dataset number", number)

            # Initialize file paths
            target_file = f'/fs/ddn/sdf/group/atlas/d/hjia625/Smart_Pixel/data_v2/labels/labels_d{number}.parquet'
            input_file = f'/fs/ddn/sdf/group/atlas/d/hjia625/Smart_Pixel/data_v2/recon3D/recon3D_d{number}.parquet'

            # Read the target data; an unreadable file is reported and skipped
            # so it does not abort the whole run.
            try:
                target_df = pd.read_parquet(target_file)
            except Exception as e:
                print(f"Failed to read target file {number}: {e}")
                continue  # Skip this iteration and proceed to the next file

            # Read the input data under the same best-effort policy.
            try:
                input_df = pd.read_parquet(input_file)
            except Exception as e:
                print(f"Failed to read input file {number}: {e}")
                continue  # Skip this iteration and proceed to the next file

            # Boolean row masks: signal is |pt| >= 2, background is the rest.
            sig_indices = target_df['pt'].abs() >= 2
            bkg_indices = ~sig_indices

            # Signal rows are kept in full.
            sig_input_reshaped = input_df[sig_indices].to_numpy().reshape(-1, n_input_features).astype(out_dtype)
            sig_target_reshaped = target_df[sig_indices].to_numpy().reshape(-1, n_target_features).astype(out_dtype)
            sig_input_list.append(sig_input_reshaped)
            sig_target_list.append(sig_target_reshaped)

            # Background rows, before undersampling.
            bkg_input_reshaped = input_df[bkg_indices].to_numpy().reshape(-1, n_input_features).astype(out_dtype)
            bkg_target_reshaped = target_df[bkg_indices].to_numpy().reshape(-1, n_target_features).astype(out_dtype)

            # Random undersampling of the background. The sample size is capped
            # at the available background rows: sampling without replacement
            # raises ValueError when asked for more than exist.
            num_bkg_rows = bkg_input_reshaped.shape[0]
            num_bkg_samples = min(sig_input_reshaped.shape[0], num_bkg_rows)
            random_indices = choose(num_bkg_rows, num_bkg_samples, replace=False)

            # The same indices are applied to inputs and targets so the rows stay paired.
            bkg_input_list.append(bkg_input_reshaped[random_indices])
            bkg_target_list.append(bkg_target_reshaped[random_indices])

        # np.vstack cannot stack an empty list; fail with a message that names
        # the actual cause instead.
        if not sig_input_list:
            raise ValueError("process_datasets could not load any dataset; nothing to write")

        # Stack the per-file pieces into one array per dataset.
        sig_input_combined = np.vstack(sig_input_list)
        sig_target_combined = np.vstack(sig_target_list)
        bkg_input_combined = np.vstack(bkg_input_list)
        bkg_target_combined = np.vstack(bkg_target_list)

        # Save data in HDF5 format
        h5f.create_dataset('sig_input', data=sig_input_combined)
        h5f.create_dataset('sig_target', data=sig_target_combined)
        h5f.create_dataset('bkg_input', data=bkg_input_combined)
        h5f.create_dataset('bkg_target', data=bkg_target_combined)

# Run the conversion for dataset ids 17301 through 17459 (range() excludes its stop value).
number_list = [*range(17301, 17460)]
process_datasets(number_list)


Processing dataset number 17301
Processing dataset number 17302
Processing dataset number 17303
Processing dataset number 17304
Processing dataset number 17305
Processing dataset number 17306
Processing dataset number 17307
Processing dataset number 17308
Processing dataset number 17309
Processing dataset number 17310
Processing dataset number 17311
Processing dataset number 17312
Processing dataset number 17313
Processing dataset number 17314
Processing dataset number 17315
Processing dataset number 17316
Processing dataset number 17317
Processing dataset number 17318
Processing dataset number 17319
Processing dataset number 17320
Processing dataset number 17321
Processing dataset number 17322
Processing dataset number 17323
Processing dataset number 17324
Processing dataset number 17325
Processing dataset number 17326
Processing dataset number 17327
Processing dataset number 17328
Processing dataset number 17329
Processing dataset number 17330
Processing dataset number 17331
Processi