# Dataset Index Generation
Generates the event indices for the train, validation and test sets, assigning whole ROOT files to each set.

In [1]:
import h5py
# from progressbar import *
import re
import sys
import numpy as np

sys.path.append('../../WatChMaL_analysis/WatChMaL')
import analysis.event_utils as ev

## Options

In [22]:
# Number of ROOT files per label held out for validation and for testing;
# all remaining files of each label are used for training.
n_val_files = 200
n_test_files = 100
# Label values whose files are included in the splits.
labels = [0, 1]
# Particle name for each label value (used to key the per-particle FV-cut output).
inverse_label_dict = dict(enumerate(["gamma", "e-", "mu-", "pi0"]))

## Load dataset

In [23]:
# Alternative dataset, kept for reference:
# data_path = "/project/rpp-blairt2k/machine_learning/data/HKHybrid/numpy/e-mu-pi0_wo_mpmts.hdf5"
# Path of the HDF5 dataset whose events are indexed below.
data_path = "/project/rpp-blairt2k/machine_learning/data/HKHybrid/numpy/HKHybrid_e-gamma_E0to1000MeV_unif-pos-R3240-y3287cm_4pi-dir_6Mevts_w_mPMT.hdf5"
# Open the file read-only. NOTE(review): the handle is never closed in the visible cells.
f = h5py.File(data_path, "r")

In [24]:
# Copy the per-event datasets out of the HDF5 file into in-memory NumPy arrays.
positions = np.array(f['positions'])
event_labels = np.array(f['labels'])
# Convert the stored file names to str so they can be compared and sorted as text.
root_files = np.array(f['root_files']).astype(str)

In [5]:
# Display the label array to check which label values are present in the loaded file.
event_labels

array([1, 1, 1, ..., 3, 3, 3], dtype=int32)

## Find the files of each label and indices of each file

In [15]:
def atoi(text):
    """Convert a string of decimal digits to an ``int``; return any other string unchanged.

    Args:
        text: The string to convert.

    Returns:
        ``int(text)`` when ``text`` is non-empty and consists only of decimal
        digits, otherwise ``text`` itself.
    """
    # str.isdecimal() rather than str.isdigit(): isdigit() is also True for
    # characters such as superscript digits ("²"), which int() cannot parse
    # and would reject with ValueError.
    return int(text) if text.isdecimal() else text

def natural_keys(text):
    """Build a sort key that orders strings in human ("natural") order.

    The text is split into alternating non-digit and digit runs and every run
    is passed through ``atoi``, so digit runs compare numerically
    (``"file10"`` sorts after ``"file2"``).

    Usage: ``alist.sort(key=natural_keys)``.

    Based on http://nedbatchelder.com/blog/200712/human_sorting.html
    (see Toothy's implementation in the comments).
    """
    key = []
    # The capturing group makes re.split keep the digit runs in the result.
    for chunk in re.split(r'(\d+)', text):
        key.append(atoi(chunk))
    return key

In [25]:
# For each requested label: the distinct ROOT files that contain events of
# that label, in natural (human) sort order.
files_in_labels = {
    label: sorted(set(root_files[event_labels == label]), key=natural_keys)
    for label in labels
}

# For each ROOT file: the range of dataset indices starting at the file's first
# occurrence and spanning as many entries as the file has events.
# NOTE(review): this relies on every file's events being stored contiguously
# in the dataset — confirm.
unique_files, first_idxs, n_events = np.unique(root_files, return_index=True, return_counts=True)
idxs_in_files = {
    name: range(start, start + count)
    for name, start, count in zip(unique_files, first_idxs, n_events)
}

In [26]:
# Report, per label, how many files and how many event indices were found.
# The loop variables are deliberately not called `f` / `l`: a module-level
# loop variable named `f` would overwrite the open h5py file handle `f`,
# breaking any later read from the file.
for label, label_files in files_in_labels.items():
    n_indices = sum(len(idxs_in_files[name]) for name in label_files)
    print("label", label,"has", len(label_files),"files and ", n_indices, "indices")

label 0 has 1000 files and  3000000 indices
label 1 has 1000 files and  3000000 indices


## Create the splits

In [27]:
# Per label, the first `n_test_files` files go to the test set, the next
# `n_val_files` files to the validation set, and the rest to the training set.
val_start = n_test_files
train_start = n_test_files + n_val_files
split_slices = {
    "test_idxs": slice(None, val_start),
    "val_idxs": slice(val_start, train_start),
    "train_idxs": slice(train_start, None),
}

# Files of each split: the selected files of every label, label after label.
split_files = {
    split_name: [name for label in labels for name in files_in_labels[label][file_slice]]
    for split_name, file_slice in split_slices.items()
}

# Event indices of each split: the indices of every file in the split, in file order.
split_idxs = {
    split_name: [idx for name in file_names for idx in idxs_in_files[name]]
    for split_name, file_names in split_files.items()
}

In [28]:
# Report the number of files and event indices in each split.
for split_name, file_names in split_files.items():
    n_split_indices = len(split_idxs[split_name])
    print(split_name,"has", len(file_names),"files and", n_split_indices,"indices")

test_idxs has 200 files and 600000 indices
val_idxs has 400 files and 1200000 indices
train_idxs has 1400 files and 4200000 indices


In [29]:
# Sanity check that every event is assigned to exactly one split: the three
# printed numbers (events in the dataset, indices over all splits, distinct
# indices over all splits) should be equal.
all_indices = np.concatenate([split_idxs[split_name] for split_name in split_idxs])
print(len(event_labels))
print(all_indices.size)
print(np.unique(all_indices).size)

6000000
6000000
6000000


## Optional: fiducial volume (FV) cuts. To apply FV cuts, run the following two cells; otherwise, go straight to Save file

In [23]:
# h5_positions = np.array(f['positions'])
# Evaluate the project helper `ev.dwall` on the positions of the test events only
# (presumably the distance from each event to the nearest tank wall — confirm against
# analysis.event_utils). The tank dimensions are presumably in cm, matching
# "R3240-y3287cm" in the dataset file name — TODO confirm.
dwall = ev.dwall(positions[split_idxs['test_idxs']], tank_half_height=3287, tank_radius=3240, tank_axis=2)
# Boolean mask over the test events: True where dwall exceeds 200
# (presumably 200 cm = 2 m, cf. "fv_2m" in the output file name — confirm).
dwall_cut = dwall > 200.

In [24]:
# Split the test events that pass the FV cut by particle type.
# Every entry of `fv_2m_cut` holds positions *within the test set* (indices
# into `dwall_cut`), not indices into the full dataset.
#
# Events are attributed to a particle by their actual label instead of by
# assuming that the test set consists of two equally sized, label-ordered
# halves, so the result stays correct when the labels have different event
# counts and works for any number of entries in `labels`.
test_labels = np.ravel(event_labels[split_idxs['test_idxs']])
# Flatten the mask so it lines up element-wise with `test_labels` even if the
# helper returned it with a trailing singleton axis.
passes_cut = np.ravel(dwall_cut)
fv_2m_cut = {
    inverse_label_dict[label]: np.flatnonzero(passes_cut & (test_labels == label))
    for label in labels
}

# Kept for compatibility with the earlier version of this cell: the number of
# test events of the first label, and how many of them pass the cut.
one_particle_test_event_no = int(np.count_nonzero(test_labels == labels[0]))
first_particle_after_FVcut_no = len(fv_2m_cut[inverse_label_dict[labels[0]]])

# np.save stores the dict as a pickled object array, so it has to be loaded
# with np.load(..., allow_pickle=True).
# NOTE(review): the file name says "e-pi0" while the keys come from `labels` /
# `inverse_label_dict` — confirm the name matches the dataset in use.
np.save('/project/rpp-blairt2k/machine_learning/data/HKHybrid/numpy/e-pi0_w_mpmts_fv_2m.npy', fv_2m_cut)

## Save file

In [30]:
# Write the splits to a single .npz archive with one array per key of `split_idxs`
# ("test_idxs", "val_idxs", "train_idxs").
# NOTE(review): the file name says "wo_mpmts" while the dataset file loaded above is
# named "..._w_mPMT" — confirm the intended name.
np.savez('/project/rpp-blairt2k/machine_learning/data/HKHybrid/numpy/gammae-_wo_mpmts_idxs_712ttv.npz', **split_idxs)