# Dataset Index Generation 

This notebook generates the test-set indices for the physics samples dataset.

In [1]:
import sys
import os
import h5py
from collections import Counter
from progressbar import *
import re
import numpy as np
from scipy import signal
import matplotlib.pyplot as plt 
from functools import reduce

# Add the path to the parent directory to augment search for module
par_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Register every extra search location at most once, in the same order as
# before, so that re-running this cell does not keep growing sys.path with
# duplicate entries.
for extra_path in (par_dir, "../..", ".."):
    if extra_path not in sys.path:
        sys.path.append(extra_path)

%load_ext autoreload
%matplotlib inline
%autoreload 2

from IPython.display import HTML

In [2]:
# HDF5 file holding the physics samples that the indices are generated for.
data_path = "/fast_scratch/WatChMaL/data/physics_samples_five_thousand_files_apr28.h5"

# Open read-only; the handle stays open because later cells read datasets from it.
f = h5py.File(data_path, mode="r")

In [3]:
# Show the names of the top-level members of the file...
print(f.keys())

# ...followed by the summary (name, shape, dtype) of each one.
for dataset_name in f:
    print(f[dataset_name])

<KeysViewHDF5 ['angles', 'energies', 'event_hits_index', 'event_ids', 'hit_charge', 'hit_pmt', 'hit_time', 'labels', 'positions', 'root_files', 'veto', 'veto2']>
<HDF5 dataset "angles": shape (1897294, 2), type "<f4">
<HDF5 dataset "energies": shape (1897294, 1), type "<f4">
<HDF5 dataset "event_hits_index": shape (1897294,), type "<i8">
<HDF5 dataset "event_ids": shape (1897294,), type "<i4">
<HDF5 dataset "hit_charge": shape (2090271258,), type "<f4">
<HDF5 dataset "hit_pmt": shape (2090271258,), type "<i4">
<HDF5 dataset "hit_time": shape (2090271258,), type "<f4">
<HDF5 dataset "labels": shape (1897294,), type "<i4">
<HDF5 dataset "positions": shape (1897294, 1, 3), type "<f4">
<HDF5 dataset "root_files": shape (1897294,), type "|O">
<HDF5 dataset "veto": shape (1897294,), type "|b1">
<HDF5 dataset "veto2": shape (1897294,), type "|b1">


In [4]:
# Grab handles to the three per-hit datasets; nothing is read from them here.
hdf5_hit_pmt, hdf5_hit_time, hdf5_hit_charge = f["hit_pmt"], f["hit_time"], f["hit_charge"]

In [5]:
def _memmap_dataset(file_path, dataset):
    """Memory-map the raw data of an HDF5 dataset as a read-only array.

    Args:
        file_path: Path of the HDF5 file that contains ``dataset``.
        dataset: h5py dataset whose shape, dtype and byte offset describe
            the region of the file to map.

    Returns:
        A read-only ``np.memmap`` with the dataset's shape and dtype.

    Raises:
        ValueError: If the dataset has no single byte offset in the file
            (``get_offset()`` returned ``None``), in which case it cannot be
            mapped as one flat region.
    """
    offset = dataset.id.get_offset()
    if offset is None:
        # Fail with a clear message instead of an opaque error from np.memmap.
        raise ValueError(
            "Dataset {!r} has no contiguous byte offset and cannot be "
            "memory-mapped".format(dataset.name)
        )
    return np.memmap(file_path, mode="r", shape=dataset.shape,
                     offset=offset, dtype=dataset.dtype)


# Map the per-hit datasets straight from the file rather than reading them
# into memory.
hit_pmt = _memmap_dataset(data_path, hdf5_hit_pmt)

hit_time = _memmap_dataset(data_path, hdf5_hit_time)

hit_charge = _memmap_dataset(data_path, hdf5_hit_charge)

In [6]:
# Read every per-event dataset out of the HDF5 file into an in-memory NumPy
# array, in the same order as the names listed below.
(angles, energies, positions, labels, root_files, veto, veto2) = (
    np.array(f[dataset_name])
    for dataset_name in (
        'angles', 'energies', 'positions', 'labels', 'root_files', 'veto', 'veto2',
    )
)

In [7]:
# Print the positions array loaded from the file; NumPy abbreviates the middle
# rows of a large array with "...".
print(positions)

[[[ -247.50923  -2050.8457      94.77088 ]]

 [[  -46.375065 -1879.7782    -278.02145 ]]

 [[  285.2023   -2202.5671     265.48422 ]]

 ...

 [[ -345.21408  -2302.003     -148.44632 ]]

 [[  -24.09766  -2189.0469     270.51706 ]]

 [[ -251.45122  -2303.6245      98.18861 ]]]


In [None]:
# Set up indices
# np.arange builds the integer index array directly (no Python-level range
# iteration) and stays integer-typed even when there are no events.
indices = np.arange(len(labels))

# Set up dict of file indices
# Every distinct root file name gets its own fresh, empty list; keys keep the
# order in which the files first appear in root_files.
file_dict = {root_file: [] for root_file in root_files}
print("Dict set")

# Record, for each file, the positions of the events that came from it.
for idx, root_file in enumerate(root_files):
    file_dict[root_file].append(idx)
print("Done")

In [None]:
# Get files associated with each particle type

# Keep the indices of the events whose label is -1.
samples_indices = indices[labels == -1]

# dict.fromkeys drops repeated file names while keeping first-seen order.
unique_sample_files = dict.fromkeys(root_files[samples_indices])
samples_root_file_set = list(unique_sample_files)

print(samples_root_file_set)
print(len(samples_root_file_set))

In [None]:
def get_indices_for_files(file_names, index_by_file=None):
    """Collect the event indices belonging to the given root files.

    Args:
        file_names: Iterable of root file names to look up.
        index_by_file: Optional mapping from file name to a list of event
            indices. Defaults to the notebook-level ``file_dict``.

    Returns:
        A 1-D int64 NumPy array with the indices of every requested file,
        concatenated in the order of ``file_names``. The array is empty
        (but still integer-typed) when ``file_names`` is empty.

    Raises:
        KeyError: If a file name is not present in the mapping.
    """
    if index_by_file is None:
        # Resolved at call time so the global only has to exist when used.
        index_by_file = file_dict

    all_indices = []
    for file_name in file_names:
        all_indices.extend(index_by_file[file_name])

    # An explicit integer dtype keeps the result usable as an index array even
    # when no indices were collected (np.array([]) would default to float64).
    return np.array(all_indices, dtype=np.int64)

In [None]:
# Every sample file goes into the test set; work on a copy of the list.
samples_test_files = list(samples_root_file_set)

samples_test_set = get_indices_for_files(samples_test_files)

# Report the indices, how many there are, and whether they match `indices`
# element for element.
for summary in (
    samples_test_set,
    len(samples_test_set),
    np.array_equal(samples_test_set, indices),
):
    print(summary)

In [None]:
# Write the test-set indices to an .npz archive under the "test_idxs" key.
test_idxs_path = '/home/hlahiouel/physics_samples_five_thousand_files_apr28.npz'
np.savez(test_idxs_path, test_idxs=samples_test_set)