# Dataset Generation - include Pions

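This notebook builds a combined dataset by appending the pion (pi0) events to the existing emg events, dataset by dataset, and writing the result to a new HDF5 file.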

In [1]:
import sys
import os
import h5py
from collections import Counter
from progressbar import *
import re
import numpy as np
from scipy import signal
import matplotlib
from functools import reduce

# Make the parent directory (absolute) plus the relative "../.." and ".."
# entries searchable for module imports.
par_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Append every entry at most once: the original appended "../.." and ".."
# unconditionally, so each re-run of this cell grew sys.path with duplicates.
for extra_path in (par_dir, "../..", ".."):
    if extra_path not in sys.path:
        sys.path.append(extra_path)


%load_ext autoreload
%matplotlib inline
%autoreload 2

from IPython.display import HTML

## Load Data

In [2]:
# Open the emg digihits file read-only and list the datasets it contains.
emg_data_path = "/fast_scratch/WatChMaL/data/IWCD_mPMT_Short_emg_E0to1000MeV_digihits.h5"
f = h5py.File(emg_data_path, mode="r")

print(f.keys())

<KeysViewHDF5 ['angles', 'energies', 'event_hits_index', 'event_ids', 'hit_charge', 'hit_pmt', 'hit_time', 'labels', 'positions', 'root_files', 'veto', 'veto2']>


In [3]:
# Open the pi0 digihits file read-only and list the datasets it contains.
pion_data_path = "/fast_scratch/WatChMaL/data/IWCD_mPMT_Short_pi0_E0to1000MeV_digihits.h5"
p = h5py.File(pion_data_path, mode="r")

print(p.keys())

<KeysViewHDF5 ['angles', 'energies', 'event_hits_index', 'event_ids', 'hit_charge', 'hit_pmt', 'hit_time', 'labels', 'positions', 'root_files', 'veto', 'veto2']>


## New h5 File

In [4]:
# Output file for the merged datasets. Mode 'w' creates the file and
# truncates it if it already exists, so re-running this cell starts over.
new_h5_file = h5py.File(
    '/data/WatChMaL/data/IWCD_mPMT_Short_emgp0_E0to1000MeV_digihits.h5',
    mode='w',
)

## Angles

In [5]:
# Handles to the "angles" datasets of the emg (f) and pion (p) files.
f_angles, p_angles = f["angles"], p["angles"]

# Row count and element type of the emg dataset, then the pion row count.
print(f_angles.shape[0], f_angles.dtype, p_angles.shape[0], sep="\n")

20613195
float32
2868354


In [6]:
# NOTE: despite their names, num_rows_f / num_rows_p hold the full shape
# tuples of the emg and pion "angles" datasets, not just the row counts.
num_rows_f, num_rows_p = f_angles.shape, p_angles.shape
print(num_rows_f, num_rows_p, sep="\n")

(20613195, 2)
(2868354, 2)


In [7]:
# Allocate the merged "angles" dataset: emg rows followed by pion rows.
# The trailing dimensions are read from the emg dataset itself rather than
# from `num_rows_f`, a scratch variable that every section of this notebook
# rebinds; out-of-order execution would otherwise create the wrong shape.
new_angles = new_h5_file.create_dataset(
    "angles",
    shape=(len(f_angles) + len(p_angles),) + f_angles.shape[1:],
    dtype=f_angles.dtype,
)

In [8]:
# Fill the merged dataset: emg rows occupy the front, pion rows the tail.
new_angles[:len(f_angles)] = f_angles[...]
new_angles[len(f_angles):] = p_angles[...]

## Energies 

In [9]:
# Handles to the "energies" datasets of the emg (f) and pion (p) files.
f_energies, p_energies = f["energies"], p["energies"]

# Row count and element type of the emg dataset, then the pion row count.
print(f_energies.shape[0], f_energies.dtype, p_energies.shape[0], sep="\n")

20613195
float32
2868354


In [10]:
# Full shape tuples (not just row counts) of the two "energies" datasets.
num_rows_f, num_rows_p = f_energies.shape, p_energies.shape
print(num_rows_f, num_rows_p, sep="\n")

(20613195, 1)
(2868354, 1)


In [11]:
# Allocate the merged "energies" dataset: emg rows followed by pion rows.
# The trailing dimensions come straight from the emg dataset instead of the
# shared scratch variable `num_rows_f`, which other sections rebind, so this
# cell no longer depends on which shape cell happened to run last.
new_energies = new_h5_file.create_dataset(
    "energies",
    shape=(len(f_energies) + len(p_energies),) + f_energies.shape[1:],
    dtype=f_energies.dtype,
)

In [12]:
# Fill the merged dataset: emg rows occupy the front, pion rows the tail.
new_energies[:len(f_energies)] = f_energies[...]
new_energies[len(f_energies):] = p_energies[...]

## Deal with event_hits_index

In [13]:
# Handles to the "event_hits_index" datasets of the emg and pion files.
f_event_hits_index, p_event_hits_index = f["event_hits_index"], p["event_hits_index"]

# Row count and element type of the emg dataset, then the pion row count.
print(
    f_event_hits_index.shape[0],
    f_event_hits_index.dtype,
    p_event_hits_index.shape[0],
    sep="\n",
)

20613195
int64
2868354


In [14]:
# Full shape tuples of the two "event_hits_index" datasets.
num_rows_f, num_rows_p = f_event_hits_index.shape, p_event_hits_index.shape
print(num_rows_f, num_rows_p, sep="\n")

(20613195,)
(2868354,)


In [15]:
# Allocate the merged 1-D "event_hits_index" dataset (emg + pion events).
new_event_hits_index = new_h5_file.create_dataset(
    "event_hits_index",
    shape=(f_event_hits_index.shape[0] + p_event_hits_index.shape[0],),
    dtype=f_event_hits_index.dtype,
)

In [16]:
# Copy the raw offsets: emg events first, pion events after. The pion
# offsets are still relative to the pion file at this point; a later cell
# shifts them.
new_event_hits_index[:len(f_event_hits_index)] = f_event_hits_index[...]
new_event_hits_index[len(f_event_hits_index):] = p_event_hits_index[...]

In [17]:
# Number of hits in the emg file; this is the amount added to the pion
# events' hit offsets in the merged file.
shift = f["hit_charge"].shape[0]

In [18]:
# Positions of the pion events inside the merged event_hits_index dataset.
idxs = list(
    range(f_event_hits_index.shape[0], new_event_hits_index.shape[0])
)

In [19]:
# Offset the pion events' hit indices by the number of emg hits.
# Written as a single contiguous slice assignment computed from the pion
# source data, instead of an in-place `+=` through a multi-million-element
# index list: this is much faster with h5py and is idempotent, so re-running
# the cell cannot apply the shift twice.
new_event_hits_index[len(f_event_hits_index):] = p_event_hits_index[:] + shift

In [20]:
#for i in range(len(f_event_hits_index), len(new_event_hits_index)):
    #new_event_hits_index[i] += shift

## Event_ids 

In [21]:
# Handles to the "event_ids" datasets of the emg (f) and pion (p) files.
f_event_ids, p_event_ids = f["event_ids"], p["event_ids"]

# Row count and element type of the emg dataset, then the pion row count.
print(f_event_ids.shape[0], f_event_ids.dtype, p_event_ids.shape[0], sep="\n")

20613195
int32
2868354


In [22]:
# Full shape tuples of the two "event_ids" datasets.
num_rows_f, num_rows_p = f_event_ids.shape, p_event_ids.shape
print(num_rows_f, num_rows_p, sep="\n")

(20613195,)
(2868354,)


In [23]:
# Allocate the merged 1-D "event_ids" dataset (emg + pion events).
new_event_ids = new_h5_file.create_dataset(
    "event_ids",
    shape=(p_event_ids.shape[0] + f_event_ids.shape[0],),
    dtype=f_event_ids.dtype,
)

In [24]:
# Fill the merged dataset: emg ids first, pion ids after (copied unchanged).
new_event_ids[:len(f_event_ids)] = f_event_ids[...]
new_event_ids[len(f_event_ids):] = p_event_ids[...]

## Hit Charges

In [25]:
hdf5_f_hit_charge = f["hit_charge"]
print(hdf5_f_hit_charge.shape[0], hdf5_f_hit_charge.dtype, sep="\n")

# Memory-map the dataset's bytes straight from the emg file, using the
# offset, shape and dtype reported by the HDF5 dataset handle.
f_hit_charge = np.memmap(
    emg_data_path,
    dtype=hdf5_f_hit_charge.dtype,
    mode="r",
    offset=hdf5_f_hit_charge.id.get_offset(),
    shape=hdf5_f_hit_charge.shape,
)

26925826390
float32


In [26]:
hdf5_p_hit_charge = p["hit_charge"]
print(hdf5_p_hit_charge.shape[0])

# Memory-map the dataset's bytes straight from the pion file, using the
# offset, shape and dtype reported by the HDF5 dataset handle.
p_hit_charge = np.memmap(
    pion_data_path,
    dtype=hdf5_p_hit_charge.dtype,
    mode="r",
    offset=hdf5_p_hit_charge.id.get_offset(),
    shape=hdf5_p_hit_charge.shape,
)

4607347439


In [27]:
# Full shape tuples of the two memory-mapped "hit_charge" arrays.
num_rows_f, num_rows_p = f_hit_charge.shape, p_hit_charge.shape
print(num_rows_f, num_rows_p, sep="\n")

(26925826390,)
(4607347439,)


In [28]:
# Allocate the merged 1-D "hit_charge" dataset (emg hits + pion hits).
new_hit_charge = new_h5_file.create_dataset(
    "hit_charge",
    shape=(p_hit_charge.shape[0] + f_hit_charge.shape[0],),
    dtype=hdf5_f_hit_charge.dtype,
)

In [29]:
# Boundary between emg hits and pion hits in the merged hit_charge dataset.
index = f_hit_charge.shape[0]

In [30]:
# Stream the memory-mapped hits into the merged dataset: emg hits fill
# [0, index), pion hits fill the remainder.
new_hit_charge[:index] = f_hit_charge
new_hit_charge[index:] = p_hit_charge

## Hit PMT

In [31]:
hdf5_f_hit_pmt = f["hit_pmt"]
print(hdf5_f_hit_pmt.shape[0], hdf5_f_hit_pmt.dtype, sep="\n")

# Memory-map the dataset's bytes straight from the emg file, using the
# offset, shape and dtype reported by the HDF5 dataset handle.
f_hit_pmt = np.memmap(
    emg_data_path,
    dtype=hdf5_f_hit_pmt.dtype,
    mode="r",
    offset=hdf5_f_hit_pmt.id.get_offset(),
    shape=hdf5_f_hit_pmt.shape,
)

26925826390
int32


In [32]:
hdf5_p_hit_pmt = p["hit_pmt"]
print(len(hdf5_p_hit_pmt))

# Memory-map the dataset's bytes straight from the pion file.
# The element type is taken from the pion dataset itself: the original used
# the emg dataset's dtype, which would misinterpret the mapped bytes if the
# two files ever stored hit_pmt with different types, and was inconsistent
# with how p_hit_charge is mapped.
p_hit_pmt = np.memmap(pion_data_path, mode="r", shape=hdf5_p_hit_pmt.shape,
                                    offset=hdf5_p_hit_pmt.id.get_offset(), dtype=hdf5_p_hit_pmt.dtype)

4607347439


In [33]:
# Full shape tuples of the two memory-mapped "hit_pmt" arrays.
num_rows_f, num_rows_p = f_hit_pmt.shape, p_hit_pmt.shape
print(num_rows_f, num_rows_p, sep="\n")

(26925826390,)
(4607347439,)


In [34]:
# Allocate the merged 1-D "hit_pmt" dataset (emg hits + pion hits).
new_hit_pmt = new_h5_file.create_dataset(
    "hit_pmt",
    shape=(p_hit_pmt.shape[0] + f_hit_pmt.shape[0],),
    dtype=hdf5_f_hit_pmt.dtype,
)

In [35]:
# Boundary between emg hits and pion hits in the merged hit_pmt dataset.
index = f_hit_pmt.shape[0]

In [36]:
# Stream the memory-mapped hits into the merged dataset: emg hits fill
# [0, index), pion hits fill the remainder.
new_hit_pmt[:index] = f_hit_pmt
new_hit_pmt[index:] = p_hit_pmt

## Hit Time

In [37]:
hdf5_f_hit_time = f["hit_time"]
print(hdf5_f_hit_time.shape[0], hdf5_f_hit_time.dtype, sep="\n")

# Memory-map the dataset's bytes straight from the emg file, using the
# offset, shape and dtype reported by the HDF5 dataset handle.
f_hit_time = np.memmap(
    emg_data_path,
    dtype=hdf5_f_hit_time.dtype,
    mode="r",
    offset=hdf5_f_hit_time.id.get_offset(),
    shape=hdf5_f_hit_time.shape,
)

26925826390
float32


In [38]:
hdf5_p_hit_time = p["hit_time"]
print(len(hdf5_p_hit_time))

# Memory-map the dataset's bytes straight from the pion file.
# The element type is taken from the pion dataset itself: the original used
# the emg dataset's dtype, which would misinterpret the mapped bytes if the
# two files ever stored hit_time with different types, and was inconsistent
# with how p_hit_charge is mapped.
p_hit_time = np.memmap(pion_data_path, mode="r", shape=hdf5_p_hit_time.shape,
                                    offset=hdf5_p_hit_time.id.get_offset(), dtype=hdf5_p_hit_time.dtype)

4607347439


In [39]:
# Full shape tuples of the two memory-mapped "hit_time" arrays.
num_rows_f, num_rows_p = f_hit_time.shape, p_hit_time.shape
print(num_rows_f, num_rows_p, sep="\n")

(26925826390,)
(4607347439,)


In [40]:
# Allocate the merged 1-D "hit_time" dataset (emg hits + pion hits).
new_hit_time = new_h5_file.create_dataset(
    "hit_time",
    shape=(p_hit_time.shape[0] + f_hit_time.shape[0],),
    dtype=hdf5_f_hit_time.dtype,
)

In [41]:
# Boundary between emg hits and pion hits in the merged hit_time dataset.
index = f_hit_time.shape[0]

In [42]:
# Stream the memory-mapped hits into the merged dataset: emg hits fill
# [0, index), pion hits fill the remainder.
new_hit_time[:index] = f_hit_time
new_hit_time[index:] = p_hit_time

## Labels 

In [43]:
# Handle to the emg "labels" dataset; show its row count and element type.
f_labels = f["labels"]
print(f_labels.shape[0], f_labels.dtype, sep="\n")

20613195
int32


In [44]:
# Handle to the pion "labels" dataset; show its row count and the label
# stored for the first pion event.
p_labels = p["labels"]
print(p_labels.shape[0], p_labels[0], sep="\n")

2868354
-1


In [45]:
# Full shape tuples of the two "labels" datasets.
num_rows_f, num_rows_p = f_labels.shape, p_labels.shape
print(num_rows_f, num_rows_p, sep="\n")

(20613195,)
(2868354,)


In [46]:
# Allocate the merged 1-D "labels" dataset (emg + pion events).
new_labels = new_h5_file.create_dataset(
    "labels",
    shape=(p_labels.shape[0] + f_labels.shape[0],),
    dtype=f_labels.dtype,
)

In [47]:
# Copy the labels as stored: emg labels first, pion labels after. The pion
# entries are overwritten with the pion class label in the following cell.
new_labels[:len(f_labels)] = f_labels[...]
new_labels[len(f_labels):] = p_labels[...]

In [48]:
# Give every appended pion event the class label 3, replacing the labels
# copied from the pion file. Done as one vectorised slice write instead of
# a Python loop issuing one single-element HDF5 write per event, which is
# orders of magnitude slower for millions of rows. The result is identical.
new_labels[len(f_labels):] = np.full(len(p_labels), 3, dtype=new_labels.dtype)

## Positions

In [49]:
# Handles to the "positions" datasets of the emg (f) and pion (p) files.
f_positions, p_positions = f["positions"], p["positions"]

# Row count and element type of the emg dataset, then the pion row count.
print(f_positions.shape[0], f_positions.dtype, p_positions.shape[0], sep="\n")

20613195
float32
2868354


In [50]:
# Full shape tuples (not just row counts) of the two "positions" datasets.
num_rows_f, num_rows_p = f_positions.shape, p_positions.shape
print(num_rows_f, num_rows_p, sep="\n")

(20613195, 1, 3)
(2868354, 1, 3)


In [51]:
# Allocate the merged "positions" dataset: emg rows followed by pion rows.
# The trailing dimensions come straight from the emg dataset instead of the
# shared scratch variable `num_rows_f`, which other sections rebind, so this
# cell no longer depends on which shape cell happened to run last.
new_positions = new_h5_file.create_dataset(
    "positions",
    shape=(len(f_positions) + len(p_positions),) + f_positions.shape[1:],
    dtype=f_positions.dtype,
)

In [52]:
# Fill the merged dataset: emg rows occupy the front, pion rows the tail.
new_positions[:len(f_positions)] = f_positions[...]
new_positions[len(f_positions):] = p_positions[...]

## Root Files

In [53]:
# Handles to the "root_files" datasets of the emg (f) and pion (p) files.
f_root_files, p_root_files = f["root_files"], p["root_files"]

# Row count and element type of the emg dataset, then the pion row count.
print(f_root_files.shape[0], f_root_files.dtype, p_root_files.shape[0], sep="\n")

20613195
object
2868354


In [54]:
# Full shape tuples of the two "root_files" datasets.
num_rows_f, num_rows_p = f_root_files.shape, p_root_files.shape
print(num_rows_f, num_rows_p, sep="\n")

(20613195,)
(2868354,)


In [55]:
# Allocate the merged 1-D "root_files" dataset (emg + pion events), reusing
# the emg dataset's element type.
new_root_files = new_h5_file.create_dataset(
    "root_files",
    shape=(p_root_files.shape[0] + f_root_files.shape[0],),
    dtype=f_root_files.dtype,
)

In [56]:
# Fill the merged dataset: emg entries first, pion entries after.
new_root_files[:len(f_root_files)] = f_root_files[...]
new_root_files[len(f_root_files):] = p_root_files[...]

## Veto 

In [57]:
# Handles to the "veto" datasets of the emg (f) and pion (p) files.
f_veto, p_veto = f["veto"], p["veto"]

# Row count and element type of the emg dataset, then the pion row count.
print(f_veto.shape[0], f_veto.dtype, p_veto.shape[0], sep="\n")

20613195
bool
2868354


In [58]:
# Full shape tuples of the two "veto" datasets.
num_rows_f, num_rows_p = f_veto.shape, p_veto.shape
print(num_rows_f, num_rows_p, sep="\n")

(20613195,)
(2868354,)


In [59]:
# Allocate the merged 1-D "veto" dataset (emg + pion events).
new_veto = new_h5_file.create_dataset(
    "veto",
    shape=(p_veto.shape[0] + f_veto.shape[0],),
    dtype=f_veto.dtype,
)

In [60]:
# Fill the merged dataset: emg flags first, pion flags after.
new_veto[:len(f_veto)] = f_veto[...]
new_veto[len(f_veto):] = p_veto[...]

## Veto 2

In [61]:
# Handles to the "veto2" datasets of the emg (f) and pion (p) files.
f_veto2, p_veto2 = f["veto2"], p["veto2"]

# Row counts of both datasets, then the emg element type.
print(f_veto2.shape[0], p_veto2.shape[0], f_veto2.dtype, sep="\n")

# Full shape tuples of both datasets.
num_rows_f, num_rows_p = f_veto2.shape, p_veto2.shape
print(num_rows_f, num_rows_p, sep="\n")

20613195
2868354
bool
(20613195,)
(2868354,)


In [62]:
# Full shape tuples of the two "veto2" datasets (repeats the previous cell).
num_rows_f, num_rows_p = f_veto2.shape, p_veto2.shape
print(num_rows_f, num_rows_p, sep="\n")

(20613195,)
(2868354,)


In [63]:
# Allocate the merged 1-D "veto2" dataset (emg + pion events).
new_veto2 = new_h5_file.create_dataset(
    "veto2",
    shape=(p_veto2.shape[0] + f_veto2.shape[0],),
    dtype=f_veto2.dtype,
)

In [64]:
# Fill the merged dataset: emg flags first, pion flags after.
new_veto2[:len(f_veto2)] = f_veto2[...]
new_veto2[len(f_veto2):] = p_veto2[...]

In [65]:
new_data_path = "/data/WatChMaL/data/IWCD_mPMT_Short_emgp0_E0to1000MeV_digihits.h5"

# Close the write handle first so every dataset is flushed to disk before the
# file is inspected; the original never closed it, leaving the output at risk
# if the kernel died. Closing an already-closed h5py file is a no-op, so this
# cell stays re-runnable.
new_h5_file.close()

# Re-open read-only and print a one-line summary (name, shape, type) of each
# merged dataset. The context manager releases the read handle afterwards.
with h5py.File(new_data_path, "r") as h:
    for key in h.keys():
        print(h[key])

<HDF5 dataset "angles": shape (23481549, 2), type "<f4">
<HDF5 dataset "energies": shape (23481549, 1), type "<f4">
<HDF5 dataset "event_hits_index": shape (23481549,), type "<i8">
<HDF5 dataset "event_ids": shape (23481549,), type "<i4">
<HDF5 dataset "hit_charge": shape (31533173829,), type "<f4">
<HDF5 dataset "hit_pmt": shape (31533173829,), type "<i4">
<HDF5 dataset "hit_time": shape (31533173829,), type "<f4">
<HDF5 dataset "labels": shape (23481549,), type "<i4">
<HDF5 dataset "positions": shape (23481549, 1, 3), type "<f4">
<HDF5 dataset "root_files": shape (23481549,), type "|O">
<HDF5 dataset "veto": shape (23481549,), type "|b1">
<HDF5 dataset "veto2": shape (23481549,), type "|b1">
