This notebook converts the ML outputs (softmax probabilities and predicted classes) into a ROOT file, which is the format that Charlie's analysis code can read.

In [1]:
%load_ext autoreload
%matplotlib inline
%autoreload 2

In [2]:
import uproot3 as uproot
import numpy as np
import matplotlib.pyplot as plt
import os
import sys
import h5py

In [3]:
# Put the parent directory on the import path so modules one level up can be imported.
sys.path.append(os.pardir)

## ML Outputs

In [4]:
locs_physics_sample = '/home/hlahiouel/WatChMaL/outputs/2021-04-29/10-05-34/outputs'
titles_physics_sample = 'Physics Sample'


def _load_and_show(stem):
    """Load `<stem>.npy` from the run's output directory, print it, and return it."""
    values = np.load(locs_physics_sample + "/" + stem + ".npy")
    print(values)
    return values


# Loaded (and echoed) in the same order as before: softmax, labels, indices, predictions.
softmax_physics_sample = _load_and_show("softmax")
labels_physics_sample = _load_and_show("labels")
indices_physics_sample = _load_and_show("indices")
predictions_physics_sample = _load_and_show("predictions")

print(len(predictions_physics_sample))

# Positions of the events whose predicted class is 2, then how many there are.
class_2_rows = np.where(predictions_physics_sample == 2)
print(class_2_rows)

muons = len(class_2_rows[0])
print(muons)

[[1.53504673e-03 5.26998704e-03 9.83841419e-01 9.35359485e-03]
 [2.43088249e-02 2.67587882e-02 9.35608089e-01 1.33241834e-02]
 [2.04171915e-08 3.63517358e-07 9.99999642e-01 6.43729869e-09]
 ...
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00]
 [6.47507384e-02 1.10067636e-01 8.10013771e-01 1.51678268e-02]
 [2.49784842e-01 3.14024419e-01 2.06582069e-01 2.29608640e-01]]
[-1 -1 -1 ... -1 -1 -1]
[      0       1       2 ... 1897291 1897292 1897293]
[2 2 2 ... 3 2 1]
1897294
(array([      0,       1,       2, ..., 1897286, 1897290, 1897292]),)
936040


In [5]:
# Report the dtype of each array; they must match the branch types used when writing the ROOT file.
for label, values in (('Softmax Datatype:', softmax_physics_sample),
                      ('Prediction Datatype:', predictions_physics_sample)):
    print(label, values.dtype)

Softmax Datatype: float32
Prediction Datatype: int64


In [6]:
# File numbers whose events are removed from every output array later in the notebook.
# NOTE(review): presumably files whose fiTQun output is missing or unusable — confirm the origin of this list.
bad_fitqun_files = [
    102, 104, 107, 113, 114, 119, 200,
    312, 314, 315, 316, 317, 319, 320, 321, 323, 325, 326,
    436, 437, 439, 442, 443, 445, 447, 450,
    4027,
    5030, 5031, 5032, 5033, 5034, 5035, 5036, 5037, 5038,
    5329, 5330, 5331, 5332, 5333, 5339, 5340, 5365, 5366, 5367,
    5400, 5401, 5456, 5458, 5459, 5460, 5461, 5462, 5463, 5464, 5465, 5466, 5467, 5468,
    5508, 5509, 5510, 5511, 5515, 5517, 5519, 5520, 5521, 5522, 5524,
    7601, 7602, 7603, 7606, 7608, 7612,
    8705, 8720, 8846, 8847, 8848, 8849, 8851, 8852, 8853, 8855, 8856, 8857, 8858, 8861,
    9338,
]

In [7]:
# Sanity check: how many file numbers are in bad_fitqun_files.
print(len(bad_fitqun_files))

92


In [8]:
# `predictions` refers to the same array as the loaded predictions (no copy is made).
predictions = predictions_physics_sample

# One 1-D probability array per softmax column: 0=gamma, 1=electron, 2=muon, 3=pion.
p_gamma, p_electron, p_muon, p_pion = (
    softmax_physics_sample[:, class_idx] for class_idx in range(4)
)

In [9]:
# Confirm that a softmax column slice is still a NumPy array.
print(type(p_muon))

<class 'numpy.ndarray'>


In [10]:
# Check the scalar type of a single prediction entry.
print(type(predictions[0]))

<class 'numpy.int64'>


## H5 File

In [11]:
data_path = "/fast_scratch/WatChMaL/data/physics_samples_five_thousand_files_apr28.h5"

# Opened read-only and left open on purpose: later cells read datasets from `h5file`.
h5file = h5py.File(data_path, mode="r")

print(h5file.keys())

# Print the summary repr of each top-level entry in the file.
for dataset in h5file.values():
    print(dataset)

<KeysViewHDF5 ['angles', 'energies', 'event_hits_index', 'event_ids', 'hit_charge', 'hit_pmt', 'hit_time', 'labels', 'positions', 'root_files', 'veto', 'veto2']>
<HDF5 dataset "angles": shape (1897294, 2), type "<f4">
<HDF5 dataset "energies": shape (1897294, 1), type "<f4">
<HDF5 dataset "event_hits_index": shape (1897294,), type "<i8">
<HDF5 dataset "event_ids": shape (1897294,), type "<i4">
<HDF5 dataset "hit_charge": shape (2090271258,), type "<f4">
<HDF5 dataset "hit_pmt": shape (2090271258,), type "<i4">
<HDF5 dataset "hit_time": shape (2090271258,), type "<f4">
<HDF5 dataset "labels": shape (1897294,), type "<i4">
<HDF5 dataset "positions": shape (1897294, 1, 3), type "<f4">
<HDF5 dataset "root_files": shape (1897294,), type "|O">
<HDF5 dataset "veto": shape (1897294,), type "|b1">
<HDF5 dataset "veto2": shape (1897294,), type "|b1">


In [12]:
# Pull the two per-event columns needed for bookkeeping fully into memory as NumPy arrays.
root_files, event_ids = (
    np.array(h5file[column]) for column in ('root_files', 'event_ids')
)

In [13]:
# NOTE(review): this list is identical to the bad_fitqun_files defined earlier in the
# "ML Outputs" section; keep the two in sync, or drop one of them, so they cannot diverge.
bad_fitqun_files = [102, 104, 107, 113, 114, 119, 200, 312, 314, 315, 316, 317, 319, 320, 321, 323, 325, 326, 436, 437, 439, 442, 443, 445, 447, 450, 4027, 5030, 5031, 5032, 5033, 5034, 5035, 5036, 5037, 5038, 5329, 5330, 5331, 5332, 5333, 5339, 5340, 5365, 5366, 5367, 5400, 5401, 5456, 5458, 5459, 5460, 5461, 5462, 5463, 5464, 5465, 5466, 5467, 5468, 5508, 5509, 5510, 5511, 5515, 5517, 5519, 5520, 5521, 5522, 5524, 7601, 7602, 7603, 7606, 7608, 7612, 8705, 8720, 8846, 8847, 8848, 8849, 8851, 8852, 8853, 8855, 8856, 8857, 8858, 8861, 9338]

In [14]:
# Sanity check: how many file numbers are in bad_fitqun_files.
print(len(bad_fitqun_files))

92


In [15]:
# Derive an integer file number for every event from its ROOT file path.
#
# This used to be spread over five cells that appended to / mutated shared state,
# so re-running any one of them duplicated entries or failed. Everything is now
# rebuilt from `root_files` in a single pass, which makes the cell safe to re-run.

# Character range of the stringified path that holds the zero-padded file number.
# NOTE(review): these offsets depend on the exact path text stored in the H5 file
# (including any "b'" prefix that str() adds to bytes) — re-check them if the
# files are moved or the H5 file is regenerated.
ROOT_FILE_ID_SLICE = slice(79, 84)

# Keep `root_files` as an object array of plain strings, as the original loop did.
root_files = np.array([str(path) for path in root_files], dtype=object)

# int() raises ValueError if the slice does not contain digits, which flags a shifted offset.
root_file_ids = np.array([int(path[ROOT_FILE_ID_SLICE]) for path in root_files])

In [20]:
# Row indices of every event whose file number appears in bad_fitqun_files.
#
# np.isin checks all events against the whole list in one pass, instead of one
# full scan of root_file_ids per bad file plus a growing concatenate.
# The indices are returned in ascending order; an empty list gives an empty array.
bad_fitqun_file_ids = np.flatnonzero(np.isin(root_file_ids, bad_fitqun_files))

In [21]:
# Number of events that will be removed.
print(bad_fitqun_file_ids.shape)

(32945,)


In [22]:
# Quick look at the row indices that will be removed (NumPy truncates the printout).
print(bad_fitqun_file_ids)

[  25703   25704   25705 ... 1737072 1737073 1737074]


In [23]:
# Remove every event listed in bad_fitqun_file_ids from all per-event arrays.
#
# np.delete is positional and each array is overwritten with its filtered version,
# so running this cell a second time would silently delete a different set of events.
# `root_files` is never filtered, so its length is used as the reference for
# "still unfiltered"; anything else means the cell was already applied (or the
# ML outputs and the H5 file do not describe the same events).
# NOTE(review): this also assumes the ML outputs are stored in the same row order as
# the H5 file; indices_physics_sample is loaded earlier but never checked — confirm.
n_events_raw = len(root_files)
per_event_arrays = {
    "predictions": predictions,
    "p_gamma": p_gamma,
    "p_electron": p_electron,
    "p_muon": p_muon,
    "p_pion": p_pion,
    "root_file_ids": root_file_ids,
    "event_ids": event_ids,
}
wrong_length = sorted(
    name for name, values in per_event_arrays.items() if len(values) != n_events_raw
)
if wrong_length:
    raise RuntimeError(
        "Expected unfiltered arrays of length {}, but these have a different length: {}. "
        "Re-run the cells that define them before dropping bad fiTQun events.".format(
            n_events_raw, wrong_length
        )
    )

predictions = np.delete(predictions, bad_fitqun_file_ids)
p_gamma = np.delete(p_gamma, bad_fitqun_file_ids)
p_electron = np.delete(p_electron, bad_fitqun_file_ids)
p_muon = np.delete(p_muon, bad_fitqun_file_ids)
p_pion = np.delete(p_pion, bad_fitqun_file_ids)
root_file_ids = np.delete(root_file_ids, bad_fitqun_file_ids)
event_ids = np.delete(event_ids, bad_fitqun_file_ids)

In [24]:
# Number of events left after the bad-file events were removed.
print(len(predictions))

1864349


## ROOT File 

In [25]:
output_root_path = "/home/hlahiouel/ml_output_root_files/watchmal_ml_outputs_4_class_model_may3.root"

# Branch name -> storage type; the dict order is the branch order in the tree.
branch_types = {
    "predictions": "int",
    "p_gamma": "float32",
    "p_electron": "float32",
    "p_muon": "float32",
    "p_pion": "float32",
    "event_ids": "int",
    "root_file_ids": "int",
}

# Branch name -> filtered per-event array to store in that branch.
branch_values = {
    "predictions": predictions,
    "p_gamma": p_gamma,
    "p_electron": p_electron,
    "p_muon": p_muon,
    "p_pion": p_pion,
    "event_ids": event_ids,
    "root_file_ids": root_file_ids,
}

# Create the file, declare the tree, then fill all branches in a single extend() call.
with uproot.create(output_root_path) as root_out:
    root_out["WatChMaL_ML_Outputs"] = uproot.newtree(branch_types)
    root_out["WatChMaL_ML_Outputs"].extend(branch_values)

In [26]:
# Re-open the ROOT file written above to check that its contents read back correctly.
new_file = uproot.open("/home/hlahiouel/ml_output_root_files/watchmal_ml_outputs_4_class_model_may3.root")
new_file

<ROOTDirectory b'watchmal_ml_outputs_4_class_model_may3.root' at 0x7f73cc263940>

In [27]:
# Read back the "predictions" branch as a NumPy array.
new_file["WatChMaL_ML_Outputs"].array("predictions")

array([2, 2, 2, ..., 3, 2, 1])

In [28]:
# Read back the "p_gamma" branch as a NumPy array.
new_file["WatChMaL_ML_Outputs"].array("p_gamma")

array([1.5350467e-03, 2.4308825e-02, 2.0417191e-08, ..., 0.0000000e+00,
       6.4750738e-02, 2.4978484e-01], dtype=float32)

In [29]:
# Read back the "event_ids" branch as a NumPy array.
new_file["WatChMaL_ML_Outputs"].array("event_ids")

array([  0,   1,   2, ..., 350, 351, 352])

In [30]:
# Read back the "root_file_ids" branch as a NumPy array.
new_file["WatChMaL_ML_Outputs"].array("root_file_ids")

array([   0,    0,    0, ..., 9968, 9968, 9968])

In [None]:
# Open fiTQun file number 00000 to inspect its layout.
fitqun_file = uproot.open("/home/hlahiouel/physics_samples/root_files/iwcd_p320ka_w750m_1e17pot_2p39_fitqun.00000.root")
fitqun_file

In [None]:
# List the top-level objects stored in the fiTQun file.
fitqun_file.keys()

In [None]:
# Look up the object stored under the 'fiTQun' key.
fitqun_file['fiTQun']

In [None]:
# List the keys available under 'fiTQun'.
fitqun_file['fiTQun'].keys()