In [1]:
import awkward as ak
import numpy as np
import pandas as pd
import vector
import matplotlib.pyplot as plt
import mplhep as hep
import torch
from sklearn.metrics import auc
import os
import os.path as osp

plt.style.use(hep.style.ROOT)

In [2]:
# !pwd

In [3]:
# !nvidia-smi -q

In [4]:
label = "test"
hdf5_file = f"/ssl-jet-vol-v2/toptagging/{label}/raw/{label}.h5"
vector.register_awkward()

df = pd.read_hdf(hdf5_file, key="table")

In [5]:
def _col_list(prefix, max_particles=200):
    return ["%s_%d" % (prefix, i) for i in range(max_particles)]

_px = df[_col_list("PX")].values
_py = df[_col_list("PY")].values
_pz = df[_col_list("PZ")].values
_e = df[_col_list("E")].values

mask = _e > 0
n_particles = np.sum(mask, axis=1)

px = ak.unflatten(_px[mask], n_particles)
py = ak.unflatten(_py[mask], n_particles)
pz = ak.unflatten(_pz[mask], n_particles)
energy = ak.unflatten(_e[mask], n_particles)

p4 = ak.zip(
            {
                "px": px,
                "py": py,
                "pz": pz,
                "energy": energy,
            },
            with_name="Momentum4D",
        )

jet_p4 = ak.sum(p4, axis=-1)

# outputs
v = {}
v["label"] = df["is_signal_new"].values

v["jet_pt"] = jet_p4.pt.to_numpy()
v["jet_eta"] = jet_p4.eta.to_numpy()
v["jet_phi"] = jet_p4.phi.to_numpy()
v["jet_energy"] = jet_p4.energy.to_numpy()
v["jet_mass"] = jet_p4.mass.to_numpy()
v["jet_nparticles"] = n_particles

v["part_px"] = px
v["part_py"] = py
v["part_pz"] = pz
v["part_energy"] = energy

v["part_deta"] = p4.deltaeta(jet_p4)
v["part_dphi"] = p4.deltaphi(jet_p4)

part_pt = np.hypot(v["part_px"], v["part_py"])

In [6]:
# v["part_deta"][0]

In [7]:
# part_pt[0]

In [8]:
# max_nconstit = 50
# i = 0
# pt = torch.from_numpy(part_pt[i].to_numpy().reshape(-1, 1))[:max_nconstit]
# padded_pt = pt
# if pt.shape[0] < max_nconstit:
#     zeros = torch.zeros(max_nconstit - pt.shape[0], 1)
#     padded_pt = torch.cat([pt, zeros], axis=0)
# padded_pt

In [9]:
def zero_pad(arr, max_nconstit=50):
    """
    arr: torch tensor
    """
    arr = arr[:max_nconstit]
    if arr.shape[0] < max_nconstit:
        zeros = torch.zeros(max_nconstit - arr.shape[0], 1)
        padded_arr = torch.cat([arr, zeros], axis=0)
        return padded_arr
    else:
        return arr

In [10]:
# zero_pad(torch.from_numpy(part_pt[100].to_numpy().reshape(-1, 1))).shape

torch.Size([50, 1])

In [11]:
# jet_index = 0
# pt = zero_pad(torch.from_numpy(part_pt[jet_index].to_numpy().reshape(-1, 1)))
# deta = zero_pad(torch.from_numpy(v["part_deta"][jet_index].to_numpy().reshape(-1, 1)))
# dphi = zero_pad(torch.from_numpy(v["part_dphi"][jet_index].to_numpy().reshape(-1, 1)))

# jet = torch.cat([pt, deta, dphi], axis=1).transpose(0,1)

# jet.shape

torch.Size([3, 50])

In [12]:
# jet2 = jet.clone()

In [13]:
# features = []
# labels = []
# for jet_index in range(10):
#     pt = zero_pad(torch.from_numpy(part_pt[jet_index].to_numpy().reshape(-1, 1)))
#     deta = zero_pad(torch.from_numpy(v["part_deta"][jet_index].to_numpy().reshape(-1, 1)))
#     dphi = zero_pad(torch.from_numpy(v["part_dphi"][jet_index].to_numpy().reshape(-1, 1)))

#     jet = torch.cat([pt, deta, dphi], axis=1).transpose(0,1)
#     y = torch.tensor(v["label"][jet_index]).long()
    
#     features.append(jet)
#     labels.append(y)

In [14]:
# torch.stack(features)

tensor([[[ 2.0452e+02,  1.4516e+02,  7.2957e+01,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [-1.9442e-02,  5.8873e-03,  2.7067e-02,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 9.3275e-02, -3.4971e-02, -2.2574e-02,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00]],

        [[ 9.5436e+01,  9.4674e+01,  7.1292e+01,  ...,  1.2189e+00,
           1.0371e+00,  8.6181e-01],
         [ 3.1814e-02,  3.1344e-02,  1.5960e-02,  ..., -9.9717e-02,
          -1.2260e-01, -6.8209e-01],
         [ 2.9309e-02, -6.7194e-02,  4.9881e-02,  ..., -3.6181e-01,
           6.1498e-02, -1.9809e-02]],

        [[ 1.2621e+02,  6.6783e+01,  4.1358e+01,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [-7.3247e-03,  5.5220e-03,  1.4092e-02,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [-7.0035e-02,  7.6470e-03, -1.1063e-04,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00]],

        ...,

        [[ 1.5427e+02,  8.9057e+01,

In [15]:
# torch.stack(labels).shape

torch.Size([10])

In [16]:
features = []
labels = []
c = 0
processed_dir = f"/ssl-jet-vol-v2/toptagging/{label}/processed/3_features"
os.system(
        f"mkdir -p {processed_dir}"
    )
for jet_index in range(len(df)):
    pt = zero_pad(torch.from_numpy(part_pt[jet_index].to_numpy().reshape(-1, 1)))
    deta = zero_pad(torch.from_numpy(v["part_deta"][jet_index].to_numpy().reshape(-1, 1)))
    dphi = zero_pad(torch.from_numpy(v["part_dphi"][jet_index].to_numpy().reshape(-1, 1)))

    jet = torch.cat([pt, deta, dphi], axis=1).transpose(0,1)
    y = torch.tensor(v["label"][jet_index]).long()
    
    features.append(jet)
    labels.append(y)
    print("success")
    break
    
    if jet_index % 100000 == 0 and jet_index != 0:
        print(f"saving datafile data_{c}")
        torch.save(torch.stack(features), osp.join(processed_dir, f"data_{c}.pt"))
        torch.save(torch.stack(labels), osp.join(processed_dir, f"labels_{c}.pt"))
        c += 1
        features = []
        labels = []

saving datafile data_0
saving datafile data_1
saving datafile data_2
saving datafile data_3


In [17]:
# plt.hist(v["jet_phi"])

In [18]:
data_0 = torch.load(osp.join(processed_dir, "data_0.pt"))

In [19]:
data_0.shape

torch.Size([100001, 3, 50])