In [1]:
import os
import pickle as pkl

import matplotlib.pyplot as plt
import numpy as np
import torch

import awkward as ak
import fastjet
import vector

from copy import deepcopy

import logging
from pathlib import Path
from dotenv import find_dotenv, load_dotenv
import argparse
import awkward as ak
import pandas as pd
import mplhep as hep
import os
import os.path as osp
import h5py
import json

In [4]:
! pip install h5py --upgrade

Collecting h5py
  Obtaining dependency information for h5py from https://files.pythonhosted.org/packages/af/26/f231ee425c8df93c1abbead3d90ea4a5ff3d6aa49e0edfd3b4c017e74844/h5py-3.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading h5py-3.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.5 kB)
Downloading h5py-3.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.4/5.4 MB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: h5py
  Attempting uninstall: h5py
    Found existing installation: h5py 3.9.0
    Uninstalling h5py-3.9.0:
      Successfully uninstalled h5py-3.9.0
Successfully installed h5py-3.11.0


# Examine saved dataset

### Load the saved dataset (this takes a while)

In [2]:
save_path = "../data/val"

# Open the file read-only. Everything needed later is copied into memory
# before the `with` block closes the file.
with h5py.File(f'{save_path}/val_20_30.h5', 'r') as hdf:
    # `[:]` reads each dataset in full into memory.
    particles_features = hdf["particles"]["features"][:]
    particles_labels = hdf["particles"]["labels"][:]

    # Every entry of "subjets" is a JSON string; decode them one at a time
    # into plain Python objects.
    dataset = hdf["subjets"]
    subjets_data = [json.loads(json_str) for json_str in dataset]

# `subjets_data` now holds one deserialized entry per element of "subjets"

### ~400k jets in total  
### Each jet contains 128 particles (some are zero-padded)  
### Each particle has 4 features: part_deta, part_dphi, part_pt_log, part_e_log

In [3]:
particles_features.shape

(403000, 4, 128)

In [5]:
particles_labels.shape

(403000,)

In [6]:
len(subjets_data)

403000

### Each jet is clustered into up to 20 subjets (zero-padded if there are fewer)  

In [7]:
len(subjets_data[0])

20

### Each subjet is a dictionary

In [8]:
type(subjets_data[0][0])

dict

In [3]:
subjets_data[0][0].keys()

dict_keys(['features', 'indices'])

### 'features' contains subjet-level information ($p_T$, $\eta$, $\phi$ of the subjet, and its number of particles)

In [4]:
subjets_data[0][0]['features']

{'pT': 563.1187802221922,
 'eta': 0.4276622300992181,
 'phi': 5.0734595349950595,
 'num_ptcls': 15}

### 'indices' are the indices of the particles contained in the subjet
### Each subjet lists 30 particle indices ordered by $p_T$ (padded if the subjet has fewer than 30 particles)  
An index of `-1` marks a padded slot, i.e. no particle

In [11]:
len(subjets_data[0][0]['indices'])

30

In [12]:
subjets_data[0][0]['indices']

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 14,
 15,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1]

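### We can use the indices to get the particle-level features
Keep in mind that NumPy interprets the padding value `-1` as "last particle" rather than "no particle".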

In [18]:
# Gather the particle-level features of subjet 0 in jet 0.
subjet_0_0_indices = np.asarray(subjets_data[0][0]['indices'])

# Fancy indexing returns a copy with one row per index and one column per
# feature, so the in-place write below never touches `particles_features`.
subjet_0_0_particles = particles_features[0, :, subjet_0_0_indices]

# -1 marks a padded slot, but NumPy reads it as "the last particle of the jet".
# Zero those rows explicitly instead of relying on the last particle slot
# happening to be zero-padded.
subjet_0_0_particles[subjet_0_0_indices < 0] = 0

print(f"shape: {subjet_0_0_particles.shape}")
print(f"0th particle: {subjet_0_0_particles[0]}")
num_ptcls = subjets_data[0][0]['features']['num_ptcls']
# The first `num_ptcls` rows are real particles; the rest are padding.
print(f"real particles: {subjet_0_0_particles[:num_ptcls]}")
print(f"padded particles: {subjet_0_0_particles[num_ptcls:]}")

shape: (30, 4)
0th particle: [ 8.07034969e-03 -2.84028053e-03  5.49364185e+00  5.58603764e+00]
real particles: [[ 8.07034969e-03 -2.84028053e-03  5.49364185e+00  5.58603764e+00]
 [ 7.74672627e-03 -4.23669815e-04  4.88897181e+00  4.98123503e+00]
 [ 1.33943260e-02  3.03339958e-03  4.23767281e+00  4.33226681e+00]
 [-2.37070918e-02  2.46822834e-03  3.04663992e+00  3.12640619e+00]
 [-8.56129825e-02  6.96027279e-03  2.87129927e+00  2.92895222e+00]
 [-2.45730877e-02 -1.53195858e-03  2.65378785e+00  2.73322177e+00]
 [-9.60916281e-04  3.13551426e-02  2.59992838e+00  2.68864846e+00]
 [ 3.11870575e-02 -2.16490030e-02  2.56456566e+00  2.66667747e+00]
 [-7.95053542e-02 -3.45020294e-02  2.35992312e+00  2.41960835e+00]
 [-7.25546777e-02 -1.01785660e-02  2.17573619e+00  2.23777437e+00]
 [-1.52146608e-01  9.38500166e-02  2.01040292e+00  2.04809928e+00]
 [ 2.57428885e-02 -1.85542107e-02  1.55479038e+00  1.65457439e+00]
 [-2.89505124e-02  2.42720842e-02  1.52238345e+00  1.60014784e+00]
 [ 2.45089233e-02 