In [None]:
import uproot
import awkward as ak
import numpy as np

# Generating HEP-like pseudo-data
The function below generates a ragged array with several fields (`points`, `features` and `mask`), representing HEP-like data for one of the ML benchmark models. The number of jets in each event is taken from `njets`.

In [None]:
def generate_pseudodata_from_njets(njets, rng=None):
    """Generate random per-jet pseudo-data grouped into events.

    For every jet, uniform ``float32`` values in ``[0, 1)`` are drawn for
    three fields and then regrouped so that event ``i`` holds ``njets[i]``
    jets.

    Parameters
    ----------
    njets : array-like of int
        Number of jets in each event; used as the counts for
        ``ak.unflatten``.
    rng : numpy.random.Generator, optional
        Random generator to draw from. When omitted, the notebook-level
        generator ``grdn`` is used, which must already be defined.

    Returns
    -------
    ak.Array
        Record array with fields ``"points"`` (2 x 100 values per jet),
        ``"features"`` (5 x 100 values per jet) and ``"mask"``
        (1 x 100 values per jet).
    """
    if rng is None:
        # Backward-compatible fallback: look the global up at call time so
        # callers that rebind ``grdn`` (e.g. with a fixed seed) keep working.
        rng = grdn

    total_jets = np.sum(njets)

    # The draw order (points, features, mask) is kept fixed so that a seeded
    # generator always reproduces the same data.
    points = ak.unflatten(rng.random((total_jets, 2, 100), dtype=np.float32), njets)
    features = ak.unflatten(rng.random((total_jets, 5, 100), dtype=np.float32), njets)
    mask = ak.unflatten(rng.random((total_jets, 1, 100), dtype=np.float32), njets)

    return ak.Array({"points": points,
            "features": features,
            "mask": mask})

## Writing Data to files
HEP commonly uses the ROOT file format. `uproot` is a package for I/O between file formats such as ROOT and in-memory data such as `numpy` arrays, ragged arrays (via `awkward`), `pandas` DataFrames, etc.

In [None]:
# Output location and dataset size: N_FILES files, each filled with
# N_BASKETS chunks of N_EVENTS_PER_BASKET events.
FILE_BASE = "../data/PseudoData_{nfile}.root"
N_FILES = 100
N_EVENTS_PER_BASKET = 2_000
N_BASKETS = 20

# Notebook-level generator; it is rebound with a fixed seed for every
# (file, basket) pair inside the loop below.
grdn = np.random.default_rng()
file_names = []

for nfile in range(N_FILES):
    file_name = FILE_BASE.format(nfile=nfile)
    # The context manager closes the file even if filling it raises, so a
    # failed run does not leave a dangling, unfinalised ROOT file handle.
    with uproot.recreate(file_name, compression = uproot.ZLIB(1)) as file:
        print("Writing file...", file_name)
        for nbasket in range(N_BASKETS):
            # Seeding with [nfile, nbasket] makes each chunk reproducible
            # independently of the others.
            grdn = np.random.default_rng([nfile, nbasket])
            # Jet multiplicity per event, drawn from 1..9 (upper bound exclusive).
            njets = ak.Array(grdn.integers(1, 10, size=N_EVENTS_PER_BASKET))
            if nbasket == 0:
                # The first chunk creates the "Events" tree with a "Jet" branch.
                file["Events"] = {"Jet": njets}
            else:
                # Later chunks are appended to the existing tree.
                file["Events"].extend({"Jet": njets})
    file_names.append(file_name)
        

## Reading back data

The data can be read back with `uproot.open`.

In [None]:
# Re-open the last file produced by the writing loop. The path is built from
# FILE_BASE and N_FILES so that it always matches the names actually written.
ftest = uproot.open(FILE_BASE.format(nfile=N_FILES - 1))
treetest = ftest["Events"]

In [None]:
# Sanity check: keys stored in the file, keys of the "Events" tree, and the
# tree's number of entries.
print(ftest.keys(), treetest.keys(), treetest.num_entries)