In [20]:
"""
*Filename: hffragDataLoader
*Description: In this Jupyter notebook the data contained in the larger hffrag ROOT file
is loaded into Python and a sample is taken. This sample is saved to NumPy files for future
use.
Date: 16/02/2023
Author: Kaylen Smith Darnbrook
"""
import uproot
import awkward as ak
import DeepSetNeuralNetArchitecture as DSNNA
from numpy.lib.recfunctions import structured_to_unstructured
import numpy as np

In [3]:
# The events live in a tree inside the ROOT file. The string handed to
# uproot.open has the form "<file path>:<object name>", so this opens the file
# and returns the "CharmAnalysis" object directly.
hffrag_tree_location = "/storage/epp2/phswmv/data/hffrag/hffrag.root:CharmAnalysis"
tree = uproot.open(hffrag_tree_location)

In [5]:
# Limits on how much data is read from the tree and kept per jet.
#   MAXTRACKS: per-jet track count handed to DSNNA.flatten when the matched
#              tracks are padded and flattened.
#   MAXEVENTS: upper bound on the tree entries read (passed as `entry_stop`).
#              It is an entry count, so it is stored as an integer rather than
#              the float literal 1e15; the value itself is unchanged and is
#              large enough to act as "no limit" for this sample.
MAXTRACKS = 64
MAXEVENTS = 10**15

In [1]:
# Branch names to read from the tree, grouped by the object they describe.

# Per-track quantities: kinematics first, then impact-parameter variables.
track_features = [
    "AnalysisTracks_pt",
    "AnalysisTracks_eta",
    "AnalysisTracks_phi",
    "AnalysisTracks_z0sinTheta",
    "AnalysisTracks_d0sig",
    "AnalysisTracks_d0",
    "AnalysisTracks_d0sigPV",
    "AnalysisTracks_d0PV",
]

# Per-jet quantities, ordered (pt, eta, phi, m).
jet_features = [
    "AnalysisAntiKt4TruthJets_pt",
    "AnalysisAntiKt4TruthJets_eta",
    "AnalysisAntiKt4TruthJets_phi",
    "AnalysisAntiKt4TruthJets_m",
]

# Quantities of the ghost-associated B hadrons of each jet, ordered (pt, eta, phi, m).
bhads_features = [
    "AnalysisAntiKt4TruthJets_ghostB_pt",
    "AnalysisAntiKt4TruthJets_ghostB_eta",
    "AnalysisAntiKt4TruthJets_ghostB_phi",
    "AnalysisAntiKt4TruthJets_ghostB_m",
]


In [7]:
# Read every requested branch (jets, then tracks, then B hadrons) from the tree
# in a single call, stopping at entry MAXEVENTS.
requested_branches = jet_features + track_features + bhads_features
features = tree.arrays(requested_branches, entry_stop=MAXEVENTS)

In [40]:
# Keep only the events that contain at least one jet with pt above 25000
# (units presumably MeV — TODO confirm against the ntuple definition).
all_jet_pt = features["AnalysisAntiKt4TruthJets_pt"]
n_jets_above_cut = ak.sum(all_jet_pt > 25000, axis=1)
events = features[n_jets_above_cut > 0]

In [50]:
# Keep the jet and B-hadron fields and take the first jet ([:, 0]) of every
# selected event, giving one jet record per event.
jet_and_bhad_fields = jet_features + bhads_features
jets = events[jet_and_bhad_fields][:, 0]

# Report the sample size and the number of per-track features.
print("The number of jets to train on is: ", len(jets))
print("The number of track features is: ", len(track_features))

The number of jets to train on is:  2676798
The number of track features is:  8


In [51]:
# Per-track fields of the selected events.
tracks = events[track_features]

# DSNNA.Match_Tracks produces the selector used to pick, for each event, the
# tracks associated with its jet (matching criterion lives in DSNNA — not
# visible here).
track_selector = DSNNA.Match_Tracks(jets, tracks)

# Apply the selector, then let DSNNA.flatten pad/flatten the result using
# MAXTRACKS as the per-jet track count.
matchedtracks = DSNNA.flatten(tracks[track_selector], MAXTRACKS)

In [52]:
# A jet counts as a b-jet when at least one of its ghost-associated B hadrons
# has pt above 5000 (units presumably MeV — TODO confirm). `bjets` is the
# per-jet boolean mask; only the flagged jets are kept.
bjets = ak.sum(jets["AnalysisAntiKt4TruthJets_ghostB_pt"] > 5000, axis=1) > 0
jets = jets[bjets]

# Jet kinematics, stacked column-wise in the order (pt, eta, phi, m).
jets_pt = jets["AnalysisAntiKt4TruthJets_pt"].to_numpy()
jets_eta = jets["AnalysisAntiKt4TruthJets_eta"].to_numpy()
jets_phi = jets["AnalysisAntiKt4TruthJets_phi"].to_numpy()
jets_m = jets["AnalysisAntiKt4TruthJets_m"].to_numpy()
b_jets = np.stack([jets_pt, jets_eta, jets_phi, jets_m], axis=-1)

# Target kinematics: the first ([:, 0]) ghost B hadron of every b-jet, stacked
# in the same (pt, eta, phi, m) order.
bhads_pt = jets["AnalysisAntiKt4TruthJets_ghostB_pt"][:, 0].to_numpy()
bhads_eta = jets["AnalysisAntiKt4TruthJets_ghostB_eta"][:, 0].to_numpy()
bhads_phi = jets["AnalysisAntiKt4TruthJets_ghostB_phi"][:, 0].to_numpy()
bhads_m = jets["AnalysisAntiKt4TruthJets_ghostB_m"][:, 0].to_numpy()
bhads = np.stack([bhads_pt, bhads_eta, bhads_phi, bhads_m], axis=-1)

# Number of target quantities per jet (second axis of `bhads`).
print("There are {} outputs".format(np.shape(bhads)[1]))

# Apply the same mask to the padded tracks so they stay row-aligned with `jets`.
matchedtracks = matchedtracks[bjets]

# Second axis of `matchedtracks`: with the recorded output of 64 this is the
# padded number of track slots per jet (MAXTRACKS), not the per-track feature count.
print("There are {} inputs".format(np.shape(matchedtracks)[1]))

There are 4 outputs
There are 64 inputs


In [57]:
# Sanity check: print the shapes of the target array, the jet array, and the
# (pt, eta, phi) slice of the targets, one per line.
print(*(np.shape(arr) for arr in (bhads, b_jets, bhads[:, :3])), sep="\n")

(1291472, 4)
(1291472, 4)
(1291472, 3)


In [54]:
# Convert the field-indexed jet and track arrays into plain unstructured arrays.
# NOTE(review): `jet_features` has four entries, so `jet_features[:-3]` keeps
# only the first one ("AnalysisAntiKt4TruthJets_pt"). If (pt, eta, phi) was
# intended this should presumably be `[:-1]` — confirm. `jets` is not referenced
# again by the later cells visible in this notebook, so the saved files do not
# depend on this line.
jets = structured_to_unstructured(jets[jet_features[:-3]])
# The next cell calls `.to_numpy()` on this result and slices it along three
# axes, i.e. it is used as a (jet, track slot, feature) array.
matchedtracks = structured_to_unstructured(matchedtracks)

In [58]:
# Convert the leading (pt, eta, phi) columns of the tracks, B hadrons and b-jets
# with the DSNNA helpers (presumably to Cartesian px, py, pz, going by their
# names — the implementation is not visible here).
# NOTE: this cell overwrites `bhads` and `b_jets` with the converted arrays, so
# running it a second time would pass already-converted columns back into the
# converters. Re-run the earlier cells first if this cell must be repeated.
tracks_p = DSNNA.pt_eta_phi_2_px_py_pz_tracks(matchedtracks.to_numpy())
bhads_momenta = DSNNA.pt_eta_phi_2_px_py_pz_jets(bhads[:,:3])
b_jets_momenta = DSNNA.pt_eta_phi_2_px_py_pz_jets(b_jets[:,:3])

# Re-attach the columns that were not converted (index 3 onwards) behind the
# converted momenta, along the last axis of each array.
# `tracks` is rebound here: from this point it is the final NumPy track dataset,
# no longer the per-event track records selected earlier.
tracks = np.concatenate([tracks_p,matchedtracks[:,:,3:].to_numpy()],axis = 2)
# Targets: converted momenta followed by the remaining column (the mass).
bhads = np.concatenate([bhads_momenta,bhads[:,3:]], axis = -1)
# Jets: converted momenta followed by the remaining column (the mass).
b_jets = np.concatenate([b_jets_momenta, b_jets[:,3:]], axis = -1)


  pzs = np.where(mask1 | mask3, pts, pts * np.sinh(etas))


In [59]:
# Sanity check after the coordinate conversion: print the shapes of the track,
# target and jet arrays, one per line.
print(*(np.shape(arr) for arr in (tracks, bhads, b_jets)), sep="\n")

(1291472, 64, 8)
(1291472, 4)
(1291472, 4)


In [61]:
# Persist the three datasets as .npy files for later training runs.
# Each pair is (destination path, array to write).
for destination, dataset in (
    ("/home/physics/phujdj/DeepLearningParticlePhysics/TrainingData/Track_Data.npy", tracks),
    ("/home/physics/phujdj/DeepLearningParticlePhysics/TrainingData/B_Jet_Data.npy", b_jets),
    ("/home/physics/phujdj/DeepLearningParticlePhysics/TrainingData/Bhads_Data.npy", bhads),
):
    np.save(destination, dataset)