In [None]:
%matplotlib inline
import bz2
import json
import pandas
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import networkx as nx
import glob
import fastjet
import awkward as ak

In [None]:
#Jet definition shared by all events: anti-kt algorithm, with 0.4 passed as the jet radius parameter
jetdef = fastjet.JetDefinition(fastjet.antikt_algorithm, 0.4)

## Datasets

CLIC detector model, Full Geant4 simulation: https://hepsim.jlab.org/taginfo.php?id=rfull201

Physics processes:
- `gev380ee_pythia6_ttbar_rfull201`: ttbar events in an electron-positron collider, no pileup

# Plot one event

In [None]:
#Load the events of one file; the context manager makes sure the compressed file handle is closed
with bz2.BZ2File("../testdata/pythia6_ttbar_0001_pandora_0.json.bz2", "r") as fi:
    data = json.load(fi)

#Unpack the collections of the first event into dataframes
iev = 0
df_gen = pandas.DataFrame(data[iev]["genparticles"])
df_hit = pandas.DataFrame(data[iev]["track_hits"])
df_cl = pandas.DataFrame(data[iev]["clusters"])
df_tr = pandas.DataFrame(data[iev]["tracks"])
df_ecal = pandas.DataFrame(data[iev]["ecal_hits"])
df_hcal = pandas.DataFrame(data[iev]["hcal_hits"])
df_pfs = pandas.DataFrame(data[iev]["pfs"])

#x-y view of the event: calorimeter hits and clusters are drawn with a marker size proportional to their energy
plt.figure(figsize=(10,10))
plt.scatter(df_ecal["x"], df_ecal["y"], s=5*df_ecal["energy"]);
plt.scatter(df_hcal["x"], df_hcal["y"], s=5*df_hcal["energy"]);
plt.scatter(df_hit["x"], df_hit["y"], s=1);
plt.scatter(df_cl["x"], df_cl["y"], s=5*df_cl["energy"], alpha=0.5, marker="s");
plt.xlabel("x")
plt.ylabel("y")
plt.xlim(-2000,2000)
plt.ylim(-2000,2000);

In [None]:
#Track parameter conventions follow this (old) detector note:
#http://flc.desy.de/lcnotes/notes/localfsExplorer_read?currentPath=/afs/desy.de/group/flc/lcnotes/LC-DET-2006-004.pdf 
a = 3*10**-4
b = 5 #B-field in tesla

def track_pt(omega):
    """Convert the track curvature parameter `omega` to a transverse momentum.

    Evaluates a*|b/omega| with the module-level constants `a` and `b`.
    Works elementwise, so `omega` may be a scalar, a numpy array or a
    pandas Series; the sign of `omega` does not affect the result.
    """
    abs_field_over_omega = np.abs(b/omega)
    return abs_field_over_omega*a

In [None]:
#Create the genparticle -> track/cluster -> particle flow event graph
def event_to_graph(df_gen, df_cl, df_tr, df_pfs):
    """Build the directed graph that connects the objects of one event.

    Nodes are named "gen<i>", "clu<i>", "tra<i>" and "pfo<i>", where <i> is the
    row position in df_gen, df_cl, df_tr and df_pfs respectively. Every node
    has a `typ` and an `e` attribute, every edge has a weight `w`.

    Edges:
      - parent genparticle -> genparticle, w=0
      - genparticle -> cluster, w=contributed energy, kept only if it is more
        than 20% of the cluster energy
      - genparticle -> track, w=contributed number of hits, kept only if it is
        more than 20% of the track's nhits
      - cluster -> PF object and track -> PF object, w=0

    Returns the networkx DiGraph.
    """
    graph = nx.DiGraph()

    #Genparticle nodes: typ is the pdgid, e is the energy
    for pos in range(len(df_gen)):
        gen_row = df_gen.iloc[pos]
        graph.add_node("gen{}".format(pos), typ=int(gen_row["pdgid"]), e=gen_row["energy"])

    #Parent -> genparticle links, added only once all genparticle nodes exist.
    #A parent index of -1 means that the parent slot is not filled.
    for pos in range(len(df_gen)):
        gen_row = df_gen.iloc[pos]
        for parent_col in ("idx_parent0", "idx_parent1"):
            parent_pos = int(gen_row[parent_col])
            if parent_pos == -1:
                continue
            graph.add_edge("gen{}".format(parent_pos), "gen{}".format(pos), w=0)

    #Calorimeter clusters and the genparticles that deposited energy in them
    for pos in range(len(df_cl)):
        cl_row = df_cl.iloc[pos]
        cl_node = "clu{}".format(pos)
        cl_energy = cl_row["energy"]
        graph.add_node(cl_node, typ=cl_row["type"], e=cl_energy)

        #The edge weight is the energy contribution from the genparticle
        for gen_key, contrib in cl_row["gp_contributions"].items():
            gen_pos = int(gen_key)
            if contrib/cl_energy>0.2:
                graph.add_edge("gen{}".format(gen_pos), cl_node, w=contrib)

    #Tracks and the genparticles that produced their hits
    for pos in range(len(df_tr)):
        tr_row = df_tr.iloc[pos]
        tr_node = "tra{}".format(pos)
        graph.add_node(tr_node, typ=0, e=tr_row["pt"])

        #The edge weight is the number of hits in the track that came from this genparticle
        tr_nhits = tr_row["nhits"]
        for gen_key, contrib in tr_row["gp_contributions"].items():
            gen_pos = int(gen_key)
            if contrib/tr_nhits>0.2:
                graph.add_edge("gen{}".format(gen_pos), tr_node, w=contrib)

    #PF objects, linked to their cluster first and then to their track (index -1: not available)
    for pos in range(len(df_pfs)):
        pf_row = df_pfs.iloc[pos]
        pf_node = "pfo{}".format(pos)
        graph.add_node(pf_node, typ=int(pf_row["type"]), e=pf_row["energy"])

        for source_name, index_col in (("clu{}", "cluster_idx"), ("tra{}", "track_idx")):
            source_pos = int(pf_row[index_col])
            if source_pos != -1:
                graph.add_edge(source_name.format(source_pos), pf_node, w=0)

    return graph

In [None]:
#Given the decay graph, estimate the energy in each PF object that came from the generator-level tau
def get_tau_fractions(df_tau, df_pfs, g):
    """Estimate, per PF object, the energy that originates from the generator-level taus.

    Parameters:
        df_tau: dataframe of the selected genparticles; its index labels must be
            the genparticle numbers used in the graph node names ("gen<i>").
        df_pfs: dataframe of the PF objects; only its length is used.
        g: event graph as produced by event_to_graph.

    Returns:
        A dataframe with one row per PF object (same row order as df_pfs) and columns
          - "energy_tau_tr": for PF objects built from a track, the summed energy
            (node attribute "e") of the genparticles linked to that track,
          - "energy_tau_cl": for PF objects built from a cluster, the summed
            genparticle -> cluster edge weights "w".
        Only genparticles that are one of the given taus, or that are reachable
        from one of them in the graph, enter the sums. PF objects that are not
        downstream of any tau get 0 in both columns.
    """
    #Explicit float arrays sized by the number of PF objects: this does not depend on
    #the dtype of the energy column and also works for an event without PF objects
    n_pfs = len(df_pfs)
    energy_tau_tr = np.zeros(n_pfs, dtype=np.float64)
    energy_tau_cl = np.zeros(n_pfs, dtype=np.float64)

    #All the taus together with every node reachable from them
    tau_lineage = set()
    for tau_idx in df_tau.index:
        tau_node = "gen{}".format(tau_idx)
        tau_lineage.add(tau_node)
        tau_lineage.update(nx.descendants(g, tau_node))

    #Loop over the particle flow objects downstream from the gen taus
    for pf_node in tau_lineage:
        if not pf_node.startswith("pfo"):
            continue
        ipf = int(pf_node[3:])

        e_tr = 0.0
        e_cl = 0.0
        for pred in g.predecessors(pf_node):

            #if this PF object came from a track, get the energy from the parent genparticles
            if pred.startswith('tra'):
                for gen_node in g.predecessors(pred):
                    #skip genparticles that are unrelated to the taus
                    if gen_node in tau_lineage:
                        e_tr += g.nodes[gen_node]["e"]

            #if this PF object came from a cluster, get the energy from the edges
            elif pred.startswith('clu'):
                for gen_node in g.predecessors(pred):
                    #skip genparticles that are unrelated to the taus
                    if gen_node in tau_lineage:
                        e_cl += g.edges[(gen_node, pred)]["w"]

        energy_tau_tr[ipf] = e_tr
        energy_tau_cl[ipf] = e_cl

    return pandas.DataFrame({
        "energy_tau_tr": energy_tau_tr,
        "energy_tau_cl": energy_tau_cl,
    })

In [None]:
def compute_track_properties(df_tr):
    """Add the momentum columns "pt", "px", "py" and "pz" to the track dataframe.

    The dataframe is modified in place and nothing is returned. "pt" is
    track_pt applied to the "omega" column; the components are derived from
    it with the "phi" and "tan_lambda" columns.
    """
    df_tr["pt"] = track_pt(df_tr["omega"])

    pt = df_tr["pt"]
    phi = df_tr["phi"]
    df_tr["px"] = pt*np.cos(phi)
    df_tr["py"] = pt*np.sin(phi)
    df_tr["pz"] = pt*df_tr["tan_lambda"]

In [None]:
def process_one_event(data, iev):
    """Process one event: select the gen taus, cluster jets and attach tau energies to the PF objects.

    Parameters:
        data: list of events as loaded from the JSON file; each event is a dict
            with (at least) the keys "genparticles", "clusters", "tracks" and "pfs".
        iev: index of the event in `data`.

    Returns:
        (df_tau, pfs_by_jet) where df_tau holds the genparticles with
        |pdgid|==15 and status==2, and pfs_by_jet is a list with one dataframe
        per jet containing the PF objects of that jet together with their
        "energy_tau_tr" and "energy_tau_cl" columns.
    """
    #Get the dataframes corresponding to this event
    event = data[iev]
    df_gen = pandas.DataFrame(event["genparticles"])
    df_cl = pandas.DataFrame(event["clusters"])
    df_tr = pandas.DataFrame(event["tracks"])
    df_pfs = pandas.DataFrame(event["pfs"])

    #Adds the pt/px/py/pz columns to df_tr in place; the graph uses the track pt
    compute_track_properties(df_tr)

    #Get the generator taus with status==2
    df_tau = df_gen[(df_gen["pdgid"].abs()==15) & (df_gen["status"]==2)]

    #cluster the PF particles to jets, keep jets with pt above 5 and reverse the
    #returned order (intended to give pt-descending order)
    cluster = fastjet.ClusterSequence(ak.Array({
        "px": df_pfs["px"],
        "py": df_pfs["py"],
        "pz": df_pfs["pz"],
        "E": df_pfs["energy"],
    }), jetdef)
    jets_constituents = cluster.constituent_index(min_pt=5)[::-1]

    #Get the tau contributions in each PF object.
    #The selected taus are passed here (not all the genparticles), so that only
    #PF objects downstream from a gen tau get a nonzero tau energy.
    graph = event_to_graph(df_gen, df_cl, df_tr, df_pfs)
    df_pfs_taufrac = get_tau_fractions(df_tau, df_pfs, graph)

    #Now get the list of PF objects in each jet, with the tau energy columns appended
    pfs_by_jet = [
        pandas.concat(
            [df_pfs.iloc[jet_constituents], df_pfs_taufrac.iloc[jet_constituents]],
            axis=1
        )
        for jet_constituents in jets_constituents
    ]

    #return the gen taus, and the list of PF candidates in each jet
    return df_tau, pfs_by_jet

# Process one event

In [None]:
#Load one file (the context manager closes the compressed file handle) and process its second event
with bz2.BZ2File("../testdata/pythia6_ttbar_0001_pandora_0.json.bz2", "r") as fi:
    data = json.load(fi)
ret = process_one_event(data, 1)

# Process all the data

In [None]:
#glob returns the files in arbitrary order: sort them so that reruns process them in the same order
for fn in sorted(glob.glob("../testdata/*.json.bz2")):

    #Load the data file consisting of multiple events; the file handle is closed right after parsing
    with bz2.BZ2File(fn, "r") as fi:
        data = json.load(fi)

    #Loop over the events in the data file
    for iev in range(len(data)):
        process_one_event(data, iev)

# Misc

## Plot the graph for one event

In [None]:
# from networkx.drawing.nx_pydot import graphviz_layout
# def node_color(node):
#     if node.startswith("gen"):
#         if abs(g.nodes[node]["typ"])==15:
#             return "purple"
#         return "red"
#     elif node.startswith("clu"):
#         return "blue"
#     elif node.startswith("tra"):
#         return "green"
#     else:
#         return "gray"
    
# def node_label(node):
#     typ = node[:4]
#     l = "{}\n{:.2f}".format(g.nodes[node]["typ"], g.nodes[node]["e"])
#     return l

# def edge_label(edge):
#     w = g.edges[edge]["w"]
#     if w>0.0:
#         return "{:.2f}".format(w)
#     return ""

In [None]:
# plt.figure(figsize=(50,30))
# pos = graphviz_layout(g, prog="dot")
# nx.draw_networkx_nodes(g, pos,
#     node_size=[5*g.nodes[n]["e"] for n in g.nodes],
#     node_color=[node_color(n) for n in g.nodes],
# )
# nx.draw_networkx_labels(g, pos,
#     labels={n: node_label(n) for n in g.nodes},
#     font_size=4
# )
# nx.draw_networkx_edges(g, pos, node_size=100.0);
# nx.draw_networkx_edge_labels(
#     g, pos,
#     edge_labels={e: edge_label(e)  for e in g.edges},
#     font_size=4
# )
# plt.savefig("plot.svg")