In [None]:
import awkward as ak
import glob
import tqdm
import matplotlib.pyplot as plt
import numpy as np
import mplhep
import boost_histogram as bh
import vector
import matplotlib

In [None]:
# Display names of the three samples, reused as plot titles and legend labels.
titles_z, titles_zh, titles_qq = "Z", "ZH", "qq"

In [None]:
def to_bh(data, bins, cumulative=False):
    """Fill ``data`` into a 1D boost-histogram defined by the bin edges ``bins``.

    Args:
        data: Values to fill into the histogram.
        bins: Bin edges, passed to ``bh.axis.Variable``.
        cumulative: If true, each bin content is replaced by the sum of
            ``values()`` minus the running sum up to and including that bin.

    Returns:
        The filled ``bh.Histogram``.
    """
    axis = bh.axis.Variable(bins)
    hist = bh.Histogram(axis)
    hist.fill(data)
    if not cumulative:
        return hist
    # Total is taken before the contents are overwritten.
    total = np.sum(hist.values())
    hist[:] = total - np.cumsum(hist)
    return hist


def to_p4(p4_obj):
    """Wrap a four-momentum record as a ``vector`` awkward array.

    Reads the ``tau``, ``x``, ``y`` and ``z`` attributes of ``p4_obj`` and
    zips them under the names ``mass``, ``x``, ``y`` and ``z``.

    Args:
        p4_obj: Object exposing ``tau``, ``x``, ``y`` and ``z``.

    Returns:
        The result of ``vector.awk`` applied to the zipped components.
    """
    components = {
        "mass": p4_obj.tau,
        "x": p4_obj.x,
        "y": p4_obj.y,
        "z": p4_obj.z,
    }
    zipped = ak.zip(components)
    return vector.awk(zipped)

In [None]:
def load_sample(path, max_files):
    """Load the jet ntuples of one sample into a single awkward array.

    Reads the selected columns from every ``*.parquet`` file directly under
    ``path``, keeps only the entries whose gen-jet pt is above 10, and
    concatenates the per-file results.

    Args:
        path: Directory containing the ``*.parquet`` files.
        max_files: Maximum number of files to read. ``None`` or any negative
            value (e.g. ``-1``) means "no limit".

    Returns:
        An awkward array holding the selected columns of all kept entries.

    Raises:
        FileNotFoundError: If no parquet file is selected for reading.
    """
    columns = [
        #basic reco inputs
        "reco_jet_p4s",
        "reco_cand_p4s",
        "reco_cand_charge",
        "reco_cand_pdg",

        #advanced reco inputs: tracking variables
        "reco_cand_dz",
        "reco_cand_dz_err",
        "reco_cand_d3",
        "reco_cand_d3_err",
        "reco_cand_z0",
        "reco_cand_z0_err",
        "reco_cand_d0",
        "reco_cand_d0_err",

        #targets
        "gen_jet_p4s",
        "gen_jet_tau_p4s", #tau visible momentum
        "gen_jet_tau_decaymode",
        "gen_jet_full_tau_p4s" #tau full momentum, for debugging
    ]

    # glob returns files in arbitrary order; sorting keeps the concatenated
    # order (and any positional split done on it) reproducible between runs.
    files = sorted(glob.glob(path + "/*.parquet"))

    # Slicing with a negative bound (files[:-1]) would silently drop the last
    # file, so only a non-negative limit is applied as a slice.
    if max_files is not None and max_files >= 0:
        files = files[:max_files]

    if not files:
        raise FileNotFoundError(
            "no parquet files selected in {!r} (max_files={!r})".format(path, max_files)
        )

    data = []
    for fi in tqdm.tqdm(files):
        ret = ak.from_parquet(fi, columns=columns)
        # Rebuild the record array field by field
        # (presumably to get a plain in-memory record layout - TODO confirm).
        ret = ak.Array({k: ret[k] for k in ret.fields})
        # Keep only entries with gen-jet pt above 10.
        ret = ret[to_p4(ret["gen_jet_p4s"]).pt>10]
        data.append(ret)
    return ak.concatenate(data)

In [None]:
# max_files=None reads every file; a -1 limit is applied as the slice [:-1] and skips the last file.
data_qq = load_sample("/local/joosep/ml-tau-en-reg/ntuples/20240519_qq_and_zh_2M/QCD/", None)

In [None]:
# max_files=None reads every file; a -1 limit is applied as the slice [:-1] and skips the last file.
data_z = load_sample("/local/joosep/ml-tau-en-reg/ntuples/20240519_qq_and_zh_2M/Z_Ztautau", None)

In [None]:
# max_files=None reads every file; a -1 limit is applied as the slice [:-1] and skips the last file.
data_zh = load_sample("/local/joosep/ml-tau-en-reg/ntuples/20240519_qq_and_zh_2M/ZH_Htautau/", None)

In [None]:
# Number of entries in the qq, Z and ZH samples, in that order.
tuple(map(len, (data_qq, data_z, data_zh)))

In [None]:
def split_train_test(data, split=0.8):
    """Cut ``data`` into a leading training part and a trailing test part.

    Args:
        data: Sliceable sequence with a length.
        split: Fraction of entries assigned to the training part; the cut
            index is ``int(len(data) * split)``.

    Returns:
        Tuple ``(data_train, data_test)``: entries before the cut index and
        entries from the cut index onwards, in their original order.
    """
    cut = int(len(data)*split)
    return data[:cut], data[cut:]

In [None]:
# Split the qq sample with the default fraction and write both parts to parquet.
qq_parts = split_train_test(data_qq)
data_qq_train = qq_parts[0]
data_qq_test = qq_parts[1]
ak.to_parquet(data_qq_train, "qq_train.parquet")
ak.to_parquet(data_qq_test, "qq_test.parquet")

In [None]:
# Split the Z sample with the default fraction and write both parts to parquet.
z_parts = split_train_test(data_z)
data_z_train = z_parts[0]
data_z_test = z_parts[1]
ak.to_parquet(data_z_train, "z_train.parquet")
ak.to_parquet(data_z_test, "z_test.parquet")

In [None]:
# Split the ZH sample with the default fraction and write both parts to parquet.
zh_parts = split_train_test(data_zh)
data_zh_train = zh_parts[0]
data_zh_test = zh_parts[1]
ak.to_parquet(data_zh_train, "zh_train.parquet")
ak.to_parquet(data_zh_test, "zh_test.parquet")

In [None]:
# Shell escape: print the disk usage of each parquet file in the working directory, plus a grand total.
!du -csh *.parquet

In [None]:
# Visible gen-tau pT spectrum of the Z and ZH samples.
# The qq sample is not drawn here (its histplot call was commented out).
bins = np.linspace(0, 220, 51)
for sample, label in ((data_z, "Z"), (data_zh, "ZH")):
    tau_vis_pt = to_p4(sample["gen_jet_tau_p4s"]).pt
    mplhep.histplot(to_bh(tau_vis_pt, bins=bins), histtype="step", lw=1, flow="sum", label=label)
plt.legend(loc="best")
plt.yscale("log")
plt.xlabel("gen tau visible pT [GeV]")
plt.ylabel("Number of gen jets / bin")

In [None]:
# Gen-jet pT spectrum, overlaid for the three samples.
bins = np.linspace(0, 220, 51)
for sample, label in ((data_z, "Z"), (data_zh, "ZH"), (data_qq, "qq")):
    gen_jet_pt = to_p4(sample["gen_jet_p4s"]).pt
    mplhep.histplot(to_bh(gen_jet_pt, bins=bins), histtype="step", lw=1, flow="sum", label=label)
plt.legend(loc="best")
plt.yscale("log")
plt.xlabel("gen jet pT [GeV]")
plt.ylabel("Number of gen jets / bin")

In [None]:
# Reco-jet pT spectrum, overlaid for the three samples.
bins = np.linspace(0, 220, 51)
for sample, label in ((data_z, "Z"), (data_zh, "ZH"), (data_qq, "qq")):
    reco_jet_pt = to_p4(sample["reco_jet_p4s"]).pt
    mplhep.histplot(to_bh(reco_jet_pt, bins=bins), histtype="step", lw=1, flow="sum", label=label)
plt.legend(loc="best")
plt.yscale("log")
plt.xlabel("reco jet pT [GeV]")
plt.ylabel("Number of gen jets / bin")

In [None]:
# Gen-jet vs reco-jet pt map for the qq sample; the dashed diagonal marks reco == gen.
b = np.linspace(0, 220, 221)
gen_pt = ak.to_numpy(to_p4(data_qq["gen_jet_p4s"]).pt)
reco_pt = ak.to_numpy(to_p4(data_qq["reco_jet_p4s"]).pt)
plt.title(titles_qq)
plt.hist2d(gen_pt, reco_pt, bins=(b, b), norm=matplotlib.colors.LogNorm(), cmap="Blues");
plt.colorbar()
plt.plot([0, 220], [0, 220], color="black", ls="--", lw=1.0)
plt.xlabel("gen jet pt")
plt.ylabel("reco jet pt")

In [None]:
# Gen-jet vs reco-jet pt map for the Z sample; the dashed diagonal marks reco == gen.
b = np.linspace(0, 220, 221)
gen_pt = ak.to_numpy(to_p4(data_z["gen_jet_p4s"]).pt)
reco_pt = ak.to_numpy(to_p4(data_z["reco_jet_p4s"]).pt)
plt.title(titles_z)
plt.hist2d(gen_pt, reco_pt, bins=(b, b), norm=matplotlib.colors.LogNorm(), cmap="Blues");
plt.colorbar()
plt.plot([0, 220], [0, 220], color="black", ls="--", lw=1.0)
plt.xlabel("gen jet pt")
plt.ylabel("reco jet pt")

In [None]:
# Gen-jet vs reco-jet pt map for the ZH sample; the dashed diagonal marks reco == gen.
b = np.linspace(0, 220, 221)
gen_pt = ak.to_numpy(to_p4(data_zh["gen_jet_p4s"]).pt)
reco_pt = ak.to_numpy(to_p4(data_zh["reco_jet_p4s"]).pt)
plt.title(titles_zh)
plt.hist2d(gen_pt, reco_pt, bins=(b, b), norm=matplotlib.colors.LogNorm(), cmap="Blues");
plt.colorbar()
plt.plot([0, 220], [0, 220], color="black", ls="--", lw=1.0)
plt.xlabel("gen jet pt")
plt.ylabel("reco jet pt")

In [None]:
# Visible gen-tau pt vs reco-jet pt map for the Z sample; the dashed diagonal marks equality.
b = np.linspace(0, 220, 221)
tau_vis_pt = ak.to_numpy(to_p4(data_z["gen_jet_tau_p4s"]).pt)
reco_pt = ak.to_numpy(to_p4(data_z["reco_jet_p4s"]).pt)
plt.title(titles_z)
plt.hist2d(tau_vis_pt, reco_pt, bins=(b, b), norm=matplotlib.colors.LogNorm(), cmap="Blues");
plt.colorbar()
plt.plot([0, 220], [0, 220], color="black", ls="--", lw=1.0)
plt.xlabel("gen tau visible pt")
plt.ylabel("reco jet pt")

In [None]:
# Visible gen-tau pt vs reco-jet pt map for the ZH sample; the dashed diagonal marks equality.
b = np.linspace(0, 220, 221)
tau_vis_pt = ak.to_numpy(to_p4(data_zh["gen_jet_tau_p4s"]).pt)
reco_pt = ak.to_numpy(to_p4(data_zh["reco_jet_p4s"]).pt)
plt.title(titles_zh)
plt.hist2d(tau_vis_pt, reco_pt, bins=(b, b), norm=matplotlib.colors.LogNorm(), cmap="Blues");
plt.colorbar()
plt.plot([0, 220], [0, 220], color="black", ls="--", lw=1.0)
plt.xlabel("gen tau visible pt")
plt.ylabel("reco jet pt")

In [None]:
# Ratio of gen-jet pt to the full gen-tau pt for the Z and ZH samples (linear y-axis).
bins = np.linspace(0.0, 2, 201)

for sample, label in ((data_z, "Z"), (data_zh, "ZH")):
    genjet_pt = to_p4(sample["gen_jet_p4s"]).pt
    gentau_pt = to_p4(sample["gen_jet_full_tau_p4s"]).pt
    mplhep.histplot(to_bh(genjet_pt / gentau_pt, bins=bins), histtype="step", lw=1, flow="sum", label=label)

plt.legend(loc="best")
plt.xlabel("genjet pt / gentau pt")
plt.ylabel("matched jets / bin")

In [None]:
# Ratio of gen-jet pt to the visible gen-tau pt for the Z and ZH samples (linear y-axis).
bins = np.linspace(0.95, 1.05, 201)

for sample, label in ((data_z, "Z"), (data_zh, "ZH")):
    genjet_pt = to_p4(sample["gen_jet_p4s"]).pt
    tau_vis_pt = to_p4(sample["gen_jet_tau_p4s"]).pt
    mplhep.histplot(to_bh(genjet_pt / tau_vis_pt, bins=bins), histtype="step", lw=1, flow="sum", label=label)

plt.legend(loc="best")
plt.xlabel("genjet pt / gentau visible pt")
plt.ylabel("matched jets / bin")

In [None]:
# Ratio of reco-jet pt to the visible gen-tau pt for the Z and ZH samples (linear y-axis).
bins = np.linspace(0.75, 1.25, 101)

for sample, label in ((data_z, "Z"), (data_zh, "ZH")):
    recojet_pt = to_p4(sample["reco_jet_p4s"]).pt
    tau_vis_pt = to_p4(sample["gen_jet_tau_p4s"]).pt
    mplhep.histplot(to_bh(recojet_pt / tau_vis_pt, bins=bins), histtype="step", lw=1, flow="sum", label=label)

plt.legend(loc="best")
plt.xlabel("recojet pt / gentau visible pt")
plt.ylabel("matched jets / bin")

In [None]:
# Ratio of reco-jet pt to gen-jet pt, overlaid for the three samples.
bins = np.linspace(0.5, 2, 101)

for sample, label in ((data_z, "Z"), (data_zh, "ZH"), (data_qq, "qq")):
    recojet_pt = to_p4(sample["reco_jet_p4s"]).pt
    genjet_pt = to_p4(sample["gen_jet_p4s"]).pt
    mplhep.histplot(to_bh(recojet_pt / genjet_pt, bins=bins), histtype="step", lw=1, flow="sum", label=label)

plt.legend(loc="best")
plt.yscale("log")
plt.xlabel("recojet pt / genjet pt")
plt.ylabel("jets / bin")

In [None]:
# Number of reco candidates per jet, overlaid for the three samples.
bins = np.linspace(0, 50, 51)
for sample, label in ((data_z, "Z"), (data_zh, "ZH"), (data_qq, "qq")):
    cands_per_jet = ak.num(sample["reco_cand_p4s"])
    mplhep.histplot(to_bh(cands_per_jet, bins=bins), histtype="step", lw=1, flow="sum", label=label)
plt.legend(loc="best")
plt.yscale("log")
plt.xlabel("Number of reco particles / jet")
plt.ylabel("Number of reco jets / bin")

In [None]:
# Gen-tau decay-mode counts in the Z sample; ticks are shifted by 0.4 to sit under the 0.8-wide bars.
dms = np.arange(17)
decay_modes = data_z["gen_jet_tau_decaymode"]
plt.title(titles_z)
plt.hist(decay_modes, bins=dms, width=0.8)

plt.xticks(dms + 0.4, dms);

In [None]:
# Gen-tau decay-mode counts in the ZH sample; ticks are shifted by 0.4 to sit under the 0.8-wide bars.
dms = np.arange(17)
decay_modes = data_zh["gen_jet_tau_decaymode"]
plt.title(titles_zh)
plt.hist(decay_modes, bins=dms, width=0.8)

plt.xticks(dms + 0.4, dms);

In [None]:
# reco_cand_d3 of charged reco candidates (charge != 0), flattened over all jets, per sample.
# NOTE(review): the range starts at 0 and flow="sum" folds out-of-range entries into the
# edge bins - confirm that this variable is non-negative.
bins = np.linspace(0,10,100)
for sample, label in ((data_z, titles_z), (data_zh, titles_zh), (data_qq, titles_qq)):
    charged = sample["reco_cand_charge"] != 0
    mplhep.histplot(to_bh(ak.flatten(sample["reco_cand_d3"][charged]), bins=bins), histtype="step", lw=1, flow="sum", label=label)
plt.yscale("log")
# Axis labels so the figure can be read on its own.
plt.xlabel("reco_cand_d3")
plt.ylabel("Number of charged reco candidates / bin")
plt.legend(loc="best")

In [None]:
# reco_cand_d0 of charged reco candidates (charge != 0), flattened over all jets, per sample.
bins = np.linspace(-10,10,100)
for sample, label in ((data_z, titles_z), (data_zh, titles_zh), (data_qq, titles_qq)):
    charged = sample["reco_cand_charge"] != 0
    mplhep.histplot(to_bh(ak.flatten(sample["reco_cand_d0"][charged]), bins=bins), histtype="step", lw=1, flow="sum", label=label)
plt.yscale("log")
# Axis labels so the figure can be read on its own.
plt.xlabel("reco_cand_d0")
plt.ylabel("Number of charged reco candidates / bin")
plt.legend(loc="best")

In [None]:
# reco_cand_dz of charged reco candidates (charge != 0), flattened over all jets, per sample.
# NOTE(review): the range starts at 0 and flow="sum" folds out-of-range entries into the
# edge bins - confirm that this variable is non-negative.
bins = np.linspace(0,10,100)
for sample, label in ((data_z, titles_z), (data_zh, titles_zh), (data_qq, titles_qq)):
    charged = sample["reco_cand_charge"] != 0
    mplhep.histplot(to_bh(ak.flatten(sample["reco_cand_dz"][charged]), bins=bins), histtype="step", lw=1, flow="sum", label=label)
plt.yscale("log")
# Axis labels so the figure can be read on its own.
plt.xlabel("reco_cand_dz")
plt.ylabel("Number of charged reco candidates / bin")
plt.legend(loc="best")

In [None]:
# reco_cand_z0 of charged reco candidates (charge != 0), flattened over all jets, per sample.
# NOTE(review): the range starts at 0 and flow="sum" folds out-of-range entries into the
# edge bins - confirm that this variable is non-negative.
bins = np.linspace(0,10,100)
for sample, label in ((data_z, titles_z), (data_zh, titles_zh), (data_qq, titles_qq)):
    charged = sample["reco_cand_charge"] != 0
    mplhep.histplot(to_bh(ak.flatten(sample["reco_cand_z0"][charged]), bins=bins), histtype="step", lw=1, flow="sum", label=label)
plt.yscale("log")
# Axis labels so the figure can be read on its own.
plt.xlabel("reco_cand_z0")
plt.ylabel("Number of charged reco candidates / bin")
plt.legend(loc="best")