In [2]:
### Reloads modules properly
%load_ext autoreload
%autoreload 2
%load_ext lab_black

In [3]:
import os

os.sys.path.append("../")

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import mplhep as hep

hep.style.use("CMS")

# Random Forest For Primary Vertexing

In here we will explore the potential for using a random forest model to perform the primary vertexing.
There's 3 ways to formulate the primary vertexing problem to use a random forest regressor.
1. Event level Regression
    - Track-level features (pad vectors to a fixed size and stack)
    - Event-level target (z0 PV truth)
2. Classification on each event
    - Track-level features (pad vectors to a fixed size and stack)
    - Track-level target (is pv)
3. Classification on each track
    - Track-level features (no-padding or vector stacking)
    - Track-level target (is pv)

# Loading data and pre-processing

Before we do anything, we need to load the data and pre-process it.

In [37]:
trk = pd.read_parquet("/media/lucas/QS/l1_nnt/pv_features.parquet")

In [34]:
trk_fake_info = pd.read_pickle("/media/lucas/QS/l1_nnt/trk.pkl")["trk_fake"]

In [35]:
trk_fake_info.head()

entry  subentry
0      0           2
       1           2
       2           1
       3           2
       4           1
Name: trk_fake, dtype: int32

In [39]:
trk = pd.concat([trk, trk_fake_info], axis=1)

In [40]:
trk_fake_info.shape

(40780722,)

In [41]:
trk.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,trk_pt,trk_eta,trk_phi,trk_z0,trk_chi2,trk_chi2dof,trk_chi2rphi,trk_chi2rz,trk_bendchi2,trk_nstub,trk_phiSector,trk_MVA1,trk_fake
entry,subentry,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,0,2.623967,1.301414,-0.289234,-0.878906,5.994866,0.999144,2.130341,3.864525,0.320722,5,0,0.942428,2
0,1,3.435026,0.566814,-0.159267,3.339844,45.10569,5.638211,42.600296,2.505395,0.590445,6,0,0.701423,2
0,2,3.774908,1.815973,-0.240375,4.921875,9.829946,1.228743,4.909814,4.920131,0.354615,6,0,0.95879,1
0,3,2.641448,-1.435747,-0.128747,-0.46875,4.197709,0.524714,0.738454,3.459255,0.618279,6,0,0.965898,2
0,4,2.320653,1.837421,0.032604,4.980469,3.273508,0.409188,0.946741,2.326767,1.142646,6,0,0.985416,1


In [42]:
mc = pd.read_pickle("/media/lucas/QS/l1_nnt/mc.pkl")

In [43]:
mc.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,pv_L1reco_z0,pv_L1reco_sum,pv_MC
entry,subentry,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,5.026911,79.777252,5.055163
1,0,-2.02108,90.041313,-3.608661
2,0,-0.141508,202.421341,-0.261453
3,0,2.744174,303.983124,2.803285
4,0,-1.279072,218.012939,-1.400667


In [44]:
trk.reset_index(inplace=True)

In [45]:
trk.rename(columns={"entry": "event_number", "subentry": "trk_number"}, inplace=True)

In [47]:
trk.head()

Unnamed: 0,event_number,trk_number,trk_pt,trk_eta,trk_phi,trk_z0,trk_chi2,trk_chi2dof,trk_chi2rphi,trk_chi2rz,trk_bendchi2,trk_nstub,trk_phiSector,trk_MVA1,trk_fake
0,0,0,2.623967,1.301414,-0.289234,-0.878906,5.994866,0.999144,2.130341,3.864525,0.320722,5,0,0.942428,2
1,0,1,3.435026,0.566814,-0.159267,3.339844,45.10569,5.638211,42.600296,2.505395,0.590445,6,0,0.701423,2
2,0,2,3.774908,1.815973,-0.240375,4.921875,9.829946,1.228743,4.909814,4.920131,0.354615,6,0,0.95879,1
3,0,3,2.641448,-1.435747,-0.128747,-0.46875,4.197709,0.524714,0.738454,3.459255,0.618279,6,0,0.965898,2
4,0,4,2.320653,1.837421,0.032604,4.980469,3.273508,0.409188,0.946741,2.326767,1.142646,6,0,0.985416,1


In [48]:
mc.reset_index(drop=True, inplace=True)

In [49]:
mc["event_number"] = range(222976)

In [50]:
mc.head()

Unnamed: 0,pv_L1reco_z0,pv_L1reco_sum,pv_MC,event_number
0,5.026911,79.777252,5.055163,0
1,-2.02108,90.041313,-3.608661,1
2,-0.141508,202.421341,-0.261453,2
3,2.744174,303.983124,2.803285,3
4,-1.279072,218.012939,-1.400667,4


**Remove NAN**

In [51]:
trk.shape

(40780722, 15)

In [52]:
len(trk["event_number"].unique())

222976

In [53]:
trk.dropna().shape

(40780640, 15)

In [54]:
trk.dropna(inplace=True)

In [55]:
len(trk["event_number"].unique())

222976

**Create track-level is primary vertex labels**

In [56]:
from primaryvertexingtools import create_pv_truth_labels

In [58]:
trk = create_pv_truth_labels(trk, truth_label="trk_fake", truth_label_out="is_pv")

In [59]:
trk.head()

Unnamed: 0,event_number,trk_number,trk_pt,trk_eta,trk_phi,trk_z0,trk_chi2,trk_chi2dof,trk_chi2rphi,trk_chi2rz,trk_bendchi2,trk_nstub,trk_phiSector,trk_MVA1,trk_fake,is_pv
0,0,0,2.623967,1.301414,-0.289234,-0.878906,5.994866,0.999144,2.130341,3.864525,0.320722,5,0,0.942428,2,0
1,0,1,3.435026,0.566814,-0.159267,3.339844,45.10569,5.638211,42.600296,2.505395,0.590445,6,0,0.701423,2,0
2,0,2,3.774908,1.815973,-0.240375,4.921875,9.829946,1.228743,4.909814,4.920131,0.354615,6,0,0.95879,1,1
3,0,3,2.641448,-1.435747,-0.128747,-0.46875,4.197709,0.524714,0.738454,3.459255,0.618279,6,0,0.965898,2,0
4,0,4,2.320653,1.837421,0.032604,4.980469,3.273508,0.409188,0.946741,2.326767,1.142646,6,0,0.985416,1,1


In [60]:
mc.to_parquet("/media/lucas/QS/l1_nnt/mc_processed.parquet")

In [5]:
mc = pd.read_parquet("/media/lucas/QS/l1_nnt/mc_processed.parquet")

In [61]:
trk.to_parquet("/media/lucas/QS/l1_nnt/trk_processed.parquet")

In [6]:
trk = pd.read_parquet("/media/lucas/QS/l1_nnt/trk_processed.parquet")

In [6]:
!ls -lh /media/lucas/QS

ls: cannot access '/media/lucas/QS': No such file or directory


**Split dataset into two: Train and Test**

Lets use a 50:50 split so that the performance can be tested on the unseen dataset.

In [7]:
import random

In [8]:
import math

In [9]:
random.seed(1337)

In [10]:
max_n_events = trk.event_number.max() + 1

In [11]:
n_train_events = math.floor(max_n_events / 2)

In [12]:
n_train_events

111488

In [13]:
train_events = random.sample(range(max_n_events), n_train_events)

In [14]:
trk_train = trk.loc[trk["event_number"].isin(train_events)].copy()

In [15]:
mc_train = mc.loc[mc["event_number"].isin(train_events)].copy()

In [16]:
trk_test = trk.loc[~trk["event_number"].isin(train_events)].copy()

In [17]:
mc_test = mc.loc[~mc["event_number"].isin(train_events)].copy()

In [18]:
trk_test.head()

Unnamed: 0,event_number,trk_number,trk_pt,trk_eta,trk_phi,trk_z0,trk_chi2,trk_chi2dof,trk_chi2rphi,trk_chi2rz,trk_bendchi2,trk_nstub,trk_phiSector,trk_MVA1,trk_fake,is_pv
298,2,0,2.004334,-0.406667,-0.513415,9.84375,6.461751,1.076958,2.73447,3.727281,0.610109,5,0,0.689138,2,0
299,2,1,2.025913,-1.395448,-0.185742,-3.105469,6.034215,0.754277,2.798095,3.23612,0.714921,6,0,0.944477,2,0
300,2,2,2.249646,1.241507,-0.105314,1.40625,5.581336,0.697667,0.355793,5.225543,0.654216,6,0,0.943106,2,0
301,2,3,2.037065,-0.650425,-0.112909,-1.699219,2.661164,0.443527,1.124714,1.53645,0.775789,5,0,0.939647,2,0
302,2,4,3.175631,-1.402719,-0.042129,2.285156,7.39889,1.233148,4.642025,2.756865,1.498608,5,0,0.924746,2,0


# 1. Event Level Regression

In [19]:
trk.trk_number.max()

329

The maximum number of tracks in an event is 329, so pad until 330

In [20]:
feature_list = [
    "trk_pt",
    "trk_eta",
    "trk_phi",
    "trk_z0",
    "trk_chi2",
    "trk_chi2dof",
    "trk_chi2rphi",
    "trk_chi2rz",
    "trk_bendchi2",
    "trk_nstub",
    "trk_phiSector",
    "trk_fake",
]

In [24]:
# def create_input_feature_vector(df: pd.DataFrame, feature_list) -> np.array:
#     event_numbers = df.event_number.values

#     track_numbers = list(range(330))

#     multi_index = pd.MultiIndex.from_product(
#         [event_numbers, track_numbers], names=["event_number", "track_number"]
#     )
#     # Use large negative number for unphysical values across all features.
#     df.reindex(multi_index, fill_value=-9999)

#     return df.pivot_table(
#         index="event_number", columns="trk_number", values=feature_list
#     ).values

In [88]:
def reindex_frame(df: pd.DataFrame):
    event_numbers = df.event_number.values
    track_numbers = list(range(330))
    multi_index = pd.MultiIndex.from_product(
        [event_numbers, track_numbers], names=["event_number", "track_number"]
    )
    df.reindex(multi_index, fill_value=-9999)
    return df

In [89]:
trk.head()

Unnamed: 0,event_number,trk_number,trk_pt,trk_eta,trk_phi,trk_z0,trk_chi2,trk_chi2dof,trk_chi2rphi,trk_chi2rz,trk_bendchi2,trk_nstub,trk_phiSector,trk_MVA1,trk_fake,is_pv
0,0,0,2.623967,1.301414,-0.289234,-0.878906,5.994866,0.999144,2.130341,3.864525,0.320722,5,0,0.942428,2,0
1,0,1,3.435026,0.566814,-0.159267,3.339844,45.10569,5.638211,42.600296,2.505395,0.590445,6,0,0.701423,2,0
2,0,2,3.774908,1.815973,-0.240375,4.921875,9.829946,1.228743,4.909814,4.920131,0.354615,6,0,0.95879,1,1
3,0,3,2.641448,-1.435747,-0.128747,-0.46875,4.197709,0.524714,0.738454,3.459255,0.618279,6,0,0.965898,2,0
4,0,4,2.320653,1.837421,0.032604,4.980469,3.273508,0.409188,0.946741,2.326767,1.142646,6,0,0.985416,1,1


In [None]:
trk_re = reindex_frame(trk)

In [25]:
trk_train.head()

Unnamed: 0,event_number,trk_number,trk_pt,trk_eta,trk_phi,trk_z0,trk_chi2,trk_chi2dof,trk_chi2rphi,trk_chi2rz,trk_bendchi2,trk_nstub,trk_phiSector,trk_MVA1,trk_fake,is_pv
0,0,0,2.623967,1.301414,-0.289234,-0.878906,5.994866,0.999144,2.130341,3.864525,0.320722,5,0,0.942428,2,0
1,0,1,3.435026,0.566814,-0.159267,3.339844,45.10569,5.638211,42.600296,2.505395,0.590445,6,0,0.701423,2,0
2,0,2,3.774908,1.815973,-0.240375,4.921875,9.829946,1.228743,4.909814,4.920131,0.354615,6,0,0.95879,1,1
3,0,3,2.641448,-1.435747,-0.128747,-0.46875,4.197709,0.524714,0.738454,3.459255,0.618279,6,0,0.965898,2,0
4,0,4,2.320653,1.837421,0.032604,4.980469,3.273508,0.409188,0.946741,2.326767,1.142646,6,0,0.985416,1,1


In [None]:
X_train = create_input_feature_vector(trk_train, feature_list)

In [96]:
trk[["trk_pt", "trk_z0"]].values

(40780640, 2)

In [None]:
df.pivot(index='foo', columns='bar', values=['baz', 'zoo'])
      baz       zoo
bar   A  B  C   A  B  C
foo
one   1  2  3   x  y  z
two   4  5  6   q  w  t

In [118]:
trk10

Unnamed: 0,event_number,trk_number,trk_pt,trk_eta,trk_phi,trk_z0,trk_chi2,trk_chi2dof,trk_chi2rphi,trk_chi2rz,trk_bendchi2,trk_nstub,trk_phiSector,trk_MVA1,trk_fake,is_pv
0,0,0,2.623967,1.301414,-0.289234,-0.878906,5.994866,0.999144,2.130341,3.864525,0.320722,5,0,0.942428,2,0
1,0,1,3.435026,0.566814,-0.159267,3.339844,45.105690,5.638211,42.600296,2.505395,0.590445,6,0,0.701423,2,0
2,0,2,3.774908,1.815973,-0.240375,4.921875,9.829946,1.228743,4.909814,4.920131,0.354615,6,0,0.958790,1,1
3,0,3,2.641448,-1.435747,-0.128747,-0.468750,4.197709,0.524714,0.738454,3.459255,0.618279,6,0,0.965898,2,0
4,0,4,2.320653,1.837421,0.032604,4.980469,3.273508,0.409188,0.946741,2.326767,1.142646,6,0,0.985416,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1856,9,196,1.950458,2.254519,-0.884692,6.738281,2.039042,0.509760,0.193708,1.845334,0.542706,4,8,0.990272,2,0
1857,9,197,3.892709,2.354093,-0.722013,2.578125,2.686348,0.447725,1.185857,1.500491,0.659815,5,8,0.980727,2,0
1858,9,198,2.410859,-2.034738,-0.548667,-4.746094,3.204123,0.534020,2.838934,0.365189,0.442229,5,8,0.987320,2,0
1859,9,199,3.310079,2.323107,-0.624017,5.742188,10.671175,1.778529,9.420842,1.250332,1.134960,5,8,0.940731,2,0


In [121]:
trk10.trk_number.max()

235

In [124]:
help(pd.pivot)

Help on function pivot in module pandas.core.reshape.pivot:

pivot(data: 'DataFrame', index: 'IndexLabel | None' = None, columns: 'IndexLabel | None' = None, values: 'IndexLabel | None' = None) -> 'DataFrame'
    Return reshaped DataFrame organized by given index / column values.
    
    Reshape data (produce a "pivot" table) based on column values. Uses
    unique values from specified `index` / `columns` to form axes of the
    resulting DataFrame. This function does not support data
    aggregation, multiple values will result in a MultiIndex in the
    columns. See the :ref:`User Guide <reshaping>` for more on reshaping.
    
    Parameters
    ----------
    data : DataFrame
    index : str or object or a list of str, optional
        Column to use to make new frame's index. If None, uses
        existing index.
    
        .. versionchanged:: 1.1.0
           Also accept list of index names.
    
    columns : str or object or a list of str
        Column to use to make new fra

In [None]:
trk10.pivot_table(
    index="event_number",
    columns="trk_number",
    values=["trk_pt", "trk_z0"],
    fill_value=-9999,
)

(10, 472)

In [130]:
trk10[trk10.event_number == 4]

Unnamed: 0,event_number,trk_number,trk_pt,trk_eta,trk_phi,trk_z0,trk_chi2,trk_chi2dof,trk_chi2rphi,trk_chi2rz,trk_bendchi2,trk_nstub,trk_phiSector,trk_MVA1,trk_fake,is_pv
626,4,0,2.609780,0.285181,-0.442419,-1.171875,12.631740,1.578967,7.740102,4.891637,0.309389,6,0,0.924585,2,0
627,4,1,2.323567,-1.534982,-0.354673,3.046875,6.126259,1.021043,3.868710,2.257549,0.935210,5,0,0.910971,2,0
628,4,2,2.467112,0.730226,-0.341335,0.761719,2.486442,0.414407,0.398023,2.088419,0.351795,5,0,0.971333,2,0
629,4,3,2.074366,1.806646,-0.294020,9.667969,1.162425,0.193737,0.251386,0.911039,0.329354,5,0,0.943513,2,0
630,4,4,2.914677,0.320649,0.037127,-3.222656,9.568014,1.196002,5.318015,4.250000,0.622504,6,0,0.921294,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
857,4,231,2.229318,-2.361055,-1.048482,-5.039062,13.135766,3.283942,10.990524,2.145241,0.799293,4,8,0.924769,2,0
858,4,232,3.539052,-2.319690,-0.825073,-2.812500,12.030916,1.503865,4.887963,7.142954,0.862570,6,8,0.857672,2,0
859,4,233,2.090773,-2.153628,-0.357213,-0.820312,4.915720,0.819287,2.505373,2.410347,0.509606,5,8,0.930317,1,1
860,4,234,2.195808,-2.147346,-0.350267,10.605469,1.904705,0.238088,1.369372,0.535333,0.138398,6,8,0.953848,2,0


In [None]:
print (df.groupby(['InvoiceNo','CustomerID','Country'], 
                  as_index=False)['NoStockCode','Description','Quantity']
          .agg(lambda x: list(x)))

In [76]:
trk10

Unnamed: 0,event_number,trk_number,trk_pt,trk_eta,trk_phi,trk_z0,trk_chi2,trk_chi2dof,trk_chi2rphi,trk_chi2rz,trk_bendchi2,trk_nstub,trk_phiSector,trk_MVA1,trk_fake,is_pv
0,0,0,2.623967,1.301414,-0.289234,-0.878906,5.994866,0.999144,2.130341,3.864525,0.320722,5,0,0.942428,2,0
1,0,1,3.435026,0.566814,-0.159267,3.339844,45.105690,5.638211,42.600296,2.505395,0.590445,6,0,0.701423,2,0
2,0,2,3.774908,1.815973,-0.240375,4.921875,9.829946,1.228743,4.909814,4.920131,0.354615,6,0,0.958790,1,1
3,0,3,2.641448,-1.435747,-0.128747,-0.468750,4.197709,0.524714,0.738454,3.459255,0.618279,6,0,0.965898,2,0
4,0,4,2.320653,1.837421,0.032604,4.980469,3.273508,0.409188,0.946741,2.326767,1.142646,6,0,0.985416,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1856,9,196,1.950458,2.254519,-0.884692,6.738281,2.039042,0.509760,0.193708,1.845334,0.542706,4,8,0.990272,2,0
1857,9,197,3.892709,2.354093,-0.722013,2.578125,2.686348,0.447725,1.185857,1.500491,0.659815,5,8,0.980727,2,0
1858,9,198,2.410859,-2.034738,-0.548667,-4.746094,3.204123,0.534020,2.838934,0.365189,0.442229,5,8,0.987320,2,0
1859,9,199,3.310079,2.323107,-0.624017,5.742188,10.671175,1.778529,9.420842,1.250332,1.134960,5,8,0.940731,2,0


In [80]:
event_numbers = trk10.event_number.values

track_numbers = list(range(330))

multi_index = pd.MultiIndex.from_product(
    [event_numbers, track_numbers], names=["event_number", "trk_number"]
)
# Use large negative number for unphysical values across all features.
trk10_re = trk10.reindex(multi_index, fill_value=-9999)

In [82]:
event_numbers

array([0, 0, 0, ..., 9, 9, 9])

In [84]:
multi_index[0:20]

MultiIndex([(0,  0),
            (0,  1),
            (0,  2),
            (0,  3),
            (0,  4),
            (0,  5),
            (0,  6),
            (0,  7),
            (0,  8),
            (0,  9),
            (0, 10),
            (0, 11),
            (0, 12),
            (0, 13),
            (0, 14),
            (0, 15),
            (0, 16),
            (0, 17),
            (0, 18),
            (0, 19)],
           names=['event_number', 'trk_number'])

In [67]:
trk10.

(1861, 16)

In [64]:
trk10.groupby(["event_number"])["trk_z0", "trk_pt"].agg(lambda x: list(x)).agg(np.append)

  trk10.groupby(["event_number"])["trk_z0", "trk_pt"].agg(lambda x: list(x)).agg(np.append)


TypeError: _append_dispatcher() missing 1 required positional argument: 'values'

In [24]:
trk10 = trk.loc[trk["event_number"] < 10].copy()

In [29]:
a = trk10.groupby("event_number")["trk_z0"].apply(list)

In [31]:
b = trk10.groupby("event_number")["trk_pt"].apply(list)

In [61]:
trk10.groupby("event_number")[["trk_z0", "trk_pt"]]

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f9032baf850>

In [59]:
c = np.append(a, b)

In [56]:
len(a[0])

159

In [60]:
len(c[0])

159

In [40]:
b.apply(pd.Series).stack()

event_number     
0             0      2.623967
              1      3.435026
              2      3.774908
              3      2.641448
              4      2.320653
                       ...   
9             196    1.950458
              197    3.892709
              198    2.410859
              199    3.310079
              200    2.241018
Length: 1861, dtype: float64

In [38]:
np.stack([a, b])

array([[list([-0.87890625, 3.33984375, 4.921875, -0.46875, 4.98046875, -0.41015625, -0.05859375, 2.75390625, -3.6328125, -1.640625, -1.11328125, 5.5078125, -1.875, -1.11328125, -2.2265625, 6.09375, -0.52734375, -1.171875, 5.859375, -1.93359375, 5.625, -3.6328125, -2.98828125, 2.75390625, 2.8125, 1.34765625, 0.64453125, -5.15625, -1.9921875, -4.86328125, -1.69921875, 0.17578125, 0.99609375, 2.51953125, -5.33203125, -3.22265625, 4.21875, 7.8515625, -0.3515625, -0.46875, 0.17578125, 2.05078125, 2.63671875, -6.15234375, 2.16796875, 2.578125, -3.046875, 2.4609375, 2.34375, -0.87890625, 4.8046875, -5.21484375, 4.921875, 5.09765625, 4.16015625, -0.46875, -1.9921875, -6.328125, -2.4609375, -1.0546875, -4.16015625, -3.046875, -6.26953125, 5.15625, -3.515625, 4.921875, -3.10546875, -7.44140625, 3.046875, -1.875, -2.8125, -2.4609375, -2.98828125, -2.40234375, 3.22265625, 2.16796875, -6.97265625, -3.80859375, 6.50390625, 2.2265625, -1.640625, -5.0390625, -1.7578125, -4.1015625, -4.04296875, -0.937

In [30]:
a

event_number
0    [-0.87890625, 3.33984375, 4.921875, -0.46875, ...
1    [-2.578125, 2.8125, -4.04296875, -0.41015625, ...
2    [9.84375, -3.10546875, 1.40625, -1.69921875, 2...
3    [-2.51953125, -2.51953125, 2.9296875, 2.636718...
4    [-1.171875, 3.046875, 0.76171875, 9.66796875, ...
5    [-4.921875, -4.1015625, 1.58203125, -5.3320312...
6    [1.875, -0.703125, 7.79296875, -0.3515625, -0....
7    [1.0546875, -2.40234375, 1.0546875, -4.5117187...
8    [-4.39453125, 4.51171875, -0.46875, -5.0390625...
9    [10.37109375, 8.61328125, 0.3515625, -2.578125...
Name: trk_z0, dtype: object

In [22]:
trk.groupby("event_number")["trk_pt"].apply(list)

(222976,)