# End-to-End Architecture Performance Comparisons

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# System imports
import os
import sys
from time import time as tt
import importlib

# External imports
import matplotlib.pyplot as plt
import scipy as sp
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from torch_geometric.data import DataLoader

from itertools import chain
from random import shuffle, sample
from scipy.optimize import root_scalar as root

from torch.nn import Linear
import torch.nn.functional as F
from torch_cluster import knn_graph, radius_graph
import trackml.dataset
import torch_geometric
from itertools import permutations
import itertools
from sklearn import metrics
import pytorch_lightning as pl
from pytorch_lightning import LightningModule, Trainer, LightningDataModule
from pytorch_lightning.loggers import WandbLogger
from torch.utils.checkpoint import checkpoint

from argparse import Namespace
from trackml.score import score_event
from sklearn.cluster import DBSCAN

# Limit CPU usage on Jupyter
os.environ['OMP_NUM_THREADS'] = '4'

# Pick up local packages
sys.path.append('..')
sys.path.append('/global/homes/d/danieltm/ExaTrkX/end-to-end')
from LightningModules.GNN.Models.checkpoint_agnn import CheckpointedResAGNN
# from LightningModules.Embedding.utils import get_best_run, build_edges, res, graph_intersection

# Local imports
from prepare_utils import *
from performance_utils import *
from toy_utils import *
from models import *
from trainers import *
from lightning_modules.filter_scanner import Filter_Model
%matplotlib inline


# Get rid of RuntimeWarnings, gross
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
import wandb
import faiss
props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
torch_seed = 0

# GNN Testing

## Testing

In [8]:
run_label = "rm3047br"
wandb_dir = "/global/cscratch1/sd/danieltm/ExaTrkX/wandb_data"
best_run_path = get_best_run(run_label,wandb_dir)

In [16]:
chkpnt = torch.load(best_run_path)
model = CheckpointedResAGNN.load_from_checkpoint(best_run_path)
model = model.to(device)

TypeError: __init__() missing 1 required positional argument: 'hparams'

In [23]:
model.eval()
with torch.no_grad():
    edge_total_positive, edge_total_true, edge_total_true_positive, edge_total_true_ground = 0, 0, 0, 0
    for i, batch in enumerate(model.val_dataloader().dataset[:5]):
        data = batch.to(device)

        output = (model(torch.cat([data.cell_data, data.x], axis=-1), data.edge_index).squeeze()
                  if ('ci' in model.hparams["regime"])
                  else model(data.x, data.edge_index).squeeze())

        #Edge filter performance
        preds = F.sigmoid(output) > 0.9 #Maybe send to CPU??
        edge_positive = preds.sum().float()

        if ('pid' in model.hparams["regime"]):
            y_pid = data.pid[data.edge_index[0]] == data.pid[data.edge_index[1]]
            edge_true = y_pid.sum().float()
            edge_true_positive = (y_pid & preds).sum().float()
        else:
            edge_true = data.y.sum()
            edge_true_ground = data.layerless_true_edges.shape[1]
            edge_true_positive = (data.y.bool() & preds).sum().float()
            
        edge_total_positive += edge_positive
        edge_total_true_positive += edge_true_positive
        edge_total_true += edge_true
        edge_total_true_ground += edge_true_ground
        
        print(i)

    edge_eff = (edge_total_true_positive / max(edge_total_true, 1))
    edge_ground_eff = (edge_total_true_positive / max(edge_total_true_ground, 1))
    edge_pur = (edge_total_true_positive / max(edge_total_positive, 1))

0
1
2
3
4


In [24]:
print("Eff:", edge_eff, "Pur:", edge_pur, "Ground eff:", edge_ground_eff)

Eff: tensor(0.8070, device='cuda:0') Pur: tensor(0.9520, device='cuda:0') Ground eff: tensor(0.6865, device='cuda:0')


## Truth Debugging

In [29]:
sample = model.val_dataloader().dataset[0]

In [51]:
sample[0]

Data(cell_data=[71081, 9], edge_index=[2, 987461], event_file=/global/cscratch1/sd/danieltm/ExaTrkX/trackml/train_all/event000001193, hid=[71081], layerless_true_edges=[2, 85024], layers=[71081], pid=[71081], x=[71081, 3], y=[987461], y_pid=[987461])

In [219]:
sample.layerless_true_edges

tensor([[ 2534,  7691, 13022,  ..., 79374, 89338, 93484],
        [ 7691, 13022, 19078,  ..., 89338, 93484, 96979]])

In [220]:
sample.edge_index

tensor([[     0,    323,    527,  ..., 103304, 103122, 103304],
        [   287,      0,      0,  ..., 103120, 103304, 103131]])

In [221]:
sample.y.sum()

tensor(117942.)

In [222]:
sample.edge_index[:,sample.y.bool()].shape

torch.Size([2, 117942])

In [223]:
sample.layerless_true_edges.shape

torch.Size([2, 123429])

In [224]:
sample.edge_index[:,sample.y.bool()].shape[1]/sample.layerless_true_edges.shape[1]

0.9555452932455095

## TrackML Debugging

### Ground Truth Level

In [205]:
sample = torch.load("/global/cscratch1/sd/danieltm/ExaTrkX/trackml_processed/filter_processed/0_pt_cut_endcaps_connected_high_eff/train/1000", map_location="cpu")

In [206]:
event_file = '/global/cscratch1/sd/danieltm/ExaTrkX/trackml/train_all/event000001000'
hits, particles, truth = trackml.dataset.load_event(
        event_file, parts=['hits', 'particles', 'truth'])

In [207]:
# Remove noise and assign track_id
hits = hits.merge(truth[['hit_id', 'weight', 'particle_id']], on='hit_id')
hits = hits[hits.particle_id != 0]
hids = sample.hid.cpu().numpy()

In [208]:
truth_graph = (sample.layerless_true_edges).cpu().numpy()
truth_graph = hids[truth_graph]
truth_graph = np.hstack([truth_graph, truth_graph[::-1]])

In [209]:
truth_graph_sp = sp.sparse.coo_matrix(([0.1]*truth_graph.shape[1], (truth_graph[0], truth_graph[1])), shape=(truth_graph.max()+1, truth_graph.max()+1))

In [210]:
clustering = DBSCAN(eps=0.1, metric="precomputed", min_samples=1).fit_predict(truth_graph_sp)

In [211]:
track_list = np.vstack([np.unique(truth_graph), clustering[np.unique(truth_graph)]])
track_list = pd.DataFrame(track_list.T)
track_list.columns = ["hit_id", "track_id"]
score_event(hits, track_list)

1.0000000080094655

### Filter Truth Level

In [212]:
sample = torch.load("/global/cscratch1/sd/danieltm/ExaTrkX/trackml_processed/filter_processed/0_pt_cut_endcaps_connected_high_eff/train/1000", map_location="cpu")

In [213]:
sample

Data(cell_data=[103305, 9], edge_index=[2, 1831684], event_file=/global/cscratch1/sd/danieltm/ExaTrkX/trackml/train_all/event000001000, hid=[103305], layerless_true_edges=[2, 123429], layers=[103305], pid=[103305], x=[103305, 3], y=[1831684], y_pid=[1831684])

In [214]:
truth_graph = (sample.edge_index[:,sample.y_pid.bool()]).cpu().numpy()
truth_graph = hids[truth_graph]
truth_graph = np.hstack([truth_graph, truth_graph[::-1]])

In [215]:
truth_graph_sp = sp.sparse.coo_matrix(([0.1]*truth_graph.shape[1], (truth_graph[0], truth_graph[1])), shape=(truth_graph.max()+1, truth_graph.max()+1))

From clustering

In [216]:
clustering = DBSCAN(eps=0.1, metric="precomputed", min_samples=1).fit_predict(truth_graph_sp)

In [217]:
track_list = np.vstack([np.unique(truth_graph), clustering[np.unique(truth_graph)]])
track_list = pd.DataFrame(track_list.T)
track_list.columns = ["hit_id", "track_id"]
score_event(hits, track_list)

0.9349574380163364

From ground truth, IF they have a true edge in the filtered set

In [228]:
truth_graph = (sample.edge_index[:,sample.y_pid.bool()]).cpu().numpy()

In [229]:
track_list = np.vstack([hids[np.unique(truth_graph)], sample.pid[np.unique(truth_graph)]])
track_list = pd.DataFrame(track_list.T)
track_list.columns = ["hit_id", "track_id"]
score_event(hits, track_list)

0.9592538354264974

### Noise Robustness

In [148]:
sample = torch.load("/global/cscratch1/sd/danieltm/ExaTrkX/trackml_processed/filter_processed/0_pt_cut_endcaps_connected_high_eff/train/1000", map_location="cpu")

In [149]:
event_file = '/global/cscratch1/sd/danieltm/ExaTrkX/trackml/train_all/event000001000'
hits, particles, truth = trackml.dataset.load_event(
        event_file, parts=['hits', 'particles', 'truth'])

In [150]:
# Assign track_id
hits = hits.merge(truth[['hit_id', 'weight', 'particle_id']], on='hit_id')
hids = sample.hid.cpu().numpy()

In [151]:
truth_graph = (sample.layerless_true_edges).cpu().numpy()
truth_graph = hids[truth_graph]
truth_graph = np.hstack([truth_graph, truth_graph[::-1]])

In [152]:
truth_graph_sp = sp.sparse.coo_matrix(([0.1]*truth_graph.shape[1], (truth_graph[0], truth_graph[1])), shape=(truth_graph.max()+1, truth_graph.max()+1))

In [153]:
clustering = DBSCAN(eps=0.1, metric="precomputed", min_samples=1).fit_predict(truth_graph_sp)

In [154]:
track_list = np.vstack([np.unique(truth_graph), clustering[np.unique(truth_graph)]])
track_list = pd.DataFrame(track_list.T)
track_list.columns = ["hit_id", "track_id"]
score_event(hits, track_list)

1.0000000080094655

In [175]:
noise_idx = hits.hit_id[~hits.hit_id.isin(track_list.hit_id)]

In [170]:
random_noise = np.random.choice(hits.particle_id, (len(noise_idx),))

In [179]:
noise_idx = pd.DataFrame(noise_idx).assign(track_id=random_noise)

In [184]:
noise_joined = pd.concat([track_list, noise_idx])

In [185]:
score_event(hits, noise_joined)

1.0000000080094655

In [186]:
noise_joined

Unnamed: 0,hit_id,track_id
0,2,2
1,4,4
2,5,5
3,6,6
4,7,7
...,...,...
120908,120909,0
120912,120913,801644513243168768
120925,120926,589977461060534272
120928,120929,396324463789998080


# Graph Analysis

In [3]:
import cugraph
import cudf
import pandas as pd
import cupy as cp

In [4]:
import pandas

In [5]:
pandas.__version__

'1.1.4'

In [40]:
data = torch.load("/global/cscratch1/sd/danieltm/ExaTrkX/trackml-codalab/filter_processed/0_pt_cut_endcaps_unweighted/train/1045", map_location="cuda")

In [41]:
event_file = "/global/cscratch1/sd/danieltm/ExaTrkX/trackml-codalab/train_all/event000021045"
hits, particles, truth = trackml.dataset.load_event(
        event_file, parts=['hits', 'particles', 'truth'])

In [42]:
# Assign track_id
hits = hits.merge(truth[['hit_id', 'weight', 'particle_id']], on='hit_id')
hids = data.hid.cpu().numpy()

### DBSCAN

In [182]:
%%time
truth_graph = (data.layerless_true_edges).cpu().numpy()
truth_graph = hids[truth_graph]
truth_graph = np.hstack([truth_graph, truth_graph[::-1]])

truth_graph_sp = sp.sparse.coo_matrix(([0.1]*truth_graph.shape[1], (truth_graph[0], truth_graph[1])), shape=(truth_graph.max()+1, truth_graph.max()+1))

clustering = DBSCAN(eps=0.1, metric="precomputed", min_samples=1).fit_predict(truth_graph_sp)

track_list = np.vstack([np.unique(truth_graph), clustering[np.unique(truth_graph)]])
track_list = pd.DataFrame(track_list.T)




CPU times: user 3.07 s, sys: 69.3 ms, total: 3.14 s
Wall time: 3.11 s


In [None]:
track_list.columns = ["hit_id", "track_id"]
score_event(hits, track_list)

In [160]:
%%time
clustering = DBSCAN(eps=0.1, metric="precomputed", min_samples=1).fit_predict(truth_graph_sp)



CPU times: user 2.98 s, sys: 21.8 ms, total: 3 s
Wall time: 2.99 s


In [164]:
%%time
clustering = DBSCAN(eps=0.1, metric="precomputed", min_samples=1, n_jobs=2).fit_predict(truth_graph_sp)



CPU times: user 3 s, sys: 63.8 ms, total: 3.07 s
Wall time: 3.03 s


### CuGraph

In [43]:
data

Data(cell_data=[85715, 9], e_radius=[2, 1152717], event_file="/global/cscratch1/sd/danieltm/ExaTrkX/trackml-codalab/train_all/event000021045", hid=[85715], layerless_true_edges=[2, 101701], layers=[85715], pid=[85715], pt=[85715], true_weights=[101701], weights=[1152717], x=[85715, 3], y=[1152717], y_pid=[1152717])

In [44]:
%%time
truth_graph = (data.layerless_true_edges).cpu().numpy()
hids = data.hid.cpu().numpy()
truth_graph = hids[truth_graph]
truth_df = cudf.DataFrame(truth_graph.T)

G = cugraph.Graph()
G.from_cudf_edgelist(truth_df, source=0, destination=1, edge_attr=None)
labels = cugraph.components.connectivity.weakly_connected_components(G)
track_list = labels.to_pandas()
track_list.columns = ["track_id", "hit_id"]

CPU times: user 57 ms, sys: 8.15 ms, total: 65.1 ms
Wall time: 64.7 ms


In [31]:
score_event(hits, track_list)

0.9998926292950182