In [1]:
import sys, os
# Entry points for the five pipeline stages; each is called with the CONFIG path in the cells below.
from Scripts import train_metric_learning, run_metric_learning_inference, train_gnn, run_gnn_inference, build_track_candidates
import yaml

import torch
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f'{device} available')
import pandas as pd, numpy as np


from bokeh.io import output_notebook, show
# Render bokeh figures inline in the notebook.
output_notebook()
from bokeh.plotting import figure, row
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.palettes import viridis
from bokeh.models.annotations import Label

from Pipelines.Common_Tracking_Example.notebooks.ITk.Exploration.gnn_utils import infer_event
from Pipelines.TrackML_Example.notebooks.build_embedding import EmbeddingInferenceBuilder

from matplotlib import pyplot as plt

from pytorch_lightning import Trainer

from IPython.display import clear_output
# Path of the YAML file holding the settings for every pipeline stage.
CONFIG = 'pipeline_config.yaml'

INFO:Loading faiss with AVX2 support.
INFO:Successfully loaded faiss with AVX2 support.


cuda available


# 0. Download Data

In [None]:
!mkdir datasets
!wget https://portal.nersc.gov/cfs/m3443/dtmurnane/TrackML_Example/trackml_quickstart_dataset.tar.gz -O datasets/trackml_quickstart_dataset.tar.gz

In [10]:
%%capture
# Unpack the downloaded archive into datasets/ (the verbose tar listing is swallowed by %%capture),
# then delete the tarball.
!tar -xvf datasets/trackml_quickstart_dataset.tar.gz -C datasets;
!rm datasets/trackml_quickstart_dataset.tar.gz

### Pipeline configurations

The configurations for the entire pipeline are defined in `pipeline_config.yaml`.

In [3]:
# Parse the pipeline YAML once; the cells below look up their settings in `configs`.
with open(CONFIG, mode='r') as config_file:
    configs = yaml.load(config_file, Loader=yaml.FullLoader)

# 1. Train Metric Learning

## What it does
Broadly speaking, the first stage of our pipeline is embedding the space points into graphs, in a way that is efficient, i.e. we miss as few points on a graph as possible. We train an MLP to transform the input feature vector $\mathbf{u}_i$ of each space point into a point $\mathbf{v}_i$ in an N-dimensional latent space. The graph is then constructed by connecting the pairs of space points whose Euclidean distance in the latent space satisfies $$d_{ij} = \left| \mathbf{v}_i - \mathbf{v}_j \right| < r_{embedding}$$

## Training data
Let us take a look at the data before training. In this example pipeline, we have preprocessed the TrackML data into a more convenient form. We calculated directional information and summary statistics from the charge deposited in each space point, and appended them to its cylindrical coordinates. Let us load an example data file and inspect the content.

In [3]:
from Pipelines.TrackML_Example.LightningModules.Embedding.Models.layerless_embedding import LayerlessEmbedding

# Settings for the metric-learning (embedding) stage, taken from the pipeline config.
metric_learning_configs = configs['metric_learning_configs']

# Construct a fresh embedding model from the config (no checkpoint is loaded here).
# setup(stage='fit') is called before model.trainset is read below -- presumably it loads the datasets.
model = LayerlessEmbedding(metric_learning_configs)
model.setup(stage='fit')
clear_output()

# Print the first training event, then tabulate the input feature matrix the model derives from it.
print(model.trainset[0])
example_data = model.get_input_data(model.trainset[0])
example_data_df = pd.DataFrame(example_data.numpy())
example_data_df.head()

Data(x=[12083, 3], pid=[12083], modules=[12083], event_file='datasets/quickstart_example_1GeV/21045', hid=[12083], pt=[12083], weights=[10965], modulewise_true_edges=[2, 10965], layerwise_true_edges=[2, 14426], cell_data=[12083, 9], signal_true_edges=[2, 10965])


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,1.0,0.323412,2.091356,0.844154,0.05,0.05625,0.3,-2.091356,0.962261,0.051929,0.083736,-0.958
1,6.0,0.308704,0.884925,1.229181,0.1,0.28125,0.3,0.80096,1.972132,0.115441,0.50132,-0.198762
2,6.0,0.312759,0.793395,1.423718,0.05,0.3375,0.3,0.956851,2.072294,0.031444,0.612759,0.041935
3,7.0,0.34282,0.772962,1.282741,0.1,0.3375,0.3,0.928149,-0.127298,0.031484,-0.159847,-0.085926
4,3.0,0.162364,1.440542,0.844154,0.1,0.1125,0.3,0.34865,2.327071,0.07183,0.609832,-0.018804


The input data is obtained by concatenating the cell data and the cylindrical coordinates of each space point.

In [7]:
# Rebuild the model input by hand: cell features first, then the coordinates in `x`,
# joined column-wise so each row describes one space point.
first_event = model.trainset[0]
feature_blocks = (first_event.cell_data.numpy(), first_event.x.numpy())
input_data = pd.DataFrame(np.concatenate(feature_blocks, axis=1))
input_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,1.0,0.323412,2.091356,0.844154,0.05,0.05625,0.3,-2.091356,0.962261,0.051929,0.083736,-0.958
1,6.0,0.308704,0.884925,1.229181,0.1,0.28125,0.3,0.80096,1.972132,0.115441,0.50132,-0.198762
2,6.0,0.312759,0.793395,1.423718,0.05,0.3375,0.3,0.956851,2.072294,0.031444,0.612759,0.041935
3,7.0,0.34282,0.772962,1.282741,0.1,0.3375,0.3,0.928149,-0.127298,0.031484,-0.159847,-0.085926
4,3.0,0.162364,1.440542,0.844154,0.1,0.1125,0.3,0.34865,2.327071,0.07183,0.609832,-0.018804


In [8]:
# Run the model on the example features; no_grad because the output is only inspected here.
with torch.no_grad():
    latent = model(example_data)

# Tabulate the latent-space coordinates, one row per space point.
latent_df = pd.DataFrame(latent.numpy())
latent_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,-0.053065,0.197346,0.262761,0.384845,-0.557306,-0.200907,-0.290017,-0.038574,-0.108788,-0.086592,-0.162083,-0.508856
1,0.454764,-0.016605,0.018029,0.318143,0.442434,-0.060065,0.056463,0.570523,0.214677,0.245423,0.03391,-0.236358
2,0.481569,-0.00423,0.017104,0.316161,0.450244,-0.035352,0.024962,0.552444,0.203532,0.238946,0.019835,-0.24315
3,0.366191,-0.028428,0.035583,0.39073,0.393709,-0.08474,0.204175,0.483061,-0.027342,0.352563,0.07468,-0.378564
4,0.535915,0.100143,-0.021422,0.323075,0.432406,0.093268,-0.138293,0.518372,0.178168,-0.27123,-0.032768,-0.089656


## Train metric learning model

Finally we come to model training. By default, we train the MLP for 30 epochs, which takes approximately 15 minutes. Feel free to adjust the number of epochs in `pipeline_config.yaml`.

In [2]:
# Train the embedding model with the settings in CONFIG; keeps both the trainer and the trained model.
metric_learning_trainer, metric_learning_model = train_metric_learning(CONFIG)
clear_output()
# metrics.csv inside the trainer logger's log directory; read by the "Plot training metrics" cell.
log_file = os.path.join(metric_learning_trainer.logger.log_dir , 'metrics.csv')

If you have pretrained a model, load it by 

In [8]:
from Pipelines.TrackML_Example.LightningModules.Embedding.Models.layerless_embedding import LayerlessEmbedding
common_configs = configs["common_configs"]
# Checkpoint path: <artifact_directory>/metric_learning/<experiment_name>.ckpt
metric_learning_model = LayerlessEmbedding.load_from_checkpoint(os.path.join(common_configs["artifact_directory"], "metric_learning", common_configs["experiment_name"]+".ckpt")).to(device)
metric_learning_model.setup(stage="fit")
clear_output()
# change your log file path here
# NOTE(review): this is an absolute, user-specific path and it overwrites the `log_file` set by the
# training cell; edit it to point at your own metrics.csv, or skip this assignment after training here.
log_file = '/global/cfs/cdirs/m3443/usr/pmtuan/Tracking-ML-Exa.TrkX/Examples/TrackML_Quickstart/artifacts/metric_learning/trackml_quickstart_1/version_34/metrics.csv'

## Plot training metrics

In [3]:
# Build one row per epoch with training loss next to the validation metrics.
# `log_file` comes from the training cell or the pretrained-model cell above, e.g.
# log_file = os.path.join(metric_learning_trainer.logger.log_dir , 'metrics.csv')
metrics_raw = pd.read_csv(log_file, sep=',')

# Rows carrying a training loss. `.assign` returns a new frame, so the epoch shift no longer
# mutates a slice of the raw table (the original `-=` on a filtered slice triggers
# pandas' SettingWithCopyWarning).
# The shift by one aligns training rows with validation rows -- presumably the logger records
# train_loss against the following epoch; TODO confirm.
train_metrics = (
    metrics_raw.loc[metrics_raw['train_loss'].notna(), ['epoch', 'train_loss']]
    .assign(epoch=lambda frame: frame['epoch'] - 1)
)

# Rows carrying validation metrics.
val_metrics = metrics_raw.loc[
    metrics_raw['val_loss'].notna(), ['val_loss', 'eff', 'pur', 'current_lr', 'epoch']
]

# Keep only the epochs present in both tables.
metrics = pd.merge(left=train_metrics, right=val_metrics, how='inner', on='epoch')
metrics.head()

Unnamed: 0,epoch,train_loss,val_loss,eff,pur,current_lr
0,0,0.012788,0.009776,0.781078,0.009526,9e-05
1,1,0.009825,0.009612,0.897273,0.010943,0.00018
2,2,0.009695,0.009067,0.937847,0.01144,0.00027
3,3,0.009516,0.004072,0.889358,0.019715,0.00036
4,4,0.009254,0.00395,0.864001,0.046348,0.00045


In [13]:
# Shared data source and palette for the three panels.
source = ColumnDataSource(metrics)
cmap = viridis(3)

# Panel 1: training and validation loss on a log axis, one colour per curve.
p1 = figure(title='Training validation loss', x_axis_label='Epoch', y_axis_label='Loss', y_axis_type="log")
for curve_color, loss_column in zip(cmap, ('train_loss', 'val_loss')):
    p1.circle(x='epoch', y=loss_column, source=source, color=curve_color, legend_label=loss_column, name='circle')
    p1.line(x='epoch', y=loss_column, source=source, color=curve_color, legend_label=loss_column)
loss_hover = HoverTool(
    tooltips=[("epoch", "@epoch"), ('train_loss', '@train_loss'), ('val_loss', '@val_loss')],
    name='circle',
)
p1.add_tools(loss_hover)

# Panels 2 and 3: purity and efficiency on the validation set, each with its own fixed y-range.
p2 = figure(title='Purity on validation set', x_axis_label='Epoch', y_axis_label='Purity', y_range=(0,0.5))
p3 = figure(title='Efficiency on validation set', x_axis_label='Epoch', y_axis_label='Efficiency', y_range=(0.7, 1))
for panel, metric_column, legend in ((p2, 'pur', 'Purity'), (p3, 'eff', 'Efficiency')):
    panel.circle(x='epoch', y=metric_column, source=source, color=cmap[0], legend_label=legend)
    panel.line(x='epoch', y=metric_column, source=source, color=cmap[0], legend_label=legend)

show(row([p1, p2, p3]))

## How efficiency, purity and loss are calculated

Efficiency and purity are functions of the radius of the neighborhood around a particular point $P$ in the latent space from which k nearest neighbors $( Q_1, Q_2, ..., Q_k )$ are selected to form k edges to P: $( PQ_1, PQ_2, ..., PQ_k )$

### Get metrics by model.shared_evaluation

In [9]:
# First test event and the model, both on the selected device.
batch = metric_learning_model.testset[0].to(device)
metric_learning_model.to(device)
embedding_cfg = configs['metric_learning_configs']

# Under the hood, the test metrics are calculated by metric_learning_model.shared_evaluation.
with torch.no_grad():
    test_results = metric_learning_model.shared_evaluation(
        batch=batch,
        batch_idx=0,
        knn_radius=embedding_cfg['r_test'],
        knn_num=embedding_cfg['knn'],
    )

# Histogram of the predicted-edge distances on a log scale. Halving log10 turns the log of a
# squared distance into the log of the distance -- presumably `distances` holds squared values.
log_distances = torch.log10(test_results['distances']).cpu().numpy() / 2
histogram, edges = np.histogram(log_distances, bins=100, density=True)
p = figure(title='Histogram of log distance in latent space of predicted edges', x_axis_label=r'$$\log{d_{(P,Q)}}$$')
p.quad(bottom=0, top=histogram, left=edges[:-1], right=edges[1:])
show(p)

# The efficiency, purity and loss are also stored in test_results.
for metric_name, metric_key in (('Efficiency', 'eff'), ('Purity', 'pur'), ('Loss', 'loss')):
    print(f'{metric_name}: {test_results[metric_key]: .3f}')

Efficiency:  0.965
Purity:  0.360
Loss:  0.004


### Computing the efficiency and purity by hand (Optional)

Since efficiency and purity are important and recurring metrics throughout the pipeline, we detail here the steps to compute them. 

In [10]:
from Pipelines.TrackML_Example.LightningModules.Embedding.utils import build_edges

# get test input data from raw data
input_data = metric_learning_model.get_input_data(batch).to(device)

# compute latent space representation from input data.
# This is pure evaluation, so it runs under no_grad like the other evaluation cells;
# otherwise an autograd graph is built and kept alive for every tensor derived from `spatial`.
with torch.no_grad():
    spatial = metric_learning_model.to(device)(input_data)

# compute bidirectional edges from true edges, so that edges 12 and 21 are both in the list.
# this is because the metric learning model builds edge 12 by finding kNN of node 1 and builds edge 21 by finding kNN of node 2.
e_bidir = torch.cat([ batch.signal_true_edges, batch.signal_true_edges.flip(0) ], axis=-1)

# build the list of predicted edges from the latent space representation by using k-Nearest Neighbor
e_spatial = build_edges(spatial, spatial, indices=None, r_max=configs['metric_learning_configs']['r_test'], k_max=configs['metric_learning_configs']['knn'])

# compare to truth edges to classify predicted edges as either true or fake edges, store in y_cluster
e_spatial, y_cluster = metric_learning_model.get_truth(batch, e_spatial, e_bidir)

# get number of true edges in the test batch (both directions counted)
cluster_true = e_bidir.shape[1]
print( f'Number of true edges: {cluster_true}' )

# get number of edges predicted by the model that are true edges, or true positive
cluster_true_positive = y_cluster.sum()
print(f'Number of edges predicted by metric learning model that are true: {cluster_true_positive}')

# get number of edges predicted by the model
cluster_positive = len(e_spatial[0])
print(f'Number of edges predicted by metric learning model: {cluster_positive}')

# the efficiency is the proportion of true edges that are recognized by the model
eff = cluster_true_positive / cluster_true
print(f'Efficiency: {eff: .3f}')

# purity is the proportion of predicted edges that are true
pur = cluster_true_positive / cluster_positive
print(f'Purity: {pur: .3f}')


Number of true edges: 26778
Number of edges predicted by metric learning model that are true: 25854
Number of edges predicted by metric learning model: 71876
Efficiency:  0.965
Purity:  0.360


The efficiency and purity will be used to evaluate model performance throughout the pipeline

### Computing the loss by hand (Optional)

The loss for metric learning is computed as follows:
$$ l_{ij} = \begin{cases}
   d_{ij}^2, & y_{ij} = 1 \\
  \max(0, r^2 - d_{ij}^2), & y_{ij}=-1
\end{cases} $$
and $$\mathcal{L} =\frac{1}{N} \sum_{i\ne j} l_{ij}$$

The loss function penalizes a large distance between true pairs of hits, and penalizes fake pairs that lie inside the margin by the amount their squared distance falls short of the squared margin. It ignores the pairs that are fake and outside the margin.

In [39]:
spatial = spatial.to(device)
e_spatial = e_spatial.to(device)
margin = configs['metric_learning_configs']['r_test']
margin_sq = margin ** 2

# Hinge targets: +1 for true edges, -1 for fake ones (zeros in the truth vector become -1).
truth = y_cluster.float().to(device)
hinge = torch.where(truth == 0, torch.full_like(truth, -1), truth)

# Squared latent-space distance of every predicted edge.
neighbors = spatial.index_select(0, e_spatial[0])
reference = spatial.index_select(0, e_spatial[1])
d_sq = ((reference - neighbors) ** 2).sum(dim=-1)

# True edges contribute their squared distance directly: minimising it pulls the two hits together.
positive_d_sq = d_sq[hinge == 1]

# Fake edges should be pushed out to the margin and ignored beyond it. Capping their squared
# distance at margin^2 makes every fake edge already outside the margin contribute exactly 0.
negative_d_sq = torch.clamp(d_sq[hinge == -1], max=margin_sq)

# Average the per-edge terms over all predicted edges.
loss = torch.cat([positive_d_sq, margin_sq - negative_d_sq]).mean(dim=-1)
print(loss.cpu().detach().numpy())

# The same quantity is available directly as the hinge embedding loss.
easy_loss = torch.nn.functional.hinge_embedding_loss(d_sq, hinge, margin=margin_sq, reduction='mean')
print(easy_loss.cpu().detach().numpy())

0.0036259617
0.0036259622


## Evaluate efficiency, purity and loss as functions of neighbor radius 

Here we evaluate the model performance on the same sample test data as in the sections above, but look at how the efficiency, purity and loss change with the embedding radius.

In [6]:
# Radius singled out with a label on every panel; it is added to the scan grid explicitly
# because the regular grid below does not contain it.
REFERENCE_RADIUS = 0.1
METRIC_KEYS = ('eff', 'pur', 'loss')

all_radius = np.sort(np.append(np.arange(0.001, 0.15, 0.005), REFERENCE_RADIUS))
results = {key: [] for key in METRIC_KEYS}
results['radius'] = all_radius

metric_learning_model.to(device)
test_data = metric_learning_model.testset[0].to(device)

# Evaluate the same test event once per radius. The metric keys are read explicitly, so a
# missing metric raises a KeyError here instead of being silently skipped.
with torch.no_grad():
    for r in all_radius:
        test_results = metric_learning_model.shared_evaluation(
            test_data, 0, r, configs['metric_learning_configs']['knn'], log=False
        )
        for key in METRIC_KEYS:
            results[key].append( test_results[key].cpu().numpy() )
results = pd.DataFrame(results)

source = ColumnDataSource(results)
cmap = viridis(3)
titles = ['Efficiency', 'Purity', 'Loss']
figures = []
x = 'radius'
# Row of the scan closest to the reference radius (the same row for every metric).
reference_row = (results[x] - REFERENCE_RADIUS).abs().idxmin()
for idx, y in enumerate(METRIC_KEYS):
    figures.append( figure(title=titles[idx], x_axis_label=x, y_axis_label=y) )
    figures[-1].circle(y=y, x=x, source=source, color=cmap[0], legend_label=y)
    figures[-1].line(x=x, y=y, source=source, color=cmap[0], legend_label=y)
    # Annotate the value of this metric at the reference radius.
    y_val = results[y][reference_row].item()
    label = Label(
        x=REFERENCE_RADIUS, y=y_val, x_offset=10, y_offset=-10,
        text=f"@ radius = {REFERENCE_RADIUS}, \n{y} = {round(y_val, 3)}",
        border_line_color='black', border_line_alpha=1.0,
        background_fill_color='white', background_fill_alpha=0.8,
    )
    figures[-1].add_layout(label)

show(row(figures))

It is quite intuitive that the efficiency increases with the neighbor radius. The wider the radius, the more generous we are in admitting a neighboring point in the latent space as close enough to form an edge, and the more likely we are to admit all true edges. Of course, we pay the price of admitting increasing numbers of fake edges.

That the purity decreases with radius is also understandable as we admit more fake edges and true edges. 

## Plot example truth and predicted graphs

Reload model if necessary (if notebook crashes/times out...)

In [4]:
from Pipelines.TrackML_Example.LightningModules.Embedding.Models.layerless_embedding import LayerlessEmbedding
common_configs = configs["common_configs"]
# Restore the trained embedding model from <artifact_directory>/metric_learning/<experiment_name>.ckpt
# and move it to the selected device.
metric_learning_model = LayerlessEmbedding.load_from_checkpoint(os.path.join(common_configs["artifact_directory"], "metric_learning", common_configs["experiment_name"]+".ckpt")).to(device)
metric_learning_model.setup(stage="fit")
clear_output()

In [8]:
# from matplotlib import pyplot as plt
test_data = metric_learning_model.testset[0].to(device)
test_results = metric_learning_model.to(device).shared_evaluation(test_data.to(device), 0, 0.12, 1000, log=False)

p = figure(title='Truth graphs', x_axis_label='x', y_axis_label='y', height=800, width=800) 
q = figure(title='Predicted graphs', x_axis_label='x', y_axis_label='y', height=800, width=800) 

true_edges = test_results['truth_graph']
true_unique, true_lengths = test_data.pid[true_edges[0]].unique(return_counts=True)
pred_edges = test_results['preds']
pid = test_data.pid
r, phi, z = test_data.cpu().x.T
x, y = r * torch.cos(phi * np.pi), r * torch.sin(phi * np.pi)
cmap = viridis(11)
source = ColumnDataSource(dict(x=x.numpy(), y=y.numpy()))
p.circle(x='x', y='y', source=source, color=cmap[0], size=1, alpha=0.1)
q.circle(x='x', y='y', source=source, color=cmap[0], size=1, alpha=0.1)

for i, track in enumerate(true_unique[true_lengths >= 10][:10]):
    
    # Get true track plot
    track_true_edges = true_edges[:, pid[ true_edges[0]] == track ]
    X_edges, Y_edges = x[track_true_edges].numpy(), y[track_true_edges].numpy()
    X = np.concatenate(X_edges)
    Y = np.concatenate(Y_edges)

    p.circle(X, Y, color=cmap[i], size=5)
    p.multi_line(X_edges.T.tolist(), Y_edges.T.tolist())

    track_pred_edges = pred_edges[:, (pid[pred_edges] == track).any(0)]

    X_edges, Y_edges = x[track_pred_edges].numpy(), y[track_pred_edges].numpy()
    X = np.concatenate(X_edges)
    Y = np.concatenate(Y_edges)

    q.circle(X, Y, color=cmap[i], size=5)
    q.multi_line(X_edges.T.tolist(), Y_edges.T.tolist())
    
show(row([p,q]))

## Histogram of graph size

In [69]:
# Per-particle edge counts: truth edges of the particle, and predicted edges touching it.
all_true_edges = []
all_pred_edges = []
signal_true_edges = test_data.signal_true_edges
pred_edges = test_results['preds']
pid = test_data.pid
for track_id in pid.unique():
    # Truth edges starting on this particle whose two endpoints share a particle id.
    track_true = signal_true_edges[:, pid[ signal_true_edges[0]] == track_id ]
    same_particle = pid[ track_true[0]] == pid[ track_true[1]]
    # .item() yields a plain Python int and also works when the tensor lives on the GPU.
    all_true_edges.append( same_particle.sum().item() )

    # Predicted edges with at least one endpoint on this particle (true and fake alike).
    # The original also recomputed a same-particle mask here and never used it; it is dropped.
    track_pred = pred_edges[:, (pid[ pred_edges[0]] == track_id) | (pid[ pred_edges[1]] == track_id) ]
    all_pred_edges.append( track_pred.shape[1] )

true_counts, true_bin_edges = np.histogram(all_true_edges, bins=20, range=(0,20))
max_edge = 700
pred_counts, pred_bin_edges = np.histogram(all_pred_edges, bins=max_edge, range=(0,max_edge))

# One row per bin: lower edge, upper edge, count.
true_histogram = pd.DataFrame(
    dict(
        low = true_bin_edges[:-1],
        high = true_bin_edges[1:],
        true_hist = true_counts,
    )
)

pred_histogram = pd.DataFrame(
    dict(
        low = pred_bin_edges[:-1],
        high = pred_bin_edges[1:],
        pred_hist = pred_counts,
    )
)


p1 =  figure(title='Histogram of truth edge number', x_axis_label='Edges', y_axis_label='Count', height=800, width=800)
p2 =  figure(title='Histogram of predicted edge number', x_axis_label='Edges', y_axis_label='Count', height=800, width=800)
p1.quad(bottom=0, top='true_hist', left='low', right='high', source=ColumnDataSource(true_histogram))
p2.quad(bottom=0, top='pred_hist', left='low', right='high', source=ColumnDataSource(pred_histogram))
show(row([p1,p2]))

## How edges are constructed from latent space representation (optional)

To construct the graphs from the latent space representation, we calculate the L2 distance squared from each hit to its $k$ closest hits in latent space, then connect those whose distance is less than some threshold $r$: $$E = \{ ij, ||\mathbf{r}_i-\mathbf{r}_j||_2 \le r \}$$ 
The search algorithm is performed using **faiss** library, which efficiently computes the distances and finds the k-NN from the latent space representation of the test data.

In [None]:
import faiss
# faiss.knn_gpu is called with torch tensors below; the torch wrappers live in this submodule.
import faiss.contrib.torch_utils

print_stuff = False

res = faiss.StandardGpuResources()
k_nn = configs['metric_learning_configs']['knn']
r_test = configs['metric_learning_configs']['r_test']

# find the distance squared and the indices of the k-NN of each hit
d_sq, connect_to = faiss.knn_gpu(res=res, xq=spatial, xb=spatial, k=k_nn)

# connect_to holds, for each point, the indices of its k closest points in the latent space,
# including the point itself. This serves as the "CONNECTED_TO" index array.
if print_stuff:
    print(connect_to)

# create a "CONNECTED_FROM" index array with the same (n_points, k) shape: row i is filled with i,
# the index of the point whose neighbours are listed in connect_to[i]. It must be built before
# connect_to is filtered, because filtering flattens connect_to to 1-D.
connect_from = (
    torch.arange(connect_to.shape[0], device=connect_to.device)
    .unsqueeze(1)
    .expand(-1, connect_to.shape[1])
    .to(connect_to.dtype)
)
if print_stuff:
    print(connect_from)

# faiss returns SQUARED distances, so the neighbourhood radius has to be squared as well.
within_radius = d_sq <= r_test ** 2

# keep both endpoints of the edges whose latent-space distance is within the chosen radius
connect_from = connect_from[within_radius]
connect_to = connect_to[within_radius]

# stack the indices to get the edge list in the same format as the truth edge list
edge_list_with_loops = torch.stack([connect_from, connect_to])

# remove self loops (every point is its own nearest neighbour)
edge_list = edge_list_with_loops[:, edge_list_with_loops[0] != edge_list_with_loops[1] ]

if print_stuff:
    print(edge_list)

# 2. Construct input graphs to Graph Neural Network

This step performs model inference on the entire input datasets (train, validation and test), to obtain input graphs to the graph neural network.

In [82]:
# Step 2 of the pipeline: run the trained embedding model over the datasets to build the GNN input graphs.
graph_builder = run_metric_learning_inference(CONFIG)

INFO:-------------------- Step 2: Constructing graphs from metric learning model  --------------------
INFO:-------------------- a) Loading trained model --------------------
INFO:-------------------- b) Running inferencing --------------------
INFO:Loaded event: /global/cfs/cdirs/m3443/data/trackml-codalab/train_all/event000021000
INFO:Loaded event: /global/cfs/cdirs/m3443/data/trackml-codalab/train_all/event000021001
INFO:Loaded event: /global/cfs/cdirs/m3443/data/trackml-codalab/train_all/event000021002


Training finished, running inference to build graphs...


100%|██████████| 80/80 [00:30<00:00,  2.60it/s]
100%|██████████| 10/10 [00:03<00:00,  2.53it/s]
100%|██████████| 10/10 [00:03<00:00,  2.55it/s]


# 3. Train graph neural networks

## Training data

In [11]:
from Pipelines.TrackML_Example.LightningModules.GNN.Models.interaction_gnn import InteractionGNN
# Settings for the GNN stage, taken from the pipeline config.
gnn_configs = configs["gnn_configs"]
# Construct a fresh GNN from the config (no checkpoint is loaded here).
# setup(stage='fit') is called before gnn_model.trainset is read -- presumably it loads the datasets.
gnn_model = InteractionGNN(gnn_configs)
gnn_model.setup(stage='fit')
# Input features the GNN derives from the first training graph, followed by their shape.
example_data = gnn_model.get_input_data(gnn_model.trainset[0])
print(example_data)
print(example_data.shape)

tensor([[ 4.0000,  0.2354,  1.0794,  ...,  0.0325,  0.6710, -0.0366],
        [ 3.0000,  0.1674,  1.4405,  ...,  0.0728,  0.3927, -0.0288],
        [ 3.0000,  0.1788,  1.3031,  ...,  0.0718, -0.8892, -0.0222],
        ...,
        [ 1.0000,  1.0000,  0.0673,  ...,  0.8696, -0.4375, -2.9445],
        [ 1.0000,  1.0000,  0.0673,  ...,  0.8714, -0.4375, -2.9525],
        [ 1.0000,  1.0000,  0.0673,  ...,  0.8705, -0.4377, -2.9555]])
torch.Size([15198, 11])


## Train GNN

In [41]:
# Train the GNN with the settings in CONFIG; keeps both the trainer and the trained model.
gnn_trainer, gnn_model = train_gnn(CONFIG)
clear_output()
# metrics.csv inside the trainer logger's log directory; read by the "Plot learning curve" cell.
log_file = os.path.join(gnn_trainer.logger.log_dir , 'metrics.csv')

In [4]:
# if you have a pretrained model, load it by uncommenting and running the following lines
# (the checkpoint must be loaded with the GNN class, InteractionGNN, not the embedding class)
# from Pipelines.TrackML_Example.LightningModules.GNN.Models.interaction_gnn import InteractionGNN
# common_configs = configs["common_configs"]
# gnn_model = InteractionGNN.load_from_checkpoint(os.path.join(common_configs["artifact_directory"], "gnn", common_configs["experiment_name"]+".ckpt")).to(device)
# gnn_model.setup(stage="fit")
# clear_output()

# change your log file path here
# NOTE(review): this is an absolute, user-specific path and it overwrites the `log_file` set by the
# GNN training cell; edit it to point at your own metrics.csv, or skip this cell after training here.
log_file = '/global/cfs/cdirs/m3443/usr/pmtuan/Tracking-ML-Exa.TrkX/Examples/TrackML_Quickstart/artifacts/gnn/trackml_quickstart_1/version_34/metrics.csv'

## Plot learning curve

In [6]:
# Build one row per epoch with the GNN training loss next to its validation metrics.
metrics_raw = pd.read_csv(log_file, sep=',')

# Rows carrying a training loss. `.assign` returns a new frame, so the epoch shift no longer
# mutates a slice of the raw table (the original `-=` on a filtered slice triggers
# pandas' SettingWithCopyWarning).
# The shift by one aligns training rows with validation rows -- presumably the logger records
# train_loss against the following epoch; TODO confirm.
train_metrics = (
    metrics_raw.loc[metrics_raw['train_loss'].notna(), ['epoch', 'train_loss']]
    .assign(epoch=lambda frame: frame['epoch'] - 1)
)

# Rows carrying validation metrics, including the AUC logged for the GNN.
val_metrics = metrics_raw.loc[
    metrics_raw['val_loss'].notna(), ['val_loss', 'eff', 'pur', 'current_lr', 'epoch', 'auc']
]

# Keep only the epochs present in both tables.
metrics = pd.merge(left=train_metrics, right=val_metrics, how='inner', on='epoch')
metrics.head()

Unnamed: 0,epoch,train_loss,val_loss,eff,pur,current_lr,auc
0,0,0.80385,0.823251,0.647557,0.529028,0.0002,0.718894
1,1,0.779369,0.823775,0.652259,0.52756,0.0004,0.719257
2,2,0.774875,0.820854,0.650706,0.531685,0.0006,0.722497
3,3,0.765334,0.822643,0.718655,0.5046,0.0008,0.731244
4,4,0.737088,0.773427,0.655642,0.569548,0.001,0.753819


In [11]:
# Shared data source and palette for the four panels.
source = ColumnDataSource(metrics)
cmap = viridis(3)


def _add_curve(fig, column, legend, color, **circle_kwargs):
    """Draw `column` against epoch on `fig` as markers plus a connecting line."""
    fig.circle(x='epoch', y=column, source=source, color=color, legend_label=legend, **circle_kwargs)
    fig.line(x='epoch', y=column, source=source, color=color, legend_label=legend)


# Panel 1: training and validation loss on a log axis, one colour per curve.
p1 = figure(title='Training validation loss', x_axis_label='Epoch', y_axis_label='Loss', y_axis_type="log")
for curve_color, loss_column in zip(cmap, ('train_loss', 'val_loss')):
    _add_curve(p1, loss_column, loss_column, curve_color, name='circle')
p1.add_tools(
    HoverTool(
        tooltips=[("epoch", "@epoch"), ('train_loss', '@train_loss'), ('val_loss', '@val_loss')],
        name='circle',
    )
)

# Panels 2-4: one validation metric each, with its own fixed y-range.
validation_panels = []
for panel_title, metric_column, legend, value_range in (
    ('Purity on validation set', 'pur', 'Purity', (0.4,1)),
    ('Efficiency on validation set', 'eff', 'Efficiency', (0.4, 1)),
    ('AUC on validation set', 'auc', 'AUC', (0.7, 1)),
):
    panel = figure(title=panel_title, x_axis_label='Epoch', y_axis_label=legend, y_range=value_range)
    _add_curve(panel, metric_column, legend, cmap[0])
    validation_panels.append(panel)
p2, p3, p4 = validation_panels

show(row([p1, p2, p3, p4]))

# Step 4: GNN inference 

In [12]:
# Step 4 of the pipeline: score the graph edges with the trained GNN, using the settings in CONFIG.
run_gnn_inference(CONFIG)

INFO:--------------------- Step 4: Scoring graph edges using GNN  ---------------------
INFO:---------------------------- a) Loading trained model ----------------------------
INFO:----------------------------- b) Running inferencing -----------------------------


Training finished, running inference to filter graphs...
Building train


100%|██████████| 80/80 [00:00<00:00, 40920.04it/s]


Building val


100%|██████████| 10/10 [00:00<00:00, 26329.59it/s]


Building test


100%|██████████| 10/10 [00:00<00:00, 13769.88it/s]


# Step 5: Build track candidates from GNN

In [13]:
# Step 5 of the pipeline: build track candidates from the GNN-scored graphs, using the settings in CONFIG.
build_track_candidates(CONFIG)

INFO:-----------  Step 5: Building track candidates from the scored graph  -----------
INFO:---------------------------- a) Loading scored graphs ----------------------------
INFO:---------------------------- b) Labelling graph nodes ----------------------------


  0%|          | 0/114 [00:00<?, ?it/s]