In [1]:
# Entry points for the four pipeline stages run further down in this notebook.
import sys, os
from Scripts import train_metric_learning, run_metric_learning_inference, train_gnn, run_gnn_inference
import yaml

import torch 
# Device string reused by later cells when moving models and data.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f'{device} available')
import pandas as pd, numpy as np

from bokeh.io import output_notebook, show
# Configure bokeh to display figures inside the notebook.
output_notebook()
from bokeh.plotting import figure, row
from bokeh.models import ColumnDataSource
from bokeh.palettes import viridis

from Pipelines.Common_Tracking_Example.notebooks.ITk.Exploration.gnn_utils import infer_event
from Pipelines.TrackML_Example.notebooks.build_embedding import EmbeddingInferenceBuilder


from IPython.display import clear_output
# Path of the YAML file that is loaded below and passed to every pipeline stage.
CONFIG = 'pipeline_config.yaml'

INFO:Loading faiss with AVX2 support.
INFO:Successfully loaded faiss with AVX2 support.


cuda available


# 0. Download Data

In [None]:
!mkdir datasets
!wget https://portal.nersc.gov/cfs/m3443/dtmurnane/TrackML_Example/trackml_quickstart_dataset.tar.gz -O datasets/trackml_quickstart_dataset.tar.gz
!tar -xvf datasets/trackml_quickstart_dataset.tar.gz -C datasets

### Pipeline configurations

The configurations for the entire pipeline are defined in `pipeline_config.yaml`.

In [3]:
# Parse the pipeline configuration file, then echo it back as YAML so the
# active settings are visible in the notebook.
with open(CONFIG, 'r') as config_stream:
    configs = yaml.load(config_stream, Loader=yaml.FullLoader)

config_dump = yaml.dump(configs)
print(config_dump)

common_configs:
  artifact_directory: artifacts
  experiment_name: trackml_quickstart_1
  gpus: 1
  max_epochs: 20
gnn_configs:
  aggregation: sum_max
  cell_channels: 8
  datatype_names:
  - train
  - val
  - test
  datatype_split:
  - 80
  - 10
  - 10
  edge_cut: 0.5
  factor: 0.3
  hidden: 128
  hidden_activation: SiLU
  input_dir: datasets/quickstart_metric_learning_processed
  layernorm: true
  lr: 0.001
  mask_background: true
  n_graph_iters: 8
  nb_edge_layer: 3
  nb_node_layer: 3
  noise: false
  output_dir: datasets/quickstart_gnn_processed
  patience: 10
  pt_background_min: 1.0
  pt_signal_min: 1.0
  regime:
  - - pid
  spatial_channels: 3
  truth_key: pid_signal
  warmup: 5
  weight: 2
metric_learning_configs:
  activation: Tanh
  cell_channels: 9
  emb_dim: 12
  emb_hidden: 1024
  factor: 0.5
  input_dir: datasets/quickstart_example_1GeV
  knn: 100
  lr: 0.001
  margin: 0.1
  nb_layer: 4
  output_dir: datasets/quickstart_metric_learning_processed
  patience: 30
  points_p

In [35]:
import sys
import os
import yaml
import argparse
import logging
# Emit INFO-level (and above) records formatted as "LEVEL:message".
logging.basicConfig(level=logging.INFO, format='%(levelname)s:%(message)s')

from pytorch_lightning import Trainer
from pytorch_lightning.loggers import CSVLogger
import torch

# Add the directory two levels up to the import path; presumably this is what
# makes the `Pipelines` package below importable — TODO confirm repo layout.
sys.path.append("../../")
# sys.path.append('./')
from Pipelines.TrackML_Example.LightningModules.Embedding.Models.layerless_embedding import LayerlessEmbedding
from utils import headline

from pytorch_lightning import Callback

class DeviceCallback(Callback):
    """Debugging callback that prints where the module's parameters live.

    NOTE(review): newer pytorch_lightning releases name the per-batch training
    hook ``on_train_batch_start``; confirm that ``on_batch_start`` is still
    invoked by the installed version.
    """

    def on_batch_start(self, trainer, pl_module):
        """Print the device type of the first parameter of ``pl_module``."""
        first_parameter = next(pl_module.parameters())
        device_type = first_parameter.device.type
        print(device_type)

def train(config_file="pipeline_config.yaml"):
    """Train the metric learning embedding model and save a checkpoint.

    Parameters
    ----------
    config_file : str
        Path to the pipeline YAML file. Two sections are read:
        ``common_configs`` (keys ``artifact_directory``, ``experiment_name``,
        ``max_epochs`` and, optionally, ``gpus``) and
        ``metric_learning_configs`` (passed unchanged to ``LayerlessEmbedding``).

    Returns
    -------
    tuple
        ``(trainer, model)``: the ``Trainer`` used for fitting and the
        ``LayerlessEmbedding`` instance that was trained.

    Notes
    -----
    The checkpoint is written to
    ``<artifact_directory>/metric_learning/<experiment_name>.ckpt``.
    """
    logging.info(headline("Step 1: Running metric learning training"))

    # NOTE(review): FullLoader can build more than plain data types; switch to
    # yaml.safe_load if the config file may come from an untrusted source.
    with open(config_file) as file:
        all_configs = yaml.load(file, Loader=yaml.FullLoader)

    common_configs = all_configs["common_configs"]
    metric_learning_configs = all_configs["metric_learning_configs"]

    # Decide on the hardware once, locally, instead of relying on a `device`
    # global defined in another notebook cell. GPUs are only requested when
    # CUDA is actually available; the count comes from the config (default 1).
    requested_gpus = common_configs.get("gpus", 1) if torch.cuda.is_available() else 0
    use_gpu = bool(requested_gpus)

    logging.info(headline("a) Initialising model"))

    model = LayerlessEmbedding(metric_learning_configs)
    model.to("cuda" if use_gpu else "cpu")

    logging.info(headline("b) Running training" ))

    save_directory = os.path.join(common_configs["artifact_directory"], "metric_learning")
    logger = CSVLogger(save_directory, name=common_configs["experiment_name"])

    # Previously `gpus=1` and `auto_select_gpus=True` were passed even on
    # CPU-only machines, contradicting the accelerator fallback.
    # Add `callbacks=[DeviceCallback()]` to print the device for every batch.
    trainer = Trainer(
        accelerator="gpu" if use_gpu else "cpu",
        auto_select_gpus=use_gpu,
        gpus=requested_gpus if use_gpu else None,
        max_epochs=common_configs["max_epochs"],
        logger=logger,
    )

    logging.info(headline(f"Training model on {model.device}"))

    trainer.fit(model)

    logging.info(headline(f"Trained model on {model.device}"))

    logging.info(headline("c) Saving model") )

    os.makedirs(save_directory, exist_ok=True)
    trainer.save_checkpoint(os.path.join(save_directory, common_configs["experiment_name"]+".ckpt"))

    return trainer, model

# 1. Train Metric Learning

## What it does
Broadly speaking, the first stage of our pipeline is embedding the space points on to graphs, in a way that is efficient, i.e. we miss as few points on a graph as possible. We train an MLP to transform the input feature vector of each space point $\mathbf{u}_i$ into an N-dimensional latent space vector $\mathbf{v}_i$. The graph is then constructed by connecting every pair of space points whose Euclidean distance in the latent space satisfies $$d_{ij} = \left| \mathbf{v}_i - \mathbf{v}_j \right| < r_{embedding}$$

## Training data
Let us take a look at the data before training. In this example pipeline, we have preprocessed the TrackML data into a more convenient form. We calculated directional information and summary statistics from the charge deposited in each space point, and appended them to its cylindrical coordinates. Let us load an example data file and inspect the content.

In [4]:
from Pipelines.TrackML_Example.LightningModules.Embedding.Models.layerless_embedding import LayerlessEmbedding

# Configuration section handed to the embedding model.
metric_learning_configs = configs['metric_learning_configs']

model = LayerlessEmbedding(metric_learning_configs)
# `model.trainset` is read right after this call, so setup() is assumed to be
# what populates it — TODO confirm in LayerlessEmbedding.
model.setup(stage='fit')
# Drop whatever the steps above printed so only the inspection output remains.
clear_output()

# Show the first training event, then the feature matrix the model builds from it.
print(model.trainset[0])
example_data = model.get_input_data(model.trainset[0])
example_data_df = pd.DataFrame(example_data.numpy())
example_data_df.head()

Data(x=[12083, 3], pid=[12083], modules=[12083], event_file='/global/cfs/cdirs/m3443/data/trackml-codalab/train_all/event000021045', hid=[12083], pt=[12083], weights=[10965], modulewise_true_edges=[2, 10965], layerwise_true_edges=[2, 14426], cell_data=[12083, 9], signal_true_edges=[2, 10965])


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,1.0,0.323412,2.091356,0.844154,0.05,0.05625,0.3,-2.091356,0.962261,0.051929,0.083736,-0.958
1,6.0,0.308704,0.884925,1.229181,0.1,0.28125,0.3,0.80096,1.972132,0.115441,0.50132,-0.198762
2,6.0,0.312759,0.793395,1.423718,0.05,0.3375,0.3,0.956851,2.072294,0.031444,0.612759,0.041935
3,7.0,0.34282,0.772962,1.282741,0.1,0.3375,0.3,0.928149,-0.127298,0.031484,-0.159847,-0.085926
4,3.0,0.162364,1.440542,0.844154,0.1,0.1125,0.3,0.34865,2.327071,0.07183,0.609832,-0.018804


The input data is obtained by concatenating the cell data and the cylindrical coordinates of each space point.

In [5]:
# Rebuild the model input by hand for the first training event: the cell
# features come first, followed by the spatial coordinates, joined column-wise.
cell_features = model.trainset[0].cell_data.numpy()
coordinates = model.trainset[0].x.numpy()
input_data = pd.DataFrame(np.concatenate((cell_features, coordinates), axis=1))
input_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,1.0,0.323412,2.091356,0.844154,0.05,0.05625,0.3,-2.091356,0.962261,0.051929,0.083736,-0.958
1,6.0,0.308704,0.884925,1.229181,0.1,0.28125,0.3,0.80096,1.972132,0.115441,0.50132,-0.198762
2,6.0,0.312759,0.793395,1.423718,0.05,0.3375,0.3,0.956851,2.072294,0.031444,0.612759,0.041935
3,7.0,0.34282,0.772962,1.282741,0.1,0.3375,0.3,0.928149,-0.127298,0.031484,-0.159847,-0.085926
4,3.0,0.162364,1.440542,0.844154,0.1,0.1125,0.3,0.34865,2.327071,0.07183,0.609832,-0.018804


In [6]:
# Forward pass through the embedding model with gradient tracking disabled;
# the result is the latent representation of the example input.
with torch.no_grad():
    latent = model(example_data)

# Tabulate the latent vectors and show the first rows.
latent_df = pd.DataFrame(latent.numpy())
latent_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,-0.758347,0.015815,-0.216978,0.001863,-0.198355,-0.032176,-0.080935,-0.195768,0.440127,-0.010136,0.259614,0.176371
1,0.224708,0.057013,-0.554482,-0.22801,-0.203369,-0.378222,0.414617,-0.000834,-0.062257,-0.112517,0.423644,-0.18571
2,0.260113,0.091689,-0.560551,-0.232449,-0.180301,-0.381621,0.415849,0.008346,-0.063875,-0.105468,0.40631,-0.155725
3,0.361378,-0.026165,-0.49386,-0.010938,-0.436425,-0.254425,0.516231,0.100032,0.035344,-0.098932,0.249549,-0.140332
4,-0.103032,0.337436,-0.528402,-0.27131,-0.101918,-0.412171,0.050567,-0.036743,-0.004053,-0.002928,0.563244,0.145793


## Train model

Finally, we come to model training. By default, we train the MLP for 20 epochs, which takes approximately 15 minutes. Feel free to adjust the number of epochs in `pipeline_config.yaml`.

In [4]:
# Run the metric learning training stage; the call yields a trainer and a model.
# NOTE(review): `metric_leraning_trainer` is misspelled ("leraning"). The cell
# that reads metrics.csv uses the same spelling, so rename both together.
metric_leraning_trainer, metric_learning_model = train_metric_learning(CONFIG)
# Hide the training log output.
clear_output()

## Plot training metrics

In [5]:
# Read the metrics file written by the trainer's logger.
log_file = os.path.join(metric_leraning_trainer.logger.log_dir , 'metrics.csv')
metrics_raw = pd.read_csv(log_file, sep=',')

# Rows carrying a train_loss and rows carrying a val_loss are selected
# separately. `.loc` with an explicit column list plus `.assign` replaces the
# former chained indexing followed by an in-place `-= 1`, which wrote to a
# derived frame and is the pattern pandas reports as SettingWithCopyWarning.
# The training epoch index is shifted down by one before merging, presumably
# to line it up with the validation rows — TODO confirm against the logger.
train_metrics = (
    metrics_raw
    .loc[metrics_raw['train_loss'].notna(), ['epoch', 'train_loss']]
    .assign(epoch=lambda frame: frame['epoch'] - 1)
)
val_metrics = metrics_raw.loc[
    metrics_raw['val_loss'].notna(),
    ['val_loss', 'eff', 'pur', 'current_lr', 'epoch'],
]

# One row per epoch that has both a training and a validation entry.
metrics = pd.merge(left=train_metrics, right=val_metrics, how='inner', on='epoch')
metrics.head()

Unnamed: 0,epoch,train_loss,val_loss,eff,pur,current_lr
0,0,0.012257,0.009874,0.745082,0.009087,0.0002
1,1,0.009993,0.009858,0.821396,0.010018,0.0004
2,2,0.009968,0.009678,0.878741,0.010717,0.0006
3,3,0.010037,0.009856,0.783437,0.009555,0.0008
4,4,0.009981,0.009795,0.847687,0.010338,0.001


In [6]:
# All three panels read from the merged per-epoch metrics table.
source = ColumnDataSource(metrics)
cmap = viridis(3)

# Panel 1: training and validation loss, one colour per curve.
p1 = figure(title='Training validation loss', x_axis_label='Epoch', y_axis_label='Loss')
for colour, column in zip(cmap, ('train_loss', 'val_loss')):
    p1.circle(x='epoch', y=column, source=source, color=colour, legend_label=column)
    p1.line(x='epoch', y=column, source=source, color=colour, legend_label=column)

# Panel 2: purity on the validation set.
purity_style = dict(x='epoch', y='pur', source=source, color=cmap[0], legend_label='Purity')
p2 = figure(title='Purity on validation set', x_axis_label='Epoch', y_axis_label='Purity')
p2.circle(**purity_style)
p2.line(**purity_style)

# Panel 3: efficiency on the validation set.
efficiency_style = dict(x='epoch', y='eff', source=source, color=cmap[0], legend_label='Efficiency')
p3 = figure(title='Efficiency on validation set', x_axis_label='Epoch', y_axis_label='Efficiency')
p3.circle(**efficiency_style)
p3.line(**efficiency_style)

show(row([p1, p2, p3]))

## Evaluate model performance on sample test data

Here we evaluate the model performance on one sample event from the test set. We look at how the efficiency and purity change with the embedding radius.

In [9]:
# Embedding radii to scan.
all_radius = np.arange(0.001, 0.2, 0.005)
# Metrics collected from each evaluation; one list entry per radius.
metric_keys = ('eff', 'pur', 'loss')
results = {key: [] for key in metric_keys}

# Evaluate on the first test event, with model and data on the same device.
metric_learning_model.to(device)
test_data = metric_learning_model.testset[0].to(device)

with torch.no_grad():
    for r in all_radius:
        # NOTE(review): the positional arguments 0 and 1000 are presumably the
        # batch index and the neighbour cap — confirm against shared_evaluation.
        test_results = metric_learning_model.shared_evaluation(
            test_data, 0, r, 1000, log=False
        )
        # Iterate the metric keys only; the radius column is added afterwards,
        # so no skip check for it is needed inside the loop.
        for key in metric_keys:
            if key in test_results:
                results[key].append( test_results[key].cpu().numpy() )

# Same column order as before: eff, pur, loss, radius.
results['radius'] = all_radius
results = pd.DataFrame(results)

In [10]:
# One panel per metric, each plotted against the embedding radius.
source = ColumnDataSource(results)
cmap = viridis(3)
x = 'radius'
titles = ['Efficiency', 'Purity', 'Loss']

figures = []
for title, y in zip(titles, ['eff', 'pur', 'loss']):
    panel = figure(title=title, x_axis_label=x, y_axis_label=y)
    panel.circle(x=x, y=y, source=source, color=cmap[0], legend_label=y)
    panel.line(x=x, y=y, source=source, color=cmap[0], legend_label=y)
    figures.append(panel)

show(row(figures))

# 2. Construct graphs from metric learning inference

This step performs model inference on the entire input datasets (train, validation and test), to obtain input graphs to the graph neural network.

In [11]:
# Run the graph construction stage with the settings in CONFIG; the returned
# object is kept as `graph_builder`.
graph_builder = run_metric_learning_inference(CONFIG)

INFO:-------------------- Step 2: Constructing graphs from metric learning model  --------------------
INFO:-------------------- a) Loading trained model --------------------
INFO:-------------------- b) Running inferencing --------------------
INFO:Loaded event: /global/cfs/cdirs/m3443/data/trackml-codalab/train_all/event000021000
INFO:Loaded event: /global/cfs/cdirs/m3443/data/trackml-codalab/train_all/event000021001
INFO:Loaded event: /global/cfs/cdirs/m3443/data/trackml-codalab/train_all/event000021002


Training finished, running inference to build graphs...


100%|██████████| 80/80 [00:30<00:00,  2.65it/s]
100%|██████████| 10/10 [00:03<00:00,  2.99it/s]
100%|██████████| 10/10 [00:03<00:00,  3.20it/s]


# 3. Train graph neural networks

In [12]:
# Run the GNN training stage with the settings in CONFIG.
# NOTE(review): the saved output of this cell ends in a CUDA out-of-memory
# RuntimeError, so training did not complete in the recorded run.
train_gnn(CONFIG)

INFO:--------------------  Step 3: Running GNN training  --------------------
INFO:-------------------- a) Initialising model --------------------
INFO:-------------------- b) Running training --------------------
INFO:GPU available: True, used: True
INFO:TPU available: False, using: 0 TPU cores
INFO:IPU available: False, using: 0 IPUs
INFO:HPU available: False, using: 0 HPUs
INFO:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:
  | Name                   | Type       | Params
------------------------------------------------------
0 | node_encoder           | Sequential | 35.1 K
1 | edge_encoder           | Sequential | 66.4 K
2 | edge_network           | Sequential | 82.8 K
3 | node_network           | Sequential | 82.8 K
4 | output_edge_classifier | Sequential | 83.2 K
------------------------------------------------------
350 K     Trainable params
0         Non-trainable params
350 K     Total params
1.401     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]



Training: 0it [00:00, ?it/s]

RuntimeError: CUDA out of memory. Tried to allocate 16.91 GiB (GPU 0; 39.59 GiB total capacity; 28.55 GiB already allocated; 8.76 GiB free; 28.59 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

# 4. GNN inference

In [3]:
# Run the GNN inference stage with the settings in CONFIG; per the log output
# it processes the train, val and test splits in turn.
run_gnn_inference(CONFIG)

INFO:-------------------- Step 4: Scoring graph edges using GNN  --------------------
INFO:-------------------- a) Loading trained model --------------------
INFO:-------------------- b) Running inferencing --------------------


Training finished, running inference to filter graphs...
Building train


100%|██████████| 80/80 [00:00<00:00, 59567.61it/s]


Building val


100%|██████████| 10/10 [00:00<00:00, 10335.89it/s]


Building test


100%|██████████| 10/10 [00:00<00:00, 10325.71it/s]
