# Example of Graph Neural Network

In [3]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# System imports
import os
import sys
import yaml

# External imports
import matplotlib.pyplot as plt
import scipy as sp
from sklearn.decomposition import PCA
from sklearn.metrics import auc
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger

sys.path.append('..')
device = "cuda" if torch.cuda.is_available() else "cpu"

## Attention Mechanism

In [5]:
from LightningModules.GNN.Models.agnn import ResAGNN
from LightningModules.GNN.Models.vanilla_agnn import VanillaResAGNN
from LightningModules.GNN.Models.checkpoint_agnn import CheckpointedResAGNN
from LightningModules.GNN.Models.interaction_multistep_gnn import CheckpointedInteractionMultistepGNN

In [4]:
with open("example_gnn.yaml") as f:
        hparams = yaml.load(f, Loader=yaml.FullLoader)

In [5]:
model = VanillaResAGNN(hparams)

In [4]:
model = CheckpointedResAGNN(hparams)

NameError: name 'hparams' is not defined

In [5]:
model = CheckpointedInteractionMultistepGNN(hparams)

### Dataset

In [6]:
%%time
model.setup(stage="fit")

CPU times: user 4.15 s, sys: 834 ms, total: 4.98 s
Wall time: 1.66 s


In [7]:
sample = model.trainset[0]

In [8]:
sample

Data(cell_data=[52573, 11], edge_index=[2, 40419], event_file="/project/projectdirs/m3443/data/ITk-upgrade/processed/full_events_v4/event000010001", hid=[52573], modulewise_true_edges=[2, 46739], nhits=[52573], pid=[52573], primary=[52573], pt=[52573], signal_true_edges=[2, 13312], x=[52573, 3], y=[40419], y_pid=[287180])

In [9]:
sample.y.sum()/sample.signal_true_edges.shape[1]

tensor(0.9715)

In [11]:
sample.y.sum()/sample.edge_index.shape[1]

tensor(0.3199)

In [13]:
edges = sample.edge_index

In [13]:
pid = sample.pid

In [14]:
edges.shape

torch.Size([2, 287180])

In [14]:
(sample.pid[edges[0]] == sample.pid[edges[1]]).sum()/sample.edge_index.shape[1]

tensor(0.5314)

### Memory Test

In [13]:
%%time
model.setup(stage="fit")

CPU times: user 14.6 s, sys: 1.01 s, total: 15.6 s
Wall time: 11.6 s


In [14]:
sample = model.trainset[0].to(device)

In [15]:
model = model.to(device)

In [16]:
torch.cuda.reset_peak_memory_stats()
output = model(sample.x.to(device), sample.edge_index.to(device))

In [17]:
print(torch.cuda.max_memory_allocated()/1024**3, "Gb")

9.714438438415527 Gb


### Train GNN

In [10]:
import ninja

In [8]:
from pytorch_lightning.plugins import DeepSpeedPlugin

In [16]:
model.setup(stage="fit")

In [18]:
import torch

In [11]:
logger = WandbLogger(project="ITk_0.5GeV_GNN", group="InitialTest")
trainer = Trainer(gpus=1, max_epochs=1, logger=logger, plugins=DeepSpeedPlugin(
        stage=3,
        offload_optimizer=True,
        offload_parameters=True,
    ), precision=16)
trainer.fit(model)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
initializing deepspeed distributed: GLOBAL_RANK: 0, MEMBER: 1/1
Enabling DeepSpeed FP16.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]




You have not specified an optimizer or scheduler within the DeepSpeed config.Using `configure_optimizers` to define optimizer and scheduler.


Initializing ZeRO Stage 3
Using /global/u2/d/danieltm/.cache/torch_extensions as PyTorch extensions root...
No modifications detected for re-loaded extension module utils, skipping build step...
Loading extension module utils...


ImportError: /global/u2/d/danieltm/.cache/torch_extensions/utils/utils.so: cannot open shared object file: No such file or directory

In [6]:
logger = WandbLogger(project="ITk_0.5GeV_GNN", group="InitialTest")
trainer = Trainer(gpus=1, max_epochs=1, logger=logger, precision=16)
trainer.fit(model)

Using native 16bit precision.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[34m[1mwandb[0m: Currently logged in as: [33mmurnanedaniel[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.3 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Set SLURM handle signals.

  | Name          | Type       | Params
---------------------------------------------
0 | edge_network  | Sequential | 5.3 K 
1 | node_network  | Sequential | 3.4 K 
2 | input_network | Sequential | 1.3 K 
---------------------------------------------
10.0 K    Trainable params
0         Non-trainable params
10.0 K    Total params
0.040     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
  eff = torch.tensor(edge_true_positive / edge_true)
  pur = torch.tensor(edge_true_positive / edge_positive)
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Vanilla

In [9]:
print(torch.cuda.max_memory_allocated()/1024**3, "Gb")

7.048240661621094 Gb


FP16

In [12]:
print(torch.cuda.max_memory_allocated()/1024**3, "Gb")

5.811363697052002 Gb


## Load Model

In [6]:
checkpoint_path = "/global/cscratch1/sd/danieltm/ExaTrkX/itk_lightning_checkpoints/ITk_0.5GeV_GNN/k53btbvu/checkpoints/last.ckpt"
checkpoint = torch.load(checkpoint_path)

model = CheckpointedResAGNN.load_from_checkpoint(checkpoint_path).to(device)

In [7]:
model.eval();

In [8]:
model.hparams["datatype_split"] = [1, 1, 10]
model.setup(stage="fit")

In [9]:
model = model.to(device)

In [10]:
output_dir = "/project/projectdirs/m3443/data/ITk-upgrade/processed/gnn_processed/0.5GeV_testing"

In [11]:
model.testset[0]

Data(cell_data=[60001, 11], edge_index=[2, 359055], event_file="/project/projectdirs/m3443/data/ITk-upgrade/processed/full_events_v4/event000010008", hid=[60001], modulewise_true_edges=[2, 53358], nhits=[60001], pid=[60001], primary=[60001], pt=[60001], signal_true_edges=[2, 53033], x=[60001, 3], y=[359055], y_pid=[359055])

In [55]:
with torch.no_grad():
    for batch in model.test_dataloader():

        print(batch)

        output = model.shared_evaluation(batch.to(device), 0, log=False)

        print(output)
        print(os.path.split(batch.event_file[0])[-1])

        gnn_results = np.vstack([batch.edge_index.cpu().numpy(), 
                                 output["score"].cpu().numpy(), 
                                 output["truth"].cpu().numpy()])
        
        gnn_recarray = np.rec.fromarrays(gnn_results, names=["senders", "receivers", "score", "truth"])

        with open(os.path.join(output_dir, os.path.split(batch.event_file[0])[-1][-4:] + ".npz"), 'wb') as f:
            np.save(f, gnn_recarray)

Batch(batch=[60001], cell_data=[60001, 11], edge_index=[2, 359055], event_file=[1], hid=[60001], modulewise_true_edges=[2, 53358], nhits=[60001], pid=[60001], primary=[60001], pt=[60001], ptr=[2], signal_true_edges=[2, 53033], x=[60001, 3], y=[359055], y_pid=[359055])
{'loss': tensor(0.0613, device='cuda:0'), 'preds': tensor([False, False, False,  ...,  True,  True,  True], device='cuda:0'), 'score': tensor([1.0530e-05, 3.5816e-03, 6.0193e-07,  ..., 9.9665e-01, 9.9994e-01,
        9.3922e-01], device='cuda:0'), 'truth': tensor([0., 0., 0.,  ..., 1., 1., 1.], device='cuda:0')}
event000010008
Batch(batch=[65045], cell_data=[65045, 11], edge_index=[2, 424981], event_file=[1], hid=[65045], modulewise_true_edges=[2, 58058], nhits=[65045], pid=[65045], primary=[65045], pt=[65045], ptr=[2], signal_true_edges=[2, 57724], x=[65045, 3], y=[424981], y_pid=[424981])
{'loss': tensor(0.0618, device='cuda:0'), 'preds': tensor([False, False, False,  ...,  True,  True,  True], device='cuda:0'), 'score'

In [32]:
dtype = np.dtype([('senders', "f8"), ('receivers', "f8"), ('truth', "f8"), ('score', "f8")])

In [44]:
recarray = np.rec.fromarrays(gnn_results.T, names=dtype.names)

In [43]:
gnn_results.shape

(414394, 4)

In [48]:
recarray["truth"].shape

(414394,)