# Message-passing graph neural network for hit classification

In [1]:
# Training concurrency: set the OpenMP thread count and the visible CUDA device
import os
os.environ.update({
    'OMP_NUM_THREADS': '4',
    'CUDA_VISIBLE_DEVICES': '1',
})

# When True, np_to_torch moves the converted data to the GPU
cuda = False

In [2]:
# System imports
from __future__ import print_function
import os
import multiprocessing as mp
from timeit import default_timer as timer

# Externals
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.metrics
from sklearn.model_selection import train_test_split

# Torch imports
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F

# Local imports
from estimator import Estimator
from acts import process_hits_files, select_barrel_hits

%matplotlib notebook

In [3]:
def _to_float_variable(x, volatile=False):
    """Wrap a NumPy array, cast to float32, in a torch Variable."""
    tensor = torch.from_numpy(x.astype(np.float32))
    return Variable(tensor, volatile=volatile)


# The CPU/GPU variant is chosen once, when this cell runs, from the `cuda` flag
if cuda:
    def np_to_torch(x, volatile=False):
        """Convert a NumPy array to a float32 Variable placed on the GPU."""
        return _to_float_variable(x, volatile=volatile).cuda()
else:
    def np_to_torch(x, volatile=False):
        """Convert a NumPy array to a float32 Variable (kept on the CPU)."""
        return _to_float_variable(x, volatile=volatile)


def torch_to_np(x):
    """Copy `x` to the CPU and return its underlying data as a NumPy array."""
    return x.cpu().data.numpy()

## Prepare the dataset

I'm deciding what I want to do as I go along.

I want to demonstrate some kind of binary hit classification.
I could use the 10-track artificial samples from the segment classification notebook,
or I could try to build more realistic sub-graphs from real events. I think the more interesting case is the latter one.

So the procedure will roughly be:
- select all barrel hits and remove duplicates
- select good target track samples (tracks with a hit on each of the 10 barrel layers)
- for each track, select a fixed number of hits in the neighborhood of the true hits

I think I've implemented most of this already in other notebooks.
- selecting closest hits was done in tree_search/RNNHitClassifier.ipynb

In [8]:
def select_hits(hits):
    """Select barrel hits, renumber volumes and layers, and drop duplicates.

    Parameters
    ----------
    hits : pandas.DataFrame
        Hit table providing at least the columns ``evtid``, ``barcode``,
        ``volid``, ``layid``, ``r``, ``phi`` and ``z``.

    Returns
    -------
    pandas.DataFrame
        Columns ``evtid``, ``barcode``, ``r``, ``phi``, ``z``, ``volume`` and
        ``layer``, with a single row per (evtid, barcode, layer): the hit
        with the smallest ``r``. Rows keep their original index labels.
    """
    # Barrel volume ids, listed in the order they are re-enumerated (0, 1, 2)
    vids = [8, 13, 17]
    hits = hits[hits.volid.isin(vids)]
    # Re-enumerate the volume and layer numbers for convenience.
    # A dict lookup (rather than groupby.get_group) also works when one of
    # the barrel volumes has no hits in this input.
    volume_index = {v: i for i, v in enumerate(vids)}
    volume = hits.volid.map(volume_index).astype(np.int8)
    # This assumes 4 layers per volume (except last volume)
    layer = (hits.layid / 2 - 1 + volume * 4).astype(np.int8)
    # Select the columns we need
    hits = (hits[['evtid', 'barcode', 'r', 'phi', 'z']]
            .assign(volume=volume, layer=layer))
    # Remove duplicate hits: keep the smallest-r hit per track and layer.
    # idxmin yields one index label per group; passing the plain values to
    # .loc avoids depending on how as_index=False shapes the result.
    keep = hits.groupby(['evtid', 'barcode', 'layer']).r.idxmin()
    return hits.loc[keep.values]

def select_signal_hits(hits, n_layers=10):
    """Keep only the hits of tracks that hit every layer.

    Parameters
    ----------
    hits : pandas.DataFrame
        Hit table with ``evtid``, ``barcode`` and ``layer`` columns.
    n_layers : int, optional
        Number of distinct layers a track must have hits on to be kept.

    Returns
    -------
    pandas.DataFrame
        The rows of `hits` belonging to (evtid, barcode) groups whose hits
        cover exactly `n_layers` distinct layers.
    """
    # Filter tracks that hit every layer
    hits = (hits.groupby(['evtid', 'barcode'])
            .filter(lambda x: len(x.layer.unique()) == n_layers))
    # Hand the selection back to the caller (previously it was discarded)
    return hits

In [9]:
# Directory that is listed for input files whose names start with 'clusters'
input_dir = '/bigdata/shared/ACTS/prod_mu10_pt1000_2017_07_29'

In [None]:
# Everything in the input directory; only the 'clusters*' entries are used
all_files = os.listdir(input_dir)
cluster_names = sorted(
    filter(lambda name: name.startswith('clusters'), all_files))

# Number of files to process, taken from the start of the sorted list
n_files = 1
hits_files = [os.path.join(input_dir, name)
              for name in cluster_names[:n_files]]

In [None]:
%%time

# Size of the worker pool shared by file processing and hit selection
n_workers = 1
with mp.Pool(processes=n_workers) as pool:
    # Process the input files using the pool
    hits = process_hits_files(hits_files, pool)
    print('Applying selections')
    # Run select_hits on each processed item, then merge the per-item
    # results into one table with a fresh index
    hits = pd.concat(pool.map(select_hits, hits), ignore_index=True)

Loading /bigdata/shared/ACTS/prod_mu10_pt1000_2017_07_29/clusters_1.csv


In [None]:
# Display the concatenated table of selected hits
hits