In [2]:
!pip install torch==1.7.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
!pip uninstall -y torchtext
!pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cu101.html
!pip install torch-sparse -f https://pytorch-geometric.com/whl/torch-1.7.0+cu101.html
!pip install torch-cluster -f https://pytorch-geometric.com/whl/torch-1.7.0+cu101.html
!pip install torch-geometric==1.6.3
!pip install pytorch-lightning==1.1.8 

Looking in links: https://download.pytorch.org/whl/torch_stable.html
Looking in links: https://pytorch-geometric.com/whl/torch-1.7.0+cu101.html
Looking in links: https://pytorch-geometric.com/whl/torch-1.7.0+cu101.html
Looking in links: https://pytorch-geometric.com/whl/torch-1.7.0+cu101.html


In [1]:
import numpy as np
import h5py
import torch
import torch.nn.functional as F
import pytorch_lightning as pl

from torch.utils.data import Dataset, DataLoader
from torch.nn import BatchNorm1d, Linear, ReLU, Sequential
from torch_geometric.nn import DynamicEdgeConv, global_mean_pool
from torch_geometric.data import Batch

In [2]:
class HDF5Dataset(Dataset):
    def __init__(self, path, features=["pos_x", "pos_y", "pos_z", "time", "dir_x", "dir_y", "dir_z"], y_feature="particle_type", particle_type=13.0, batch_size=64):
        """ Loads the data from the hdf5 format provided by OrcaSong and converts it to data that can be used by PyTorch
        
        Args:
            path (str): path to the dataset
            features (list[str]): List of features to select from the event data and use as input features
            y_feature  (str): Output feature to select
            particle_type (None or float):  ID of the particle you want to classify, it will be label 0 and all else will be label 1.
                                            Must be None when y_feature is not `particle_type`
            batch_size (int): number of samples in mini batch
        Examples:
            Electron vs Background classification (default):
            ```
                HDF5Dataset("pathtodata.h5", y_feature="particle_type", particle_type=13.0)
            ```
            Energy regression with only xyzct:
            ```
                HDF5Dataset("pathtodata.h5", features=["pos_x", "pos_y", "pos_z", "time"], y_feature="energy", particle_type=None)
            ```

        Lookup of table for particle_type of Leptons:
          electron          | 11
          electron neutrino | 12
          muon              | 13
          muon neutrino     | 14
          tau               | 15
          tau neutrino      | 16
        Antiparticle is the same as particle but with minus sign
        Source: https://pdg.lbl.gov/2007/reviews/montecarlorpp.pdf
        """
        with h5py.File(path, "r") as f:
            self.groups = list(dict(f).keys())
            self.length = len(f["y"]) // batch_size + 1
            self._max_index = len(f["y"])
            print("The available y features are: ", f["y"][0].dtype.names)
        self.filename = path
        if y_feature!="particle_type":
            assert particle_type==None, "Selected a y_feature other than 'particle_type' and specified some value for particle_type as argument, which must be None for non particle_type output feature."
        self.y_feature = y_feature
        self.particle_type = particle_type
        self.batch_size = batch_size
        self._cache_x_column_names()
        self.x_mask = self.init_x_mask(features)


    def _cache_x_column_names(self):
        """Cache which columns are available in the features

        Raises:
            ValueError: It failed to read the hit_info columns
        """
        try:
            with h5py.File(self.filename, "r") as f:
                self.x_feature_dict = {
                    f["x"].attrs[f"hit_info_{i}"]: i for i in range(f["x"].shape[-1])
                }
            print("cached the following x input features", self.x_feature_dict)
        except Exception:
            raise ValueError("Can not read column names from dataset attributes")

    def init_x_mask(self, features):
        """Compute a mask that is used to select the feature columns from the data

        Args:
            features (list[str]): list of features present to load

        Returns:
            np.array: selection of column index from the features to use
        """
        x_mask = [self.x_feature_dict[feat] for feat in features]
        return np.array(x_mask)

    def __getitem__(self, index):
        """Get an sample from the h5 dataset
        x contains: (x,y,z,ct, dir_x, dir_y, dir_z)
        y contains a label neutrino (1) or muon (0)

        Args:
            index (int): index of the batch

        Returns:
            x (torch.Tensor): Tensor with the x data (for each of the vertices)
            y (torch.Tensor): Tensor with the y data (for the graph)
            batch_idx (torch.Tensor): Tensor that assigns the right batch index to each x point
        """
        with h5py.File(self.filename, "r") as f:
            if (index + 1) * self.batch_size >= self._max_index:
                  index = slice(index * self.batch_size, self._max_index)
            else:
                  index = slice(index * self.batch_size, (index + 1) * self.batch_size)
            x = f["x"][index][:]
            lengths = (np.sum(x[:, :, -1:], axis=1)).astype(int)
            batch_idx = np.hstack(
                  [
                      np.ones(length) * batch_idx
                      for batch_idx, length in enumerate(lengths)
                  ]
              )
            x = x[x[:, :, -1] == 1][:, self.x_mask]
            y = f["y"][index][:][self.y_feature]
            if self.y_feature=="particle_type" and self.particle_type:
                y = torch.LongTensor(~(abs(y) == self.particle_type))
            else:
                y = torch.Tensor(y)
        return x, y, torch.LongTensor(batch_idx)

    def __len__(self):
        return self.length


In [1]:
class DECNetwork(pl.LightningModule):
    def __init__(self, batchnorm_kwargs=None, conf=None):
        """Dynamic EdgeConvolution Network https://arxiv.org/abs/1801.07829 with
           the dynamic KNN computation as presented in https://arxiv.org/abs/1902.08570 """
        super().__init__()
        ## Lightning configuration
        self.accuracy = pl.metrics.Accuracy()
        ## Defining the Network Architecture
        nn = Sequential(
            Linear(2 * 7, 64),
            BatchNorm1d(64, **batchnorm_kwargs),
            ReLU(),
            Linear(64, 64),
            BatchNorm1d(64, **batchnorm_kwargs),
            ReLU(),
            Linear(64, 64),
            BatchNorm1d(64, **batchnorm_kwargs),
            ReLU(),
        )
        self.edge_1 = DynamicEdgeConv(nn, aggr="mean", k=32)
        nn = Sequential(
            Linear(128, 128),
            BatchNorm1d(128, **batchnorm_kwargs),
            ReLU(),
            Linear(128, 128),
            BatchNorm1d(128, **batchnorm_kwargs),
            ReLU(),
            Linear(128, 128),
            BatchNorm1d(128, **batchnorm_kwargs),
            ReLU(),
        )
        self.edge_2 = DynamicEdgeConv(nn, aggr="mean", k=32)
        nn = Sequential(
            Linear(256, 256),
            BatchNorm1d(256, **batchnorm_kwargs),
            ReLU(),
            Linear(256, 256),
            BatchNorm1d(256, **batchnorm_kwargs),
            ReLU(),
            Linear(256, 256),
            BatchNorm1d(256, **batchnorm_kwargs),
            ReLU(),
        )
        self.edge_3 = DynamicEdgeConv(nn, aggr="mean", k=32)
        self.shortcut_1 = Sequential(Linear(7, 64), BatchNorm1d(64), ReLU())
        self.shortcut_2 = Sequential(Linear(64, 128), BatchNorm1d(128), ReLU())
        self.shortcut_3 = Sequential(Linear(128, 256), BatchNorm1d(256), ReLU())
        self.lin_1 = Linear(256, 256)
        self.lin_2 = Linear(256, 128)
        self.lin_3 = Linear(128, 2)

    def forward(self, data):
        x, batch_idx = data.x, data.batch
        # in lightning, forward defines the prediction/inference actions
        # edgeconv layer 1
        sc = self.shortcut_1(x)
        x = self.edge_1(x, batch_idx)
        x = F.relu(x + sc)
        # edgeconv layer 2
        sc = self.shortcut_2(x)
        x = self.edge_2(x, batch_idx)
        x = F.relu(x + sc)
        # edgeconv layer 3
        sc = self.shortcut_3(x)
        x = self.edge_3(x, batch_idx)
        x = F.relu(x + sc)
        x = global_mean_pool(x, batch=batch_idx)
        # now apply
        x = F.relu(self.lin_1(x))
        x = F.relu(self.lin_2(x))
        x = self.lin_3(x)
        return F.log_softmax(x, dim=1)

    def training_step(self, batch, batch_idx):
        # training_step defined the train loop. It is independent of forward
        batch = Batch(
            x=batch[0].squeeze(), y=batch[1].squeeze(), batch=batch[2].squeeze(),
        )
        out = self.forward(batch)
        loss = F.nll_loss(out, batch.y)
        self.log("train_loss", loss)
        self.log('train_acc_step', self.accuracy(out, batch.y), on_epoch=False)
        return loss

    def validation_step(self, batch, batch_idx):
        batch = Batch(
            x=batch[0].squeeze(), y=batch[1].squeeze(), batch=batch[2].squeeze(),
        )
        y_hat = self.forward(batch)
        loss = F.nll_loss(y_hat, batch.y)
        self.log("val_loss", loss)
        self.log('valid_acc', self.accuracy(y_hat, batch.y), on_step=True, on_epoch=True)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimizer


NameError: name 'pl' is not defined

In [9]:
train_data = HDF5Dataset("small_set_1_train_0_shuffled.h5",features=["pos_x", "pos_y", "pos_z", "time", "dir_x", "dir_y", "dir_z","n_hits"], y_feature=["energy"], particle_type=None, batch_size=16)


The available y features are:  ('event_id', 'particle_type', 'energy', 'is_cc', 'bjorkeny', 'dir_x', 'dir_y', 'dir_z', 'time_interaction', 'run_id', 'vertex_pos_x', 'vertex_pos_y', 'vertex_pos_z', 'n_hits', 'weight_w1', 'weight_w2', 'weight_w3', 'n_gen', 'prod_identifier', 'std_dir_x', 'std_dir_y', 'std_dir_z', 'std_beta0', 'std_lik', 'std_n_hits_gandalf', 'std_pos_x', 'std_pos_y', 'std_pos_z', 'std_energy', 'std_lik_energy', 'std_length', 'group_id')
cached the following x input features {'channel_id': 0, 'dir_x': 1, 'dir_y': 2, 'dir_z': 3, 'dom_id': 4, 'du': 5, 'floor': 6, 'group_id': 7, 'pos_x': 8, 'pos_y': 9, 'pos_z': 10, 't0': 11, 'time': 12, 'tot': 13, 'triggered': 14, 'is_valid': 15}


KeyError: 'n_hits'

In [11]:
for a in train_data.dataloader:
    print(a)
    break

AttributeError: 'HDF5Dataset' object has no attribute 'dataloader'