In [1]:
# Imports

import os
import re
from argparse import ArgumentParser
from glob import glob

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
import torch_geometric as pyg
import torch_geometric.nn as pygnn
import torchmetrics.functional as metrics
from torchmetrics import Accuracy, F1Score

import pytorch_lightning as pl
import pytorch_lightning.callbacks as callbacks

import dgl

DEVICE = torch.device("cpu")

# Import GPU-related things
if torch.cuda.is_available():
    # import cupy as np
    # import cudf as pd

    # Ensure that all operations are deterministic on GPU (if used) for reproducibility
    torch.backends.cudnn.determinstic = True
    torch.backends.cudnn.benchmark = False

    DEVICE = torch.device("cuda:0")
# else:

# Path to the folder where the datasets are/should be downloaded (e.g. CIFAR10)
DATASET_PATH = os.environ.get("PATH_DATASETS", "data/")
# Path to the folder where the pretrained models are saved
CHECKPOINT_PATH = os.environ.get("PATH_CHECKPOINT", "saved_models/")

print('CUDA:', torch.cuda.is_available())
print("Device:", DEVICE)

temp_path = './temp'
data_path = './data'

if not os.path.exists(temp_path):
    os.mkdir(temp_path)

CUDA: True
Device: cuda:0


In [2]:
def in_ipython():
    try:
        return __IPYTHON__
    except NameError:
        return False

In [3]:
dataset_dgl = dgl.data.CSVDataset('./graph_data')
dgl.data.utils.add_nodepred_split(dataset_dgl, [0.8, 0.1, 0.1])
dataset_batched = dgl.batch(dataset_dgl)

Done loading data from cached files.


In [4]:
dataset_nx = dgl.to_networkx(
    dataset_batched,
    node_attrs=[
        'label',
        'type',
        'depDeg',
        'forC',
        'isA',
        'totDeg',
        'totH',
        'totV',
        'isR',
        'train_mask',
        'test_mask',
        'val_mask'
    ],
    edge_attrs=[
        'label',
        'bond_type'
    ]
)

dataset_nx

<networkx.classes.multidigraph.MultiDiGraph at 0x7fde4d1b1cf0>

In [5]:
dataset = pyg.utils.from_networkx(
    dataset_nx,
    group_node_attrs=[
        'label',
        'type',
        'depDeg',
        'forC',
        'isA',
        'totDeg',
        'totH',
        'totV',
        'isR',
        'train_mask',
        'test_mask',
        'val_mask'
    ],
    group_edge_attrs=[
        'label',
        'bond_type'
    ]
)

dataset

Data(edge_index=[2, 2489088], id=[2489088], x=[2407753, 13], edge_attr=[2489088, 2])

In [6]:
def prepare_dataset(input_dataset):
    dataset_out = input_dataset

    dataset_features = torch.column_stack(
        [input_dataset.x[:, 0], input_dataset.x[:, 2:-3]]
    )
    dataset_target = input_dataset.x[:, 1]

    dataset_out.x = dataset_features.float()
    dataset_out.y = dataset_target.long()
    dataset_out.train_mask = input_dataset.x[:, -3].bool()
    dataset_out.test_mask = input_dataset.x[:, -2].bool()
    dataset_out.val_mask = input_dataset.x[:, -1].bool()
    dataset_out.num_node_features = dataset_out.x.size(1)
    dataset_out.num_classes = torch.unique(dataset_out.y).size(0)

    print(dataset_out.num_classes)

    return dataset_out

In [7]:
dataset_t = prepare_dataset(dataset)

38


In [8]:
class MolNet(pl.LightningModule):
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        hidden_channels: int = 256,
        num_layers: int = 2,
        dropout: float = 0.0
    ):
        super().__init__()

        self.gnn = pygnn.GraphSAGE(
            in_channels,
            hidden_channels,
            num_layers,
            out_channels,
            dropout=dropout,
            norm=nn.BatchNorm1d(hidden_channels)
        )

        self.train_acc = Accuracy()
        self.train_f1 = F1Score()

        self.val_acc = Accuracy()
        self.val_f1 = F1Score()

        self.test_acc = Accuracy()
        self.test_f1 = F1Score()

    def forward(self, x, edge_index):
        return self.gnn(x, edge_index)

    def training_step(self, graph, batch_idx):
        y_hat = self(graph.x[graph.train_mask], graph.edge_index)
        y = graph.y[graph.train_mask]
        loss = F.cross_entropy(y_hat, y)

        self.train_acc(y_hat.softmax(dim=-1), y)
        self.train_f1(y_hat.softmax(dim=-1), y)

        self.log(
            'train_acc',
            self.train_acc,
            prog_bar=True,
            on_step=False,
            on_epoch=True
        )

        self.log(
            'train_f1',
            self.train_f1,
            prog_bar=True,
            on_step=False,
            on_epoch=True
        )

        return loss

    def validation_step(self, graph, batch_idx):
        y_hat = self(graph.x[graph.val_mask], graph.edge_index)
        y = graph.y[graph.val_mask]

        self.val_acc(y_hat.softmax(dim=-1), y)
        self.val_f1(y_hat.softmax(dim=-1), y)

        self.log(
            'val_acc',
            self.val_acc,
            prog_bar=True,
            on_step=False,
            on_epoch=True
        )

        self.log(
            'val_f1',
            self.val_f1,
            prog_bar=True,
            on_step=False,
            on_epoch=True
        )

    def test_step(self, graph, batch_idx):
        y_hat = self(data.x[graph.test_mask], data.edge_index)
        y = graph.y[graph.test_mask]

        self.test_acc(y_hat.softmax(dim=-1), y)
        self.test_f1(y_hat.softmax(dim=-1), y)

        self.log(
            'test_acc',
            self.test_acc,
            prog_bar=True,
            on_step=False,
            on_epoch=True
        )

        self.log(
            'test_f1',
            self.test_f1,
            prog_bar=True,
            on_step=False,
            on_epoch=True
        )

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.01)

In [9]:
def main(hparams):
    #     optimizer = torch.optim.Adamax(model.parameters(), lr=1e-6, weight_decay=5e-4)

    #     model.train()
    #     for epoch in range(750):
    #         optimizer.zero_grad()
    #         out = model(dataset_gpu.x, dataset_gpu.edge_index)
    #         loss = F.cross_entropy(out[dataset_gpu.train_mask], dataset_gpu.y[dataset_gpu.train_mask])
    #         loss.backward()
    #         optimizer.step()

    #         train_f1 = metrics.f1_score(
    #             out[dataset_gpu.test_mask],
    #             dataset_gpu.y[dataset_gpu.test_mask],
    #             average='weighted',
    #             num_classes=dataset_gpu.num_classes
    #         )
    #         print(loss)
    #         print(f'Epoch {epoch}: loss={loss}, f1={train_f1}')

    #     model.eval()
    #     pred = model(dataset_gpu.x, dataset_gpu.edge_index).argmax(dim=1)

    #     acc = metrics.accuracy(
    #         pred[dataset_gpu.test_mask],
    #         dataset_gpu.y[dataset_gpu.test_mask],
    #         average='weighted',
    #         num_classes=dataset_gpu.num_classes
    #     )

    #     f1 = metrics.f1_score(
    #         pred[dataset_gpu.test_mask],
    #         dataset_gpu.y[dataset_gpu.test_mask],
    #         average='weighted',
    #         num_classes=dataset_gpu.num_classes
    #     )

    #     print(f'Accuracy: {acc:.4f}')
    #     print(f'F1: {f1:.4f}')

    datamodule = pyg.data.LightningNodeData(
        dataset_t,
        dataset_t.train_mask,
        dataset_t.val_mask,
        dataset_t.test_mask,
        loader='neighbor',
        batch_size=1,
        num_neighbors=[25, 10],
    )

    model = MolNet(dataset_t.num_node_features, dataset_t.num_classes)

    # devices = torch.cuda.device_count()
    strategy = pl.strategies.SingleDeviceStrategy(device=DEVICE)
    checkpoint = pl.callbacks.ModelCheckpoint(monitor='val_acc', save_top_k=1)

    trainer = pl.Trainer(
        strategy=strategy,
        accelerator='gpu',
        devices=1,
        max_epochs=20,
        callbacks=[checkpoint],
        fast_dev_run=True,
        precision=16
    )

    trainer.fit(model, datamodule)
    trainer.test(ckpt_path='best', datamodule=datamodule)

In [10]:
if __name__ == "__main__":
    if not in_ipython():
        root_dir = os.path.dirname(os.path.realpath(__file__))
        parser = ArgumentParser(add_help=False)
        hyperparams = parser.parse_args()

        # TRAIN
        main(hyperparams)
    else:
        main(None)

Using 16bit native Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Running in `fast_dev_run` mode: will run the requested loop using 1 batch(es). Logging and checkpointing is suppressed.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name      | Type      | Params
----------------------------------------
0 | gnn       | GraphSAGE | 24.9 K
1 | train_acc | Accuracy  | 0     
2 | val_acc   | Accuracy  | 0     
3 | test_acc  | Accuracy  | 0     
----------------------------------------
24.9 K    Trainable params
0         Non-trainable params
24.9 K    Total params
0.050     Total estimated model params size (MB)


ValueError: num_samples should be a positive integer value, but got num_samples=0