In [1]:
# Imports

import os
import re
from argparse import ArgumentParser
from glob import glob

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
import torch_geometric as pyg
from torch_geometric.nn import GCNConv

import pytorch_lightning as pl

import dgl

# Default to the CPU; switched to the first CUDA device below when one is available.
DEVICE = torch.device("cpu")

if torch.cuda.is_available():
    # Ensure that all operations are deterministic on GPU (if used) for reproducibility.
    # The flag is spelled `deterministic`; a misspelled attribute name is silently
    # accepted by Python and leaves cuDNN in its default, non-deterministic mode.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    DEVICE = torch.device("cuda:0")

# Path to the folder where the datasets are/should be downloaded (e.g. CIFAR10)
DATASET_PATH = os.environ.get("PATH_DATASETS", "data/")
# Path to the folder where the pretrained models are saved
CHECKPOINT_PATH = os.environ.get("PATH_CHECKPOINT", "saved_models/")

print('CUDA:', torch.cuda.is_available())
print("Device:", DEVICE)

# Working directories used by the rest of the notebook.
temp_path = './temp'
data_path = './data'

# Create the scratch directory if needed; exist_ok avoids the check-then-create race.
os.makedirs(temp_path, exist_ok=True)

CUDA: True
Device: cuda:0


In [2]:
# Gather every .xyz file directly under `data_path`, in lexicographic order.
xyz_filepath_list = sorted(glob(f'{data_path}/*.xyz'))

print('total xyz filepath # ', len(xyz_filepath_list))
# Show the first path as the cell output (raises IndexError if no files were found).
xyz_filepath_list[0]

total xyz filepath #  133885


'./data/dsgdb9nsd_000001.xyz'

In [3]:
def in_ipython():
    """Report whether the code is running under IPython/Jupyter.

    Returns:
        The value of the ``__IPYTHON__`` name when it is defined,
        otherwise ``False``.
    """
    try:
        ipython_flag = __IPYTHON__
    except NameError:
        # The name is not defined, so the lookup fails.
        ipython_flag = False
    return ipython_flag

In [4]:
# Load the graph dataset from the './graph_data' directory via DGL's CSVDataset.
dataset_dgl = dgl.data.CSVDataset('./graph_data')
# Add node-prediction split information to the dataset using the ratios
# [0.8, 0.1, 0.1] (presumably train/val/test in that order — confirm against the
# DGL docs). Must run before batching so the batched graph carries the masks
# that later cells read ('train_mask', 'test_mask', 'val_mask').
dgl.data.utils.add_nodepred_split(dataset_dgl, [0.8, 0.1, 0.1])
# Combine the dataset's graphs into a single batched graph for the conversions below.
dataset_batched = dgl.batch(dataset_dgl)

Done loading data from cached files.


In [None]:
# Node fields copied from the batched DGL graph onto the NetworkX nodes.
nx_node_fields = [
    'label', 'type', 'depDeg', 'forC', 'isA', 'radEl',
    'totDeg', 'totH', 'totV', 'isR',
    'train_mask', 'test_mask', 'val_mask',
]
# Edge fields copied onto the NetworkX edges.
nx_edge_fields = ['label', 'bond_type']

# Convert the batched DGL graph to NetworkX, keeping the listed attributes.
dataset_nx = dgl.to_networkx(
    dataset_batched,
    node_attrs=nx_node_fields,
    edge_attrs=nx_edge_fields,
)

dataset_nx

In [None]:
# Node attributes to group into the PyG node-feature matrix. The order matters:
# main() picks columns of `dataset.x` by position (column 1 as the target and
# the last three columns as the train/test/val masks).
pyg_node_columns = [
    'label', 'type', 'depDeg', 'forC', 'isA', 'radEl',
    'totDeg', 'totH', 'totV', 'isR',
    'train_mask', 'test_mask', 'val_mask',
]
# Edge attributes to group into the PyG edge-attribute matrix.
pyg_edge_columns = ['label', 'bond_type']

# Build the PyTorch Geometric graph from the NetworkX graph.
dataset = pyg.utils.from_networkx(
    dataset_nx,
    group_node_attrs=pyg_node_columns,
    group_edge_attrs=pyg_edge_columns,
)

dataset

In [None]:
class PYGNet(nn.Module):
    """Two-layer GCN node classifier with a 16-unit hidden layer.

    Args:
        graph_dataset: object exposing ``num_node_features`` (input width)
            and ``num_classes`` (output width).
    """

    def __init__(self, graph_dataset):
        super().__init__()
        in_width = graph_dataset.num_node_features
        out_width = graph_dataset.num_classes
        # Layers are created in the same order as they are applied.
        self.conv1 = GCNConv(in_width, 16)
        self.conv2 = GCNConv(16, out_width)

    def forward(self, graph):
        """Return per-node log-probabilities over the classes.

        Args:
            graph: object providing ``x`` (node features) and ``edge_index``.
        """
        edges = graph.edge_index

        hidden = F.relu(self.conv1(graph.x, edges))
        # Dropout is only active while the module is in training mode.
        hidden = F.dropout(hidden, training=self.training)
        scores = self.conv2(hidden, edges)

        return F.log_softmax(scores, dim=1)

In [None]:
# train.py
def main(hparams):
    """Train PYGNet on the notebook-level ``dataset`` graph and print test accuracy.

    Column 1 of ``dataset.x`` is used as the node label and the remaining
    columns as features. The last three columns are also read as the
    train/test/val masks.

    Args:
        hparams: optional namespace-like object. If it carries ``epochs``,
            ``lr`` or ``weight_decay`` attributes they override the defaults
            (2, 0.01, 5e-4); ``None`` or an empty namespace keeps the defaults.
    """
    epochs = getattr(hparams, "epochs", 2)
    lr = getattr(hparams, "lr", 0.01)
    weight_decay = getattr(hparams, "weight_decay", 5e-4)

    # Work on a copy: overwriting `x` on the shared notebook-level graph would make
    # a second call to main() read the wrong columns.
    dataset_t = dataset.clone()
    raw_x = dataset_t.x

    # NOTE(review): the mask columns stay in the feature matrix (as before) and are
    # therefore fed to the model as inputs — confirm whether that is intended.
    dataset_features = torch.column_stack([raw_x[:, 0], raw_x[:, 2:]])

    # Remap raw label values to contiguous class indices 0..K-1. The loss needs
    # targets in [0, num_classes); raw values are only safe if already contiguous,
    # in which case this mapping is the identity.
    classes, dataset_target = torch.unique(raw_x[:, 1], return_inverse=True)

    dataset_t.x = dataset_features.float()
    dataset_t.y = dataset_target.long()
    dataset_t.num_node_features = dataset_t.x.size(1)
    dataset_t.num_classes = classes.numel()
    dataset_t.train_mask = raw_x[:, -3].bool()
    dataset_t.test_mask = raw_x[:, -2].bool()
    dataset_t.val_mask = raw_x[:, -1].bool()

    dataset_gpu = dataset_t.to(DEVICE)
    model = PYGNet(dataset_gpu).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    train_mask = dataset_gpu.train_mask
    test_mask = dataset_gpu.test_mask

    model.train()
    for _ in range(epochs):
        optimizer.zero_grad()
        out = model(dataset_gpu)
        # The model already returns log-probabilities, so use the NLL loss directly.
        loss = F.nll_loss(out[train_mask], dataset_gpu.y[train_mask])
        loss.backward()
        optimizer.step()

    model.eval()
    # No gradients are needed for evaluation.
    with torch.no_grad():
        pred = model(dataset_gpu).argmax(dim=1)
    correct = int((pred[test_mask] == dataset_gpu.y[test_mask]).sum())
    n_test = int(test_mask.sum())
    # Guard against an empty test split instead of dividing by zero.
    acc = correct / n_test if n_test else float("nan")
    print(f'Accuracy: {acc:.4f}')

In [None]:
if __name__ == "__main__":
    if in_ipython():
        # Inside a notebook there is no command line to parse.
        main(None)
    else:
        # Script mode: resolve the script's directory and parse the command line.
        root_dir = os.path.dirname(os.path.realpath(__file__))
        parser = ArgumentParser(add_help=False)
        hyperparams = parser.parse_args()

        # TRAIN
        main(hyperparams)