In [1]:
######################## think about graph task config types

# source: https://ogb.stanford.edu/docs/graphprop/#ogbg-mol

from ogb.graphproppred import GraphPropPredDataset

# Load the OGB molecule-property dataset (graph-level prediction).
dataset = GraphPropPredDataset(name="ogbg-molhiv")

# The dataset ships with its own split; pull out the graph indices of each part.
split_idx = dataset.get_idx_split()
train_idx = split_idx["train"]
valid_idx = split_idx["valid"]
test_idx = split_idx["test"]


Using backend: pytorch[22:46:52] /opt/dgl/src/runtime/tensordispatch.cc:43: TensorDispatcher: dlopen failed: /home/jimmyzxj/miniconda3/envs/py39/lib/python3.9/site-packages/dgl/tensoradapter/pytorch/libtensoradapter_pytorch_1.10.2.so: cannot open shared object file: No such file or directory



In [2]:
import json
import numpy as np
import torch
import scipy.sparse as sparse

In [3]:
# Peek at the first sample: the keys of its graph dict, then its label.
print(dataset[0][0].keys(), dataset[0][1], sep="\n")

dict_keys(['edge_index', 'edge_feat', 'node_feat', 'num_nodes'])
[0]


In [4]:
from tqdm import tqdm
# Merge all molecules into one big disjoint graph by shifting node ids.
node_list, labels, edges = [], [], []
edge_feats, node_feats = [], []
num_nodes = 0  # running node-id offset; equals the total node count after the loop
for graph, graph_label in tqdm(dataset):
    n_graph_nodes = graph["num_nodes"]
    # Global ids of the nodes that belong to this graph.
    node_list.append(num_nodes + np.arange(n_graph_nodes))
    # Transpose the edge index to one edge per row and shift it into the global id space.
    edges.append(np.stack(graph["edge_index"]).T + num_nodes)
    labels.append(graph_label)
    edge_feats.append(graph["edge_feat"])
    node_feats.append(graph["node_feat"])
    num_nodes += n_graph_nodes

100%|██████████| 41127/41127 [00:00<00:00, 67841.44it/s]


In [5]:
# Collapse the per-graph pieces into flat arrays describing the merged graph.
_edges = np.concatenate(edges)  # axis 0 is the default
_node_feats = np.concatenate(node_feats)
_edge_feats = np.concatenate(edge_feats)
# Stack the per-graph labels, then drop the singleton dimensions.
_labels = np.squeeze(np.stack(labels))

In [6]:
# Graph-membership matrix: row = graph, column = global node id,
# entry 1 where the node belongs to that graph.
_node_list = sparse.lil_matrix((len(dataset), num_nodes))
for graph_id, member_ids in enumerate(node_list):
    _node_list[graph_id, member_ids] = 1

In [7]:
# Show the shape of every array that will be written to disk.
print(*(arr.shape for arr in (_edges, _labels, _edge_feats, _node_feats, _node_list)))

(2259376, 2) (41127,) (2259376, 3) (1049163, 9) (41127, 1049163)


In [8]:
# Dataset description written to metadata.json. Every "file"/"key" pair points
# at an array stored in ogbg-molhiv.npz.
metadata = {
    "description": "OGBg-molhiv dataset.",
    "data": {
        "Node": {
            "NodeFeature": {
                "description": "Node features of ogbg-molhiv dataset.",
                "type": "int",
                # NOTE(review): the array saved under "node_feats" is a dense
                # ndarray, while this declares "SparseTensor" - confirm against
                # the loader whether this should be "Tensor".
                "format": "SparseTensor",
                "file": "ogbg-molhiv.npz",
                "key": "node_feats"
            }
        },
        "Edge": {
            "_Edge": {
                "file": "ogbg-molhiv.npz",
                "key": "edge"
            },
            "EdgeFeature": {
                # Fixed copy-paste slip: this entry describes edge features.
                "description": "Edge features of ogbg-molhiv dataset.",
                "type": "int",
                # NOTE(review): same dense-vs-"SparseTensor" question as NodeFeature.
                "format": "SparseTensor",
                "file": "ogbg-molhiv.npz",
                "key": "edge_feats"
            }
        },
        "Graph": {
            # Sparse graph-by-node membership matrix.
            "_NodeList": {
                "file": "ogbg-molhiv.npz",
                "type": "int",
                "format": "SparseTensor",
                "key": "node_list",
            },
            # "_EdgeList" is intentionally omitted: it is not required for this dataset.
            "GraphLabel": {
                "file": "ogbg-molhiv.npz",
                "type": "int",
                "format": "Tensor",
                "key": "graph_class"
            }
        }
    },
    # Citation typos fixed: author "Aneesh S Pappu" and page range 513--530.
    "citation": "@inproceedings{Wu2018Stanford,\ntitle={Moleculenet: a benchmark for molecular machine learning},\nauthor={Zhenqin Wu, Bharath Ramsundar, Evan N Feinberg, Joseph Gomes, Caleb Geniesse, Aneesh S Pappu, Karl Leswing, and Vijay Pande},\nbooktitle={Chemical Science},\npages={513--530},\nyear={2018}\n}"
}

In [9]:
# Arrays to store in ogbg-molhiv.npz; the keyword names are the "key" values
# referenced by `metadata`.
data = dict(
    node_feats=_node_feats,
    graph_class=_labels,
    edge=_edges,
    edge_feats=_edge_feats,
    node_list=_node_list,
)

In [10]:
# Write all arrays into one compressed archive. `node_list` is a SciPy sparse
# matrix rather than an ndarray, so NumPy stores it as a pickled object array;
# reading it back therefore needs allow_pickle=True.
np.savez_compressed(file="ogbg-molhiv.npz", **data)


In [11]:
# Re-open the archive just written and list its keys as a sanity check.
# allow_pickle=True is needed because one entry is stored as a pickled object.
# NOTE(review): the name `cora` looks like a leftover from another dataset's notebook.
cora = np.load(file="ogbg-molhiv.npz", allow_pickle=True)
cora.files

['node_feats', 'graph_class', 'edge', 'edge_feats', 'node_list']

In [12]:
# Write the dataset metadata. The context manager closes (and flushes) the
# file handle, which a bare open() passed to json.dump would leave dangling.
with open("metadata.json", "w") as metadata_file:
    json.dump(metadata, metadata_file, indent=4)

In [13]:
task = {
    "description": "The task is to predict the target molecular properties as accurately as possible, where the molecular properties are cast as binary labels, e.g, whether a molecule inhibits HIV virus replication or not. Note that some datasets (e.g., ogbg-molpcba) can have multiple tasks, and can contain nan that indicates the corresponding label is not assigned to the molecule.",
    "type": "GraphClassification",
    "feature": [
        "Node/NodeFeature",
        "Edge/EdgeFeature"
    ],
    "target": "Graph/GraphLabel",
    "num_classes": 2,
    "train_set": {
        "file": "ogbg-molhiv_task.npz",
        "key": "train"
    },
    "val_set": {
        "file": "ogbg-molhiv_task.npz",
        "key": "val"
    },
    "test_set": {
        "file": "ogbg-molhiv_task.npz",
        "key": "test"
    }
}

In [14]:
# Store the split indices under the keys that `task` refers to
# ("train" / "val" / "test").
task_data = dict(train=train_idx, val=valid_idx, test=test_idx)
np.savez_compressed("ogbg-molhiv_task.npz", **task_data)

# Serialize the task description next to the split archive.
with open("./task.json", "w") as task_file:
    task_file.write(json.dumps(task, indent=4))

In [15]:
# Sanity check: list the distinct graph labels (the task declares num_classes = 2).
np.unique(_labels)

array([0, 1])