In [1]:
######################## think about graph task config types

# source: https://ogb.stanford.edu/docs/graphprop/#ogbg-mol

from ogb.graphproppred import GraphPropPredDataset

dataset = GraphPropPredDataset(name = 'ogbg-molhiv')

split_idx = dataset.get_idx_split()
train_idx, valid_idx, test_idx = split_idx["train"], split_idx["valid"], split_idx["test"]


Using backend: pytorch


In [2]:
import json
import numpy as np
import torch
import scipy.sparse as sparse

In [3]:
graph, label = dataset[0] # graph: library-agnostic graph objec
print(graph.keys())
print("graph count:", len(dataset))

# init node and edge index matrix
node_feat = graph["node_feat"]
edge_feat = graph["edge_feat"]
edge_index = graph["edge_index"].T
node_size_list = [graph["num_nodes"]]
edge_size_list = [len(edge_feat)]
graph_class = [label]
for idx in range(1, len(dataset)):
    if idx % 1000 == 0:
        print(idx, "/", len(dataset))
    graph, label = dataset[idx]
    graph_class.append(label)
    node_feat = np.concatenate([node_feat, graph["node_feat"]])
    edge_feat = np.concatenate([edge_feat, graph["edge_feat"]])
    edge_index = np.concatenate([edge_index, graph["edge_index"].T + node_size_list[-1]])
    node_size_list.append(graph['num_nodes'])
    edge_size_list.append(len(edge_feat))

print(node_feat.shape)
print(edge_feat.shape)
print(edge_index.shape)

dict_keys(['edge_index', 'edge_feat', 'node_feat', 'num_nodes'])
graph count: 41127
1000 / 41127
2000 / 41127
3000 / 41127


KeyboardInterrupt: 

In [4]:
node_matrix = sparse.csr_matrix((len(dataset), np.sum(node_size_list)), dtype=np.int8)
edge_matrix = sparse.csr_matrix((len(dataset), np.sum(edge_size_list)), dtype=np.int8)
for i in range(len(node_size_list)):
    if i % 1000 == 0:
        print(i, "/", len(node_size_list))
    node_matrix[i, np.arange(node_size_list[i])] = 1
    edge_matrix[i, np.arange(edge_size_list[i])] = 1

0 / 3186


  self._set_arrayXarray(i, j, x)


KeyboardInterrupt: 

In [5]:
metadata ={
    "description": "OGBg-molhiv dataset.",
    "data": {
        "Node": {
            "NodeFeature": {
                "description": "Node features of ogbg-molhiv dataset.",
                "type": "int",
                "format": "SparseTensor",
                "file": "ogbg-molhiv.npz",
                "key": "node_feats"
            }
        },
        "Edge": {
            "_Edge": {
                "file": "ogbg-molhiv.npz",
                "key": "edge"
            },
            "EdgeFeature": {
                "description": "Node features of ogbg-molhiv dataset.",
                "type": "int",
                "format": "SparseTensor",
                "file": "ogbg-molhiv.npz",
                "key": "edge_feats"
            }
        },
        "Graph": {
            "_NodeMatrix": {
                "file": "ogbg-molhiv.npz",
                "type": "int",
                "format": "SparseTensor",
                "key": "node_matrix",
            },
            "_EdgeMatrix": {
                "file": "ogbg-molhiv.npz",
                "type": "int",
                "format": "SparseTensor",
                "key": "edge_matrix"
            },
            "GraphLabel": {
                "file": "ogbg-molhiv.npz",
                "type": "int",
                "format": "Tensor",
                "key": "graph_class"
            }
        }
    },
    "citation": "@inproceedings{Wu2018Stanford,\ntitle={Moleculenet: a benchmark for molecular machine learning},\nauthor={Zhenqin Wu, Bharath Ramsundar, Evan N Feinberg, Joseph Gomes, Caleb Geniesse, Aneesh SPappu, Karl Leswing, and Vijay Pande},\nbooktitle={Chemical Science},\npages={513=520},\nyear={2018}\n}"
 }

In [6]:
node_feats = node_feat
graph_class = np.array(graph_class)
edge = edge_index
edge_feats = edge_feat
node_matrix = node_matrix
edge_matrix = edge_matrix

data = {
    "node_feats": node_feats,
    "graph_class": graph_class,
    "edge": edge,
    "edge_feats": edge_feat,
    "node_matrix": node_matrix,
    "edge_matrix": edge_matrix
}

In [7]:
np.savez_compressed("ogbg-molhiv.npz", **data)


In [8]:
cora = np.load("ogbg-molhiv.npz", allow_pickle=True)
cora.files

['node_feats',
 'graph_class',
 'edge',
 'edge_feats',
 'node_matrix',
 'edge_matrix']

In [9]:
json.dump(metadata, open("metadata.json", "w"), indent=4)

In [10]:
task = {
    "description": "The task is to predict the target molecular properties as accurately as possible, where the molecular properties are cast as binary labels, e.g, whether a molecule inhibits HIV virus replication or not. Note that some datasets (e.g., ogbg-molpcba) can have multiple tasks, and can contain nan that indicates the corresponding label is not assigned to the molecule.",
    "type": "NodeClassification",
    "feature": [{
        "object": "Node",
        "attribute": "NodeFeature"
    },
        {
            "object": "Edge",
            "attribute": "EdgeFeature"
        }],
    "target": {
        "object": "Graph",
        "attribute": "GraphLabel",
        "num_classes": np.max(graph_class)
    },
    "train_index": {
        "file": "ogbg-molhiv_task.npz",
        "key": "train"
    },
    "val_index": {
        "file": "ogbg-molhiv_task.npz",
        "key": "val"
    },
    "test_index": {
        "file": "ogbg-molhiv_task.npz",
        "key": "test"
    }
}


In [11]:
task_data = {
    "train": train_idx,
    "val": valid_idx,
    "test": test_idx
}
np.savez_compressed("ogbg-molhiv.npz", **task_data)
with open("./task.json", "w") as fp:
    json.dump(task, fp, indent=4)