In [34]:
# source: https://ogb.stanford.edu/docs/nodeprop/#ogbn-arxiv

from ogb.nodeproppred import NodePropPredDataset

# Load ogbn-arxiv through the OGB loader.
dataset = NodePropPredDataset(name="ogbn-arxiv")

# The split comes back as a mapping keyed by "train" / "valid" / "test".
split_idx = dataset.get_idx_split()
train_idx, valid_idx, test_idx = (
    split_idx[part] for part in ("train", "valid", "test")
)

# Item 0 of the dataset is a (graph, label) pair; later cells read
# graph['node_feat'], graph['node_year'], graph['edge_index'] and
# graph['num_nodes'] from it.
graph, label = dataset[0]

In [35]:
import json
import numpy as np
import torch
import scipy.sparse as sparse


In [36]:
# Dataset-level metadata, serialized to metadata.json by a later cell.
# Every "file"/"key" pair names an array stored in ogbn-arxiv.npz.
metadata = {
    "description": "OGBN-ARXIV dataset.",
    "data": {
        # Per-node attributes.
        "Node": {
            "NodeFeature": {
                "description": "Node features of ogbn-arxiv dataset.",
                "type": "float",
                "format": "Tensor",
                "file": "ogbn-arxiv.npz",
                "key": "node_feats"
            },
            "NodeLabel": {
                # OGB stores the 40 classes as zero-based ids, so the valid
                # range is 0..39 (the previous text claimed 1..40).
                "description": "Node labels of ogbn-arxiv dataset, int ranged from 0 to 39.",
                "type": "int",
                "format": "Tensor",
                "file": "ogbn-arxiv.npz",
                "key": "node_class"
            },
            "NodeYear": {
                "description": "Year of the article represented by the Node",
                "type": "int",
                "format": "Tensor",
                "file": "ogbn-arxiv.npz",
                "key": "node_year"
            }
        },
        # Edge endpoints.
        "Edge": {
            "_Edge": {
                "file": "ogbn-arxiv.npz",
                "key": "edge"
            }
        },
        # Graph-level node/edge lists.
        "Graph": {
            "_NodeList": {
                "file": "ogbn-arxiv.npz",
                "key": "node_list"
            },
            "_EdgeList": {
                "file": "ogbn-arxiv.npz",
                "key": "edge_list"
            }
        }
    },
    "citation": "@inproceedings{wang2020microsoft,\ntitle={Microsoft academic graph: When experts are not enough},\nauthor={Wang, Kuansan and Shen, Zhihong and Huang, Chiyuan and Wu, Chieh-Han and Dong, Yuxiao and Kanakia, Anshul},\nbooktitle={Quantitative Science Studies},\npages={396--413},\nyear={2020}\n}"
}

In [37]:
# Pull the raw arrays out of the OGB graph dict and the label array.
node_feats = graph['node_feat']  # (169343, 128)
node_year = graph['node_year']
node_class = label.reshape(-1)  # flattened to (169343,)
edge = graph['edge_index'].T  # transposed to (1166243, 2)

# Single-row, all-ones arrays with one column per node / per edge.
# NOTE(review): presumably membership masks for the one graph in this
# dataset -- confirm against the target dataset-format spec.
node_list = np.ones(shape=(1, graph['num_nodes']))
edge_list = np.ones(shape=(1, len(edge)))

# Keyword names become the array keys inside the saved .npz archive.
data = dict(
    node_feats=node_feats,
    node_class=node_class,
    node_year=node_year,
    edge=edge,
    node_list=node_list,
    edge_list=edge_list,
)

In [38]:
# Write every array in `data` into one compressed archive, keyed by dict key.
np.savez_compressed(
    "ogbn-arxiv.npz",
    **data,
)


In [39]:
# Re-open the archive that was just written and list the array names it holds.
# allow_pickle is left at its default (False): only the member names are
# inspected here, so there is no reason to enable pickle deserialization.
cora = np.load("ogbn-arxiv.npz")
cora.files

['node_feats', 'node_class', 'node_year', 'edge', 'node_list', 'edge_list']

In [40]:
# Serialize the dataset metadata. The context manager guarantees the file is
# flushed and closed as soon as the dump finishes, instead of relying on
# garbage collection of an anonymous file handle.
with open("metadata.json", "w") as metadata_file:
    json.dump(metadata, metadata_file, indent=4)

In [41]:
# Task definition, serialized to task.json by a later cell.
task = {
    "description": "The task is to predict the 40 subject areas of arXiv CS papers, e.g., cs.AI, cs.LG, and cs.OS, which are manually determined (i.e., labeled) by the paper’s authors and arXiv moderators. With the volume of scientific publications doubling every 12 years over the past century, it is practically important to automatically classify each publication’s areas and topics. Formally, the task is to predict the primary categories of the arXiv papers, which is formulated as a 40-class classification problem.",
    "type": "NodeClassification",
    # Input attributes: both are node-level.
    "feature": [
        {"object": "Node", "attribute": attr_name}
        for attr_name in ("NodeFeature", "NodeYear")
    ],
    # Prediction target.
    "target": {
        "object": "Node",
        "attribute": "NodeLabel",
        "num_classes": 40,
    },
}

# Each "<split>_node" entry points at the array named <split> in the task archive.
task.update({
    f"{split_name}_node": {"file": "ogbn-arxiv_task.npz", "key": split_name}
    for split_name in ("train", "val", "test")
})


In [42]:
# Train / validation / test node indices, stored under the keys that
# task.json refers to ("train", "val", "test").
task_data = dict(train=train_idx, val=valid_idx, test=test_idx)
np.savez_compressed("ogbn-arxiv_task.npz", **task_data)

# Write the task definition next to the split archive.
with open("./task.json", "w") as task_file:
    task_file.write(json.dumps(task, indent=4))

