In [1]:

# source: https://ogb.stanford.edu/docs/linkprop/#ogbl-collab

from ogb.linkproppred import LinkPropPredDataset

# Load ogbl-collab through OGB's link-property-prediction dataset class.
dataset = LinkPropPredDataset(name="ogbl-collab")

# The edge split is a mapping indexed by "train" / "valid" / "test".
split_edge = dataset.get_edge_split()
train_edge, valid_edge, test_edge = (
    split_edge[part] for part in ("train", "valid", "test")
)

# The first item of the dataset is the graph used by the cells below.
graph = dataset[0]



Using backend: pytorch


In [2]:
import json
import numpy as np
import torch
import scipy.sparse as sparse



In [3]:
# Every array described below lives in this single archive.
_DATA_FILE = "ogbl-collab.npz"

# BibTeX entry stored verbatim under the "citation" key.
_CITATION = (
    "@inproceedings{wang2020microsoft,\n"
    "title={Microsoft academic graph: When experts are not enough},\n"
    "author={Wang, Kuansan and Shen, Zhihong and Huang, Chiyuan and Wu, Chieh-Han and Dong, Yuxiao and Kanakia, Anshul},\n"
    "booktitle={Quantitative Science Studies},\n"
    "pages={396--413},\n"
    "year={2020}\n"
    "}"
)


def _stored_array(key):
    """Return the entry that locates array `key` inside the dataset archive."""
    return {"file": _DATA_FILE, "key": key}


def _attribute(description, dtype, key):
    """Return a described "Tensor" attribute entry backed by array `key`."""
    return {
        "description": description,
        "type": dtype,
        "format": "Tensor",
        **_stored_array(key),
    }


# Dataset-level metadata: one section each for node, edge and graph data,
# every entry pointing at a key of the dataset archive.
metadata = {
    "description": "OGBL-COLLAB dataset.",
    "data": {
        "Node": {
            "NodeFeature": _attribute(
                "Node features of ogbl-collab dataset.", "float", "node_feats"
            ),
        },
        "Edge": {
            "_Edge": _stored_array("edge"),
            "EdgeWeight": _attribute(
                "Number of co-authored papers published in that year",
                "int",
                "edge_weight",
            ),
            "EdgeYear": _attribute(
                "Year of the collaboration represented by the Edge",
                "int",
                "edge_year",
            ),
        },
        "Graph": {
            "_NodeList": _stored_array("node_list"),
            "_EdgeList": _stored_array("edge_list"),
        },
    },
    "citation": _CITATION,
}

In [4]:
# Pull the arrays that go into the dataset archive out of the loaded graph.
# Shapes in the trailing comments are the original author's notes.
node_feats = graph['node_feat']  # (235868, 128)
edge = graph['edge_index'].T  # (2358104, 2)
edge_weight = graph['edge_weight'].reshape(-1)  # (2358104, )
# NOTE(review): edge_weight is flattened above but edge_year is stored exactly
# as the graph provides it -- confirm consumers expect that layout.
edge_year = graph['edge_year']

# Single-row arrays of ones, one column per node and one per edge.
num_nodes = graph['num_nodes']
num_edges = edge.shape[0]
node_list = np.ones(shape=(1, num_nodes))
edge_list = np.ones(shape=(1, num_edges))

# Keys here are the names referenced by the "key" fields of the metadata.
data = dict(
    node_feats=node_feats,
    edge_year=edge_year,
    edge_weight=edge_weight,
    edge=edge,
    node_list=node_list,
    edge_list=edge_list,
)

In [5]:
# Write every array in `data` into one compressed .npz archive; each dict key
# becomes the name under which its array is stored.
np.savez_compressed("ogbl-collab.npz", **data)


In [6]:
# Sanity check: reopen the archive written above and list the keys it holds.
# `allow_pickle` is left at its default (False): listing keys never needs
# pickle, and enabling it would let a tampered archive run code on access.
# The name `cora` is kept so any existing reference still resolves; the
# archive it points to is the ogbl-collab one.
cora = np.load("ogbl-collab.npz")
cora.files

['node_feats', 'edge_year', 'edge_weight', 'edge', 'node_list', 'edge_list']

In [7]:
# Persist the dataset metadata as pretty-printed JSON. The context manager
# guarantees the file is flushed and closed as soon as the write finishes,
# instead of leaving an anonymous handle open until garbage collection.
with open("metadata.json", "w") as metadata_file:
    json.dump(metadata, metadata_file, indent=4)

In [8]:
# All three edge splits are stored in this one archive.
_SPLIT_FILE = "ogbl-collab_task.npz"


def _split_entry(key):
    """Return the entry that locates split `key` inside the task archive."""
    return {"file": _SPLIT_FILE, "key": key}


# Task definition: which attributes are features, which attribute carries
# time, and where each edge split is stored.
task = {
    "description": "The task is to predict the future author collaboration relationships given the past collaborations. The goal is to rank true collaborations higher than false collaborations. Specifically, we rank each true collaboration among a set of 100,000 randomly-sampled negative collaborations, and count the ratio of positive edges that are ranked at K-place or above (Hits@K). We found K = 50 to be a good threshold in our preliminary experiments.",
    # Other type labels noted by the author: simple / multi-edge graph link
    # prediction; the time-dependent variant is the one used here.
    "type": "TimeDependentLinkPrediction",
    "feature": [
        {"object": "Node", "attribute": "NodeFeature"},
        # TODO: there can be multiple edges between two nodes -- how does the
        # dataloader process this? See
        # https://github.com/snap-stanford/ogb/blob/master/examples/linkproppred/collab/mlp.py
        {"object": "Edge", "attribute": "EdgeWeight"},
    ],
    "time": {"object": "Edge", "attribute": "EdgeYear"},
    "train_edge": _split_entry("train"),
    "valid_edge": _split_entry("val"),
    "test_edge": _split_entry("test"),
}

In [9]:
# Bundle the three edge splits under the keys that the task definition's
# "key" fields refer to.
# Idea noted by the author: store time windows instead
# (train_time_start, train_time_end, val_time_start, val_time_end, test...).
# NOTE(review): if the split entries are dicts rather than plain arrays, NumPy
# will store them as pickled object arrays -- confirm the intended layout.
task_data = dict(train=train_edge, val=valid_edge, test=test_edge)
np.savez_compressed("ogbl-collab_task.npz", **task_data)

# Write the task definition next to the split archive as pretty-printed JSON.
with open("./task.json", "w") as task_file:
    json.dump(task, task_file, indent=4)