In [93]:

# source: https://ogb.stanford.edu/docs/linkprop/#ogbl-collab

from ogb.linkproppred import LinkPropPredDataset

# Fetch the ogbl-collab link-prediction dataset through the OGB loader.
dataset = LinkPropPredDataset(name='ogbl-collab')

# The dataset holds a single graph, stored as a dictionary of arrays.
graph = dataset[0]

# Official edge split, unpacked into one dictionary per partition.
split_edge = dataset.get_edge_split()
train_edge = split_edge["train"]
valid_edge = split_edge["valid"]
test_edge = split_edge["test"]



In [94]:
import json
import numpy as np
import torch
import scipy.sparse as sparse


In [95]:
# Description of where every OGBL-COLLAB array is stored. All arrays live in a
# single archive, so the {"file", "key"} locators are generated by helpers
# instead of being spelled out for every entry.
_NPZ_FILE = "ogbl-collab.npz"


def _locator(key):
    """Return the ``{"file", "key"}`` pair pointing at one array of the archive."""
    return {"file": _NPZ_FILE, "key": key}


def _tensor_attribute(description, dtype, key):
    """Return a Tensor-formatted attribute entry followed by its archive locator."""
    entry = {"description": description, "type": dtype, "format": "Tensor"}
    entry.update(_locator(key))
    return entry


metadata = {
    "description": "OGBL-COLLAB dataset.",
    "data": {
        "Node": {
            "NodeFeature": _tensor_attribute(
                "Node features of ogbl-collab dataset.", "float", "node_feats"
            ),
        },
        "Edge": {
            "_Edge": _locator("edge"),
            "EdgeWeight": _tensor_attribute(
                "Number of co-authored papers published in that year", "int", "edge_weight"
            ),
            "EdgeYear": _tensor_attribute(
                "Year of the collaboration represented by the Edge", "int", "edge_year"
            ),
        },
        "NegativeEdge": {
            "_Edge": _locator("neg_edge"),
        },
        "Graph": {
            "_NodeList": _locator("node_list"),
            "_EdgeList": _locator("edge_list"),
        },
    },
    "citation": "@inproceedings{wang2020microsoft,\ntitle={Microsoft academic graph: When experts are not enough},\nauthor={Wang, Kuansan and Shen, Zhihong and Huang, Chiyuan and Wu, Chieh-Han and Dong, Yuxiao and Kanakia, Anshul},\nbooktitle={Quantitative Science Studies},\npages={396--413},\nyear={2020}\n}",
}

In [96]:
# Unexecuted reference snippet kept inside a string literal. For every training
# edge it scans the full graph edge array for rows with the same endpoint pair
# and keeps the row indices whose edge year equals that training edge's year.
# Only the training list is filled; the valid/test lists are merely initialised.
# Because the string is the cell's last expression, the notebook displays its
# repr as the cell output.
'''how to find index
train_edge_idx = []
valid_edge_idx = []
test_edge_idx = []
edge = graph['edge_index'].T
edge_year = graph['edge_year']

temp_edge_list = train_edge['edge']

for i in range(len(temp_edge_list)):
    if i % 1000 == 0:
        print(i, '/', len(temp_edge_list))
    tool = edge == temp_edge_list[i]
    idx_list = np.where(tool[:,0] * tool[:,1] == True)[0]
    for idx in idx_list:
        if train_edge['year'][i] == edge_year[idx]:
            train_edge_idx.append(idx)
'''

"how to find index\ntrain_edge_idx = []\nvalid_edge_idx = []\ntest_edge_idx = []\nedge = graph['edge_index'].T\nedge_year = graph['edge_year']\n\ntemp_edge_list = train_edge['edge']\n\nfor i in range(len(temp_edge_list)):\n    if i % 1000 == 0:\n        print(i, '/', len(temp_edge_list))\n    tool = edge == temp_edge_list[i]\n    idx_list = np.where(tool[:,0] * tool[:,1] == True)[0]\n    for idx in idx_list:\n        if train_edge['year'][i] == edge_year[idx]:\n            train_edge_idx.append(idx)\n"

In [97]:
# --- Per-node and per-edge arrays taken from the OGB graph dictionary ---
node_feats = graph['node_feat']  # (235868, 128)
edge = graph['edge_index'].T  # (2358104, 2): one row per edge
edge_weight = graph['edge_weight'].reshape(-1)  # flattened to (2358104, )
edge_year = graph['edge_year']

# --- Negative edges: validation rows first, test rows appended after them ---
neg_edge = np.concatenate((valid_edge['edge_neg'], test_edge['edge_neg']))

# --- Single-row all-ones arrays, one column per node / per edge ---
num_nodes = graph['num_nodes']
num_edges = edge.shape[0]
node_list = np.ones((1, num_nodes))
edge_list = np.ones((1, num_edges))

# Everything written to the archive, keyed by the names the metadata refers to.
data = dict(
    node_feats=node_feats,
    edge_year=edge_year,
    edge_weight=edge_weight,
    edge=edge,
    neg_edge=neg_edge,
    node_list=node_list,
    edge_list=edge_list,
)

In [98]:
# Persist every entry of `data` as a named array in one compressed archive.
npz_path = "ogbl-collab.npz"
np.savez_compressed(npz_path, **data)


In [99]:
# Sanity check: reopen the ogbl-collab archive and list the names of the arrays it contains.
# NOTE(review): the variable name `cora` does not match this dataset (ogbl-collab) —
# presumably carried over from another notebook; confirm nothing relies on it before renaming.
# NOTE(review): allow_pickle=True permits unpickling object arrays; it looks unnecessary for
# the arrays saved by this notebook — confirm and drop it if so.
cora = np.load("ogbl-collab.npz", allow_pickle=True)
cora.files

['node_feats',
 'edge_year',
 'edge_weight',
 'edge',
 'neg_edge',
 'node_list',
 'edge_list']

In [100]:
# Write the dataset metadata to metadata.json. The context manager closes the
# file handle as soon as the dump finishes, so the JSON is fully flushed to disk
# instead of depending on garbage collection of an anonymous open() handle.
with open("metadata.json", "w") as fp:
    json.dump(metadata, fp, indent=4)


In [101]:
# Task descriptor for time-dependent link prediction on ogbl-collab. Every
# split-related entry points at an array inside the task archive, so those
# {"file", "key"} references are generated by one helper.
_TASK_FILE = "ogbl-collab_task.npz"


def _task_ref(key):
    """Return the ``{"file", "key"}`` reference to one array of the task archive."""
    return {"file": _TASK_FILE, "key": key}


task = {
    "description": "The task is to predict the future author collaboration relationships given the past collaborations. The goal is to rank true collaborations higher than false collaborations. Specifically, we rank each true collaboration among a set of 100,000 randomly-sampled negative collaborations, and count the ratio of positive edges that are ranked at K-place or above (Hits@K). We found K = 50 to be a good threshold in our preliminary experiments.",
    "type": "TimeDependentLinkPrediction",
    # Attributes used as model inputs.
    "feature": [
        {"object": "Node", "attribute": "NodeFeature"},
        {"object": "Edge", "attribute": "EdgeWeight"},
    ],
    # Attribute that carries the time of each edge.
    "time": {"object": "Edge", "attribute": "EdgeYear"},
    "valid_neg": _task_ref("val"),
    "test_neg": _task_ref("test"),
    "train_time_window": _task_ref("train_time_window"),
    "valid_time_window": _task_ref("valid_time_window"),
    "test_time_window": _task_ref("test_time_window"),
}

In [102]:
# Row indices of the negative edges used by each evaluation split.
# The "neg_edge" array saved in ogbl-collab.npz is built as
# concatenate([valid negatives, test negatives]), so validation rows come first
# and test rows follow them.
# NOTE(review): this assumes the indices address that concatenated array —
# confirm against the loader that consumes task.json.
num_valid_neg = valid_edge['edge_neg'].shape[0]
num_test_neg = test_edge['edge_neg'].shape[0]

task_data = {
    "val": np.arange(0, num_valid_neg),
    # Fix: the test range previously started at 0 and therefore pointed at the
    # validation negatives again; it must start after the validation rows.
    "test": np.arange(num_valid_neg, num_valid_neg + num_test_neg),
    # (start year, end year) per split — presumably inclusive bounds; confirm
    # against the loader.
    "train_time_window": (1963, 2017),
    "valid_time_window": (2018, 2018),
    "test_time_window": (2019, 2019)
}
np.savez_compressed("ogbl-collab_task.npz", **task_data)

# Write the task descriptor next to the task archive it references.
with open("./task.json", "w") as fp:
    json.dump(task, fp, indent=4)


In [90]:
# 1. pre-store neg - index
# 2. no pre-store - time

(100000, 2)