In [1]:
# source: https://ogb.stanford.edu/docs/nodeprop/#ogbn-mag

from ogb.nodeproppred import NodePropPredDataset

# Instantiate the OGB node-property-prediction dataset for ogbn-mag.
# (The recorded output shows the archive being downloaded and processed.)
dataset = NodePropPredDataset(name="ogbn-mag")

# Index dictionaries for the three standard splits.
split_idx = dataset.get_idx_split()
train_idx, valid_idx, test_idx = (
    split_idx[part] for part in ("train", "valid", "test"))

# Entry 0 of the dataset is the (graph, label) pair used below.
graph, label = dataset[0]

  from .autonotebook import tqdm as notebook_tqdm


Downloading http://snap.stanford.edu/ogb/data/nodeproppred/mag.zip


Downloaded 0.40 GB: 100%|██████████| 413/413 [00:57<00:00,  7.13it/s]


Extracting dataset/mag.zip
Loading necessary files...
This might take a while.
Processing graphs...


100%|██████████| 1/1 [00:00<00:00, 8176.03it/s]

Saving...





In [2]:
import numpy as np


In [3]:
# Re-fetch the (graph, label) pair -- the same call already made in the
# loading cell -- and list the graph's top-level fields.
graph, label = dataset[0]

print(graph.keys())


dict_keys(['edge_index_dict', 'edge_feat_dict', 'node_feat_dict', 'num_nodes_dict', 'node_year', 'edge_reltype'])


In [4]:
# Flatten the per-paper labels into a 1-D array.
label_array = np.reshape(label['paper'], -1)


In [5]:
# Flatten the per-paper 'node_year' values into a 1-D array.
node_year_array = np.reshape(graph['node_year']['paper'], -1)


In [6]:
# Give every node a globally unique id by laying the four node types out back
# to back: papers first, then authors, institutions and fields of study.
PaperNode_id = np.arange(0, graph['num_nodes_dict']['paper'])
paper_feats = graph['node_feat_dict']['paper']
paper_class = label_array
paper_year = node_year_array

# Each range starts right after the previous one and must hold exactly
# num_nodes_dict[<type>] ids.
# Fix: the author range used to stop at PaperNode_id.shape[-1] + 1 + n_authors.
# shape[-1] already equals PaperNode_id[-1] + 1, so that produced one id too
# many and shifted every institution / field-of-study id up by one.
AuthorNode_id = np.arange(
    PaperNode_id[-1] + 1,
    PaperNode_id[-1] + 1 + graph['num_nodes_dict']['author'])
InstitutionNode_id = np.arange(
    AuthorNode_id[-1] + 1,
    AuthorNode_id[-1] + 1 + graph['num_nodes_dict']['institution'])
FieldOfStudy_id = np.arange(
    InstitutionNode_id[-1] + 1,
    InstitutionNode_id[-1] + 1 + graph['num_nodes_dict']['field_of_study'])

# Sanity check: the id space is contiguous and exactly as large as the four
# node types combined.
assert FieldOfStudy_id[-1] + 1 == (
    graph['num_nodes_dict']['paper'] + graph['num_nodes_dict']['author'] +
    graph['num_nodes_dict']['institution'] +
    graph['num_nodes_dict']['field_of_study']), "node id ranges are misaligned"


In [7]:
def _consecutive_edge_ids(first_id, relation):
    """Return ids first_id, first_id + 1, ... with one id per edge of `relation`."""
    n_edges = graph['edge_index_dict'][relation].shape[1]
    return np.arange(first_id, first_id + n_edges)


def _global_edge_pairs(relation, src_offset, dst_offset):
    """Return an (n_edges, 2) array of `relation` endpoints shifted by the offsets."""
    endpoints = graph['edge_index_dict'][relation]
    return np.stack([
        endpoints[0, :] + src_offset,
        endpoints[1, :] + dst_offset,
    ]).T


# Edge ids: one contiguous range per relation, chained in the order
# affiliated_with -> writes -> cites -> has_topic.
author_institution_id = _consecutive_edge_ids(
    0, ('author', 'affiliated_with', 'institution'))
author_paper_id = _consecutive_edge_ids(
    author_institution_id[-1] + 1, ('author', 'writes', 'paper'))
paper_paper_id = _consecutive_edge_ids(
    author_paper_id[-1] + 1, ('paper', 'cites', 'paper'))
paper_FieldOfStudy_id = _consecutive_edge_ids(
    paper_paper_id[-1] + 1, ('paper', 'has_topic', 'field_of_study'))

# Edge endpoints: per-type node indices are shifted by the first global id of
# the corresponding node type.
author_institution_edge = _global_edge_pairs(
    ('author', 'affiliated_with', 'institution'),
    AuthorNode_id[0], InstitutionNode_id[0])

author_paper_edge = _global_edge_pairs(
    ('author', 'writes', 'paper'),
    AuthorNode_id[0], PaperNode_id[0])

paper_paper_edge = _global_edge_pairs(
    ('paper', 'cites', 'paper'),
    PaperNode_id[0], PaperNode_id[0])

paper_FieldOfStudy_edge = _global_edge_pairs(
    ('paper', 'has_topic', 'field_of_study'),
    PaperNode_id[0], FieldOfStudy_id[0])

# All-ones row vectors with one entry per global node id / global edge id.
node_list = np.ones((1, FieldOfStudy_id[-1] + 1))
edge_list = np.ones((1, paper_FieldOfStudy_id[-1] + 1))


In [8]:
# Bundle every array under the key it will be saved with.
# Note that FieldOfStudy_id is stored under the key "FieldOfStudyNode_id".
data = dict(
    # node ids and paper attributes
    PaperNode_id=PaperNode_id,
    paper_feats=paper_feats,
    paper_class=paper_class,
    paper_year=paper_year,
    AuthorNode_id=AuthorNode_id,
    InstitutionNode_id=InstitutionNode_id,
    FieldOfStudyNode_id=FieldOfStudy_id,
    # edge ids
    author_institution_id=author_institution_id,
    author_paper_id=author_paper_id,
    paper_paper_id=paper_paper_id,
    paper_FieldOfStudy_id=paper_FieldOfStudy_id,
    # edge endpoints
    author_institution_edge=author_institution_edge,
    author_paper_edge=author_paper_edge,
    paper_paper_edge=paper_paper_edge,
    paper_FieldOfStudy_edge=paper_FieldOfStudy_edge,
    # whole-graph node / edge lists
    node_list=node_list,
    edge_list=edge_list,
)

In [9]:
# Persist the graph arrays with GLI's helper; each entry of `data` is passed
# as a keyword argument. Per the recorded output this writes ogbn-mag.npz.
from gli.utils import save_data
save_data("ogbn-mag", **data)

Save all dense arrays to ogbn-mag.npz, including ['PaperNode_id', 'paper_feats', 'paper_class', 'paper_year', 'AuthorNode_id', 'InstitutionNode_id', 'FieldOfStudyNode_id', 'author_institution_id', 'author_paper_id', 'paper_paper_id', 'paper_FieldOfStudy_id', 'author_institution_edge', 'author_paper_edge', 'paper_paper_edge', 'paper_FieldOfStudy_edge', 'node_list', 'edge_list']


In [10]:
# Global ("absolute") node ids for the three splits.
# Paper nodes take the first block of the global id space (starting at 0), so
# the dataset's own paper indices are used as-is, without any offset.
abs_train_idx, abs_val_idx, abs_test_idx = (
    part["paper"] for part in (train_idx, valid_idx, test_idx))

In [11]:
# Store the split indices under the keys "train", "val" and "test".
task_data = dict(
    train=abs_train_idx,
    val=abs_val_idx,
    test=abs_test_idx,
)
save_data("ogbn-mag_task", **task_data)

Save all dense arrays to ogbn-mag_task.npz, including ['train', 'val', 'test']
