In [None]:
# source: https://ogb.stanford.edu/docs/nodeprop/#ogbn-mag

from ogb.nodeproppred import NodePropPredDataset

# Fetch (downloading on first use) the library-agnostic ogbn-mag dataset object.
dataset = NodePropPredDataset(name="ogbn-mag")

# Official train / validation / test split shipped with the dataset.
split_idx = dataset.get_idx_split()
train_idx = split_idx["train"]
valid_idx = split_idx["valid"]
test_idx = split_idx["test"]

# The dataset holds a single graph; indexing yields (graph, label).
graph, label = dataset[0]

In [None]:
import json
import numpy as np
import torch
import scipy.sparse as sparse

In [None]:
# Pull the single graph and its labels out of the dataset again so this
# cell can be run independently of the loading cell's unpacking.
graph, label = dataset[0]

# Show which top-level fields the graph dictionary exposes ...
top_level_fields = graph.keys()
print(top_level_fields)

# ... and what (if anything) is stored as edge features.
edge_features = graph['edge_feat_dict']
print(edge_features)

In [None]:
# Display the shape of the feature array stored for the 'paper' node type.
graph['node_feat_dict']['paper'].shape

In [None]:
# Display the edge index arrays, keyed by (source type, relation, target type)
# triples as used by the lookups in the edge-construction cell.
graph['edge_index_dict']

In [None]:
# Give every node a globally unique integer ID by laying the four node types
# out back to back: papers, authors, institutions, fields of study.
num_nodes = graph['num_nodes_dict']


def _paper_entry(values):
    """Return values['paper'] when values is a per-node-type dict, else values unchanged.

    NOTE(review): OGB's heterogeneous loader is expected to return labels and
    node_year keyed by node type (e.g. {'paper': array}); unwrapping keeps the
    saved archive free of pickled dict objects. Confirm against the installed
    ogb version.
    """
    return values['paper'] if isinstance(values, dict) else values


PaperNode_id = np.arange(0, num_nodes['paper'])
paper_feats = graph['node_feat_dict']['paper']
paper_class = _paper_entry(label)
paper_year = _paper_entry(graph['node_year'])

# Each range starts right after the previous one ends and contains exactly
# as many IDs as there are nodes of that type. (The previous version mixed
# `[-1]+1` with `.shape[-1]+1` for the author range, which produced one
# author ID too many and shifted every later range by one.)
author_start = PaperNode_id[-1] + 1
AuthorNode_id = np.arange(author_start, author_start + num_nodes['author'])

institution_start = AuthorNode_id[-1] + 1
InstitutionNode_id = np.arange(institution_start, institution_start + num_nodes['institution'])

field_start = InstitutionNode_id[-1] + 1
FieldOfStudy_id = np.arange(field_start, field_start + num_nodes['field_of_study'])


In [None]:
# Per-relation edge index arrays; row 0 holds source nodes and row 1 holds
# target nodes, each numbered locally within its own node type.
edge_index_dict = graph['edge_index_dict']
author_institution_index = edge_index_dict[('author', 'affiliated_with', 'institution')]
author_paper_index = edge_index_dict[('author', 'writes', 'paper')]
paper_paper_index = edge_index_dict[('paper', 'cites', 'paper')]
paper_FieldOfStudy_index = edge_index_dict[('paper', 'has_topic', 'field_of_study')]


def _globalize_edges(edge_index, src_offset, dst_offset):
    """Shift local source/target indices by their type offsets; return an (num_edges, 2) array."""
    return np.stack([edge_index[0, :] + src_offset,
                     edge_index[1, :] + dst_offset]).T


# Globally unique edge IDs: the four relations are numbered back to back.
author_institution_id = np.arange(0, author_institution_index.shape[1])

author_paper_start = author_institution_id[-1] + 1
author_paper_id = np.arange(author_paper_start, author_paper_start + author_paper_index.shape[1])

paper_paper_start = author_paper_id[-1] + 1
paper_paper_id = np.arange(paper_paper_start, paper_paper_start + paper_paper_index.shape[1])

paper_FieldOfStudy_start = paper_paper_id[-1] + 1
paper_FieldOfStudy_id = np.arange(paper_FieldOfStudy_start,
                                  paper_FieldOfStudy_start + paper_FieldOfStudy_index.shape[1])

# Edge endpoints expressed in global node IDs. The source of an
# author -> institution edge is an author, so it must be shifted by the
# author offset (it was previously shifted by the paper offset, which made
# these edges point at paper IDs).
author_institution_edge = _globalize_edges(author_institution_index,
                                           AuthorNode_id[0], InstitutionNode_id[0])
author_paper_edge = _globalize_edges(author_paper_index,
                                     AuthorNode_id[0], PaperNode_id[0])
paper_paper_edge = _globalize_edges(paper_paper_index,
                                    PaperNode_id[0], PaperNode_id[0])
paper_FieldOfStudy_edge = _globalize_edges(paper_FieldOfStudy_index,
                                           PaperNode_id[0], FieldOfStudy_id[0])

# Single-graph membership masks: one row of ones with one column per
# global node / edge ID.
node_list = np.ones((1, FieldOfStudy_id[-1]+1))
edge_list = np.ones((1, paper_FieldOfStudy_id[-1]+1))

In [None]:
# Schema describing where each node/edge/graph attribute lives inside
# "ogbn-mag.npz". Every "key" names an entry of that archive.
metadata = {
    "description": "OGBN-MAG dataset.",
    "data": {
        "Node": {
            "PaperNode": {
                "_ID": {
                    "file": "ogbn-mag.npz",
                    "key": "PaperNode_id"
                },
                "PaperFeature": {
                    "description": "Node features of ogbn-mag dataset.",
                    "type": "float",
                    "format": "Tensor",
                    "file": "ogbn-mag.npz",
                    "key": "paper_feats",
                },
                "PaperLabel": {
                    # The task definition declares 349 classes, so the label
                    # range is 0..348 (the old text said "1 to 40").
                    "description": "Node labels of ogbn-mag dataset, int ranged from 0 to 348.",
                    "type": "int",
                    "format": "Tensor",
                    "file": "ogbn-mag.npz",
                    "key": "paper_class"
                },
                "PaperYear": {
                    "description": "Year of the article represented by the Node",
                    "type": "int",
                    "format": "Tensor",
                    "file": "ogbn-mag.npz",
                    "key": "paper_year"
                }
            },
            # Authors, institutions and fields of study carry IDs only.
            "AuthorNode": {
                "_ID": {
                    "file": "ogbn-mag.npz",
                    "key": "AuthorNode_id"
                },
            },
            "InstitutionNode": {
                "_ID": {
                    "file": "ogbn-mag.npz",
                    "key": "InstitutionNode_id"
                },
            },
            "FieldOfStudyNode": {
                "_ID": {
                    "file": "ogbn-mag.npz",
                    "key": "FieldOfStudyNode_id"
                },
            }
        },
        "Edge": {
            "Author_affiliated_with_Institution": {
                "_ID": {
                    "file": "ogbn-mag.npz",
                    "key": "author_institution_id"
                },
                "_Edge": {
                    "file": "ogbn-mag.npz",
                    "key": "author_institution_edge"
                },
            },
            "Author_writes_Paper": {
                "_ID": {
                    "file": "ogbn-mag.npz",
                    "key": "author_paper_id"
                },
                "_Edge": {
                    "file": "ogbn-mag.npz",
                    "key": "author_paper_edge"
                },
            },
            "Paper_cites_Paper": {
                "_ID": {
                    "file": "ogbn-mag.npz",
                    "key": "paper_paper_id"
                },
                "_Edge": {
                    "file": "ogbn-mag.npz",
                    "key": "paper_paper_edge"
                },
            },
            "Paper_has_topic_FieldOfStudy": {
                "_ID": {
                    "file": "ogbn-mag.npz",
                    "key": "paper_FieldOfStudy_id"
                },
                "_Edge": {
                    "file": "ogbn-mag.npz",
                    "key": "paper_FieldOfStudy_edge"
                },
            },
        },
        "Graph": {
            "_NodeList": {
                "file": "ogbn-mag.npz",
                "key": "node_list"
            },
            "_EdgeList": {
                "file": "ogbn-mag.npz",
                "key": "edge_list"
            }
        }
    },
    "citation": "@inproceedings{wang2020microsoft,\ntitle={Microsoft academic graph: When experts are not enough},\nauthor={Wang, Kuansan and Shen, Zhihong and Huang, Chiyuan and Wu, Chieh-Han and Dong, Yuxiao and Kanakia, Anshul},\nbooktitle={Quantitative Science Studies},\npages={396--413},\nyear={2020}\n}"
}

In [None]:

# Arrays to be written to "ogbn-mag.npz". Each key becomes an entry name in
# the archive and must match, character for character, the "key" values in
# the metadata schema. Two kinds of mismatch are fixed here:
#   * several keys used to carry trailing spaces left over from column
#     alignment (e.g. "author_paper_edge      "), so lookups by the clean
#     name failed;
#   * the field-of-study IDs were stored as "FieldOfStudy_id" while the
#     metadata refers to "FieldOfStudyNode_id".
data = {
    # Node IDs and paper attributes
    "PaperNode_id": PaperNode_id,
    "paper_feats": paper_feats,
    "paper_class": paper_class,
    "paper_year": paper_year,
    "AuthorNode_id": AuthorNode_id,
    "InstitutionNode_id": InstitutionNode_id,
    "FieldOfStudyNode_id": FieldOfStudy_id,
    # Edge IDs
    "author_institution_id": author_institution_id,
    "author_paper_id": author_paper_id,
    "paper_paper_id": paper_paper_id,
    "paper_FieldOfStudy_id": paper_FieldOfStudy_id,
    # Edge endpoints in global node IDs
    "author_institution_edge": author_institution_edge,
    "author_paper_edge": author_paper_edge,
    "paper_paper_edge": paper_paper_edge,
    "paper_FieldOfStudy_edge": paper_FieldOfStudy_edge,
    # Graph membership masks
    "node_list": node_list,
    "edge_list": edge_list,
}

In [None]:
# Write every array in `data` into one compressed archive; each dictionary
# key is passed as a keyword argument and so becomes the entry's name.
np.savez_compressed("ogbn-mag.npz", **data)


In [None]:
# Reopen the archive that was just written and list the entry names it
# contains, as a quick check of what was saved.
# NOTE(review): allow_pickle=True permits unpickling object arrays; that is
# only reasonable because this file was produced locally — confirm it is
# still required.
mag = np.load("ogbn-mag.npz", allow_pickle=True)
mag.files

In [None]:
# Serialize the schema to metadata.json. The context manager guarantees the
# file is flushed and closed (the bare open() used before left the handle
# to be closed whenever the garbage collector got to it).
with open("metadata.json", "w") as fp:
    json.dump(metadata, fp, indent=4)

In [None]:
# Node-classification task definition. "object"/"attribute" pairs must name
# entries that exist in the metadata schema: the paper feature attribute is
# "PaperFeature" and the prediction target is "PaperLabel" (the previous
# values "PaperNodeFeature" and "PaperNode" matched nothing in the schema).
task = {
    "description": "The ogbn-mag dataset is a heterogeneous network composed of a subset of the Microsoft Academic Graph (MAG) [1]. It contains four types of entities—papers (736,389 nodes), authors (1,134,649 nodes), institutions (8,740 nodes), and fields of study (59,965 nodes)—as well as four types of directed relations connecting two types of entities—an author is “affiliated with” an institution, an author “writes” a paper, a paper “cites” a paper, and a paper “has a topic of” a field of study. Similar to ogbn-arxiv, each paper is associated with a 128-dimensional word2vec feature vector, and all the other types of entities are not associated with input node features.",
    "type": "NodeClassification",
    # Inputs available to the model.
    "feature": [
        {
            "object": "PaperNode",
            "attribute": "PaperFeature"
        },
        {
            "object": "PaperNode",
            "attribute": "PaperYear"
        }
    ],
    # What is being predicted.
    "target": {
        "object": "PaperNode",
        "attribute": "PaperLabel",
        "num_classes": 349
    },
    # Index splits stored in the companion task archive.
    "train_set": {
        "file": "ogbn-mag_task.npz",
        "key": "train"
    },
    "val_set": {
        "file": "ogbn-mag_task.npz",
        "key": "val"
    },
    "test_set": {
        "file": "ogbn-mag_task.npz",
        "key": "test"
    }
}

In [None]:
def _paper_indices(indices):
    """Return indices['paper'] when indices is a per-node-type dict, else indices unchanged.

    NOTE(review): for heterogeneous OGB datasets the split entries are
    expected to be dicts keyed by node type (e.g. {'paper': array}). Saving
    such a dict directly would store a pickled object instead of an index
    array. Confirm against the installed ogb version.
    """
    return indices['paper'] if isinstance(indices, dict) else indices


# Train / validation / test node indices, stored under the keys that the
# task definition refers to.
task_data = {
    "train": _paper_indices(train_idx),
    "val": _paper_indices(valid_idx),
    "test": _paper_indices(test_idx),
}
np.savez_compressed("ogbn-mag_task.npz", **task_data)

In [None]:
# Persist the task definition next to the data files as indented JSON.
with open("./task.json", "w") as task_file:
    json.dump(task, task_file, indent=4)

