# Cora Example

## Data

In [4]:
# Dataset-level metadata that is later serialized to metadata.json.
# Every entry under "data" names the file and the key inside that file
# where the corresponding array is stored.
metadata = {
    "description": "CORA dataset.",
    "data": {
        "Node": {
            # NOTE(review): DGL may row-normalize "feat" into floats; confirm
            # that "int" / "1/0-valued" still describes the stored values.
            "NodeFeature": {
                "description": "Node features of Cora dataset, 1/0-valued vectors.",
                "type": "int",
                "format": "SparseTensor",
                "file": "cora.npz",
                "key": "node_feats"
            },
            # The labels are class indices for 7 classes, i.e. 0..6 (not 1..7).
            "NodeLabel": {
                "description": "Node labels of Cora dataset, int ranged from 0 to 6.",
                "type": "int",
                "format": "Tensor",
                "file": "cora.npz",
                "key": "node_class"
            }
        },
        "Edge": {
            "_Edge": {
                "file": "cora.npz",
                "key": "edge"
            }
        },
        "Graph": {
            "_NodeList": {
                "file": "cora.npz",
                "key": "node_list"
            },
            "_EdgeList": {
                "file": "cora.npz",
                "key": "edge_list"
            }
        }
    },
    "citation": "@inproceedings{yang2016revisiting,\ntitle={Revisiting semi-supervised learning with graph embeddings},\nauthor={Yang, Zhilin and Cohen, William and Salakhudinov, Ruslan},\nbooktitle={International conference on machine learning},\npages={40--48},\nyear={2016},\norganization={PMLR}\n}"
}
 

In [5]:
import json
import numpy as np
import torch
import scipy.sparse as sparse
from dgl.data import CoraGraphDataset
# Instantiate DGL's Cora dataset (the logged output of this cell shows the
# archive being downloaded, extracted and cached on this run).
dataset = CoraGraphDataset()
# Take the first item of the dataset as the graph used by the following cells.
graph = dataset[0]

DGL backend not selected or invalid.  Assuming PyTorch for now.


Setting the default backend to "pytorch". You can change it in the ~/.dgl/config.json file or export the DGLBACKEND environment variable.  Valid options are: pytorch, mxnet, tensorflow (all lowercase)
Downloading /root/.dgl/cora_v2.zip from https://data.dgl.ai/dataset/cora_v2.zip...
Extracting file to /root/.dgl/cora_v2
Finished data loading and preprocessing.
  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done saving data into cached files.


All of the data arrays can be stored in a single file, `cora.npz`.

In [6]:
# Node features: the "feat" tensor converted to a SciPy CSR sparse matrix.
node_feats = sparse.csr_matrix(graph.ndata["feat"].numpy())
# Node labels as a NumPy array with one entry per node: (2708,)
node_class = graph.ndata["label"].numpy()
# Edges: stack the tensors returned by graph.edges(), then transpose so that
# each row describes one edge.
edge = np.transpose(torch.stack(graph.edges()).numpy())
# Cora consists of one single graph, so node_list and edge_list each hold
# exactly one all-ones row (presumably a per-graph membership mask -- confirm
# against the loader that consumes "_NodeList" / "_EdgeList").
node_list = np.ones(shape=(1, graph.num_nodes()))  # (1, 2708)
edge_list = np.ones(shape=(1, graph.num_edges()))  # (1, 10556)

# Arrays keyed by the names that the metadata's "key" fields refer to.
data = dict(
    node_feats=node_feats,
    node_class=node_class,
    edge=edge,
    node_list=node_list,
    edge_list=edge_list,
)

In [7]:
# Write every entry of `data` into one compressed .npz archive; each dict key
# becomes the name of the corresponding array inside the archive.
np.savez_compressed("cora.npz", **data)

In [8]:
# Re-open the archive to check what was written.
# allow_pickle=True permits loading object arrays; presumably required here
# because node_feats was handed to savez as a SciPy sparse matrix -- confirm.
# NOTE(review): unpickling can execute arbitrary code, so only use
# allow_pickle=True on archives from a trusted source (this one is produced
# by the cell above).
cora = np.load("cora.npz", allow_pickle=True)
# Names of the arrays stored in the archive (shown as the cell output).
cora.files

['node_feats', 'node_class', 'edge', 'node_list', 'edge_list']

In [9]:
!du cora.npz -h

116K	cora.npz


In [10]:
# Serialize the dataset metadata to metadata.json with 4-space indentation.
# The context manager guarantees the file is flushed and closed, instead of
# leaving an anonymous open handle to be cleaned up by the garbage collector.
with open("metadata.json", "w") as fp:
    json.dump(metadata, fp, indent=4)

## Task

In [11]:
# Task specification: which attribute is the input feature, which one is the
# prediction target, and where the index arrays of each split are stored.
task = dict(
    description="Node classification on CORA dataset. Planetoid split.",
    type="NodeClassification",
    feature=[dict(object="Node", attribute="NodeFeature")],
    target=dict(object="Node", attribute="NodeLabel", num_classes=7),
    # One "<split>_set" entry per split, all pointing into the same archive
    # and keyed by the split name.
    **{
        f"{split}_set": dict(file="cora_task.npz", key=split)
        for split in ("train", "val", "test")
    },
)


In [12]:
def _mask_to_indices(mask):
    """Return the positions where ``mask`` is set, as a 1-D NumPy array.

    Assumes ``mask`` is a 1-D tensor (one entry per node) -- TODO confirm.
    For such a mask ``nonzero()`` yields shape ``(k, 1)``; only that trailing
    axis is squeezed, so the result stays 1-D even when ``k == 1`` (a bare
    ``squeeze()`` would collapse that case to a 0-d array).
    """
    return mask.nonzero().squeeze(1).numpy()


# Node indices of each split, taken from the masks stored on the graph.
train_set = _mask_to_indices(graph.ndata["train_mask"])
val_set = _mask_to_indices(graph.ndata["val_mask"])
test_set = _mask_to_indices(graph.ndata["test_mask"])

In [13]:
# Bundle the three split index arrays under the keys used by the task
# specification, then write them into one compressed archive.
task_data = dict(train=train_set, val=val_set, test=test_set)
np.savez_compressed("cora_task.npz", **task_data)

In [14]:
# Serialize the task specification to ./task.json with 4-space indentation.
with open("./task.json", "w") as task_file:
    task_file.write(json.dumps(task, indent=4))