# Cora Example

## Data

In [1]:
import json
import numpy as np
import torch
import scipy.sparse as sparse
from dgl.data import CoraGraphDataset

dataset = CoraGraphDataset()
graph = dataset[0]

  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.


We can save the data file in only one file `cora.npz`.

In [2]:
# node features
node_feats = sparse.csr_matrix(graph.ndata["feat"].numpy())
# node labels
node_class = graph.ndata["label"].numpy()  # (2708,)
# edge list
edge = torch.stack(graph.edges()).numpy().T
print(edge.shape)

(10556, 2)


In [3]:
from gli.io import save_graph, Attribute

node_attrs = [
    Attribute(
        "NodeFeature",
        node_feats,
        "Node features of Cora dataset, 1/0-valued vectors.",
        "int",
        "SparseTensor",
    ),
    Attribute(
        "NodeLabel",
        node_class,
        "Node labels of Cora dataset, int ranged from 1 to 7.",
        "int",
        "Tensor",
    )
]

metadata = save_graph(
    name="cora",
    edge=edge,
    num_nodes=graph.num_nodes(),
    node_attrs=node_attrs,
    description="CORA dataset.",
    cite=
    "@inproceedings{yang2016revisiting,\ntitle={Revisiting semi-supervised learning with graph embeddings},\nauthor={Yang, Zhilin and Cohen, William and Salakhudinov, Ruslan},\nbooktitle={International conference on machine learning},\npages={40--48},\nyear={2016},\norganization={PMLR}\n}",
)

The metadata.json and graph data (.npz files) is now saved in the current directory.

In [4]:
# Print metadata
print(json.dumps(metadata, indent=2))

{
  "description": "CORA dataset.",
  "data": {
    "Node": {
      "NodeFeature": {
        "description": "Node features of Cora dataset, 1/0-valued vectors.",
        "type": "int",
        "format": "SparseTensor",
        "file": "cora__graph__Node_NodeFeature__7032c9c380d1889061dcbbcd76b8c427.sparse.npz"
      },
      "NodeLabel": {
        "description": "Node labels of Cora dataset, int ranged from 1 to 7.",
        "type": "int",
        "format": "Tensor",
        "file": "cora__graph__6c912909fa18eff10797210ea5e485fe.npz",
        "key": "Node_NodeLabel"
      }
    },
    "Edge": {
      "_Edge": {
        "file": "cora__graph__6c912909fa18eff10797210ea5e485fe.npz",
        "key": "Edge_Edge"
      }
    },
    "Graph": {
      "_NodeList": {
        "file": "cora__graph__Graph_NodeList__23bbef862fd6037395412eb03b4e1d9c.sparse.npz"
      }
    }
  },
  "citation": "@inproceedings{yang2016revisiting,\ntitle={Revisiting semi-supervised learning with graph embeddings},\nautho

## Task

In [5]:
train_set = graph.ndata["train_mask"].nonzero().squeeze().numpy()
val_set = graph.ndata["val_mask"].nonzero().squeeze().numpy()
test_set = graph.ndata["test_mask"].nonzero().squeeze().numpy()

In [6]:
from gli.io import save_task_node_classification

task_data = save_task_node_classification(
    name="cora",
    description="Node classification on CORA dataset. Planetoid split.",
    feature=["Node/NodeFeature"],
    target="Node/NodeLabel",
    num_classes=7,
    train_set=train_set,
    val_set=val_set,
    test_set=test_set,
    task_id="1"
)

The task data (.json and .npz files) is now saved in the current directory.

In [7]:
print(json.dumps(task_data, indent=2))

{
  "description": "Node classification on CORA dataset. Planetoid split.",
  "type": "NodeClassification",
  "feature": [
    "Node/NodeFeature"
  ],
  "target": "Node/NodeLabel",
  "num_classes": 7,
  "train_set": {
    "file": "cora__task_node_classification_1__41e167258678b585872679839ce9c40f.npz",
    "key": "train_set"
  },
  "val_set": {
    "file": "cora__task_node_classification_1__41e167258678b585872679839ce9c40f.npz",
    "key": "val_set"
  },
  "test_set": {
    "file": "cora__task_node_classification_1__41e167258678b585872679839ce9c40f.npz",
    "key": "test_set"
  }
}


Test loading the data.

In [8]:
from gli.dataloading import read_gli_graph, read_gli_task, combine_graph_and_task

g = read_gli_graph("./metadata.json")
t = read_gli_task("./task_node_classification_1.json")
data = combine_graph_and_task(g, t)
data[0]

CORA dataset.
Node classification on CORA dataset. Planetoid split.


  return torch.sparse_csr_tensor(crow_indices,


Graph(num_nodes=2708, num_edges=10556,
      ndata_schemes={'NodeFeature': Scheme(shape=(1433,), dtype=torch.float32), 'NodeLabel': Scheme(shape=(), dtype=torch.int64), 'train_mask': Scheme(shape=(), dtype=torch.bool), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'test_mask': Scheme(shape=(), dtype=torch.bool)}
      edata_schemes={})

After adding LICENSE and README.md, the dataset directory will be the following.

In [9]:
!tree .

[1;36m.[00m
├── LICENSE
├── README.md
├── cora.ipynb
├── cora__graph__6c912909fa18eff10797210ea5e485fe.npz
├── cora__graph__Graph_NodeList__23bbef862fd6037395412eb03b4e1d9c.sparse.npz
├── cora__graph__Node_NodeFeature__7032c9c380d1889061dcbbcd76b8c427.sparse.npz
├── cora__task_node_classification_1__41e167258678b585872679839ce9c40f.npz
├── metadata.json
└── task_node_classification_1.json

0 directories, 9 files
