# Citeseer Example

## Data

In [1]:
import json
import numpy as np
import torch
import scipy.sparse as sparse
from dgl.data import CiteseerGraphDataset
# Load the Citeseer citation dataset through DGL; running this cell prints
# the dataset summary (node/edge/feature/class counts and split sizes).
dataset = CiteseerGraphDataset()
# Citeseer consists of a single graph, so take the first (only) item.
graph = dataset[0]

  NumNodes: 3327
  NumEdges: 9228
  NumFeats: 3703
  NumClasses: 6
  NumTrainingSamples: 120
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.


We can save all of the data in a single file, `citeseer.npz`.

In [2]:
# Node features as a SciPy CSR matrix, one row per node.
# Citeseer is a single graph that contains some isolated nodes; these
# isolated nodes are added as zero vectors in the right position.
node_feats = sparse.csr_matrix(graph.ndata["feat"].numpy())

# Class label of every node.
node_class = graph.ndata["label"].numpy()  # (3327,)

# Edge list: graph.edges() returns the (source, destination) tensors;
# stacking and transposing gives one (source, destination) pair per row.
edge = torch.stack(graph.edges()).numpy().T  # (9228, 2)

In [3]:
from gli.io import save_graph, Attribute

# Each Attribute is given, in order: name, data, description, element type,
# and storage format (these appear as "description"/"type"/"format" in the
# metadata printed below).
node_attrs = [
    Attribute(
        "NodeFeature",
        node_feats,
        "Node features of Citeseer dataset, 1/0-valued vectors.",
        # NOTE(review): declared as "int", but the reloaded graph at the end
        # of this notebook reports NodeFeature as float32 - confirm the
        # intended type.
        "int",
        "SparseTensor",
    ),
    Attribute(
        "NodeLabel",
        node_class,
        # Labels are zero-based class indices; with 6 classes they span 0..5.
        "Node labels of Citeseer dataset, int ranged from 0 to 5.",
        "int",
        "Tensor",
    ),
]

# Save the graph (edges plus node attributes) and keep the returned metadata
# so it can be inspected in the next cell.
metadata = save_graph(
    name="citeseer",
    edge=edge,
    num_nodes=graph.num_nodes(),
    node_attrs=node_attrs,
    description="CITESEER dataset.",
    # BibTeX entry split across adjacent literals for readability; the
    # concatenated value is unchanged.
    cite=(
        "@inproceedings{yang2016revisiting,\n"
        "title={Revisiting semi-supervised learning with graph embeddings},\n"
        "author={Yang, Zhilin and Cohen, William and Salakhudinov, Ruslan},\n"
        "booktitle={International conference on machine learning},\n"
        "pages={40--48},\n"
        "year={2016},\n"
        "organization={PMLR}\n"
        "}"
    ),
)

The `metadata.json` file and the graph data (`.npz` files) are now saved in the current directory.

In [4]:
# Show the metadata returned by save_graph as indented JSON.
metadata_text = json.dumps(metadata, indent=2)
print(metadata_text)

{
  "description": "CITESEER dataset.",
  "data": {
    "Node": {
      "NodeFeature": {
        "description": "Node features of Citeseer dataset, 1/0-valued vectors.",
        "type": "int",
        "format": "SparseTensor",
        "file": "citeseer__graph__Node_NodeFeature__48cffb6534f4b56a45196efa8b32cdac.sparse.npz"
      },
      "NodeLabel": {
        "description": "Node labels of Citeseer dataset, int ranged from 1 to 6.",
        "type": "int",
        "format": "Tensor",
        "file": "citeseer__graph__aed93544b5c54381d05b5452603278fb.npz",
        "key": "Node_NodeLabel"
      }
    },
    "Edge": {
      "_Edge": {
        "file": "citeseer__graph__aed93544b5c54381d05b5452603278fb.npz",
        "key": "Edge_Edge"
      }
    },
    "Graph": {
      "_NodeList": {
        "file": "citeseer__graph__Graph_NodeList__be3f84ead018cfb899bd6f98d0bb92db.sparse.npz"
      }
    }
  },
  "citation": "@inproceedings{yang2016revisiting,\ntitle={Revisiting semi-supervised learning wi

## Task

In [5]:
def _mask_to_indices(mask):
    """Return the indices of the True entries of a 1-D boolean mask.

    Args:
        mask: 1-D boolean torch tensor (here, one entry per node).

    Returns:
        1-D NumPy integer array holding the positions where ``mask`` is True.
    """
    # as_tuple=True yields one 1-D index tensor per dimension, so the result
    # stays 1-D even when exactly one entry is set. A bare .squeeze() on the
    # (k, 1) output of nonzero() would collapse that case to a 0-d array.
    return torch.nonzero(mask, as_tuple=True)[0].numpy()


# Node indices of the predefined train / validation / test partitions.
train_set = _mask_to_indices(graph.ndata["train_mask"])
val_set = _mask_to_indices(graph.ndata["val_mask"])
test_set = _mask_to_indices(graph.ndata["test_mask"])

In [6]:
from gli.io import save_task_node_classification

# Node-index arrays of each partition, keyed by the keyword argument
# under which they are handed to the task writer.
split_indices = {
    "train_set": train_set,
    "val_set": val_set,
    "test_set": test_set,
}

# Save the node-classification task and keep the returned task description
# so it can be printed in the next cell.
task_data = save_task_node_classification(
    name="citeseer",
    description="Node classification on CITESEER dataset. Planetoid split.",
    feature=["Node/NodeFeature"],
    target="Node/NodeLabel",
    num_classes=6,
    task_id="1",
    **split_indices,
)

The task data (.json and .npz files) is now saved in the current directory.

In [7]:
# Show the task description returned by save_task_node_classification
# as indented JSON.
task_text = json.dumps(task_data, indent=2)
print(task_text)

{
  "description": "Node classification on CITESEER dataset. Planetoid split.",
  "type": "NodeClassification",
  "feature": [
    "Node/NodeFeature"
  ],
  "target": "Node/NodeLabel",
  "num_classes": 6,
  "train_set": {
    "file": "citeseer__task_node_classification_1__d0b7b5f7e7e7cb9b84e0b3e97354e16e.npz",
    "key": "train_set"
  },
  "val_set": {
    "file": "citeseer__task_node_classification_1__d0b7b5f7e7e7cb9b84e0b3e97354e16e.npz",
    "key": "val_set"
  },
  "test_set": {
    "file": "citeseer__task_node_classification_1__d0b7b5f7e7e7cb9b84e0b3e97354e16e.npz",
    "key": "test_set"
  }
}


Test loading the data.

In [8]:
from gli.dataloading import read_gli_graph, read_gli_task, combine_graph_and_task

# Reload what was just written: the graph from its metadata file and the
# task from its task file, then combine them into one dataset object.
loaded_graph_data = read_gli_graph("./metadata.json")
loaded_task = read_gli_task("./task_node_classification_1.json")
loaded_dataset = combine_graph_and_task(loaded_graph_data, loaded_task)

# Round-trip check: the reloaded graph must have the same number of nodes
# and edges as the DGL graph it was converted from.
loaded_graph = loaded_dataset[0]
assert loaded_graph.num_nodes() == graph.num_nodes(), "node count changed after reload"
assert loaded_graph.num_edges() == graph.num_edges(), "edge count changed after reload"

# Display the reloaded graph (last expression of the cell).
loaded_graph

CITESEER dataset.
Node classification on CITESEER dataset. Planetoid split.


  return torch.sparse_csr_tensor(crow_indices,


Graph(num_nodes=3327, num_edges=9228,
      ndata_schemes={'NodeFeature': Scheme(shape=(3703,), dtype=torch.float32), 'NodeLabel': Scheme(shape=(), dtype=torch.int64), 'train_mask': Scheme(shape=(), dtype=torch.bool), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'test_mask': Scheme(shape=(), dtype=torch.bool)}
      edata_schemes={})

After adding `LICENSE` and `README.md`, the dataset directory will look like the following.

In [9]:
# List the contents of the current (dataset) directory with the `tree`
# shell command, run through IPython's `!` escape.
!tree .

.
├── LICENSE
├── README.md
├── citeseer.ipynb
├── citeseer__graph__Graph_NodeList__be3f84ead018cfb899bd6f98d0bb92db.sparse.npz
├── citeseer__graph__Node_NodeFeature__48cffb6534f4b56a45196efa8b32cdac.sparse.npz
├── citeseer__graph__aed93544b5c54381d05b5452603278fb.npz
├── citeseer__task_node_classification_1__d0b7b5f7e7e7cb9b84e0b3e97354e16e.npz
├── metadata.json
├── task_node_classification_1.json
└── urls.json

1 directory, 10 files


In [1]:
# Prints the literal string '123'; nothing else in the notebook uses it.
# NOTE(review): this looks like a leftover scratch cell - its execution
# count (In [1]) is out of sequence with the cells above. Consider deleting
# it before publishing the notebook.
print('123')

123
