# Citeseer Example

## Data

In [1]:
import json
import numpy as np
import torch
import scipy.sparse as sparse
from dgl.data import CiteseerGraphDataset
# Load the Citeseer dataset through DGL; the summary printed below reports
# the node/edge/feature/class counts and the sizes of the predefined splits.
dataset = CiteseerGraphDataset()
# The dataset holds a single graph, so take the element at index 0.
graph = dataset[0]

  from .autonotebook import tqdm as notebook_tqdm


  NumNodes: 3327
  NumEdges: 9228
  NumFeats: 3703
  NumClasses: 6
  NumTrainingSamples: 120
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.


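We can save all of the data under a single name, `citeseer`: the dense arrays are written to `citeseer.npz`, and the sparse node-feature matrix is written to a companion file, `citeseer_node_feats.sparse.npz`.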

In [2]:
# Node features, converted from the dense tensor to a SciPy CSR sparse matrix.
node_feats = sparse.csr_matrix(graph.ndata["feat"].numpy())

# Class label of every node.
node_class = graph.ndata["label"].numpy()  # (3327,)

# Edges: stack the tensors returned by graph.edges() and transpose the result
# so that each row describes one edge.
edge = torch.stack(graph.edges()).numpy().transpose()

# Citeseer consists of one single graph, and that graph contains some isolated
# nodes. These isolated nodes are added as zero-vecs into the right position.
# Both lists are therefore a single row of ones, with one entry per node and
# one entry per edge respectively.
node_list = np.ones(shape=(1, graph.num_nodes()))  # (1, 3327)
edge_list = np.ones(shape=(1, graph.num_edges()))  # (1, 9228)

# Bundle every array under the keyword name it will be saved with.
data = dict(
    node_feats=node_feats,
    node_class=node_class,
    edge=edge,
    node_list=node_list,
    edge_list=edge_list,
)

In [3]:
from gli.utils import save_data
# Write every entry of `data` under the "citeseer" prefix. As the output below
# shows, the dense arrays go to citeseer.npz and the sparse `node_feats`
# matrix goes to a separate citeseer_node_feats.sparse.npz file.
save_data("citeseer", **data)

Save all dense arrays to citeseer.npz, including ['node_class', 'edge', 'node_list', 'edge_list']
Save sparse matrix node_feats to citeseer_node_feats.sparse.npz


## Task

In [5]:
# Turn each split mask stored on the graph into a 1-D array of node indices.
# For a 1-D per-node mask, `nonzero()` returns one row per selected node, i.e.
# shape (k, 1). Only that trailing axis is squeezed: a bare `squeeze()` would
# also drop the leading axis when k == 1 and yield a 0-d scalar instead of a
# length-1 index array.
train_set = graph.ndata["train_mask"].nonzero().squeeze(dim=1).numpy()
val_set = graph.ndata["val_mask"].nonzero().squeeze(dim=1).numpy()
test_set = graph.ndata["test_mask"].nonzero().squeeze(dim=1).numpy()

In [6]:
# Collect the node-index arrays of the three splits under the names they are
# saved with.
task_data = dict(train=train_set, val=val_set, test=test_set)

# Write the splits under the "citeseer_task" prefix; all three arrays are
# dense, so they end up together in citeseer_task.npz (see the output below).
save_data("citeseer_task", **task_data)


Save all dense arrays to citeseer_task.npz, including ['train', 'val', 'test']
