In [1]:
import json
import numpy as np
import torch
import scipy.sparse as sparse
from torch_geometric.datasets import WebKB

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# dataset = WebKB(root='/tmp/webkb', name='cornell')
# dataset = WebKB(root='/tmp/webkb', name='texas')
dataset = WebKB(root='/tmp/webkb', name='wisconsin')
graph = dataset[0]
print(graph)
print("Length of dataset:",len(dataset)) # 1
print("NumNodes: ",graph.x.shape[0])
print("NumEdges: ",graph.edge_index.shape[1])
print("NumFeats: ",dataset.num_node_features)
print("NumClasses: ",dataset.num_classes)
print("NumTrainingSamples: ",graph.train_mask.nonzero().squeeze().numpy().shape[0])
print("NumValidationSamples: ", graph.val_mask.nonzero().squeeze().numpy().shape[0])
print("NumTestSamples: ", graph.test_mask.nonzero().squeeze().numpy().shape[0])


Data(x=[251, 1703], edge_index=[2, 515], y=[251], train_mask=[251, 10], val_mask=[251, 10], test_mask=[251, 10])
Length of dataset: 1
NumNodes:  251
NumEdges:  515
NumFeats:  1703
NumClasses:  5
NumTrainingSamples:  1200
NumValidationSamples:  800
NumTestSamples:  510


In [3]:
# wisconsin_feat
node_feats = sparse.csr_matrix(graph.x.numpy()) # (251, 1703)
# wisconsin_class
node_class = graph.y.numpy() # (251,)
# wisconsin_edge
edge = graph.edge_index.T.numpy() # (515, 2)
# wisconsin only has 1 single connected graph
node_list = np.ones((1, graph.x.shape[0]))  # (1, 251)
edge_list = np.ones((1, graph.edge_index.shape[1]))  # (1, 515)
data = {
    "node_feats": node_feats,
    "node_class": node_class,
    "edge": edge,
    "node_list": node_list,
    "edge_list": edge_list
}

We can save the data file in only one file `wisconsin.npz`.

In [4]:
np.savez_compressed("wisconsin.npz", **data)
wisconsin = np.load("wisconsin.npz", allow_pickle=True)
wisconsin.files

['node_feats', 'node_class', 'edge', 'node_list', 'edge_list']

In [5]:
!du wisconsin.npz -h

36K	wisconsin.npz


In [6]:
print("Graph is undirected?: ",graph.is_undirected())


Graph is undirected?:  False


## Task

In [9]:
print(graph.train_mask.shape)

torch.Size([251, 10])


In [10]:
train_set = []
val_set = []
test_set = []
for i in range(graph.train_mask.shape[1]):
    train_set.append(graph.train_mask[:,i].nonzero().squeeze().numpy())
    val_set.append(graph.val_mask[:,i].nonzero().squeeze().numpy())
    test_set.append(graph.test_mask[:,i].nonzero().squeeze().numpy())

In [11]:
task_data = {}
for i in range(graph.train_mask.shape[1]):
    task_data['train_'+str(i)] = train_set[i]
    task_data['val_'+str(i)] = val_set[i]
    task_data['test_'+str(i)] = test_set[i]
np.savez_compressed("wisconsin_task.npz", **task_data)