In [1]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
import torch
import numpy as np
os.environ['TORCH'] = torch.__version__
os.environ['DGLBACKEND'] = "pytorch"
import dgl
import dgl.graphbolt as gb
# Install the CPU version.
device = torch.device("cpu")

OnDiskDataset for Homogeneous Graph

In [2]:
base_dir = '../../../data/dgl/ondisk_dataset_homograph'
os.makedirs(base_dir, exist_ok=True)
print(f"Created base directory: {base_dir}")

Created base directory: ../../../data/dgl/ondisk_dataset_homograph


Generate graph structure data

In [3]:
import numpy as np
import pandas as pd
num_nodes = 1000
num_edges =10*num_nodes 
edges_path = os.path.join(base_dir,"edges.csv")
edges = np.random.randint(0, num_nodes, size=(num_edges, 2))
print(f"Part of edges: {edges[:5, :]}")
df = pd.DataFrame(edges)
df.to_csv(edges_path, index=False, header=False)

print(f"Edges are saved into {edges_path}")

Part of edges: [[797 441]
 [623 664]
 [690 323]
 [375 684]
 [756 976]]
Edges are saved into ../../../data/dgl/ondisk_dataset_homograph\edges.csv


Generate feature data for graph

In [4]:
# Generate node feature in numpy array.
node_feat_0_path = os.path.join(base_dir, "node-feat-0.npy")
node_feat_0 = np.random.rand(num_nodes, 5)
print(f"Part of node feature [feat_0]: {node_feat_0[:3, :]}")
np.save(node_feat_0_path, node_feat_0)
print(f"Node feature [feat_0] is saved to {node_feat_0_path}\n")

# Generate another node feature in torch tensor
node_feat_1_path = os.path.join(base_dir, "node-feat-1.pt")
node_feat_1 = torch.rand(num_nodes, 5)
print(f"Part of node feature [feat_1]: {node_feat_1[:3, :]}")
torch.save(node_feat_1, node_feat_1_path)
print(f"Node feature [feat_1] is saved to {node_feat_1_path}\n")

# Generate edge feature in numpy array.
edge_feat_0_path = os.path.join(base_dir, "edge-feat-0.npy")
edge_feat_0 = np.random.rand(num_edges, 5)
print(f"Part of edge feature [feat_0]: {edge_feat_0[:3, :]}")
np.save(edge_feat_0_path, edge_feat_0)
print(f"Edge feature [feat_0] is saved to {edge_feat_0_path}\n")

# Generate another edge feature in torch tensor
edge_feat_1_path = os.path.join(base_dir, "edge-feat-1.pt")
edge_feat_1 = torch.rand(num_edges, 5)
print(f"Part of edge feature [feat_1]: {edge_feat_1[:3, :]}")
torch.save(edge_feat_1, edge_feat_1_path)
print(f"Edge feature [feat_1] is saved to {edge_feat_1_path}\n")


Part of node feature [feat_0]: [[0.04501775 0.86130038 0.80438487 0.41769995 0.03278492]
 [0.3294835  0.84267415 0.24491389 0.26657671 0.61897753]
 [0.05025079 0.40449633 0.40596401 0.48071805 0.35369897]]
Node feature [feat_0] is saved to ../../../data/dgl/ondisk_dataset_homograph\node-feat-0.npy

Part of node feature [feat_1]: tensor([[0.8364, 0.8196, 0.2657, 0.2338, 0.4022],
        [0.7175, 0.2167, 0.6442, 0.3115, 0.3865],
        [0.9904, 0.7448, 0.7098, 0.7667, 0.9385]])
Node feature [feat_1] is saved to ../../../data/dgl/ondisk_dataset_homograph\node-feat-1.pt

Part of edge feature [feat_0]: [[0.52706591 0.05666878 0.8666067  0.49138129 0.7295974 ]
 [0.5019747  0.79082874 0.09770859 0.092976   0.82352974]
 [0.04433413 0.55900351 0.80682557 0.55105994 0.2960257 ]]
Edge feature [feat_0] is saved to ../../../data/dgl/ondisk_dataset_homograph\edge-feat-0.npy

Part of edge feature [feat_1]: tensor([[0.5054, 0.1275, 0.5535, 0.7997, 0.4028],
        [0.6465, 0.7800, 0.8730, 0.6349, 0.2

Generate tasksÂ¶

Node classification task 

In [5]:
num_trains = int(num_nodes * 0.6)
num_vals = int(num_nodes * 0.2)
num_tests = num_nodes - num_trains - num_vals

ids = np.arange(num_nodes)
np.random.shuffle(ids)

nc_train_ids_path = os.path.join(base_dir, "nc-train-ids.npy")
nc_train_ids = ids[:num_trains]
print(f"Part of train ids for node classification: {nc_train_ids[:3]}")
np.save(nc_train_ids_path, nc_train_ids)
print(f"NC train ids are saved to {nc_train_ids_path}\n")

nc_train_labels_path = os.path.join(base_dir, "nc-train-labels.pt")
nc_train_labels = torch.randint(0, 10, (num_trains,))
print(f"Part of train labels for node classification: {nc_train_labels[:3]}")
torch.save(nc_train_labels, nc_train_labels_path)
print(f"NC train labels are saved to {nc_train_labels_path}\n")

nc_val_ids_path = os.path.join(base_dir, "nc-val-ids.npy")
nc_val_ids = ids[num_trains:num_trains+num_vals]
print(f"Part of val ids for node classification: {nc_val_ids[:3]}")
np.save(nc_val_ids_path, nc_val_ids)
print(f"NC val ids are saved to {nc_val_ids_path}\n")

nc_val_labels_path = os.path.join(base_dir, "nc-val-labels.pt")
nc_val_labels = torch.randint(0, 10, (num_vals,))
print(f"Part of val labels for node classification: {nc_val_labels[:3]}")
torch.save(nc_val_labels, nc_val_labels_path)
print(f"NC val labels are saved to {nc_val_labels_path}\n")

nc_test_ids_path = os.path.join(base_dir, "nc-test-ids.npy")
nc_test_ids = ids[-num_tests:]
print(f"Part of test ids for node classification: {nc_test_ids[:3]}")
np.save(nc_test_ids_path, nc_test_ids)
print(f"NC test ids are saved to {nc_test_ids_path}\n")

nc_test_labels_path = os.path.join(base_dir, "nc-test-labels.pt")
nc_test_labels = torch.randint(0, 10, (num_tests,))
print(f"Part of test labels for node classification: {nc_test_labels[:3]}")
torch.save(nc_test_labels, nc_test_labels_path)
print(f"NC test labels are saved to {nc_test_labels_path}\n")

Part of train ids for node classification: [474 353 844]
NC train ids are saved to ../../../data/dgl/ondisk_dataset_homograph\nc-train-ids.npy

Part of train labels for node classification: tensor([5, 3, 3])
NC train labels are saved to ../../../data/dgl/ondisk_dataset_homograph\nc-train-labels.pt

Part of val ids for node classification: [ 18  47 688]
NC val ids are saved to ../../../data/dgl/ondisk_dataset_homograph\nc-val-ids.npy

Part of val labels for node classification: tensor([1, 5, 3])
NC val labels are saved to ../../../data/dgl/ondisk_dataset_homograph\nc-val-labels.pt

Part of test ids for node classification: [252 943 443]
NC test ids are saved to ../../../data/dgl/ondisk_dataset_homograph\nc-test-ids.npy

Part of test labels for node classification: tensor([6, 1, 1])
NC test labels are saved to ../../../data/dgl/ondisk_dataset_homograph\nc-test-labels.pt



link prediction task 

In [6]:
num_trains = int(num_edges * 0.6)
num_vals = int(num_edges * 0.2)
num_tests = num_edges - num_trains - num_vals

lp_train_node_pairs_path = os.path.join(base_dir, "lp-train-node-pairs.npy")
lp_train_node_pairs = edges[:num_trains, :]
print(f"Part of train node pairs for link prediction: {lp_train_node_pairs[:3]}")
np.save(lp_train_node_pairs_path, lp_train_node_pairs)
print(f"LP train node pairs are saved to {lp_train_node_pairs_path}\n")

lp_val_node_pairs_path = os.path.join(base_dir, "lp-val-node-pairs.npy")
lp_val_node_pairs = edges[num_trains:num_trains+num_vals, :]
print(f"Part of val node pairs for link prediction: {lp_val_node_pairs[:3]}")
np.save(lp_val_node_pairs_path, lp_val_node_pairs)
print(f"LP val node pairs are saved to {lp_val_node_pairs_path}\n")

lp_val_neg_dsts_path = os.path.join(base_dir, "lp-val-neg-dsts.pt")
lp_val_neg_dsts = torch.randint(0, num_nodes, (num_vals, 10))
print(f"Part of val negative dsts for link prediction: {lp_val_neg_dsts[:3]}")
torch.save(lp_val_neg_dsts, lp_val_neg_dsts_path)
print(f"LP val negative dsts are saved to {lp_val_neg_dsts_path}\n")

lp_test_node_pairs_path = os.path.join(base_dir, "lp-test-node-pairs.npy")
lp_test_node_pairs = edges[-num_tests, :]
print(f"Part of test node pairs for link prediction: {lp_test_node_pairs[:3]}")
np.save(lp_test_node_pairs_path, lp_test_node_pairs)
print(f"LP test node pairs are saved to {lp_test_node_pairs_path}\n")

lp_test_neg_dsts_path = os.path.join(base_dir, "lp-test-neg-dsts.pt")
lp_test_neg_dsts = torch.randint(0, num_nodes, (num_tests, 10))
print(f"Part of test negative dsts for link prediction: {lp_test_neg_dsts[:3]}")
torch.save(lp_test_neg_dsts, lp_test_neg_dsts_path)
print(f"LP test negative dsts are saved to {lp_test_neg_dsts_path}\n")

Part of train node pairs for link prediction: [[797 441]
 [623 664]
 [690 323]]
LP train node pairs are saved to ../../../data/dgl/ondisk_dataset_homograph\lp-train-node-pairs.npy

Part of val node pairs for link prediction: [[ 69 947]
 [985 310]
 [626 707]]
LP val node pairs are saved to ../../../data/dgl/ondisk_dataset_homograph\lp-val-node-pairs.npy

Part of val negative dsts for link prediction: tensor([[514, 333, 894,   7, 976, 934, 190, 313, 931, 819],
        [263, 857, 146, 233, 998,  93,  69, 941, 679, 309],
        [484, 306, 743, 135, 525, 626, 142, 595, 535,  59]])
LP val negative dsts are saved to ../../../data/dgl/ondisk_dataset_homograph\lp-val-neg-dsts.pt

Part of test node pairs for link prediction: [837 811]
LP test node pairs are saved to ../../../data/dgl/ondisk_dataset_homograph\lp-test-node-pairs.npy

Part of test negative dsts for link prediction: tensor([[392, 647,  13,  47, 707, 483, 432, 123, 717, 501],
        [521, 592, 616, 185, 204, 686, 111,  63,  47, 390

Organize Data into YAML File

In [9]:
yaml_content = f"""
    dataset_name: homogeneous_graph_nc_lp
    graph:
      nodes:
        - num: {num_nodes}
      edges:
        - format: csv
          path: {os.path.basename(edges_path)}
    feature_data:
      - domain: node
        name: feat_0
        format: numpy
        path: {os.path.basename(node_feat_0_path)}
      - domain: node
        name: feat_1
        format: torch
        path: {os.path.basename(node_feat_1_path)}
      - domain: edge
        name: feat_0
        format: numpy
        path: {os.path.basename(edge_feat_0_path)}
      - domain: edge
        name: feat_1
        format: torch
        path: {os.path.basename(edge_feat_1_path)}
    tasks:
      - name: node_classification
        num_classes: 10
        train_set:
          - data:
              - name: seed_nodes
                format: numpy
                path: {os.path.basename(nc_train_ids_path)}
              - name: labels
                format: torch
                path: {os.path.basename(nc_train_labels_path)}
        validation_set:
          - data:
              - name: seed_nodes
                format: numpy
                path: {os.path.basename(nc_val_ids_path)}
              - name: labels
                format: torch
                path: {os.path.basename(nc_val_labels_path)}
        test_set:
          - data:
              - name: seed_nodes
                format: numpy
                path: {os.path.basename(nc_test_ids_path)}
              - name: labels
                format: torch
                path: {os.path.basename(nc_test_labels_path)}
      - name: link_prediction
        num_classes: 10
        train_set:
          - data:
              - name: node_pairs
                format: numpy
                path: {os.path.basename(lp_train_node_pairs_path)}
        validation_set:
          - data:
              - name: node_pairs
                format: numpy
                path: {os.path.basename(lp_val_node_pairs_path)}
              - name: negative_dsts
                format: torch
                path: {os.path.basename(lp_val_neg_dsts_path)}
        test_set:
          - data:
              - name: node_pairs
                format: numpy
                path: {os.path.basename(lp_test_node_pairs_path)}
              - name: negative_dsts
                format: torch
                path: {os.path.basename(lp_test_neg_dsts_path)}
"""
metadata_path = os.path.join(base_dir, "metadata.yaml")
with open(metadata_path, "w") as f:
  f.write(yaml_content)

load created data 

In [14]:
dataset = gb.OnDiskDataset(base_dir).load(tasks="node_classification")
print(dataset)
graph = dataset.graph
print(f"Loaded graph: {graph}\n")

feature = dataset.feature
print(f"Loaded feature store: {feature}\n")

tasks = dataset.tasks
nc_task = tasks[0]
print(f"Loaded node classification task: {nc_task}\n")
# lp_task = tasks[1]
# print(f"Loaded link prediction task: {lp_task}\n")

The dataset is already preprocessed.
<dgl.graphbolt.impl.ondisk_dataset.OnDiskDataset object at 0x000001721CF3F310>
Loaded graph: FusedCSCSamplingGraph(csc_indptr=tensor([    0,    10,    24,  ...,  9987,  9992, 10000], dtype=torch.int32),
                      indices=tensor([ 78, 248, 122,  ..., 747, 666, 809], dtype=torch.int32),
                      total_num_nodes=1000, num_edges=10000,)

Loaded feature store: TorchBasedFeatureStore(
    {(<OnDiskFeatureDataDomain.NODE: 'node'>, None, 'feat_0'): TorchBasedFeature(
        feature=tensor([[0.0450, 0.8613, 0.8044, 0.4177, 0.0328],
                        [0.3295, 0.8427, 0.2449, 0.2666, 0.6190],
                        [0.0503, 0.4045, 0.4060, 0.4807, 0.3537],
                        ...,
                        [0.0928, 0.5588, 0.9849, 0.6688, 0.5997],
                        [0.5797, 0.9340, 0.8711, 0.6672, 0.6018],
                        [0.4929, 0.6106, 0.6122, 0.2440, 0.5539]], dtype=torch.float64),
        metadata={},
    )