In [1]:
import os
# Must be set before torch is imported.
# NOTE(review): presumably a workaround for the duplicate-OpenMP-runtime abort
# seen on some platforms — confirm it is still required in this environment.
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
import torch
import numpy as np
# Publish the installed torch version through the environment.
# NOTE(review): none of the visible cells read TORCH — confirm it is still needed.
os.environ['TORCH'] = torch.__version__
# Set before dgl is imported; presumably selects PyTorch as DGL's backend — verify.
os.environ['DGLBACKEND'] = "pytorch"
import dgl
import dgl.graphbolt as gb
import torch.nn as nn
import torch.nn.functional as F
import sklearn.metrics
# Device that the data pipelines copy each mini-batch to (via copy_to(device)).
device = torch.device("cpu")

Loading Dataset

In [2]:
# Load the built-in "ogbn-arxiv" dataset, using the given directory as its root.
dataset = gb.BuiltinDataset(
    "ogbn-arxiv",
    root="../../../data/datasets",
).load()

The dataset is already preprocessed.


Dataset consists of graph, feature and tasks.

In [3]:
# The dataset bundles a graph, a feature store and a list of tasks.
# Only the first task is used in this notebook.
first_task = dataset.tasks[0]

graph, feature = dataset.graph, dataset.feature
train_set, valid_set, test_set = (
    first_task.train_set,
    first_task.validation_set,
    first_task.test_set,
)
task_name, num_classes = (
    first_task.metadata["name"],
    first_task.metadata["num_classes"],
)
print(f"Task: {task_name}. Number of classes: {num_classes}")

Task: node_classification. Number of classes: 40


Defining Neighbor Sampler and Data Loader in DGL

In [6]:
# Training pipeline, built as one fluent chain:
#   1. draw shuffled batches of 1024 items from the training set,
#   2. sample neighbours on the graph with [4, 4] as the per-layer setting
#      (presumably the fanout for two layers — confirm against the GraphBolt docs),
#   3. attach the "feat" node features,
#   4. copy the finished mini-batch to `device`.
datapipe = (
    gb.ItemSampler(train_set, batch_size=1024, shuffle=True)
    .sample_neighbor(graph, [4, 4])
    .fetch_feature(feature, node_feature_keys=["feat"])
    .copy_to(device)
)
train_dataloader = gb.DataLoader(datapipe, num_workers=0)

Iterate over the data loader

In [7]:
# Pull a single mini-batch from the training loader and print it to inspect its structure.
data = next(iter(train_dataloader))
print(data)

MiniBatch(seeds=tensor([124023,  46807,  54874,  ..., 145854, 125684,  49347]),
          sampled_subgraphs=[SampledSubgraphImpl(sampled_csc=CSCFormatBase(indptr=tensor([    0,     4,     8,  ..., 14676, 14680, 14684], dtype=torch.int32),
                                                                         indices=tensor([   0, 1024, 3871,  ..., 7848, 3870, 4319], dtype=torch.int32),
                                                           ),
                                               original_row_node_ids=tensor([124023,  46807,  54874,  ..., 162909, 129411,  65153]),
                                               original_edge_ids=tensor([2439621, 2273367,  853857,  ...,  541096, 2316641, 2139748]),
                                               original_column_node_ids=tensor([124023,  46807,  54874,  ...,  58554,   6358,   1043]),
                            ),
                            SampledSubgraphImpl(sampled_csc=CSCFormatBase(indptr=tensor([   0,    4,    8,  ...,

Node IDs from MFGs

In [8]:
# The mini-batch exposes its sampled subgraphs as indexable blocks (MFGs).
# Read the IDs of the source nodes of the first block.
mfgs = data.blocks
first_block = mfgs[0]
input_nodes = first_block.srcdata[dgl.NID]
print(f"Input nodes {input_nodes }. ")

Input nodes tensor([124023,  46807,  54874,  ..., 162909, 129411,  65153]). 


Defining the model

In [9]:
from dgl.nn import SAGEConv

class myModel(nn.Module):
    """Two stacked mean-aggregator SAGEConv layers with a ReLU in between.

    Args:
        in_feat: Size of each input node feature vector.
        h_feat: Size of the hidden representation produced by the first layer.
        num_classes: Size of the output produced by the second layer.
    """

    def __init__(self, in_feat, h_feat, num_classes):
        super().__init__()
        self.h_feat = h_feat
        # Layers are created in the same order as they are applied in forward().
        self.conv1 = SAGEConv(in_feat, h_feat, aggregator_type="mean")
        self.conv2 = SAGEConv(h_feat, num_classes, aggregator_type="mean")

    def forward(self, mfgs, x):
        """Apply the first layer to ``mfgs[0]`` and the second to ``mfgs[1]``.

        Args:
            mfgs: Indexable collection of blocks; elements 0 and 1 are used.
            x: Input features passed to the first layer together with ``mfgs[0]``.

        Returns:
            The output of the second SAGEConv layer (no activation is applied to it).
        """
        hidden = F.relu(self.conv1(mfgs[0], x))
        return self.conv2(mfgs[1], hidden)
# Input width is the first entry of the size reported by the feature store
# for the "feat" node feature; the hidden width is fixed at 16.
in_size = feature.size("node", None, "feat")[0]
model = myModel(in_feat=in_size, h_feat=16, num_classes=num_classes)

Train the model

In [10]:
# Adam over all model parameters; no hyperparameters are overridden.
optimizer = torch.optim.Adam(params=model.parameters())

Define the validation loader

In [11]:
# Validation pipeline. It mirrors the training pipeline (same batch size,
# same [4, 4] neighbour-sampling setting, same "feat" feature, same device),
# but the items are NOT shuffled: evaluation gains nothing from a random
# order, and a fixed order keeps batch composition stable between runs.
datapipe = gb.ItemSampler(valid_set, batch_size=1024, shuffle=False)
datapipe = datapipe.sample_neighbor(graph, [4, 4])
datapipe = datapipe.fetch_feature(feature, node_feature_keys=["feat"])
datapipe = datapipe.copy_to(device)
valid_dataloader = gb.DataLoader(datapipe, num_workers=0)

In [16]:
import tqdm

# Train for up to 100 epochs, validating after each one.
# NOTE: the `break` at the bottom of the validation section leaves the epoch
# loop after the first epoch, so only one epoch actually runs; remove it to
# train for the full schedule.
for epoch in range(100):
    # ---- Training pass -------------------------------------------------
    model.train()
    with tqdm.tqdm(train_dataloader) as tq:
        for step, data in enumerate(tq):
            x = data.node_features["feat"]
            labels = data.labels

            predictions = model(data.blocks, x)

            loss = F.cross_entropy(predictions, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Per-batch training accuracy, shown on the progress bar only.
            accuracy = sklearn.metrics.accuracy_score(
                labels.cpu().numpy(),
                predictions.argmax(1).detach().cpu().numpy(),
            )
            tq.set_postfix(
                {"loss": "%.03f" % loss.item(), "acc": "%.03f" % accuracy},
                refresh=False,
            )

    # ---- Validation pass -----------------------------------------------
    model.eval()
    predictions = []
    labels = []
    # Gradients are not needed for evaluation; disabling autograd avoids
    # building (and immediately discarding) a graph for every batch.
    with tqdm.tqdm(valid_dataloader) as tq, torch.no_grad():
        for step, data in enumerate(tq):
            x = data.node_features["feat"]
            labels.append(data.labels.cpu().numpy())
            predictions.append(model(data.blocks, x).argmax(1).cpu().numpy())
        # Accuracy is computed once over all validation batches.
        predictions = np.concatenate(predictions)
        labels = np.concatenate(labels)
        accuracy = sklearn.metrics.accuracy_score(labels, predictions)
        print("Epoch {} Validation Accuracy {}".format(epoch, accuracy))

        # Note that this tutorial does not train the whole model to the end.

        break

89it [00:03, 24.58it/s, loss=1.817, acc=0.511]
30it [00:00, 39.63it/s]

Epoch 0 Validation Accuracy 0.5140776536125373



