# Introduction

The source code: https://github.com/dglai/WSDM21-Hands-on-Tutorial/blob/main/L2_large_link_prediction.ipynb

This replica is simple, yet it clearly shows how unsupervised learning works in DGL.

# Loading the Dataset

In [73]:
import dgl
import torch
from dgl.data import AsNodePredDataset
import numpy as np 

# Device on which the sampled mini-batches and the model are placed.
device = 'cpu'

In [2]:
def load_cora(data_path='../graph_dgl/cora_csv/', split_ratio=(0.5, 0.2, 0.3)):
    """Load the Cora graph from a DGL CSV dataset directory.

    Parameters
    ----------
    data_path : str
        Directory handed to ``dgl.data.CSVDataset``. Defaults to the
        location this notebook originally hard-coded.
    split_ratio : tuple of float
        ``(train, val, test)`` fractions forwarded to ``AsNodePredDataset``.

    Returns
    -------
    tuple
        ``(g, num_classes)`` where ``g`` is the first graph of the dataset
        with its node data renamed ``feat`` -> ``features`` and
        ``label`` -> ``labels``, and ``num_classes`` is the class count
        reported by the dataset wrapper.
    """
    csv_dataset = dgl.data.CSVDataset(data_path)
    data = AsNodePredDataset(csv_dataset, split_ratio=split_ratio)
    g = data[0]
    # Rename the node fields to the keys the rest of the notebook reads.
    g.ndata["features"] = g.ndata.pop("feat")
    g.ndata["labels"] = g.ndata.pop("label")
    return g, data.num_classes

In [7]:
# Load the Cora graph together with the number of label classes.
raw_g, n_classes = load_cora()

Done loading data from cached files.


In [8]:
# Add, for every edge (u, v) of the raw graph, the reverse edge (v, u).
# NOTE(review): if the CSV already lists both directions this doubles every edge — confirm.
g = dgl.add_reverse_edges(raw_g)

In [17]:
# Pull the per-node targets and inputs out of the graph once so that later
# cells can refer to them by name.
node_labels = g.ndata['labels']
node_features = g.ndata['features']
# Feature width is the size of the second axis of the feature matrix.
num_features = node_features.size(1)
# Assumes class ids are 0-based, so the count is the largest id plus one.
num_classes = int(node_labels.max()) + 1
print('Number of classes: {:d}'.format(num_classes))

Number of classes: 7


In [18]:
# Turn the boolean split masks into 1-D tensors of node ids.
train_nid = g.ndata['train_mask'].nonzero(as_tuple=True)[0]
val_nid = g.ndata['val_mask'].nonzero(as_tuple=True)[0]
# Test nodes are the ones that belong to neither the train nor the validation split.
test_nid = (~g.ndata['train_mask'] & ~g.ndata['val_mask']).nonzero(as_tuple=True)[0]

In [23]:
# Sizes of the train / validation / test node-id sets.
train_nid.shape, val_nid.shape, test_nid.shape

(torch.Size([1354]), torch.Size([541]), torch.Size([813]))

# Defining Neighbor Sampler and Data Loader in DGL

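DGL provides `dgl.dataloading.EdgeDataLoader` to iterate over edges for edge classification or link prediction tasks. (Since DGL 0.8 this class is deprecated in favour of `dgl.dataloading.DataLoader` combined with `dgl.dataloading.as_edge_prediction_sampler`, which yields the same kind of mini-batches.)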

To perform link prediction, you need to specify a negative sampler. DGL provides builtin negative samplers such as `dgl.dataloading.negative_sampler.Uniform`. 

Here this tutorial uniformly **draws $N$ negative examples per positive example**.

In [44]:
# Uniform negative sampler: 3 negative edges are drawn per positive edge (N = 3).
negative_sampler = dgl.dataloading.negative_sampler.Uniform(3)

In [45]:
# Neighbor sampler for a 2-layer GNN: a fan-out of 4 sampled neighbors on each layer.
sampler = dgl.dataloading.MultiLayerNeighborSampler([4, 4])
# `EdgeDataLoader` is deprecated since DGL 0.8 and dropped in later releases.
# The supported equivalent is the generic `DataLoader` fed with the neighbor
# sampler wrapped by `as_edge_prediction_sampler`; every batch is still the
# tuple (input_nodes, pos_graph, neg_graph, mfgs).  `sampler` itself stays the
# plain neighbor sampler so it can be reused elsewhere.
train_dataloader = dgl.dataloading.DataLoader(
    g,
    torch.arange(g.num_edges()),        # the edges to iterate over
    dgl.dataloading.as_edge_prediction_sampler(
        sampler,                        # neighbor sampler
        negative_sampler=negative_sampler,
    ),
    device=device,
    # --- the following arguments are inherited from PyTorch DataLoader ---
    batch_size=128,
    shuffle=True,
    drop_last=False,  # keep the last, possibly smaller, batch
    num_workers=0,
)

In [49]:
# Pull a single mini-batch to see what the loader yields.
input_nodes, pos_graph, neg_graph, mfgs = next(iter(train_dataloader))
print('Number of input nodes:', len(input_nodes))
print('Positive graph # nodes:', pos_graph.num_nodes(), '# edges:', pos_graph.num_edges())
# 128 positive edges per batch, 3 negatives each: 128 * 3 = 384 negative edges
print('Negative graph # nodes:', neg_graph.num_nodes(), '# edges:', neg_graph.num_edges())
print(mfgs)

Number of input nodes: 2021
Positive graph # nodes: 574 # edges: 128
Negative graph # nodes: 574 # edges: 384
[Block(num_src_nodes=2021, num_dst_nodes=1411, num_edges=4545), Block(num_src_nodes=1411, num_dst_nodes=574, num_edges=1785)]


In [50]:
# Graph holding the positive edges of this mini-batch.
pos_graph

Graph(num_nodes=574, num_edges=128,
      ndata_schemes={'train_mask': Scheme(shape=(), dtype=torch.bool), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'test_mask': Scheme(shape=(), dtype=torch.bool), 'features': Scheme(shape=(1433,), dtype=torch.int64), 'labels': Scheme(shape=(), dtype=torch.int64), '_ID': Scheme(shape=(), dtype=torch.int64)}
      edata_schemes={'_ID': Scheme(shape=(), dtype=torch.int64)})

In [51]:
# Graph holding the sampled negative edges of this mini-batch.
neg_graph

Graph(num_nodes=574, num_edges=384,
      ndata_schemes={'_ID': Scheme(shape=(), dtype=torch.int64)}
      edata_schemes={})

In [56]:
# Number of destination nodes of the first MFG.
mfgs[0].num_dst_nodes()

1411

In [64]:
# Node features are available on the MFG's source nodes; show the shape of the first 100 rows.
mfgs[0].srcdata['features'][:100].shape

torch.Size([100, 1433])

In [65]:
# Destination-node count of the first MFG (repeats the check from the earlier cell).
mfgs[0].num_dst_nodes()

1411

# Defining Model for Node Representation (Emb)

In [53]:
import torch.nn as nn 
import torch.nn.functional as F
from dgl.nn import SAGEConv

To illustrate the following code:

Recall that the MFGs yielded by the NodeDataLoader and EdgeDataLoader have the property that the ***first few source nodes are always identical to the destination nodes***:

[DGL tutorial notebook on message passing and MFGs (L4_message_passing.ipynb)](https://github.com/dglai/WSDM21-Hands-on-Tutorial/blob/cd3e28bae0395868e30e8c0fc002bf0b77739585/L4_message_passing.ipynb)

In [72]:
# Check the MFG layout: the leading num_dst_nodes source-node IDs must equal the destination-node IDs.
print(torch.equal(mfgs[0].srcdata[dgl.NID][: mfgs[0].num_dst_nodes()], mfgs[0].dstdata[dgl.NID]))

True


In [74]:
class Model(nn.Module):
    """Two-layer GraphSAGE encoder (mean aggregator) operating on MFGs.

    Parameters
    ----------
    in_feats : int
        Width of the input node features.
    h_feats : int
        Width of the hidden layer and of the returned node representations.
    """

    def __init__(self, in_feats, h_feats):
        super().__init__()
        self.in_feats = in_feats
        self.h_feats = h_feats
        self.conv1 = SAGEConv(in_feats, h_feats, aggregator_type='mean')
        self.conv2 = SAGEConv(h_feats, h_feats, aggregator_type='mean')

    def forward(self, mfgs, x):
        """Compute representations for the destination nodes of ``mfgs[1]``.

        ``mfgs`` is indexed at 0 and 1 (one MFG per layer) and ``x`` holds the
        features of the source nodes of ``mfgs[0]``.  Destination-node inputs
        are taken as the leading rows of the source-node inputs, which relies
        on the MFG property that the first source nodes are the destination
        nodes.

        NOTE(review): the graph schema printed earlier in this notebook lists
        'features' as int64; SAGEConv presumably needs floating-point input —
        confirm the caller casts ``x`` before calling.
        """
        first = mfgs[0]
        hidden = F.relu(self.conv1(first, (x, x[: first.num_dst_nodes()])))
        second = mfgs[1]
        return self.conv2(second, (hidden, hidden[: second.num_dst_nodes()]))

In [75]:
# Instantiate the encoder with 256 hidden/output units and move it to `device`.
model = Model(num_features, 256).to(device)

# Defining the Score Predictor for Edges