# Introduction

The source code: https://github.com/dglai/WSDM21-Hands-on-Tutorial/blob/main/L2_large_link_prediction.ipynb

This replica is simple, but it clearly shows how unsupervised learning works in DGL.

# Loading the Dataset

In [73]:
import dgl
import torch
from dgl.data import AsNodePredDataset
import numpy as np 

# Device passed to the data loader and used for `.to(device)` on the model; CPU here.
device = 'cpu'

In [99]:
def load_cora(data_path='../graph_dgl/cora_csv/', split_ratio=(0.5, 0.2, 0.3)):
    """Load the Cora graph from a DGL CSV dataset and prepare it for training.

    Parameters
    ----------
    data_path : str, optional
        Directory handed to ``dgl.data.CSVDataset``. Defaults to the location
        this notebook originally hard-coded.
    split_ratio : tuple of float, optional
        Forwarded to ``AsNodePredDataset`` as ``split_ratio``
        (train, validation, test). Defaults to ``(0.5, 0.2, 0.3)``.

    Returns
    -------
    tuple
        ``(g, num_classes)``: the first graph of the dataset and the class
        count reported by the ``AsNodePredDataset`` wrapper. On ``g`` the node
        field ``"feat"`` is re-keyed to ``"features"`` (cast to float) and
        ``"label"`` is re-keyed to ``"labels"``.
    """
    raw_dataset = dgl.data.CSVDataset(data_path)
    dataset = AsNodePredDataset(raw_dataset, split_ratio=split_ratio)
    g = dataset[0]
    # Re-key the node fields to the names the rest of the notebook reads.
    g.ndata["features"] = g.ndata.pop("feat").float()
    g.ndata["labels"] = g.ndata.pop("label")
    return g, dataset.num_classes

In [100]:
# Load the graph; `n_classes` is the class count returned by load_cora().
raw_g, n_classes = load_cora()

Done loading data from cached files.


In [101]:
# Build the training graph by adding reverse edges to the loaded graph.
# NOTE(review): the number of edges (and of mini-batches) grows accordingly — confirm
# the CSV does not already store both directions.
g = dgl.add_reverse_edges(raw_g)

In [102]:
# Node-level tensors and the sizes derived from them.
node_features = g.ndata['features']
node_labels = g.ndata['labels']
# Input dimensionality = width of the feature matrix.
num_features = node_features.size(1)
# Labels are taken to be 0-based, so the class count is the largest label plus one.
num_classes = node_labels.max().item() + 1
print(f'Number of classes: {num_classes:d}')

Number of classes: 7


In [103]:
# Turn the boolean split masks into tensors of node IDs.
train_nid = g.ndata['train_mask'].nonzero(as_tuple=True)[0]
val_nid = g.ndata['val_mask'].nonzero(as_tuple=True)[0]
# Test nodes are those that are neither training nor validation nodes.
test_nid = (~g.ndata['train_mask'] & ~g.ndata['val_mask']).nonzero(as_tuple=True)[0]

In [104]:
# Sizes of the train / validation / test node-ID tensors.
train_nid.shape, val_nid.shape, test_nid.shape

(torch.Size([1354]), torch.Size([541]), torch.Size([813]))

# Defining Neighbor Sampler and Data Loader in DGL

DGL provides `dgl.dataloading.EdgeDataLoader` to iterate over edges for edge classification or link prediction tasks.

To perform link prediction, you need to specify a negative sampler. DGL provides builtin negative samplers such as `dgl.dataloading.negative_sampler.Uniform`. 

Here this tutorial uniformly **draws $N$ negative examples per positive example**.

In [105]:
# Uniform negative sampler: 3 negative examples are drawn per positive edge.
negative_sampler = dgl.dataloading.negative_sampler.Uniform(3)  # N = 3

In [106]:
# Neighbor sampler for a two-layer model: a fan-out of 4 at each layer.
sampler = dgl.dataloading.MultiLayerNeighborSampler([4, 4])

# NOTE(review): recent DGL releases deprecate EdgeDataLoader in favor of
# DataLoader + as_edge_prediction_sampler — confirm against the installed version.
train_dataloader = dgl.dataloading.EdgeDataLoader(
    g,
    torch.arange(g.num_edges()),        # iterate over every edge ID of g
    sampler,                            # neighbor sampler defined above
    device=device,
    negative_sampler=negative_sampler,
    # --- options forwarded to the underlying PyTorch DataLoader ---
    batch_size=128,
    shuffle=True,
    drop_last=False,                    # keep the final, smaller batch
    num_workers=0,
)



In [107]:
# Pull a single mini-batch from the loader and report its sizes.
input_nodes, pos_graph, neg_graph, mfgs = next(iter(train_dataloader))

print('Number of input nodes:', input_nodes.shape[0])
print('Positive graph # nodes:', pos_graph.num_nodes(), '# edges:', pos_graph.num_edges())
print('Negative graph # nodes:', neg_graph.num_nodes(), '# edges:', neg_graph.num_edges())
print(mfgs)

# Negative edges per batch: 128 positives * 3 negatives each = 384

Number of input nodes: 2002
Positive graph # nodes: 566 # edges: 128
Negative graph # nodes: 566 # edges: 384
[Block(num_src_nodes=2002, num_dst_nodes=1395, num_edges=4499), Block(num_src_nodes=1395, num_dst_nodes=566, num_edges=1762)]




In [108]:
# Show the positive-edge graph of the sampled mini-batch.
pos_graph

Graph(num_nodes=566, num_edges=128,
      ndata_schemes={'train_mask': Scheme(shape=(), dtype=torch.bool), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'test_mask': Scheme(shape=(), dtype=torch.bool), 'features': Scheme(shape=(1433,), dtype=torch.float32), 'labels': Scheme(shape=(), dtype=torch.int64), '_ID': Scheme(shape=(), dtype=torch.int64)}
      edata_schemes={'_ID': Scheme(shape=(), dtype=torch.int64)})

In [109]:
# Show the negative-edge graph of the sampled mini-batch.
neg_graph

Graph(num_nodes=566, num_edges=384,
      ndata_schemes={'_ID': Scheme(shape=(), dtype=torch.int64)}
      edata_schemes={})

In [110]:
# Number of destination nodes of the first MFG.
mfgs[0].num_dst_nodes()

1395

In [111]:
# Shape of the feature rows of the first 100 source nodes of the first MFG.
mfgs[0].srcdata['features'][:100].shape

torch.Size([100, 1433])

In [112]:
# NOTE(review): repeats the earlier `mfgs[0].num_dst_nodes()` cell; likely redundant.
mfgs[0].num_dst_nodes()

1395

# Defining Model for Node Representation (Emb)

In [113]:
import torch.nn as nn 
import torch.nn.functional as F
from dgl.nn import SAGEConv

To illustrate the following code:

Recall that the MFGs yielded by the NodeDataLoader and EdgeDataLoader have the property that the ***first few source nodes are always identical to the destination nodes***:

[LINK](https://github.com/dglai/WSDM21-Hands-on-Tutorial/blob/cd3e28bae0395868e30e8c0fc002bf0b77739585/L4_message_passing.ipynb)

In [114]:
# Check that the first `num_dst_nodes` source-node IDs of the first MFG are exactly
# its destination-node IDs; the model's `x[:num_dst_nodes]` slicing relies on this.
print(torch.equal(mfgs[0].srcdata[dgl.NID][: mfgs[0].num_dst_nodes()], mfgs[0].dstdata[dgl.NID]))

True


In [115]:
class Model(nn.Module):
    """Two-layer GraphSAGE encoder (mean aggregator) operating on a pair of MFGs."""

    def __init__(self, in_feats, h_feats):
        """Create the two SAGEConv layers.

        in_feats is the input feature size of the first layer; h_feats is the
        output size of both layers.
        """
        super().__init__()
        self.h_feats = h_feats
        self.in_feats = in_feats
        self.conv1 = SAGEConv(in_feats, h_feats, aggregator_type='mean')
        self.conv2 = SAGEConv(h_feats, h_feats, aggregator_type='mean')

    def forward(self, mfgs, x):
        """Run both layers and return the second layer's output.

        mfgs is indexable with (at least) two message-flow graphs; x holds the
        features of the source nodes of mfgs[0]. For each layer the destination
        features are the leading rows of the source features.
        """
        n_dst_first = mfgs[0].num_dst_nodes()
        hidden = F.relu(self.conv1(mfgs[0], (x, x[:n_dst_first])))
        n_dst_second = mfgs[1].num_dst_nodes()
        return self.conv2(mfgs[1], (hidden, hidden[:n_dst_second]))

# Defining the Score Predictor for Edges

After getting the node embeddings, we predict a score for each `real` and `non-existent` edge in the sampled minibatch.

In [116]:
import dgl.function as fn

class DotPredictor(nn.Module):
    """Scores each edge of a graph from the embeddings of its two endpoints."""

    def forward(self, g, h):
        """Return column 0 of the per-edge 'score' field computed by u_dot_v.

        g is the graph whose edges are scored; h holds one embedding row per
        node of g.
        """
        # Work inside a local scope so the temporary 'h' / 'score' fields are
        # written in a scoped context rather than directly on the caller's graph.
        with g.local_scope():
            g.ndata['h'] = h
            g.apply_edges(fn.u_dot_v('h', 'h', 'score'))
            edge_scores = g.edata['score']
            return edge_scores[:, 0]

In [118]:
# Build the encoder (hidden size 256) and the edge scorer, both moved to `device`.
model = Model(num_features, 256).to(device)
predictor = DotPredictor().to(device)
# One Adam optimizer over the parameters of both modules. DotPredictor declares no
# parameters of its own, so in effect only the encoder weights are optimized.
opt = torch.optim.Adam(list(model.parameters()) + list(predictor.parameters()))

# Defining Training Loop 

In [118]:
# NOTE(review): this cell repeats the previous cell verbatim (same execution count).
# Running it replaces `model`, `predictor` and `opt` with freshly constructed objects.
model = Model(num_features, 256).to(device)
predictor = DotPredictor().to(device)
opt = torch.optim.Adam(list(model.parameters()) + list(predictor.parameters()))

In [119]:
import tqdm
import sklearn.metrics

In [120]:
# NOTE(review): neither name is read by the training loop that follows — presumably
# placeholders for tracking/checkpointing the best model; confirm or remove.
best_accuracy = 0
best_model_path = '../../../../models/model.pt'

In [None]:
# Link-prediction training: real (positive) edges are labelled 1 and sampled
# non-existent (negative) edges are labelled 0; the loss is binary cross-entropy
# on the raw edge scores.
for epoch in range(100):
    with tqdm.tqdm(train_dataloader) as tq:
        # `step` is the batch index within the current epoch (currently unused).
        for step, (input_nodes, pos_graph, neg_graph, mfgs) in enumerate(tq):
            opt.zero_grad()

            # Input features are read from the source nodes of the first MFG.
            inputs = mfgs[0].srcdata['features']
            outputs = model(mfgs, inputs)

            # Score both edge sets with the same node embeddings.
            pos_score = predictor(pos_graph, outputs)
            neg_score = predictor(neg_graph, outputs)

            score = torch.cat([pos_score, neg_score])
            label = torch.cat([torch.ones_like(pos_score), torch.zeros_like(neg_score)])
            loss = F.binary_cross_entropy_with_logits(score, label)

            loss.backward()
            opt.step()

            # Show the current loss on the progress bar without forcing a redraw.
            tq.set_postfix(loss=f'{loss.item():.3f}', refresh=False)

100%|██████████| 85/85 [00:04<00:00, 20.41it/s, loss=0.651]
100%|██████████| 85/85 [00:03<00:00, 23.26it/s, loss=0.596]
100%|██████████| 85/85 [00:02<00:00, 33.84it/s, loss=0.595]
100%|██████████| 85/85 [00:02<00:00, 28.76it/s, loss=0.573]
100%|██████████| 85/85 [00:02<00:00, 32.35it/s, loss=0.571]
100%|██████████| 85/85 [00:02<00:00, 29.87it/s, loss=0.568]
100%|██████████| 85/85 [00:02<00:00, 28.41it/s, loss=0.567]
100%|██████████| 85/85 [00:02<00:00, 29.76it/s, loss=0.567]
100%|██████████| 85/85 [00:03<00:00, 26.62it/s, loss=0.567]
100%|██████████| 85/85 [00:02<00:00, 32.83it/s, loss=0.572]
100%|██████████| 85/85 [00:03<00:00, 25.84it/s, loss=0.542]
100%|██████████| 85/85 [00:03<00:00, 27.37it/s, loss=0.574]
100%|██████████| 85/85 [00:02<00:00, 28.85it/s, loss=0.548]
100%|██████████| 85/85 [00:02<00:00, 30.12it/s, loss=0.567]
100%|██████████| 85/85 [00:03<00:00, 24.17it/s, loss=0.571]
100%|██████████| 85/85 [00:02<00:00, 31.13it/s, loss=0.554]
100%|██████████| 85/85 [00:02<00:00, 31.

In [122]:
# Draw a fresh mini-batch for inspection; this overwrites the batch variables
# left behind by the training loop.
(input_nodes, pos_graph, neg_graph, mfgs) = next(iter(train_dataloader))

In [123]:
# Features of the source nodes of the first MFG, i.e. the model input for this batch.
inputs = mfgs[0].srcdata['features']

In [131]:
# Dtype and shape of the input feature tensor.
inputs.dtype, inputs.shape

(torch.float32, torch.Size([1998, 1433]))

In [127]:
# Forward pass on the inspection batch. No torch.no_grad() is used, so autograd
# still tracks this computation.
outputs = model(mfgs, inputs)

In [130]:
# Show the two MFGs (blocks) of the inspection batch.
mfgs

[Block(num_src_nodes=1998, num_dst_nodes=1367, num_edges=4353),
 Block(num_src_nodes=1367, num_dst_nodes=554, num_edges=1699)]

In [129]:
# Shape of the model output: one row per destination node of the last MFG,
# with the hidden size (256) as the number of columns.
outputs.shape

torch.Size([554, 256])