In [1]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
import torch
import numpy as np
os.environ['TORCH'] = torch.__version__
os.environ['DGLBACKEND'] = "pytorch"

# Install the CPU version. If you want to install CUDA version, please
# refer to https://www.dgl.ai/pages/start.html.
device = torch.device("cpu")
import dgl
import dgl.graphbolt as gb
import torch.nn.functional as F


Link Prediction

In [2]:
dataset = gb.BuiltinDataset("cora-seeds",root="../../../data/datasets").load()

The dataset is already preprocessed.


In [62]:
graph = dataset.graph
feature = dataset.feature
train_set = dataset.tasks[1].train_set
test_set = dataset.tasks[1].test_set
task_name = dataset.tasks[1].metadata["name"]
print(f"Task: {task_name}.")

Task: link_prediction.


In [63]:
from functools import partial
datapipe = gb.ItemSampler(train_set, batch_size=256, shuffle=True)
datapipe = datapipe.sample_uniform_negative(graph, 5)
datapipe = datapipe.sample_neighbor(graph, [5, 5, 5])
datapipe = datapipe.transform(partial(gb.exclude_seed_edges, include_reverse_edges=True))
datapipe = datapipe.fetch_feature(feature, node_feature_keys=["feat"])
datapipe = datapipe.copy_to(device)
train_dataloader = gb.DataLoader(datapipe)

In [77]:
data = next(iter(train_dataloader))
print(f"MiniBatch: {data}")
# data.node_pairs_with_labels

MiniBatch: MiniBatch(seeds=tensor([[2352, 1129],
                        [ 893, 2383],
                        [1303,  979],
                        ...,
                        [ 132,   20],
                        [ 132, 2046],
                        [ 132,   46]], dtype=torch.int32),
          sampled_subgraphs=[SampledSubgraphImpl(sampled_csc=CSCFormatBase(indptr=tensor([   0,    2,    6,  ..., 7663, 7664, 7666], dtype=torch.int32),
                                                                         indices=tensor([ 144,  931,  144,  ..., 2228, 2250, 2252], dtype=torch.int32),
                                                           ),
                                               original_row_node_ids=tensor([2352, 1129,  893,  ..., 2342,  594,  663], dtype=torch.int32),
                                               original_edge_ids=None,
                                               original_column_node_ids=tensor([2352, 1129,  893,  ..., 2337, 2421, 2661], dtype=torc

Defining Model for Node Representation

In [91]:
import dgl.nn as dglnn
import torch.nn as nn
import torch.nn.functional as F
class SAGE(nn.Module):
    def __init__(self, in_size, hidden_size):
        super().__init__()
        self.layers = nn.ModuleList()
        self.layers.append(dglnn.SAGEConv(in_size, hidden_size, "mean"))
        self.layers.append(dglnn.SAGEConv(hidden_size, hidden_size, "mean"))
        self.hidden_size = hidden_size
        self.predictor = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1),
        )

    def forward(self, blocks, x):
        hidden_x = x
        
        for layer_idx, (layer, block) in enumerate(zip(self.layers, blocks)):
            hidden_x = layer(block, hidden_x)
            is_last_layer = layer_idx == len(self.layers) - 1
            if not is_last_layer:
                hidden_x = F.relu(hidden_x)
        return hidden_x

initilize `model` and define `optimizer` 

In [92]:
in_size = feature.size("node",None,"feat")[0]
model = SAGE(in_size,128).to(device)
optimizer = torch.optim.Adam(model.parameters(),lr=0.001)

trainig loop 

In [93]:
from tqdm.auto import tqdm
for epoch in range(2):
    model.train()
    total_loss = 0
    for step, data in tqdm(enumerate(train_dataloader)):
        # Get node pairs with labels for loss calculation.
        compacted_seeds = data.compacted_seeds.T
        labels = data.labels
        node_feature = data.node_features["feat"]
        # Convert sampled subgraphs to DGL blocks.
        blocks = data.blocks

        # Get the embeddings of the input nodes.
        y = model(blocks, node_feature)
        logits = model.predictor(
            y[compacted_seeds[0]] * y[compacted_seeds[1]]
        ).squeeze()

        # Compute loss.
        loss = F.binary_cross_entropy_with_logits(logits, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch:03d} | Loss {total_loss / (step + 1):.3f}")
    

0it [00:00, ?it/s]

Epoch 000 | Loss 0.571


0it [00:00, ?it/s]

Epoch 001 | Loss 0.450


In [75]:
model.eval()

datapipe = gb.ItemSampler(test_set, batch_size=256, shuffle=False)
datapipe = datapipe.copy_to(device)
# Since we need to use all neghborhoods for evaluation, we set the fanout
# to -1.
datapipe = datapipe.sample_neighbor(graph, [-1, -1])
datapipe = datapipe.fetch_feature(feature, node_feature_keys=["feat"])
eval_dataloader = gb.DataLoader(datapipe, num_workers=0)

logits = []
labels = []
for step, data in tqdm(enumerate(eval_dataloader)):
    # Get node pairs with labels for loss calculation.
    compacted_seeds = data.compacted_seeds.T
    label = data.labels

    # The features of sampled nodes.
    x = data.node_features["feat"]

    # Forward.
    y = model(data.blocks, x)
    logit = (
        model.predictor(y[compacted_seeds[0]] * y[compacted_seeds[1]])
        .squeeze()
        .detach()
    )

    logits.append(logit)
    labels.append(label)

logits = torch.cat(logits, dim=0)
labels = torch.cat(labels, dim=0)


# Compute the AUROC score.
from sklearn.metrics import roc_auc_score

auc = roc_auc_score(labels.cpu(), logits.cpu())
print("Link Prediction AUC:", auc)

0it [00:00, ?it/s]

Link Prediction AUC: 0.7427011971878439


In [97]:
x= [1,2,3]
y=[7,7,7,7,7,7,7,7,7,7,77,7,7,7]
for index, (y1,x1) in enumerate(zip(x, y)):
    print(x1)
    print(x1*y1)
    print(index)

7
7
0
7
14
1
7
21
2
