### Assignment 1. Let's program the GT (Graph Transformer) model on Graph-level classification on ogbg-molhiv dataset (https://ogb.stanford.edu/docs/graphprop/#ogbg-mol) with the Readout function: Max pooling.

In [1]:
!pip install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1
!pip install dgl -f https://data.dgl.ai/wheels/torch-2.4/cu124/repo.html
!pip install chardet
!pip install ogb

Collecting torch==2.4.1
  Downloading torch-2.4.1-cp39-cp39-manylinux1_x86_64.whl.metadata (26 kB)
Collecting torchvision==0.19.1
  Downloading torchvision-0.19.1-cp39-cp39-manylinux1_x86_64.whl.metadata (6.0 kB)
Collecting torchaudio==2.4.1
  Downloading torchaudio-2.4.1-cp39-cp39-manylinux1_x86_64.whl.metadata (6.4 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch==2.4.1)
  Using cached nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-nccl-cu12==2.20.5 (from torch==2.4.1)
  Downloading nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_x86_64.whl.metadata (1.8 kB)
Collecting triton==3.0.0 (from torch==2.4.1)
  Downloading triton-3.0.0-1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.3 kB)
Downloading torch-2.4.1-cp39-cp39-manylinux1_x86_64.whl (797.1 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m797.1/797.1 MB[0m [31m61.7 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:0

In [2]:
!pip install pyyaml
!pip install pydantic



In [3]:
import os
import torch
os.environ['TORCH'] = torch.__version__
os.environ['DGLBACKEND'] = "pytorch"

try:
    import dgl
    installed = True
except ImportError:
    installed = False
print("DGL installed!" if installed else "Failed to install DGL!")

  from .autonotebook import tqdm as notebook_tqdm


DGL installed!


In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class SparseMHA(nn.Module):

    def __init__(self, hidden_size=80, num_heads=8):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.head_dim = hidden_size // num_heads
        self.scaling = self.head_dim**-0.5

        self.q_proj = nn.Linear(hidden_size, hidden_size)
        self.k_proj = nn.Linear(hidden_size, hidden_size)
        self.v_proj = nn.Linear(hidden_size, hidden_size)
        self.out_proj = nn.Linear(hidden_size, hidden_size)

    def forward(self, A, h):
        N = len(h)

        q = self.q_proj(h).reshape(N, self.head_dim, self.num_heads)
        q *= self.scaling

        k = self.k_proj(h).reshape(N, self.head_dim, self.num_heads)

        v = self.v_proj(h).reshape(N, self.head_dim, self.num_heads)

        attn = torch.bmm(q.transpose(1, 2), k)
        attn = F.softmax(attn, dim=-1)

        out = torch.bmm(attn, v.transpose(1, 2))

        out = out.transpose(1, 2).reshape(N, -1)
        return self.out_proj(out)

In [5]:
class GTLayer(nn.Module):

    def __init__(self, hidden_size=80, num_heads=8):
        super().__init__()
        self.MHA = SparseMHA(hidden_size=hidden_size, num_heads=num_heads)
        self.batchnorm1 = nn.BatchNorm1d(hidden_size)
        self.batchnorm2 = nn.BatchNorm1d(hidden_size)
        self.FFN1 = nn.Linear(hidden_size, hidden_size * 2)
        self.FFN2 = nn.Linear(hidden_size * 2, hidden_size)

    def forward(self, A, h):
        h1 = h
        h = self.MHA(A, h)
        h = self.batchnorm1(h + h1)

        h2 = h
        h = self.FFN2(F.relu(self.FFN1(h)))
        h = h2 + h

        return self.batchnorm2(h)

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from dgl.nn import MaxPooling
from tqdm import tqdm
import random

class GTModel(nn.Module):
    def __init__(self, out_size, hidden_size=80, pos_enc_size=2, num_layers=8, num_heads=8):
        super().__init__()
        self.atom_encoder = AtomEncoder(hidden_size)
        self.pos_linear = nn.Linear(pos_enc_size, hidden_size)
        self.layers = nn.ModuleList(
            [GTLayer(hidden_size, num_heads) for _ in range(num_layers)]
        )
        self.pooler = MaxPooling()
        self.predictor = nn.Sequential(
            nn.Linear(hidden_size, hidden_size // 2),
            nn.ReLU(),
            nn.Linear(hidden_size // 2, hidden_size // 4),
            nn.ReLU(),
            nn.Linear(hidden_size // 4, out_size),
        )

    def forward(self, g, X, pos_enc):
        indices = torch.stack(g.edges())
        N = g.num_nodes()
        A = torch.zeros((N, N), device=X.device)
        A[indices[0], indices[1]] = 1

        h = self.atom_encoder(X) + self.pos_linear(pos_enc)
        for layer in self.layers:
            h = layer(A, h)
        h = self.pooler(g, h)

        return self.predictor(h)

In [7]:
@torch.no_grad()
def evaluate(model, dataloader, evaluator, device):
    model.eval()
    y_true = []
    y_pred = []
    for batched_g, labels in dataloader:
        batched_g, labels = batched_g.to(device), labels.to(device)
        y_hat = model(batched_g, batched_g.ndata["feat"], batched_g.ndata["PE"])
        y_true.append(labels.view(y_hat.shape).detach().cpu())
        y_pred.append(y_hat.detach().cpu())
    y_true = torch.cat(y_true, dim=0).numpy()
    y_pred = torch.cat(y_pred, dim=0).numpy()
    input_dict = {"y_true": y_true, "y_pred": y_pred}
    return evaluator.eval(input_dict)["rocauc"]

In [8]:
from dgl.dataloading import GraphDataLoader
from ogb.graphproppred import collate_dgl

def train(model, dataset, evaluator, device):
    train_dataloader = GraphDataLoader(
        dataset[dataset.train_idx],
        batch_size=256,
        shuffle=True,
        collate_fn=collate_dgl,
    )
    valid_dataloader = GraphDataLoader(
        dataset[dataset.val_idx], batch_size=256, collate_fn=collate_dgl
    )
    test_dataloader = GraphDataLoader(
        dataset[dataset.test_idx], batch_size=256, collate_fn=collate_dgl
    )
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    num_epochs = 5
    scheduler = optim.lr_scheduler.StepLR(
        optimizer, step_size=num_epochs, gamma=0.5
    )
    loss_fcn = nn.BCEWithLogitsLoss()

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        for batched_g, labels in train_dataloader:
            batched_g, labels = batched_g.to(device), labels.to(device)
            logits = model(
                batched_g, batched_g.ndata["feat"], batched_g.ndata["PE"]
            )
            loss = loss_fcn(logits, labels.float())
            total_loss += loss.item()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        scheduler.step()
        avg_loss = total_loss / len(train_dataloader)
        val_metric = evaluate(model, valid_dataloader, evaluator, device)
        test_metric = evaluate(model, test_dataloader, evaluator, device)
        print(
            f"Epoch: {epoch:03d}, Loss: {avg_loss:.4f}, "
            f"Val: {val_metric:.4f}, Test: {test_metric:.4f}"
        )



In [9]:
from dgl.data import AsGraphPredDataset
from ogb.graphproppred import DglGraphPropPredDataset, Evaluator
from tqdm import tqdm
from ogb.graphproppred.mol_encoder import AtomEncoder

dev = torch.device("cuda:0")

pos_enc_size = 8
dataset = AsGraphPredDataset(
    DglGraphPropPredDataset("ogbg-molhiv", "./data/OGB")
)
evaluator = Evaluator("ogbg-molhiv")

import random
random.seed(42)
train_size = len(dataset.train_idx)
val_size = len(dataset.val_idx)
test_size = len(dataset.test_idx)
dataset.train_idx = dataset.train_idx[
    torch.LongTensor(random.sample(range(train_size), 2000))
]
dataset.val_idx = dataset.val_idx[
    torch.LongTensor(random.sample(range(val_size), 1000))
]
dataset.test_idx = dataset.test_idx[
    torch.LongTensor(random.sample(range(test_size), 1000))
]

indices = torch.cat([dataset.train_idx, dataset.val_idx, dataset.test_idx])
for idx in tqdm(indices, desc="Computing Laplacian PE"):
    g, _ = dataset[idx]
    g.ndata["PE"] = dgl.laplacian_pe(g, k=pos_enc_size, padding=True)

out_size = dataset.num_tasks
model = GTModel(out_size=out_size, pos_enc_size=pos_enc_size).to(dev)

train(model, dataset, evaluator, dev)

Downloading http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/hiv.zip


Downloaded 0.00 GB: 100%|██████████| 3/3 [00:01<00:00,  1.80it/s]


Extracting ./data/OGB/hiv.zip
Loading necessary files...
This might take a while.
Processing graphs...


100%|██████████| 41127/41127 [00:00<00:00, 183243.83it/s]


Converting graphs into DGL objects...


100%|██████████| 41127/41127 [00:04<00:00, 8585.95it/s] 


Saving...


Computing Laplacian PE: 100%|██████████| 4000/4000 [00:06<00:00, 604.46it/s]


Epoch: 000, Loss: 0.5285, Val: 0.5115, Test: 0.3787
Epoch: 001, Loss: 0.2620, Val: 0.4814, Test: 0.3806
Epoch: 002, Loss: 0.1704, Val: 0.4946, Test: 0.4143
Epoch: 003, Loss: 0.1602, Val: 0.5638, Test: 0.5340
Epoch: 004, Loss: 0.1583, Val: 0.6403, Test: 0.5358


### Assignment 2. Let's program the GT (Graph Transformer) model on Graph-level classification on ogbg-molhiv dataset (https://ogb.stanford.edu/docs/graphprop/#ogbg-mol) with the Readout function: average pooling.

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
from dgl.nn import AvgPooling
from tqdm import tqdm
import random

class GTModel(nn.Module):
    def __init__(self, out_size, hidden_size=80, pos_enc_size=2, num_layers=8, num_heads=8):
        super().__init__()
        self.atom_encoder = AtomEncoder(hidden_size)
        self.pos_linear = nn.Linear(pos_enc_size, hidden_size)
        self.layers = nn.ModuleList(
            [GTLayer(hidden_size, num_heads) for _ in range(num_layers)]
        )
        self.pooler = AvgPooling()
        self.predictor = nn.Sequential(
            nn.Linear(hidden_size, hidden_size // 2),
            nn.ReLU(),
            nn.Linear(hidden_size // 2, hidden_size // 4),
            nn.ReLU(),
            nn.Linear(hidden_size // 4, out_size),
        )

    def forward(self, g, X, pos_enc):
        indices = torch.stack(g.edges())
        N = g.num_nodes()
        A = torch.zeros((N, N), device=X.device)
        A[indices[0], indices[1]] = 1

        h = self.atom_encoder(X) + self.pos_linear(pos_enc)
        for layer in self.layers:
            h = layer(A, h)
        h = self.pooler(g, h)

        return self.predictor(h)

In [11]:
@torch.no_grad()
def evaluate(model, dataloader, evaluator, device):
    model.eval()
    y_true = []
    y_pred = []
    for batched_g, labels in dataloader:
        batched_g, labels = batched_g.to(device), labels.to(device)
        y_hat = model(batched_g, batched_g.ndata["feat"], batched_g.ndata["PE"])
        y_true.append(labels.view(y_hat.shape).detach().cpu())
        y_pred.append(y_hat.detach().cpu())
    y_true = torch.cat(y_true, dim=0).numpy()
    y_pred = torch.cat(y_pred, dim=0).numpy()
    input_dict = {"y_true": y_true, "y_pred": y_pred}
    return evaluator.eval(input_dict)["rocauc"]

In [12]:
from dgl.dataloading import GraphDataLoader
from ogb.graphproppred import collate_dgl

def train(model, dataset, evaluator, device):
    train_dataloader = GraphDataLoader(
        dataset[dataset.train_idx],
        batch_size=256,
        shuffle=True,
        collate_fn=collate_dgl,
    )
    valid_dataloader = GraphDataLoader(
        dataset[dataset.val_idx], batch_size=256, collate_fn=collate_dgl
    )
    test_dataloader = GraphDataLoader(
        dataset[dataset.test_idx], batch_size=256, collate_fn=collate_dgl
    )
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    num_epochs = 5
    scheduler = optim.lr_scheduler.StepLR(
        optimizer, step_size=num_epochs, gamma=0.5
    )
    loss_fcn = nn.BCEWithLogitsLoss()

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        for batched_g, labels in train_dataloader:
            batched_g, labels = batched_g.to(device), labels.to(device)
            logits = model(
                batched_g, batched_g.ndata["feat"], batched_g.ndata["PE"]
            )
            loss = loss_fcn(logits, labels.float())
            total_loss += loss.item()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        scheduler.step()
        avg_loss = total_loss / len(train_dataloader)
        val_metric = evaluate(model, valid_dataloader, evaluator, device)
        test_metric = evaluate(model, test_dataloader, evaluator, device)
        print(
            f"Epoch: {epoch:03d}, Loss: {avg_loss:.4f}, "
            f"Val: {val_metric:.4f}, Test: {test_metric:.4f}"
        )

In [13]:
from dgl.data import AsGraphPredDataset
from ogb.graphproppred import DglGraphPropPredDataset, Evaluator
from tqdm import tqdm
from ogb.graphproppred.mol_encoder import AtomEncoder

dev = torch.device("cuda:0")

pos_enc_size = 8
dataset = AsGraphPredDataset(
    DglGraphPropPredDataset("ogbg-molhiv", "./data/OGB")
)
evaluator = Evaluator("ogbg-molhiv")

import random
random.seed(42)
train_size = len(dataset.train_idx)
val_size = len(dataset.val_idx)
test_size = len(dataset.test_idx)
dataset.train_idx = dataset.train_idx[
    torch.LongTensor(random.sample(range(train_size), 2000))
]
dataset.val_idx = dataset.val_idx[
    torch.LongTensor(random.sample(range(val_size), 1000))
]
dataset.test_idx = dataset.test_idx[
    torch.LongTensor(random.sample(range(test_size), 1000))
]

indices = torch.cat([dataset.train_idx, dataset.val_idx, dataset.test_idx])
for idx in tqdm(indices, desc="Computing Laplacian PE"):
    g, _ = dataset[idx]
    g.ndata["PE"] = dgl.laplacian_pe(g, k=pos_enc_size, padding=True)

out_size = dataset.num_tasks
model = GTModel(out_size=out_size, pos_enc_size=pos_enc_size).to(dev)

train(model, dataset, evaluator, dev)

Computing Laplacian PE: 100%|██████████| 4000/4000 [00:06<00:00, 606.59it/s]


Epoch: 000, Loss: 0.6017, Val: 0.4258, Test: 0.4385
Epoch: 001, Loss: 0.4915, Val: 0.5116, Test: 0.4836
Epoch: 002, Loss: 0.3812, Val: 0.5715, Test: 0.5165
Epoch: 003, Loss: 0.2832, Val: 0.6053, Test: 0.5512
Epoch: 004, Loss: 0.2185, Val: 0.6643, Test: 0.6440
