In [1]:
# dependency issue solved from: https://www.kaggle.com/code/nguyendacthienngan/gnn-explainer 
!pip install torch==2.4.1 --index-url https://download.pytorch.org/whl/cu124 -q
!pip install  dgl -f https://data.dgl.ai/wheels/torch-2.4/cu124/repo.html -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m797.1/797.1 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.7/24.7 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.4/883.4 kB[0m [31m45.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m80.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.0/363.0 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━

In [2]:
import os
os.environ["DGLBACKEND"] = "pytorch"

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.data import Subset

import dgl
from dgl.nn import GraphConv
from dgl.data import CoraGraphDataset, CiteseerGraphDataset, PubmedGraphDataset, KarateClubDataset, TUDataset
from dgl.dataloading import GraphDataLoader

from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
import numpy as np
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [4]:
# basic GCN class
class GCN(nn.Module):
    def __init__(self, in_dim, hidden_dim, out_dim):
        super(GCN, self).__init__()
        self.conv1 = GraphConv(in_dim, hidden_dim)
        self.conv2 = GraphConv(hidden_dim, out_dim)
    
    def forward(self, g, features):
        h = self.conv1(g, features)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h

In [5]:
# global params
NUM_EPOCH = 250
LEARNING_RATE = 0.01

In [6]:
# node prediction part

datasets = [
    ('Karate Club', KarateClubDataset()),
    ('Cora', CoraGraphDataset()),
    ('Citeseer', CiteseerGraphDataset()),
    ('Pubmed', PubmedGraphDataset()),
]

print("=" * 50)

for name, dataset in datasets:
    g = dataset[0].to(device)

    # some datasets don't specify features
    if 'feat' not in g.ndata:
        features = torch.eye(g.num_nodes()).to(device)
    else:
        features = g.ndata['feat']
    
    labels = g.ndata['label']
    train_mask = g.ndata.get('train_mask', torch.ones(g.num_nodes(), dtype=torch.bool))
    test_mask = g.ndata.get('test_mask', torch.ones(g.num_nodes(), dtype=torch.bool))

    model = GCN(in_dim=features.shape[1], 
                hidden_dim=16, 
                out_dim=dataset.num_classes).to(device)

    """
    Although some of our datasets might be very sparse, we can't use SparseAdam here.
    Because GCN and similar conv operations produce dense gradients, no matter how 
    sparse the adjacency matrix might be, e.g., most gradient values are non-zero.
    """
    optimizer = Adam(model.parameters(), lr=LEARNING_RATE)

    for epoch in range(NUM_EPOCH):
        model.train()
        logits = model(g, features)
        loss = F.cross_entropy(logits[train_mask], labels[train_mask])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    model.eval()
    with torch.no_grad():
        logits = model(g, features)
        preds = logits.argmax(1)
        acc = accuracy_score(labels[test_mask].cpu(), preds[test_mask].cpu())
        
        print(f"{name} --> {acc * 100:.2f} %")

Downloading /root/.dgl/cora_v2.zip from https://data.dgl.ai/dataset/cora_v2.zip...


/root/.dgl/cora_v2.zip:   0%|          | 0.00/132k [00:00<?, ?B/s]

Extracting file to /root/.dgl/cora_v2_d697a464
Finished data loading and preprocessing.
  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done saving data into cached files.
Downloading /root/.dgl/citeseer.zip from https://data.dgl.ai/dataset/citeseer.zip...


/root/.dgl/citeseer.zip:   0%|          | 0.00/239k [00:00<?, ?B/s]

Extracting file to /root/.dgl/citeseer_d6836239
Finished data loading and preprocessing.
  NumNodes: 3327
  NumEdges: 9228
  NumFeats: 3703
  NumClasses: 6
  NumTrainingSamples: 120
  NumValidationSamples: 500
  NumTestSamples: 1000
Done saving data into cached files.
Downloading /root/.dgl/pubmed.zip from https://data.dgl.ai/dataset/pubmed.zip...


/root/.dgl/pubmed.zip:   0%|          | 0.00/4.93M [00:00<?, ?B/s]

Extracting file to /root/.dgl/pubmed_35464cad
Finished data loading and preprocessing.
  NumNodes: 19717
  NumEdges: 88651
  NumFeats: 500
  NumClasses: 3
  NumTrainingSamples: 60
  NumValidationSamples: 500
  NumTestSamples: 1000
Done saving data into cached files.
Karate Club --> 100.00 %
Cora --> 76.00 %
Citeseer --> 60.30 %
Pubmed --> 76.70 %


In [7]:
# link prediction part

"""
A separate GCN class for link prediction
Since we don't need the GCN to output a specific "prediction" dimension,
we don't need a separate reduced output dimension size either. 
"""
class GCNForLink(nn.Module):
    def __init__(self, in_dim, hidden_dim):
        super(GCNForLink, self).__init__()
        self.conv1 = GraphConv(in_dim, hidden_dim)
        self.conv2 = GraphConv(hidden_dim, hidden_dim)
    
    def forward(self, g, features):
        h = self.conv1(g, features)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h

for name, dataset in datasets:
    g = dataset[0].to(device)

     # some datasets don't specify features
    if 'feat' not in g.ndata:
        features = torch.eye(g.num_nodes()).to(device)
    else:
        features = g.ndata['feat']
    
    u, v = g.edges()
    eids = np.arange(g.num_edges())
    eids = np.random.permutation(eids)
    test_size = int(len(eids) * 0.1)
    
    test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]] 
    train_g = dgl.remove_edges(g, eids[:test_size])
    train_u, train_v = train_g.edges()
    train_g = dgl.add_self_loop(train_g)
    train_g = train_g.to(device)
    
    model = GCNForLink(in_dim=features.shape[1], hidden_dim=16).to(device)
    
    optimizer = Adam(model.parameters(), lr=LEARNING_RATE)

    for epoch in range(NUM_EPOCH):
        h = model(train_g, features)
        
        num_neg = len(train_u)
        neg_u = torch.randint(0, g.num_nodes(), (num_neg,)).to(device)
        neg_v = torch.randint(0, g.num_nodes(), (num_neg,)).to(device)
        
        pos_score = (h[train_u] * h[train_v]).sum(1)
        neg_score = (h[neg_u] * h[neg_v]).sum(1)
        
        scores = torch.cat([pos_score, neg_score])
        labels = torch.cat([torch.ones(len(pos_score)), torch.zeros(len(neg_score))]).to(device)
        loss = F.binary_cross_entropy_with_logits(scores, labels)
        
        optimizer.zero_grad();
        loss.backward();
        optimizer.step()

    h_test = model(train_g, features)

    test_neg_u = torch.randint(0, g.num_nodes(), (test_size,)).to(device)
    test_neg_v = torch.randint(0, g.num_nodes(), (test_size,)).to(device)

    pos_score = (h_test[test_pos_u] * h_test[test_pos_v]).sum(1).detach().cpu().numpy()
    neg_score = (h_test[test_neg_u] * h_test[test_neg_v]).sum(1).detach().cpu().numpy()
    
    truth = np.concatenate([np.ones(test_size), np.zeros(test_size)])
    preds = np.concatenate([pos_score, neg_score])
    auc = roc_auc_score(truth, preds)

    print(f"{name} --> {auc:.4f}")

Karate Club --> 0.8933
Cora --> 0.9384
Citeseer --> 0.9222
Pubmed --> 0.9525


In [8]:
# A separate GCN class for graph classification
class GCNForGraph(nn.Module):
    def __init__(self, in_dim, hidden_dim, out_dim):
        super(GCNForGraph, self).__init__()
        self.conv1 = GraphConv(in_dim, hidden_dim, allow_zero_in_degree=True)
        self.conv2 = GraphConv(hidden_dim, hidden_dim, allow_zero_in_degree=True)
        self.classify = nn.Linear(hidden_dim, out_dim)

    def forward(self, g, features):
        h = self.conv1(g, features)
        h = F.relu(h)
        h = self.conv2(g, h)
        h = F.relu(h)
        
        g.ndata['h'] = h
        hg = dgl.mean_nodes(g, 'h')
        
        return self.classify(hg)

dataset_names_graph = ['KKI', 'Peking_1', 'MUTAG', 'Tox21_p53_testing']

for name in dataset_names_graph:
    ds = TUDataset(name)
    print("=" * 50)
    
    if 'feat' not in ds[0][0].ndata:
        for i in range(len(ds)):
            g, _ = ds[i]
            g.ndata['feat'] = g.in_degrees().float().unsqueeze(1)
    
    sample_g, _ = ds[0]
    in_dim = sample_g.ndata['feat'].shape[1]
    num_classes = len(set([ds[i][1].item() for i in range(len(ds))]))
    
    indices = np.arange(len(ds))
    train_idx, test_idx = train_test_split(indices, test_size=0.1, random_state=42)
    
    train_loader = GraphDataLoader(Subset(ds, train_idx), batch_size=32, shuffle=True)
    test_loader = GraphDataLoader(Subset(ds, test_idx), batch_size=32, shuffle=False)
    
    model = GCNForGraph(in_dim=in_dim, 
                        hidden_dim=256, 
                        out_dim=num_classes).to(device)
    
    optimizer = Adam(model.parameters(), lr=LEARNING_RATE)
    
    for epoch in range(NUM_EPOCH):
        model.train()
        for batched_g, labels in train_loader:
            batched_g, labels = batched_g.to(device), labels.to(device)

            labels = labels.view(-1)
            logits = model(batched_g, batched_g.ndata['feat'])
            loss = F.cross_entropy(logits, labels)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    
    model.eval()
    correct = 0
    total = 0
    
    with torch.no_grad():
        for batched_g, labels in test_loader:
            batched_g, labels = batched_g.to(device), labels.to(device)

            labels = labels.view(-1)
            logits = model(batched_g, batched_g.ndata['feat'])
            preds = logits.argmax(dim=1)
            
            correct += (preds == labels).sum().item()
            total += len(labels)
    
    acc = (correct / total) * 100
    print(f"{name} --> {acc:.2f} %")
    print("=" * 50)

Downloading /root/.dgl/KKI.zip from https://www.chrsmrrs.com/graphkerneldatasets/KKI.zip...


/root/.dgl/KKI.zip:   0%|          | 0.00/24.5k [00:00<?, ?B/s]

Extracting file to /root/.dgl/KKI_907c94fc
KKI --> 44.44 %
Downloading /root/.dgl/Peking_1.zip from https://www.chrsmrrs.com/graphkerneldatasets/Peking_1.zip...


/root/.dgl/Peking_1.zip:   0%|          | 0.00/37.1k [00:00<?, ?B/s]

Extracting file to /root/.dgl/Peking_1_e682f90f
Peking_1 --> 33.33 %
Downloading /root/.dgl/MUTAG.zip from https://www.chrsmrrs.com/graphkerneldatasets/MUTAG.zip...


/root/.dgl/MUTAG.zip:   0%|          | 0.00/24.6k [00:00<?, ?B/s]

Extracting file to /root/.dgl/MUTAG_47395044
MUTAG --> 100.00 %
Downloading /root/.dgl/Tox21_p53_testing.zip from https://www.chrsmrrs.com/graphkerneldatasets/Tox21_p53_testing.zip...


/root/.dgl/Tox21_p53_testing.zip:   0%|          | 0.00/42.0k [00:00<?, ?B/s]

Extracting file to /root/.dgl/Tox21_p53_testing_8385ab2e
Tox21_p53_testing --> 81.48 %
