In [1]:
# dependency issue solved from: https://www.kaggle.com/code/nguyendacthienngan/gnn-explainer 
!pip install torch==2.4.1 --index-url https://download.pytorch.org/whl/cu124 -q
!pip install  dgl -f https://data.dgl.ai/wheels/torch-2.4/cu124/repo.html -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m797.1/797.1 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.7/24.7 MB[0m [31m55.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.4/883.4 kB[0m [31m55.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m105.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.0/363.0 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━

In [2]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import SparseAdam
import dgl
from dgl.nn import DeepWalk
from dgl.data import CoraGraphDataset, CiteseerGraphDataset, PubmedGraphDataset, KarateClubDataset, TUDataset
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
import numpy as np
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

DGL backend not selected or invalid.  Assuming PyTorch for now.


Setting the default backend to "pytorch". You can change it in the ~/.dgl/config.json file or export the DGLBACKEND environment variable.  Valid options are: pytorch, mxnet, tensorflow (all lowercase)


In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [4]:
def deepwalk_train(g, emb_dim=256, walk_length=10, window_size=5, num_epochs=5):
    model = DeepWalk(
        g, 
        emb_dim=emb_dim, 
        walk_length=walk_length, 
        window_size=window_size)
    
    dataloader = DataLoader(
        torch.arange(g.num_nodes()), 
        batch_size=128,
        shuffle=True,
        collate_fn=model.sample 
    )
    
    optimizer = SparseAdam(model.parameters(), lr=0.01)
    
    model.train()
    for epoch in range(num_epochs):
        for batch_walk in dataloader:
            loss = model(batch_walk)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
    return model.node_embed.weight.detach().cpu().numpy()

In [5]:
# Node prediction part

datasets = [
    ('Karate Club', KarateClubDataset()),
    ('Cora', CoraGraphDataset()),
    ('Citeseer', CiteseerGraphDataset()),
    ('Pubmed', PubmedGraphDataset()),
]

print("=" * 50)

for name, dataset in datasets:
    g = dataset[0]
    X_embeddings = deepwalk_train(g)
    labels = g.ndata['label'].numpy()
    
    if 'train_mask' in g.ndata:
        train_mask = g.ndata['train_mask'].numpy()
        test_mask = g.ndata['test_mask'].numpy()
        X_train, y_train = X_embeddings[train_mask], labels[train_mask]
        X_test, y_test = X_embeddings[test_mask], labels[test_mask]
    else:
        X_train, X_test, y_train, y_test = train_test_split(X_embeddings, labels, test_size=0.1, random_state=42)
    
    clf = LogisticRegression(max_iter=1000, multi_class='ovr')
    clf.fit(X_train, y_train)
    
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    print(f"{name} --> {acc * 100:.2f} %")

Downloading /root/.dgl/cora_v2.zip from https://data.dgl.ai/dataset/cora_v2.zip...


/root/.dgl/cora_v2.zip:   0%|          | 0.00/132k [00:00<?, ?B/s]

Extracting file to /root/.dgl/cora_v2_d697a464
Finished data loading and preprocessing.
  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done saving data into cached files.
Downloading /root/.dgl/citeseer.zip from https://data.dgl.ai/dataset/citeseer.zip...


/root/.dgl/citeseer.zip:   0%|          | 0.00/239k [00:00<?, ?B/s]

Extracting file to /root/.dgl/citeseer_d6836239
Finished data loading and preprocessing.
  NumNodes: 3327
  NumEdges: 9228
  NumFeats: 3703
  NumClasses: 6
  NumTrainingSamples: 120
  NumValidationSamples: 500
  NumTestSamples: 1000
Done saving data into cached files.
Downloading /root/.dgl/pubmed.zip from https://data.dgl.ai/dataset/pubmed.zip...


/root/.dgl/pubmed.zip:   0%|          | 0.00/4.93M [00:00<?, ?B/s]

Extracting file to /root/.dgl/pubmed_35464cad
Finished data loading and preprocessing.
  NumNodes: 19717
  NumEdges: 88651
  NumFeats: 500
  NumClasses: 3
  NumTrainingSamples: 60
  NumValidationSamples: 500
  NumTestSamples: 1000
Done saving data into cached files.
Karate Club --> 25.00 %
Cora --> 68.60 %
Citeseer --> 49.60 %
Pubmed --> 73.50 %


In [6]:
# Link prediction part

def compute_similarity(u, v, emb):
        return (emb[u] * emb[v]).sum(axis=1)

for name, dataset in datasets:
    g = dataset[0]
    u, v = g.edges()
    eids = np.arange(g.num_edges())
    eids = np.random.permutation(eids)
    test_size = int(len(eids) * 0.1)
    
    test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
    train_g = dgl.remove_edges(g, eids[:test_size])

    
    """
    Removing even 10% of edges can make certain nodes isolated
    in very sparse graphs like Cora. Need to add a self-loop. 
    """
    train_g = dgl.add_self_loop(train_g)
    
    embeddings = deepwalk_train(train_g)
    
    test_neg_u = np.random.randint(0, g.num_nodes(), test_size)
    test_neg_v = np.random.randint(0, g.num_nodes(), test_size)

    pos_score = compute_similarity(test_pos_u.numpy(), test_pos_v.numpy(), embeddings)
    neg_score = compute_similarity(test_neg_u, test_neg_v, embeddings)
    
    preds = np.concatenate([pos_score, neg_score])
    truth = np.concatenate([np.ones(test_size), np.zeros(test_size)])
    auc = roc_auc_score(truth, preds)

    print(f"{name} --> {auc :.4f}")

Karate Club --> 0.8467
Cora --> 0.7524
Citeseer --> 0.7101
Pubmed --> 0.9153


In [7]:
# Graph classification part 

dataset_names_graph = ['KKI', 'Peking_1', 'MUTAG', 'Tox21_p53_testing']

for name in dataset_names_graph:
    ds = TUDataset(name)
    print("=" * 50)
    
    graphs, labels = zip(*[ds[i] for i in range(len(ds))])
    graphs = [dgl.add_self_loop(g) for g in graphs]
    bg = dgl.batch(graphs)
    labels = np.array([l.item() for l in labels])
    
    node_embeddings = deepwalk_train(bg) 
    
    bg.ndata['feat'] = torch.tensor(node_embeddings)
    
    graph_embeddings = dgl.mean_nodes(bg, 'feat').numpy()
    
    X_train, X_test, y_train, y_test = train_test_split(graph_embeddings, labels, test_size=0.1, random_state=42)
    
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, y_train)
    
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    
    print(f"{name} --> {acc * 100:.2f} %")

Downloading /root/.dgl/KKI.zip from https://www.chrsmrrs.com/graphkerneldatasets/KKI.zip...


/root/.dgl/KKI.zip:   0%|          | 0.00/24.5k [00:00<?, ?B/s]

Extracting file to /root/.dgl/KKI_907c94fc
KKI --> 55.56 %
Downloading /root/.dgl/Peking_1.zip from https://www.chrsmrrs.com/graphkerneldatasets/Peking_1.zip...


/root/.dgl/Peking_1.zip:   0%|          | 0.00/37.1k [00:00<?, ?B/s]

Extracting file to /root/.dgl/Peking_1_e682f90f
Peking_1 --> 55.56 %
Downloading /root/.dgl/MUTAG.zip from https://www.chrsmrrs.com/graphkerneldatasets/MUTAG.zip...


/root/.dgl/MUTAG.zip:   0%|          | 0.00/24.6k [00:00<?, ?B/s]

Extracting file to /root/.dgl/MUTAG_47395044
MUTAG --> 63.16 %
Downloading /root/.dgl/Tox21_p53_testing.zip from https://www.chrsmrrs.com/graphkerneldatasets/Tox21_p53_testing.zip...


/root/.dgl/Tox21_p53_testing.zip:   0%|          | 0.00/42.0k [00:00<?, ?B/s]

Extracting file to /root/.dgl/Tox21_p53_testing_8385ab2e
Tox21_p53_testing --> 81.48 %
