<a href="https://colab.research.google.com/github/JulianMeigen/ML-handson/blob/main/notebooks/7.0-SNJMMH-Day7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Assignment Day 7

## Team members:
- Samuel Nebgen s6sanebg@uni-bonn.de
- Muhammad Humza Arain s27marai@uni-bonn.de
- Julian Meigen s82jmeig@uni-bonn.de

## 16.09.2025

All team members contributed roughly equally, through discussion as well as coding.

In [2]:
# Download the shared Google Drive folder that holds the pre-computed subgraph files.
!gdown --folder https://drive.google.com/drive/folders/1VESm-JaHEqPJmM23iLW1mEJsuI2mLBdx?usp=sharing

Retrieving folder contents
Processing file 1i6W9fI3sGEn6V9xBlt2MxlonZXOTLZjg load-subgraph_doc.ipynb
Processing file 1qZpQzFMRzuYQ0xoQJcUNe7CRBMS2mRmz subgraph_hop_1.pt
Processing file 1iz_FOBs9k7m9z3lDtRIXzRkd92tL_EK- subgraph.pt
Retrieving folder contents completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=1i6W9fI3sGEn6V9xBlt2MxlonZXOTLZjg
To: /content/ML-HandsOn/load-subgraph_doc.ipynb
100% 2.92k/2.92k [00:00<00:00, 7.56MB/s]
Downloading...
From: https://drive.google.com/uc?id=1qZpQzFMRzuYQ0xoQJcUNe7CRBMS2mRmz
To: /content/ML-HandsOn/subgraph_hop_1.pt
100% 10.5M/10.5M [00:00<00:00, 29.4MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=1iz_FOBs9k7m9z3lDtRIXzRkd92tL_EK-
From (redirected): https://drive.google.com/uc?id=1iz_FOBs9k7m9z3lDtRIXzRkd92tL_EK-&confirm=t&uuid=2d20b6f1-903c-4855-9bd7-0a9a044e9775
To: /content/ML-HandsOn/subgraph.pt
100% 1.64G/1.64G [00:18<00:00, 88.7MB/s]
Download c

In [None]:
!pip install torch_geometric

In [None]:
import torch
import torch_geometric
import numpy as np
import pandas as pd
import networkx as nx
import plotly
from torch_geometric.utils import to_networkx
from torch.nn import Embedding
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

# Task 1 Perform a node labeling task with a Graph ML model

## a) Load the graph dataset (ogbn-proteins) into pytorch-geometric

Instead of the full dataset, we directly load a pre-computed subgraph.

In [None]:
import torch
import torch_geometric

# Locations of the two downloaded subgraph files; only the small one is loaded below.
path_big = "/content/ML-HandsOn/subgraph.pt"
path_small = "/content/ML-HandsOn/subgraph_hop_1.pt"

# NOTE(review): weights_only=False unpickles arbitrary Python objects, so this
# must only be used on files from a trusted source.
dataset = torch.load(path_small, weights_only=False)

# The loaded object is indexed by key; the graph itself is stored under "graph".
data = dataset["graph"]

print(data)

In [None]:
# Convert the PyG graph into an undirected NetworkX graph and print its summary.
G = to_networkx(data, to_undirected=True)
print(G)

## b) Create a train, val, test split on the nodes or load the masks via pytorch-geometric.

### i. Create a subgraph if the computation is too expensive.

In [None]:
# A dedicated, seeded generator makes the random split identical on every run
# without touching the global RNG state.
SPLIT_SEED = 42
split_generator = torch.Generator().manual_seed(SPLIT_SEED)

num_nodes = data.num_nodes
perm = torch.randperm(num_nodes, generator=split_generator)

# 70 % of the nodes for training, 15 % for validation, the remainder for testing.
train_size = int(0.7 * num_nodes)
val_size = int(0.15 * num_nodes)

train_mask = torch.zeros(num_nodes, dtype=torch.bool)
val_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask = torch.zeros(num_nodes, dtype=torch.bool)

# Consecutive slices of one permutation, so the three masks are disjoint.
train_mask[perm[:train_size]] = True
val_mask[perm[train_size:train_size + val_size]] = True
test_mask[perm[train_size + val_size:]] = True

data.train_mask = train_mask
data.val_mask = val_mask
data.test_mask = test_mask

print(data.y)

In [None]:
# Number of labelled nodes in the train, validation and test split (in that order).
for split_mask in (data.train_mask, data.val_mask, data.test_mask):
    print(data.y[split_mask].shape[0])

## c) Initialize the graph with random node embeddings.

In [None]:
# Random node features: one Xavier-uniform initialised vector per node.
number_nodes = data.num_nodes
embedding_dim = 64
# Size the tensor from the values defined in this cell instead of relying on a
# `num_nodes` variable left over from an earlier cell.
x = torch.empty((number_nodes, embedding_dim))  # uninitialised tensor
torch.nn.init.xavier_uniform_(x)  # in-place Xavier uniform initialisation
data.x = x

In [None]:
# Node indices of each split, taken straight from the boolean masks. Deriving
# them from the unique nodes in `edge_index` breaks as soon as the graph has an
# isolated node, because the masks have one entry per node.
node_idx = torch.arange(data.num_nodes)
train_idx = node_idx[data.train_mask]
test_idx = node_idx[data.test_mask]
val_idx = node_idx[data.val_mask]

# Induced subgraphs. `Data.subgraph` relabels the nodes to 0..k-1, so a
# subgraph's `edge_index` must be used with that subgraph's own `x` and `y`.
train_subgraph = data.subgraph(train_idx)
test_subgraph = data.subgraph(test_idx)
val_subgraph = data.subgraph(val_idx)

## d) Define a graph convolutional neural network class with two layers using pytorch-geometric.

In [None]:
class GCN(torch.nn.Module):
    """Two-layer graph convolutional network for multi-label node classification.

    Args:
        num_nodes: Number of nodes in the graph. Not used by the layers; kept so
            existing call sites keep working.
        embedding_dim: Width of the input node features.
        hidden_dim: Width of the hidden representation.
        out_dim: Number of output labels per node.
        drop_out: Dropout probability applied between the two layers.
    """

    def __init__(self, num_nodes, embedding_dim, hidden_dim, out_dim, drop_out=0.5):
        super().__init__()
        self.conv1 = GCNConv(embedding_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, out_dim)
        self.dropout = torch.nn.Dropout(drop_out)

    def forward(self, x, edge_index):
        """Return raw per-label logits for every node.

        The logits are returned without a softmax: the labels are not mutually
        exclusive, and callers apply `sigmoid` / a with-logits loss themselves.
        """
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        # Use the module so that the configured probability is honoured and
        # dropout is switched off by `model.eval()`.
        x = self.dropout(x)
        return self.conv2(x, edge_index)

In [None]:
# Read the input and output width from the data instead of hard-coding them, so
# the model keeps matching the features and labels if either changes.
model_gcn = GCN(
    num_nodes=data.num_nodes,
    embedding_dim=data.x.shape[1],
    hidden_dim=128,
    out_dim=data.y.shape[1],
)

### i. Train your model on the train dataset using an optimizer and a loss function for a multilabel classification task for 100 epochs

In [None]:
# Optimizer and loss
optimizer = torch.optim.Adam(model_gcn.parameters(), lr=0.01, weight_decay=5e-4)
# Every node can carry several labels at once, so each label is scored as an
# independent binary problem (binary cross-entropy on the logits).
criterion = torch.nn.BCEWithLogitsLoss()

# Training loop
epochs = 100
model_gcn.train()
# The training subgraph has relabelled node ids, so its own features and labels
# are used. This also keeps validation/test labels out of the loss.
train_targets = train_subgraph.y.float()
for epoch in range(1, epochs + 1):
    optimizer.zero_grad()
    out = model_gcn(train_subgraph.x, train_subgraph.edge_index)  # forward pass
    loss = criterion(out, train_targets)  # multi-label BCE loss
    loss.backward()                       # backward pass
    optimizer.step()                      # update parameters

    if epoch % 10 == 0 or epoch == 1:
        print(f"Epoch {epoch:03d}, Loss: {loss.item():.4f}")

### ii. Test your model on the test set and evaluate it with accuracy, AUROC, precision, recall and F1 score.

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score

In [None]:
model_gcn.eval()
with torch.no_grad():
    # The test subgraph has relabelled node ids, so it is evaluated with its own
    # features; every row of the output is a test node.
    logits = model_gcn(test_subgraph.x, test_subgraph.edge_index)  # raw logits
    probs = torch.sigmoid(logits)                   # convert to probabilities
    preds = (probs > 0.5).int()                     # threshold at 0.5

y_true = test_subgraph.y.numpy()
y_pred = preds.numpy()
y_prob = probs.numpy()

# Accuracy (exact match per node)
accuracy = accuracy_score(y_true, y_pred)

# AUROC (per class, average='macro'). It is undefined for a label whose test
# values are all identical, so such labels are left out of the average.
scorable = y_true.min(axis=0) != y_true.max(axis=0)
if scorable.any():
    auroc = roc_auc_score(y_true[:, scorable], y_prob[:, scorable], average='macro')
else:
    auroc = float("nan")

# Precision, Recall, F1 (micro-averaged)
precision = precision_score(y_true, y_pred, average='micro', zero_division=0)
recall = recall_score(y_true, y_pred, average='micro', zero_division=0)
f1 = f1_score(y_true, y_pred, average='micro', zero_division=0)

print(f"Test Accuracy:  {accuracy:.4f}")
print(f"Test AUROC:     {auroc:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall:    {recall:.4f}")
print(f"Test F1 Score:  {f1:.4f}")

### Outer cross-validation with StratifiedKFold

In [None]:
from sklearn.model_selection import StratifiedKFold

In [None]:
# One entry per node; the folds are stratified by the species of each node.
nodes_idx = np.arange(data.num_nodes)
y = data.node_species.squeeze().cpu().numpy()

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

accuracys = []
aurocs = []
precisions = []
recalls = []
f1s = []
for train_idx, test_idx in skf.split(nodes_idx, y):
    # Induced subgraphs with relabelled node ids; each one carries its own x / y.
    train_subgraph = data.subgraph(torch.as_tensor(train_idx, dtype=torch.long))
    test_subgraph = data.subgraph(torch.as_tensor(test_idx, dtype=torch.long))

    # A fresh model per fold: reusing one model would carry weights that were
    # fitted on this fold's test nodes during an earlier fold.
    model_gcn = GCN(
        num_nodes=data.num_nodes,
        embedding_dim=data.x.shape[1],
        hidden_dim=128,
        out_dim=data.y.shape[1],
    )
    optimizer = torch.optim.Adam(model_gcn.parameters(), lr=0.01, weight_decay=5e-4)
    criterion = torch.nn.BCEWithLogitsLoss()  # multi-label BCE loss

    # Training loop (training nodes only)
    epochs = 100
    train_targets = train_subgraph.y.float()
    model_gcn.train()
    for epoch in range(1, epochs + 1):
        optimizer.zero_grad()
        out = model_gcn(train_subgraph.x, train_subgraph.edge_index)  # forward pass
        loss = criterion(out, train_targets)
        loss.backward()                       # backward pass
        optimizer.step()                      # update parameters

    # Evaluate the model on the held-out nodes of this fold
    model_gcn.eval()
    with torch.no_grad():
        logits = model_gcn(test_subgraph.x, test_subgraph.edge_index)  # raw logits
        probs = torch.sigmoid(logits)                   # convert to probabilities
        preds = (probs > 0.5).int()                     # threshold at 0.5

    y_true = test_subgraph.y.numpy()
    y_pred = preds.numpy()
    y_prob = probs.numpy()

    # Accuracy (exact match per node)
    accuracy = accuracy_score(y_true, y_pred)
    accuracys.append(accuracy)

    # AUROC (per class, average='macro'); labels with a single class in this
    # fold have no defined AUROC and are left out of the average.
    scorable = y_true.min(axis=0) != y_true.max(axis=0)
    if scorable.any():
        auroc = roc_auc_score(y_true[:, scorable], y_prob[:, scorable], average='macro')
    else:
        auroc = float("nan")
    aurocs.append(auroc)

    # Precision, Recall, F1 (micro-averaged)
    precision = precision_score(y_true, y_pred, average='micro', zero_division=0)
    precisions.append(precision)
    recall = recall_score(y_true, y_pred, average='micro', zero_division=0)
    recalls.append(recall)
    f1 = f1_score(y_true, y_pred, average='micro', zero_division=0)
    f1s.append(f1)
    print("-----")

In [None]:

# Mean of every metric over the cross-validation folds.
fold_metrics = (
    ("Test Accuracy:  ", accuracys),
    ("Test AUROC:     ", aurocs),
    ("Test Precision: ", precisions),
    ("Test Recall:    ", recalls),
    ("Test F1 Score:  ", f1s),
)
for label, values in fold_metrics:
    print(f"{label}{np.mean(values):.4f}")

## e) Set up a hyperparameter optimization pipeline with nested 5-fold cross-validation

### i. Familiarize yourself with the hyperparameter optimization package optuna (https://optuna.org/ )

In [None]:
import optuna

In [None]:
def train(model, data, train_subdata, epochs=100, lr=0.01, weight_decay=5e-4):
    """Fit `model` on the training subgraph with a multi-label BCE loss.

    Args:
        model: Module called as `model(x, edge_index)` that returns logits.
        data: Full graph. Not used for training; kept for call compatibility.
        train_subdata: Training subgraph providing `x`, `edge_index` and `y`.
        epochs: Number of full-batch optimisation steps.
        lr: Adam learning rate.
        weight_decay: Adam weight decay.

    Returns:
        The loss of the last step as a float, or None when `epochs` is 0.
    """
    # Optimise the model that was passed in, not a global one.
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    criterion = torch.nn.BCEWithLogitsLoss()
    targets = train_subdata.y.float()

    model.train()
    loss = None
    for _ in range(epochs):
        optimizer.zero_grad()
        # Features, edges and labels all come from the same subgraph so that
        # their node numbering agrees.
        out = model(train_subdata.x, train_subdata.edge_index)
        loss = criterion(out, targets)
        loss.backward()
        optimizer.step()

    return None if loss is None else loss.item()


In [None]:
def test(model, data, val_subdata):
    """Evaluate `model` on a held-out subgraph and return its metrics.

    Args:
        model: Module called as `model(x, edge_index)` that returns logits.
        data: Full graph. Not used for evaluation; kept for call compatibility.
        val_subdata: Evaluation subgraph providing `x`, `edge_index` and `y`.

    Returns:
        Dict with the keys "accuracy", "auroc", "precision", "recall", "f1"
        and "loss". AUROC and loss are also logged to wandb when a run is active.
    """
    model.eval()
    with torch.no_grad():
        logits = model(val_subdata.x, val_subdata.edge_index)   # raw logits
        loss = torch.nn.BCEWithLogitsLoss()(logits, val_subdata.y.float()).item()
        probs = torch.sigmoid(logits)                   # convert to probabilities
        preds = (probs > 0.5).int()                     # threshold at 0.5

    # Every row of the subgraph output is an evaluation node, so no mask is needed.
    y_true = val_subdata.y.cpu().numpy()
    y_pred = preds.cpu().numpy()
    y_prob = probs.cpu().numpy()

    # Accuracy (exact match per node)
    accuracy = accuracy_score(y_true, y_pred)

    # AUROC (per class, average='macro'); labels with a single class present
    # have no defined AUROC and are left out of the average.
    scorable = y_true.min(axis=0) != y_true.max(axis=0)
    if scorable.any():
        auroc = roc_auc_score(y_true[:, scorable], y_prob[:, scorable], average='macro')
    else:
        auroc = float("nan")

    # Precision, Recall, F1 (micro-averaged)
    precision = precision_score(y_true, y_pred, average='micro', zero_division=0)
    recall = recall_score(y_true, y_pred, average='micro', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='micro', zero_division=0)

    # Only log when a wandb run is active; wandb.log raises otherwise.
    if wandb.run is not None:
        wandb.log({
            "Test AUROC": auroc,
            "Test Loss": loss})

    return {
        "accuracy": accuracy,
        "auroc": auroc,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "loss": loss,
    }


In [None]:
class GCNoptimization:
    """Optuna search over GCN hyperparameters on a fixed train/validation split.

    Args:
        data: Full graph (used for the node count passed to the model).
        train_subdata: Training subgraph providing `edge_index` and `y`.
        val_subdata: Validation subgraph providing `edge_index` and `y`.
        study_name: Name of the Optuna study and of its SQLite file.
    """

    def __init__(self, data, train_subdata, val_subdata, study_name="GCN_optimization"):
        self.data = data
        self.train_subdata = train_subdata
        self.val_subdata = val_subdata

        self.study_name = study_name
        self.storage_name = "sqlite:///{}.db".format(self.study_name)
        # The objective returns an AUROC, so the study has to maximise it.
        self.study = optuna.create_study(
            study_name=self.study_name,
            storage=self.storage_name,
            load_if_exists=True,
            direction="maximize",
        )

    @staticmethod
    def _random_features(num_rows, embedding_dim):
        """Return a Xavier-uniform initialised (num_rows, embedding_dim) tensor."""
        features = torch.empty((num_rows, embedding_dim))
        torch.nn.init.xavier_uniform_(features)
        return features

    def objective(self, trial):
        """Train one GCN with the trial's hyperparameters; return validation AUROC."""
        # Define the hyperparameters to optimize
        dropout = trial.suggest_float("dropout", 0.0, 0.7)
        hidden_dim = trial.suggest_int("hidden_dim", 16, 256)
        embedding_dim = trial.suggest_int("embedding_dim", 16, 256)

        train_graph = self.train_subdata
        val_graph = self.val_subdata

        # The node features are random embeddings, so they are redrawn with the
        # width suggested for this trial; otherwise the first layer would not
        # match the feature matrix.
        x_train = self._random_features(train_graph.y.shape[0], embedding_dim)
        x_val = self._random_features(val_graph.y.shape[0], embedding_dim)

        # Create the GCN model with the suggested hyperparameters
        model = GCN(
            num_nodes=self.data.num_nodes,
            embedding_dim=embedding_dim,
            hidden_dim=hidden_dim,
            out_dim=train_graph.y.shape[1],
            drop_out=dropout,
        )

        # Optimise the freshly built model with a multi-label BCE loss.
        optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
        criterion = torch.nn.BCEWithLogitsLoss()

        # Training loop
        epochs = 100
        train_targets = train_graph.y.float()
        model.train()
        for _ in range(epochs):
            optimizer.zero_grad()
            out = model(x_train, train_graph.edge_index)  # forward pass
            loss = criterion(out, train_targets)
            loss.backward()                       # backward pass
            optimizer.step()                      # update parameters

        # Validate the model
        model.eval()
        with torch.no_grad():
            logits = model(x_val, val_graph.edge_index)     # raw logits
            val_loss = criterion(logits, val_graph.y.float()).item()
            probs = torch.sigmoid(logits)                   # convert to probabilities

        y_true = val_graph.y.cpu().numpy()
        y_prob = probs.cpu().numpy()

        # AUROC (per class, average='macro'); labels with a single class present
        # have no defined AUROC and are left out of the average.
        scorable = y_true.min(axis=0) != y_true.max(axis=0)
        if scorable.any():
            auroc = roc_auc_score(y_true[:, scorable], y_prob[:, scorable], average='macro')
        else:
            auroc = float("nan")

        # Only log when a wandb run is active; wandb.log raises otherwise.
        if wandb.run is not None:
            wandb.log({
                "Test AUROC": auroc,
                "Test Loss": val_loss})

        return auroc






### ii. Integrate the logging package mlflow (https://mlflow.org/) to log your metrics.

In [None]:
# Install Optuna (hyperparameter optimisation) into the kernel's environment.
%pip install optuna

In [None]:
# Install Weights & Biases (metric logging) quietly into the kernel's environment.
%pip install wandb -q

In [None]:
 #Ignore excessive warnings
import logging

# Raise the root logger's threshold so only errors are shown. Assigning
# `propagate` on the `logging` module itself has no effect (it is an attribute
# of Logger objects), so that assignment is dropped.
logging.getLogger().setLevel(logging.ERROR)

# WandB – Import the wandb library
import wandb

In [None]:
# WandB – Login to your wandb account so you can log all your metrics
!wandb login

### iii. Train and test your models and report the evaluation metrics with mean and std for the nested CV.

In [1]:
import torch
import optuna
import wandb
import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold

# Assuming you already have your GCN model class defined as `GCN`

class GCNOptimization:
    """Nested cross-validation of the GCN model with an Optuna inner search.

    The outer folds estimate generalisation; for every outer fold an Optuna
    study tunes dropout, hidden width and embedding width on inner folds of the
    outer training part.

    Args:
        data: Graph with `edge_index`, multi-label targets `y` of shape
            (nodes, labels) and `node_species` used to stratify the folds.
        study_name: Name of the persistent Optuna study and its SQLite file.
        seed: Seed for the fold shuffling, the Optuna sampler and torch.
        epochs: Number of full-batch training steps per fitted model.
    """

    METRIC_KEYS = ("accuracy", "auroc", "precision", "recall", "f1")

    def __init__(self, data, study_name="GCN_optimization", seed=42, epochs=100):
        self.data = data
        self.study_name = study_name
        self.seed = seed
        self.epochs = epochs
        self.storage_name = f"sqlite:///{self.study_name}.db"
        # Persistent study kept for callers that use it directly; run_nested_cv
        # creates its own in-memory study per outer fold.
        self.study = optuna.create_study(
            study_name=self.study_name,
            storage=self.storage_name,
            load_if_exists=True,
            direction="maximize"
        )

    def _random_embeddings(self, embedding_dim):
        """Return Xavier-uniform node features of shape (num_nodes, embedding_dim)."""
        x = torch.empty((self.data.num_nodes, embedding_dim), device=self.data.y.device)
        torch.nn.init.xavier_uniform_(x)
        return x

    def _induced(self, x, node_idx):
        """Return (features, edge_index, labels) of the subgraph induced by node_idx.

        `Data.subgraph` relabels the nodes, so the features are sliced with the
        same index to keep rows, edges and labels aligned.
        """
        subset = torch.as_tensor(node_idx, dtype=torch.long, device=self.data.y.device)
        sub = self.data.subgraph(subset)
        return x[subset], sub.edge_index, sub.y.float()

    def _fit(self, x, node_idx, embedding_dim, hidden_dim, dropout):
        """Train a new GCN on the subgraph induced by node_idx and return it."""
        model = GCN(
            num_nodes=self.data.num_nodes,
            embedding_dim=embedding_dim,
            hidden_dim=hidden_dim,
            out_dim=self.data.y.shape[1],
            drop_out=dropout
        ).to(self.data.y.device)
        optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
        criterion = torch.nn.BCEWithLogitsLoss()

        feats, edges, labels = self._induced(x, node_idx)
        model.train()
        for _ in range(self.epochs):
            optimizer.zero_grad()
            # Only the nodes of this subgraph enter the loss, so held-out labels
            # never influence the weights.
            loss = criterion(model(feats, edges), labels)
            loss.backward()
            optimizer.step()
        return model

    def _predict(self, model, x, node_idx):
        """Return (y_true, y_pred, y_prob) arrays for the subgraph induced by node_idx."""
        feats, edges, labels = self._induced(x, node_idx)
        model.eval()
        with torch.no_grad():
            probs = torch.sigmoid(model(feats, edges))
        y_true = labels.cpu().numpy().astype(int)
        y_prob = probs.cpu().numpy()
        y_pred = (y_prob > 0.5).astype(int)
        return y_true, y_pred, y_prob

    @staticmethod
    def _macro_auroc(y_true, y_prob):
        """Macro AUROC over the labels that have both classes present (NaN if none)."""
        scorable = y_true.min(axis=0) != y_true.max(axis=0)
        if not scorable.any():
            return float("nan")
        return roc_auc_score(y_true[:, scorable], y_prob[:, scorable], average='macro')

    def objective(self, trial, train_idx, val_idx):
        """Train with the trial's hyperparameters on train_idx; return AUROC on val_idx."""
        # Hyperparameters to optimize
        dropout = trial.suggest_float("dropout", 0.0, 0.7)
        hidden_dim = trial.suggest_int("hidden_dim", 16, 256)
        embedding_dim = trial.suggest_int("embedding_dim", 16, 256)

        # The embedding width is tuned, so the random node features are redrawn
        # with that width instead of reusing a fixed-width `data.x`.
        x = self._random_embeddings(embedding_dim)
        model = self._fit(x, train_idx, embedding_dim, hidden_dim, dropout)

        y_true, _, y_prob = self._predict(model, x, val_idx)
        return self._macro_auroc(y_true, y_prob)

    def run_nested_cv(self, n_splits_outer=5, n_splits_inner=3, n_trials=20):
        """Run the nested cross-validation and return one metrics dict per outer fold.

        Each dict has the keys "accuracy", "auroc", "precision", "recall", "f1"
        and "fold". The dicts are logged to wandb when a run is active, and the
        mean and standard deviation over the outer folds are printed at the end.
        """
        torch.manual_seed(self.seed)

        nodes_idx = np.arange(self.data.num_nodes)
        y = self.data.node_species.squeeze().cpu().numpy()

        outer_skf = StratifiedKFold(n_splits=n_splits_outer, shuffle=True, random_state=self.seed)

        outer_results = []

        for fold, (trainval_idx, test_idx) in enumerate(outer_skf.split(nodes_idx, y)):
            print(f"===== Outer Fold {fold+1}/{n_splits_outer} =====")

            # The inner folds are fixed per outer fold so that all trials are
            # compared on the same splits.
            inner_skf = StratifiedKFold(n_splits=n_splits_inner, shuffle=True, random_state=self.seed)
            inner_splits = [
                (trainval_idx[inner_train], trainval_idx[inner_val])
                for inner_train, inner_val in inner_skf.split(trainval_idx, y[trainval_idx])
            ]

            # Inner CV for hyperparameter tuning
            def optuna_objective(trial):
                inner_scores = [
                    self.objective(trial, inner_train_idx, inner_val_idx)
                    for inner_train_idx, inner_val_idx in inner_splits
                ]
                return float(np.mean(inner_scores))

            study = optuna.create_study(
                direction="maximize",
                sampler=optuna.samplers.TPESampler(seed=self.seed),
            )
            study.optimize(optuna_objective, n_trials=n_trials)

            best_params = study.best_params
            print(f"Best params for fold {fold+1}: {best_params}")

            # Retrain with best params on full trainval
            x = self._random_embeddings(best_params["embedding_dim"])
            model = self._fit(
                x,
                trainval_idx,
                best_params["embedding_dim"],
                best_params["hidden_dim"],
                best_params["dropout"],
            )

            # Test evaluation on the held-out nodes of the outer fold
            y_true, y_pred, y_prob = self._predict(model, x, test_idx)
            metrics = {
                "accuracy": accuracy_score(y_true, y_pred),
                "auroc": self._macro_auroc(y_true, y_prob),
                "precision": precision_score(y_true, y_pred, average='micro', zero_division=0),
                "recall": recall_score(y_true, y_pred, average='micro', zero_division=0),
                "f1": f1_score(y_true, y_pred, average='micro', zero_division=0),
                "fold": fold+1
            }
            outer_results.append(metrics)

            # Log to wandb only when a run is active; wandb.log raises otherwise.
            if wandb.run is not None:
                wandb.log(metrics)

        # Report mean and standard deviation over the outer folds.
        for key in self.METRIC_KEYS:
            values = np.array([result[key] for result in outer_results], dtype=float)
            print(f"{key}: {values.mean():.4f} ± {values.std():.4f}")

        return outer_results


In [None]:
# Usage:
# wandb.init(project="gcn-nested-cv")
# optimizer = GCNOptimization(data)
# results = optimizer.run_nested_cv(n_splits_outer=5, n_splits_inner=3, n_trials=20)
# wandb.finish()