# Assignment Day 6

## Team members:
- Samuel Nebgen s6sanebg@uni-bonn.de
- Muhammad Humza Arain s27marai@uni-bonn.de
- Julian Meigen s82jmeig@uni-bonn.de

## 15.09.2025

Contributions were made by all team members in around the same amount, either based on discussions or coding. Since we had multiple problems on our individual machines, Julians code was used in the end.

# Task 2 - Load ogbn-proteins Data

In [1]:
import torch
from ogb.nodeproppred import PygNodePropPredDataset

# Monkey-patch torch.load to always allow full pickle
torch_load_old = torch.load
def torch_load_wrapper(*args, **kwargs):
    kwargs["weights_only"] = False
    return torch_load_old(*args, **kwargs)

torch.load = torch_load_wrapper

dataset = PygNodePropPredDataset(name="ogbn-proteins")

  Referenced from: <7882E224-C4F5-325F-B895-2D7293A98B46> /Users/julianmeigen/miniconda3/envs/mlhandson/lib/python3.12/site-packages/libpyg.so
  Expected in:     <77030126-DDEC-3A87-BE1E-EE3D08216101> /Users/julianmeigen/miniconda3/envs/mlhandson/lib/libtorch_cpu.dylib
  Referenced from: <7F6096B0-4F9D-3574-B2E0-E04DD08A8586> /Users/julianmeigen/miniconda3/envs/mlhandson/lib/python3.12/site-packages/torch_scatter/_scatter_cpu.so
  Expected in:     <77030126-DDEC-3A87-BE1E-EE3D08216101> /Users/julianmeigen/miniconda3/envs/mlhandson/lib/libtorch_cpu.dylib
  Referenced from: <897C854E-82AB-3DDD-8D24-8899B225A7EF> /Users/julianmeigen/miniconda3/envs/mlhandson/lib/python3.12/site-packages/torch_spline_conv/_basis_cpu.so
  Expected in:     <77030126-DDEC-3A87-BE1E-EE3D08216101> /Users/julianmeigen/miniconda3/envs/mlhandson/lib/libtorch_cpu.dylib
  Referenced from: <B6C7D170-6D58-3806-A004-364F6E565BE5> /Users/julianmeigen/miniconda3/envs/mlhandson/lib/python3.12/site-packages/torch_sparse/_s

In [3]:
data = dataset[0]
print(f"In the ogbn-proteins dataset, there are {data.num_nodes} nodes, {data.num_edges} edges.")

In the ogbn-proteins dataset, there are 132534 nodes, 79122504 edges.


# Task 3

## b) Sample 1000 edges randomly from the graph

In [5]:
import networkx as nx
import plotly.graph_objects as go

# fix seed for reproducibility
torch.manual_seed(42)

sample_size = 1000

num_edges = data.edge_index.size(1)
perm = torch.randperm(num_edges)

# select 1000 random edges and attributes
edge_index_sample = data.edge_index[:, perm[:sample_size]]
edge_attr_sample = data.edge_attr[perm[:sample_size]]
edge_index_sample = data.edge_index[:, perm[:sample_size]] # since (2, num_edges)
edge_attr_sample = data.edge_attr[perm[:sample_size]]  # since (num_edges, num_edge_features)

## c) Convert to networkx

In [6]:
import networkx as nx
import plotly.graph_objects as go

G = nx.Graph()

for i in range(sample_size):
    node_u = edge_index_sample[0, i].item()
    node_v = edge_index_sample[1, i].item()

    edge_features = edge_attr_sample[i]
    # Generate dictionary with numeric key and feature value
    features = {f"feature_{idx}": j.item() for idx, j in enumerate(edge_features)}

    G.add_edge(node_u, node_v, **features)

## d) Calculate the number of connected components.

In [9]:
num_components = nx.number_connected_components(G)
print("Number of connected components:", num_components)

Number of connected components: 966


## e) Identify the hub nodes and their degrees.

In [8]:
# Get degrees of Node, find max degree and filter nodes with max degree to get hub nodes
node_degree = dict(G.degree())
max_degree = max(node_degree.values())
hub_nodes = {i:j for i,j in node_degree.items() if j == max_degree}

In [14]:
print(f"There are {len(hub_nodes)} hub nodes with a degree of {max_degree}.")

There are 34 hub nodes with a degree of 2.


## f) Visualize with plotly

In [15]:
# Create positions using spring layout
pos = nx.spring_layout(G, seed=42)  # For reproducible layout

In [16]:
edge_x = []
edge_y = []
for edge in G.edges():
    # Use the pos dictionary to get the positions of the nodes
    x0, y0 = pos[edge[0]]
    x1, y1 = pos[edge[1]]
    edge_x.append(x0)
    edge_x.append(x1)
    edge_x.append(None)
    edge_y.append(y0)
    edge_y.append(y1)
    edge_y.append(None)

edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=0.5, color='#888'),
    hoverinfo='none',
    mode='lines')

node_x = []
node_y = []
for node in G.nodes():
    x, y = pos[node]
    node_x.append(x)
    node_y.append(y)

node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode='markers',
    hoverinfo='text',
    marker=dict(
        showscale=True,
        # colorscale options
        #'Greys' | 'YlGnBu' | 'Greens' | 'YlOrRd' | 'Bluered' | 'RdBu' |
        #'Reds' | 'Blues' | 'Picnic' | 'Rainbow' | 'Portland' | 'Jet' |
        #'Hot' | 'Blackbody' | 'Earth' | 'Electric' | 'Viridis' |
        colorscale='YlGnBu',
        reversescale=True,
        color=[],
        size=10,
        colorbar=dict(
            thickness=15,
            title=dict(
              text='Node Connections',
              side='right'
            ),
            xanchor='left',
        ),
        line_width=2))

# Color nodes by degree
node_adjacencies = []
node_text = []
for node, adjacencies in enumerate(G.adjacency()):
    node_adjacencies.append(len(adjacencies[1]))
    node_text.append('# of connections: '+str(len(adjacencies[1])))

node_trace.marker.color = node_adjacencies
node_trace.text = node_text


In [17]:
fig = go.Figure(data=[edge_trace, node_trace],
             layout=go.Layout(
                title=dict(
                    text="<br>Sample from ogbn-proteins",
                    font=dict(
                        size=16
                    )
                ),
                showlegend=False,
                hovermode='closest',
                margin=dict(b=20,l=5,r=5,t=40),
                annotations=[ dict(
                    text="",
                    showarrow=False,
                    xref="paper", yref="paper",
                    x=0.005, y=-0.002 ) ],
                xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                yaxis=dict(showgrid=False, zeroline=False, showticklabels=False))
                )
fig.show()

# Task 5

## a) Create the train, validation, and test datasets (using the available splits)

In [18]:
# Use the given splits from xgbn-protiens
split_idx = dataset.get_idx_split()

# get data
data = dataset[0]

# get train, val, test indices
train_idx = split_idx["train"]
val_idx = split_idx["valid"]
test_idx = split_idx["test"]

## b) Train a Node2vec model to learn the node embeddings with the dataset

In [20]:
# Initialize Node2Vec model

import torch
from torch_geometric.nn import Node2Vec

device = "cuda" if torch.cuda.is_available() else "cpu"

node2vec_model = Node2Vec(
    edge_index=data.edge_index,
    embedding_dim=112,      # smaller than 128
    walk_length=10,        # shorter walks
    context_size=5,        # smaller window
    walks_per_node=2,      # fewer walks per node
    num_negative_samples=1,
    p=1, q=1,
    sparse=True
).to(device)

In [21]:
# Train the model

EPOCHS = 10
optimizer = torch.optim.SparseAdam(node2vec_model.parameters(), lr=0.01)

# Training loop
def train():
    node2vec_model.train()
    total_loss = 0
    loader = node2vec_model.loader(batch_size=2000, shuffle=True, num_workers=0)  # random walks
    for pos_rw, neg_rw in loader:
        optimizer.zero_grad()
        loss = node2vec_model.loss(pos_rw.to(device), neg_rw.to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)


for epoch in range(1, EPOCHS):
    loss = train()
    print(f"Epoch {epoch}, Loss: {loss:.4f}")

Epoch 1, Loss: 8.9406
Epoch 2, Loss: 7.9974
Epoch 3, Loss: 7.0427
Epoch 4, Loss: 6.1625
Epoch 5, Loss: 5.3754
Epoch 6, Loss: 4.6595
Epoch 7, Loss: 4.0270
Epoch 8, Loss: 3.4635
Epoch 9, Loss: 2.9951


In [22]:
# Get embeddings
node2vec_model.eval()
z = node2vec_model().detach().numpy()
y = data.y.numpy()

## Train a LogisticRegression for the task of multi-label classification (node labeling) on your dataset.

In [23]:
# Use the train_idx and test_idx to create the train and test splits

train_z = z[train_idx]
test_z = z[test_idx]

train_y = y[train_idx]
test_y = y[test_idx]

In [24]:
# Train a LogisticRegression for the task of multi-label classification.

from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier

clf = MultiOutputClassifier(LogisticRegression(max_iter=1000, n_jobs=-1))
clf.fit(train_z, train_y)

0,1,2
,estimator,"LogisticRegre...00, n_jobs=-1)"
,n_jobs,

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


## d) Report your results with the metrics accuracy and AUROC using the sklearn package

In [25]:
# Validate the model on the test set

from sklearn.metrics import accuracy_score, roc_auc_score

y_pred = clf.predict(test_z)

accuracy = accuracy_score(test_y, y_pred)
roc_auc = roc_auc_score(test_y, y_pred, multi_class="ovr")

print(f"Accuracy: {accuracy:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")

Accuracy: 0.0528
ROC AUC: 0.5003


## Perform 5-fold cross-validation with according train, validation and test splits and report again AUROC and accuracy.

In [26]:
from sklearn.model_selection import KFold
import numpy as np
kf = KFold(n_splits=5, shuffle=True, random_state=42)

mean_accuracy = []
mean_roc_auc = []

for i, (train_idx, test_idx) in enumerate(kf.split(z)):
    train_z = z[train_idx]
    train_y = y[train_idx]

    test_z = z[test_idx]
    test_y = y[test_idx]

    clf = MultiOutputClassifier(LogisticRegression(max_iter=1000, n_jobs=-1))
    clf.fit(train_z, train_y)

    y_pred = clf.predict(test_z)

    accuracy = accuracy_score(test_y, y_pred)
    roc_auc = roc_auc_score(test_y, y_pred, multi_class="ovr")

    mean_accuracy.append(accuracy)
    mean_roc_auc.append(roc_auc)

mean_accuracy = np.mean(mean_accuracy)
mean_roc_auc = np.mean(mean_roc_auc)


In [27]:
print(f"Mean Accuracy: {mean_accuracy:.4f}")
print(f"Mean ROC AUC: {mean_roc_auc:.4f}")

Mean Accuracy: 0.2196
Mean ROC AUC: 0.5027
