# Link Prediction

## Preparation

In [1]:
%env NX_CUGRAPH_AUTOCONFIG=True

env: NX_CUGRAPH_AUTOCONFIG=True


In [2]:
import locale
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding
!pip install igraph networkit pandas matplotlib seaborn networkx numpy scikit-learn tqdm ipywidgets



In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import networkx as nx
import pickle
import random
import igraph as ig
import networkit as nk

from itertools import combinations
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

### Dataset Preparation

In [4]:
pickle_file_path = 'dataset/amazon_copurchase_graph.pickle'
with open(pickle_file_path, 'rb') as f:
    G = pickle.load(f)

print(G)

DiGraph with 259102 nodes and 1207337 edges


### Split Dataset

In [5]:
nkG = nk.nxadapter.nx2nk(G)

edges = list(G.edges())
existing_edges = set(edges)

# Sampling dengan Networkit Graph (lebih cepat)
def sample_non_edges_nk(nkG, num_samples):
    non_edges = set()
    nodes = list(G.nodes())

    while len(non_edges) < num_samples:
        u, v = random.sample(nodes, 2)
        if not nkG.hasEdge(u, v):
            non_edges.add((u, v))

    return list(non_edges)

num_samples = len(edges)
non_edges = sample_non_edges_nk(nkG, num_samples)

train_edges, test_edges = train_test_split(edges, test_size=0.2, random_state=42)
train_non_edges = random.sample(non_edges, len(train_edges))
test_non_edges = random.sample(non_edges, len(test_edges))

G_train = nx.Graph()
G_train.add_nodes_from(G.nodes())
G_train.add_edges_from(train_edges)

print(f"Train Edges: {len(train_edges)}, Test Edges: {len(test_edges)}")
print(f"Train Non-Edges: {len(train_non_edges)}, Test Non-Edges: {len(test_non_edges)}")

Train Edges: 965869, Test Edges: 241468
Train Non-Edges: 965869, Test Non-Edges: 241468


In [6]:
# Metrik evaluasi ranking problem
def precision_at_k(y_true, y_scores, k):
    sorted_indices = np.argsort(y_scores)[::-1]
    top_k = sorted_indices[:k]
    return np.mean(y_true[top_k])

def recall_at_k(y_true, y_scores, k):
    sorted_indices = np.argsort(y_scores)[::-1]
    top_k = sorted_indices[:k]
    return np.sum(y_true[top_k]) / np.sum(y_true)

def mean_average_precision(y_true, y_scores):
    sorted_indices = np.argsort(y_scores)[::-1]
    relevant = np.cumsum(y_true[sorted_indices])
    precision_at_i = relevant / (np.arange(len(y_true)) + 1)
    return np.sum(precision_at_i * y_true[sorted_indices]) / np.sum(y_true)

def f1_beta_at_k(y_true, y_scores, k, beta=1):
    precision_k = precision_at_k(y_true, y_scores, k)
    recall_k = recall_at_k(y_true, y_scores, k)

    if precision_k + recall_k == 0:
        return 0.0

    beta_sq = beta ** 2
    return (1 + beta_sq) * (precision_k * recall_k) / ((beta_sq * precision_k) + recall_k)



## Graph Convolutional Network (GCN) Link Prediction

In [7]:
!pip install torch-geometric


Collecting torch-geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
   ---------------------------------------- 0.0/1.1 MB ? eta -:--:--
   --------- ------------------------------ 0.3/1.1 MB ? eta -:--:--
   ------------------ --------------------- 0.5/1.1 MB 1.4 MB/s eta 0:00:01
   ---------------------------------------- 1.1/1.1 MB 1.9 MB/s eta 0:00:00
Installing collected packages: torch-geometric
Successfully installed torch-geometric-2.6.1


Setelah beberapa kali *tuning* dalam parameternya, model yang dibangun adalah sebagai berikut.

1. **Lapisan Input**
    - **Learnable Node Embeddings**: `torch.nn.Embedding(num_nodes, 32)`
        - Setiap node memiliki embedding berdimensi 32.

2. **Lapisan Konvolusi Graf** (GCN)
    - **GCNConv(32 → 64)** → Konvolusi Awal
        - Aktivasi ReLU
        - Dropout (p=0.5)

    - **GCNConv(64 → 64)** → Konvolusi Tambahan
        - Aktivasi ReLU
        - Dropout (p=0.5)

    - **GCNConv(64 → 32)** → Konvolusi Akhir

3. **Edge Decoder** (Link Prediction)

    - **Dot Product Decoder**

        - Menghitung $$score(u,v)=z_u * z_v$$
        - Menghasilkan skor probabilitas untuk setiap hubungan (edge). (logit score)

4. **Loss Function**

    - **Binary Cross Entropy dengan Logits** (`BCEWithLogitsLoss`).
    - **Regularisasi L2** (`torch.norm(z, p=2)`).

5. **Optimisasi**
    - **Optimizer**: AdamW
        - Learning rate: 0.002
        - Weight decay: 5e-4
    - **Learning Rate Scheduler**: `StepLR(step_size=15, gamma=0.7)`
        - Mengurangi learning rate setiap 15 epoch.

In [9]:
import torch
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from torch_geometric.utils import negative_sampling
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score

node_to_idx = {node: idx for idx, node in enumerate(G.nodes())}
edge_index = torch.tensor(
    [[node_to_idx[u], node_to_idx[v]] for u, v in G.edges() if u in node_to_idx and v in node_to_idx],
    dtype=torch.long
).t().contiguous()

# Use learnable node embeddings with higher dimension
num_nodes = G.number_of_nodes()
embedding_dim = 64
x = torch.nn.Embedding(num_nodes, embedding_dim).weight

data = Data(x=x, edge_index=edge_index)

class GNN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, out_channels)
        self.dropout = torch.nn.Dropout(p=0.3)

    def encode(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.dropout(x)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = self.dropout(x)
        return self.conv3(x, edge_index)

    def decode(self, z, edge_label_index):
        return (z[edge_label_index[0]] * z[edge_label_index[1]]).sum(dim=-1)

# Initialize model and optimizer
model = GNN(in_channels=embedding_dim, hidden_channels=64, out_channels=32)  # Increased hidden size
optimizer = torch.optim.AdamW(model.parameters(), lr=0.005, weight_decay=5e-4)  # Adjusted learning rate & weight decay
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=15, gamma=0.7)  # More frequent LR decay

# Training loop
for epoch in range(1, 301):
    model.train()
    optimizer.zero_grad()

    z = model.encode(data.x, data.edge_index)
    pos_edge_index = data.edge_index
    neg_edge_index = negative_sampling(
        edge_index=pos_edge_index, num_nodes=data.num_nodes, num_neg_samples=2 * len(pos_edge_index[0])  # Increased negatives
    )

    pos_out = model.decode(z, pos_edge_index)
    neg_out = model.decode(z, neg_edge_index)

    out = torch.cat([pos_out, neg_out], dim=0)
    labels = torch.cat([torch.ones(pos_out.size(0)), torch.zeros(neg_out.size(0))], dim=0)

    loss = F.binary_cross_entropy_with_logits(out, labels)
    # loss += 0.001 * torch.norm(z, p=2)  # L2 regularization
    loss.backward()
    optimizer.step()
    scheduler.step()

    # if epoch % 10 == 0:
    #     print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')


Epoch: 001, Loss: 1.1968
Epoch: 002, Loss: 0.7391
Epoch: 003, Loss: 0.6603
Epoch: 004, Loss: 0.6620
Epoch: 005, Loss: 0.6776
Epoch: 006, Loss: 0.6865
Epoch: 007, Loss: 0.6854
Epoch: 008, Loss: 0.6805
Epoch: 009, Loss: 0.6768
Epoch: 010, Loss: 0.6742
Epoch: 011, Loss: 0.6716
Epoch: 012, Loss: 0.6694
Epoch: 013, Loss: 0.6673
Epoch: 014, Loss: 0.6659
Epoch: 015, Loss: 0.6647
Epoch: 016, Loss: 0.6642
Epoch: 017, Loss: 0.6636
Epoch: 018, Loss: 0.6632
Epoch: 019, Loss: 0.6626
Epoch: 020, Loss: 0.6618
Epoch: 021, Loss: 0.6610
Epoch: 022, Loss: 0.6602
Epoch: 023, Loss: 0.6589
Epoch: 024, Loss: 0.6575
Epoch: 025, Loss: 0.6559
Epoch: 026, Loss: 0.6544
Epoch: 027, Loss: 0.6524
Epoch: 028, Loss: 0.6505
Epoch: 029, Loss: 0.6485
Epoch: 030, Loss: 0.6461
Epoch: 031, Loss: 0.6436
Epoch: 032, Loss: 0.6421
Epoch: 033, Loss: 0.6402
Epoch: 034, Loss: 0.6381
Epoch: 035, Loss: 0.6365
Epoch: 036, Loss: 0.6343
Epoch: 037, Loss: 0.6322
Epoch: 038, Loss: 0.6302
Epoch: 039, Loss: 0.6282
Epoch: 040, Loss: 0.6263


In [10]:
# Evaluation
model.eval()
with torch.no_grad():
    z = model.encode(data.x, data.edge_index)
    test_edges_tensor = torch.tensor(
        [[node_to_idx[u], node_to_idx[v]] for u, v in test_edges if u in node_to_idx and v in node_to_idx],
        dtype=torch.long
    ).t().contiguous()
    test_non_edges_tensor = torch.tensor(
        [[node_to_idx[u], node_to_idx[v]] for u, v in test_non_edges if u in node_to_idx and v in node_to_idx],
        dtype=torch.long
    ).t().contiguous()

    test_pos_out = model.decode(z, test_edges_tensor)
    test_neg_out = model.decode(z, test_non_edges_tensor)

    out_test = torch.cat([test_pos_out, test_neg_out], dim=0)
    labels_test = torch.cat([torch.ones(test_pos_out.size(0)), torch.zeros(test_neg_out.size(0))], dim=0)

    k = 100000
    probabilities = out_test.cpu().numpy()
    labels_np = labels_test.cpu().numpy()

    roc_auc = roc_auc_score(labels_np, probabilities)
    ap_score = average_precision_score(labels_np, probabilities)
    precision_at_k_val = precision_at_k(labels_np, probabilities, k)
    recall_at_k_val = recall_at_k(labels_np, probabilities, k)
    map_score = mean_average_precision(labels_np, probabilities)
    f1_k_val = f1_beta_at_k(labels_np, probabilities, k)

    print("{:<25} {:>10} {:>15} {:>15} {:>15} {:>15} {:>15}".format("Model", "AUC-ROC", "AP Score", "Prec@100k", "Rec@100k", "MAP", "F1@100k"))
    print("=" * 105)
    print("{:<25} {:>10.6f} {:>15.6f} {:>15.6f} {:>15.6f} {:>15.6f} {:>15.6f}".format(
        "GNN", roc_auc, ap_score, precision_at_k_val, recall_at_k_val, map_score, f1_k_val
    ))


Model                        AUC-ROC        AP Score       Prec@100k        Rec@100k             MAP         F1@100k
GNN                         0.903680        0.907610        0.964510        0.399436        0.907609        0.564920


In [28]:
# Save the model
torch.save(model.state_dict(), "gnn_model.pth")
print("Success!")

Success!


In [12]:
!pip install nbmerge

Collecting nbmerge
  Using cached nbmerge-0.0.4-py2.py3-none-any.whl
Installing collected packages: nbmerge
Successfully installed nbmerge-0.0.4


In [14]:
!nbmerge modeling_dave.ipynb community_detection.ipynb link_prediction_alek.ipynb > merged.ipynb