In [1]:
!pip install torch_geometric



In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_geometric
from torch_geometric.datasets import HM
from torch_geometric.transforms import NormalizeFeatures
from torch_geometric.data import HeteroData
from torch_geometric.utils import negative_sampling
from torch_geometric.nn import SAGEConv, HeteroConv

In [3]:
device = "cuda" if torch.cuda.is_available()  else "cpu"
device

'cuda'

In [4]:
dataset = HM(root="data/HM", transform=NormalizeFeatures())
dataset[0]

HeteroData(
  customer={ x=[1371980, 9] },
  article={ x=[105542, 683] },
  (customer, to, article)={
    edge_index=[2, 31788324],
    time=[31788324],
    edge_attr=[31788324, 3],
  },
  (article, rev_to, customer)={
    edge_index=[2, 31788324],
    time=[31788324],
    edge_attr=[31788324, 3],
  }
)

### Sampling a smaller SubGraph

In [5]:
def Sample_SubGraph(dataset, num_customers=100000):
    # Picking a random set of customer and their corresponding links

    total_customer = dataset[0]["customer"]['x'].shape[0]
    chosen_customers = torch.randperm(total_customer)[:num_customers]

    src, dest = dataset[0]['customer', 'to', 'article']['edge_index']
    mask = torch.isin(src, chosen_customers)

    new_src = src[mask]
    new_dst = dest[mask]

    # Get unique articles that remain
    chosen_articles = torch.unique(new_dst)

    # Remap customer and article indices
    cust_id_map = {old.item(): new for new, old in enumerate(chosen_customers)}
    art_id_map = {old.item(): new for new, old in enumerate(chosen_articles)}

    new_src = torch.tensor([cust_id_map[s.item()] for s in new_src])
    new_dst = torch.tensor([art_id_map[d.item()] for d in new_dst])

    # Create new HeteroData
    new_data = HeteroData()

    new_data['customer'].x = dataset[0]['customer']['x'][chosen_customers]
    new_data['article'].x = dataset[0]['article']['x'][chosen_articles]

    new_data['customer', 'to', 'article'].edge_index = torch.stack([new_src, new_dst])
    new_data['article', 'rev_to', 'customer'].edge_index = torch.stack([new_dst, new_src])

    # Copy edge attributes & time (filter with same mask)
    if 'edge_attr' in dataset[0]['customer', 'to', 'article']:
        new_data['customer', 'to', 'article'].edge_attr = dataset[0]['customer', 'to', 'article']['edge_attr'][mask]
        new_data['article', 'rev_to', 'customer'].edge_attr = dataset[0]['article', 'rev_to', 'customer']['edge_attr'][mask]

    if 'time' in dataset[0]['customer', 'to', 'article']:
        new_data['customer', 'to', 'article'].time = dataset[0]['customer', 'to', 'article']['time'][mask]
        new_data['article', 'rev_to', 'customer'].time = dataset[0]['article', 'rev_to', 'customer']['time'][mask]

    return new_data

In [6]:
# Sub Graph
data = Sample_SubGraph(dataset)

In [7]:
data

HeteroData(
  customer={ x=[100000, 9] },
  article={ x=[82477, 683] },
  (customer, to, article)={
    edge_index=[2, 2327067],
    edge_attr=[2327067, 3],
    time=[2327067],
  },
  (article, rev_to, customer)={
    edge_index=[2, 2327067],
    edge_attr=[2327067, 3],
    time=[2327067],
  }
)

### Splitting Edges into Train/ Test

In [8]:
def split(data, split_ratio=0.3, seed=42):
    edge_index = data['customer', 'to', 'article'].edge_index
    num_edges = edge_index.shape[1]

    # Shuffle edges
    torch.manual_seed(seed)
    perm = torch.randperm(num_edges)

    #Split
    test_size = int(split_ratio * num_edges)

    test_src, test_dest = edge_index[:, perm[:test_size]]
    train_src, train_dest = edge_index[:, perm[test_size:]]

    train_edges_forward = torch.stack([train_src, train_dest])
    train_edges_reverse = torch.stack([train_dest, train_src])

    test_edges_forward = torch.stack([test_src, test_dest])
    test_edges_reverse = torch.stack([test_dest, test_src])

    train_edges = [train_edges_forward, train_edges_reverse]
    test_edges = [test_edges_forward, test_edges_reverse]

    return train_edges, test_edges

In [9]:
train_edges, test_edges = split(data)

### Sampling Negative Edges

In [10]:
# Sampling Negative edges on the train edges
neg_edge_index_forward = negative_sampling(
    edge_index=train_edges[0],
    num_nodes = (data['customer'].x.shape[0], data['article'].x.shape[0]),
    num_neg_samples= train_edges[0].shape[1]
)

src, dst = neg_edge_index_forward

neg_edge_index_reverse = torch.stack([dst, src])

neg_edges = [neg_edge_index_forward, neg_edge_index_reverse]

In [30]:
# Sampling Negative edges on the test edges
neg_edge_index_forward = negative_sampling(
    edge_index=test_edges[0],
    num_nodes = (data['customer'].x.shape[0], data['article'].x.shape[0]),
    num_neg_samples= test_edges[0].shape[1]
)

src, dst = neg_edge_index_forward

neg_edge_index_reverse = torch.stack([dst, src])

test_neg_edges = [neg_edge_index_forward, neg_edge_index_reverse]

In [11]:
def Labels(pos_edges, neg_edges):
    pos_forward = pos_edges[0]
    neg_forward = neg_edges[0]

    # pos_reverse = pos_edges[1]
    # neg_reverse = neg_edges[1]

    combined_edges_forward = torch.cat((pos_forward, neg_forward), dim=1)
    # combined_edges_reverse = torch.cat((pos_reverse, neg_reverse), dim=1)

    labels_forward = torch.cat((torch.ones(pos_forward.size(1)), torch.zeros(neg_forward.size(1))), dim=0)
    # labels_reverse = torch.cat((torch.ones(pos_reverse.size(1)), torch.zeros(neg_reverse.size(1))), dim=0)

    # combined_edges = [combined_edges_forward, combined_edges_reverse]
    # labels = [labels_forward, labels_reverse]

    return combined_edges_forward, labels_forward


In [12]:
combined_edges, labels = Labels(train_edges, neg_edges) # only forward edges considered

In [31]:
test_combined_edges, test_labels = Labels(test_edges, test_neg_edges)

In [13]:
x_dict = {
    'customer': data['customer'].x.to(device), # shape [100000, 9]
    'article':  data['article'].x.to(device)  # shape [82302, 683]
}

edge_index_dict = {
    ('customer', 'to', 'article'): train_edges[0].to(device),
    ('article', 'rev_to', 'customer'): train_edges[1].to(device),
}

### GNN Model

In [14]:
class HeteroGNN(nn.Module):
    def __init__(self, hidden_channels=128):
        super().__init__()

        # One layer per relation type
        self.conv1 = HeteroConv({
            ('customer', 'to', 'article'): SAGEConv((-1, -1), hidden_channels),
            ('article', 'rev_to', 'customer'): SAGEConv((-1, -1), hidden_channels),
        }, aggr='sum')

        self.conv2 = HeteroConv({
            ('customer', 'to', 'article'): SAGEConv((-1, -1), hidden_channels),
            ('article', 'rev_to', 'customer'): SAGEConv((-1, -1), hidden_channels),
        }, aggr='sum')

    def forward(self, x_dict, edge_index_dict):

        x_dict = self.conv1(x_dict, edge_index_dict)
        x_dict = {k: F.relu(v) for k, v in x_dict.items()}

        # Second conv (final embeddings)
        x_dict = self.conv2(x_dict, edge_index_dict)
        return x_dict

In [21]:
model = HeteroGNN()
model = model.to(device)

In [22]:
# To check if the initialization works
with torch.no_grad():  # Initialize lazy modules.
     out = model(x_dict, edge_index_dict)
out['article'].shape, out['customer'].shape

(torch.Size([82477, 128]), torch.Size([100000, 128]))

In [23]:
optimizer = torch.optim.Adam(model.parameters(), lr = 0.01)
criterion = torch.nn.BCEWithLogitsLoss()

In [24]:
def accuracy(pred, label): # Link Prediction Accuracy
  accu = 0.0

  pred = (pred > 0.5).float()
  count = (pred == label).sum().item()
  accu = count / len(pred)
  accu = round(accu, 4)
  return accu

In [25]:
def train(model, x, edge_index, combined_edges, labels):
    train_loss = 0.0
    model.train()

    for epoch in range(200):
        optimizer.zero_grad()

        # Get Node Embeddings
        node_emb = model(x, edge_index)

        # Extract source and destination node embeddings
        src = combined_edges[0]
        dst = combined_edges[1]
        src_emb = node_emb['customer'][src]
        dst_emb = node_emb['article'][dst]

        # Compute dot product for edge prediction
        dot_prd = (src_emb * dst_emb).sum(dim=1)

        # Compute loss
        loss = criterion(dot_prd, labels.to(device))

        # Backpropagation
        loss.backward()
        optimizer.step()

        pred = torch.sigmoid(dot_prd)

        # Compute accuracy
        Accuracy = accuracy(pred, labels.to(device))

        if epoch % 1 == 0:
            print(f"Epoch {epoch + 1}/200 - Loss: {loss.item():.4f}, Accuracy: {Accuracy}")

    return node_emb

In [33]:
emb = train(model, x_dict, edge_index_dict, combined_edges, labels)

Epoch 1/200 - Loss: 0.3037, Accuracy: 0.8696
Epoch 2/200 - Loss: 0.3039, Accuracy: 0.8696
Epoch 3/200 - Loss: 0.3045, Accuracy: 0.869
Epoch 4/200 - Loss: 0.3061, Accuracy: 0.8687
Epoch 5/200 - Loss: 0.3082, Accuracy: 0.8668
Epoch 6/200 - Loss: 0.3130, Accuracy: 0.8657
Epoch 7/200 - Loss: 0.3154, Accuracy: 0.8626
Epoch 8/200 - Loss: 0.3160, Accuracy: 0.8643
Epoch 9/200 - Loss: 0.3053, Accuracy: 0.8684
Epoch 10/200 - Loss: 0.3016, Accuracy: 0.8706
Epoch 11/200 - Loss: 0.3074, Accuracy: 0.8682
Epoch 12/200 - Loss: 0.3056, Accuracy: 0.8687
Epoch 13/200 - Loss: 0.3005, Accuracy: 0.8714
Epoch 14/200 - Loss: 0.3022, Accuracy: 0.8705
Epoch 15/200 - Loss: 0.3035, Accuracy: 0.8698
Epoch 16/200 - Loss: 0.3002, Accuracy: 0.8715
Epoch 17/200 - Loss: 0.2993, Accuracy: 0.872
Epoch 18/200 - Loss: 0.3015, Accuracy: 0.8707
Epoch 19/200 - Loss: 0.3011, Accuracy: 0.8712
Epoch 20/200 - Loss: 0.2981, Accuracy: 0.8724
Epoch 21/200 - Loss: 0.2979, Accuracy: 0.8725
Epoch 22/200 - Loss: 0.2997, Accuracy: 0.8718

In [40]:
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    precision_recall_fscore_support
)

@torch.no_grad()
def test(model, x, edge_index, combined_edges, labels):
    model.eval()

    # Node embeddings from the GNN
    node_emb = model(x, edge_index)

    # Use forward edges (customer → article)
    src = combined_edges[0].to(device)
    dst = combined_edges[1].to(device)

    src_emb = node_emb['customer'][src]
    dst_emb = node_emb['article'][dst]

    dot_prd = (src_emb * dst_emb).sum(dim=1)

    labels = labels.to(device)
    loss = criterion(dot_prd, labels)

    # Prediction probabilities
    pred_prob = torch.sigmoid(dot_prd)
    pred = (pred_prob > 0.5).long()   # threshold at 0.5

    # Metrics
    acc = accuracy(pred_prob, labels)
    auc = roc_auc_score(labels.cpu().numpy(), pred_prob.cpu().numpy())
    ap = average_precision_score(labels.cpu().numpy(), pred_prob.cpu().numpy())

    # Precision, Recall, F1, Support
    precision, recall, f1, support = precision_recall_fscore_support(
        labels.cpu().numpy(),
        pred.cpu().numpy(),
        average=None   # returns per-class metrics
    )

    print(f"Test Loss: {loss.item():.4f}, Accuracy: {acc:.4f}, AUC: {auc:.4f}, AP: {ap:.4f}")
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1:", f1)
    print("Support:", support)

    return {
        "loss": loss.item(),
        "accuracy": acc,
        "auc": auc,
        "ap": ap,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }


In [43]:
results = test(model, x_dict, edge_index_dict, combined_edges, labels)

train_acc = results["accuracy"]
train_auc = results["auc"]
train_avg_prec = results["ap"]
t_prec = results["precision"]
t_recall = results["recall"]
t_f1 = results["f1"]


Test Loss: 0.2516, Accuracy: 0.8957, AUC: 0.9566, AP: 0.9449
Precision: [0.92845569 0.86767824]
Recall: [0.8575767  0.93391743]
F1: [0.89160977 0.89958013]
Support: [1628947 1628947]


In [44]:
# Test Accuracy and ROC score
results = test(model, x_dict, edge_index_dict, test_combined_edges, test_labels)

Test Loss: 0.4709, Accuracy: 0.8475, AUC: 0.9170, AP: 0.9101
Precision: [0.84173397 0.8534515 ]
Recall: [0.85592162 0.83906635]
F1: [0.84876851 0.8461978 ]
Support: [698120 698120]


In [45]:
# from sklearn.metrics import roc_auc_score

# @torch.no_grad()
# def test(model, x, edge_index, combined_edges, labels):
#     model.eval()

#     # Node embeddings from the GNN
#     node_emb = model(x, edge_index)

#     # Use forward edges (customer → article)
#     src = combined_edges[0].to(device)
#     dst = combined_edges[1].to(device)

#     src_emb = node_emb['customer'][src]
#     dst_emb = node_emb['article'][dst]

#     dot_prd = (src_emb * dst_emb).sum(dim=1)

#     labels = labels.to(device)
#     loss = criterion(dot_prd, labels)

#     # Prediction probabilities
#     pred = torch.sigmoid(dot_prd)

#     # Metrics
#     acc = accuracy(pred, labels)
#     auc = roc_auc_score(labels.cpu().numpy(), pred.cpu().numpy())

#     print(f"Test Loss: {loss.item():.4f}, Accuracy: {acc:.4f}, AUC: {auc:.4f}")
#     return acc, auc
