In [50]:
import json
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from torch_geometric.data import Data
from torch_geometric.nn import RGCNConv
from collections import defaultdict

In [51]:
datapath = "../graph_data/"
# 1. Load files
with open(datapath+'entities.json', 'r') as f:
    entities = json.load(f)             # { entity_str: { 'canonical': text, ... }, ... }
with open(datapath+'relation2id.json', 'r') as f:
    rel2id = json.load(f)               # { relation_str: relation_id, ... }

entity_names = list(entities.keys())
entity2id = {name: idx for idx, name in enumerate(entity_names)}
num_nodes = len(entity2id)
num_rels = len(rel2id)

id2cluster = { entity2id[name]: name.rsplit('_', 1)[0] for name in entity_names }
# Build cluster → list of node‑IDs
cluster2entity_ids = defaultdict(list)
for node_id, cl in id2cluster.items():
    cluster2entity_ids[cl].append(node_id)

# --- 2. Load train/dev/test triples as ID tuples ---
def load_id_triples(path):
    triples = []
    with open(path) as f:
        for line in f:
            h_str, r_str, t_str = line.strip().split('\t')
            h_id = entity2id[h_str]
            r_id = int(r_str)
            t_id = entity2id[t_str]
            triples.append((h_id, r_id, t_id))
    return triples

train_triples = load_id_triples(datapath+"triples_train.tsv")
dev_triples   = load_id_triples(datapath+"triples_dev.tsv")
test_triples  = load_id_triples(datapath+"triples_test.tsv")

In [52]:
# --- 3. Build graph from train triples ---
src = [h for h, r, t in train_triples]
dst = [t for h, r, t in train_triples]
edge_index = torch.tensor([src, dst], dtype=torch.long)
edge_type  = torch.tensor([r for h, r, t in train_triples], dtype=torch.long)

data = Data(edge_index=edge_index)
data.edge_type = edge_type

# --- 4. Negative sampling function ---
def negative_sample_tails(triples, neg_rate=1):
    examples = []
    for h, r, t in triples:
        examples.append((h, r, t, 1))
        c = id2cluster[h]
        candidates = cluster2entity_ids[c]
        for _ in range(neg_rate):
            t_neg = random.choice(candidates)
            while t_neg == t:
                t_neg = random.choice(candidates)
            examples.append((h, r, t_neg, 0))
    return examples

In [53]:
class RGCNLinkPredictor(nn.Module):
    def __init__(
        self,
        num_nodes: int,
        in_dim: int,
        hidden_dim: int,
        out_dim: int,
        num_rels: int,
        num_layers: int = 2,
        dropout: float = 0.3
    ):
        super().__init__()
        # learnable node and relation embeddings
        self.node_emb = nn.Embedding(num_nodes, in_dim)
        self.rel_emb  = nn.Embedding(num_rels, out_dim)
        self.dropout  = nn.Dropout(dropout)

        # build R-GCN layers
        self.convs = nn.ModuleList()
        if num_layers == 1:
            self.convs.append(RGCNConv(in_dim, out_dim, num_rels))
        else:
            # first layer
            self.convs.append(RGCNConv(in_dim, hidden_dim, num_rels))
            # intermediate layers
            for _ in range(num_layers - 2):
                self.convs.append(RGCNConv(hidden_dim, hidden_dim, num_rels))
            # final layer
            self.convs.append(RGCNConv(hidden_dim, out_dim, num_rels))

    def forward(self, edge_index, edge_type):
        # initial node features
        x = self.node_emb.weight  # shape: (num_nodes, in_dim)
        # apply each R-GCN conv with residual + dropout
        for conv in self.convs:
            h_prev = x
            x = conv(x, edge_index, edge_type)
            x = F.relu(x)
            x = self.dropout(x)
            # residual if dimensions match
            if x.shape == h_prev.shape:
                x = x + h_prev
        return x

In [54]:

# --- 7. Training & evaluation utilities ---
def train_epoch(neg_rate=1, batch_size=1024):
    model.train()
    total_loss = 0.0

    examples = negative_sample_tails(train_triples, neg_rate)
    random.shuffle(examples)

    for i in range(0, len(examples), batch_size):
        batch = examples[i : i+batch_size]
        h_ids = torch.tensor([h for h,_,_,_ in batch], device=device)
        r_ids = torch.tensor([r for _,r,_,_ in batch], device=device)
        t_ids = torch.tensor([t for *_,t, _ in batch], device=device)
        labels= torch.tensor([l for *_,_,l in batch], dtype=torch.float, device=device)

        optimizer.zero_grad()

        # ← Moved inside the loop
        node_embs = model(data.edge_index.to(device),
                          data.edge_type.to(device))
        rel_embs  = model.rel_emb.weight

        e_h = node_embs[h_ids]
        e_t = node_embs[t_ids]
        e_r = rel_embs[r_ids]

        scores = (e_h * e_r * e_t).sum(dim=1)
        loss   = F.binary_cross_entropy_with_logits(scores, labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item() * len(batch)

    return total_loss / len(examples)

@torch.no_grad()
def evaluate(triples):
    model.eval()
    # Precompute node and relation embeddings once per evaluation
    node_embs = model(data.edge_index.to(device), data.edge_type.to(device))
    rel_embs  = model.rel_emb.weight.to(device)
    ranks = []

    for h, r, t in triples:
        # score against all nodes
        v = node_embs[h] * rel_embs[r]
        scores = (node_embs * v).sum(dim=1)

        # Restrict scoring to h's cluster
        c = id2cluster[h]
        candidates = cluster2entity_ids[c]
        cand_scores = scores[candidates]

        # Local ranking within cluster
        sorted_idx = torch.argsort(cand_scores, descending=True)
        sorted_cands = [candidates[i] for i in sorted_idx]
        rank = sorted_cands.index(t) + 1
        ranks.append(rank)

    ranks = np.array(ranks)
    mrr   = np.mean(1.0 / ranks)
    hits1 = np.mean(ranks <= 1)
    hits3 = np.mean(ranks <= 3)
    hits10= np.mean(ranks <= 10)
    return mrr, hits1, hits3, hits10



In [None]:
# --- 6. Instantiate model & optimizer ---
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = RGCNLinkPredictor(
    num_nodes=num_nodes,
    in_dim=128,
    hidden_dim=128,
    out_dim=128,
    num_rels=num_rels,
    num_layers=3
).to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)




In [56]:
# --- 8. Main training loop ---
best_mrr = 0.0
num_epochs = 50
for epoch in range(1, num_epochs+1):
    loss = train_epoch(neg_rate=5)
    dev_mrr, dev_h1, dev_h3, dev_h10 = evaluate(dev_triples)
    print(f"Epoch {epoch} ▶ loss={loss:.4f}  Dev MRR={dev_mrr:.4f}  Hits@1={dev_h1:.4f} Hits@3={dev_h3:.4f} Hits@10={dev_h10:.4f}")
    if dev_mrr > best_mrr:
        best_mrr = dev_mrr
        torch.save(model.state_dict(), "best_rgcn.pt")

# --- 9. Final test evaluation ---
model.load_state_dict(torch.load("best_rgcn.pt"))
test_mrr, test_h1, test_h3, test_h10 = evaluate(test_triples)
print(f"Test ▶ MRR={test_mrr:.4f}  Hits@1={test_h1:.4f}  Hits@3={test_h3:.4f} Hits@10={test_h10:.4f}")

Epoch 1 ▶ loss=11.7891  Dev MRR=0.2373  Hits@1=0.0791 Hits@3=0.2396 Hits@10=0.6857
Epoch 2 ▶ loss=4.3331  Dev MRR=0.2482  Hits@1=0.0857 Hits@3=0.2484 Hits@10=0.6989
Epoch 3 ▶ loss=2.7609  Dev MRR=0.2486  Hits@1=0.0835 Hits@3=0.2527 Hits@10=0.6923
Epoch 4 ▶ loss=2.1204  Dev MRR=0.2438  Hits@1=0.0791 Hits@3=0.2505 Hits@10=0.6923
Epoch 5 ▶ loss=1.6767  Dev MRR=0.2448  Hits@1=0.0769 Hits@3=0.2527 Hits@10=0.6813
Epoch 6 ▶ loss=1.3936  Dev MRR=0.2443  Hits@1=0.0769 Hits@3=0.2505 Hits@10=0.6901
Epoch 7 ▶ loss=1.1739  Dev MRR=0.2538  Hits@1=0.0945 Hits@3=0.2505 Hits@10=0.6835
Epoch 8 ▶ loss=1.0966  Dev MRR=0.2501  Hits@1=0.0857 Hits@3=0.2549 Hits@10=0.6791
Epoch 9 ▶ loss=0.9557  Dev MRR=0.2508  Hits@1=0.0835 Hits@3=0.2769 Hits@10=0.6791
Epoch 10 ▶ loss=0.8695  Dev MRR=0.2521  Hits@1=0.0835 Hits@3=0.2637 Hits@10=0.6659
Epoch 11 ▶ loss=0.7943  Dev MRR=0.2529  Hits@1=0.0857 Hits@3=0.2615 Hits@10=0.6637
Epoch 12 ▶ loss=0.7511  Dev MRR=0.2539  Hits@1=0.0901 Hits@3=0.2681 Hits@10=0.6637
Epoch 13 ▶ l

  model.load_state_dict(torch.load("best_rgcn.pt"))


Test ▶ MRR=0.2411  Hits@1=0.0801  Hits@3=0.2269 Hits@10=0.6992
