In [1]:
# 04 - LightGCN Model Implementation
## Graph-based Recommendation on Book-Crossing Subgraph

import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.utils import structured_negative_sampling
import numpy as np
import psutil
import gc
from tqdm.notebook import tqdm



The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed')).History will not be written to the database.


In [2]:
## 1. Load Graph and Mappings

from torch_geometric.data.data import DataTensorAttr, DataEdgeAttr
from torch_geometric.data.storage import GlobalStorage
import torch.serialization

# Register the PyG container classes as safe globals so torch.load accepts
# them while unpickling the saved graph object.
torch.serialization.add_safe_globals([DataTensorAttr, DataEdgeAttr, GlobalStorage])
data = torch.load('data/processed/graph_data.pt')

# Node counts stored on the graph object; later cells read these names.
num_users, num_books, num_nodes = data.num_users, data.num_books, data.num_nodes

print(data)
print(f"Users: {num_users}, Books: {num_books}, Total nodes: {num_nodes}")


Data(x=[5582, 64], edge_index=[2, 95220], num_nodes=5582, num_users=3404, num_books=2178)
Users: 3404, Books: 2178, Total nodes: 5582


In [3]:
## 2. Train/Val/Test Split (Random Edge Split)

# We'll use 80% train, 10% val, 10% test edges
# LightGCN typically uses random positive edge split

def random_edge_split(edge_index, test_ratio=0.1, val_ratio=0.1, generator=None):
    """Randomly split an undirected edge list into train/val/test edge sets.

    The function expects ``edge_index`` to be laid out as
    ``[forward edges | reversed edges]``: column ``k`` of the first half and
    column ``k + E`` of the second half describe the same undirected edge
    (``E`` is half the number of columns). An undirected edge is always
    assigned to exactly one split, together with its reversed copy.

    Args:
        edge_index: Long tensor of shape ``(2, 2 * E)``.
        test_ratio: Fraction of the ``E`` undirected edges put in the test set.
        val_ratio: Fraction of the ``E`` undirected edges put in the val set.
        generator: Optional ``torch.Generator`` used for the permutation, so
            the split can be reproduced. ``None`` uses the global RNG.

    Returns:
        ``(train_edge_index, val_edge_index, test_edge_index)``; each tensor
        holds the selected forward edges followed by their reversed copies.

    Raises:
        ValueError: If a ratio is negative or the two ratios sum to more
            than 1.
    """
    import warnings

    if test_ratio < 0 or val_ratio < 0 or test_ratio + val_ratio > 1:
        raise ValueError(
            "test_ratio and val_ratio must be non-negative and sum to at most 1, "
            f"got test_ratio={test_ratio}, val_ratio={val_ratio}"
        )

    total_columns = edge_index.size(1)
    num_edges = total_columns // 2  # undirected, so half

    # The index arithmetic below is only meaningful when the second half
    # mirrors the first; otherwise the reverse copy of a held-out edge could
    # stay in the training graph. Warn instead of failing so existing callers
    # keep running.
    mirrored = total_columns % 2 == 0 and torch.equal(
        edge_index[:, :num_edges], edge_index[:, num_edges:].flip(0)
    )
    if not mirrored:
        warnings.warn(
            "edge_index is not laid out as [forward edges | reversed edges]; "
            "the resulting splits may pair unrelated columns",
            RuntimeWarning,
            stacklevel=2,
        )

    perm = torch.randperm(num_edges, generator=generator)

    test_size = int(num_edges * test_ratio)
    val_size = int(num_edges * val_ratio)

    test_pos = perm[:test_size]
    val_pos = perm[test_size:test_size + val_size]
    train_pos = perm[test_size + val_size:]

    def both_directions(positions):
        # Forward columns first, then the matching reversed columns.
        return torch.cat(
            [edge_index[:, positions], edge_index[:, positions + num_edges]], dim=1
        )

    return both_directions(train_pos), both_directions(val_pos), both_directions(test_pos)

# Split the loaded graph's edges; later cells use these three tensors by name.
(train_edge_index,
 val_edge_index,
 test_edge_index) = random_edge_split(data.edge_index)

# Every split stores both directions of an edge, so halve the column count
# to report the number of undirected edges.
print(f"Train edges: {train_edge_index.shape[1]//2}")
print(f"Val edges: {val_edge_index.shape[1]//2}")
print(f"Test edges: {test_edge_index.shape[1]//2}")

Train edges: 38088
Val edges: 4761
Test edges: 4761


In [None]:
## 3. LightGCN Model Definition
# ...existing code...
from torch_geometric.utils import structured_negative_sampling, degree
# ...existing code...
class LightGCN(nn.Module):
    """LightGCN: embedding propagation over a user-book interaction graph.

    Node ids are expected to list users first (``0 .. num_users - 1``) and
    books second (``num_users .. num_users + num_books - 1``); this is the
    order in which the two embedding tables are concatenated in ``forward``.

    Args:
        num_users: Number of user nodes.
        num_books: Number of book (item) nodes.
        embedding_dim: Size of every embedding vector.
        num_layers: Number of propagation steps.
    """

    def __init__(self, num_users, num_books, embedding_dim=64, num_layers=3):
        super().__init__()
        self.num_users = num_users
        self.num_books = num_books
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers

        # Layer-0 embeddings; these are the only trainable parameters.
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_books, embedding_dim)

        self.reset_parameters()

    def reset_parameters(self):
        """Re-initialise both embedding tables from N(0, 0.01^2)."""
        nn.init.normal_(self.user_embedding.weight, std=0.01)
        nn.init.normal_(self.item_embedding.weight, std=0.01)

    @staticmethod
    def _normalized_adjacency(edge_index, num_nodes, dtype):
        """Build the sparse matrix with entries 1 / sqrt(deg[i] * deg[j]).

        ``deg`` counts how often a node occurs in the first row of
        ``edge_index``. Nodes with degree 0 get a scaling factor of 0 instead
        of ``inf`` so they cannot inject non-finite values.
        """
        row, col = edge_index
        deg = torch.bincount(row, minlength=num_nodes).to(dtype)
        deg_inv_sqrt = deg.pow(-0.5)
        deg_inv_sqrt = deg_inv_sqrt.masked_fill(torch.isinf(deg_inv_sqrt), 0.0)
        norm = deg_inv_sqrt[row] * deg_inv_sqrt[col]
        # The adjacency is square (nodes x nodes), independent of the
        # embedding width.
        adjacency = torch.sparse_coo_tensor(edge_index, norm, (num_nodes, num_nodes))
        return adjacency.coalesce()

    def forward(self, edge_index):
        """Propagate embeddings and return the layer-averaged result.

        Args:
            edge_index: Long tensor of shape ``(2, num_edges)`` over the
                joint user+book node id space.

        Returns:
            ``(user_embeddings, item_embeddings)`` with shapes
            ``(num_users, embedding_dim)`` and ``(num_books, embedding_dim)``.
        """
        # Concat user and item embeddings into one node-feature matrix.
        x = torch.cat([self.user_embedding.weight, self.item_embedding.weight], dim=0)

        # The graph does not change between layers, so normalise it once.
        adjacency = self._normalized_adjacency(edge_index, x.size(0), x.dtype)

        layer_sum = x
        for _ in range(self.num_layers):
            x = torch.sparse.mm(adjacency, x)
            layer_sum = layer_sum + x

        # Average of layer 0 .. num_layers (core LightGCN idea).
        final = layer_sum / (self.num_layers + 1)

        user_emb_final, item_emb_final = torch.split(final, [self.num_users, self.num_books])
        return user_emb_final, item_emb_final

    def propagate(self, x, edge_index):
        """Apply one normalised-adjacency multiplication to ``x``.

        Args:
            x: Node feature matrix of shape ``(num_nodes, dim)``.
            edge_index: Long tensor of shape ``(2, num_edges)``.

        Returns:
            Tensor of the same shape as ``x``.
        """
        adjacency = self._normalized_adjacency(edge_index, x.size(0), x.dtype)
        return torch.sparse.mm(adjacency, x)

# Build the model on the CPU (the device is pinned on purpose).
device = torch.device('cpu')  # forced CPU
model = LightGCN(
    num_users=num_users,
    num_books=num_books,
    embedding_dim=64,
    num_layers=3,
)
model = model.to(device)

# Show the module tree and the number of trainable scalars.
print(model)
print(f"Total parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")


LightGCN(
  (user_embedding): Embedding(3404, 64)
  (item_embedding): Embedding(2178, 64)
)
Total parameters: 357,248


In [8]:
## 4. BPR Loss and Negative Sampling

def bpr_loss(user_emb, pos_emb, neg_emb):
    """Bayesian Personalised Ranking loss for (user, positive, negative) triples.

    Args:
        user_emb: Tensor ``(batch, dim)`` of user embeddings.
        pos_emb: Tensor ``(batch, dim)`` of embeddings of interacted items.
        neg_emb: Tensor ``(batch, dim)`` of embeddings of sampled negatives.

    Returns:
        Scalar tensor: mean of ``-log(sigmoid(pos_score - neg_score))`` where
        each score is the dot product with the user embedding.
    """
    pos_scores = (user_emb * pos_emb).sum(dim=1)
    neg_scores = (user_emb * neg_emb).sum(dim=1)
    # logsigmoid is evaluated in a numerically stable way, so badly ranked
    # pairs keep a useful gradient instead of saturating at log(eps).
    return -nn.functional.logsigmoid(pos_scores - neg_scores).mean()

# Negative sampling using structured_negative_sampling from PyG
def sample_negative(edge_index, num_users, num_items=None, max_rounds=100):
    """Sample one negative book for every user->book edge.

    Only columns whose source is a user node (``< num_users``) and whose
    target is a book node (``>= num_users``) are used; columns in the
    opposite direction are ignored. Negatives are drawn uniformly from the
    book id range only, and re-drawn while they collide with a positive
    ``(user, book)`` pair present in ``edge_index``.

    Args:
        edge_index: Long tensor ``(2, num_edges)`` over joint node ids
            (users first, then books).
        num_users: Number of user nodes, i.e. the offset of the first book.
        num_items: Number of book nodes. When ``None`` it is derived from the
            notebook-level ``num_nodes``.
        max_rounds: Upper bound on re-draw rounds. A user connected to every
            book has no valid negative; the cap keeps the loop finite and
            such a user keeps its last draw.

    Returns:
        ``(users, pos_items, neg_items)``: 1-D long tensors of equal length.
        ``users`` holds user ids; ``pos_items`` and ``neg_items`` hold book
        indices relative to the first book (node id minus ``num_users``).
    """
    if num_items is None:
        num_items = num_nodes - num_users

    src, dst = edge_index
    user_to_book = (src < num_users) & (dst >= num_users)
    users = src[user_to_book]
    pos_items = dst[user_to_book] - num_users  # adjust book nodes
    if users.numel() == 0:
        return users, pos_items, pos_items.clone()

    # Encode each (user, book) pair as one integer for fast membership tests.
    pos_keys = users * num_items + pos_items

    neg_items = torch.randint(num_items, users.shape, device=users.device)
    for _ in range(max_rounds):
        clash = torch.isin(users * num_items + neg_items, pos_keys)
        if not clash.any():
            break
        redraw = torch.randint(num_items, (int(clash.sum()),), device=users.device)
        neg_items[clash] = redraw

    return users, pos_items, neg_items

In [9]:
## 5. Training Function (One Epoch)

def _sample_item_negatives(users, num_items, pos_keys, max_rounds=100):
    """Draw one book index in ``[0, num_items)`` per user that is not a known positive.

    ``pos_keys`` holds the positives encoded as ``user * num_items + book``.
    Draws that hit a positive are repeated for at most ``max_rounds`` rounds.
    """
    neg_items = torch.randint(num_items, users.shape, device=users.device)
    for _ in range(max_rounds):
        clash = torch.isin(users * num_items + neg_items, pos_keys)
        if not clash.any():
            break
        redraw = torch.randint(num_items, (int(clash.sum()),), device=users.device)
        neg_items[clash] = redraw
    return neg_items


def train_one_epoch(model, edge_index, batch_size=1024, opt=None):
    """Run one epoch of mini-batch BPR training and return the mean batch loss.

    Every user->book column of ``edge_index`` is used once as a positive
    example. For each batch the model is run on the full ``edge_index``, one
    negative book per positive is sampled from the book id range (avoiding
    all positives in ``edge_index``), and one optimizer step is taken.

    Args:
        model: Module whose call ``model(edge_index)`` returns
            ``(user_embeddings, item_embeddings)`` and that exposes
            ``num_users`` and ``num_books``.
        edge_index: Long tensor ``(2, num_edges)`` over joint node ids (users
            first, then books). Used both for message passing and as the
            source of positive pairs.
        batch_size: Number of positive pairs per optimizer step.
        opt: Optimizer to step. When ``None`` the notebook-level
            ``optimizer`` is used.

    Returns:
        Mean loss over the batches as a float; ``0.0`` when ``edge_index``
        has no user->book column.
    """
    if opt is None:
        opt = optimizer

    n_users, n_items = model.num_users, model.num_books

    # Keep one direction only: source is a user, target is a book.
    src, dst = edge_index
    user_to_book = (src < n_users) & (dst >= n_users)
    all_users = src[user_to_book]
    all_items = dst[user_to_book] - n_users
    if all_users.numel() == 0:
        return 0.0
    pos_keys = all_users * n_items + all_items

    model.train()

    # Shuffle edges
    order = torch.randperm(all_users.numel(), device=all_users.device)
    all_users, all_items = all_users[order], all_items[order]

    total_loss = 0.0
    num_batches = 0
    for start in range(0, all_users.numel(), batch_size):
        u = all_users[start:start + batch_size]
        pos = all_items[start:start + batch_size]
        neg = _sample_item_negatives(u, n_items, pos_keys)

        opt.zero_grad()
        user_emb, item_emb = model(edge_index)  # full forward pass

        loss = bpr_loss(user_emb[u], item_emb[pos], item_emb[neg])
        loss.backward()
        opt.step()

        total_loss += loss.item()
        num_batches += 1

    return total_loss / num_batches

In [11]:
## 6. Setup Optimizer

# Adam over every model parameter; train_one_epoch picks this object up by name.
optimizer = optim.Adam(
    params=model.parameters(),
    lr=1e-3,
    weight_decay=1e-6,
)

print(
    "Model and training setup complete!",
    "Ready for training loop in next notebook.",
    sep="\n",
)

# Last expression of the cell: its return value is shown as the cell output.
gc.collect()

Model and training setup complete!
Ready for training loop in next notebook.


2759