# Experiment: LightGCN baseline for 200-20 reranking

Goal: train LightGCN on user-edition interactions, score the provided 200 candidates per user, and apply a simple genre-diversity rerank to produce `submission.csv`.

Notes:
- Only two event types exist in the data: `wishlist` (1) and `read` (2).
- The competition metric is `0.7 * NDCG@20 + 0.3 * Diversity@20` where Diversity uses genres.


In [None]:
# Setup: imports and reproducibility
# If needed:
# pip install pandas numpy scipy torch tqdm

import os
import math
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from scipy import sparse

# Seed the global NumPy and PyTorch RNGs with one shared value so that the
# random draws made through `np.random` and `torch` below start from a fixed state.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)


## Load data

Expected repo structure:
- `data/` with `users.csv`, `interactions.csv`, `editions.csv`, `book_genres.csv`
- `submit/` with `targets.csv`, `candidates.csv`


In [None]:
# Input locations, relative to the working directory.
DATA_DIR = "data"
SUBMIT_DIR = "submit"

# Training-side tables.
users = pd.read_csv(os.path.join(DATA_DIR, "users.csv"))
interactions = pd.read_csv(
    os.path.join(DATA_DIR, "interactions.csv"),
    parse_dates=["event_ts"],  # parsed as datetimes at load time
)
editions = pd.read_csv(os.path.join(DATA_DIR, "editions.csv"))
book_genres = pd.read_csv(os.path.join(DATA_DIR, "book_genres.csv"))

# Submission-side tables.
targets = pd.read_csv(os.path.join(SUBMIT_DIR, "targets.csv"))
candidates = pd.read_csv(os.path.join(SUBMIT_DIR, "candidates.csv"))

# Report the shape of every table that was just loaded.
for label, frame in (
    ("users", users),
    ("interactions", interactions),
    ("editions", editions),
    ("book_genres", book_genres),
    ("targets", targets),
    ("candidates", candidates),
):
    print(label, frame.shape)


## ID mappings

We map all user_id and edition_id to contiguous indices for model embeddings.


In [None]:
# Collect every id that appears in any table so each one gets an embedding row.
user_id_pool = pd.concat([users["user_id"], interactions["user_id"], targets["user_id"]])
edition_id_pool = pd.concat([editions["edition_id"], interactions["edition_id"], candidates["edition_id"]])
all_user_ids = pd.Index(user_id_pool.unique())
all_edition_ids = pd.Index(edition_id_pool.unique())

n_users = len(all_user_ids)
n_items = len(all_edition_ids)

# Raw id -> contiguous position (order of first appearance), plus the inverse for items.
user2idx = dict(zip(all_user_ids, range(n_users)))
item2idx = dict(zip(all_edition_ids, range(n_items)))
idx2item = {position: edition for edition, position in item2idx.items()}

# Positional indices used by the graph and the model.
interactions["u"] = interactions["user_id"].map(user2idx).astype(np.int64)
interactions["i"] = interactions["edition_id"].map(item2idx).astype(np.int64)

print("n_users", n_users, "n_items", n_items)


## Interaction weights

We use a simple weighting scheme:
- wishlist -> 1
- read -> 3, plus a rating boost of up to 0.2 (`0.2 * rating / 5`, with the rating clipped to 0..5 and missing ratings counted as 0)


In [None]:
def make_weight(df: pd.DataFrame) -> np.ndarray:
    """Return one float32 interaction weight per row of *df*.

    Rows with ``event_type == 1`` (wishlist) weigh 1.0. Rows with
    ``event_type == 2`` (read) weigh 3.0 plus ``0.2 * rating / 5``, where the
    rating is clipped to [0, 5] and a missing rating counts as 0. Rows with
    any other event type keep a weight of 0.0.
    """
    wish_mask = df["event_type"].eq(1).to_numpy()
    read_mask = df["event_type"].eq(2).to_numpy()

    # Rating rescaled to [0, 1]; only the read rows consume it below.
    rating_unit = df["rating"].astype("float32").fillna(0.0).clip(0.0, 5.0) / 5.0
    rating_unit = rating_unit.to_numpy()

    weights = np.zeros(len(df), dtype=np.float32)
    weights[wish_mask] = 1.0
    weights[read_mask] = 3.0
    weights[read_mask] += 0.2 * rating_unit[read_mask]
    return weights

# Attach the per-event weight as column "w" and preview it next to the columns it is derived from.
interactions["w"] = make_weight(interactions)
interactions[["user_id", "edition_id", "event_type", "rating", "w"]].head()


## Optional local time split (30 days)

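For offline validation, each user's events from the 30 days leading up to their own most recent event are held out as `val_df`; everything earlier goes to `train_df`.
Set `DO_LOCAL_SPLIT = False` to train on all data.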


In [None]:
# Toggle for the per-user, time-based holdout: True carves out a validation
# frame, False trains on every interaction.
# NOTE(review): nothing in the visible notebook consumes `val_df`, yet with True
# the model that produces the submission is fit on `train_df` only — confirm
# this is intended before generating a final submission.
DO_LOCAL_SPLIT = True

if DO_LOCAL_SPLIT:
    interactions = interactions.sort_values(["u", "event_ts"])
    # Latest event timestamp of each row's own user, broadcast back to row level.
    max_ts = interactions.groupby("u")["event_ts"].transform("max")
    # Per-user cutoff: 30 days before that user's last event.
    cutoff = max_ts - pd.Timedelta(days=30)

    # Strictly older events train; events at or after the cutoff validate.
    # A user whose entire history lies within their final 30 days therefore
    # contributes no training rows.
    train_df = interactions[interactions["event_ts"] < cutoff].copy()
    val_df = interactions[interactions["event_ts"] >= cutoff].copy()
else:
    train_df = interactions.copy()
    val_df = None

print("train events", len(train_df), "val events", 0 if val_df is None else len(val_df))


## Build normalized adjacency for LightGCN

We build the bipartite graph adjacency and use D^{-1/2} A D^{-1/2} normalization.


In [None]:
def build_norm_adj(n_users, n_items, df):
    """Build the symmetrically normalised bipartite adjacency ``D^-1/2 A D^-1/2``.

    Args:
        n_users: Number of user nodes; they occupy rows/columns ``0..n_users-1``.
        n_items: Number of item nodes; they follow the users.
        df: Frame with integer columns ``u`` (user index) and ``i`` (item
            index) and a numeric column ``w`` (edge weight). Repeated
            ``(u, i)`` rows are summed when the matrix is converted to CSR.

    Returns:
        A ``scipy.sparse`` COO matrix of shape
        ``(n_users + n_items, n_users + n_items)``. Nodes without any edge
        (zero degree) get all-zero rows and columns.
    """
    user_idx = df["u"].to_numpy()
    item_idx = df["i"].to_numpy()
    w = df["w"].to_numpy()

    R = sparse.coo_matrix((w, (user_idx, item_idx)), shape=(n_users, n_items))

    # Block layout [[0, R], [R^T, 0]]: users first, then items.
    upper = sparse.hstack([sparse.csr_matrix((n_users, n_users)), R], format="csr")
    lower = sparse.hstack([R.T, sparse.csr_matrix((n_items, n_items))], format="csr")
    A = sparse.vstack([upper, lower], format="csr")

    # ravel() (not squeeze()) keeps a 1-D vector even for a single-node graph.
    deg = np.asarray(A.sum(axis=1)).ravel()

    # `where=` leaves masked-out slots of the output untouched, so the output
    # buffer must be pre-filled; without `out=` those slots are uninitialised.
    deg_inv_sqrt = np.zeros(deg.shape, dtype=np.result_type(deg.dtype, np.float32))
    np.power(deg, -0.5, out=deg_inv_sqrt, where=deg > 0)
    D_inv_sqrt = sparse.diags(deg_inv_sqrt)

    A_norm = D_inv_sqrt @ A @ D_inv_sqrt
    return A_norm.tocoo()

# Build the propagation matrix from the training interactions only and report its stored-entry count.
A_norm = build_norm_adj(n_users, n_items, train_df)
print("A_norm nnz", A_norm.nnz)


## LightGCN model and BPR training

This is a simple LightGCN implementation with BPR loss.


In [None]:
def scipy_coo_to_torch_sparse(coo: sparse.coo_matrix, device):
    """Convert a SciPy COO matrix into a coalesced float32 torch sparse COO tensor.

    Row/column indices are stored as int64 and the values as float32; both are
    moved to *device* before the sparse tensor is assembled. Coalescing merges
    duplicate coordinates by summing their values.
    """
    coords = np.stack((coo.row, coo.col)).astype(np.int64)
    values = coo.data.astype(np.float32)

    coords_t = torch.from_numpy(coords).to(device)
    values_t = torch.from_numpy(values).to(device)

    tensor = torch.sparse_coo_tensor(coords_t, values_t, size=coo.shape, device=device)
    return tensor.coalesce()

class LightGCN(nn.Module):
    """LightGCN-style embedding model over a joint user+item node set.

    The only parameters are one embedding table for users and one for items,
    both initialised from a normal distribution with std 0.01.
    """

    def __init__(self, n_users, n_items, emb_dim=64, n_layers=3):
        super().__init__()
        self.n_users, self.n_items = n_users, n_items
        self.emb_dim, self.n_layers = emb_dim, n_layers

        self.user_emb = nn.Embedding(n_users, emb_dim)
        self.item_emb = nn.Embedding(n_items, emb_dim)
        for table in (self.user_emb, self.item_emb):
            nn.init.normal_(table.weight, std=0.01)

    def propagate(self, A_norm_sp):
        """Return ``(user_vectors, item_vectors)`` after graph propagation.

        The stacked embeddings (users first, then items) are multiplied by
        *A_norm_sp* ``n_layers`` times; the result is the element-wise mean of
        the input embeddings and every propagated layer.
        """
        current = torch.cat((self.user_emb.weight, self.item_emb.weight), dim=0)
        layers = [current]
        for _ in range(self.n_layers):
            current = torch.sparse.mm(A_norm_sp, current)
            layers.append(current)

        combined = torch.stack(layers, dim=0).mean(dim=0)
        # The first n_users rows belong to users, the rest to items.
        return combined[: self.n_users], combined[self.n_users:]

def bpr_loss(u_vec, pos_vec, neg_vec):
    """Return the mean BPR loss ``-log sigmoid(<u, pos> - <u, neg>)`` over the batch.

    Each argument holds one embedding per row; scores are row-wise dot products.
    """
    pos_scores = (u_vec * pos_vec).sum(dim=1)
    neg_scores = (u_vec * neg_vec).sum(dim=1)
    margin = pos_scores - neg_scores
    return -F.logsigmoid(margin).mean()

# Use the GPU when one is available; everything below is placed on `device`.
device = "cuda" if torch.cuda.is_available() else "cpu"
A_torch = scipy_coo_to_torch_sparse(A_norm, device)

# Model and optimiser. weight_decay is applied by Adam to both embedding tables.
model = LightGCN(n_users, n_items, emb_dim=96, n_layers=3).to(device)
opt = torch.optim.Adam(model.parameters(), lr=3e-3, weight_decay=1e-6)

# user index -> set of item indices the user interacted with in the training frame.
user_pos = train_df.groupby("u")["i"].apply(lambda s: set(s.values)).to_dict()

# NOTE(review): `all_items` is not referenced anywhere else in the visible notebook.
all_items = np.arange(n_items)

# Lookup arrays derived from `user_pos`, built on first use and reused by every batch.
_SAMPLER_CACHE = {}


def _sampling_tables():
    """Return ``(users, positives)`` lookup arrays for the current ``user_pos``.

    ``users`` is an array of all keys of ``user_pos``; ``positives`` maps each
    key to an array of that user's positive items. The tables are rebuilt only
    when ``user_pos`` is rebound to a different object; in-place mutation of
    ``user_pos`` is not detected.
    """
    if _SAMPLER_CACHE.get("source") is not user_pos:
        _SAMPLER_CACHE["users"] = np.array(list(user_pos.keys()))
        _SAMPLER_CACHE["positives"] = {
            u: np.array(list(items)) for u, items in user_pos.items()
        }
        _SAMPLER_CACHE["source"] = user_pos
    return _SAMPLER_CACHE["users"], _SAMPLER_CACHE["positives"]


def sample_batch(batch_size=4096):
    """Draw ``batch_size`` BPR training triples.

    Users are sampled uniformly with replacement from the keys of ``user_pos``.
    For each user, one positive item is drawn uniformly from that user's set
    and one negative item is drawn uniformly from ``[0, n_items)`` by rejection
    sampling against the same set.

    The sequence of ``np.random`` calls is the same as when the lists were
    rebuilt on every call; only the repeated set-to-list conversions are gone.

    Returns:
        ``(us, pos, neg)``: three aligned 1-D index arrays of length ``batch_size``.

    Raises:
        ValueError: If a sampled user has interacted with every item, so no
            negative exists (previously this looped forever).
    """
    users, positives = _sampling_tables()

    us = np.random.choice(users, size=batch_size, replace=True)
    pos = np.empty(batch_size, dtype=np.int64)
    neg = np.empty(batch_size, dtype=np.int64)

    for t, u in enumerate(us):
        pos[t] = np.random.choice(positives[u])

        seen = user_pos[u]
        if len(seen) >= n_items:
            raise ValueError(f"user index {u} has interacted with every item; no negative to sample")
        while True:
            j = np.random.randint(0, n_items)
            if j not in seen:
                neg[t] = j
                break
    return us, pos, neg

# Training budget: EPOCHS passes of STEPS_PER_EPOCH sampled mini-batches each.
EPOCHS = 5
STEPS_PER_EPOCH = 300

for epoch in range(1, EPOCHS + 1):
    model.train()
    total = 0.0  # running sum of batch losses, averaged in the epoch report

    for _ in tqdm(range(STEPS_PER_EPOCH), desc=f"epoch {epoch}"):
        # Sample (user, positive, negative) index triples and move them to the model device.
        us, pos, neg = sample_batch()
        us = torch.from_numpy(us).to(device)
        pos = torch.from_numpy(pos).to(device)
        neg = torch.from_numpy(neg).to(device)

        # Propagation over the whole graph is recomputed on every step, so the
        # gathered vectors always reflect the current embedding parameters.
        users_out, items_out = model.propagate(A_torch)
        u_vec = users_out[us]
        p_vec = items_out[pos]
        n_vec = items_out[neg]

        loss = bpr_loss(u_vec, p_vec, n_vec)

        # Standard optimiser step: clear old gradients, backpropagate, update.
        opt.zero_grad()
        loss.backward()
        opt.step()

        total += float(loss.item())

    print(f"epoch {epoch} loss={total / STEPS_PER_EPOCH:.4f}")


## Score the 200 candidates

We compute dot-product scores for each (user, edition) candidate pair.


In [None]:
@torch.no_grad()
def score_candidates(model, A_torch, candidates_df: pd.DataFrame):
    """Score every (user_id, edition_id) row of *candidates_df* by dot product.

    Args:
        model: Object exposing ``eval()`` and ``propagate(A_torch)`` that
            returns ``(user_vectors, item_vectors)``.
        A_torch: Propagation matrix passed straight to ``model.propagate``.
        candidates_df: Frame with ``user_id`` and ``edition_id`` columns.

    Returns:
        A new frame with columns ``user_id``, ``edition_id`` and ``score``,
        row-aligned with *candidates_df*.

    Raises:
        ValueError: If any ``user_id`` / ``edition_id`` is missing from
            ``user2idx`` / ``item2idx``.
    """
    model.eval()
    users_out, items_out = model.propagate(A_torch)

    user_pos_idx = candidates_df["user_id"].map(user2idx)
    item_pos_idx = candidates_df["edition_id"].map(item2idx)

    # Fail with a readable message instead of an opaque NaN-to-int cast error.
    n_bad_users = int(user_pos_idx.isna().sum())
    n_bad_items = int(item_pos_idx.isna().sum())
    if n_bad_users or n_bad_items:
        raise ValueError(
            f"candidates contain ids without an index mapping: "
            f"{n_bad_users} user_id rows, {n_bad_items} edition_id rows"
        )

    # Index on the device the embeddings actually live on rather than relying
    # on a module-level `device` variable staying in sync with the model.
    emb_device = users_out.device
    u_idx = torch.from_numpy(user_pos_idx.to_numpy(dtype=np.int64)).to(emb_device)
    i_idx = torch.from_numpy(item_pos_idx.to_numpy(dtype=np.int64)).to(emb_device)

    scores = (users_out[u_idx] * items_out[i_idx]).sum(dim=1).cpu().numpy()

    scored = candidates_df[["user_id", "edition_id"]].copy()
    scored["score"] = scores
    return scored

# Score all provided candidate pairs with the trained model and preview the result.
cand_scored = score_candidates(model, A_torch, candidates)
cand_scored.head()


## Genre-diversity rerank (greedy)

We rerank the scored candidates with a simple greedy MMR-like objective that adds genre coverage and intra-list diversity on top of the relevance score.


In [None]:
# edition_id -> book_id, and book_id -> set of genre ids.
ed2book = dict(zip(editions["edition_id"].values, editions["book_id"].values))
bg = book_genres.groupby("book_id")["genre_id"].apply(lambda s: set(s.values)).to_dict()

# edition_id -> genre set, resolved through the edition's book. An edition
# with no known book, or a book with no genres, maps to an empty set.
ed2genres = {ed: bg.get(ed2book.get(ed), set()) for ed in all_edition_ids}

def jaccard_dist(a: set, b: set) -> float:
    """Return the Jaccard distance ``1 - |a & b| / |a | b|`` between two sets.

    Two empty sets are treated as identical and yield 0.0.
    """
    if not a and not b:
        return 0.0
    # At least one set is non-empty here, so the union cannot be empty.
    return 1.0 - len(a & b) / len(a | b)


def rerank_diverse(df_user: pd.DataFrame, topk=20, lam=0.15, gamma=0.5, genres=None):
    """Greedily select up to *topk* editions, balancing score and genre diversity.

    In every round each not-yet-chosen candidate is valued as
    ``score + lam * (new + gamma * ild)`` where ``new`` is the fraction of the
    candidate's genres not covered by the items chosen so far and ``ild`` is
    its mean Jaccard distance to the chosen items (0.0 in the first round).
    The best candidate wins; ties go to the higher-scored one.

    Args:
        df_user: One user's candidates with ``edition_id`` and ``score`` columns.
        topk: Maximum number of editions to return.
        lam: Weight of the diversity bonus relative to the raw score.
        gamma: Weight of ``ild`` relative to ``new`` inside the bonus.
        genres: Optional mapping ``edition_id -> set of genres``. Defaults to
            the module-level ``ed2genres``.

    Returns:
        Edition ids in selection order. The list is shorter than *topk* when
        fewer distinct, selectable candidates exist; it is never padded with
        ``None``. Candidates whose score is NaN are never selected.
    """
    genre_of = ed2genres if genres is None else genres

    ranked = df_user.sort_values("score", ascending=False)
    eds = list(ranked["edition_id"].to_numpy())
    scores = [float(s) for s in ranked["score"].to_numpy()]
    cand_genres = [genre_of.get(ed, set()) for ed in eds]

    # Running sum of Jaccard distances from each candidate to the chosen items.
    # Updated once per round, in selection order, instead of recomputed from scratch.
    dist_sums = [0.0] * len(eds)

    chosen = []
    chosen_ids = set()
    chosen_genres = set()

    for _ in range(topk):
        n_chosen = len(chosen)
        best_pos = None
        best_val = -1e18

        for pos, ed in enumerate(eds):
            if ed in chosen_ids:
                continue

            g = cand_genres[pos]
            new = len(g - chosen_genres) / len(g) if g else 0.0
            ild = dist_sums[pos] / n_chosen if n_chosen else 0.0

            val = scores[pos] + lam * (new + gamma * ild)
            if val > best_val:
                best_val = val
                best_pos = pos

        if best_pos is None:
            # Candidates exhausted: stop rather than emit placeholder entries.
            break

        best = eds[best_pos]
        best_genres = cand_genres[best_pos]
        chosen.append(best)
        chosen_ids.add(best)
        chosen_genres |= best_genres

        for pos, g in enumerate(cand_genres):
            if eds[pos] not in chosen_ids:
                dist_sums[pos] += jaccard_dist(g, best_genres)

    return chosen

# Rerank each user's candidates and collect (user_id, edition_id, rank) rows, ranks starting at 1.
pred_rows = []
n_groups = cand_scored["user_id"].nunique()
for user_id, user_cands in tqdm(cand_scored.groupby("user_id"), total=n_groups):
    ranked = rerank_diverse(user_cands, topk=20, lam=0.15, gamma=0.5)
    pred_rows.extend(
        (user_id, edition, position)
        for position, edition in enumerate(ranked, start=1)
    )

submission = pd.DataFrame(pred_rows, columns=["user_id", "edition_id", "rank"])
print(submission.shape)
submission.head()


## Save submission


In [None]:
# Write the submission next to the notebook.
out_path = "submission.csv"
submission.to_csv(out_path, index=False)
print("saved", out_path)

# quick validation: every user must have exactly 20 rows, 20 distinct
# editions and 20 distinct ranks. Results are only printed, not enforced.
per_user = submission.groupby("user_id")
ok_20 = per_user.size().eq(20).all()
unique_ed = per_user["edition_id"].nunique().eq(20).all()
unique_rank = per_user["rank"].nunique().eq(20).all()
for label, passed in (
    ("20 rows per user", ok_20),
    ("unique edition_id", unique_ed),
    ("unique rank", unique_rank),
):
    print(label, passed)
