# Experiment: Baseline: Co-Read CF

Goal: build a simple baseline and produce `submission.csv`.


In [None]:
# Setup: imports and reproducibility
# If needed:
# pip install pandas numpy scipy tqdm

import os
import math
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

# Seed for the shared NumPy random generator used for sampling below.
SEED = 42
# One module-level generator so all sampling draws from the same seeded stream.
rng = np.random.default_rng(seed=SEED)


## Load data

In [None]:
DATA_DIR = "data"
SUBMIT_DIR = "submit"


def _load_csv(folder, filename, **read_kwargs):
    """Read ``folder/filename`` with pandas, forwarding extra ``read_csv`` options."""
    return pd.read_csv(os.path.join(folder, filename), **read_kwargs)


# Input tables read from DATA_DIR; event_ts is parsed as a datetime column.
users = _load_csv(DATA_DIR, "users.csv")
interactions = _load_csv(DATA_DIR, "interactions.csv", parse_dates=["event_ts"])
editions = _load_csv(DATA_DIR, "editions.csv")
book_genres = _load_csv(DATA_DIR, "book_genres.csv")

# Submission inputs read from SUBMIT_DIR.
targets = _load_csv(SUBMIT_DIR, "targets.csv")
candidates = _load_csv(SUBMIT_DIR, "candidates.csv")

# Print (rows, columns) of every table as a quick sanity check.
for _label, _frame in (
    ("users", users),
    ("interactions", interactions),
    ("editions", editions),
    ("book_genres", book_genres),
    ("targets", targets),
    ("candidates", candidates),
):
    print(_label, _frame.shape)


## ID mappings

In [None]:
# Collect every id that can show up downstream so each one gets a dense index.
# Candidate users are included as well: score_candidates() maps
# candidates["user_id"] through user2idx and casts the result to int64, which
# fails if a candidate user is missing from the mapping. Appending them last
# leaves the indices of all previously known users unchanged.
all_user_ids = pd.Index(
    pd.concat(
        [
            users["user_id"],
            interactions["user_id"],
            targets["user_id"],
            candidates["user_id"],
        ]
    ).unique()
)
all_edition_ids = pd.Index(
    pd.concat(
        [editions["edition_id"], interactions["edition_id"], candidates["edition_id"]]
    ).unique()
)

# raw id -> dense index (position in the unique-id Index), plus the inverse for items.
user2idx = {u: i for i, u in enumerate(all_user_ids)}
item2idx = {it: i for i, it in enumerate(all_edition_ids)}
idx2item = dict(enumerate(all_edition_ids))

n_users = len(all_user_ids)
n_items = len(all_edition_ids)

# Dense indices stored next to the raw ids; every interaction id is in the
# mapping by construction, so the int64 cast cannot hit a NaN here.
interactions["u"] = interactions["user_id"].map(user2idx).astype(np.int64)
interactions["i"] = interactions["edition_id"].map(item2idx).astype(np.int64)

print("n_users", n_users, "n_items", n_items)


## Interaction weights

In [None]:
def make_weight(df: pd.DataFrame) -> np.ndarray:
    """Return one float32 interaction weight per row of *df*.

    Rows with ``event_type == 1`` get weight 1.0 and rows with
    ``event_type == 2`` get 3.0 plus a bonus of up to 0.2 that is proportional
    to the rating (missing ratings count as 0, ratings are clipped to [0, 5]).
    Every other event type keeps weight 0.0.
    """
    event = df["event_type"]
    wish_rows = event == 1
    read_rows = event == 2

    # Rating rescaled to [0, 1]; NaN ratings contribute no bonus.
    rating_share = df["rating"].astype("float32").fillna(0.0).clip(0.0, 5.0) / 5.0

    weights = np.zeros(len(df), dtype=np.float32)
    weights[wish_rows] = 1.0
    weights[read_rows] = 3.0
    weights[read_rows] += 0.2 * rating_share[read_rows].to_numpy()
    return weights

# Store the per-event weight computed by make_weight() as column "w".
interactions["w"] = make_weight(interactions)
# Notebook display: preview the weight next to the fields it is derived from.
interactions[["user_id", "edition_id", "event_type", "rating", "w"]].head()


## Optional local time split (30 days)

In [None]:
DO_LOCAL_SPLIT = True

if not DO_LOCAL_SPLIT:
    # No hold-out: everything is training data.
    train_df = interactions.copy()
    val_df = None
else:
    interactions = interactions.sort_values(["u", "event_ts"])
    # Per-user cutoff: 30 days before that user's most recent event.
    max_ts = interactions.groupby("u")["event_ts"].transform("max")
    cutoff = max_ts - pd.Timedelta(days=30)

    # Events strictly before the cutoff are history; the rest are held out.
    # Two separate comparisons are kept on purpose so rows for which neither
    # comparison is true (e.g. a missing timestamp) land in neither frame.
    is_history = interactions["event_ts"] < cutoff
    is_holdout = interactions["event_ts"] >= cutoff
    train_df = interactions[is_history].copy()
    val_df = interactions[is_holdout].copy()

n_val_events = 0 if val_df is None else len(val_df)
print("train events", len(train_df), "val events", n_val_events)


## Local scorer

In [None]:
from collections import defaultdict

# edition_id -> book_id, taken row by row from the editions table.
ed2book = dict(zip(editions["edition_id"].values, editions["book_id"].values))

# book_id -> set of genre_ids attached to that book.
bg = (
    book_genres.groupby("book_id")["genre_id"]
    .apply(lambda genre_ids: set(genre_ids.values))
    .to_dict()
)

# edition_id -> genre set of its book. Editions with no known book, or whose
# book has no genres, map to an empty set.
ed2genres = {ed: bg.get(ed2book.get(ed, None), set()) for ed in all_edition_ids}

def build_relevance(val_df):
    """Build graded relevance labels from hold-out events.

    Returns a ``defaultdict(dict)`` mapping ``user_id -> {edition_id: grade}``.
    A (user, edition) pair gets grade 3 if any of its events has
    ``event_type == 2``, otherwise grade 1 if any has ``event_type == 1``.
    Pairs with neither event type get no entry. ``None`` or an empty frame
    yields an empty mapping.
    """
    rel = defaultdict(dict)
    if val_df is None or len(val_df) == 0:
        return rel

    events_per_pair = val_df.groupby(["user_id", "edition_id"])["event_type"]
    for (user, edition), events in events_per_pair:
        if events.eq(2).any():
            grade = 3
        elif events.eq(1).any():
            grade = 1
        else:
            # Neither event type present: leave the pair unlabeled.
            continue
        rel[user][edition] = grade
    return rel

def ndcg_at_20(ranked_items, rel_u, k=20):
    """Return NDCG@k (k defaults to 20) of one user's ranked list.

    Args:
        ranked_items: Edition ids ordered from best to worst.
        rel_u: Mapping ``edition_id -> relevance grade`` for this user; ids
            missing from the mapping count as grade 0.
        k: Cut-off applied to BOTH the ranked list and the ideal ranking.

    Returns:
        ``DCG@k / IDCG@k``, or 0.0 when the user has no relevant items.
    """
    # Only the first k positions may contribute. Without this cut a list longer
    # than k would collect gain past the cut-off while the ideal DCG below is
    # limited to k items.
    top_items = list(ranked_items)[:k]
    dcg = sum(
        rel_u.get(ed, 0) / math.log2(pos + 1)
        for pos, ed in enumerate(top_items, start=1)
    )

    # Ideal ordering: the user's k highest grades in descending order.
    ideal_rels = sorted(rel_u.values(), reverse=True)[:k]
    idcg = sum(r / math.log2(pos + 1) for pos, r in enumerate(ideal_rels, start=1))

    return 0.0 if idcg == 0 else dcg / idcg

def jaccard_dist(a, b):
    """Return the Jaccard distance ``1 - |a & b| / |a | b|`` between two sets.

    Two empty sets are defined to be at distance 0.0.
    """
    if a or b:
        # At least one set is non-empty, so the union cannot be empty.
        return 1.0 - len(a & b) / len(a | b)
    return 0.0

def diversity_at_20(ranked_items, rel_u):
    """Return the diversity score of one user's top-20 list.

    The score is ``0.5 * coverage + 0.5 * ild`` where

    * ``coverage`` is a position-discounted measure of how many previously
      unseen genres each relevant item adds, and
    * ``ild`` is the mean pairwise Jaccard distance between the genre sets of
      the relevant items (0.0 when fewer than two items are relevant).

    Args:
        ranked_items: Edition ids ordered from best to worst; only the first
            20 are used.
        rel_u: Mapping ``edition_id -> relevance grade``; an item is relevant
            when its grade is > 0.
    """
    # Only 20 position weights exist, so evaluate the top 20 only. A longer
    # list used to raise IndexError on w[k - 1].
    top_items = list(ranked_items)[:20]
    w = [1.0 / math.log2(k + 1) for k in range(1, 21)]

    rel_items = [ed for ed in top_items if rel_u.get(ed, 0) > 0]

    seen_genres = set()
    num = 0.0
    den = 0.0
    for weight, ed in zip(w, top_items):
        g = ed2genres.get(ed, set())
        if not g:
            # Items without genre data add nothing to either sum.
            continue
        if rel_u.get(ed, 0) > 0:
            new = len(g - seen_genres) / len(g)
            num += weight * new
            seen_genres |= g
        # NOTE(review): the numerator adds weight * (share of new genres) but
        # the denominator adds weight * len(g). Kept exactly as written;
        # confirm against the official metric definition.
        den += weight * len(g)
    coverage = 0.0 if den == 0 else num / den

    if len(rel_items) < 2:
        ild = 0.0
    else:
        rel_genres = [ed2genres.get(ed, set()) for ed in rel_items]
        dsum = 0.0
        cnt = 0
        for i, genres_i in enumerate(rel_genres):
            for genres_j in rel_genres[i + 1:]:
                dsum += jaccard_dist(genres_i, genres_j)
                cnt += 1
        ild = dsum / cnt if cnt > 0 else 0.0

    return 0.5 * coverage + 0.5 * ild

def score_submission(pred_df, val_df):
    """Score a prediction frame against hold-out events.

    Args:
        pred_df: Frame with columns ``user_id``, ``edition_id`` and ``rank``.
        val_df: Hold-out events passed to ``build_relevance`` (may be None).

    Returns:
        ``(ndcg, div, score)``: the mean NDCG@20 and mean diversity over the
        users present in *pred_df*, and ``0.7 * ndcg + 0.3 * div``.
    """
    rel = build_relevance(val_df)

    ndcgs = []
    divs = []
    # One groupby pass instead of filtering the whole frame once per user.
    # sort=False keeps users in first-appearance order.
    for u, user_rows in pred_df.groupby("user_id", sort=False):
        ranked = user_rows.sort_values("rank")["edition_id"].tolist()
        rel_u = rel.get(u, {})
        ndcgs.append(ndcg_at_20(ranked, rel_u))
        divs.append(diversity_at_20(ranked, rel_u))

    ndcg = float(np.mean(ndcgs)) if ndcgs else 0.0
    div = float(np.mean(divs)) if divs else 0.0
    score = 0.7 * ndcg + 0.3 * div
    return ndcg, div, score


## Build validation candidate pool (200 per user)

In [None]:
from collections import Counter

# Number of training events per item index, most popular first.
item_pop = train_df.groupby("i").size().sort_values(ascending=False)
# Item indices in descending popularity order.
item_pop_rank = item_pop.index.to_numpy()

# Training history as plain lists: user index -> item indices and the reverse.
user_items = train_df.groupby("u")["i"].apply(list).to_dict()
item_users = train_df.groupby("i")["u"].apply(list).to_dict()

# Item index -> genre set (same sets as ed2genres, re-keyed by dense index).
ed2genres_idx = {item2idx[ed]: ed2genres.get(ed, set()) for ed in all_edition_ids if ed in item2idx}

# Genre id -> item indices carrying that genre.
genre_to_items = {}
for item_idx in range(n_items):
    gset = ed2genres_idx.get(item_idx, set())
    for g in gset:
        genre_to_items.setdefault(g, []).append(item_idx)

# Order every genre bucket by training popularity (items never seen in
# training count as 0). The counts are copied into a plain dict first, so the
# sort key is a dict lookup instead of a pandas Series.get call per element.
# The resulting order is unchanged.
item_pop_count = item_pop.to_dict()
for g, items in genre_to_items.items():
    items.sort(key=lambda x: item_pop_count.get(x, 0), reverse=True)

def _add_candidates(cand, items, exclude, limit):
    """Add *items* not in *exclude* to *cand* until it holds *limit* entries.

    The size check happens BEFORE every insertion, so *cand* never grows past
    *limit* here. Returns True once *cand* has at least *limit* entries.
    """
    for it in items:
        if len(cand) >= limit:
            return True
        if it not in exclude:
            cand.add(it)
    return len(cand) >= limit


def build_val_candidates(val_df, per_user=200, pop_k=200, genre_k=200, coread_k=200,
                         max_seed_items=5, max_users_per_item=200, max_items_per_user=20):
    """Build a candidate pool of up to *per_user* editions for each validation user.

    Every pool starts from the user's hold-out editions (the positives) and is
    then filled, in this order, with items the user has not interacted with in
    training:

    1. the *pop_k* most popular training items,
    2. the *genre_k* most popular items of the user's 10 most frequent genres,
    3. items read by users who share one of up to *max_seed_items* training
       items with this user (at most *max_users_per_item* co-readers per seed
       and *max_items_per_user* items per co-reader),
    4. a random sample of the remaining training items.

    Args:
        val_df: Hold-out events with ``user_id`` and ``edition_id`` columns.
        per_user: Maximum number of candidates per user.
        pop_k: Number of top popular items considered in step 1.
        genre_k: Number of items considered per genre in step 2.
        coread_k: Accepted for interface compatibility; currently unused.
        max_seed_items: Number of the user's training items used as seeds.
        max_users_per_item: Cap on co-readers sampled per seed item.
        max_items_per_user: Cap on items taken from each co-reader.

    Returns:
        DataFrame with columns ``user_id`` and ``edition_id``.
    """
    val_users = val_df["user_id"].unique()
    val_pairs = val_df.groupby("user_id")["edition_id"].apply(set).to_dict()

    rows = []
    for u in tqdm(val_users, desc="build val candidates"):
        u_idx = user2idx[u]
        train_items = set(user_items.get(u_idx, []))
        positives = {item2idx[ed] for ed in val_pairs.get(u, set()) if ed in item2idx}

        cand = set(positives)

        # 1) Globally popular items.
        full = _add_candidates(cand, item_pop_rank[:pop_k], train_items, per_user)

        # 2) Popular items from the user's most frequent training genres.
        if not full:
            gcount = Counter()
            for it in list(train_items)[:200]:
                gcount.update(ed2genres_idx.get(it, set()))
            for g, _ in gcount.most_common(10):
                genre_items = genre_to_items.get(g, [])[:genre_k]
                if _add_candidates(cand, genre_items, train_items, per_user):
                    full = True
                    break

        # 3) Items of users who co-read one of the user's seed items.
        if not full:
            for it in list(train_items)[:max_seed_items]:
                # list() keeps sampling valid even if item_users holds sets
                # (a later cell rebinds that global to a dict of sets).
                co_users = list(item_users.get(it, []))
                if len(co_users) > max_users_per_item:
                    co_users = rng.choice(co_users, size=max_users_per_item, replace=False)
                for v in co_users:
                    v_items = user_items.get(v, [])[:max_items_per_user]
                    if _add_candidates(cand, v_items, train_items, per_user):
                        full = True
                        break
                if full:
                    break

        # 4) Random fill. Items already in the pool are excluded so the sample
        #    really adds `need` new candidates whenever enough are available.
        need = per_user - len(cand)
        if need > 0:
            pool = [it for it in item_pop_rank if it not in train_items and it not in cand]
            if pool:
                extra = rng.choice(pool, size=min(need, len(pool)), replace=False)
                cand.update(extra.tolist())

        # The pool exceeds per_user only when the positives alone do. Sorting
        # by item index makes the row order deterministic and independent of
        # which candidates are positives.
        rows.extend((u, idx2item[it]) for it in sorted(cand)[:per_user])

    return pd.DataFrame(rows, columns=["user_id", "edition_id"])

# Build the validation pool only when the local split produced hold-out events.
if val_df is None or len(val_df) == 0:
    val_candidates = None
    print("val_df is empty; cannot build validation candidates.")
else:
    val_candidates = build_val_candidates(val_df, per_user=200)
    print(val_candidates.head())
    print("val candidates rows", len(val_candidates))


## Diversity rerank helper

In [None]:
def jaccard_dist(a: set, b: set) -> float:
    """Jaccard distance between two sets; two empty sets are at distance 0.0."""
    if not (a or b):
        return 0.0
    # From here on at least one set is non-empty, so the union size is > 0.
    shared = len(a & b)
    total = len(a | b)
    return 1.0 - shared / total

def rerank_diverse(df_user: pd.DataFrame, topk=20, lam=0.15, gamma=0.5):
    """Greedily pick up to *topk* editions, trading score against genre diversity.

    At every step the not-yet-chosen edition with the highest value of
    ``score + lam * (new + gamma * ild)`` is selected, where ``new`` is the
    share of the edition's genres not covered by the items chosen so far and
    ``ild`` is its mean Jaccard distance to the genre sets of those items.

    Args:
        df_user: One user's candidates with ``edition_id`` and ``score`` columns.
        topk: Maximum number of editions to return.
        lam: Weight of the diversity bonus.
        gamma: Weight of ``ild`` inside the bonus.

    Returns:
        List of edition ids, best first. It is shorter than *topk* when the
        user has fewer usable candidates. Selection stops there, where it
        used to pad the list with ``None`` entries.
    """
    # NOTE(review): the bonus is at most lam * (1 + gamma) and is added to the
    # raw score. If scores are raw co-read counts it mostly breaks ties;
    # confirm that this is intended.
    items = df_user.sort_values("score", ascending=False)[["edition_id", "score"]].to_records(index=False)

    chosen = []
    chosen_genres = set()

    for _ in range(topk):
        found = False
        best = None
        best_val = -1e18

        for ed, s in items:
            if ed in chosen:
                continue

            g = ed2genres.get(ed, set())
            new = len(g - chosen_genres) / len(g) if g else 0.0

            if chosen:
                dsum = 0.0
                for prev in chosen:
                    dsum += jaccard_dist(g, ed2genres.get(prev, set()))
                ild = dsum / len(chosen)
            else:
                ild = 0.0

            val = float(s) + lam * (new + gamma * ild)
            if val > best_val:
                best_val = val
                best = ed
                found = True

        if not found:
            # Candidates exhausted (or none with a comparable score): stop.
            break

        chosen.append(best)
        chosen_genres |= ed2genres.get(best, set())

    return chosen


## Baseline scoring

In [None]:
# Co-read score inputs: a candidate is scored by the number of training users
# it shares with each of the user's seed items.
#
# NOTE: these assignments rebind the module-level `item_users` (values become
# sets instead of lists) and `user_items`; code run after this cell sees the
# new objects.

# item index -> set of user indices that interacted with it in train_df
item_users = {
    item: set(user_col.values)
    for item, user_col in train_df.groupby("i")["u"]
}
# user index -> list of item indices, in train_df row order
user_items = {
    user: list(item_col)
    for user, item_col in train_df.groupby("u")["i"]
}

# Number of leading training items per user used as seeds when scoring.
MAX_SEED_ITEMS = 5


def score_candidates(cand_df):
    """Attach a co-read score to every (user_id, edition_id) candidate row.

    For each row the score is the sum, over the user's first MAX_SEED_ITEMS
    training items, of the number of training users shared between the
    candidate and that seed item. Users without training items, and candidates
    nobody interacted with in training, score 0.0.

    An id that is missing from user2idx / item2idx maps to NaN, and the int64
    cast then fails.

    Returns:
        A copy of the input restricted to ``user_id``, ``edition_id`` and a
        float32 ``score`` column.
    """
    scored = cand_df.copy()
    scored["u"] = scored["user_id"].map(user2idx).astype(np.int64)
    scored["i"] = scored["edition_id"].map(item2idx).astype(np.int64)

    def coread_overlap(user_idx, item_idx):
        seeds = user_items.get(user_idx, [])[:MAX_SEED_ITEMS]
        cand_users = item_users.get(item_idx, set())
        if not seeds or not cand_users:
            return 0.0
        total = 0.0
        for seed in seeds:
            seed_users = item_users.get(seed, set())
            if seed_users:
                total += len(cand_users & seed_users)
        return total

    overlaps = [
        coread_overlap(user_idx, item_idx)
        for user_idx, item_idx in zip(scored["u"].values, scored["i"].values)
    ]
    scored["score"] = np.array(overlaps, dtype=np.float32)
    return scored[["user_id", "edition_id", "score"]]

# Score the provided submission candidates with the co-read baseline.
cand_scored = score_candidates(candidates)


## Build submission

In [None]:
# Rerank every user's scored candidates and record (user, edition, rank) rows,
# with ranks starting at 1.
pred_rows = []
n_scored_users = cand_scored["user_id"].nunique()
for u, grp in tqdm(cand_scored.groupby("user_id"), total=n_scored_users):
    top_editions = rerank_diverse(grp, topk=20, lam=0.15, gamma=0.5)
    pred_rows.extend((u, ed, r) for r, ed in enumerate(top_editions, start=1))

submission = pd.DataFrame(pred_rows, columns=["user_id", "edition_id", "rank"])
submission.head()


## Save submission

In [None]:
out_path = "submission.csv"
submission.to_csv(out_path, index=False)
print("saved", out_path)

# Format checks, printed only (nothing is enforced): every user should have
# exactly 20 rows with 20 distinct editions and 20 distinct ranks.
rows_by_user = submission.groupby("user_id")
ok_20 = rows_by_user.size().eq(20).all()
unique_ed = rows_by_user["edition_id"].nunique().eq(20).all()
unique_rank = rows_by_user["rank"].nunique().eq(20).all()
for _check, _passed in (
    ("20 rows per user", ok_20),
    ("unique edition_id", unique_ed),
    ("unique rank", unique_rank),
):
    print(_check, _passed)


## Evaluate on validation candidates

In [None]:
# Run the same score -> rerank pipeline on the validation pool and report the
# local metrics.
if val_candidates is None:
    print("val_candidates is None; skip validation scoring.")
else:
    val_scored = score_candidates(val_candidates)

    pred_rows = []
    n_val_users = val_scored["user_id"].nunique()
    for u, grp in tqdm(val_scored.groupby("user_id"), total=n_val_users):
        top_editions = rerank_diverse(grp, topk=20, lam=0.15, gamma=0.5)
        pred_rows.extend((u, ed, r) for r, ed in enumerate(top_editions, start=1))

    val_submission = pd.DataFrame(pred_rows, columns=["user_id", "edition_id", "rank"])

    ndcg, div, score = score_submission(val_submission, val_df)
    print(f"local NDCG@20: {ndcg:.6f}")
    print(f"local Diversity@20: {div:.6f}")
    print(f"local Score: {score:.6f}")
