# H3 — Cold-Start Items
Hypothesis:

For cold-start items, models with content embeddings outperform CF-only models.

In [None]:
# ---------------------------
# 0) Installing dependencies (in one block)
# ---------------------------
# The lines starting with `!` are notebook shell escapes: they invoke pip from
# the notebook and are not valid syntax in a plain .py script.
!pip -q install -U pip setuptools wheel
# NOTE(review): `lightfm-next` is presumably the distribution that provides the
# `lightfm` module imported below — confirm.
!pip -q install lightfm-next tqdm

import numpy as np
import polars as pl
from huggingface_hub import hf_hub_download

from lightfm import LightFM
from scipy import sparse

from tqdm.auto import trange
import matplotlib.pyplot as plt

In [None]:
# ============================
# Work on ~10% of the users of the "up0.01_ir0.01" subsample so that RAM is not
# exhausted: that subsample is about 10 times larger than the
# "up0.001_ir0.001" one. The data is read through polars, i.e. we take a
# sample of the sample.
# ============================
# -------------------------
# 0) Parameters
# -------------------------
# Sampling / reproducibility.
SEED = 42
FRACTION_USERS = 0.10   # 10% of users from up0.01_ir0.01

# Which part of the dataset is used: weeks 0..24 for train, week 25 for validation.
subsample_name = "up0.01_ir0.01"
TRAIN_WEEKS = [*range(25)]
VAL_WEEK = 25

# Interaction columns selected from every parquet file.
NEEDED_COLS = ["user_id", "item_id", "timespent", "like", "share", "bookmark"]

# Number of leading embedding dimensions kept for each item.
content_embedding_size = 32

# -------------------------
# 1) Downloading files
# -------------------------
# One train parquet per week listed in TRAIN_WEEKS.
train_interactions_files = [
    f"subsamples/{subsample_name}/train/week_{week:02}.parquet"
    for week in TRAIN_WEEKS
]
# Kept as a one-element list so it can be concatenated with the other lists.
val_interactions_file = [
    f"subsamples/{subsample_name}/validation/week_{VAL_WEEK:02}.parquet"
]

metadata_files = [
    "metadata/users_metadata.parquet",
    "metadata/items_metadata.parquet",
    "metadata/item_embeddings.npz",
]

# Fetch every required file from the dataset repository into ./VK-LSVD.
for remote_path in [
    *train_interactions_files,
    *val_interactions_file,
    *metadata_files,
]:
    hf_hub_download(
        repo_id="deepvk/VK-LSVD",
        repo_type="dataset",
        filename=remote_path,
        local_dir="VK-LSVD",
    )

# -------------------------
# 2) We lazily read train and select 10% of users
# -------------------------
# Lazy scan over all train weeks; nothing is materialised until .collect().
train_lf = pl.concat([
    pl.scan_parquet(f"VK-LSVD/{file}").select(NEEDED_COLS)
    for file in train_interactions_files
])

# unique_users - a small single-column table (safe to build).
# It is sorted on purpose: `unique()` does not guarantee row order, while
# `sample(seed=...)` selects rows by position. Without a stable order the same
# SEED could pick a different set of users from run to run.
unique_users = (
    train_lf
    .select("user_id")
    .unique()
    .sort("user_id")
    .collect(engine="streaming")
)

# One row per distinct user, so the height is the user count; this avoids a
# second streaming pass over the train files just to count users.
n_users = unique_users.height
n_sample = max(1, int(n_users * FRACTION_USERS))

print("Unique users in subsample:", int(n_users))
print("Sampling users:", int(n_sample), f"({FRACTION_USERS*100:.1f}% users)")

# sampled_users — DataFrame with the selected user_id values
sampled_users = unique_users.sample(n=n_sample, seed=SEED)

# IMPORTANT: When joining with a LazyFrame, there must also be a LazyFrame on the right
sampled_users_lf = sampled_users.lazy()

# filter the train by these users (semi join keeps only the left-hand columns)
train_interactions = (
    train_lf
    .join(sampled_users_lf, on="user_id", how="semi")
    .collect(engine="streaming")
)

# -------------------------
# 3) Validation (filtering by the same users)
# -------------------------
# Lazy scan of the single validation week, restricted to the needed columns.
val_lf = pl.scan_parquet(f"VK-LSVD/{val_interactions_file[0]}").select(NEEDED_COLS)

# Keep only rows whose user_id is among the sampled train users.
val_interactions = val_lf.join(
    sampled_users_lf,
    on="user_id",
    how="semi",
).collect(engine="streaming")

# -------------------------
# 4) Metadata and embeddings (filter by items from train)
# -------------------------
# Items that occur in the filtered train interactions.
train_items = train_interactions.select("item_id").unique()
train_items_np = train_items["item_id"].to_numpy()

# users_metadata — a regular DataFrame, so a DataFrame <-> DataFrame join is fine here
users_metadata = pl.read_parquet("VK-LSVD/metadata/users_metadata.parquet").join(
    sampled_users, on="user_id", how="inner"
)

# Item metadata restricted to train items; only the duration is needed later.
items_metadata_full = pl.read_parquet("VK-LSVD/metadata/items_metadata.parquet")
items_metadata = items_metadata_full.join(train_items, on="item_id", how="inner")
items_duration = items_metadata.select(["item_id", "duration"])

# embeddings: read the id and vector arrays from the .npz archive
npz = np.load("VK-LSVD/metadata/item_embeddings.npz")
item_ids_all = npz["item_id"]
item_emb_all = npz["embedding"].astype(np.float32)

# Keep embeddings of train items only, truncated to the first
# `content_embedding_size` dimensions.
mask_items = np.isin(item_ids_all, train_items_np)
item_ids = item_ids_all[mask_items]
item_embeddings = item_emb_all[mask_items][:, :content_embedding_size]

# -------------------------
# 5) Final diagnostics
# -------------------------
# Each tuple is printed as one line, arguments separated by a space.
_diagnostics = [
    ("\nLoaded subsample:", subsample_name),
    ("Train rows (filtered):", train_interactions.height),
    ("Val rows (filtered):", val_interactions.height),
    ("Train users (sampled):", sampled_users.height),
    ("Train items:", train_items.height),
    ("Embeddings items:", len(item_ids), "Embedding dim:", item_embeddings.shape[1]),
    ("Users metadata rows:", users_metadata.height),
    ("Items metadata rows:", items_metadata.height),
]
for _row in _diagnostics:
    print(*_row)


📌 ur0.01_ir0.01 / up0.01_ir0.01:

ir → item-sparse ⇒ more cold-start items

Without this, cold-start items simply won't be evaluated.

In [3]:
# =========================
# Training and evaluation of H3 (Cold-start items)
# =========================

# ---------- 1) Positives ----------
def add_watch_ratio(df: pl.DataFrame, items_duration: pl.DataFrame) -> pl.DataFrame:
    """Attach a `watch_ratio` column (timespent / duration) to interactions.

    Only `user_id`, `item_id` and `timespent` are kept from `df`. The frame is
    inner-joined with `items_duration` on `item_id`, so interactions without a
    matching item row are dropped. Both operands are cast to Float32 before the
    division and the ratio is clipped to the range [0, 10].
    """
    ratio_expr = (
        pl.col("timespent").cast(pl.Float32) / pl.col("duration").cast(pl.Float32)
    ).clip(0.0, 10.0).alias("watch_ratio")

    interactions = df.select(["user_id", "item_id", "timespent"])
    with_duration = interactions.join(items_duration, on="item_id", how="inner")
    return with_duration.with_columns([ratio_expr])

# Minimum watch ratio for an interaction to count as a P1 positive.
WATCH_THR = 0.5

_PAIR_COLS = ["user_id", "item_id"]
# P1: watched at least WATCH_THR of the item's duration.
_watched_enough = pl.col("watch_ratio") >= WATCH_THR
# P2: any explicit signal (like, share or bookmark).
_explicit_signal = pl.col("like") | pl.col("share") | pl.col("bookmark")

train_w = add_watch_ratio(train_interactions, items_duration)
val_w = add_watch_ratio(val_interactions, items_duration)

train_pos_P1 = train_w.filter(_watched_enough).select(_PAIR_COLS)
val_pos_P1 = val_w.filter(_watched_enough).select(_PAIR_COLS)

train_pos_P2 = train_interactions.filter(_explicit_signal).select(_PAIR_COLS)
val_pos_P2 = val_interactions.filter(_explicit_signal).select(_PAIR_COLS)


# ---------- 2) mapping ----------
def build_mappings(train_pos: pl.DataFrame):
    """Create contiguous indices for the users and items found in `train_pos`.

    Returns a 4-tuple `(user_ids, item_ids_train, user2idx, item2idx)`: the two
    arrays hold the sorted unique raw ids, and each dict maps a raw id (as a
    Python int) to its position in the corresponding array.
    """
    def _sorted_unique(column):
        return train_pos.select(column).unique().sort(column)[column].to_numpy()

    user_ids = _sorted_unique("user_id")
    item_ids_train = _sorted_unique("item_id")

    user2idx = {int(raw_id): pos for pos, raw_id in enumerate(user_ids)}
    item2idx = {int(raw_id): pos for pos, raw_id in enumerate(item_ids_train)}
    return user_ids, item_ids_train, user2idx, item2idx


# ---------- 3) sparse interactions ----------
def to_sparse_matrix(pos_df: pl.DataFrame, user2idx, item2idx, n_users, n_items):
    """Build a binary user x item CSR matrix from (user_id, item_id) rows.

    Rows of `pos_df` whose user or item is missing from the mappings are
    ignored. Repeated pairs are summed by the COO -> CSR conversion, so the
    stored values are reset to 1.0 afterwards to keep the matrix binary.
    The result has shape `(n_users, n_items)` and dtype float32.
    """
    def _to_indices(column, mapping):
        # -1 marks ids that have no entry in the mapping.
        raw = pos_df[column].to_numpy()
        return np.fromiter(
            (mapping.get(int(value), -1) for value in raw),
            dtype=np.int32,
            count=len(raw),
        )

    rows = _to_indices("user_id", user2idx)
    cols = _to_indices("item_id", item2idx)

    known = (rows >= 0) & (cols >= 0)
    rows, cols = rows[known], cols[known]

    ones = np.ones(len(rows), dtype=np.float32)
    mat = sparse.coo_matrix((ones, (rows, cols)), shape=(n_users, n_items)).tocsr()

    # Duplicate pairs were summed above; convert back to binary form.
    if mat.nnz:
        mat.data.fill(1.0)
    return mat


# ---------- 4) cold-start items (H3) ----------
def get_cold_items(train_mat, top_quantile=0.1):
    """Select cold-start items by their number of train interactions.

    The per-item count is the number of stored entries in each column of
    `train_mat`. An item is cold when its count is less than or equal to the
    `top_quantile` quantile of all counts.

    Returns `(cold_idx, item_counts, thr)`: the column indices of cold items,
    the int32 count per item, and the threshold as a float.
    """
    item_counts = np.asarray(train_mat.getnnz(axis=0), dtype=np.int32)
    thr = float(np.quantile(item_counts, top_quantile))
    is_cold = item_counts <= thr
    return np.flatnonzero(is_cold), item_counts, thr


# ---------- 5) item_features from embeddings----------
def build_item_features(item_ids_train, item2idx, item_ids_emb, item_emb):
    """Turn dense item embeddings into a sparse item-feature matrix.

    Row `item2idx[item_id]` of the result holds that item's embedding, one
    column per embedding dimension. Items without an embedding keep an
    all-zero row, and zero-valued components are not stored. The result is a
    float32 CSR matrix of shape `(len(item_ids_train), item_emb.shape[1])`.
    """
    embedding_of = {
        int(raw_id): item_emb[pos].astype(np.float32)
        for pos, raw_id in enumerate(item_ids_emb)
    }
    n_items = len(item_ids_train)
    n_feat = item_emb.shape[1]

    rows, cols, data = [], [], []
    for raw_id, row in item2idx.items():
        vector = embedding_of.get(int(raw_id))
        if vector is None:
            continue
        # Each of the D embedding dimensions becomes one "feature" column;
        # only non-zero components are emitted as (row, col, value) triplets.
        nonzero = np.flatnonzero(vector != 0.0)
        rows.extend([row] * len(nonzero))
        cols.extend(nonzero.tolist())
        data.extend(vector[nonzero].tolist())

    return sparse.coo_matrix(
        (data, (rows, cols)),
        shape=(n_items, n_feat),
        dtype=np.float32,
    ).tocsr()


# ---------- 6) metrics ----------
def ndcg_at_k(recs, gt_set, k=10):
    """Binary-relevance NDCG over the first `k` recommendations.

    Every recommended item found in `gt_set` contributes `1 / log2(rank + 1)`
    (ranks start at 1). The ideal DCG assumes `min(len(gt_set), k)` hits at the
    top ranks. Returns 0.0 when the ideal DCG is zero.
    """
    gains = [
        1.0 / np.log2(position + 1)
        for position, item in enumerate(recs[:k], start=1)
        if item in gt_set
    ]
    n_ideal_hits = min(len(gt_set), k)
    ideal = sum(1.0 / np.log2(position + 1) for position in range(1, n_ideal_hits + 1))
    if ideal > 0:
        return sum(gains, 0.0) / ideal
    return 0.0

def recall_at_k(recs, gt_set, k=10):
    """Share of `gt_set` that appears among the first `k` recommendations.

    Returns 0.0 when `gt_set` is empty.
    """
    n_relevant = len(gt_set)
    if n_relevant == 0:
        return 0.0
    hits = len([item for item in recs[:k] if item in gt_set])
    return hits / n_relevant


def evaluate_model_cold_items(
    model,
    train_mat,
    val_mat,
    cold_item_idx,
    item_features=None,
    k=10,
    max_users=2000,
    seed=42,
    num_threads=4,
):
    """Evaluate top-k recommendations against cold-start validation items.

    At most `max_users` users are evaluated (drawn without replacement with a
    generator seeded by `seed` when there are more). A user is scored only if
    at least one of their validation items is in `cold_item_idx`; for such a
    user every item is scored with `model.predict`, items already present in
    the user's train row are pushed to the bottom, and the top-k list is
    compared with the user's cold validation items via `ndcg_at_k` and
    `recall_at_k` (module-level helpers).

    Args:
        model: object exposing `predict(user_ids, item_ids, item_features=...,
            num_threads=...)` that returns one score per (user, item) pair.
        train_mat: sparse user x item train matrix; rows are indexed and
            `.indices` is read.
        val_mat: sparse user x item validation matrix with the same indexing.
        cold_item_idx: array of item column indices treated as cold.
        item_features: forwarded unchanged to `model.predict`.
        k: length of the recommendation list.
        max_users: cap on the number of candidate users.
        seed: seed for the user sub-sampling.
        num_threads: forwarded to `model.predict`.

    Returns:
        `(mean_ndcg, mean_recall, n_evaluated_users)`; both means are 0.0 when
        no user qualified.
    """
    rng = np.random.default_rng(seed)
    n_users, n_items = train_mat.shape
    users = np.arange(n_users, dtype=np.int32)

    if len(users) > max_users:
        users = rng.choice(users, size=max_users, replace=False)

    cold_set = set(cold_item_idx.tolist())
    all_items = np.arange(n_items, dtype=np.int32)

    # np.argpartition needs kth < n_items, so never ask for more than exists.
    top_n = min(k, n_items)

    ndcgs, recalls = [], []
    for u in users:
        # Ground truth restricted to cold items; built once and reused below.
        gt_cold = {int(it) for it in val_mat[u].indices if it in cold_set}
        if not gt_cold:
            continue

        u_arr = np.full(len(all_items), int(u), dtype=np.int32)
        scores = model.predict(
            u_arr,
            all_items,
            item_features=item_features,
            num_threads=num_threads,
        )

        # Remove items already seen in train from the ranking.
        seen = train_mat[u].indices
        scores[seen] = -1e9

        if top_n < n_items:
            # Unordered top-n candidates; they are sorted right after.
            topk = np.argpartition(-scores, top_n)[:top_n]
        else:
            # k covers the whole catalogue: rank every item.
            topk = np.arange(n_items)
        topk = topk[np.argsort(-scores[topk])]

        recs = topk.tolist()
        ndcgs.append(ndcg_at_k(recs, gt_cold, k=k))
        recalls.append(recall_at_k(recs, gt_cold, k=k))

    return (
        float(np.mean(ndcgs)) if ndcgs else 0.0,
        float(np.mean(recalls)) if recalls else 0.0,
        len(ndcgs),
    )
def train_with_history_h3(
    train_mat,
    val_mat,
    cold_item_idx,
    item_features=None,
    loss="bpr",
    no_components=64,
    lr=0.05,
    epochs=5,
    k=10,
    max_users_eval=2000,
    num_threads=4,
    seed=42,
):
    """Train a LightFM model one epoch at a time, evaluating after each epoch.

    After every `fit_partial` call the model is scored with
    `evaluate_model_cold_items` on the cold-start items. The model is treated
    as hybrid when `item_features` is given and as CF-only otherwise.

    Returns:
        `(model, hist)` where `hist` is a list with one dict per epoch holding
        the keys `epoch`, `ndcg`, `recall` and `n_eval`.
    """
    model = LightFM(
        loss=loss,
        no_components=no_components,
        learning_rate=lr,
        random_state=seed,
    )

    desc = "CF-only" if item_features is None else "Hybrid"

    # Evaluation settings are the same for every epoch.
    eval_kwargs = dict(
        item_features=item_features,
        k=k,
        max_users=max_users_eval,
        seed=seed,
        num_threads=num_threads,
    )

    hist = []
    for ep in trange(1, epochs + 1, desc=f"Training {desc}"):
        model.fit_partial(
            train_mat,
            item_features=item_features,
            epochs=1,
            num_threads=num_threads,
        )
        ndcg, rec, n_eval = evaluate_model_cold_items(
            model, train_mat, val_mat, cold_item_idx, **eval_kwargs
        )
        hist.append(dict(epoch=ep, ndcg=ndcg, recall=rec, n_eval=n_eval))

    return model, hist


def run_h3_experiment(
    train_pos,
    val_pos,
    label,
    cold_quantile=0.1,
    no_components=64,
    epochs=5,
    lr=0.05,
    k=10,
    max_users_eval=2000,
    num_threads=4,
    seed=42,
):
    """Run one H3 comparison: CF-only vs. hybrid (CF + content) on cold items.

    Builds id mappings and sparse train/validation matrices from the positive
    pairs, selects cold items with `get_cold_items`, builds item features from
    the module-level `item_ids` / `item_embeddings`, trains both model
    variants with `train_with_history_h3`, and prints and returns the metrics
    of the last epoch.

    Args:
        train_pos: frame of positive (user_id, item_id) pairs used for training.
        val_pos: frame of positive (user_id, item_id) pairs used for validation;
            pairs whose user or item is absent from train are dropped.
        label: text printed in the experiment header and echoed in the result.
        cold_quantile: quantile passed to `get_cold_items`.
        no_components, epochs, lr, k, max_users_eval, num_threads, seed:
            forwarded to `train_with_history_h3`.

    Returns:
        dict with `label`, `cold_quantile`, `ndcg_cf`, `ndcg_h`, `delta_ndcg`,
        `n_cold_items`, plus `recall_cf`, `recall_h` and `n_eval` (number of
        users evaluated for the CF-only model in the last epoch).
    """
    print(f"\n================ H3 (Cold-start Items): {label} ================")

    user_ids, item_ids_train, user2idx, item2idx = build_mappings(train_pos)
    n_users, n_items = len(user_ids), len(item_ids_train)

    # Validation pairs are only usable if both ids have a train index.
    val_pos_f = val_pos.filter(
        pl.col("user_id").is_in(pl.Series(user_ids)) &
        pl.col("item_id").is_in(pl.Series(item_ids_train))
    )

    train_mat = to_sparse_matrix(train_pos, user2idx, item2idx, n_users, n_items)
    val_mat   = to_sparse_matrix(val_pos_f, user2idx, item2idx, n_users, n_items)

    print("train:", train_mat.shape, "nnz:", train_mat.nnz)
    print("val:  ", val_mat.shape, "nnz:", val_mat.nnz)

    cold_idx, _, thr = get_cold_items(train_mat, top_quantile=cold_quantile)
    print("cold items threshold (<=):", thr)
    print("cold items:", len(cold_idx), "/", n_items)

    # Content features come from the embeddings loaded at module level.
    item_features = build_item_features(
        item_ids_train, item2idx, item_ids, item_embeddings
    )
    print("item_features:", item_features.shape)

    # Both variants share every setting except the item features.
    shared = dict(
        no_components=no_components,
        lr=lr,
        epochs=epochs,
        k=k,
        max_users_eval=max_users_eval,
        num_threads=num_threads,
        seed=seed,
    )

    # CF-only
    _, hist_cf = train_with_history_h3(
        train_mat, val_mat, cold_idx, item_features=None, **shared
    )

    # Hybrid
    _, hist_h = train_with_history_h3(
        train_mat, val_mat, cold_idx, item_features=item_features, **shared
    )

    ndcg_cf, rec_cf = hist_cf[-1]["ndcg"], hist_cf[-1]["recall"]
    ndcg_h,  rec_h  = hist_h[-1]["ndcg"],  hist_h[-1]["recall"]
    n_eval = hist_cf[-1]["n_eval"]
    delta = ndcg_h - ndcg_cf

    print(f"CF-only    | NDCG@{k}: {ndcg_cf:.5f} | Recall@{k}: {rec_cf:.5f}")
    print(f"CF+content | NDCG@{k}: {ndcg_h:.5f} | Recall@{k}: {rec_h:.5f}")
    print(f"ΔNDCG@{k}: {delta:+.5f}")

    if n_eval == 0:
        # Both metrics are 0.0 by construction here, so a verdict would be
        # meaningless.
        print("H3 is inconclusive: no evaluated user has cold-start items in validation.")
    elif delta > 0:
        print("H3 is confirmed: content is critical for cold-start items.")
    else:
        print("H3 is weakening: the content did not yield the expected results.")

    return {
        "label": label,
        "cold_quantile": cold_quantile,
        "ndcg_cf": ndcg_cf,
        "ndcg_h": ndcg_h,
        "delta_ndcg": float(delta),
        "n_cold_items": int(len(cold_idx)),
        "recall_cf": rec_cf,
        "recall_h": rec_h,
        "n_eval": int(n_eval),
    }


In [4]:
# =========================
# H3 multi-seed setup
# =========================

# One full experiment is run per seed.
H3_SEEDS = list(range(41, 46))

# Keyword arguments shared by every run_h3_experiment call.
H3_CONFIG = {
    "cold_quantile": 0.1,        # 10% of the coldest items
    "no_components": 64,
    "epochs": 5,
    "lr": 0.05,
    "k": 10,
    "max_users_eval": 2000,
    "num_threads": 4,
}

# Prefix of the label printed for each run.
H3_LABEL = "H3 / Cold-start items"


In [None]:
# =========================
# Run H3 for multiple seeds
# =========================

# One result dict per seed, each tagged with the seed that produced it.
h3_results = []

for seed in H3_SEEDS:
    # P2 positives are used here; the P1 frames can be passed instead.
    res = run_h3_experiment(
        train_pos_P2,
        val_pos_P2,
        f"{H3_LABEL} | seed={seed}",
        seed=seed,
        **H3_CONFIG,
    )
    res.update(seed=seed)
    h3_results.append(res)


In [None]:
import pandas as pd

# Output column name -> key in each per-seed result dict.
_H3_TABLE_COLUMNS = {
    "seed": "seed",
    "ndcg_cf": "ndcg_cf",
    "ndcg_hybrid": "ndcg_h",
    "delta_ndcg": "delta_ndcg",
    "n_cold_items": "n_cold_items",
}

# One table row per seed.
h3_df = pd.DataFrame([
    {column: r[key] for column, key in _H3_TABLE_COLUMNS.items()}
    for r in h3_results
])

h3_df


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Line plot of the final NDCG per seed for both model variants.
fig, ax = plt.subplots(figsize=(7, 4))

# One x position per row of h3_df (i.e. per seed).
x = np.arange(len(h3_df))

ax.plot(x, h3_df["ndcg_cf"], marker="o", label="CF-only")
ax.plot(x, h3_df["ndcg_hybrid"], marker="o", label="CF + content")

# Label the x positions with the seed values.
ax.set_xticks(x)
ax.set_xticklabels(h3_df["seed"])
ax.set_xlabel("Seed")
# The cut-off is taken from the experiment config so the label cannot drift
# from the k that was actually evaluated.
ax.set_ylabel(f"NDCG@{H3_CONFIG['k']}")
ax.set_title("H3: Cold-start items — CF vs Hybrid")
ax.legend()
ax.grid(alpha=0.3)

plt.show()
