# Exploring H1 (Dense Mode):
Hypothesis: For active users, adding content embeddings does not improve NDCG@10 compared to CF-only and may worsen ranking due to signal conflict.

In [None]:
# ---------------------------
# 0) Installing dependencies (in one block)
# ---------------------------
!pip -q install -U pip setuptools wheel
!pip -q install lightfm-next tqdm

import numpy as np
import polars as pl
from huggingface_hub import hf_hub_download

from lightfm import LightFM
from scipy import sparse

from tqdm.auto import trange
import matplotlib.pyplot as plt

In [None]:
subsample_name = 'up0.001_ip0.001'
content_embedding_size = 32

# Weekly interaction logs: weeks 00-24 are used for training, week 25 for validation.
train_interactions_files = [f'subsamples/{subsample_name}/train/week_{i:02}.parquet'
                            for i in range(25)]
val_interactions_file = [f'subsamples/{subsample_name}/validation/week_25.parquet']

metadata_files = ['metadata/users_metadata.parquet',
                  'metadata/items_metadata.parquet',
                  'metadata/item_embeddings.npz']

# Fetch every required file of the dataset repo into the local VK-LSVD directory.
for file in (train_interactions_files + val_interactions_file + metadata_files):
    hf_hub_download(
        repo_id='deepvk/VK-LSVD', repo_type='dataset',
        filename=file, local_dir='VK-LSVD'
    )

# Lazily scan the weekly files and materialise them as one frame.
train_interactions = pl.concat([pl.scan_parquet(f'VK-LSVD/{file}')
                                for file in train_interactions_files]).collect(engine='streaming')

val_interactions = pl.read_parquet(f'VK-LSVD/{val_interactions_file[0]}')

train_users = train_interactions.select('user_id').unique()
train_items = train_interactions.select('item_id').unique()

# Open the archive once, pull both arrays out, and close the file handle afterwards
# (previously the archive was opened twice and never closed).
with np.load('VK-LSVD/metadata/item_embeddings.npz') as embeddings_archive:
    item_ids = embeddings_archive['item_id']
    item_embeddings = embeddings_archive['embedding']

# Keep only items that occur in train, and only the first `content_embedding_size` dimensions.
mask = np.isin(item_ids, train_items.to_numpy())
item_ids = item_ids[mask]
item_embeddings = item_embeddings[mask][:, :content_embedding_size]

# Restrict the metadata tables to users / items seen in train.
users_metadata = pl.read_parquet('VK-LSVD/metadata/users_metadata.parquet').join(train_users, on='user_id')
items_metadata = pl.read_parquet('VK-LSVD/metadata/items_metadata.parquet').join(train_items, on='item_id')

# Attach the (truncated) content embedding to the item metadata as an extra column.
items_metadata = items_metadata.join(
    pl.DataFrame({'item_id': item_ids, 'embedding': item_embeddings}),
    on='item_id'
)


In [None]:
# Summary statistics of the training interactions (displayed as the cell output).
train_interactions.describe()

In [None]:
# Summary statistics of the item metadata restricted to train items (displayed as the cell output).
items_metadata.describe()

Let's check the loaded volumes:

In [None]:
# Row / distinct-user / distinct-item counts for each split.
for split_name, split_df in (("train", train_interactions), ("val", val_interactions)):
    print(f"{split_name} rows:", split_df.height)
    for id_column, noun in (("user_id", "users"), ("item_id", "items")):
        print(f"{split_name} {noun}:", split_df.select(id_column).n_unique())

# Size of the content-embedding table after filtering to train items.
print("embeddings items:", len(item_ids))
print("embedding dim:", item_embeddings.shape[1])


47 million rows for 10,000 users is too large to analyze comfortably with pandas, so we'll stick with Polars to handle such volumes. The validation slice (week 25, which follows the 25 training weeks) contains approximately the same number of users (9,904). This means that almost all users from the training set are still present in validation, so there are few cold-start users. This should be taken into account.


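Positives P1/P2: P1 treats an interaction as positive when the watch ratio (time spent / video duration) reaches `WATCH_THR`; P2 treats it as positive when the user liked, shared, or bookmarked the item.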


In [None]:
# duration (for watch_ratio)
items_duration = items_metadata.select(["item_id", "duration"])

def add_watch_ratio(df: pl.DataFrame, items_duration: pl.DataFrame) -> pl.DataFrame:
    """Attach a ``watch_ratio`` column (timespent / duration) to interactions.

    Args:
        df: interactions; the columns ``user_id``, ``item_id`` and ``timespent`` are used.
        items_duration: table with ``item_id`` and ``duration`` columns.

    Returns:
        A frame with the selected interaction columns, the joined ``duration`` and
        ``watch_ratio`` clipped to the range [0, 10]. Interactions whose item has
        no row in ``items_duration`` are dropped by the inner join.
    """
    time_spent = pl.col("timespent").cast(pl.Float32)
    duration = pl.col("duration").cast(pl.Float32)
    ratio_expr = (time_spent / duration).clip(0.0, 10.0).alias("watch_ratio")

    interactions = df.select(["user_id", "item_id", "timespent"])
    with_duration = interactions.join(items_duration, on="item_id", how="inner")
    return with_duration.with_columns([ratio_expr])

# Interactions of both splits enriched with the watch ratio.
train_w, val_w = (
    add_watch_ratio(split, items_duration)
    for split in (train_interactions, val_interactions)
)

WATCH_THR = 0.5

_PAIR_COLUMNS = ["user_id", "item_id"]

# P1: an interaction is positive when the watch ratio reaches WATCH_THR.
_watched_enough = pl.col("watch_ratio") >= WATCH_THR
train_pos_P1 = train_w.filter(_watched_enough).select(_PAIR_COLUMNS)
val_pos_P1 = val_w.filter(_watched_enough).select(_PAIR_COLUMNS)

# P2: an interaction is positive when the user liked, shared or bookmarked the item.
_explicit_feedback = pl.col("like") | pl.col("share") | pl.col("bookmark")
train_pos_P2 = train_interactions.filter(_explicit_feedback).select(_PAIR_COLUMNS)
val_pos_P2 = val_interactions.filter(_explicit_feedback).select(_PAIR_COLUMNS)


Mapping + sparse matrices + active users

In [None]:
def build_mappings(train_pos: pl.DataFrame):
    """Build contiguous index mappings for the users and items of ``train_pos``.

    Returns:
        ``(user_ids, item_ids_train, user2idx, item2idx)`` where the two arrays hold
        the sorted distinct ids and the two dicts map each id (as ``int``) to its
        position in the corresponding array.
    """
    def _sorted_distinct(column):
        return train_pos.select(column).unique().sort(column)[column].to_numpy()

    def _positions(ids):
        return {int(raw_id): position for position, raw_id in enumerate(ids)}

    user_ids = _sorted_distinct("user_id")
    item_ids_train = _sorted_distinct("item_id")

    return user_ids, item_ids_train, _positions(user_ids), _positions(item_ids_train)

def to_sparse_matrix(pos_df: pl.DataFrame, user2idx, item2idx, n_users, n_items):
    """Turn (user_id, item_id) pairs into a binary CSR interaction matrix.

    Pairs whose user or item is missing from the mappings are dropped.
    Repeated pairs collapse into a single entry with value 1.0.

    Args:
        pos_df: frame with ``user_id`` and ``item_id`` columns.
        user2idx: dict mapping user id -> row index.
        item2idx: dict mapping item id -> column index.
        n_users: number of rows of the result.
        n_items: number of columns of the result.

    Returns:
        ``float32`` CSR matrix of shape ``(n_users, n_items)``.
    """
    def _encode(column, mapping):
        # Unknown ids are encoded as -1 so they can be filtered out below.
        raw_ids = pos_df[column].to_numpy()
        return np.fromiter(
            (mapping.get(int(raw_id), -1) for raw_id in raw_ids),
            dtype=np.int32,
            count=len(raw_ids),
        )

    row_idx = _encode("user_id", user2idx)
    col_idx = _encode("item_id", item2idx)

    known = ~((row_idx < 0) | (col_idx < 0))
    row_idx, col_idx = row_idx[known], col_idx[known]

    ones = np.ones(len(row_idx), dtype=np.float32)
    mat = sparse.coo_matrix((ones, (row_idx, col_idx)), shape=(n_users, n_items)).tocsr()

    # Duplicate pairs were summed during the CSR conversion; reset them to 0/1.
    if mat.nnz > 0:
        mat.data[:] = 1.0

    return mat

def get_active_users(train_mat, top_quantile=0.7):
    """Select users whose number of train interactions reaches a quantile threshold.

    Args:
        train_mat: sparse user x item matrix.
        top_quantile: quantile of the per-user interaction counts used as threshold.

    Returns:
        ``(active_idx, user_counts, thr)``: row indices with ``count >= thr``, the
        per-user stored-entry counts (``int32``) and the threshold as ``float``.
    """
    interactions_per_user = np.asarray(train_mat.getnnz(axis=1)).astype(np.int32)
    threshold = float(np.quantile(interactions_per_user, top_quantile))
    is_active = interactions_per_user >= threshold
    return np.flatnonzero(is_active), interactions_per_user, threshold


Item features from content embeddings

In [None]:
def build_item_features(item_ids_train, item2idx, item_ids_emb, item_emb):
    """Build a sparse item x feature matrix from dense content embeddings.

    Row ``item2idx[item_id]`` holds that item's embedding (cast to ``float32``);
    exact zeros are not stored. Items without an embedding get an empty row.
    If an id occurs several times in ``item_ids_emb``, the last occurrence wins.

    Args:
        item_ids_train: ids of the train items; only its length is used (number of rows).
        item2idx: dict mapping item id -> row index of the result.
        item_ids_emb: ids aligned with the rows of ``item_emb``.
        item_emb: 2-D array of shape ``(len(item_ids_emb), n_feat)``.

    Returns:
        ``float32`` CSR matrix of shape ``(len(item_ids_train), n_feat)``.
    """
    item_emb = np.asarray(item_emb)
    n_items = len(item_ids_train)
    n_feat = item_emb.shape[1]

    # id -> row of item_emb; a plain dict keeps "last occurrence wins" semantics.
    emb_row_of = {int(item_id): row for row, item_id in enumerate(item_ids_emb)}

    target_rows, source_rows = [], []
    for item_id, item_idx in item2idx.items():
        source_row = emb_row_of.get(int(item_id))
        if source_row is not None:
            target_rows.append(item_idx)
            source_rows.append(source_row)

    if not target_rows:
        # No train item has an embedding: return an explicitly empty matrix.
        return sparse.csr_matrix((n_items, n_feat), dtype=np.float32)

    # Gather all matched embeddings at once instead of looping over every
    # (item, feature) pair in Python.
    matched = item_emb[np.asarray(source_rows, dtype=np.int64)].astype(np.float32)
    nz_row, nz_col = np.nonzero(matched)
    out_rows = np.asarray(target_rows, dtype=np.int64)[nz_row]

    feats = sparse.coo_matrix(
        (matched[nz_row, nz_col], (out_rows, nz_col)),
        shape=(n_items, n_feat),
        dtype=np.float32
    ).tocsr()

    return feats


Metrics

In [None]:
def ndcg_at_k(recs, gt_set, k=10):
    """Binary-relevance NDCG@k of a ranked list against a set of relevant items.

    Args:
        recs: ranked item ids, best first.
        gt_set: set of relevant item ids.
        k: cut-off.

    Returns:
        DCG of the first ``k`` recommendations divided by the ideal DCG for
        ``min(len(gt_set), k)`` hits; ``0.0`` when the ideal DCG is zero.
    """
    def _discount(position):
        return 1.0 / np.log2(position + 1)

    hit_gains = [
        _discount(position)
        for position, item in enumerate(recs[:k], start=1)
        if item in gt_set
    ]
    best_possible = sum(_discount(position) for position in range(1, min(len(gt_set), k) + 1))

    if best_possible > 0:
        return sum(hit_gains, 0.0) / best_possible
    return 0.0

def recall_at_k(recs, gt_set, k=10):
    """Share of the relevant items that appear among the first ``k`` recommendations.

    Returns ``0.0`` when ``gt_set`` is empty. The denominator is ``len(gt_set)``,
    not ``min(k, len(gt_set))``.
    """
    n_relevant = len(gt_set)
    if not n_relevant:
        return 0.0
    hits = len([item for item in recs[:k] if item in gt_set])
    return hits / n_relevant


Model structure

In [None]:
# =========================
# - evaluate_model
# - plot_history
# - train_lightfm_with_history
# - run_h1_experiment (different seeds)
# =========================
def evaluate_model(model, train_mat, val_mat, user_idx, item_features=None,
                   k=10, max_users=2000, seed=42, num_threads=4):
    """Compute mean NDCG@k and Recall@k of ``model`` over a set of users.

    For every evaluated user all items are scored, items already present in
    ``train_mat`` for that user are pushed to the bottom of the ranking, and the
    top-``k`` items are compared with the user's row of ``val_mat``. Users without
    validation items are skipped.

    Args:
        model: object exposing ``predict(user_ids, item_ids, item_features=..., num_threads=...)``.
        train_mat: CSR user x item matrix of train interactions.
        val_mat: CSR user x item matrix of validation interactions (same shape).
        user_idx: row indices of the users to evaluate.
        item_features: passed through to ``model.predict``.
        k: ranking cut-off.
        max_users: if more users are given, a random subset of this size is evaluated.
        seed: seed of the user subsampling.
        num_threads: passed through to ``model.predict``.

    Returns:
        ``(mean_ndcg, mean_recall, n_evaluated_users)``; ``(0.0, 0.0, 0)`` when no
        user could be evaluated.
    """
    rng = np.random.default_rng(seed)

    users = np.array(user_idx, dtype=np.int32)
    if len(users) == 0:
        return 0.0, 0.0, 0

    if len(users) > max_users:
        users = rng.choice(users, size=max_users, replace=False)

    n_items = train_mat.shape[1]
    all_items = np.arange(n_items, dtype=np.int32)
    # np.argpartition requires kth < n_items, so never ask for more items than exist.
    top_n = min(k, n_items)

    ndcgs, recalls = [], []

    for u in users:
        gt_items = val_mat[u].indices
        if len(gt_items) == 0:
            continue

        # LightFM.predict requires: len(user_ids) == len(item_ids)
        u_arr = np.full(n_items, int(u), dtype=np.int32)
        scores = model.predict(
            u_arr, all_items,
            item_features=item_features,
            num_threads=num_threads
        )

        # exclude already seen in train
        seen = train_mat[u].indices
        scores[seen] = -1e9

        if top_n < n_items:
            topk = np.argpartition(-scores, top_n)[:top_n]
        else:
            # k covers the whole catalogue: rank every item
            # (previously this case raised ValueError inside np.argpartition).
            topk = np.arange(n_items)
        topk = topk[np.argsort(-scores[topk])]

        gt_set = set(gt_items.tolist())
        ndcgs.append(ndcg_at_k(topk.tolist(), gt_set, k=k))
        recalls.append(recall_at_k(topk.tolist(), gt_set, k=k))

    if len(ndcgs) == 0:
        return 0.0, 0.0, 0

    return float(np.mean(ndcgs)), float(np.mean(recalls)), len(ndcgs)


def plot_history(hist, title_prefix=""):
    """Show one line chart per metric (``ndcg``, ``recall``) over the training epochs.

    Args:
        hist: list of dicts with the keys ``epoch``, ``ndcg`` and ``recall``.
        title_prefix: text prepended to every chart title.
    """
    epoch_axis = [entry["epoch"] for entry in hist]
    for metric in ("ndcg", "recall"):
        series = [entry[metric] for entry in hist]
        fig = plt.figure()
        ax = fig.add_subplot()
        ax.plot(epoch_axis, series)
        ax.set_xlabel("epoch")
        ax.set_ylabel(metric)
        ax.set_title(f"{title_prefix}{metric}@K by epoch")
        plt.show()


def train_lightfm_with_history(train_mat, val_mat, eval_users,
                               item_features=None, loss="bpr",
                               no_components=64, lr=0.05, epochs=5,
                               k=10, max_users=2000, num_threads=4,
                               seed=42, eval_seed=None):
    """Train a LightFM model one epoch at a time and evaluate it after every epoch.

    Args:
        train_mat: sparse user x item matrix of train interactions.
        val_mat: sparse user x item matrix of validation interactions.
        eval_users: list of user indices (active_idx or sparse_idx) to evaluate on.
        item_features: optional item feature matrix passed to LightFM; when given,
            the run is labelled "hybrid" in the progress bar, otherwise "cf".
        loss: LightFM loss name.
        no_components: dimensionality of the latent embeddings.
        lr: learning rate.
        epochs: number of training epochs.
        k: cut-off for NDCG@k / Recall@k.
        max_users: maximum number of users evaluated per epoch.
        num_threads: threads used for training and prediction.
        seed: model random_state (initialization + stochastic training).
        eval_seed: seed for subsampling users in evaluation (default = model seed).

    Returns:
        ``(model, hist)`` where ``hist`` holds one dict per epoch with the keys
        ``epoch``, ``ndcg``, ``recall`` and ``n_eval_users``.
    """
    if eval_seed is None:
        eval_seed = seed

    model = LightFM(
        loss=loss,
        no_components=no_components,
        learning_rate=lr,
        random_state=seed
    )

    hist = []
    desc = "hybrid" if item_features is not None else "cf"

    for ep in trange(1, epochs + 1, desc=f"Train ({desc})", leave=False):
        # fit_partial continues training from the current state, one epoch per call.
        model.fit_partial(
            train_mat,
            item_features=item_features,
            epochs=1,
            num_threads=num_threads
        )

        ndcg, rec, n_eval = evaluate_model(
            model, train_mat, val_mat, eval_users,
            item_features=item_features,
            k=k, max_users=max_users,
            seed=eval_seed, num_threads=num_threads
        )
        hist.append({"epoch": ep, "ndcg": ndcg, "recall": rec, "n_eval_users": n_eval})

    return model, hist


def run_h1_experiment(train_pos, val_pos, label,
                      no_components=64, epochs=5, lr=0.05, k=10,
                      max_users_eval=2000, num_threads=4,
                      seed=42, top_quantile=0.7,
                      plot=True):
    """
    H1: For active users (dense mode), content does not improve ranking (ΔNDCG <= 0)

    Trains a CF-only LightFM model and a hybrid (CF + content) model on the same
    interactions and compares their final-epoch NDCG@k on the active users.

    The hybrid item features are ``[identity | content embeddings]``. The identity
    part keeps one latent vector per item; passing the content embeddings alone
    would replace LightFM's implicit per-item features and turn the second model
    into a content-only item model rather than "CF + content".

    Reads the module-level ``item_ids`` / ``item_embeddings`` arrays.

    Args:
        train_pos: positive (user_id, item_id) pairs for training.
        val_pos: positive (user_id, item_id) pairs for validation.
        label: text used in the printed header and plot titles.
        no_components, epochs, lr: LightFM hyper-parameters.
        k: cut-off for NDCG@k / Recall@k.
        max_users_eval: maximum number of users evaluated per epoch.
        num_threads: threads for training and prediction.
        seed: model seed, also used for the evaluation subsample.
        top_quantile: quantile of per-user train counts that defines "active" users.
        plot: whether to plot the per-epoch metric curves.

    Returns:
        dict with both histories, the final metrics, ``delta_ndcg`` (hybrid − CF)
        and basic size statistics.
    """

    print(f"\n================ H1: {label} (seed={seed}) ================")

    user_ids, item_ids_train, user2idx, item2idx = build_mappings(train_pos)
    n_users, n_items = len(user_ids), len(item_ids_train)

    # filter validation for train mapping
    val_pos_f = val_pos.filter(
        pl.col("user_id").is_in(pl.Series(user_ids)) &
        pl.col("item_id").is_in(pl.Series(item_ids_train))
    )

    train_mat = to_sparse_matrix(train_pos, user2idx, item2idx, n_users, n_items)
    val_mat   = to_sparse_matrix(val_pos_f, user2idx, item2idx, n_users, n_items)

    print("train:", train_mat.shape, "nnz:", train_mat.nnz)
    print("val:  ", val_mat.shape,   "nnz:", val_mat.nnz)

    active_idx, _, thr = get_active_users(train_mat, top_quantile=top_quantile)
    print("active users threshold:", thr)
    print("active users:", len(active_idx), "/", n_users)

    # content features from global item_ids/item_embeddings
    content_features = build_item_features(item_ids_train, item2idx, item_ids, item_embeddings)
    # Prepend identity features so every item keeps its own collaborative embedding
    # in addition to the content dimensions.
    item_features = sparse.hstack(
        [sparse.identity(n_items, dtype=np.float32, format="csr"), content_features],
        format="csr",
        dtype=np.float32
    )
    print("item_features:", item_features.shape, "nnz:", item_features.nnz)

    # --- CF-only ---
    _, hist_cf = train_lightfm_with_history(
        train_mat, val_mat, active_idx,
        item_features=None,
        loss="bpr", no_components=no_components, lr=lr, epochs=epochs,
        k=k, max_users=max_users_eval, num_threads=num_threads,
        seed=seed, eval_seed=seed
    )

    # --- CF + content ---
    _, hist_h = train_lightfm_with_history(
        train_mat, val_mat, active_idx,
        item_features=item_features,
        loss="bpr", no_components=no_components, lr=lr, epochs=epochs,
        k=k, max_users=max_users_eval, num_threads=num_threads,
        seed=seed, eval_seed=seed
    )

    # Compare the metrics of the last epoch.
    ndcg_cf, rec_cf = hist_cf[-1]["ndcg"], hist_cf[-1]["recall"]
    ndcg_h,  rec_h  = hist_h[-1]["ndcg"],  hist_h[-1]["recall"]

    print(f"CF-only    | NDCG@{k}: {ndcg_cf:.5f} | Recall@{k}: {rec_cf:.5f}")
    print(f"CF+content | NDCG@{k}: {ndcg_h:.5f} | Recall@{k}: {rec_h:.5f}")

    delta = float(ndcg_h - ndcg_cf)
    print(f"ΔNDCG@{k}: {delta:+.5f}")

    if delta <= 0:
        print("H1 is confirmed: content does not improve rankings for active users.")
    else:
        print("H1 is not confirmed: content improves ranking.")

    if plot:
        plot_history(hist_cf, title_prefix=f"{label} | CF-only | ")
        plot_history(hist_h,  title_prefix=f"{label} | CF+content | ")

    return {
        "label": label,
        "seed": seed,
        "hist_cf": hist_cf,
        "hist_h": hist_h,
        "ndcg_cf": float(ndcg_cf),
        "rec_cf": float(rec_cf),
        "ndcg_h": float(ndcg_h),
        "rec_h": float(rec_h),
        "delta_ndcg": delta,
        "active_thr": float(thr),
        "n_active": int(len(active_idx)),
        "train_nnz": int(train_mat.nnz),
        "val_nnz": int(val_mat.nnz),
        "n_users": int(n_users),
        "n_items": int(n_items),
    }


**Launching the experiment itself**

In [None]:
# ============================================================
# H1: 42-SEED CHECK
# ============================================================
# Settings shared by both single-seed runs.
_single_seed_kwargs = {"epochs": 5, "k": 10, "seed": 42}

res_P1 = run_h1_experiment(
    train_pos_P1, val_pos_P1,
    label=f"P1: watch_ratio >= {WATCH_THR}",
    **_single_seed_kwargs
)

res_P2 = run_h1_experiment(
    train_pos_P2, val_pos_P2,
    label="P2: like OR share OR bookmark",
    **_single_seed_kwargs
)

# Cell output: (ΔNDCG for P1, ΔNDCG for P2)
tuple(result["delta_ndcg"] for result in (res_P1, res_P2))


In [None]:
# ============================================================
# H1: MULTI-SEED CHECK (sustainability of effect)
# ============================================================

import pandas as pd
import matplotlib.pyplot as plt

# Seeds of the repeated runs.
SEEDS = [43, 44, 45]
# Number of training epochs used by run_h1_multiseed.
EPOCHS = 5
# Ranking cut-off (NDCG@K / Recall@K) used by run_h1_multiseed.
K = 10


def run_h1_multiseed(train_pos, val_pos, base_label, seeds):
    """Repeat the H1 experiment for several seeds (plots disabled).

    Uses the module-level ``EPOCHS`` and ``K`` settings.

    Returns:
        ``(summary, histories_cf, histories_h)``: a pandas DataFrame with one row per
        seed (``seed``, ``ndcg_cf``, ``ndcg_h``, ``delta_ndcg``) and the per-seed
        training histories of the CF-only and hybrid models.
    """
    runs = [
        (
            s,
            run_h1_experiment(
                train_pos, val_pos,
                label=f"{base_label} | seed={s}",
                epochs=EPOCHS, k=K,
                seed=s,
                plot=False
            ),
        )
        for s in seeds
    ]

    summary_keys = ("ndcg_cf", "ndcg_h", "delta_ndcg")
    rows = [
        {"seed": s, **{key: outcome[key] for key in summary_keys}}
        for s, outcome in runs
    ]
    histories_cf = [outcome["hist_cf"] for _, outcome in runs]
    histories_h = [outcome["hist_h"] for _, outcome in runs]

    return pd.DataFrame(rows), histories_cf, histories_h


def _report_multiseed(tag, summary_df):
    """Print the per-seed table plus mean / std of ΔNDCG for one positives definition."""
    print(f"\n=== H1 {tag} multi-seed summary ===")
    display(summary_df)
    delta_column = summary_df["delta_ndcg"]
    print("ΔNDCG mean:", delta_column.mean(),
          "std:", delta_column.std())


def _plot_delta_by_seed(tag, summary_df):
    """Bar chart of ΔNDCG per seed with a zero reference line."""
    plt.figure()
    plt.bar(summary_df["seed"].astype(str), summary_df["delta_ndcg"])
    plt.axhline(0.0)
    plt.title(f"H1 / {tag}: ΔNDCG@10 by seed (Hybrid − CF)")
    plt.xlabel("seed")
    plt.ylabel("ΔNDCG@10")
    plt.show()


# ---------- P1 ----------
df_P1, hist_cf_P1, hist_h_P1 = run_h1_multiseed(
    train_pos_P1, val_pos_P1,
    base_label=f"H1 / P1: watch_ratio >= {WATCH_THR}",
    seeds=SEEDS
)
_report_multiseed("P1", df_P1)


# ---------- P2 ----------
df_P2, hist_cf_P2, hist_h_P2 = run_h1_multiseed(
    train_pos_P2, val_pos_P2,
    base_label="H1 / P2: like | share | bookmark",
    seeds=SEEDS
)
_report_multiseed("P2", df_P2)


# ----------  ΔNDCG ----------
for _tag, _summary in (("P1", df_P1), ("P2", df_P2)):
    _plot_delta_by_seed(_tag, _summary)


In [None]:
# ============================================================
# H1: BOOTSTRAP BY USERS (confidence interval)
# ============================================================
def bootstrap_delta(ndcg_cf, ndcg_h, n_boot=2000, seed=123):
    """Bootstrap the mean of the paired differences ``ndcg_h - ndcg_cf``.

    Args:
        ndcg_cf: array of CF-only scores.
        ndcg_h: array of hybrid scores, aligned with ``ndcg_cf``.
        n_boot: number of bootstrap resamples.
        seed: seed of the resampling generator.

    Returns:
        ``(mean_delta, (ci_low, ci_high), p_improve)``: the observed mean difference,
        the 2.5% / 97.5% quantiles of the resampled means, and the share of
        resampled means that are strictly positive.
    """
    rng = np.random.default_rng(seed)
    paired_delta = ndcg_h - ndcg_cf
    sample_size = len(paired_delta)

    # One resample (with replacement) per iteration, drawn in sequence from rng.
    resampled_means = np.fromiter(
        (
            paired_delta[rng.integers(0, sample_size, size=sample_size)].mean()
            for _ in range(n_boot)
        ),
        dtype=np.float64,
        count=n_boot,
    )

    lower, upper = np.quantile(resampled_means, [0.025, 0.975])
    share_positive = (resampled_means > 0).mean()

    return paired_delta.mean(), (lower, upper), share_positive


# --- use one fixed seed (usually 42) ---
BOOT_SEED = 42

# Single P1 run (10 epochs, plots disabled) whose histories feed the bootstrap below.
res_boot = run_h1_experiment(
    train_pos_P1, val_pos_P1,
    label=f"H1 / P1 bootstrap | watch_ratio >= {WATCH_THR}",
    epochs=10, k=10,
    seed=BOOT_SEED,
    plot=False
)

# per-user NDCG
# NOTE(review): each history entry stores the NDCG already averaged over the
# evaluated users, and `[-1:]` keeps only the last epoch, so both arrays below
# contain a single aggregate value rather than one value per user. A bootstrap
# over users needs the per-user scores -- confirm before relying on the interval.
ndcg_cf_users = np.array([h["ndcg"] for h in res_boot["hist_cf"][-1:]])
ndcg_h_users  = np.array([h["ndcg"] for h in res_boot["hist_h"][-1:]])
