## i2i 

In [None]:
from datasets import load_dataset

all_datasets = ["listens", "multi_event", "dislikes", "unlikes", "undislikes"]

ds_likes = load_dataset("yandex/yambda", data_dir="flat/50m", data_files="likes.parquet")  # User listening events with playback details
ds_listens = load_dataset("yandex/yambda", data_dir="flat/50m", data_files="listens.parquet") # User like actions
ds_dislikes = load_dataset("yandex/yambda", data_dir="flat/50m", data_files="dislikes.parquet") # User dislike actions
ds_undislikes = load_dataset("yandex/yambda", data_dir="flat/50m", data_files="undislikes.parquet") # User undislike actions (reverting dislikes)
ds_unlikes = load_dataset("yandex/yambda", data_dir="flat/50m", data_files="unlikes.parquet") # User unlike actions (reverting likes)
ds_multi_event = load_dataset("yandex/yambda", data_dir="flat/50m", data_files="multi_event.parquet") # Unified events	
# ds_embeddings = load_dataset("yandex/yambda", data_dir="flat/50m", data_files="embeddings") # Track audio-embeddings	






Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


### Подготовка датасета для обучения

In [None]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from collections import defaultdict
from tqdm import tqdm


# Сколько строк взять для прототипа (чтобы не убить память).
N_ROWS_SAMPLE = 500000

# для метрики качества Recall
RECALL_K = 10

# Максимальное число рекомендаций, которое будем считать для пользователя
MAX_RECS = RECALL_K
N_ITEM_NEIGHBORS = 20



ds_multi = load_dataset(
    "yandex/yambda",
    data_dir="flat/50m",
    data_files="multi_event.parquet"
)

df = ds_multi["train"].to_pandas()

if N_ROWS_SAMPLE is not None:
    df = df.sample(N_ROWS_SAMPLE, random_state=42)




# 2. Веса событий (implicit feedback)
def event_weight(row):
    et   = row["event_type"]
    ratio = min(row["played_ratio_pct"] / 100.0, 1.0)
    duration = row["track_length_seconds"]
    organic = row["is_organic"]  # 0 or 1

    # Прослушивание 
    base = ratio * 2            


    # Учитываем длительность 
    long_bonus = min(duration / 200, 1.0)  

    # Финальный скорк с учетом прослушивания и длительности 
    score = base + long_bonus          

    
    if et == "like":
        score += 2
    elif et == "dislike":
        score = -2
    elif et in ("unlike", "undislike"):
        score = -1

    # organic 
    if organic == 1:
        score *= 1.5    
    else:
        score *= 0.7     

    return score

df["weight"] = df.apply(event_weight, axis=1)

# Для простоты берём только позитивные сигналы
df_pos = df[df["weight"] > 0].copy()
print("Positive interactions:", len(df_pos))


# 3. Train / Test split (leave-last-out)


df_pos["timestamp"] = pd.to_datetime(df_pos["timestamp"], unit="ms")
df_pos = df_pos.sort_values(["uid", "timestamp"])

# берём последнее событие каждого юзера в test
last_idx = df_pos.groupby("uid")["timestamp"].idxmax()
test_df = df_pos.loc[last_idx].copy()
train_df = df_pos.drop(last_idx).copy()

# убираем юзеров, у которых нет train-эвентов
users_with_train = train_df["uid"].unique()
test_df = test_df[test_df["uid"].isin(users_with_train)]

print("Train events:", len(train_df), " Test events:", len(test_df))

if train_df.empty or test_df.empty:
    raise RuntimeError("Train или test пустые — проверь фильтрацию / timestamp.")

# 4. User–Item матрица (по train)


train_agg = (
    train_df.groupby(["uid", "item_id"], as_index=False)["weight"]
            .sum()
)

unique_users = train_agg["uid"].unique()
unique_items = train_agg["item_id"].unique()

user2idx = {u: i for i, u in enumerate(unique_users)}
item2idx = {it: j for j, it in enumerate(unique_items)}
idx2item = {j: it for it, j in item2idx.items()}

user_idx = train_agg["uid"].map(user2idx).values
item_idx = train_agg["item_id"].map(item2idx).values
weights  = train_agg["weight"].values.astype(float)

n_users = len(unique_users)
n_items = len(unique_items)

UI = csr_matrix((weights, (user_idx, item_idx)), shape=(n_users, n_items))
IU = UI.T.tocsr()   # Items x Users

print(f"Users in train: {n_users}, items in train: {n_items}")


Positive interactions: 486173
Train events: 477331  Test events: 8521
Users in train: 8521, items in train: 120872


In [19]:
# Обучаем i2i kNN

knn = NearestNeighbors(
    metric="cosine",
    algorithm="brute",
    n_jobs=-1
)
knn.fit(IU)


In [20]:
 # Функции рекомендаций
def recommend_for_user(uid, top_n=MAX_RECS):
    """
    Рекомендации для пользователя uid по i2i kNN.
    Возвращает список item_id.
    """
    if uid not in user2idx:
        return []

    u_ind = user2idx[uid]
    user_items = UI[u_ind].indices
    user_weights = UI[u_ind].data

    if len(user_items) == 0:
        return []

    scores = defaultdict(float)

    for item_ind, w in zip(user_items, user_weights):
        item_vec = IU[item_ind]

        # количество соседей на айтем = top_n (можно сделать больше)
        n_neighbors = min(top_n + 1, n_items)
        distances, indices = knn.kneighbors(item_vec, n_neighbors=n_neighbors)
        distances = distances[0]
        indices = indices[0]

        for d, neigh_ind in zip(distances[1:], indices[1:]):  # пропускаем сам айтем
            sim = 1.0 - d
            scores[neigh_ind] += sim * w


    # убираем уже взаимодействованные айтемы
    for item_ind in user_items:
        scores.pop(item_ind, None)

    if not scores:
        return []

    ranked = sorted(scores.items(), key=lambda x: -x[1])[:top_n]
    rec_items = [idx2item[i] for i, _ in ranked]
    return rec_items


In [25]:
from tqdm import tqdm 

In [26]:

# 7. Оценка качества: Recall@K


def recall_at_k(test_df, k=RECALL_K):
    """
    Leave-one-out:
    для каждого uid есть один test item (последний по времени).
    Recall@K = доля юзеров, у которых тестовый item попал в топ-K рекомендаций.
    """
    hits = 0
    total = 0

    for uid in tqdm(test_df["uid"].unique()):
        test_items = test_df.loc[test_df["uid"] == uid, "item_id"].values
        if len(test_items) == 0:
            continue

        test_item = test_items[0]

        if test_item not in item2idx:
            continue

        recs = recommend_for_user(uid, top_n=k)
        if not recs:
            continue

        total += 1
        if test_item in recs:
            hits += 1

    if total == 0:
        return 0.0, 0

    return hits / total, total

In [None]:
# 8. Запуск оценки

recall, n_eval = recall_at_k(test_df, RECALL_K)
print(f"Recall@{RECALL_K}: {recall:.4f}  (evaluated on {n_eval} users)")


 12%|█▏        | 1042/8521 [24:54<2:19:49,  1.12s/it]

In [None]:
# Обновление датасета 