### Настройка окружения и переменных окружения

In [1]:
# python 3.13

import os
from tqdm import tqdm

from tecd_retail_recsys.data import download_tecd_data
from tecd_retail_recsys.data import DataPreprocessor
from tecd_retail_recsys.metrics import calculate_metrics
from tecd_retail_recsys.models import TopPopular, TopPersonal, EASE, iALS, TIFUKNN


# HuggingFace token for downloading dataset.
# The file content is stripped because a token file saved from an editor or
# `echo` typically ends with a newline, which would otherwise become part of
# the HF_TOKEN value.
with open('hf_token.txt', 'r', encoding='utf-8') as file:
    hf_token = file.read().strip()
os.environ['HF_TOKEN'] = hf_token

In [2]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell runs, so edits to the project package are picked up.
%load_ext autoreload
%autoreload 2

### Загрузка и предобработка данных

In [None]:
# Download the dataset files; per the setup cell, access relies on the
# HF_TOKEN environment variable.
download_tecd_data()

Downloading 230 files to t_ecd_small_partial


Downloading files:   0%|          | 0/230 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:   0%|          | 1/230 [00:01<04:03,  1.06s/it]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:   3%|▎         | 7/230 [00:01<00:28,  7.83it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:   5%|▍         | 11/230 [00:01<00:24,  9.02it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:   7%|▋         | 16/230 [00:01<00:16, 13.33it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:   8%|▊         | 19/230 [00:01<00:15, 13.90it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  10%|█         | 24/230 [00:02<00:11, 18.14it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  12%|█▏        | 28/230 [00:02<00:12, 16.46it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  13%|█▎        | 31/230 [00:03<00:20,  9.51it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  16%|█▌        | 36/230 [00:03<00:14, 13.59it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  17%|█▋        | 39/230 [00:04<00:24,  7.88it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  18%|█▊        | 41/230 [00:04<00:27,  6.94it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  20%|█▉        | 45/230 [00:04<00:20,  9.17it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  21%|██▏       | 49/230 [00:04<00:14, 12.30it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  23%|██▎       | 52/230 [00:05<00:17, 10.26it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  24%|██▍       | 56/230 [00:05<00:15, 11.17it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  25%|██▌       | 58/230 [00:05<00:14, 11.64it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  27%|██▋       | 61/230 [00:05<00:12, 13.76it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  29%|██▊       | 66/230 [00:05<00:09, 17.54it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  30%|███       | 69/230 [00:06<00:08, 19.30it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  31%|███▏      | 72/230 [00:06<00:07, 21.23it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  33%|███▎      | 76/230 [00:06<00:06, 23.91it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  35%|███▍      | 80/230 [00:06<00:05, 26.28it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  36%|███▌      | 83/230 [00:06<00:05, 26.28it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  38%|███▊      | 87/230 [00:06<00:05, 26.23it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  40%|████      | 92/230 [00:06<00:05, 27.59it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  42%|████▏     | 96/230 [00:06<00:04, 29.34it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  43%|████▎     | 100/230 [00:07<00:04, 26.26it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  45%|████▌     | 104/230 [00:07<00:05, 24.59it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  47%|████▋     | 109/230 [00:07<00:04, 27.50it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  49%|████▊     | 112/230 [00:07<00:04, 26.43it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  50%|█████     | 116/230 [00:07<00:04, 26.73it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  52%|█████▏    | 119/230 [00:07<00:05, 21.11it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  54%|█████▍    | 125/230 [00:08<00:03, 28.43it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  56%|█████▌    | 129/230 [00:08<00:03, 25.58it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  58%|█████▊    | 133/230 [00:08<00:03, 24.56it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  60%|██████    | 139/230 [00:08<00:02, 30.88it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  62%|██████▏   | 143/230 [00:08<00:03, 25.92it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  64%|██████▍   | 147/230 [00:08<00:02, 28.17it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  66%|██████▌   | 151/230 [00:09<00:02, 26.52it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  67%|██████▋   | 154/230 [00:09<00:02, 26.94it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  68%|██████▊   | 157/230 [00:09<00:03, 23.31it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  70%|███████   | 161/230 [00:09<00:02, 24.77it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  72%|███████▏  | 166/230 [00:09<00:02, 28.78it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  74%|███████▍  | 170/230 [00:09<00:02, 26.72it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  75%|███████▌  | 173/230 [00:09<00:02, 21.93it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  77%|███████▋  | 177/230 [00:10<00:02, 23.00it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  78%|███████▊  | 180/230 [00:10<00:02, 20.47it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  81%|████████  | 186/230 [00:10<00:01, 28.05it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  83%|████████▎ | 190/230 [00:10<00:01, 26.82it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  84%|████████▍ | 194/230 [00:10<00:01, 25.53it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  87%|████████▋ | 199/230 [00:10<00:01, 28.88it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  88%|████████▊ | 203/230 [00:11<00:01, 25.62it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  90%|█████████ | 208/230 [00:11<00:00, 23.57it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  92%|█████████▏| 211/230 [00:11<00:00, 24.54it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  93%|█████████▎| 214/230 [00:11<00:01, 14.99it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  94%|█████████▍| 217/230 [00:12<00:01, 12.52it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  95%|█████████▌| 219/230 [00:12<00:00, 12.67it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  97%|█████████▋| 222/230 [00:12<00:00, 14.01it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files:  97%|█████████▋| 224/230 [00:12<00:00, 14.15it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading files: 100%|██████████| 230/230 [00:12<00:00, 18.03it/s]

Download completed: 230/230 files successful





In [3]:

# the dataset covers 227 days in total (days 1082 through 1308, inclusive)
dp = DataPreprocessor(
    day_begin=1082,
    day_end=1308,
    val_days=20,
    test_days=20,
    min_user_interactions=1,
    min_item_interactions=20,
)
train_df, val_df, test_df = dp.preprocess()

Starting data preprocessing...
Loading events from t_ecd_small_partial/dataset/small/retail/events
Loaded 236,479,226 total events
Filtered to 3,758,762 events with action_type='added-to-cart'
After filtering (min_user_interactions=1, min_item_interactions=20): 3,249,972 events, 84,944 users, 30,954 items
Created mappings: 84944 users, 30954 items
Temporal split - Train: days < 1269 (902,543 events), Val: days 1269-1288 (228,339 events), Test: days >= 1289 (223,395 events)
Users in each part (train, val, test) - 7425


`Оставляю только те айтемы, по которым есть хотя бы 20 взаимодействий: это сокращает объём данных и ускоряет вычисления (что особенно полезно для модели EASE), снижает разреженность матрицы взаимодействий и смещает фокус на бизнес-релевантные товары (товары с низкой частотой покупок часто являются нишевыми, сезонными или устаревшими позициями).`

In [4]:
# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,timestamp,user_id,item_id,subdomain,action_type,os,day
1252,93485160,79038,20358,main,added-to-cart,android,1082
1336,93485187,44584,23489,item,added-to-cart,android,1082
1453,93485221,12869,2908,item,added-to-cart,android,1082
2144,93485421,42145,18904,search,added-to-cart,ios,1082
2189,93485437,15304,14462,catalog,added-to-cart,android,1082


In [5]:
# dataframe layout: user_id -> [(item_id1, timestamp1), (item_id2, timestamp2), ...]
# with one such interaction list per split (train / val / test)
joined = dp.get_grouped_data(train_df, val_df, test_df)
print(joined.shape)
joined.head(10)

(7425, 4)


Unnamed: 0,user_id,train_interactions,val_interactions,test_interactions
0,11,"[(8476, 95399246), (7952, 95411006), (19655, 9...","[(8591, 110164749), (4010, 110185189), (750, 1...","[(18369, 112022725), (29875, 112032227), (1325..."
1,14,"[(11776, 98623953), (11360, 98624852), (17503,...","[(30789, 111213199), (17585, 111242301), (2802...","[(8664, 111400827), (10311, 111404767), (17934..."
2,21,"[(16067, 105087839), (6548, 105089789), (4237,...","[(2195, 109690171), (24455, 109704924), (6375,...","[(5535, 111584788), (25886, 111614896), (13617..."
3,29,"[(24855, 105300569), (1228, 105347095), (1163,...","[(10268, 111157530), (2169, 111175402), (19736...","[(22555, 111565119), (5342, 111585126), (16892..."
4,39,"[(18478, 97610336), (26949, 97898345), (15341,...","[(30001, 110120451), (19002, 110125294), (1458...","[(13391, 111576876), (14124, 111591192), (1601..."
5,50,"[(22366, 104572854), (17179, 104575922), (1522...","[(4164, 110351806), (9497, 110351936), (8222, ...","[(12641, 111581805), (17127, 111591088), (3054..."
6,57,"[(6646, 95313142), (1916, 95315235), (10801, 9...","[(25291, 110102310), (21758, 110107367), (1786...","[(17861, 112174473), (27541, 112174676), (2998..."
7,72,"[(15625, 96009394), (19585, 96016728), (27989,...","[(27373, 109951487), (12522, 109975279), (1745...","[(579, 111928617), (11435, 111936448), (18797,..."
8,116,"[(4566, 103467223), (13383, 103467594), (5330,...","[(8476, 110003862), (30451, 110004188), (24367...","[(16572, 111399181), (10986, 111400880), (1981..."
9,118,"[(8034, 105509277), (6010, 105511292), (16328,...","[(7286, 109752777), (4237, 109753233), (5459, ...","[(16561, 112147082), (29369, 112169203), (2841..."


### Baseline-models

- Top-Popular
- Top-Personal
- EASE
- iALS
- TIFU-KNN

#### TopPopular

In [6]:
# Fit the TopPopular baseline on the grouped frame and store its top-100
# recommendations per user in a new column.
toppop = TopPopular()
toppop.fit(joined)
joined['toppopular_recs'] = toppop.predict(joined, topn=100)
joined.head()

Unnamed: 0,user_id,train_interactions,val_interactions,test_interactions,toppopular_recs
0,11,"[(8476, 95399246), (7952, 95411006), (19655, 9...","[(8591, 110164749), (4010, 110185189), (750, 1...","[(18369, 112022725), (29875, 112032227), (1325...","[17934, 11413, 5631, 21321, 7982, 20587, 8476,..."
1,14,"[(11776, 98623953), (11360, 98624852), (17503,...","[(30789, 111213199), (17585, 111242301), (2802...","[(8664, 111400827), (10311, 111404767), (17934...","[17934, 11413, 5631, 21321, 7982, 20587, 8476,..."
2,21,"[(16067, 105087839), (6548, 105089789), (4237,...","[(2195, 109690171), (24455, 109704924), (6375,...","[(5535, 111584788), (25886, 111614896), (13617...","[17934, 11413, 5631, 21321, 7982, 20587, 8476,..."
3,29,"[(24855, 105300569), (1228, 105347095), (1163,...","[(10268, 111157530), (2169, 111175402), (19736...","[(22555, 111565119), (5342, 111585126), (16892...","[17934, 11413, 5631, 21321, 7982, 20587, 8476,..."
4,39,"[(18478, 97610336), (26949, 97898345), (15341,...","[(30001, 110120451), (19002, 110125294), (1458...","[(13391, 111576876), (14124, 111591192), (1601...","[17934, 11413, 5631, 21321, 7982, 20587, 8476,..."


In [7]:
# Score the TopPopular recommendations against the validation ground truth,
# then against the test ground truth; both metric results are kept for later.
print('Метрики на валидацинной выборке:')
val_metrics_toppop = calculate_metrics(joined, model_preds='toppopular_recs', gt_col='val_interactions', verbose=True)

print('\nМетрики на тестовой выборке:')
test_metrics_toppop = calculate_metrics(joined, model_preds='toppopular_recs', gt_col='test_interactions', verbose=True)

Метрики на валидацинной выборке:
NDCG@10 = 0.1028
Recall@10 = 0.0143
NDCG@100 = 0.3173
Recall@100 = 0.0767

Метрики на тестовой выборке:
NDCG@10 = 0.0926
Recall@10 = 0.0134
NDCG@100 = 0.2872
Recall@100 = 0.0684


`В целом отличные метрики для TopPopular: NDCG@100=0.3173 на валидации и 0.2872 на тестовых данных!`

#### TopPersonal

In [8]:
# Fit the TopPersonal baseline on the grouped frame and store its top-100
# recommendations per user in a new column.
toppers = TopPersonal()
toppers.fit(joined)
joined['toppersonal_recs'] = toppers.predict(joined, topn=100)
joined.head()

Unnamed: 0,user_id,train_interactions,val_interactions,test_interactions,toppopular_recs,toppersonal_recs
0,11,"[(8476, 95399246), (7952, 95411006), (19655, 9...","[(8591, 110164749), (4010, 110185189), (750, 1...","[(18369, 112022725), (29875, 112032227), (1325...","[17934, 11413, 5631, 21321, 7982, 20587, 8476,...","[11413, 20331, 15494, 18369, 2104, 24676, 8476..."
1,14,"[(11776, 98623953), (11360, 98624852), (17503,...","[(30789, 111213199), (17585, 111242301), (2802...","[(8664, 111400827), (10311, 111404767), (17934...","[17934, 11413, 5631, 21321, 7982, 20587, 8476,...","[11776, 9654, 21847, 25640, 25795, 10184, 7982..."
2,21,"[(16067, 105087839), (6548, 105089789), (4237,...","[(2195, 109690171), (24455, 109704924), (6375,...","[(5535, 111584788), (25886, 111614896), (13617...","[17934, 11413, 5631, 21321, 7982, 20587, 8476,...","[29228, 24455, 6548, 17934, 7548, 16067, 4237,..."
3,29,"[(24855, 105300569), (1228, 105347095), (1163,...","[(10268, 111157530), (2169, 111175402), (19736...","[(22555, 111565119), (5342, 111585126), (16892...","[17934, 11413, 5631, 21321, 7982, 20587, 8476,...","[6409, 28497, 6163, 20549, 30537, 136, 1228, 1..."
4,39,"[(18478, 97610336), (26949, 97898345), (15341,...","[(30001, 110120451), (19002, 110125294), (1458...","[(13391, 111576876), (14124, 111591192), (1601...","[17934, 11413, 5631, 21321, 7982, 20587, 8476,...","[13391, 4194, 27517, 66, 15184, 14580, 10449, ..."


In [9]:
# Score the TopPersonal recommendations against the validation ground truth,
# then against the test ground truth; both metric results are kept for later.
print('Метрики на валидацинной выборке:')
val_metrics_toppers = calculate_metrics(joined, model_preds='toppersonal_recs', gt_col='val_interactions', verbose=True)

print('\nМетрики на тестовой выборке:')
test_metrics_toppers = calculate_metrics(joined, model_preds='toppersonal_recs', gt_col='test_interactions', verbose=True)

Метрики на валидацинной выборке:
NDCG@10 = 0.2965
Recall@10 = 0.0659
NDCG@100 = 0.4391
Recall@100 = 0.1647

Метрики на тестовой выборке:
NDCG@10 = 0.2631
Recall@10 = 0.0596
NDCG@100 = 0.4027
Recall@100 = 0.1471


`Смогли неплохо так вырастить метрики!`

#### EASE

In [10]:
# build the interaction matrix; idx_to_item is returned alongside it and is
# passed to the matrix-based models (EASE, iALS) below
matrix, idx_to_item = dp.get_interactions_matrix(joined)
print(matrix.shape)

(7425, 30751)


In [11]:
# Fit EASE on the interaction matrix with reg_weight=1000 and store its
# top-100 recommendations per user in a new column.
ease = EASE(idx_to_item=idx_to_item, reg_weight=1000)
ease.fit(matrix)
joined['ease_recs'] = ease.predict(joined, topn=100)

In [13]:
# Score the EASE recommendations against the validation ground truth,
# then against the test ground truth; both metric results are kept for later.
print('Метрики на валидацинной выборке:')
val_metrics_ease = calculate_metrics(joined, model_preds='ease_recs', gt_col='val_interactions', verbose=True)

print('\nМетрики на тестовой выборке:')
test_metrics_ease = calculate_metrics(joined, model_preds='ease_recs', gt_col='test_interactions', verbose=True)

Метрики на валидацинной выборке:
NDCG@10 = 0.0908
Recall@10 = 0.0145
NDCG@100 = 0.2409
Recall@100 = 0.0532

Метрики на тестовой выборке:
NDCG@10 = 0.0828
Recall@10 = 0.0127
NDCG@100 = 0.2257
Recall@100 = 0.0487


```
EASE работает плохо, потому что:

- Данные слишком разреженные (мало interactions на user/item)
- С большой регуляризацией он вырождается в "плохой TopPopular"
- Часть val/test items отсутствуют в train (cold start)
```

#### iALS

In [14]:
# Fit iALS on the interaction matrix with the model's default hyperparameters
# and store its top-100 recommendations per user in a new column.
ials = iALS(idx_to_item=idx_to_item)
ials.fit(matrix)
joined['ials_recs'] = ials.predict(joined, topn=100)

Iter № 1/10:


Updating users: 100%|██████████| 7425/7425 [00:01<00:00, 5375.87it/s]
Updating items: 100%|██████████| 30751/30751 [00:03<00:00, 8038.06it/s]


Iter № 2/10:


Updating users: 100%|██████████| 7425/7425 [00:01<00:00, 5389.34it/s]
Updating items: 100%|██████████| 30751/30751 [00:04<00:00, 7637.78it/s]


Iter № 3/10:


Updating users: 100%|██████████| 7425/7425 [00:01<00:00, 5601.00it/s]
Updating items: 100%|██████████| 30751/30751 [00:03<00:00, 8180.17it/s]


Iter № 4/10:


Updating users: 100%|██████████| 7425/7425 [00:01<00:00, 5493.70it/s]
Updating items: 100%|██████████| 30751/30751 [00:03<00:00, 8131.47it/s]


Iter № 5/10:


Updating users: 100%|██████████| 7425/7425 [00:01<00:00, 5563.56it/s]
Updating items: 100%|██████████| 30751/30751 [00:03<00:00, 7817.33it/s]


Iter № 6/10:


Updating users: 100%|██████████| 7425/7425 [00:01<00:00, 5133.05it/s]
Updating items: 100%|██████████| 30751/30751 [00:03<00:00, 7815.28it/s]


Iter № 7/10:


Updating users: 100%|██████████| 7425/7425 [00:01<00:00, 5322.03it/s]
Updating items: 100%|██████████| 30751/30751 [00:03<00:00, 7813.82it/s]


Iter № 8/10:


Updating users: 100%|██████████| 7425/7425 [00:01<00:00, 5510.76it/s]
Updating items: 100%|██████████| 30751/30751 [00:03<00:00, 8191.07it/s]


Iter № 9/10:


Updating users: 100%|██████████| 7425/7425 [00:01<00:00, 5530.50it/s]
Updating items: 100%|██████████| 30751/30751 [00:03<00:00, 7914.06it/s]


Iter № 10/10:


Updating users: 100%|██████████| 7425/7425 [00:01<00:00, 5349.78it/s]
Updating items: 100%|██████████| 30751/30751 [00:03<00:00, 8106.45it/s]


In [15]:
# Score the iALS recommendations against the validation ground truth,
# then against the test ground truth; both metric results are kept for later.
print('Метрики на валидацинной выборке:')
val_metrics_ials = calculate_metrics(joined, model_preds='ials_recs', gt_col='val_interactions', verbose=True)

print('\nМетрики на тестовой выборке:')
test_metrics_ials = calculate_metrics(joined, model_preds='ials_recs', gt_col='test_interactions', verbose=True)

Метрики на валидацинной выборке:
NDCG@10 = 0.1009
Recall@10 = 0.0164
NDCG@100 = 0.2652
Recall@100 = 0.0622

Метрики на тестовой выборке:
NDCG@10 = 0.0855
Recall@10 = 0.0131
NDCG@100 = 0.2496
Recall@100 = 0.0580


#### TIFU-KNN

Параметры для экспериментов:
- alpha: попробуйте 0.5-0.9 (баланс между повторением и исследованием)
- n_neighbors: 100-500 (больше соседей → больше коллаборации, но медленнее)
- within_decay_rate: 0.8-0.95 (как быстро "забывать" старые покупки)
- n_groups: 5-10 (на сколько групп делить историю)

Модель особенно хороша для:
- Next-basket recommendation (предсказание следующей корзины покупок)
- Повторяющихся покупок (grocery, retail)
- Temporal patterns в покупках

In [16]:
# Fit TIFU-KNN on the grouped frame with n_neighbors=1000 (other parameters
# keep their defaults) and store its top-100 recommendations per user.
tifu = TIFUKNN(n_neighbors=1000)
tifu.fit(joined)
joined['tifuknn_recs'] = tifu.predict(joined, topn=100)

Building TIFU-KNN PIF vectors for 7425 users and 30751 items...


Computing PIF: 100%|██████████| 7425/7425 [00:00<00:00, 15011.12it/s]


Precomputing user similarities and neighbors...
Finding 1000 nearest neighbors for each user...


Finding neighbors: 100%|██████████| 7425/7425 [00:00<00:00, 10980.36it/s]


Precomputing collaborative signals...


Computing neighbor PIF: 100%|██████████| 7425/7425 [00:49<00:00, 151.52it/s]


TIFU-KNN training completed!
TIFU-KNN training completed!
Generating recommendations for 7425 users...
Masking interacted items...


Creating masks: 100%|██████████| 7425/7425 [00:00<00:00, 32728.57it/s]


Computing top-N recommendations...


Extracting top-N: 100%|██████████| 7425/7425 [00:01<00:00, 4141.46it/s]


In [17]:
# Score the TIFU-KNN recommendations against the validation ground truth,
# then against the test ground truth; both metric results are kept for later.
print('Метрики на валидацинной выборке:')
val_metrics_tifuknn = calculate_metrics(joined, model_preds='tifuknn_recs', gt_col='val_interactions', verbose=True)

print('\nМетрики на тестовой выборке:')
test_metrics_tifuknn = calculate_metrics(joined, model_preds='tifuknn_recs', gt_col='test_interactions', verbose=True)

Метрики на валидацинной выборке:
NDCG@10 = 0.1110
Recall@10 = 0.0183
NDCG@100 = 0.2851
Recall@100 = 0.0700

Метрики на тестовой выборке:
NDCG@10 = 0.0975
Recall@10 = 0.0149
NDCG@100 = 0.2735
Recall@100 = 0.0655


### **Итоги**

**Сравнительная таблица baseline-моделей**


| Модель | NDCG@100 (val) | Recall@100 (val) | NDCG@100 (test) | Recall@100 (test) | Время обучения |
|--------|----------------|------------------|-----------------|-------------------|----------------|
| **TopPersonal** | **0.4391** | **0.1647** | **0.4027** | **0.1471** | мгновенно |
| TopPopular | 0.3173 | 0.0767 | 0.2872 | 0.0684 | мгновенно |
| TIFU-KNN | 0.2851 | 0.0700 | 0.2735 | 0.0655 | ~1 минута |
| iALS | 0.2652 | 0.0622 | 0.2496 | 0.0580 | ~1 минута |
| EASE | 0.2409 | 0.0532 | 0.2257 | 0.0487 | 10 минут |


**Ключевые выводы:**

1. **TopPersonal - безусловный лидер** 🏆
   - Лучшие показатели по всем метрикам
   - NDCG@100 = 0.4391 на валидации (на 38% выше второго места)
   - Recall@100 = 0.1647 (в 2+ раза выше конкурентов)
   - **Причина успеха**: В ритейле пользователи склонны повторять покупки одних и тех же товаров (молоко, хлеб, бытовая химия), что идеально подходит для персонализированной частотной рекомендации

2. **TIFU-KNN - второе место по NDCG@10, третье по NDCG@100**
   - Хороший баланс между персонализацией и коллаборацией
   - Учитывает temporal patterns в покупках
   - Медленнее TopPersonal, но показывает стабильные результаты

3. **TopPopular - сильный baseline**
   - Второе место по NDCG@100 и Recall@100 (уступает только TopPersonal), хотя метод неперсонализированный
   - Мгновенное обучение и inference
   - Хорошо работает для холодных пользователей

4. **iALS - разочарование**
   - Долгое обучение (10 итераций)
   - Метрики хуже TopPopular несмотря на сложность
   - Страдает от разреженности данных

5. **EASE - худший результат**
   - Самые низкие метрики среди всех моделей
   - Проблемы: крайне разреженные данные (7425 users × 30751 items), cold start items

**Почему TopPersonal победил:**

- **Специфика ритейла**: Частота повторных покупок >> 1
- **Персонализация**: Каждый пользователь имеет свой уникальный набор регулярных товаров
- **Простота**: Отсутствие переобучения благодаря простоте метода
- **Устойчивость**: Не требует плотных данных, работает на sparse interactions

![](imgs/models.png)

#### **Next time**

Далее в ~~сериале~~ планах:

- добавить настройку зависимостей в репо, упорядочить директории более точно (особенно models/);
- реализовать более корректную схему валидации (рекомендовать для валидации только по трейну, а для теста по трейну+валидации);
- учет кросс-доменной информации (feature engineering по другим доменам);
- опробовать SOTA-модели: SASRec, BERT4Rec, двухуровневая архитектура с бустингом;
- мерить метрики стоимости итоговой корзины, чтобы понимать, где мы выигрываем, как бизнес;
- попробовать использовать RAG-архитектуру в рекомендашках (реализовать идеи):
    * Embedding-based Retrieval (использовать эмбеддинги товаров как еще один источник информации, сравнивать с ними эмбеддинг состояния корзины);
    * Generative Retrieval (энкодер кодирует последние взаимодействия пользователя и прочие признаки, декодер принимает выход энкодера и генерирует последовательности кодов, соответствующие топ-k айтемам; генерация топ-k кандидатов происходит с помощью beam search).


![](imgs/skeletron.jpg)