In [1]:
!pip -q install rectools

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.0/99.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m59.6 MB/s[0m eta [36m0:00:00[0m
[?25h

## Импорты

In [2]:
import time
import tqdm
import zipfile as zf
import requests
import logging
from copy import deepcopy, copy
from typing import Dict, List, Tuple, Union, Callable, Any
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from IPython.display import display

In [3]:
import rectools
from rectools.dataset import Interactions, Dataset, DenseFeatures
from rectools.model_selection import Splitter, TimeRangeSplitter
from rectools.models.base import ModelBase
from rectools.models import RandomModel, PopularModel
from rectools.metrics.base import MetricAtK
from rectools.metrics import (
    Precision,
    Recall,
    MAP,
    NDCG,
    Serendipity,
    MeanInvUserFreq,
    IntraListDiversity,
    PairwiseHammingDistanceCalculator,
    calc_metrics,
)



## Загрузка данных

In [4]:
url = 'https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip'
request = requests.get(url, stream=True)

with open('kion.zip', 'wb') as fd:
  total_sipe_in_bytes = int(request.headers.get('Content-Length', 0))
  progress_bar = tqdm(desc='KION dataset download', total=total_sipe_in_bytes, unit='iB', unit_scale=True)
  for chunk in request.iter_content(chunk_size= 2 ** 20):
    progress_bar.update(len(chunk))
    fd.write(chunk)

KION dataset download:   0%|          | 0.00/78.8M [00:00<?, ?iB/s]

In [5]:
files = zf.ZipFile('kion.zip', 'r')
files.extractall()
files.close()

In [147]:
idf = pd.read_csv("/content/data_original/interactions.csv", parse_dates=["last_watch_dt"])
idf.rename(columns={"last_watch_dt": rectools.Columns.Datetime, "total_dur": rectools.Columns.Weight}, inplace=True)
interactions = Interactions(idf)

### Для удобства поместим все метрики в один объект

In [42]:
# В задании сказано, что нужно 6 метрик, добавляем одну самостоятельно
def create_coverage_metric(item):
    return item.item_id.nunique() / len(item)

In [121]:
ats = [1, 5, 10]
metrics = { "Precision": Precision, "Recall": Recall, "MAP": MAP, "NDCG": NDCG, "MeanInvUserFreq": MeanInvUserFreq }
metrics_with_thresholds = {}

for name, metric in metrics.items():
  kwargs = {}
  if isinstance(metric, tuple):
      kwargs.update(**metric[1])
      metric = metric[0]
  for at in ats:
    metrics_with_thresholds.update({"{}@{}".format(name, at): metric(k=at, **kwargs)})

metrics_with_thresholds.update({"Coverage": create_coverage_metric})
metrics_with_thresholds

{'Precision@1': Precision(k=1),
 'Precision@5': Precision(k=5),
 'Precision@10': Precision(k=10),
 'Recall@1': Recall(k=1),
 'Recall@5': Recall(k=5),
 'Recall@10': Recall(k=10),
 'MAP@1': MAP(k=1, divide_by_k=False),
 'MAP@5': MAP(k=5, divide_by_k=False),
 'MAP@10': MAP(k=10, divide_by_k=False),
 'NDCG@1': NDCG(k=1, log_base=2),
 'NDCG@5': NDCG(k=5, log_base=2),
 'NDCG@10': NDCG(k=10, log_base=2),
 'MeanInvUserFreq@1': MeanInvUserFreq(k=1),
 'MeanInvUserFreq@5': MeanInvUserFreq(k=5),
 'MeanInvUserFreq@10': MeanInvUserFreq(k=10),
 'Coverage': <function __main__.create_coverage_metric(item)>}

### Расчет метрик

In [139]:
def calculate_metrics(dataset, models, metrics, splitter, k):

    # Выберем K метрики
    metrics_at_k = {k: v for k, v in metrics.items() if isinstance(v, MetricAtK)}

    # Выберем coverage метрики, мы создавали их в create_coverage_metric()
    coverage_metrics = {k: v for k, v in metrics.items() if k not in metrics_at_k}

    res = []
    interacted = Interactions(dataset)
    splits = splitter.split(interactions)

    for train_ids, test_ids, i in splits:

        train = Dataset.construct(dataset.iloc[train_ids])
        test = Dataset.construct(dataset.iloc[test_ids])

        prev_interacted = train.interactions.df
        test_interacted = test.interactions.df

        # Вычисляем K метрики
        for name, model in models.items():
            model = deepcopy(model)

            start = time.time()
            model.fit(train)
            end = time.time()

            recommended = model.recommend(test.user_id_map.external_ids, train, k, True)

            metrics = {"model_name": name, "total": end - start,
                **calc_metrics(
                    metrics_at_k,
                    reco=recommended,
                    interactions=test_interacted,
                    prev_interactions=prev_interacted,
                ),
            }

            # Отдельно высчитываем нами же добавленный coverage и добавляем в результат
            for name, metric in coverage_metrics.items():
                metrics.update({name: metric(recommended)})
            res.append(metrics)

    return res

In [140]:
splitter = rectools.model_selection.time_split.TimeRangeSplitter("1D", 3)
models = {"RandomModel": RandomModel(random_state=32), "PopularModel": PopularModel()}

results = calculate_metrics(idf, models, metrics_with_thresholds, splitter, 10)

### Визуализация метрик

In [142]:
pd.DataFrame(results).groupby("model_name").mean()

Unnamed: 0_level_0,total,Precision@1,Recall@1,Precision@5,Recall@5,Precision@10,Recall@10,NDCG@1,NDCG@5,NDCG@10,MAP@1,MAP@5,MAP@10,MeanInvUserFreq@1,MeanInvUserFreq@5,MeanInvUserFreq@10,Coverage
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
popular,3.293556,0.0,0.0,0.0,0.0,1e-06,3e-06,0.0,0.0,7.181475e-07,0.0,0.0,4.309706e-07,16.810539,16.168155,15.20751,0.00011
random,4.3e-05,0.0,0.0,0.0,0.0,2e-06,1.1e-05,0.0,0.0,1.505503e-06,0.0,0.0,1.401207e-06,15.861961,15.860099,15.860982,0.049638


### Визуализация рекомендаций

In [145]:
items = pd.read_csv("/content/data_original/items.csv")
interactions = pd.read_csv("/content/data_original/interactions.csv")

In [154]:
interactions_df = pd.read_csv("/content/data_original/interactions.csv", parse_dates=["last_watch_dt"])
interactions_df.rename(
    columns={"last_watch_dt": rectools.Columns.Datetime, "total_dur": rectools.Columns.Weight}, inplace=True
)

In [200]:
def visualize(model, dataset, ids, features, items_df):
  # Генерим рекомендации для отобранных user_id
  recommendations = model.recommend(ids, dataset, 3, True)
  interactions_df = dataset.interactions.df
  history = interactions_df[interactions_df.user_id.isin(ids)]

  for id in ids:

    # Получаем из датасета истории просмотров юзеров
    history_for_id = history[history.user_id.isin([id])]
    hist = items_df.join(history_for_id.set_index('item_id'), on='item_id')[["user_id"] + features]
    hist_for_id = hist.loc[hist['user_id'].isin([id])]
    print("id {} смотрел".format(id))
    print(hist_for_id)
    print('')

    # Отображаем в любом удобном виде, который позволит смотреть на историю просмотра юзера + на его рекомендации от модели
    recommendations_for_id = model.recommend(ids, dataset, 3, True)
    rec = pd.merge(recommendations_for_id, items_df)[["user_id"] + features]
    rec_for_id = rec.loc[rec['user_id'].isin([id])]
    print("id {} порекомендовали".format(id))
    print(rec_for_id)
    print('---------------------------------------------------------------------------')
    print('')

In [201]:
model = RandomModel(random_state=32)
dataset = Dataset.construct(interactions_df)
model.fit(dataset)
ids = np.array([666262, 672861, 955527])

In [202]:
visualize(model, dataset, ids, ["title"], items)

id 666262 смотрел
        user_id                 title
11230  666262.0  Дом ночных призраков

id 666262 порекомендовали
   user_id                                         title
0   666262                           Возвращение Будулая
1   666262  Новые приключения Аладдина (жестовым языком)
2   666262                             Пропавшая грамота
---------------------------------------------------------------------------

id 672861 смотрел
        user_id                          title
11182  672861.0                 В ритме сердца
13578  672861.0  Медвежонок Винни и его друзья

id 672861 порекомендовали
   user_id                     title
3   672861          Женщина в беде 3
4   672861  Гордость и предубеждение
5   672861                Болванчики
---------------------------------------------------------------------------

id 955527 смотрел
       user_id        title
8909  955527.0  Признание 5

id 955527 порекомендовали
   user_id                      title
6   955527              