# Семинар 3. UserKNN и кросс-валидация (CV) для сравнения моделей

In [3]:
import pandas as pd
import requests
from tqdm.auto import tqdm
from pprint import pprint
from implicit.nearest_neighbours import CosineRecommender, TFIDFRecommender
import warnings

from rectools import Columns
from rectools.dataset import Dataset, Interactions
from rectools.metrics import MAP, calc_metrics
from rectools.model_selection import TimeRangeSplitter
from rectools.models import PopularModel, RandomModel, ImplicitItemKNNWrapperModel
from rectools.model_selection import TimeRangeSplitter
from rectools.dataset import Dataset
from rectools.metrics import MeanInvUserFreq, AvgRecPopularity
from implicit.nearest_neighbours import CosineRecommender

from userknn import UserKnn

warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 200)

# Датасет KION 

In [None]:
# Download the KION dataset archive (URL pinned to a specific commit) to 'kion_train.zip'.
url = 'https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip'

# `timeout` keeps the cell from hanging forever on a stalled connection; using the
# response as a context manager guarantees the connection is released.
with requests.get(url, stream=True, timeout=60) as req:
    # Fail fast on 4xx/5xx instead of silently saving an error page as the zip file.
    req.raise_for_status()

    # A missing Content-Length header yields 0; pass None so tqdm shows an open-ended bar.
    total_size_in_bytes = int(req.headers.get('Content-Length', 0)) or None

    with open('kion_train.zip', "wb") as fd, tqdm(
        desc='Downloading the kion dataset...',
        total=total_size_in_bytes,
        unit='iB',
        unit_scale=True,
    ) as progress_bar:
        # Stream in 1 MiB chunks so the whole archive is never held in memory.
        for chunk in req.iter_content(chunk_size=2 ** 20):
            fd.write(chunk)
            progress_bar.update(len(chunk))

In [None]:
# Extract the downloaded archive with the standard library rather than the external
# `unzip` tool: this works where `unzip` is not installed and does not stop on an
# interactive overwrite prompt when the cell is re-run.
import zipfile

with zipfile.ZipFile('kion_train.zip') as archive:
    # Skip the macOS metadata entries, mirroring `unzip -x '__MACOSX/*'`.
    members = [name for name in archive.namelist() if not name.startswith('__MACOSX/')]
    archive.extractall(members=members)

In [12]:
import numpy as np

# Load the raw KION tables. Two interaction columns are renamed to the RecTools
# column constants: 'total_dur' -> Columns.Weight, 'last_watch_dt' -> Columns.Datetime.
interactions = (
    pd.read_csv('data_original/interactions.csv', parse_dates=["last_watch_dt"])
    .rename(columns={'total_dur': Columns.Weight,
                     'last_watch_dt': Columns.Datetime})
)
users = pd.read_csv('data_original/users.csv')
items = pd.read_csv('data_original/items.csv')

# Quick-run mode: keep only a random subset of users so the notebook finishes fast.
# Set N_SAMPLE_USERS to None to run on the full dataset.
N_SAMPLE_USERS = 5000
# Fixed seed so the same users are sampled on every run and results are reproducible.
SAMPLE_SEED = 42

if N_SAMPLE_USERS is not None:
    all_user_ids = interactions.user_id.unique()
    rng = np.random.default_rng(SAMPLE_SEED)
    user_ids = rng.choice(
        all_user_ids,
        # Guard against asking for more users than exist (would raise with replace=False).
        size=min(N_SAMPLE_USERS, len(all_user_ids)),
        replace=False,
    )
    interactions = interactions[interactions.user_id.isin(user_ids)]

print(interactions.shape, interactions.user_id.nunique())
interactions.head()

(29428, 5) 5000


Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
263,243946,14317,2021-05-14,128,2.0
451,993099,1465,2021-05-23,128,0.0
612,230196,3130,2021-07-28,6566,87.0
642,546831,10440,2021-05-25,786,0.0
805,183259,5919,2021-05-22,583,8.0


In [5]:
%load_ext autoreload
%autoreload 2

# Проверяем, как работает наш кастомный класс UserKnn

In [13]:
# Fit the custom UserKnn wrapper (CosineRecommender backbone, N_similar_users=30)
# on all currently loaded interactions.
cosine_backbone = CosineRecommender()
model = UserKnn(model=cosine_backbone, N_similar_users=30)
model.fit(interactions)

  0%|          | 0/5000 [00:00<?, ?it/s]

In [14]:
# Request 10 recommendations for every user present in the interactions table
# and preview the first rows.
known_users = interactions[Columns.User].unique()
recs = model.recommend(known_users, k=10)
recs.head()

Unnamed: 0,user_id,item_id,score,rank
0,1097395,1290,5.061712,1
1,1097395,10876,4.920837,2
2,1097395,5600,3.9413,3
4,1097395,6738,3.878243,4
7,1097395,10878,3.286824,5


In [15]:
# TODO: not every user received 10 recommendations.
# Count the distinct recommended items per user and report how many users fall short,
# so a failure is actionable instead of a bare AssertionError with no message.
recs_per_user = recs.groupby('user_id')['item_id'].nunique()
n_short = int((recs_per_user != 10).sum())
assert n_short == 0, (
    f"{n_short} of {len(recs_per_user)} users did not receive exactly 10 recommendations "
    f"(min={recs_per_user.min()}, max={recs_per_user.max()})"
)

AssertionError: 

# Задаем метрики и модели, по которым будем делать CV

In [9]:
# Cut-off shared by all metrics below.
TOP_K = 10

# Metrics computed for every model on every fold.
metrics = {
    'map@10': MAP(k=TOP_K),
    "novelty": MeanInvUserFreq(k=TOP_K),  # novelty
    "AvgRecPopularity": AvgRecPopularity(k=TOP_K),  # popularity bias
}

# RecTools baselines.
baseline_models = {
    "popular_all": PopularModel(),
    "random": RandomModel(random_state=1),
}

# Item-based kNN from RecTools, wrapping implicit recommenders.
item_knn_models = {
    "itemknn_cosine": ImplicitItemKNNWrapperModel(model=CosineRecommender()),
    "itemknn_tfidf": ImplicitItemKNNWrapperModel(model=TFIDFRecommender()),
}

# Our custom user-based kNN with the same two implicit recommenders.
user_knn_models = {
    "userknn_cosine": UserKnn(model=CosineRecommender(), N_similar_users=50),
    "userknn_tfidf": UserKnn(model=TFIDFRecommender(), N_similar_users=50),
}

# Merge in the original order: dict order determines the order models are evaluated in.
models = {**baseline_models, **item_knn_models, **user_knn_models}

In [16]:
# Initialise the splitter (TimeRangeSplitter): 3 folds, each with a 7-day test window.
splitter = TimeRangeSplitter(
    test_size="7D",
    n_splits=3,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

fold_iterator = splitter.split(Interactions(interactions), collect_fold_stats=True)

# Number of recommendations requested per user from every model.
K_RECOS = 10

# One dict per (fold, model) pair: fold index, model name and the metric values.
results = []

# Loop over folds, then over models.
for i_fold, (train_ids, test_ids, fold_info) in enumerate(fold_iterator):
    print(f"\n===== Fold {i_fold} =====")
    pprint(fold_info)

    df_train = interactions.iloc[train_ids].copy()
    df_test = interactions.iloc[test_ids][Columns.UserItem].copy()

    catalog = df_train[Columns.Item].unique()

    # Fold-level invariants: build them once per fold instead of once per model.
    test_users = df_test[Columns.User].unique()
    dataset = Dataset.construct(df_train)

    # Train and evaluate every model on this fold.
    for model_name, model in models.items():
        print(f"Training '{model_name}' on fold {i_fold}...")

        if 'userknn' in model_name:
            # The custom UserKnn is fitted on the raw interactions DataFrame.
            model.fit(df_train)
            reco = model.recommend(users=test_users, k=K_RECOS)
        else:
            # RecTools models are fitted on a Dataset built from the train part.
            model.fit(dataset)
            reco = model.recommend(
                users=test_users,
                dataset=dataset,
                k=K_RECOS,
                filter_viewed=True,
            )

        # Metrics for this (fold, model) pair.
        metric_values = calc_metrics(
            metrics=metrics,
            reco=reco,
            interactions=df_test,
            prev_interactions=df_train,
            catalog=catalog,
        )

        results.append({"fold": i_fold, "model": model_name, **metric_values})



===== Fold 0 =====
{'end': Timestamp('2021-08-09 00:00:00'),
 'i_split': 0,
 'start': Timestamp('2021-08-02 00:00:00'),
 'test': 1291,
 'test_items': 669,
 'test_users': 506,
 'train': 23049,
 'train_items': 3792,
 'train_users': 4116}
Training 'popular_all' on fold 0...
Training 'random' on fold 0...
Training 'itemknn_cosine' on fold 0...
Training 'itemknn_tfidf' on fold 0...
Training 'userknn_cosine' on fold 0...


  0%|          | 0/4116 [00:00<?, ?it/s]

Training 'userknn_tfidf' on fold 0...


  0%|          | 0/4116 [00:00<?, ?it/s]


===== Fold 1 =====
{'end': Timestamp('2021-08-16 00:00:00'),
 'i_split': 1,
 'start': Timestamp('2021-08-09 00:00:00'),
 'test': 1254,
 'test_items': 627,
 'test_users': 509,
 'train': 25106,
 'train_items': 3986,
 'train_users': 4410}
Training 'popular_all' on fold 1...
Training 'random' on fold 1...
Training 'itemknn_cosine' on fold 1...
Training 'itemknn_tfidf' on fold 1...
Training 'userknn_cosine' on fold 1...


  0%|          | 0/4410 [00:00<?, ?it/s]

Training 'userknn_tfidf' on fold 1...


  0%|          | 0/4410 [00:00<?, ?it/s]


===== Fold 2 =====
{'end': Timestamp('2021-08-23 00:00:00'),
 'i_split': 2,
 'start': Timestamp('2021-08-16 00:00:00'),
 'test': 1447,
 'test_items': 724,
 'test_users': 546,
 'train': 27163,
 'train_items': 4165,
 'train_users': 4704}
Training 'popular_all' on fold 2...
Training 'random' on fold 2...
Training 'itemknn_cosine' on fold 2...
Training 'itemknn_tfidf' on fold 2...
Training 'userknn_cosine' on fold 2...


  0%|          | 0/4704 [00:00<?, ?it/s]

Training 'userknn_tfidf' on fold 2...


  0%|          | 0/4704 [00:00<?, ?it/s]

In [17]:
# Collect the per-fold rows into a table and show each metric averaged over folds,
# one row per model.
results_df = pd.DataFrame(results)
metric_columns = list(metrics)
display(results_df.groupby('model')[metric_columns].mean())

Unnamed: 0_level_0,map@10,novelty,AvgRecPopularity
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
itemknn_cosine,0.017091,10.030123,44.131601
itemknn_tfidf,0.044462,8.799985,117.712863
popular_all,0.088588,3.677358,409.481105
random,0.000891,10.756232,5.947228
userknn_cosine,0.002922,8.208261,135.315994
userknn_tfidf,0.004607,8.125517,118.847867
