In [1]:
from pprint import pprint

import numpy as np
import pandas as pd

from tqdm.auto import tqdm

from rectools import Columns
from rectools.dataset import Interactions, Dataset
from rectools.metrics import MRR, MAP, Precision, Recall, MeanInvUserFreq, Serendipity, calc_metrics
from rectools.models import RandomModel, PopularModel
from rectools.model_selection import TimeRangeSplitter
from implicit.nearest_neighbours import CosineRecommender, TFIDFRecommender
from userknn import UserKnn
import pickle
import random



# Load data

In [2]:
%%time
# !wget https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip -O ../data/data_original.zip
# !unzip ../data/data_original.zip

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 8.11 µs


In [3]:
# Read the three raw tables (interactions, users, items) in one pass;
# the tuple order below matches the unpacking order on the left.
interactions, users, items = map(
    pd.read_csv,
    (
        '../data/data_original/interactions.csv',
        '../data/data_original/users.csv',
        '../data/data_original/items.csv',
    ),
)

In [4]:
# Rename the raw columns to the names RecTools expects (weight / datetime),
# then parse the datetime column into a proper pandas datetime dtype.
interactions = (
    interactions
    .rename(columns={'total_dur': Columns.Weight, 'last_watch_dt': Columns.Datetime})
    .assign(datetime=lambda frame: pd.to_datetime(frame['datetime']))
)

# Build model

In [5]:
# Time-based split: the last 7 days of interactions form the test set,
# everything strictly before that cutoff is used for training.
max_date = interactions['datetime'].max()
test_start = max_date - pd.Timedelta(days=7)

train = interactions.loc[interactions['datetime'] < test_start]
test = interactions.loc[interactions['datetime'] >= test_start]

# Keep only warm users in the test set (users that also appear in train).
warm_users = train['user_id'].unique()
test = test.loc[test['user_id'].isin(warm_users)]

print(f"train: {train.shape}")
print(f"test: {test.shape}")

train: (4985269, 5)
test: (349088, 5)


# Popular model

In [6]:
# Build a RecTools dataset from the training interactions only;
# no user or item features are supplied.
dataset = Dataset.construct(train, user_features_df=None, item_features_df=None)

In [7]:
# Fit the popularity baseline on the training dataset. `fit` hands back the
# fitted model (see the cell output), so the call can be chained; the trailing
# expression keeps the model repr as the cell output.
pop = PopularModel().fit(dataset)
pop

<rectools.models.popular.PopularModel at 0x7f71c081bb50>

In [8]:
# Top-20 popular items for every user known to the dataset.
# filter_viewed=True would remove already-viewed items from the recommendations;
# here they are deliberately kept.
pop_recs = pop.recommend(
    users=dataset.user_id_map.external_ids,
    dataset=dataset,
    k=20,
    filter_viewed=False,
)

pop_recs.head(n=5)

Unnamed: 0,user_id,item_id,score,rank
0,176549,10440,187877.0,1
1,176549,15297,178630.0,2
2,176549,9728,117779.0,3
3,176549,13865,113875.0,4
4,176549,4151,85117.0,5


In [9]:
# Attach human-readable titles to the popular recommendations.
# The cell overwrites `pop_recs` with a merge of itself, so a re-run would merge
# against a frame that already has `title`, producing `title_x`/`title_y` and
# eventually a duplicate-column error. Dropping any existing `title` column first
# makes the cell idempotent (errors='ignore' keeps the first run unchanged).
item_titles = items[['item_id', 'title']]
pop_recs = (
    pop_recs
    .drop(columns='title', errors='ignore')
    .merge(item_titles, on='item_id', how='left')
)
# Show the recommendations for one example user.
pop_recs[pop_recs['user_id'] == 176549]

Unnamed: 0,user_id,item_id,score,rank,title
0,176549,10440,187877.0,1,Хрустальный
1,176549,15297,178630.0,2,Клиника счастья
2,176549,9728,117779.0,3,Гнев человеческий
3,176549,13865,113875.0,4,Девятаев
4,176549,4151,85117.0,5,Секреты семейной жизни
5,176549,3734,68835.0,6,Прабабушка легкого поведения
6,176549,2657,66017.0,7,Подслушано
7,176549,4880,52909.0,8,Афера
8,176549,142,42466.0,9,Маша
9,176549,6809,39320.0,10,Дуров


In [10]:
# with open("../service/models/pop_recs.pkl", "wb") as file: # Save popular model
#     pickle.dump(pop_recs, file)

# with open("../service/models/pop_recs.pkl", "rb") as file: # Load popular model
#     pop_recs = pickle.load(file)

# Train KNN

In [11]:
# Metrics to report: ranking quality (MAP@10) and novelty (MeanInvUserFreq@10).
metrics = {
    'map@10': MAP(k=10),
    'novelty': MeanInvUserFreq(k=10),
}

# User-based KNN wrapper around implicit's TF-IDF recommender
# (K=20, num_threads=4); N_users is passed through to the project-local UserKnn.
userknn_model = UserKnn(model=TFIDFRecommender(K=20, num_threads=4), N_users=50)
userknn_model.fit(train)

# Recommendations for the users present in the hold-out week.
recos = userknn_model.predict(test)

# The catalog is the set of items seen in training.
catalog = train[Columns.Item].unique()
metric_values = calc_metrics(
    metrics,
    reco=recos,
    interactions=test,
    prev_interactions=train,
    catalog=catalog,
)

# with open("../service/models/userknn_model.pkl", "wb") as file: # Save knn model
#     pickle.dump(userknn_model, file)

# with open("../service/models/userknn_model.pkl", "rb") as file: # Load knn model
#     userknn_model = pickle.load(file)
metric_values



  0%|          | 0/896791 [00:00<?, ?it/s]

{'map@10': 0.0058447523806030265, 'novelty': 7.671744788118634}

In [12]:
# Sanity check: KNN recommendations for a single example user.
userknn_model.predict(pd.DataFrame({"user_id": [176549]}))

Unnamed: 0,user_id,item_id,score,rank
61,176549,13962,2.547538,1
47,176549,6737,2.367816,2
18,176549,5518,2.343482,3
7,176549,15469,2.328539,4
22,176549,12448,2.306933,5
40,176549,10544,2.2719,6
9,176549,5482,2.251271,7
11,176549,10688,2.236721,8
44,176549,7000,2.216956,9
53,176549,4273,2.204946,10


# Get KNN predictions for offline use

In [13]:
# Precompute KNN recommendations for a random subset of training users so they
# can be served offline. The subset is capped because, per the original note,
# the notebook kernel gets killed when more users are scored at once.
OFFLINE_SAMPLE_SIZE = 100000
OFFLINE_SAMPLE_SEED = 42

train_user_ids = sorted(train["user_id"].unique())
# A dedicated seeded generator makes the sample reproducible across kernel
# restarts without touching the global `random` state. min() avoids the
# ValueError random.sample raises when the population is smaller than the
# requested sample size.
part_train_users = random.Random(OFFLINE_SAMPLE_SEED).sample(
    train_user_ids,
    min(OFFLINE_SAMPLE_SIZE, len(train_user_ids)),
)
# NOTE(review): the second positional argument (20) is presumably the number of
# recommendations per user - confirm against UserKnn.predict.
userknn_predect = userknn_model.predict(pd.DataFrame(part_train_users, columns=["user_id"]), 20)
# Collapse to a plain {user_id: [item_id, ...]} mapping.
userknn_predect_result = (
    userknn_predect.groupby(["user_id"])
    .agg({"item_id": lambda item_ids: item_ids.tolist()})
    .to_dict()["item_id"]
)

In [14]:
# with open("../service/models/userknn_predect_offline.pkl", "wb") as file: # Save knn recs
#     pickle.dump(userknn_predect_result, file)
    
# with open("../service/models/userknn_predect_offline.pkl", "rb") as file: # Load knn recs
#     userknn_predect_result = pickle.load(file)

: 