In [1]:
from pprint import pprint
import warnings
warnings.filterwarnings("ignore")

import sys
sys.path.append('../')

In [6]:
import plotly.express as px
import numpy as np
import pandas as pd
import scipy as sp
import requests
from tqdm.auto import tqdm
from scipy.stats import mode
from implicit.nearest_neighbours import CosineRecommender, TFIDFRecommender, BM25Recommender
from rectools import Columns
from rectools.model_selection import TimeRangeSplitter
from rectools.metrics import Precision, Recall, MAP, MeanInvUserFreq, Serendipity, calc_metrics
from rectools.dataset.interactions import Interactions

from service.utils.user_knn import user_knn

# Data

In [7]:
interactions = pd.read_csv('../data/kion_train/interactions.csv')
users = pd.read_csv('../data/kion_train/users.csv')
items = pd.read_csv('../data/kion_train/items.csv')

interactions.shape, users.shape, items.shape

((5476251, 5), (840197, 5), (15963, 14))

In [8]:
interactions.rename(
    columns={
        'last_watch_dt': Columns.Datetime,
        'total_dur': Columns.Weight
    }, 
    inplace=True) 

interactions[Columns.Datetime] = pd.to_datetime(interactions[Columns.Datetime])

## Intersection

In [9]:
pd.concat([interactions.head(), interactions.tail()])

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0
5476246,648596,12225,2021-08-13,76,0.0
5476247,546862,9673,2021-04-13,2308,49.0
5476248,697262,15297,2021-08-20,18307,63.0
5476249,384202,16197,2021-04-19,6203,100.0
5476250,319709,4436,2021-08-15,3921,45.0


In [10]:
print(f"Interactions dataframe shape: {interactions.shape}")
print(f"Unique users in interactions: {interactions[Columns.User].nunique()}")
print(f"Unique items in interactions: {interactions[Columns.Item].nunique()}")

Interactions dataframe shape: (5476251, 5)
Unique users in interactions: 962179
Unique items in interactions: 15706


In [11]:
max_date = interactions[Columns.Datetime].max()
min_date = interactions[Columns.Datetime].min()

print(f"min date in interactions: {min_date}")
print(f"max date in interactions: {max_date}")

min date in interactions: 2021-03-13 00:00:00
max date in interactions: 2021-08-22 00:00:00


In [12]:
interactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5476251 entries, 0 to 5476250
Data columns (total 5 columns):
 #   Column       Dtype         
---  ------       -----         
 0   user_id      int64         
 1   item_id      int64         
 2   datetime     datetime64[ns]
 3   weight       int64         
 4   watched_pct  float64       
dtypes: datetime64[ns](1), float64(1), int64(3)
memory usage: 208.9 MB


## Users

In [13]:
pd.concat([users.head(), users.tail()])

Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,age_25_34,income_60_90,М,1
1,962099,age_18_24,income_20_40,М,0
2,1047345,age_45_54,income_40_60,Ж,0
3,721985,age_45_54,income_20_40,Ж,0
4,704055,age_35_44,income_60_90,Ж,0
840192,339025,age_65_inf,income_0_20,Ж,0
840193,983617,age_18_24,income_20_40,Ж,1
840194,251008,,,,0
840195,590706,,,Ж,0
840196,166555,age_65_inf,income_20_40,Ж,0


In [14]:
print(f"Users dataframe shape {users.shape}")
print(f"Unique users: {users['user_id'].nunique()}")

Users dataframe shape (840197, 5)
Unique users: 840197


## Items

In [15]:
pd.concat([items.head(2), items.tail(2)])

Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords
0,10711,film,Поговори с ней,Hable con ella,2002.0,"драмы, зарубежные, детективы, мелодрамы",Испания,,16.0,,Педро Альмодовар,"Адольфо Фернандес, Ана Фернандес, Дарио Гранди...",Мелодрама легендарного Педро Альмодовара «Пого...,"Поговори, ней, 2002, Испания, друзья, любовь, ..."
1,2508,film,Голые перцы,Search Party,2014.0,"зарубежные, приключения, комедии",США,,16.0,,Скот Армстронг,"Адам Палли, Брайан Хаски, Дж.Б. Смув, Джейсон ...",Уморительная современная комедия на популярную...,"Голые, перцы, 2014, США, друзья, свадьбы, прео..."
15961,4538,series,Среди камней,Darklands,2019.0,"драмы, спорт, криминал",Россия,0.0,18.0,,"Марк О’Коннор, Конор МакМахон","Дэйн Уайт О’Хара, Томас Кэйн-Бирн, Джудит Родд...",Семнадцатилетний Дэмиен мечтает вырваться за п...,"Среди, камней, 2019, Россия"
15962,3206,series,Гоша,,2019.0,комедии,Россия,0.0,16.0,,Михаил Миронов,"Мкртыч Арзуманян, Виктория Рунцова","Добродушный Гоша не может выйти из дома, чтобы...","Гоша, 2019, Россия"


In [16]:
print(f"Items dataframe shape {items.shape}")
print(f"Unique item_id: {items['item_id'].nunique()}")

Items dataframe shape (15963, 14)
Unique item_id: 15963


#  userkNN model  CV

In [17]:
fig = px.bar(interactions.groupby(Columns.Datetime)[Columns.User].agg('count'))
fig.show()

Из графика видны **недельные тенденции** просмотров, поэтому следует fold-ы разделять по 7 дней, но т.к. на семинаре дали "намек", что private dataset имеет количество дней, меньшее чем 7. Поэтому фолды будут разбиваться на **5 и 7 дней**

In [18]:
pd.to_datetime('23-05-2021', format='%d-%m-%Y').weekday()

6

### train test split

In [19]:
def create_data_range(
    last_date: pd.Timestamp, 
    n_folds: int = 7, 
    unit: str = "W", 
    n_units: int = 1, 
    show: bool = True,
):
    periods = n_folds + 1
    freq = f"{n_units}{unit}"
    
    start_date = last_date - pd.Timedelta(n_folds * n_units + n_units, unit=unit)  
    
    date_range = pd.date_range(start=start_date, periods=periods, freq=freq, tz=last_date.tz)
    
    if show:
        print(
            f"start_date: {start_date}\n"
            f"last_date: {last_date}\n"
            f"periods: {periods}\n"
            f"freq: {freq}\n"
            f"Test fold borders: {date_range.values.astype('datetime64[D]')}\n"
        )
        
    return date_range

In [20]:
CONFIG_CV = {
    "cv_v1": {
        "n_folds": 7,
        "unit": "W",
        "n_units": 1,
    },
    "cv_v2": {
        "n_folds": 7,
        "unit": "D",
        "n_units": 5,
    }, 
}

In [21]:
last_date = interactions[Columns.Datetime].max().normalize()
last_date

Timestamp('2021-08-22 00:00:00')

In [22]:
print("***Folds v1***")
date_range_v1 = create_data_range(
    last_date, 
    n_folds=CONFIG_CV["cv_v2"]["n_folds"], 
    unit=CONFIG_CV["cv_v2"]["unit"], 
    n_units=CONFIG_CV["cv_v2"]["n_units"]
)

***Folds v1***
start_date: 2021-07-13 00:00:00
last_date: 2021-08-22 00:00:00
periods: 8
freq: 5D
Test fold borders: ['2021-07-13' '2021-07-18' '2021-07-23' '2021-07-28' '2021-08-02'
 '2021-08-07' '2021-08-12' '2021-08-17']



**генерируем фолды** 

In [28]:
date_range_v1

DatetimeIndex(['2021-07-13', '2021-07-18', '2021-07-23', '2021-07-28',
               '2021-08-02', '2021-08-07', '2021-08-12', '2021-08-17'],
              dtype='datetime64[ns]', freq='5D')

In [41]:
cv_v1 = TimeRangeSplitter(
    test_size='6D',
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)
print(f"Real number of folds: {cv_v1.get_test_fold_borders(Interactions(interactions))}")

CV = [cv_v1]

Real number of folds: [(Timestamp('2021-08-17 00:00:00', freq='6D'), Timestamp('2021-08-23 00:00:00', freq='6D'))]


**Формируем метрики**

In [42]:
metrics = {
    "prec@10": Precision(k=10),
    "recall@10": Recall(k=10),
    "MAP@10": MAP(k=10),
    "novelty": MeanInvUserFreq(k=10),
    "serendipity": Serendipity(k=10),
}

In [43]:
K = [30, 40]
models = dict()

for k in K:
    models[f"cosine_userknn_K{k}"] = CosineRecommender(K=k)
    models[f"tfidf_userknn_K{k}"] = TFIDFRecommender(K=k)
    models[f"bm25_userknn_K{k}"] = BM25Recommender(K=k)

models

{'cosine_userknn_K30': <implicit.nearest_neighbours.CosineRecommender at 0x7f01eff38a60>,
 'tfidf_userknn_K30': <implicit.nearest_neighbours.TFIDFRecommender at 0x7f01ee592b90>,
 'bm25_userknn_K30': <implicit.nearest_neighbours.BM25Recommender at 0x7f01ee592470>,
 'cosine_userknn_K40': <implicit.nearest_neighbours.CosineRecommender at 0x7f01ee591c30>,
 'tfidf_userknn_K40': <implicit.nearest_neighbours.TFIDFRecommender at 0x7f01ee5931f0>,
 'bm25_userknn_K40': <implicit.nearest_neighbours.BM25Recommender at 0x7f01ee5923e0>}

## Training

In [44]:
N_USERS = 50

In [46]:
%%time

results = []

for idx, cv in enumerate(CV):
    print(f"\n CV version {idx}")
    fold_iterator = cv.split(Interactions(interactions), collect_fold_stats=True)

    for i_fold, (train_ids, test_ids, fold_info) in enumerate(fold_iterator):
        print(f"\n==================== Fold {i_fold}")
        pprint(fold_info)

        df_train = interactions.iloc[train_ids].copy()
        df_test = interactions.iloc[test_ids][Columns.UserItem].copy()

        catalog = df_train[Columns.Item].unique()

        for model_name, model in models.items():
            userknn_model = user_knn.UserKnn(model=model, N_users=N_USERS, use_weight_idf=True)
            userknn_model.fit(df_train)

            if 'bm25' in model_name:
                recos = userknn_model.predict(df_test, bmp25=True)
            else:
                recos = userknn_model.predict(df_test)

            metric_values = calc_metrics(
                metrics,
                reco=recos,
                interactions=df_test,
                prev_interactions=df_train,
                catalog=catalog,
            )

            full_model_name = f"{model_name}_cv-{idx}"
            fold = {"fold": i_fold, "model": full_model_name}
            fold.update(metric_values)
            results.append(fold)


 CV version 0

{'end': Timestamp('2021-08-23 00:00:00', freq='6D'),
 'i_split': 0,
 'start': Timestamp('2021-08-17 00:00:00', freq='6D'),
 'test': 264626,
 'test_items': 6478,
 'test_users': 102083,
 'train': 5106361,
 'train_items': 15589,
 'train_users': 913604}


  0%|          | 0/15589 [00:00<?, ?it/s]

ValueError: not enough values to unpack (expected 2, got 0)

Работало больше 10 часов, случайно при перезапуске ноутбука была вызвана ячейка и остановлена, поэтому завершилась с ошибкой, поэтому ошибку убрали для лучшего вида

In [47]:
df_metrics = pd.DataFrame(results)
df_metrics

In [47]:
df_metrics.to_pickle("../data/hw_3/df_metrics.pickle")

In [48]:
df_metrics.groupby('model').mean()

Unnamed: 0_level_0,fold,prec@10,recall@10,MAP@10,novelty,serendipity
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
bm25_userknn_K30_cv-0,3.0,0.002623,0.01298,0.002507,9.563068,9.1e-05
bm25_userknn_K40_cv-0,3.0,0.00232,0.011307,0.00223,9.827477,9e-05
cosine_userknn_K30_cv-0,3.0,0.003241,0.018466,0.003272,8.451809,4.5e-05
cosine_userknn_K40_cv-0,3.0,0.003008,0.016994,0.003028,8.673252,4.7e-05
tfidf_userknn_K30_cv-0,3.0,0.005928,0.034234,0.006449,8.272573,5.8e-05
tfidf_userknn_K40_cv-0,3.0,0.005826,0.0336,0.006334,8.404775,6.1e-05


In [49]:
df_metrics.groupby('model').std()[metrics.keys()]

Unnamed: 0_level_0,prec@10,recall@10,MAP@10,novelty,serendipity
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
bm25_userknn_K30_cv-0,7.2e-05,0.000612,8.3e-05,0.104468,7e-06
bm25_userknn_K40_cv-0,7.4e-05,0.000442,8.1e-05,0.097359,7e-06
cosine_userknn_K30_cv-0,0.000231,0.001749,0.000314,0.074699,3e-06
cosine_userknn_K40_cv-0,0.000213,0.001603,0.000295,0.06931,3e-06
tfidf_userknn_K30_cv-0,0.000398,0.003003,0.000577,0.066627,5e-06
tfidf_userknn_K40_cv-0,0.000321,0.002534,0.000487,0.059565,4e-06


по **ofline** метрикам лучше всего себя показывает модель TFIDFRecommender
TFIDFRecommender подбор К

# Подбор оптимального K для TFIDFRecommender

In [48]:
N_USERS = 50

# Т.к. метрики для К 30 и 40 уже есть
K = [k for k in range(50, 71, 10)]
models = dict()

for k in K:
    models[f"tfidf_userknn_K{k}"] = TFIDFRecommender(K=k)
models

{'tfidf_userknn_K50': <implicit.nearest_neighbours.TFIDFRecommender at 0x7f01d1b999c0>,
 'tfidf_userknn_K60': <implicit.nearest_neighbours.TFIDFRecommender at 0x7f01d1b99e10>,
 'tfidf_userknn_K70': <implicit.nearest_neighbours.TFIDFRecommender at 0x7f01d1b9a2f0>}

In [None]:
%%time

results_idf = []

fold_iterator = cv_v1.split(Interactions(interactions), collect_fold_stats=True)

for i_fold, (train_ids, test_ids, fold_info) in enumerate(fold_iterator):
    print(f"\n==================== Fold {i_fold}")
    pprint(fold_info)

    df_train = interactions.iloc[train_ids].copy()
    df_test = interactions.iloc[test_ids][Columns.UserItem].copy()

    catalog = df_train[Columns.Item].unique()

    for model_name, model in models.items():
        userknn_model = UserKnn(model=model, N_users=N_USERS)
        userknn_model.fit(df_train)
        recos = userknn_model.predict(df_test)

        metric_values = calc_metrics(
            metrics,
            reco=recos,
            interactions=df_test,
            prev_interactions=df_train,
            catalog=catalog,
        )

        full_model_name = f"{model_name}"
        fold = {"fold": i_fold, "model": full_model_name}
        fold.update(metric_values)
        results_idf.append(fold)


{'End date': Timestamp('2021-07-18 00:00:00', freq='5D'),
 'Start date': Timestamp('2021-07-13 00:00:00', freq='5D'),
 'Test': 156580,
 'Test items': 5793,
 'Test users': 68150,
 'Train': 3281612,
 'Train items': 14754,
 'Train users': 652905}


  0%|          | 0/652905 [00:00<?, ?it/s]

In [None]:
df_metrics_tfidf = pd.DataFrame(results_idf)
df_metrics_tfidf

# Train TFIDFRecommender on all data

Обучение TFIDFRecommender на всём объеме данных

In [49]:
%%time

results = []

df_train = interactions.copy()
catalog = df_train[Columns.Item].unique()

tfidf_model = TFIDFRecommender(K=30)
userknn_model = user_knn.UserKnn(model=tfidf_model, N_users=50, use_weight_idf=True)
userknn_model.fit(df_train)

  0%|          | 0/15706 [00:00<?, ?it/s]

CPU times: user 34.3 s, sys: 2.7 s, total: 37 s
Wall time: 33.9 s


In [50]:
import dill

with open('../service/weights/userKNN/userknn_tfidf_k30.dill', 'wb') as f:
    dill.dump(userknn_model.user_knn, f)

In [51]:
df_train.iloc[0]

user_id                     176549
item_id                       9506
datetime       2021-05-11 00:00:00
weight                      4250.0
watched_pct                   72.0
Name: 0, dtype: object

In [52]:
555555555 in df_train[Columns.User].tolist()

False

In [53]:
pd.concat([interactions.head(), interactions.tail()])

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250.0,72.0
1,699317,1659,2021-05-29,8317.0,100.0
2,656683,7107,2021-05-09,10.0,0.0
3,864613,7638,2021-07-05,14483.0,100.0
4,964868,9506,2021-04-30,6725.0,100.0
5476246,648596,12225,2021-08-13,76.0,0.0
5476247,546862,9673,2021-04-13,2308.0,49.0
5476248,697262,15297,2021-08-20,18307.0,63.0
5476249,384202,16197,2021-04-19,6203.0,100.0
5476250,319709,4436,2021-08-15,3921.0,45.0


In [54]:
interactions['user_id'].unique().shape

(962179,)

In [55]:
import dill

with open('../service/weights/userKNN/userknn_tfidf_k30.dill', 'rb') as f:
    userknn = dill.load(f)

userknn.similar_items(962178, 10)

(array([], dtype=float64), array([], dtype=float64))

# Popular Model

In [56]:
from rectools.models import PopularModel
from rectools.dataset import Dataset

In [57]:
max_date = interactions[Columns.Datetime].max().normalize()
max_date

Timestamp('2021-08-22 00:00:00')

In [58]:
train = interactions[[Columns.User, Columns.Item, Columns.Weight, Columns.Datetime]][
            interactions[Columns.Datetime] < max_date - pd.Timedelta(5, "D")]

test = interactions[[Columns.User, Columns.Item, Columns.Weight, Columns.Datetime]][
           interactions[Columns.Datetime] >= max_date - pd.Timedelta(5, "D")]

dataset_train = Dataset.construct(train)

In [59]:
popilarity_models = {
    "popular": PopularModel(),
    "popular_mw": PopularModel(popularity="mean_weight")
}

In [60]:
popilarity_models["popular"].fit(dataset_train)
popilarity_models["popular_mw"].fit(dataset_train);

In [61]:
popilarity_models["popular"].popularity_list[0][:10]

array([ 24,  20,  31,  15, 167,  81,  89, 135, 355, 116])

In [62]:
popilarity_models["popular_mw"].popularity_list[0][:10]

array([11363, 11681, 12841, 13017,  2069, 13691, 13552, 13397, 11774,
       12913])

In [63]:
pecos_pop = popilarity_models["popular"].recommend(
    users=test[Columns.User].unique(),
    dataset=dataset,
    k=100,
    filter_viewed=False,
)

pecos_pop_mw = popilarity_models["popular_mw"].recommend(
    users=test[Columns.User].unique(),
    dataset=dataset,
    k=100,
    filter_viewed=False,
)

NameError: name 'dataset' is not defined

In [152]:
metrics = {
    "prec@5": Precision(k=5),
    "recall@5": Recall(k=5),
    "MAP@5": MAP(k=5),
    "prec@10": Precision(k=10),
    "recall@10": Recall(k=10),
    "MAP@20": MAP(k=20),
    "prec@20": Precision(k=20),
    "recall@20": Recall(k=20),
    "MAP@100": MAP(k=100),
    "prec@100": Precision(k=100),
    "recall@100": Recall(k=100),
    "MAP@100": MAP(k=100),
    "novelty": MeanInvUserFreq(k=10),
    "serendipity": Serendipity(k=10),
}
catalog = train[Columns.Item].unique()
metric_values_pop = calc_metrics(metrics, pecos_pop, test, train, catalog)
metric_values_pop_mean_weight = calc_metrics(metrics, pecos_pop_mw, test, train, catalog)

In [153]:
metric_values_pop

{'prec@5': 0.0017855613317256697,
 'recall@5': 0.004623809755660008,
 'prec@10': 0.0011648975773029461,
 'recall@10': 0.005682095875283048,
 'prec@20': 0.0010502526799891945,
 'recall@20': 0.00880186008464912,
 'prec@100': 0.003247020220987923,
 'recall@100': 0.16609031082955295,
 'MAP@5': 0.0013179725619140792,
 'MAP@20': 0.0016695313583723814,
 'MAP@100': 0.005578924867474493,
 'novelty': 9.976033936531364,
 'serendipity': 1.2752762676592953e-05}

In [154]:
metric_values_pop_mean_weight

{'prec@5': 9.09252633867684e-05,
 'recall@5': 0.00014799438063171262,
 'prec@10': 4.612151041357817e-05,
 'recall@10': 0.00015458316783365238,
 'prec@20': 2.635514880775895e-05,
 'recall@20': 0.00016946607539568094,
 'prec@100': 0.00015147621777259455,
 'recall@100': 0.0065476971391510656,
 'MAP@5': 3.0257754846536496e-05,
 'MAP@20': 3.1771198360212185e-05,
 'MAP@100': 0.00011355765992119742,
 'novelty': 17.423655787689828,
 'serendipity': 1.8991632826477633e-06}

**На офлайн метриках выигрывает обычная модель по популярному**

# Save item_idf data

Создаем датасет со взвешенными item-ами по механизму idf для использования в будущем

In [65]:
from collections import Counter

In [66]:
item_cnt = Counter(interactions['item_id'].values)
item_idf = pd.DataFrame.from_dict(item_cnt, orient='index', columns=['doc_freq']).reset_index()
n = interactions.shape[0]
item_idf['idf'] = item_idf['doc_freq'].apply(lambda x: np.log((1 + n) / (1 + x) + 1))
del item_idf['doc_freq']
item_idf

Unnamed: 0,index,idf
0,9506,7.150811
1,1659,8.524953
2,7107,5.821207
3,7638,8.407093
4,6686,7.778734
...,...,...
15701,7833,14.822785
15702,9125,14.822785
15703,10064,14.822785
15704,13019,14.822785


In [67]:
item_idf = item_idf.sort_values("idf", ascending=False)
item_idf.to_csv('../data/kion_train/items_idf.csv', index=False)