In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import requests
from tqdm.auto import tqdm
from scipy.stats import mode 
from pprint import pprint
from implicit.nearest_neighbours import CosineRecommender
import warnings
warnings.filterwarnings("ignore")

from rectools import Columns

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 200)

In [2]:
interactions = pd.read_csv('../data/kion_train/interactions.csv')

interactions.rename(columns={'last_watch_dt': Columns.Datetime,
                            'total_dur': Columns.Weight}, 
                    inplace=True) 

interactions['datetime'] = pd.to_datetime(interactions['datetime'])

In [3]:
from rectools.model_selection import TimeRangeSplitter
from rectools.dataset import Interactions

# # generator of folds
cv = TimeRangeSplitter(
    test_size='7D',
    # n_splits=4,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)
# print(f"Real number of folds: {cv.get_n_splits(Interactions(interactions))}")

(train_ids, test_ids, fold_info) = cv.split(Interactions(interactions), collect_fold_stats=True).__next__()

train = interactions.loc[train_ids]
test = interactions.loc[test_ids]

users_inv_mapping = dict(enumerate(train['user_id'].unique()))
users_mapping = {v: k for k, v in users_inv_mapping.items()}

items_inv_mapping = dict(enumerate(train['item_id'].unique()))
items_mapping = {v: k for k, v in items_inv_mapping.items()}

print(f"users_mapping amount: {len(users_mapping)}")
print(f"items_mapping amount: {len(items_mapping)}")

from rectools.dataset import Dataset

dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=None,
    item_features_df=None
)

users_mapping amount: 906071
items_mapping amount: 15577


In [11]:
import mlflow
from implicit.nearest_neighbours import CosineRecommender
from rectools.models.implicit_knn import ImplicitItemKNNWrapperModel
from rectools.metrics import Precision, Recall, MeanInvUserFreq, MAP, Serendipity, calc_metrics
import time
import sys


mlflow.set_tracking_uri('http://localhost:5000')
list_of_prec = []

def mlflow_logger(
        experiment_nm,
        model_nm,
        K, 
        model,
        dataset,
        train, 
        test, 
        ):
    start_time = time.time()
    
    model.fit(dataset)

    recs = model.recommend(
        test['user_id'].unique(), 
        dataset=dataset, 
        k=10, 
        filter_viewed=False
    )

    metrics = {
        "prec@10": Precision(k=10),
        "recall@10": Recall(k=10),
        "MAP@10": MAP(k=10),
        "novelty": MeanInvUserFreq(k=10),
        "serendipity": Serendipity(k=10),
    }

    catalog = train['item_id'].unique()

    metric_values_icosine = calc_metrics(
            metrics,
            reco=recs,
            interactions=test,
            prev_interactions=train,
            catalog=catalog
        )
    
    list_of_prec.append(metric_values_icosine['prec@10'])

    mlflow.set_experiment(experiment_nm) 

    finish_time = time.time()

    with mlflow.start_run(run_name=f'{experiment_nm} K = {K}') as run:
        mlflow.log_metric('prec10', metric_values_icosine['prec@10'])
        mlflow.log_metric('recall10', metric_values_icosine['recall@10'])
        mlflow.log_metric('MAP10', metric_values_icosine['MAP@10'])
        mlflow.log_metric('novelty10', metric_values_icosine['novelty'])
        mlflow.log_metric('serendipity10', metric_values_icosine['serendipity'])

        mlflow.log_metric('working_time', finish_time - start_time)
        mlflow.log_metric('size_of_model', sys.getsizeof(model))

        mlflow.log_metric('max_prec10', max(list_of_prec))

        artifact_path = "model"
        run_id = run.info.run_id
        model_uri = "runs:/{run_id}/{artifact_path}".format(run_id=run_id, artifact_path=artifact_path)
        mlflow.register_model(model_uri, name=model_nm) 

        mlflow.set_tag('test_run', True)



K_vals = [30, 40, 45, 50]

for K in K_vals:
    model = ImplicitItemKNNWrapperModel(model=CosineRecommender(K=K))

    mlflow_logger(experiment_nm='item_knn', 
                  model_nm=f'ImplicitItemKNNWrapperModel{K}',
                  K=K,
                  model=model,
                  dataset=dataset,
                  train=train,
                  test=test
                )


Registered model 'ImplicitItemKNNWrapperModel30' already exists. Creating a new version of this model...
2023/10/30 02:04:29 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: ImplicitItemKNNWrapperModel30, version 6
Created version '6' of model 'ImplicitItemKNNWrapperModel30'.
Registered model 'ImplicitItemKNNWrapperModel40' already exists. Creating a new version of this model...
2023/10/30 02:04:58 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: ImplicitItemKNNWrapperModel40, version 4
Created version '4' of model 'ImplicitItemKNNWrapperModel40'.
Registered model 'ImplicitItemKNNWrapperModel45' already exists. Creating a new version of this model...
2023/10/30 02:05:27 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: ImplicitItemKNNWrapperModel45, version 3
Cre

In [12]:
# get info from mlflow api

experiment_name = 'item_knn'
experiment_id = mlflow.set_experiment(experiment_name).experiment_id
runs = mlflow.search_runs(experiment_ids=experiment_id)
print(f'maximum recall@10 in {experiment_name} experiments =', max(runs['metrics.recall10']))
print(f'maximum MAP@10 in {experiment_name} experiments =', max(runs['metrics.MAP10']))
max_recall = max(runs['metrics.recall10'])

maximum recall@10 in item_knn experiments = 0.08648616613372442
maximum MAP@10 in item_knn experiments = 0.020808770004909608


In [13]:
# best model info
runs[runs['metrics.recall10'] == max_recall]

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.recall10,metrics.serendipity10,metrics.prec10,metrics.working_time,metrics.max_prec10,metrics.size_of_model,metrics.novelty10,metrics.MAP10,tags.mlflow.source.git.commit,tags.mlflow.runName,tags.mlflow.user,tags.test_run,tags.mlflow.source.name,tags.mlflow.source.type
0,53bda1e5a3fd4ae0a40f37800c2de7bd,331789875642997515,FINISHED,mlflow-artifacts:/331789875642997515/53bda1e5a3fd4ae0a40f37800c2de7bd/artifacts,2023-10-29 20:06:01.212000+00:00,2023-10-29 20:06:01.292000+00:00,0.086486,6.2e-05,0.015804,33.806886,0.015804,48.0,8.050896,0.020809,b859f71a1e62b753cd767782371f57191ef8d23a,item_knn K = 50,bazilyq,True,/home/bazilyq/Рабочий стол/RecSys/new_work/RecoServiceTemplate/.venv/lib/python3.10/site-packages/ipykernel_launcher.py,LOCAL


# ItemKNN TFIDFRecommender

In [22]:
from implicit.nearest_neighbours import TFIDFRecommender
from rectools.models.implicit_knn import ImplicitItemKNNWrapperModel

item_knn_tfidf = ImplicitItemKNNWrapperModel(model=TFIDFRecommender(K=30))
item_knn_tfidf.fit(dataset);

In [24]:
recs_itemknn_tfidf = item_knn_tfidf.recommend(
    test['user_id'].unique(), 
    dataset=dataset, 
    k=10, 
    filter_viewed=False 
)

In [26]:
recs_itemknn_tfidf.head()

Unnamed: 0,user_id,item_id,score,rank
0,1016458,10440,21745.376927,1
1,1016458,4457,10234.863308,2
2,1016458,7102,8987.878129,3
3,1016458,12192,8957.109813,4
4,1016458,1986,8369.832448,5


In [33]:
from rectools.metrics import Precision, Recall, MeanInvUserFreq, MAP, Serendipity, calc_metrics

metrics = {
    "prec@10": Precision(k=10),
    "recall@10": Recall(k=10),
    "MAP@10": MAP(k=10),
    "novelty": MeanInvUserFreq(k=10),
    "serendipity": Serendipity(k=10),
}

catalog = train['item_id'].unique()

metric_values_itemknn_tfidf = calc_metrics(
            metrics,
            reco=recs_itemknn_tfidf,
            interactions=test,
            prev_interactions=train,
            catalog=catalog
        )

metric_values_itemknn_tfidf

{'prec@10': 0.023772589549238603,
 'recall@10': 0.12652382351172245,
 'MAP@10': 0.03005237337960426,
 'novelty': 6.699663403861505,
 'serendipity': 0.00010222896681730396}

# UserKNN BMP25

In [28]:
from implicit.nearest_neighbours import BM25Recommender
from rectools.models.implicit_knn import ImplicitItemKNNWrapperModel

item_knn_bmp = ImplicitItemKNNWrapperModel(model=BM25Recommender(K=30))
item_knn_bmp.fit(dataset);

In [29]:
recs_itemknn_bmp = item_knn_bmp.recommend(
    test['user_id'].unique(), 
    dataset=dataset, 
    k=10, 
    filter_viewed=False 
)

recs_itemknn_bmp.head()

Unnamed: 0,user_id,item_id,score,rank
0,1016458,10440,685454700000.0,1
1,1016458,15297,232313800000.0,2
2,1016458,13865,172474000000.0,3
3,1016458,9728,138320800000.0,4
4,1016458,4151,114935800000.0,5


In [31]:
from rectools.metrics import Precision, Recall, MeanInvUserFreq, MAP, Serendipity, calc_metrics

metrics = {
    "prec@10": Precision(k=10),
    "recall@10": Recall(k=10),
    "MAP@10": MAP(k=10),
    "novelty": MeanInvUserFreq(k=10),
    "serendipity": Serendipity(k=10),
}

catalog = train['item_id'].unique()

metric_values_itemknn_bmp = calc_metrics(
            metrics,
            reco=recs_itemknn_bmp,
            interactions=test,
            prev_interactions=train,
            catalog=catalog
        )

metric_values_itemknn_bmp

{'prec@10': 0.03252208701450242,
 'recall@10': 0.1683399650610623,
 'MAP@10': 0.04827657497255996,
 'novelty': 3.9201705312554833,
 'serendipity': 2.616232292298612e-05}