[Оригинальный notebook](https://www.kaggle.com/code/sharthz23/implicit-lightfm/notebook)

In [None]:
from collections import defaultdict
import os

import pandas as pd
import numpy as np
from implicit.nearest_neighbours import CosineRecommender, BM25Recommender, TFIDFRecommender
from implicit.als import AlternatingLeastSquares
from matplotlib import pyplot as plt

import hvplot.pandas  # noqa

hvplot.extension('matplotlib')


In [None]:
# Switch matplotlib to the "ggplot" style sheet for every figure drawn below.
plt.style.use("ggplot")

In [None]:
%load_ext autoreload
%autoreload 2

from recs_utils.metrics import compute_metrics, implicit_cross_validate
from recs_utils.load_data import load_users, load_items, load_interactions, sample_true_rec_data
from recs_utils.simple_rec import PopularRecommender, PopularRecommenderPerAge
from recs_utils.split import train_test_split, TimeRangeSplit
from recs_utils.matrix_ops import interactions_to_csr_matrix
from recs_utils.implicit_model import ImplicitRecommender

# Подготовка данных

In [None]:
# Directory (relative to the working directory) that holds the input CSV files.
data_dir = "data"

In [None]:
# Load the three input tables through the project loaders in recs_utils.load_data.
# `df` holds the interactions and is indexed by (at least) the `user_id` and
# `item_id` levels, as the index-level lookups further down rely on.
df = load_interactions(os.path.join(data_dir, "interactions.csv"))
df_users = load_users(os.path.join(data_dir, "users.csv"))
df_items = load_items(os.path.join(data_dir, "items.csv"))

In [None]:
df_items.head()

In [None]:
# Dense integer position -> external user_id, enumerating the unique values of
# the `user_id` index level in the order pandas returns them.
users_inv_mapping = {
    position: user
    for position, user in enumerate(df.index.unique("user_id"))
}
# Reverse lookup: external user_id -> dense integer position.
users_mapping = dict(zip(users_inv_mapping.values(), users_inv_mapping.keys()))
# Displayed as the cell output: number of distinct users.
len(users_mapping)

In [None]:
# Dense integer position -> external item_id, enumerating the unique values of
# the `item_id` index level in the order pandas returns them.
items_inv_mapping = {
    position: item
    for position, item in enumerate(df.index.unique("item_id"))
}
# Reverse lookup: external item_id -> dense integer position.
items_mapping = dict(zip(items_inv_mapping.values(), items_inv_mapping.keys()))
# Displayed as the cell output: number of distinct items.
len(items_mapping)

In [None]:
# Normalise titles in place (trim surrounding whitespace, lower-case) so that
# the same title written with different casing/spacing compares equal below.
df_items["title"] = df_items["title"].str.strip().str.lower()

In [None]:
# Lookup table: df_items index value -> title.
# The column already carries df_items.index, so it is converted directly;
# re-wrapping it in pd.Series(..., index=df_items.index) was a redundant no-op.
item_titles = df_items['title'].to_dict()
# Displayed as the cell output: table size and the title of one sample item.
len(item_titles), item_titles[128115]

In [None]:
df_items.reset_index().head()

In [None]:
# Group the catalogue by title and collect, for every title, the list of
# item_id values that share it (a Series indexed by title).
title_items = df_items.reset_index().groupby('title')["item_id"].agg(list)
title_items.head(n=4)

In [None]:
# Number of item_ids behind each title ...
title_count = title_items.map(len)
# ... and how many titles have 1, 2, 3, ... items (the cell output).
title_count.value_counts()

In [None]:
# Last few titles that map to more than one item_id.
title_items[title_count > 1].tail()

In [None]:
# Catalogue rows for one specific title (presumably picked from the
# duplicated-title listing; titles were lower-cased earlier, hence the casing).
df_items[df_items['title'] == 'ящик пандоры']

In [None]:
# First few titles that map to more than one item_id.
title_items[title_count > 1].head()

In [None]:
# Catalogue rows for one specific title (presumably picked from the
# duplicated-title listing).
df_items[df_items['title'] == '451 градус по фаренгейту']

In [None]:
df.head(n=3)

In [None]:
# Re-assign the rating column from a float32 NumPy array of its values.
df['rating'] = df['rating'].to_numpy(dtype=np.float32)

# Per-item summary for two hand-picked item ids (selected on the second index
# level, all values of the first level): number of rows, rating statistics and
# the first/last interaction start date. The result is the cell output.
df.loc[(slice(None), [44681, 162716]), :].groupby('item_id').agg({
    'progress': np.size,
    'rating': ['mean', "min", "max"],
    'start_date': ['min', 'max'],
})

In [None]:
# Number of day-long folds to carve from the end of the interaction log.
folds = 7
# Midnight of the most recent interaction date.
last_date = df['start_date'].max().normalize()
# The evaluation window opens `folds` days before that.
start_date = last_date - pd.Timedelta(folds, unit="D")
start_date, last_date

In [None]:
# Time-based splitter from recs_utils.split, starting at `start_date`.
# `periods` is folds + 1 — presumably the number of range boundaries, which
# would yield `folds` intervals; confirm against TimeRangeSplit.
cv = TimeRangeSplit(start_date=start_date, periods=folds + 1)

# Displayed as the cell output: the splitter's maximum split count and the
# split count it reports for this dataset.
cv.max_n_splits, cv.get_n_splits(df, datetime_column='start_date')

In [None]:
cv.date_range

In [None]:
# Materialise every fold once so the same splits can be reused by all models.
# Each element unpacks into three values; the third is the per-fold stats record.
folds_with_stats = list(
    cv.split(
        df,
        user_column='user_id',
        item_column='item_id',
        datetime_column='start_date',
        fold_stats=True,
    )
)

# One row per fold, built from the stats record of each fold.
folds_info_with_stats = pd.DataFrame(
    [fold_stats for _train_part, _test_part, fold_stats in folds_with_stats]
)

In [None]:
folds_info_with_stats

# Implicit

In [None]:
# Use the first fold for a single hand-run experiment before cross-validation.
train_idx, test_idx, info = folds_with_stats[0]

# Select the train / test rows of the interaction table with the fold's indexers.
train = df.loc[train_idx, :]
test = df.loc[test_idx, :]
# Displayed as the cell output: shapes of both parts.
train.shape, test.shape

In [None]:
# Convert the training interactions into a sparse matrix using the dense
# user/item position mappings (helper from recs_utils.matrix_ops).
train_mat = interactions_to_csr_matrix(train, users_mapping, items_mapping)

In [None]:
# Wrap implicit's cosine item-kNN (10 neighbours) in the project recommender.
# The mappings are passed in the same order as every other ImplicitRecommender
# construction in this notebook: users_mapping, items_mapping, items_inv_mapping.
# The previous call omitted items_mapping, so items_inv_mapping landed in its slot.
cosine_model = ImplicitRecommender(CosineRecommender(K=10), users_mapping, items_mapping, items_inv_mapping)

In [None]:
# Train the cosine model on the first fold's training matrix.
cosine_model.fit(train_mat)

In [None]:
# Length of every recommendation list; also reused below as the neighbour
# count K of the kNN models and as the metric cut-off.
top_N = 10
# Take the first user appearing in the test part as a worked example.
user_id = test.index.get_level_values("user_id")[0]
print(f'Рекомендации для пользователя {user_id}')

In [None]:
# Top-N recommendations for the single example user.
pred_recs = cosine_model.recommend([user_id], n=top_N)

In [None]:
pred_recs

In [None]:
test.head()

In [None]:
# Top-N recommendations for every distinct user present in the test part
# (overwrites the single-user result computed earlier).
pred_recs = cosine_model.recommend(test.index.get_level_values("user_id").unique(), n=top_N)

In [None]:
pred_recs.head(top_N + 3)

In [None]:
pred_recs.loc[(user_id, slice(None)), :]

In [None]:
# Score the recommendations against the held-out test interactions at cut-off top_N.
metrics = compute_metrics(test, pred_recs, top_N)

In [None]:
metrics

In [None]:
def create_cosine_model():
    """Build a fresh, unfitted cosine item-kNN recommender.

    The neighbour count is the notebook-level ``top_N``; the user/item
    mappings are the notebook-level dictionaries.
    """
    knn = CosineRecommender(K=top_N)
    return ImplicitRecommender(knn, users_mapping, items_mapping, items_inv_mapping)


def create_bm25():
    """Build a fresh, unfitted BM25-weighted item-kNN recommender.

    The neighbour count is the notebook-level ``top_N``; the user/item
    mappings are the notebook-level dictionaries.
    """
    knn = BM25Recommender(K=top_N)
    return ImplicitRecommender(knn, users_mapping, items_mapping, items_inv_mapping)

def create_tfidf():
    """Build a fresh, unfitted TF-IDF-weighted item-kNN recommender.

    The neighbour count is the notebook-level ``top_N``; the user/item
    mappings are the notebook-level dictionaries.
    """
    knn = TFIDFRecommender(K=top_N)
    return ImplicitRecommender(knn, users_mapping, items_mapping, items_inv_mapping)

def create_alsm():
    """Build a fresh, unfitted ALS matrix-factorisation recommender.

    Uses 32 latent factors and 30 training iterations. The user/item
    mappings are the notebook-level dictionaries.
    """
    # The keyword is `use_native` in implicit; the previous spelling
    # `user_native` is not an AlternatingLeastSquares parameter.
    als = AlternatingLeastSquares(factors=32, iterations=30, use_native=True)
    return ImplicitRecommender(als, users_mapping, items_mapping, items_inv_mapping)

In [None]:
# Per-model cross-validation results (one entry per model) ...
cv_res_per_model = []
# ... and the model object returned for each, keyed by its model_name().
models = {}

# Run every model factory over the same pre-computed folds so the results
# are directly comparable.
for factory in (create_cosine_model, create_bm25, create_tfidf, create_alsm):
    cross_valid_res, model = implicit_cross_validate(df, items_mapping, folds_with_stats, factory, top_N)
    models[model.model_name()] = model
    cv_res_per_model.append(cross_valid_res)

In [None]:
# Stack the per-model results into one table (replaces the loop's last value).
cross_valid_res = pd.concat(cv_res_per_model)

In [None]:
# Scatter plot of MAP per fold, one colour per model, legend on top.
cross_valid_res.hvplot.scatter(x="fold", y="MAP", by="model", legend="top")

In [None]:
# Across-fold mean / std / min / max of each quality metric, per model.
cross_valid_res.groupby("model").agg({
    metric: ['mean', 'std', 'min', 'max']
    for metric in ('MRR', 'MAP', f'recall@{top_N}')
})

In [None]:
models.keys()

In [None]:
# Ask the TF-IDF model for the top_N items most similar to item 60193.
# NOTE(review): "similiar" follows the spelling of the project method being
# called; renaming it here alone would break the call.
similiar_items = models["TFIDFRecommender"].similiar_items([60193], top_N)

In [None]:
# Attach human-readable titles to the query item and to each similar item.
# dict.get is used as the mapper, so ids missing from item_titles become None
# instead of raising.
similiar_items["orig_title"] = similiar_items["item_id"].map(item_titles.get)
similiar_items["sim_titles"] = similiar_items["similiar_item_id"].map(item_titles.get)

In [None]:
similiar_items