# Initialization

In [1]:
import logging

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
%matplotlib inline
%config InlineBackend.figure_format = 'png'
%config InlineBackend.figure_format = 'retina'

# Загрузка данных

In [3]:
items = pd.read_parquet("items.par")
events = pd.read_parquet("events.par")

# Разбиение с учётом хронологии

Рекомендательные системы на практике работают с учётом хронологии. Поэтому поток событий для тренировки и валидации полезно делить на то, что уже случилось, и что ещё случится. Это позволяет проводить валидацию на тех же пользователях, на которых тренировались, но на их событиях в будущем.

# === Знакомство: "холодный" старт

In [4]:
# зададим точку разбиения
train_test_global_time_split_date = pd.to_datetime("2017-08-01").date()

train_test_global_time_split_idx = events["started_at"] < train_test_global_time_split_date
events_train = events[train_test_global_time_split_idx]
events_test = events[~train_test_global_time_split_idx]

# количество пользователей в train и test
users_train = events_train["user_id"].drop_duplicates()
users_test = events_test["user_id"].drop_duplicates()
# количество пользователей, которые есть и в train, и в test
common_users = pd.Series(list(set(users_train).intersection(set(users_test))))
print(len(users_train), len(users_test), len(common_users))

# Идентифицируйте холодных пользователей и оцените их количество.
cold_users = users_test[~users_test.isin(users_train)]

print(len(cold_users))

428220 123223 120858
2365


In [5]:
# получить топ-100 наиболее популярных книг

from sklearn.preprocessing import MinMaxScaler

top_pop_start_date = pd.to_datetime("2015-01-01").date()

item_popularity = events_train \
    .query("started_at >= @top_pop_start_date") \
    .groupby(["item_id"]).agg(users=("user_id", "nunique"), avg_rating=("rating", "mean")).reset_index()

# нормализация пользователей и среднего рейтинга, требуется для их приведения к одному масштабу
scaler = MinMaxScaler()
item_popularity[["users_norm", "avg_rating_norm"]] = scaler.fit_transform(
    item_popularity[["users", "avg_rating"]]
)

# вычисляем popularity_score, как скор популярности со штрафом за низкий рейтинг
item_popularity["popularity_score"] = (
    item_popularity["users_norm"] * item_popularity["avg_rating_norm"]
)

# сортируем по убыванию popularity_score
item_popularity = item_popularity.sort_values(by="popularity_score", ascending=False)

# выбираем первые 100 айтемов со средней оценкой avg_rating не меньше 4
top_k_pop_items = item_popularity.query("avg_rating >= 4").head(100)

top_k_pop_items

Unnamed: 0,item_id,users,avg_rating,users_norm,avg_rating_norm,popularity_score
32387,18007564,20207,4.321275,0.496596,0.830319,0.412333
32623,18143977,19462,4.290669,0.478287,0.822667,0.393471
2,3,15139,4.706057,0.372042,0.926514,0.344702
30695,16096824,16770,4.301014,0.412126,0.825253,0.340108
1916,15881,13043,4.632447,0.320529,0.908112,0.291076
...,...,...,...,...,...,...
24837,8490112,4792,4.080968,0.117747,0.770242,0.090694
33611,18966819,4361,4.374914,0.107154,0.843729,0.090409
378,3636,4667,4.098564,0.114675,0.774641,0.088832
32835,18293427,4674,4.092640,0.114847,0.773160,0.088795


# === Знакомство: первые персональные рекомендации

In [6]:
# добавляем информацию о книгах
top_k_pop_items = top_k_pop_items.merge(
    items.set_index("item_id")[["author", "title", "genre_and_votes", "publication_year"]], on="item_id")

with pd.option_context('display.max_rows', 100):
    display(top_k_pop_items[["item_id", "author", "title", "publication_year", "users", "avg_rating", "popularity_score", "genre_and_votes"]])

Unnamed: 0,item_id,author,title,publication_year,users,avg_rating,popularity_score,genre_and_votes
0,18007564,Andy Weir,The Martian,2014.0,20207,4.321275,0.412333,"{'Science Fiction': 11966, 'Fiction': 8430}"
1,18143977,Anthony Doerr,All the Light We Cannot See,2014.0,19462,4.290669,0.393471,"{'Historical-Historical Fiction': 13679, 'Fict..."
2,3,"J.K. Rowling, Mary GrandPré",Harry Potter and the Sorcerer's Stone (Harry P...,1997.0,15139,4.706057,0.344702,"{'Fantasy': 59818, 'Fiction': 17918, 'Young Ad..."
3,16096824,Sarah J. Maas,A Court of Thorns and Roses (A Court of Thorns...,2015.0,16770,4.301014,0.340108,"{'Fantasy': 14326, 'Young Adult': 4662, 'Roman..."
4,15881,"J.K. Rowling, Mary GrandPré",Harry Potter and the Chamber of Secrets (Harry...,1999.0,13043,4.632447,0.291076,"{'Fantasy': 50130, 'Young Adult': 15202, 'Fict..."
5,38447,Margaret Atwood,The Handmaid's Tale,1998.0,14611,4.23277,0.290194,"{'Fiction': 15424, 'Classics': 9937, 'Science ..."
6,11235712,Marissa Meyer,"Cinder (The Lunar Chronicles, #1)",2012.0,14348,4.179189,0.280247,"{'Young Adult': 10539, 'Fantasy': 9237, 'Scien..."
7,17927395,Sarah J. Maas,A Court of Mist and Fury (A Court of Thorns an...,2016.0,12177,4.73064,0.279094,"{'Fantasy': 10186, 'Romance': 3346, 'Young Adu..."
8,5,"J.K. Rowling, Mary GrandPré",Harry Potter and the Prisoner of Azkaban (Harr...,2004.0,11890,4.770143,0.275401,"{'Fantasy': 49784, 'Young Adult': 15393, 'Fict..."
9,13206900,Marissa Meyer,"Winter (The Lunar Chronicles, #4)",2015.0,12291,4.534293,0.266881,"{'Fantasy': 4835, 'Young Adult': 4672, 'Scienc..."


In [7]:
cold_users_events_with_recs = \
    events_test[events_test["user_id"].isin(cold_users)] \
    .merge(top_k_pop_items, on="item_id", how="left")

cold_user_items_no_avg_rating_idx = cold_users_events_with_recs["avg_rating"].isnull()
cold_user_recs = cold_users_events_with_recs[~cold_user_items_no_avg_rating_idx] \
    [["user_id", "item_id", "rating", "avg_rating"]]

In [8]:
# Проверьте количество строк, чтобы убедиться, что оно осталось прежним
original_row_count = len(events_test[events_test["user_id"].isin(cold_users)])
current_row_count = len(cold_users_events_with_recs)

# Проверяем, одинаково ли количество строк
original_row_count, current_row_count

(9672, 9672)

In [9]:
# доли событий «холодных» пользователей в events_test рекомендации в top_k_pop_items
proportion_matched_recs = len(cold_user_recs) / len(cold_user_items_no_avg_rating_idx)
proportion_matched_recs_rounded = round(proportion_matched_recs, 2)
proportion_matched_recs_rounded

0.2

In [10]:
# посчитаем метрики рекомендаций
from sklearn.metrics import mean_squared_error, mean_absolute_error

rmse = mean_squared_error(cold_user_recs["rating"], cold_user_recs["avg_rating"], squared=False)
mae = mean_absolute_error(cold_user_recs["rating"], cold_user_recs["avg_rating"])
print(round(rmse, 2), round(mae, 2))

0.78 0.62


In [11]:
cold_users_hit_ratio = cold_users_events_with_recs.groupby("user_id").agg(hits=("avg_rating", lambda x: (~x.isnull()).mean()))

print(f"Доля пользователей без релевантных рекомендаций: {(cold_users_hit_ratio == 0).mean().iat[0]:.2f}")
print(f"Среднее покрытие пользователей: {cold_users_hit_ratio[cold_users_hit_ratio != 0].mean().iat[0]:.2f}")

Доля пользователей без релевантных рекомендаций: 0.59
Среднее покрытие пользователей: 0.44


In [12]:
# Посчитайте количество уникальных пользователей и предметов
num_users = len(events['user_id'].unique())
num_items = len(events['item_id'].unique())

# количество всех ячеек в матрице
num_cells = num_users * num_items

# количество пустых ячеек в матрице
col_null_cells = len(events[events['rating'] == 0])

# Посчитайте степень разреженности
sparsity = col_null_cells / num_cells

print('Степень разреженности U-I-матрицы: ', sparsity)

Степень разреженности U-I-матрицы:  0.0


### Модель SVD

In [56]:
pip install surprise

Defaulting to user installation because normal site-packages is not writeable
Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 KB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[0mDiscarding [4;34mhttps://files.pythonhosted.org/packages/d9/8d/40ac32e703f3808159f9e2b33760cfbd6224cc7783eb663091eddc9581c2/scikit_surprise-1.1.4.tar.gz#sha256=130c45feaee9de4b8cba0aff413ad9b51b2d5c5c90d41aee4759e00059913752 (from https://pypi.org/simple/scikit-surprise/) (requires-python:>=3.8)[0m: [33mRequested unknown from https://files.pythonhosted.org/packages/d9/8d/40ac32e703f3808159f9e2b33760cfbd6224cc7783eb663091eddc9581c2/scikit_surpris

In [57]:
from surprise import Dataset, Reader
from surprise import SVD

# используем Reader из библиотеки surprise для преобразования событий (events)
# в формат, необходимый surprise
reader = Reader(rating_scale=(1, 5))
surprise_train_set = Dataset.load_from_df(events_train[['user_id', 'item_id', 'rating']], reader)
surprise_train_set = surprise_train_set.build_full_trainset()

# инициализируем модель
svd_model = SVD(n_factors=100, random_state=0)

# обучаем модель
svd_model.fit(surprise_train_set)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fd14eafee90>

In [58]:
surprise_test_set = list(events_test[['user_id', 'item_id', 'rating']].itertuples(index=False))

# получаем рекомендации для тестовой выборки
svd_predictions = svd_model.test(surprise_test_set)

In [59]:
from surprise import accuracy

rmse = accuracy.rmse(svd_predictions)
mae = accuracy.mae(svd_predictions)
                     
print(rmse, mae)

RMSE: 0.8289
MAE:  0.6474
0.8288711689059135 0.647437483750257


In [62]:
from surprise import NormalPredictor

# инициализируем состояние генератора, это необходимо для получения
# одной и той же последовательности случайных чисел, только в учебных целях
np.random.seed(0)

random_model = NormalPredictor()

random_model.fit(surprise_train_set)
random_predictions = random_model.test(surprise_test_set)

In [63]:
rmse = accuracy.rmse(random_predictions)
mae = accuracy.mae(random_predictions)

RMSE: 1.2628
MAE:  1.0018


#### Получение рекомендаций

In [65]:
def get_recommendations_svd(user_id, all_items, events, model, include_seen=True, n=5):

    """ возвращает n рекомендаций для user_id """
    
    # получим список идентификаторов всех книг
    all_items = set(events['item_id'].unique())
        
    # учитываем флаг, стоит ли уже прочитанные книги включать в рекомендации
    if include_seen:
        items_to_predict = list(all_items)
    else:
        # получим список книг, которые пользователь уже прочитал ("видел")
        seen_items = set(events[events['user_id'] == user_id]['item_id'].unique())
        
        # книги, которые пользователь ещё не читал
        # только их и будем включать в рекомендации
        items_to_predict = list(all_items - seen_items)
    
    # получаем скоры для списка книг, т. е. рекомендации
    predictions = [model.predict(user_id, item_id) for item_id in items_to_predict]
    
    # сортируем рекомендации по убыванию скора и берём только n первых
    recommendations = sorted(predictions, key=lambda x: x.est, reverse=True)[:n]
    
    return pd.DataFrame([(pred.iid, pred.est) for pred in recommendations], columns=["item_id", "score"])

In [71]:
get_recommendations_svd(1296647, items, events_test, svd_model)

Unnamed: 0,item_id,score
0,7864312,4.981188
1,25793186,4.912001
2,12174312,4.898052
3,13208,4.894869
4,33353628,4.891661


In [74]:
# выберем произвольного пользователя из тренировочной выборки ("прошлого")
user_id = events_train['user_id'].sample().iat[0]

print(f"user_id: {user_id}")

print("История (последние события, recent)")
user_history = (
    events_train
    .query("user_id == @user_id")
    .merge(items.set_index("item_id")[["author", "title", "genre_and_votes"]], on="item_id")
)
user_history_to_print = user_history[["author", "title", "started_at", "read_at", "rating", "genre_and_votes"]].tail(10)
display(user_history_to_print)

print("Рекомендации")
user_recommendations = get_recommendations_svd(user_id, items, events_train, svd_model)
user_recommendations = user_recommendations.merge(items[["item_id", "author", "title", "genre_and_votes"]], on="item_id")
display(user_recommendations)

user_id: 1095219
История (последние события, recent)


Unnamed: 0,author,title,started_at,read_at,rating,genre_and_votes
142,Jocelyn Davies,"A Beautiful Dark (A Beautiful Dark, #1)",2014-10-04,2014-12-26,4,"{'Paranormal-Angels': 368, 'Young Adult': 324,..."
143,Kelley Armstrong,"The Gathering (Darkness Rising, #1)",2013-07-31,2013-08-07,2,"{'Young Adult': 1308, 'Fantasy-Paranormal': 96..."
144,Courtney Allison Moulton,"Angelfire (Angelfire, #1)",2012-12-06,2012-12-21,5,"{'Paranormal-Angels': 613, 'Young Adult': 601,..."
145,Andrea Cremer,Rift (Nightshade Prequel #1; Nightshade World #1),2013-07-30,2013-07-31,5,"{'Fantasy': 215, 'Young Adult': 177, 'Fantasy-..."
146,Becca Fitzpatrick,"Hush, Hush (Hush, Hush, #1)",2013-12-31,2014-01-02,4,"{'Young Adult': 5341, 'Fantasy': 4355, 'Romanc..."
147,Rachel Vincent,"Rogue (Shifters, #2)",2015-03-16,2015-03-16,5,"{'Fantasy-Urban Fantasy': 558, 'Fantasy-Parano..."
148,"P.C. Cast, Kristin Cast","Marked (House of Night, #1)",2013-03-11,2013-04-04,4,"{'Young Adult': 4296, 'Paranormal-Vampires': 3..."
149,Carrie Jones,"Entice (Need, #3)",2012-07-29,2012-11-03,5,"{'Young Adult': 402, 'Fantasy': 337, 'Fantasy-..."
150,Carrie Jones,"Captivate (Need, #2)",2012-07-24,2012-07-25,5,"{'Young Adult': 558, 'Fantasy': 455, 'Fantasy-..."
151,Carrie Jones,"Need (Need, #1)",2012-04-10,2012-04-11,5,"{'Young Adult': 1122, 'Fantasy': 947, 'Fantasy..."


Рекомендации


Unnamed: 0,item_id,score,author,title,genre_and_votes
0,78,5,John McPhee,Annals of the Former World,"{'Science': 257, 'Nonfiction': 213, 'Science-G..."
1,6816890,5,José Luís Peixoto,Morreste-me,"{'Cultural-Portugal': 16, 'European Literature..."
2,394535,5,Cormac McCarthy,"Blood Meridian, or the Evening Redness in the ...","{'Fiction': 3132, 'Historical-Historical Ficti..."
3,132778,5,J.D. Robb,"Vengeance in Death (In Death, #6)","{'Mystery': 991, 'Romance': 483, 'Mystery-Crim..."
4,33163378,5,Jennifer Mathieu,Moxie,"{'Young Adult': 871, 'Contemporary': 611, 'Fem..."


# === Базовые подходы: коллаборативная фильтрация

In [13]:
import scipy
import sklearn.preprocessing

# перекодируем идентификаторы пользователей: 
# из имеющихся в последовательность 0, 1, 2, ...
user_encoder = sklearn.preprocessing.LabelEncoder()
user_encoder.fit(events["user_id"])
events_train["user_id_enc"] = user_encoder.transform(events_train["user_id"])
events_test["user_id_enc"] = user_encoder.transform(events_test["user_id"])

# перекодируем идентификаторы объектов: 
# из имеющихся в последовательность 0, 1, 2, ...
item_encoder = sklearn.preprocessing.LabelEncoder()
item_encoder.fit(items["item_id"])
items["item_id_enc"] = item_encoder.transform(items["item_id"])
events_train["item_id_enc"] = item_encoder.transform(events_train["item_id"])
events_test["item_id_enc"] = item_encoder.transform(events_test["item_id"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_train["user_id_enc"] = user_encoder.transform(events_train["user_id"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_test["user_id_enc"] = user_encoder.transform(events_test["user_id"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_train["item_id_enc"] = item_encoder.transfor

In [15]:
events_train['item_id_enc'].max()

43304

In [16]:
user_item_matrix_train

NameError: name 'user_item_matrix_train' is not defined

In [20]:
# Посчитайте количество уникальных пользователей и предметов
num_users = len(events_train['user_id'].unique())
num_items = len(events_train['item_id'].unique())

# рассчитываем размер матрицы user_item_matrix_train в гигабайтах
num_cells = (num_users * num_items) / (1024 ** 3)

print('Размер матрицы в гигабайтах: ', num_cells)

Размер матрицы в гигабайтах:  16.54028546065092


In [21]:
# создаём sparse-матрицу формата CSR 
user_item_matrix_train = scipy.sparse.csr_matrix((
    events_train["rating"],
    (events_train['user_id_enc'], events_train['item_id_enc'])),
    dtype=np.int8)

In [None]:
import sys

# sparse-матрица формата CSR меньше занимает оперативной памяти
sum([sys.getsizeof(i) for i in user_item_matrix_train.data])/1024**3

0.26370687410235405

### Модель ALS

In [25]:
pip install implicit

Defaulting to user installation because normal site-packages is not writeable
Collecting implicit
  Using cached implicit-0.7.2-cp310-cp310-manylinux2014_x86_64.whl (8.9 MB)
Installing collected packages: implicit
Successfully installed implicit-0.7.2
Note: you may need to restart the kernel to use updated packages.


In [26]:
from implicit.als import AlternatingLeastSquares

als_model = AlternatingLeastSquares(factors=50, iterations=50, regularization=0.05, random_state=0)
als_model.fit(user_item_matrix_train)

  from .autonotebook import tqdm as notebook_tqdm
  check_blas_config()
100%|██████████| 50/50 [04:35<00:00,  5.52s/it]


In [27]:
def get_recommendations_als(user_item_matrix, model, user_id, user_encoder, item_encoder, include_seen=True, n=5):
    """
    Возвращает отранжированные рекомендации для заданного пользователя
    """
    user_id_enc = user_encoder.transform([user_id])[0]
    recommendations = model.recommend(
         user_id_enc, 
         user_item_matrix[user_id_enc], 
         filter_already_liked_items=not include_seen,
         N=n)
    recommendations = pd.DataFrame({"item_id_enc": recommendations[0], "score": recommendations[1]})
    recommendations["item_id"] = item_encoder.inverse_transform(recommendations["item_id_enc"])
    
    return recommendations

In [28]:
# Выбор произвольного пользователя из тренировочной выборки
user_id = events_train['user_id'].sample().iat[0]

print(f"user_id: {user_id}")

print("История (последние события, recent)")
user_history = (
    events_train
    .query("user_id == @user_id")
    .merge(items.set_index("item_id")[["author", "title", "genre_and_votes"]], on="item_id")
)
user_history_to_print = user_history[["author", "title", "rating", "genre_and_votes"]].tail(10)
display(user_history_to_print)

print("Рекомендации")
user_recommendations = get_recommendations_als(user_item_matrix_train, als_model, user_id, user_encoder, item_encoder, include_seen=False, n=5)
user_recommendations = user_recommendations.merge(items, on="item_id")
user_recommendations["seen"] = user_recommendations["item_id"].isin(user_history["item_id"])
display(user_recommendations)

user_id: 1188575
История (последние события, recent)


Unnamed: 0,author,title,rating,genre_and_votes
1,Sarah J. Maas,A Court of Thorns and Roses (A Court of Thorns...,4,"{'Fantasy': 14326, 'Young Adult': 4662, 'Roman..."
2,Katie McGarry,"Pushing the Limits (Pushing the Limits, #1)",4,"{'Romance': 2052, 'Young Adult': 2048, 'Contem..."
3,Huntley Fitzpatrick,My Life Next Door,4,"{'Young Adult': 2288, 'Romance': 1997, 'Contem..."
4,Gayle Forman,"Just One Night (Just One Day, #2.5)",4,"{'Young Adult': 481, 'Romance': 413, 'Contempo..."
5,Julie Cross,Whatever Life Throws at You,4,"{'Romance': 211, 'Young Adult': 181, 'Contempo..."
6,Gayle Forman,"Just One Day (Just One Day, #1)",4,"{'Young Adult': 1829, 'Romance': 1490, 'Contem..."
7,Jennifer L. Armentrout,"Be with Me (Wait for You, #2)",4,"{'New Adult': 1163, 'Romance': 951, 'Contempor..."
8,Jenny Han,P.S. I Still Love You (To All the Boys I've Lo...,4,"{'Young Adult': 3659, 'Romance': 3112, 'Contem..."
9,Tabitha Suzuma,Forbidden,3,"{'Young Adult': 1280, 'Romance': 1145, 'Contem..."
10,Matthew Quick,"Forgive Me, Leonard Peacock",4,"{'Young Adult': 1539, 'Contemporary': 812, 'Fi..."


Рекомендации


Unnamed: 0,item_id_enc_x,score,item_id,author,title,description,genre_and_votes,num_pages,average_rating,ratings_count,...,country_code,language_code,format,is_ebook,isbn,isbn13,genre_and_votes_dict,genre_and_votes_str,item_id_enc_y,seen
0,24976,0.365925,6936382,Stephanie Perkins,Anna and the French Kiss (Anna and the French ...,Anna is looking forward to her senior year in ...,"{'Young Adult': 6126, 'Romance': 5253, 'Contem...",372,4.07,267364,...,US,en-US,,False,525423273.0,9780525423270.0,"{'Academic': None, 'Academic-Academia': None, ...","Young Adult 6126, Romance 5253, Contemporary 3...",24976,False
1,32860,0.344809,15749186,Jenny Han,To All the Boys I've Loved Before (To All the ...,What if all the crushes you ever had found out...,"{'Young Adult': 5919, 'Romance': 4965, 'Contem...",355,4.11,144832,...,US,eng,Paperback,False,,,"{'Academic': None, 'Academic-Academia': None, ...","Young Adult 5919, Romance 4965, Contemporary 4...",32860,False
2,28257,0.334876,9961796,Stephanie Perkins,Lola and the Boy Next Door (Anna and the Frenc...,There is an alternate cover edition for this I...,"{'Young Adult': 3437, 'Romance': 2781, 'Contem...",384,4.0,110284,...,US,en-US,Hardcover,False,525423281.0,9780525423287.0,"{'Academic': None, 'Academic-Academia': None, ...","Young Adult 3437, Romance 2781, Contemporary 2...",28257,False
3,27857,0.32637,9627755,Stephanie Perkins,Isla and the Happily Ever After (Anna and the ...,"Love ignites in the City That Never Sleeps, bu...","{'Young Adult': 2548, 'Romance': 2085, 'Contem...",339,4.1,74039,...,US,eng,Hardcover,False,525425632.0,9780525425632.0,"{'Academic': None, 'Academic-Academia': None, ...","Young Adult 2548, Romance 2085, Contemporary 2046",27857,False
4,36180,0.289114,18189606,Morgan Matson,Since You've Been Gone,It was Sloane who yanked Emily out of her shel...,"{'Contemporary': 2089, 'Young Adult': 1958, 'R...",464,4.19,62419,...,US,eng,Hardcover,False,1442435003.0,9781442435001.0,"{'Academic': None, 'Academic-Academia': None, ...","Contemporary 2089, Young Adult 1958, Romance 9...",36180,False


In [29]:
# получаем список всех возможных user_id (перекодированных)
user_ids_encoded = range(len(user_encoder.classes_))

# получаем рекомендации для всех пользователей
als_recommendations = als_model.recommend(
    user_ids_encoded, 
    user_item_matrix_train[user_ids_encoded], 
    filter_already_liked_items=False, N=100)

In [31]:
# преобразуем полученные рекомендации в табличный формат
item_ids_enc = als_recommendations[0]
als_scores = als_recommendations[1]

als_recommendations = pd.DataFrame({
    "user_id_enc": user_ids_encoded,
    "item_id_enc": item_ids_enc.tolist(), 
    "score": als_scores.tolist()})
als_recommendations = als_recommendations.explode(["item_id_enc", "score"], ignore_index=True)

# приводим типы данных
als_recommendations["item_id_enc"] = als_recommendations["item_id_enc"].astype("int")
als_recommendations["score"] = als_recommendations["score"].astype("float")

# получаем изначальные идентификаторы
als_recommendations["user_id"] = user_encoder.inverse_transform(als_recommendations["user_id_enc"])
als_recommendations["item_id"] = item_encoder.inverse_transform(als_recommendations["item_id_enc"])
als_recommendations = als_recommendations.drop(columns=["user_id_enc", "item_id_enc"])

In [33]:
als_recommendations = als_recommendations[["user_id", "item_id", "score"]]
als_recommendations.to_parquet("als_recommendations.parquet")

In [34]:
# Для удобства оценки добавим в датафрейм с рекомендациями истинные оценки из тестовой выборки:
als_recommendations = (
    als_recommendations
    .merge(events_test[["user_id", "item_id", "rating"]]
               .rename(columns={"rating": "rating_test"}), 
           on=["user_id", "item_id"], how="left")
)

In [35]:
# Подсчитать метрику NDCG для одного пользователя
import sklearn.metrics

def compute_ndcg(rating: pd.Series, score: pd.Series, k):

    """ подсчёт ndcg
    rating: истинные оценки
    score: оценки модели
    k: количество айтемов (по убыванию score) для оценки, остальные - отбрасываются
    """
    
    # если кол-во объектов меньше 2, то NDCG - не определена
    if len(rating) < 2:
        return np.nan

    ndcg = sklearn.metrics.ndcg_score(np.asarray([rating.to_numpy()]), np.asarray([score.to_numpy()]), k=k)

    return ndcg

Умея считать NDCG для одного пользователя, посчитаем данную метрику, например, для
k=5 для всех пользователей из тестовой выборки. В результате каждому пользователю будет соответствовать одно значение NDCG@5. Запись “NDCG@5” означает, что метрика NDCG считается для пяти объектов. 

In [39]:
rating_test_idx = ~als_recommendations["rating_test"].isnull()
ndcg_at_5_scores = als_recommendations[rating_test_idx].groupby("user_id").apply(lambda x: compute_ndcg(x["rating_test"], x["score"], k=5))

# Имея ряд значений NDCG@5 по пользователям, можно посчитать среднее её значение (по всем пользователям)
print('Среднее значение NDCG@5: ', round(ndcg_at_5_scores.mean(), 2))

Среднее значение NDCG@5:  0.98


In [40]:
# Оценка доли пользователей, для которых удалось посчитать метрику NDCG
total_users = len(user_encoder.classes_)
users_ndcg = ndcg_at_5_scores.notnull().sum()
users_ndcg_proportion = users_ndcg / total_users

round(users_ndcg_proportion, 2)

0.04

##### Используем модель ALS для получения похожих items с помощью метода similar_items

In [47]:
# Предположим, что item_ids для анализа уже определены
item_ids_to_analyze = [100, 200, 300]  # пример ID айтемов, которые будем анализировать

# Используем модель для получения похожих айтемов
similar_items_dict = {}
for item_id in item_ids_to_analyze:
    similar_items = als_model.similar_items(item_id, N=10)
    similar_items_dict[item_id] = similar_items

# Преобразуем результаты в DataFrame
similar_items_list = []
for item_id, (sim_item_ids, scores) in similar_items_dict.items():
    for sim_item_id, score in zip(sim_item_ids, scores):
        similar_items_list.append((item_id, sim_item_id, score))

similar_items_df = pd.DataFrame(similar_items_list, columns=["item_id", "similar_item_id", "score"])

similar_items_df

Unnamed: 0,item_id,similar_item_id,score
0,100,100,1.0
1,100,416,0.955224
2,100,8081,0.945551
3,100,240,0.940132
4,100,2962,0.935671
5,100,1479,0.932158
6,100,8742,0.932129
7,100,1817,0.926266
8,100,6934,0.925741
9,100,3335,0.925313


##### Используем модель ALS для получения похожих users с помощью метода similar_users

In [50]:
# Выберем несколько users для анализа
users_ids_to_analyze = [1, 50, 100]  # Пример ID user, которые будем анализировать

# Используем модель для получения похожих user
similar_users_dict = {}
for user_id in users_ids_to_analyze:
    similar_user = als_model.similar_users(user_id, N=10)
    similar_users_dict[user_id] = similar_user

# Преобразуем результаты в DataFrame
similar_users_list = []
for user_id, (sim_user_ids, scores) in similar_users_dict.items():
    for sim_user_id, score in zip(sim_user_ids, scores):
        similar_users_list.append((user_id, sim_user_id, score))

similar_users_df = pd.DataFrame(similar_users_list, columns=["user_id", "similar_user_id", "score"])

similar_users_df

Unnamed: 0,user_id,similar_user_id,score
0,1,1,1.0
1,1,118312,0.994112
2,1,199361,0.99349
3,1,205072,0.992585
4,1,343231,0.992469
5,1,29103,0.992195
6,1,105530,0.991213
7,1,313396,0.990874
8,1,30087,0.990061
9,1,272370,0.989844


# === Базовые подходы: контентные рекомендации

In [51]:
items["genre_and_votes"] = items["genre_and_votes"].apply(eval)

In [56]:
def get_genres(items):

    """ 
    извлекает список жанров по всем книгам, 
    подсчитывает долю голосов по каждому их них
    """
    
    genres_counter = {}
    
    for k, v, in items.iterrows():
        genre_and_votes = v["genre_and_votes"]
        if genre_and_votes is None or not isinstance(genre_and_votes, dict):
            continue
        for genre, votes in genre_and_votes.items():
            # увеличиваем счётчик жанров
            try:
                genres_counter[genre] += votes
            except KeyError:
                genres_counter[genre] = 0

    genres = pd.Series(genres_counter, name="votes")
    genres = genres.to_frame()
    genres = genres.reset_index().rename(columns={"index": "name"})
    genres.index.name = "genre_id"
    
    return genres
   
genres = get_genres(items)

In [57]:
genres["score"] = genres["votes"] / genres["votes"].sum()
genres.sort_values(by="score", ascending=False).head(10)

Unnamed: 0_level_0,name,votes,score
genre_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
25,Fantasy,6850060,0.149651
1,Fiction,6406256,0.139955
38,Classics,3414934,0.074605
18,Young Adult,3296951,0.072027
34,Romance,2422614,0.052926
5,Nonfiction,1737406,0.037957
16,Historical-Historical Fiction,1531205,0.033452
20,Mystery,1371196,0.029956
24,Science Fiction,1218917,0.026629
33,Fantasy-Paranormal,857012,0.018723


Построим матрицу вида «книга-жанр»

In [58]:
def get_item2genre_matrix(genres, items):

    genre_names_to_id = genres.reset_index().set_index("name")["genre_id"].to_dict()
    
    # list to build CSR matrix
    genres_csr_data = []
    genres_csr_row_idx = []
    genres_csr_col_idx = []
    
    for item_idx, (k, v) in enumerate(items.iterrows()):
        if v["genre_and_votes"] is None:
            continue
        for genre_name, votes in v["genre_and_votes"].items():
            genre_idx = genre_names_to_id[genre_name]
            genres_csr_data.append(int(votes))
            genres_csr_row_idx.append(item_idx)
            genres_csr_col_idx.append(genre_idx)

    genres_csr = scipy.sparse.csr_matrix((genres_csr_data, (genres_csr_row_idx, genres_csr_col_idx)), shape=(len(items), len(genres)))
    # нормализуем, чтобы сумма оценок принадлежности к жанру была равна 1
    genres_csr = sklearn.preprocessing.normalize(genres_csr, norm='l1', axis=1)
    
    return genres_csr

In [59]:
items = items.sort_values(by="item_id_enc")
all_items_genres_csr = get_item2genre_matrix(genres, items)

In [60]:
user_id = 1000010
user_events = events_train.query("user_id == @user_id")[["item_id", "rating"]]
user_items = items[items["item_id"].isin(user_events["item_id"])]

user_items_genres_csr = get_item2genre_matrix(genres, user_items)
user_items_genres_csr

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 149 stored elements and shape (22, 815)>

In [65]:
user_items_genres_csr.count_nonzero()

149

In [66]:
# вычислим склонность пользователя к жанрам как среднее взвешенное значение популяции на его оценки книг.

# преобразуем пользовательские оценки из списка в вектор-столбец
user_ratings = user_events["rating"].to_numpy() / 5
user_ratings = np.expand_dims(user_ratings, axis=1)

user_items_genres_weighted = user_items_genres_csr.multiply(user_ratings)

user_genres_scores = np.asarray(user_items_genres_weighted.mean(axis=0))

In [68]:
# выведем список жанров, которые предпочитает пользователь

user_genres = genres.copy()
user_genres["score"] = np.ravel(user_genres_scores)
user_genres = user_genres[user_genres["score"] > 0].sort_values(by=["score"], ascending=False)

user_genres.head(5)

Unnamed: 0_level_0,name,votes,score
genre_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Fiction,6406256,0.185241
38,Classics,3414934,0.103879
25,Fantasy,6850060,0.072447
5,Nonfiction,1737406,0.050865
24,Science Fiction,1218917,0.04092


Получите наиболее релевантные рекомендации для пользователя

In [73]:
from sklearn.metrics.pairwise import cosine_similarity

# вычисляем сходство между вектором пользователя и векторами по книгам
similarity_scores = cosine_similarity(all_items_genres_csr, user_genres_scores)

# преобразуем в одномерный массив
similarity_scores = similarity_scores.flatten()

# получаем индексы top-k (по убыванию значений), по сути, индексы книг (encoded)
k = 5
top_k_indices = np.argsort(similarity_scores)[-k:]

In [74]:
selected_items = items[items["item_id_enc"].isin(top_k_indices)]

with pd.option_context("max_colwidth", 100):
   display(selected_items[["author", "title", "genre_and_votes"]]) 

Unnamed: 0,author,title,genre_and_votes
80465,G.K. Chesterton,The Napoleon of Notting Hill,"{'Fiction': 166, 'Classics': 88, 'Fantasy': 44, 'Humor': 22, 'Literature': 20}"
1168335,Ray Bradbury,"Dandelion Wine (Green Town, #1)","{'Fiction': 1438, 'Classics': 914, 'Science Fiction': 529, 'Fantasy': 456, 'Young Adult': 212}"
393210,"G.K. Chesterton, Jonathan Lethem",The Man Who Was Thursday: A Nightmare,"{'Fiction': 1257, 'Classics': 929, 'Mystery': 469, 'Fantasy': 293, 'Philosophy': 156, 'Literatur..."
2244467,Samuel Butler,"Erewhon (Erewhon , #1)","{'Fiction': 162, 'Classics': 139, 'Science Fiction': 60, 'Fantasy': 55}"
39408,"Paulo Coelho, Alan R. Clarke, James Noel Smith",The Alchemist,"{'Fiction': 14023, 'Classics': 5787, 'Fantasy': 3289, 'Philosophy': 2759}"


# === Базовые подходы: валидация

In [75]:
def process_events_recs_for_binary_metrics(events_train, events_test, recs, top_k=None):

    """
    размечает пары <user_id, item_id> для общего множества пользователей признаками
    - gt (ground truth)
    - pr (prediction)
    top_k: расчёт ведётся только для top k-рекомендаций
    """

    events_test["gt"] = True
    common_users = set(events_test["user_id"]) & set(recs["user_id"])

    print(f"Common users: {len(common_users)}")
    
    events_for_common_users = events_test[events_test["user_id"].isin(common_users)].copy()
    recs_for_common_users = recs[recs["user_id"].isin(common_users)].copy()

    recs_for_common_users = recs_for_common_users.sort_values(["user_id", "score"], ascending=[True, False])

    # оставляет только те item_id, которые были в events_train, 
    # т. к. модель не имела никакой возможности давать рекомендации для новых айтемов
    events_for_common_users = events_for_common_users[events_for_common_users["item_id"].isin(events_train["item_id"].unique())]

    if top_k is not None:
        recs_for_common_users = recs_for_common_users.groupby("user_id").head(top_k)
    
    events_recs_common = events_for_common_users[["user_id", "item_id", "gt"]].merge(
        recs_for_common_users[["user_id", "item_id", "score"]], 
        on=["user_id", "item_id"], how="outer")    

    events_recs_common["gt"] = events_recs_common["gt"].fillna(False)
    events_recs_common["pr"] = ~events_recs_common["score"].isnull()
    
    events_recs_common["tp"] = events_recs_common["gt"] & events_recs_common["pr"]
    events_recs_common["fp"] = ~events_recs_common["gt"] & events_recs_common["pr"]
    events_recs_common["fn"] = events_recs_common["gt"] & ~events_recs_common["pr"]

    return events_recs_common

In [82]:
events_recs_for_binary_metrics = process_events_recs_for_binary_metrics(
  events_train,
    events_test, 
    als_recommendations, 
    top_k=10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_test["gt"] = True


Common users: 123223


In [79]:
def compute_cls_metrics(events_recs_for_binary_metric):
    
    groupper = events_recs_for_binary_metric.groupby("user_id")

    # precision = tp / (tp + fp)
    precision = groupper["tp"].sum()/(groupper["tp"].sum()+groupper["fp"].sum())
    precision = precision.fillna(0).mean()
    
    # recall = tp / (tp + fn)
    recall = groupper["tp"].sum()/(groupper["tp"].sum()+groupper["fn"].sum())
    recall = recall.fillna(0).mean()

    return precision, recall

In [81]:
precision, recall = compute_cls_metrics(events_recs_for_binary_metrics)
precision, round(recall, 3)

(0.007581376853347184, 0.014)

In [83]:
precision, recall = compute_cls_metrics(events_recs_for_binary_metrics)
precision, round(recall, 3)

(0.008732947582837622, 0.031)

# === Двухстадийный подход: метрики

# === Двухстадийный подход: модель

# === Двухстадийный подход: построение признаков