In [None]:
# расскоментируйте код ниже, чтобы установить все зависимости
!pip install -q \
    pyarrow==12.0.1 \
    polars==0.18.6 \
    pandas==2.0.3 \
    optuna==3.3.0 \
    tqdm==4.65.0 \
    numpy==1.24.3 \
    redis==4.6.0 \
    gensim==4.3.2 --user

In [3]:
# раскоментируйте код ниже, чтобы скачать данные
# !wget -q https://files.grouplens.org/datasets/movielens/ml-100k.zip
# !unzip -q ml-100k.zip

In [1]:
import uuid
import redis
import optuna
import random

import polars as pl
import pandas as pd
import numpy as np
from tqdm import tqdm
from gensim.models import Word2Vec

from typing import List, Any

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
ratings = pd.read_csv(
    'ml-100k/u.data', sep='\t',
    names=['user_id', 'item_id', 'rating', 'timestamp']
)
ratings = pl.from_pandas(ratings).filter(pl.col('rating') >= 4)
ratings

user_id,item_id,rating,timestamp
i64,i64,i64,i64
298,474,4,884182806
253,465,5,891628467
286,1014,5,879781125
200,222,5,876042340
122,387,5,879270459
291,1042,4,874834944
119,392,4,886176814
167,486,4,892738452
299,144,4,877881320
308,1,4,887736532


In [3]:
grouped_df = (
    ratings
    .groupby('user_id')
    .agg([
        pl.col('item_id').apply(lambda x: x[:-3]).alias('train_ids'),
        pl.col('rating').apply(lambda x: x[:-3]).alias('train_ratings'),
        pl.col('item_id').apply(lambda x: x[-3:]).alias('test_ids'),
        pl.col('rating').apply(lambda x: x[-3:]).alias('test_ratings'),
    ])
)
grouped_df

user_id,train_ids,train_ratings,test_ids,test_ratings
i64,list[i64],list[i64],list[i64],list[i64]
576,"[280, 825, … 204]","[5, 4, … 4]","[1, 7, 100]","[4, 5, 4]"
410,"[286, 313, … 873]","[4, 5, … 4]","[690, 905, 272]","[4, 4, 4]"
78,"[255, 412, … 237]","[4, 4, … 5]","[880, 93, 411]","[5, 4, 4]"
942,"[117, 200, … 197]","[4, 4, … 5]","[945, 498, 259]","[5, 5, 4]"
148,"[408, 1, … 189]","[5, 4, … 4]","[173, 164, 194]","[5, 4, 5]"
36,"[339, 748, … 288]","[5, 4, … 4]","[1026, 878, 261]","[5, 5, 5]"
244,"[815, 265, … 56]","[4, 4, … 5]","[88, 735, 67]","[4, 5, 4]"
854,"[979, 475, … 705]","[4, 4, … 4]","[171, 508, 528]","[4, 4, 4]"
138,"[26, 484, … 474]","[5, 4, … 5]","[147, 742, 497]","[4, 4, 5]"
222,"[366, 750, … 685]","[4, 5, … 4]","[230, 223, 64]","[4, 4, 5]"


In [7]:
grouped_df.shape

(942, 5)

In [4]:
TOP_K = 10


def user_intersection(y_rel: List[Any], y_rec: List[Any], k: int = 10) -> int:
    """
    :param y_rel: relevant items
    :param y_rec: recommended items
    :param k: number of top recommended items
    :return: number of items in intersection of y_rel and y_rec (truncated to top-K)
    """
    return len(set(y_rec[:k]).intersection(set(y_rel)))


def user_hitrate(y_rel: List[Any], y_rec: List[Any], k: int = 10) -> int:
    """
    :param y_rel: relevant items
    :param y_rec: recommended items
    :param k: number of top recommended items
    :return: 1 if top-k recommendations contains at lease one relevant item
    """
    return int(user_intersection(y_rel, y_rec, k) > 0)


def user_precision(y_rel: List[Any], y_rec: List[Any], k: int = 10) -> float:
    """
    :param y_rel: relevant items
    :param y_rec: recommended items
    :param k: number of top recommended items
    :return: percentage of relevant items through recommendations
    """
    return user_intersection(y_rel, y_rec, k) / k


def user_ap(y_rel: List[Any], y_rec: List[Any], k: int = 10) -> float:
    """
    :param y_rel: relevant items
    :param y_rec: recommended items
    :param k: number of top recommended items
    :return: average precision metric for user recommendations
    """
    return np.sum([
        user_precision(y_rel, y_rec, idx + 1)
        for idx, item in enumerate(y_rec[:k]) if item in y_rel
    ]) / k

## Применим алгоритм w2v

Внутри gensim написан эффективный метод `predict_output_word`, позволяющий получать следующие возможные токены (в нашем случае объекты для рекомендации), перед фильтрацией просмотренного попросим его сгенерировать на `len(train_ids)` токенов больше (в худшем случае, нам придется все отфильтровать)

In [5]:
def evaluate_model(model):
    ap_list = []
    hitrate_list = []
    for train_ids, y_rel in grouped_df.select('train_ids', 'test_ids').rows():
        model_preds = model.predict_output_word(
            train_ids, topn=(TOP_K + len(train_ids))
        )
        if model_preds is None:
            hitrate_list.append(0)
            continue

        y_rec = [pred[0] for pred in model_preds if pred[0] not in train_ids]
        ap_list.append(user_ap(y_rel, y_rec))
        hitrate_list.append(user_hitrate(y_rel, y_rec))
    return np.mean(ap_list), np.mean(hitrate_list)

# обучим w2v с параметрами по умолчанию
model = Word2Vec(grouped_df['train_ids'].to_list())
mean_ap, mean_hitrate = evaluate_model(model)
print(f'MAP@{TOP_K} = {mean_ap:.4f} Hitrate@{TOP_K} = {mean_hitrate:.4f}')

MAP@10 = 0.0025 Hitrate@10 = 0.0955


## Подберем самые оптимальные гиперпараметры с помощью optuna

Дла алгоритма [W2V](https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec) рассмотрим следующие параметры:

- `sg` – 1, если использовать skip-gram, иначе cbow
- `window` – размер окна для обучения алгоритма w2v
- `ns_exponent` – степень популярности объектов, которая будет использована для negative sampling
- `negative` – количество негативных примеров для сэмплирования
- `min_count` – минимальное число взаимодействий, нужное для 
- `vector_size` – размерность эмбеддингов

In [6]:
SEED = 42

def set_seed(seed):
    np.random.seed(seed)
    random.seed(seed)

def objective(trial):
    sg = trial.suggest_categorical('sg', [0, 1])
    window = trial.suggest_int('window', 1, 10)
    ns_exponent = trial.suggest_float('ns_exponent', -3, 3)
    negative = trial.suggest_int('negative', 3, 20)
    min_count = trial.suggest_int('min_count', 0, 20)
    vector_size = trial.suggest_categorical('vector_size', [16, 32, 64, 128])
    
    print({
        'sg': sg,
        'window_len': window,
        'ns_exponent': ns_exponent,
        'negative': negative,
        'min_count': min_count,
        'vector_size': vector_size,
    })
    
    set_seed(SEED)
    model = Word2Vec(
        grouped_df['train_ids'].to_list(),
        window=window,
        sg=sg,
        hs=0,
        min_count=min_count,
        vector_size=vector_size,
        negative=negative,
        ns_exponent=ns_exponent,
        seed=SEED,
        epochs=10,
    )
    
    mean_ap, mean_hitrate = evaluate_model(model)
    print(f'MAP@{TOP_K} = {mean_ap:.4f} Hitrate@{TOP_K} = {mean_hitrate:.4f}')
    return mean_ap
    
    
study = optuna.create_study(directions=('maximize',))
study.optimize(objective, n_trials=100)

study.best_params

[I 2024-08-12 05:21:36,433] A new study created in memory with name: no-name-68632159-90ba-4734-b620-0e8d6084a835


{'sg': 0, 'window_len': 4, 'ns_exponent': 1.9552955375092225, 'negative': 6, 'min_count': 19, 'vector_size': 64}


[I 2024-08-12 05:21:37,534] Trial 0 finished with value: 0.005803444782168187 and parameters: {'sg': 0, 'window': 4, 'ns_exponent': 1.9552955375092225, 'negative': 6, 'min_count': 19, 'vector_size': 64}. Best is trial 0 with value: 0.005803444782168187.


MAP@10 = 0.0058 Hitrate@10 = 0.1518
{'sg': 1, 'window_len': 5, 'ns_exponent': -0.1403872646315225, 'negative': 16, 'min_count': 3, 'vector_size': 64}


[I 2024-08-12 05:21:42,113] Trial 1 finished with value: 0.029436845660249913 and parameters: {'sg': 1, 'window': 5, 'ns_exponent': -0.1403872646315225, 'negative': 16, 'min_count': 3, 'vector_size': 64}. Best is trial 1 with value: 0.029436845660249913.


MAP@10 = 0.0294 Hitrate@10 = 0.4798
{'sg': 1, 'window_len': 9, 'ns_exponent': -2.0230699017216294, 'negative': 5, 'min_count': 5, 'vector_size': 64}


[W 2024-08-12 05:21:42,281] Trial 2 failed with parameters: {'sg': 1, 'window': 9, 'ns_exponent': -2.0230699017216294, 'negative': 5, 'min_count': 5, 'vector_size': 64} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/home/karpov/.local/lib/python3.8/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_49/2471874407.py", line 25, in objective
    model = Word2Vec(
  File "/home/karpov/.local/lib/python3.8/site-packages/gensim/models/word2vec.py", line 430, in __init__
    self.train(
  File "/home/karpov/.local/lib/python3.8/site-packages/gensim/models/word2vec.py", line 1073, in train
    trained_word_count_epoch, raw_word_count_epoch, job_tally_epoch = self._train_epoch(
  File "/home/karpov/.local/lib/python3.8/site-packages/gensim/models/word2vec.py", line 1434, in _train_epoch
    trained_word_count, raw_word_count, job_tally = self._log_epoch_progress(
  File "/ho

KeyboardInterrupt: 

In [7]:
set_seed(SEED)
model = Word2Vec(
    grouped_df['train_ids'].to_list(),
    **study.best_params,
    hs=0,
    seed=SEED,
    epochs=50
)

hitrate_list = []
for train_ids, y_rel in grouped_df.select('train_ids', 'test_ids').rows():
    model_preds = model.predict_output_word(train_ids, topn=(TOP_K + len(train_ids)))
    if model_preds is None:
        hitrate_list.append(0)
        continue
        
    y_rec = [pred[0] for pred in model_preds if pred[0] not in train_ids]
    hitrate_list.append(user_hitrate(y_rel, y_rec))

mean_ap, mean_hitrate = evaluate_model(model)
print(f'MAP@{TOP_K} = {mean_ap:.4f} Hitrate@{TOP_K} = {mean_hitrate:.4f}')

MAP@10 = 0.0365 Hitrate@10 = 0.5722


## Сохраним рекомендации в redis

В реальных системах в качестве идентификаторов скорее всего будет использоваться uuid, тогда как при обучении удобно использовать представление в виде целых чисел.

Одним из вариантов работы с идентификаторами является создание словарей `user_ids_mapping` и `user_ids_inverse_mapping`, где первый делает преобразование _uuid -> int_, а второй _int -> uuid_.

Давайте просимулируем реальную рекомендательную систему и загрузим рекомендации в **redis**.

In [13]:
user_ids_inverse_mapping = {k: uuid.uuid4() for k in ratings['user_id'].unique()}
item_ids_inverse_mapping = {k: uuid.uuid4() for k in ratings['item_id'].unique()}

In [9]:
# если redis запущен в том же окружении, то можно использовать localhost
# иначе, измените host на ip-адресс сервера с запущенным redis
r = redis.Redis(host='37.27.29.69', db=0)
used_memory_before = r.info('memory')['used_memory']

In [15]:
TOP_K = 100  # сохраним топ-100 рекомендаций

for user_id, train_ids in grouped_df.select('user_id', 'train_ids').rows():
    model_preds = model.predict_output_word(train_ids, topn=(TOP_K + len(train_ids)))
    if model_preds is None:
        continue
        
    y_rec = [pred[0] for pred in model_preds if pred[0] not in train_ids]
    r.rpush(
        str(user_ids_inverse_mapping[user_id]),
        *[str(item_ids_inverse_mapping[item_id]) for item_id in y_rec]
    )

In [19]:
# прочитаем сохраненные рекомендации по ключу user_id
r.lrange(str(user_ids_inverse_mapping[1]), 0, TOP_K)

[b'd66f57b2-c87b-45bb-90a0-c2d1b5040f26',
 b'c3a6c010-6b66-4566-b7c5-6ebe7ab7c8b7',
 b'cc6fea8e-776e-440c-bbc7-626e034e526b',
 b'a711a5ab-efba-4741-8520-bf47f4cb590a',
 b'907e8acb-c6a3-49d4-b9d4-a55436f75c2b',
 b'77125e09-c107-4f97-bae5-ac50eafba737',
 b'6d15b989-b23e-4ab3-9928-46ef67d1877a',
 b'c66d8d8d-c3df-4738-9784-41de40187ebe',
 b'bd65a674-c4f5-488b-b51a-93ba760d79e4',
 b'69fd47ab-2bbb-47c4-b982-9105a76d7158',
 b'422921a1-cc82-4dae-a25b-8b3d151a6a94',
 b'4b90cfe1-e489-43bb-a8b2-3ef4f9cc34e7',
 b'691e7c31-fef3-42f3-bc4e-902ab95a9641',
 b'718120bb-3a58-4bdb-b564-229db0334f44',
 b'3f9a0dd5-eec3-4ad0-b91d-43c3873bf3bc',
 b'c3761ddd-0f0d-426c-ab77-e5044169703f',
 b'feed0f1c-a9e9-4ace-8556-64c52a77f5b6',
 b'a3271137-2a83-43da-875b-f36f5e1c40eb',
 b'3905cd08-7431-4b45-af8f-5cf18928ce27',
 b'682fa1f6-2895-4f80-94eb-2c1a625bc630',
 b'8d625430-0abf-498d-a6c7-1e3b028cbf10',
 b'1555968f-15cc-4c05-8ab8-bea3bc065cb5',
 b'a706f553-00fe-4c1a-9306-59acc7db4863',
 b'f84289ed-cea5-47e6-b5ba-5a59b30

In [20]:
r.info('memory')['used_memory'] - used_memory_before

4770528

Потратили около 5Мб на сохранение рекомендаций

In [22]:
r = redis.Redis(host='37.27.29.69', db=1)
used_memory_before = r.info('memory')['used_memory']

In [25]:
TOP_K = 100  # сохраним топ-100 рекомендаций

for user_id, train_ids in grouped_df.select('user_id', 'train_ids').rows():
    model_preds = model.predict_output_word(train_ids, topn=(TOP_K + len(train_ids)))
    if model_preds is None:
        continue
        
    y_rec = [pred[0] for pred in model_preds if pred[0] not in train_ids]
    r.rpush(
        user_ids_inverse_mapping[user_id].bytes,
        *[item_ids_inverse_mapping[item_id].bytes for item_id in y_rec]
    )

In [26]:
# прочитаем сохраненные рекомендации по ключу user_id
# для этого сначала преобразуем user_uuid в байты, а затем байты в item_uuid
[
    uuid.UUID(bytes=item_id_bytes, version=4)
    for item_id_bytes in r.lrange(user_ids_inverse_mapping[1].bytes, 0, TOP_K)
]

[UUID('d66f57b2-c87b-45bb-90a0-c2d1b5040f26'),
 UUID('c3a6c010-6b66-4566-b7c5-6ebe7ab7c8b7'),
 UUID('cc6fea8e-776e-440c-bbc7-626e034e526b'),
 UUID('a711a5ab-efba-4741-8520-bf47f4cb590a'),
 UUID('907e8acb-c6a3-49d4-b9d4-a55436f75c2b'),
 UUID('77125e09-c107-4f97-bae5-ac50eafba737'),
 UUID('6d15b989-b23e-4ab3-9928-46ef67d1877a'),
 UUID('c66d8d8d-c3df-4738-9784-41de40187ebe'),
 UUID('bd65a674-c4f5-488b-b51a-93ba760d79e4'),
 UUID('69fd47ab-2bbb-47c4-b982-9105a76d7158'),
 UUID('422921a1-cc82-4dae-a25b-8b3d151a6a94'),
 UUID('4b90cfe1-e489-43bb-a8b2-3ef4f9cc34e7'),
 UUID('691e7c31-fef3-42f3-bc4e-902ab95a9641'),
 UUID('718120bb-3a58-4bdb-b564-229db0334f44'),
 UUID('3f9a0dd5-eec3-4ad0-b91d-43c3873bf3bc'),
 UUID('c3761ddd-0f0d-426c-ab77-e5044169703f'),
 UUID('feed0f1c-a9e9-4ace-8556-64c52a77f5b6'),
 UUID('a3271137-2a83-43da-875b-f36f5e1c40eb'),
 UUID('3905cd08-7431-4b45-af8f-5cf18928ce27'),
 UUID('682fa1f6-2895-4f80-94eb-2c1a625bc630'),
 UUID('8d625430-0abf-498d-a6c7-1e3b028cbf10'),
 UUID('155596

In [27]:
r.info('memory')['used_memory'] - used_memory_before

2271040

Потратили в 2 раза меньше памяти для сохранения рекомендаций 🎉🎉