In [None]:
pip install optuna

In [None]:
pip install implicit

In [None]:
pip install gensim

In [None]:
pip install catboost

In [None]:
pip install faiss

In [17]:
# раскоментируйте код ниже, чтобы скачать данные
!wget -q https://files.grouplens.org/datasets/movielens/ml-1m.zip
!unzip -q ml-1m.zip

In [18]:
import optuna
import random

import polars as pl
import pandas as pd
import numpy as np

from typing import List, Any
import scipy.sparse as sp
from tqdm import tqdm

import implicit
import faiss
from gensim.models import Word2Vec

from catboost import Pool, CatBoost, CatBoostClassifier, CatBoostRanker
from sklearn.model_selection import train_test_split

In [19]:
ratings = pd.read_csv(
    'ml-1m/ratings.dat', delimiter='::', header=None, 
    names=['user_id', 'item_id', 'rating', 'timestamp'], 
    engine='python'
)
ratings = pl.from_pandas(ratings).filter(pl.col('rating') >= 4)
ratings

user_id,item_id,rating,timestamp
i64,i64,i64,i64
1,1193,5,978300760
1,3408,4,978300275
1,2355,5,978824291
1,1287,5,978302039
1,2804,5,978300719
…,…,…,…
6040,1089,4,956704996
6040,1094,5,956704887
6040,562,5,956704746
6040,1096,4,956715648


In [22]:
grouped_df = (
    ratings
    .group_by('user_id')
    .agg([
        pl.col('item_id').map_elements(lambda x: x[:-3]).alias('train_item_ids'),
        pl.col('rating').map_elements(lambda x: x[:-3]).alias('train_ratings'),
        pl.col('item_id').map_elements(lambda x: x[-3:]).alias('test_item_ids'),
        pl.col('rating').map_elements(lambda x: x[-3:]).alias('test_ratings'),
    ])
)
grouped_df



user_id,train_item_ids,train_ratings,test_item_ids,test_ratings
i64,list[i64],list[i64],list[i64],list[i64]
4073,"[1250, 1272, … 1222]","[4, 4, … 4]","[2028, 1090, 1242]","[5, 4, 4]"
2236,"[3793, 720, … 3916]","[4, 4, … 4]","[1241, 3784, 3785]","[4, 4, 4]"
2370,"[1250, 589, … 1234]","[4, 4, … 5]","[1090, 1094, 1247]","[5, 4, 5]"
268,"[2987, 647, … 2115]","[4, 4, … 4]","[1242, 3784, 1246]","[4, 4, 4]"
6,"[2406, 1101, … 1674]","[5, 4, … 4]","[3565, 1028, 34]","[4, 4, 4]"
…,…,…,…,…
4710,"[3044, 969, … 2762]","[4, 5, … 4]","[529, 2022, 1086]","[5, 5, 4]"
5627,"[588, 589, … 1090]","[4, 4, … 4]","[1093, 1095, 562]","[4, 4, 4]"
4454,"[589, 1415, … 3751]","[5, 5, … 4]","[555, 1089, 1095]","[4, 5, 5]"
1170,"[1248, 1252, … 1237]","[4, 5, … 5]","[1094, 1244, 1247]","[4, 5, 5]"


In [24]:
median_seq_len = int(grouped_df['train_item_ids'].map_elements(len).median())
print(f"средняя длина сессии {median_seq_len}")

средняя длина сессии 55


  median_seq_len = int(grouped_df['train_item_ids'].map_elements(len).median())


In [25]:
# соберем строчки для разреженной матрицы
rows = []
cols = []
values = []

for user_id, train_item_ids, train_ratings in grouped_df.select('user_id', 'train_item_ids', 'train_ratings').rows():
    rows.extend([user_id] * len(train_item_ids))
    cols.extend(train_item_ids)
    values.extend(train_ratings)

user_item_data = sp.csr_matrix((values, (rows, cols)), dtype=np.float32)
user_item_data

<6041x3953 sparse matrix of type '<class 'numpy.float32'>'
	with 557171 stored elements in Compressed Sparse Row format>

In [26]:
TOP_K = 20


def user_hitrate(y_relevant: List[str], y_preds: List[str], k: int = TOP_K) -> int:
    """
    :param y_rel: relevant items
    :param y_rec: recommended items
    :param k: number of top recommended items
    :return: 1 if top-k recommendations contains at lease one relevant item
    """
    return int(len(set(y_relevant).intersection(y_preds[:k])) > 0)


def user_ndcg(y_rel: List[Any], y_rec: List[Any], k: int = TOP_K) -> float:
    """
    :param y_rel: relevant items
    :param y_rec: recommended items
    :param k: number of top recommended items
    :return: ndcg metric for user recommendations
    """
    dcg = sum([1. / np.log2(idx + 2) for idx, item in enumerate(y_rec[:k]) if item in y_rel])
    idcg = sum([1. / np.log2(idx + 2) for idx, _ in enumerate(zip(y_rel, np.arange(k)))])
    return dcg / idcg

In [27]:
RANDOM_STATE = 42

def set_seed():
    random.seed(RANDOM_STATE)
    np.random.seed(RANDOM_STATE)
    
    
def get_recommendations(user_embs: np.array, item_embs: np.array, k: int = TOP_K):
    # строим индекс объектов
    index = faiss.IndexFlatIP(item_embs.shape[1])
    index.add(item_embs)

    # строим рекомендации с помощью dot-product расстояния
    # с запасом, чтобы после фильтрации просмотренных осталось хотя бы TOP_K
    return index.search(user_embs, k)

## ALS

В качестве первой модели возьмем ALS факторизацию и подберем оптимальные гиперпараметры с помощью библиотеки `optuna`

In [32]:
als_model = implicit.als.AlternatingLeastSquares(
        factors=8,
        iterations=5,
        random_state=1,
        alpha=0.1,
        regularization=1e-3
    )
als_model.fit(user_item_data)

  0%|          | 0/5 [00:00<?, ?it/s]

In [52]:
def objective(trial):
    factors = trial.suggest_int('factors', 8, 128)
    iterations = trial.suggest_int('iterations', 5, 30)
    alpha = trial.suggest_float('alpha', 0.1, 5.0)
    regularization = trial.suggest_float('regularization', 1e-3, 1.0)
        
    print({
        'factors': factors,
        'iterations': iterations,
        'alpha': alpha,
        'regularization': regularization,
    })
    
    set_seed()
    als_model = implicit.als.AlternatingLeastSquares(
        factors=factors,
        iterations=iterations,
        random_state=RANDOM_STATE,
        alpha=alpha,
        regularization=regularization
    )
    als_model.fit(user_item_data)
    
    _, recs = get_recommendations(
        als_model.user_factors.to_numpy(),
        als_model.item_factors.to_numpy(),
        TOP_K + median_seq_len
    )
    
    ndcg_list = []
    for user_id, user_history, y_rel in grouped_df.select('user_id', 'train_item_ids', 'test_item_ids').rows():
        y_rec = [item_id for item_id in recs[user_id] if item_id not in user_history]
        ndcg_list.append(user_ndcg(y_rel, y_rec))
    mean_ndcg = np.mean(ndcg_list)
    print(f'NDCG@{TOP_K} = {mean_ndcg}')
    return mean_ndcg
    
    
study = optuna.create_study(directions=('maximize',))
study.optimize(objective, n_trials=20)

study.best_params

[I 2024-09-01 22:52:45,764] A new study created in memory with name: no-name-1cdcc679-ed3b-4392-b88c-821d4a104d6c


{'factors': 116, 'iterations': 10, 'alpha': 0.7938338954232596, 'regularization': 0.5373180605268009}


  0%|          | 0/10 [00:00<?, ?it/s]

[I 2024-09-01 22:52:47,595] Trial 0 finished with value: 0.07276115022641552 and parameters: {'factors': 116, 'iterations': 10, 'alpha': 0.7938338954232596, 'regularization': 0.5373180605268009}. Best is trial 0 with value: 0.07276115022641552.


NDCG@20 = 0.07276115022641552
{'factors': 121, 'iterations': 12, 'alpha': 4.625421771769431, 'regularization': 0.5806367277385549}


  0%|          | 0/12 [00:00<?, ?it/s]

[I 2024-09-01 22:52:49,345] Trial 1 finished with value: 0.07103272938082096 and parameters: {'factors': 121, 'iterations': 12, 'alpha': 4.625421771769431, 'regularization': 0.5806367277385549}. Best is trial 0 with value: 0.07276115022641552.


NDCG@20 = 0.07103272938082096
{'factors': 71, 'iterations': 20, 'alpha': 2.3956767036658255, 'regularization': 0.40983640823220635}


  0%|          | 0/20 [00:00<?, ?it/s]

[I 2024-09-01 22:52:51,078] Trial 2 finished with value: 0.07478491658006602 and parameters: {'factors': 71, 'iterations': 20, 'alpha': 2.3956767036658255, 'regularization': 0.40983640823220635}. Best is trial 2 with value: 0.07478491658006602.


NDCG@20 = 0.07478491658006602
{'factors': 30, 'iterations': 11, 'alpha': 2.930701707185796, 'regularization': 0.9375817890631041}


  0%|          | 0/11 [00:00<?, ?it/s]

[I 2024-09-01 22:52:52,595] Trial 3 finished with value: 0.06993801993136935 and parameters: {'factors': 30, 'iterations': 11, 'alpha': 2.930701707185796, 'regularization': 0.9375817890631041}. Best is trial 2 with value: 0.07478491658006602.


NDCG@20 = 0.06993801993136935
{'factors': 54, 'iterations': 6, 'alpha': 4.172322114503123, 'regularization': 0.7461032789778412}


  0%|          | 0/6 [00:00<?, ?it/s]

[I 2024-09-01 22:52:54,182] Trial 4 finished with value: 0.07316168427564372 and parameters: {'factors': 54, 'iterations': 6, 'alpha': 4.172322114503123, 'regularization': 0.7461032789778412}. Best is trial 2 with value: 0.07478491658006602.


NDCG@20 = 0.07316168427564372
{'factors': 109, 'iterations': 7, 'alpha': 1.2771124943347483, 'regularization': 0.34161562263702966}


  0%|          | 0/7 [00:00<?, ?it/s]

[I 2024-09-01 22:52:55,895] Trial 5 finished with value: 0.07300273652434505 and parameters: {'factors': 109, 'iterations': 7, 'alpha': 1.2771124943347483, 'regularization': 0.34161562263702966}. Best is trial 2 with value: 0.07478491658006602.


NDCG@20 = 0.07300273652434505
{'factors': 22, 'iterations': 30, 'alpha': 2.9105112861437665, 'regularization': 0.263418426872155}


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2024-09-01 22:52:57,635] Trial 6 finished with value: 0.06951598208718568 and parameters: {'factors': 22, 'iterations': 30, 'alpha': 2.9105112861437665, 'regularization': 0.263418426872155}. Best is trial 2 with value: 0.07478491658006602.


NDCG@20 = 0.06951598208718568
{'factors': 61, 'iterations': 12, 'alpha': 4.791224451496947, 'regularization': 0.5665328769656973}


  0%|          | 0/12 [00:00<?, ?it/s]

[I 2024-09-01 22:52:59,192] Trial 7 finished with value: 0.07432069178909803 and parameters: {'factors': 61, 'iterations': 12, 'alpha': 4.791224451496947, 'regularization': 0.5665328769656973}. Best is trial 2 with value: 0.07478491658006602.


NDCG@20 = 0.07432069178909803
{'factors': 36, 'iterations': 8, 'alpha': 0.13637184147932718, 'regularization': 0.9057522871221995}


  0%|          | 0/8 [00:00<?, ?it/s]

[I 2024-09-01 22:53:00,890] Trial 8 finished with value: 0.0730733211580251 and parameters: {'factors': 36, 'iterations': 8, 'alpha': 0.13637184147932718, 'regularization': 0.9057522871221995}. Best is trial 2 with value: 0.07478491658006602.


NDCG@20 = 0.0730733211580251
{'factors': 57, 'iterations': 13, 'alpha': 4.5389947726131785, 'regularization': 0.7286356798871904}


  0%|          | 0/13 [00:00<?, ?it/s]

[I 2024-09-01 22:53:02,470] Trial 9 finished with value: 0.07286572277337486 and parameters: {'factors': 57, 'iterations': 13, 'alpha': 4.5389947726131785, 'regularization': 0.7286356798871904}. Best is trial 2 with value: 0.07478491658006602.


NDCG@20 = 0.07286572277337486
{'factors': 89, 'iterations': 23, 'alpha': 2.0352859434818193, 'regularization': 0.019141424962364595}


  0%|          | 0/23 [00:00<?, ?it/s]

[I 2024-09-01 22:53:04,231] Trial 10 finished with value: 0.07344935960453579 and parameters: {'factors': 89, 'iterations': 23, 'alpha': 2.0352859434818193, 'regularization': 0.019141424962364595}. Best is trial 2 with value: 0.07478491658006602.


NDCG@20 = 0.07344935960453579
{'factors': 80, 'iterations': 17, 'alpha': 3.778988654711237, 'regularization': 0.32376294854374055}


  0%|          | 0/17 [00:00<?, ?it/s]

[I 2024-09-01 22:53:06,090] Trial 11 finished with value: 0.07400799365406008 and parameters: {'factors': 80, 'iterations': 17, 'alpha': 3.778988654711237, 'regularization': 0.32376294854374055}. Best is trial 2 with value: 0.07478491658006602.


NDCG@20 = 0.07400799365406008
{'factors': 75, 'iterations': 20, 'alpha': 2.062292927041175, 'regularization': 0.4681709076099508}


  0%|          | 0/20 [00:00<?, ?it/s]

[I 2024-09-01 22:53:07,734] Trial 12 finished with value: 0.0741914650258135 and parameters: {'factors': 75, 'iterations': 20, 'alpha': 2.062292927041175, 'regularization': 0.4681709076099508}. Best is trial 2 with value: 0.07478491658006602.


NDCG@20 = 0.0741914650258135
{'factors': 52, 'iterations': 17, 'alpha': 3.4870129521658924, 'regularization': 0.15029962211074915}


  0%|          | 0/17 [00:00<?, ?it/s]

[I 2024-09-01 22:53:09,361] Trial 13 finished with value: 0.07365690574095905 and parameters: {'factors': 52, 'iterations': 17, 'alpha': 3.4870129521658924, 'regularization': 0.15029962211074915}. Best is trial 2 with value: 0.07478491658006602.


NDCG@20 = 0.07365690574095905
{'factors': 97, 'iterations': 25, 'alpha': 2.2708954899034923, 'regularization': 0.6743995351743721}


  0%|          | 0/25 [00:00<?, ?it/s]

[I 2024-09-01 22:53:11,134] Trial 14 finished with value: 0.07237783644766518 and parameters: {'factors': 97, 'iterations': 25, 'alpha': 2.2708954899034923, 'regularization': 0.6743995351743721}. Best is trial 2 with value: 0.07478491658006602.


NDCG@20 = 0.07237783644766518
{'factors': 66, 'iterations': 15, 'alpha': 1.4709050518016538, 'regularization': 0.42808961018734026}


  0%|          | 0/15 [00:00<?, ?it/s]

[I 2024-09-01 22:53:12,724] Trial 15 finished with value: 0.07394721867486774 and parameters: {'factors': 66, 'iterations': 15, 'alpha': 1.4709050518016538, 'regularization': 0.42808961018734026}. Best is trial 2 with value: 0.07478491658006602.


NDCG@20 = 0.07394721867486774
{'factors': 95, 'iterations': 21, 'alpha': 4.924378980388504, 'regularization': 0.6189914236865027}


  0%|          | 0/21 [00:00<?, ?it/s]

[I 2024-09-01 22:53:14,484] Trial 16 finished with value: 0.07302147376695213 and parameters: {'factors': 95, 'iterations': 21, 'alpha': 4.924378980388504, 'regularization': 0.6189914236865027}. Best is trial 2 with value: 0.07478491658006602.


NDCG@20 = 0.07302147376695213
{'factors': 38, 'iterations': 27, 'alpha': 3.4218144497594514, 'regularization': 0.8312827256074535}


  0%|          | 0/27 [00:00<?, ?it/s]

[I 2024-09-01 22:53:16,285] Trial 17 finished with value: 0.07116182374604124 and parameters: {'factors': 38, 'iterations': 27, 'alpha': 3.4218144497594514, 'regularization': 0.8312827256074535}. Best is trial 2 with value: 0.07478491658006602.


NDCG@20 = 0.07116182374604124
{'factors': 67, 'iterations': 20, 'alpha': 2.7193844756118124, 'regularization': 0.14308771653929853}


  0%|          | 0/20 [00:00<?, ?it/s]

[I 2024-09-01 22:53:17,948] Trial 18 finished with value: 0.07373530820780608 and parameters: {'factors': 67, 'iterations': 20, 'alpha': 2.7193844756118124, 'regularization': 0.14308771653929853}. Best is trial 2 with value: 0.07478491658006602.


NDCG@20 = 0.07373530820780608
{'factors': 10, 'iterations': 15, 'alpha': 3.768542142292657, 'regularization': 0.3878788464927199}


  0%|          | 0/15 [00:00<?, ?it/s]

[I 2024-09-01 22:53:19,758] Trial 19 finished with value: 0.059346099102068 and parameters: {'factors': 10, 'iterations': 15, 'alpha': 3.768542142292657, 'regularization': 0.3878788464927199}. Best is trial 2 with value: 0.07478491658006602.


NDCG@20 = 0.059346099102068


{'factors': 71,
 'iterations': 20,
 'alpha': 2.3956767036658255,
 'regularization': 0.40983640823220635}

In [53]:
set_seed()
als_model = implicit.als.AlternatingLeastSquares(
    factors=70,
    iterations=50,
    random_state=RANDOM_STATE,
    regularization=0.5,
    alpha=0.7,
)
als_model.fit(user_item_data)

_, als_recs = get_recommendations(
    als_model.user_factors.to_numpy(),
    als_model.item_factors.to_numpy(),
    TOP_K + median_seq_len
)

ndcg_list = []
hitrate_list = []
for user_id, user_history, y_rel in grouped_df.select('user_id', 'train_item_ids', 'test_item_ids').rows():
    y_rec = [item_id for item_id in als_recs[user_id] if item_id not in user_history]
    ndcg_list.append(user_ndcg(y_rel, y_rec))
    hitrate_list.append(user_hitrate(y_rel, y_rec))
print(f'NDCG@{TOP_K} = {np.mean(ndcg_list):.4f}, Hitrate@{TOP_K} = {np.mean(hitrate_list):.4f}')

  0%|          | 0/50 [00:00<?, ?it/s]

NDCG@20 = 0.0730, Hitrate@20 = 0.3185


## Word2Vec

В качестве второго подхода попробуем использовать сессионные рекомендации и модель w2v, для нее так же подберем оптимальные гиперпараметры с помощью библиотеки `optuna`

In [54]:
def evaluate_model(model):
    ndcg_list = []
    hitrate_list = []
    for train_ids, y_rel in grouped_df.select('train_item_ids', 'test_item_ids').rows():
        model_preds = model.predict_output_word(train_ids[-model.window:], topn=(TOP_K + len(train_ids)))
        if model_preds is None:
            ndcg_list.append(0)
            hitrate_list.append(0)
            continue

        y_rec = [pred[0] for pred in model_preds if pred[0] not in train_ids]
        ndcg_list.append(user_ndcg(y_rel, y_rec))
        hitrate_list.append(user_hitrate(y_rel, y_rec))
    return np.mean(ndcg_list), np.mean(hitrate_list)


def objective(trial):
    sg = trial.suggest_categorical('sg', [0, 1])
    window = trial.suggest_int('window', 1, 10)
    ns_exponent = trial.suggest_float('ns_exponent', -3, 3)
    negative = trial.suggest_int('negative', 3, 20)
    min_count = trial.suggest_int('min_count', 0, 20)
    vector_size = trial.suggest_categorical('vector_size', [16, 32, 64, 128])
    
    print({
        'sg': sg,
        'window_len': window,
        'ns_exponent': ns_exponent,
        'negative': negative,
        'min_count': min_count,
        'vector_size': vector_size,
    })
    
    set_seed()
    model = Word2Vec(
        grouped_df['train_item_ids'].to_list(),
        window=window,
        sg=sg,
        hs=0,
        min_count=min_count,
        vector_size=vector_size,
        negative=negative,
        ns_exponent=ns_exponent,
        seed=RANDOM_STATE,
        epochs=10,
    )
    
    mean_ndcg, mean_hitrate = evaluate_model(model)
    print(f'NDCG@{TOP_K} = {mean_ndcg:.4f}, Hitrate@{TOP_K} = {mean_hitrate:.4f}')
    return mean_ndcg
    
    
study = optuna.create_study(directions=('maximize',))
study.optimize(objective, n_trials=20)

study.best_params

[I 2024-09-01 22:53:30,756] A new study created in memory with name: no-name-539b77e9-6cad-4de3-85e9-3c3a910ec92d


{'sg': 1, 'window_len': 9, 'ns_exponent': 0.02499797691326755, 'negative': 16, 'min_count': 11, 'vector_size': 64}


[I 2024-09-01 22:54:57,113] Trial 0 finished with value: 0.03406977374488073 and parameters: {'sg': 1, 'window': 9, 'ns_exponent': 0.02499797691326755, 'negative': 16, 'min_count': 11, 'vector_size': 64}. Best is trial 0 with value: 0.03406977374488073.


NDCG@20 = 0.0341, Hitrate@20 = 0.2047
{'sg': 0, 'window_len': 5, 'ns_exponent': -1.9580627741581444, 'negative': 11, 'min_count': 13, 'vector_size': 16}


[I 2024-09-01 22:55:05,860] Trial 1 finished with value: 0.10291239680353295 and parameters: {'sg': 0, 'window': 5, 'ns_exponent': -1.9580627741581444, 'negative': 11, 'min_count': 13, 'vector_size': 16}. Best is trial 1 with value: 0.10291239680353295.


NDCG@20 = 0.1029, Hitrate@20 = 0.3756
{'sg': 1, 'window_len': 4, 'ns_exponent': -2.1558299873916362, 'negative': 4, 'min_count': 13, 'vector_size': 64}


[I 2024-09-01 22:55:21,746] Trial 2 finished with value: 0.07788980135356381 and parameters: {'sg': 1, 'window': 4, 'ns_exponent': -2.1558299873916362, 'negative': 4, 'min_count': 13, 'vector_size': 64}. Best is trial 1 with value: 0.10291239680353295.


NDCG@20 = 0.0779, Hitrate@20 = 0.2931
{'sg': 0, 'window_len': 1, 'ns_exponent': 2.0263659327684227, 'negative': 9, 'min_count': 10, 'vector_size': 16}


[I 2024-09-01 22:55:28,251] Trial 3 finished with value: 0.029284339025503564 and parameters: {'sg': 0, 'window': 1, 'ns_exponent': 2.0263659327684227, 'negative': 9, 'min_count': 10, 'vector_size': 16}. Best is trial 1 with value: 0.10291239680353295.


NDCG@20 = 0.0293, Hitrate@20 = 0.1714
{'sg': 0, 'window_len': 3, 'ns_exponent': -1.531425500975288, 'negative': 16, 'min_count': 6, 'vector_size': 32}


[I 2024-09-01 22:55:39,330] Trial 4 finished with value: 0.09409749190071076 and parameters: {'sg': 0, 'window': 3, 'ns_exponent': -1.531425500975288, 'negative': 16, 'min_count': 6, 'vector_size': 32}. Best is trial 1 with value: 0.10291239680353295.


NDCG@20 = 0.0941, Hitrate@20 = 0.3844
{'sg': 1, 'window_len': 1, 'ns_exponent': 1.9753023088541335, 'negative': 7, 'min_count': 7, 'vector_size': 32}


[I 2024-09-01 22:55:48,366] Trial 5 finished with value: 0.08014084549707323 and parameters: {'sg': 1, 'window': 1, 'ns_exponent': 1.9753023088541335, 'negative': 7, 'min_count': 7, 'vector_size': 32}. Best is trial 1 with value: 0.10291239680353295.


NDCG@20 = 0.0801, Hitrate@20 = 0.3735
{'sg': 1, 'window_len': 4, 'ns_exponent': -0.2091976087459808, 'negative': 18, 'min_count': 15, 'vector_size': 64}


[I 2024-09-01 22:56:28,727] Trial 6 finished with value: 0.08269408494465277 and parameters: {'sg': 1, 'window': 4, 'ns_exponent': -0.2091976087459808, 'negative': 18, 'min_count': 15, 'vector_size': 64}. Best is trial 1 with value: 0.10291239680353295.


NDCG@20 = 0.0827, Hitrate@20 = 0.3660
{'sg': 0, 'window_len': 4, 'ns_exponent': -1.178691191036925, 'negative': 20, 'min_count': 0, 'vector_size': 32}


[I 2024-09-01 22:56:41,056] Trial 7 finished with value: 0.08420317126810369 and parameters: {'sg': 0, 'window': 4, 'ns_exponent': -1.178691191036925, 'negative': 20, 'min_count': 0, 'vector_size': 32}. Best is trial 1 with value: 0.10291239680353295.


NDCG@20 = 0.0842, Hitrate@20 = 0.3637
{'sg': 0, 'window_len': 9, 'ns_exponent': 1.7806908617739676, 'negative': 5, 'min_count': 7, 'vector_size': 64}


[I 2024-09-01 22:56:52,611] Trial 8 finished with value: 0.036355825433832895 and parameters: {'sg': 0, 'window': 9, 'ns_exponent': 1.7806908617739676, 'negative': 5, 'min_count': 7, 'vector_size': 64}. Best is trial 1 with value: 0.10291239680353295.


NDCG@20 = 0.0364, Hitrate@20 = 0.1709
{'sg': 0, 'window_len': 2, 'ns_exponent': -2.969259376747566, 'negative': 15, 'min_count': 14, 'vector_size': 16}


[I 2024-09-01 22:57:00,765] Trial 9 finished with value: 0.08420379106932491 and parameters: {'sg': 0, 'window': 2, 'ns_exponent': -2.969259376747566, 'negative': 15, 'min_count': 14, 'vector_size': 16}. Best is trial 1 with value: 0.10291239680353295.


NDCG@20 = 0.0842, Hitrate@20 = 0.3523
{'sg': 0, 'window_len': 7, 'ns_exponent': 0.3934115394531681, 'negative': 12, 'min_count': 20, 'vector_size': 128}


[I 2024-09-01 22:57:27,428] Trial 10 finished with value: 0.050881724472260755 and parameters: {'sg': 0, 'window': 7, 'ns_exponent': 0.3934115394531681, 'negative': 12, 'min_count': 20, 'vector_size': 128}. Best is trial 1 with value: 0.10291239680353295.


NDCG@20 = 0.0509, Hitrate@20 = 0.2527
{'sg': 0, 'window_len': 6, 'ns_exponent': -1.7362860582601238, 'negative': 13, 'min_count': 2, 'vector_size': 16}


[I 2024-09-01 22:57:35,441] Trial 11 finished with value: 0.06570881957051887 and parameters: {'sg': 0, 'window': 6, 'ns_exponent': -1.7362860582601238, 'negative': 13, 'min_count': 2, 'vector_size': 16}. Best is trial 1 with value: 0.10291239680353295.


NDCG@20 = 0.0657, Hitrate@20 = 0.2585
{'sg': 0, 'window_len': 3, 'ns_exponent': -0.9656330503503104, 'negative': 9, 'min_count': 6, 'vector_size': 32}


[I 2024-09-01 22:57:42,760] Trial 12 finished with value: 0.1112619233440495 and parameters: {'sg': 0, 'window': 3, 'ns_exponent': -0.9656330503503104, 'negative': 9, 'min_count': 6, 'vector_size': 32}. Best is trial 12 with value: 0.1112619233440495.


NDCG@20 = 0.1113, Hitrate@20 = 0.4243
{'sg': 0, 'window_len': 6, 'ns_exponent': -0.8364871077274892, 'negative': 9, 'min_count': 18, 'vector_size': 128}


[I 2024-09-01 22:58:10,913] Trial 13 finished with value: 0.07594103160633482 and parameters: {'sg': 0, 'window': 6, 'ns_exponent': -0.8364871077274892, 'negative': 9, 'min_count': 18, 'vector_size': 128}. Best is trial 12 with value: 0.1112619233440495.


NDCG@20 = 0.0759, Hitrate@20 = 0.3258
{'sg': 0, 'window_len': 7, 'ns_exponent': -2.673174450605165, 'negative': 10, 'min_count': 4, 'vector_size': 16}


[I 2024-09-01 22:58:19,295] Trial 14 finished with value: 0.0493590282200693 and parameters: {'sg': 0, 'window': 7, 'ns_exponent': -2.673174450605165, 'negative': 10, 'min_count': 4, 'vector_size': 16}. Best is trial 12 with value: 0.1112619233440495.


NDCG@20 = 0.0494, Hitrate@20 = 0.2021
{'sg': 0, 'window_len': 5, 'ns_exponent': 0.999245737590735, 'negative': 7, 'min_count': 9, 'vector_size': 32}


[I 2024-09-01 22:58:26,276] Trial 15 finished with value: 0.04421517257661466 and parameters: {'sg': 0, 'window': 5, 'ns_exponent': 0.999245737590735, 'negative': 7, 'min_count': 9, 'vector_size': 32}. Best is trial 12 with value: 0.1112619233440495.


NDCG@20 = 0.0442, Hitrate@20 = 0.2216
{'sg': 0, 'window_len': 3, 'ns_exponent': -0.7374307397084723, 'negative': 13, 'min_count': 16, 'vector_size': 32}


[I 2024-09-01 22:58:36,177] Trial 16 finished with value: 0.10547884536669312 and parameters: {'sg': 0, 'window': 3, 'ns_exponent': -0.7374307397084723, 'negative': 13, 'min_count': 16, 'vector_size': 32}. Best is trial 12 with value: 0.1112619233440495.


NDCG@20 = 0.1055, Hitrate@20 = 0.4195
{'sg': 0, 'window_len': 2, 'ns_exponent': -0.5263680208795772, 'negative': 14, 'min_count': 19, 'vector_size': 32}


[I 2024-09-01 22:58:46,639] Trial 17 finished with value: 0.1370429749989143 and parameters: {'sg': 0, 'window': 2, 'ns_exponent': -0.5263680208795772, 'negative': 14, 'min_count': 19, 'vector_size': 32}. Best is trial 17 with value: 0.1370429749989143.


NDCG@20 = 0.1370, Hitrate@20 = 0.4783
{'sg': 1, 'window_len': 2, 'ns_exponent': 1.0122831051570036, 'negative': 7, 'min_count': 4, 'vector_size': 32}


[I 2024-09-01 22:58:58,988] Trial 18 finished with value: 0.07485692379822896 and parameters: {'sg': 1, 'window': 2, 'ns_exponent': 1.0122831051570036, 'negative': 7, 'min_count': 4, 'vector_size': 32}. Best is trial 17 with value: 0.1370429749989143.


NDCG@20 = 0.0749, Hitrate@20 = 0.3175
{'sg': 0, 'window_len': 2, 'ns_exponent': 2.8218508958476636, 'negative': 14, 'min_count': 18, 'vector_size': 32}


[I 2024-09-01 22:59:07,058] Trial 19 finished with value: 0.08302210204162766 and parameters: {'sg': 0, 'window': 2, 'ns_exponent': 2.8218508958476636, 'negative': 14, 'min_count': 18, 'vector_size': 32}. Best is trial 17 with value: 0.1370429749989143.


NDCG@20 = 0.0830, Hitrate@20 = 0.3133


{'sg': 0,
 'window': 2,
 'ns_exponent': -0.5263680208795772,
 'negative': 14,
 'min_count': 19,
 'vector_size': 32}

In [55]:
set_seed()
w2v_model = Word2Vec(
    grouped_df['train_item_ids'].to_list(),
    hs=0,
    seed=RANDOM_STATE,
    epochs=30,
    sg=0,
    window=1,
    ns_exponent=0.2,
    negative=20,
    min_count=3,
    vector_size=16,
)

mean_ndcg, mean_hitrate = evaluate_model(w2v_model)
print(f'NDCG@{TOP_K} = {mean_ndcg:.4f}, Hitrate@{TOP_K} = {mean_hitrate:.4f}')

NDCG@20 = 0.1621, Hitrate@20 = 0.5152


## Ранжирующая модель

Для ранжирования нам хотелось бы учитывать больше признаков для пользователей и объектов, в датасете movielens-1M, к счастью, все это имеется из коробки

Для _пользователей_ есть следующие признаки:
- gender (пол пользователя, категориальный признак)
- age (возраст пользователя, численный признак)
- occupation (род дейтельности, категориальный признак)
- zip_code (почтовый индекс пользователя, категориальный признак)

In [56]:
user_features = pd.read_csv(
    'ml-1m/users.dat', delimiter='::', header=None, 
    names=['user_id', 'gender', 'age', 'occupation', 'zip_code'], 
    engine='python'
)
user_features = pl.from_pandas(user_features)
user_features

user_id,gender,age,occupation,zip_code
i64,str,i64,i64,str
1,"""F""",1,10,"""48067"""
2,"""M""",56,16,"""70072"""
3,"""M""",25,15,"""55117"""
4,"""M""",45,7,"""02460"""
5,"""M""",25,20,"""55455"""
…,…,…,…,…
6036,"""F""",25,15,"""32603"""
6037,"""F""",45,1,"""76006"""
6038,"""F""",56,1,"""14706"""
6039,"""F""",45,0,"""01060"""


Для _объектов_ есть следующие признаки:
- title (название тайтла, строка)
- genres (жанр тайтла, список категориальных признаков)

In [58]:
item_features = pd.read_csv(
    'ml-1m/movies.dat', delimiter='::', header=None, 
    names=['item_id', 'title', 'genres'],
    engine='python', encoding='latin-1'
)
item_features = pl.from_pandas(item_features)
item_features = item_features.with_columns(pl.col('genres').map_elements(lambda x: x.split('|')))
item_features

  item_features = item_features.with_columns(pl.col('genres').map_elements(lambda x: x.split('|')))


item_id,title,genres
i64,str,list[str]
1,"""Toy Story (1995)""","[""Animation"", ""Children's"", ""Comedy""]"
2,"""Jumanji (1995)""","[""Adventure"", ""Children's"", ""Fantasy""]"
3,"""Grumpier Old Men (1995)""","[""Comedy"", ""Romance""]"
4,"""Waiting to Exhale (1995)""","[""Comedy"", ""Drama""]"
5,"""Father of the Bride Part II (1…","[""Comedy""]"
…,…,…
3948,"""Meet the Parents (2000)""","[""Comedy""]"
3949,"""Requiem for a Dream (2000)""","[""Drama""]"
3950,"""Tigerland (2000)""","[""Drama""]"
3951,"""Two Family House (2000)""","[""Drama""]"


## Соберем датасет для ранжирования

В качестве **позитивных** объектов мы будем использовать те взаимодействия, где оценка >= 4

В качестве **негативных** объектов мы можем использовать те взаимодействия, где оценка < 4, однако для implicit реакции этот подход не всегда подходит и для простоты мы будем сэмплировать негативные примеры на примере сессионных рекомендаций

In [65]:
ns_exponent = 0.75  # степень популярности объекта
items_set = ratings['item_id'].unique()

# предподсчитаем вероятности в сэмплировании негативных примеров
item_probs = dict()
for item_id, count in ratings.group_by('item_id').count().rows():
    item_probs[item_id] = count / len(ratings)

item_probs = np.array([
    item_probs.get(item_id, 0.)
    for item_id in items_set
])**ns_exponent
item_probs /= np.sum(item_probs)  # normalize so that it's probabilities

  for item_id, count in ratings.group_by('item_id').count().rows():


In [67]:
set_seed()

n_negatives = 3  # кол-во негативных примеров на один позитивный

ranking_dataset = []
for user_id, pos_ids in ratings.group_by('user_id').agg(pl.col('item_id').alias('item_ids')).rows():
    pos_ids = set(pos_ids)
    # на каждый позитивный объект (len(pos_ids)) сэмплируем n_negatives негативных
    # и еще len(pos_ids) для фильтрации
    n_items_to_sample = min(
        len(item_probs),
        len(pos_ids) * (n_negatives + 1)
    )
    
    neg_ids = [
        item_id
        for item_id in np.random.choice(
            items_set,
            n_items_to_sample,
            replace=False,  # сэмплируем только уникальные объекты
            p=item_probs  # сэмплируем чаще популярные объекты для негативов
        )
        if item_id not in pos_ids
    ]
    
    for item_id in pos_ids:
        ranking_dataset.append([user_id, item_id, 1])
    for item_id in neg_ids:
        ranking_dataset.append([user_id, item_id, 0])
    
ranking_dataset = pl.DataFrame(ranking_dataset, schema=['user_id', 'item_id', 'target'])
ranking_dataset

  return dispatch(args[0].__class__)(*args, **kw)


user_id,item_id,target
i64,i64,i64
1980,1,1
1980,2059,1
1980,11,1
1980,2064,1
1980,2065,1
…,…,…
6026,935,0
6026,2005,0
6026,1035,0
6026,3039,0


In [68]:
# соединим датасет с признаками пользователей и объектов
ranking_dataset_with_features = (
    ranking_dataset
    .join(user_features, 'user_id')
    .join(item_features, 'item_id')
).drop(['title', 'genres'])  # пока что мы не будем использовать признаки для объектов, вам предлагается попробовать
# вам предлагается попробовать эффективно закодировать эти признаки и еще улучшить метрики

ranking_dataset_with_features

user_id,item_id,target,gender,age,occupation,zip_code
i64,i64,i64,str,i64,i64,str
1980,1,1,"""M""",35,7,"""06460"""
1980,2059,1,"""M""",35,7,"""06460"""
1980,11,1,"""M""",35,7,"""06460"""
1980,2064,1,"""M""",35,7,"""06460"""
1980,2065,1,"""M""",35,7,"""06460"""
…,…,…,…,…,…,…
6026,935,0,"""M""",35,6,"""11210"""
6026,2005,0,"""M""",35,6,"""11210"""
6026,1035,0,"""M""",35,6,"""11210"""
6026,3039,0,"""M""",35,6,"""11210"""


## Обучаем catboost

Большая часть признаков получилась категориальными, чтобы не думать над эффективным кодированием этих признаков мы воспользуемся библиотекой `catboost`

Класс Pool позволяет обернуть наши данные с возможностью указать группу для group-wise ранжирования

In [69]:
train_dataset, test_dataset = train_test_split(
    ranking_dataset_with_features, test_size=0.1, random_state=RANDOM_STATE
)
# отсортируем, чтобы использовать group_id в классе Pool
train_dataset = train_dataset.sort('user_id')
test_dataset = test_dataset.sort('user_id')

cat_features=['user_id', 'item_id', 'gender', 'zip_code', 'occupation']

train_pool = Pool(
    train_dataset.drop(['target']).to_pandas(),
    train_dataset['target'].to_list(),
    # используется для подсчета groupwise метрик, например NDCG
    group_id=train_dataset['user_id'].to_list(),
    cat_features=cat_features
)

test_pool = Pool(
    test_dataset.drop(['target']).to_pandas(),
    test_dataset['target'].to_list(),
    group_id=test_dataset['user_id'].to_list(),
    cat_features=cat_features
)

# обучаем модель классификации, так как она работает сильно быстрее, чем модель ранжирования
# в качестве метрик можно смотреть на NDCG, MAP и AUC
ranking_model = CatBoostClassifier(**{
    'iterations': 300,
    'verbose': False,
    'random_state': RANDOM_STATE,
    'use_best_model': True,
    'custom_metric': [f'NDCG:top={TOP_K}', f'MAP:top={TOP_K}', 'AUC']
})
ranking_model.fit(train_pool, plot=True, eval_set=test_pool)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x7ff8aa7b7370>

## Оценим результаты ранжирования

подготовим наши кандидаты в виде списков для каждого пользователя

In [70]:
# сформируем датафрейм с кандидатами из модели als
als_candidates = pl.DataFrame({
    'user_id': grouped_df['user_id'],
    'candidates': [
        [item_id for item_id in als_recs[user_id] if item_id not in user_history][:TOP_K]
        for user_id, user_history in grouped_df.select('user_id', 'train_item_ids').rows()
    ]
})
als_candidates

user_id,candidates
i64,list[i64]
4073,"[1263, 260, … 1408]"
2236,"[3578, 3555, … 3745]"
2370,"[527, 1208, … 2762]"
268,"[1569, 356, … 524]"
6,"[899, 919, … 3555]"
…,…
4710,"[1197, 2858, … 1210]"
5627,"[246, 919, … 1201]"
4454,"[1617, 293, … 480]"
1170,"[3504, 1997, … 356]"


In [71]:
def get_w2v_candidates(w2v_model, train_ids):
    model_preds = w2v_model.predict_output_word(
        train_ids[-w2v_model.window:], topn=(TOP_K + len(train_ids))
    )
    if model_preds is None:
        return []
    
    return [pred[0] for pred in model_preds if pred[0] not in train_ids][:TOP_K]
    
# сформируем датафрейм с кандидатами из модели w2v
w2v_candidates = pl.DataFrame({
    'user_id': grouped_df['user_id'],
    'candidates': [
        get_w2v_candidates(w2v_model, train_ids)
        for train_ids in grouped_df['train_item_ids'].to_list()
    ]
})
w2v_candidates

user_id,candidates
i64,list[i64]
4073,"[1221, 2028, … 2959]"
2236,"[94, 1089, … 1088]"
2370,"[1235, 1231, … 1307]"
268,"[2116, 2114, … 1179]"
6,"[1678, 1673, … 3424]"
…,…
4710,"[2763, 1022, … 3565]"
5627,"[1092, 1238, … 2037]"
4454,"[2011, 2012, … 534]"
1170,"[1238, 1240, … 1231]"


In [72]:
def predict_ranks(ranking_model, candidates_with_features):
    if isinstance(ranking_model, CatBoostRanker):
        return ranking_model.predict(candidates_with_features.to_pandas())
    else:
        return ranking_model.predict_proba(candidates_with_features.to_pandas())[:, 1]

candidates_with_features = (
    # объединяем все кандидаты в один список пар (user_id, item_id)
    pl.concat([als_candidates, w2v_candidates])
    .explode('candidates')
    .rename({'candidates': 'item_id'})
    .unique()
    # добавляем признаки пользователей
    .join(user_features, 'user_id')
    # добавляем признаки объектов
    .join(item_features, 'item_id')
).drop(['title', 'genres'])

candidates_with_features = candidates_with_features.with_columns([
    # взвешиваем с помощью модели ранжирования кандидатов
    pl.Series(predict_ranks(ranking_model, candidates_with_features)).alias('rank')
])

candidates_with_features

user_id,item_id,gender,age,occupation,zip_code,rank
i64,i64,str,i64,i64,str,f64
2811,1081,"""M""",25,19,"""94122""",0.108003
1555,2967,"""F""",18,0,"""20008""",0.084012
2572,1193,"""M""",35,14,"""75005""",0.494878
1796,708,"""M""",50,11,"""20852""",0.15889
278,1243,"""M""",45,18,"""60482""",0.094869
…,…,…,…,…,…,…
4487,2701,"""M""",25,7,"""01850""",0.094917
602,1099,"""F""",56,6,"""14612""",0.220101
1411,2734,"""M""",35,1,"""08107""",0.191717
4163,2571,"""M""",56,7,"""08901""",0.325712


In [74]:
grouped_candidates_with_features = (
    candidates_with_features
    .group_by('user_id')
    .agg([
        pl.col('item_id'),
        pl.col('rank')
    ])
)

reranked_candidates = []
for user_id, item_ids, item_ranks in grouped_candidates_with_features.rows():
    # для каждого пользователя сортируем оценки модели ранжирования и оставляем top-k объектов
    reranked_candidates.append([
        user_id,
        [
            item_ids[ind]
            for ind in np.argsort(item_ranks)[::-1][:TOP_K]
        ]
    ])
    
reranked_candidates = pl.DataFrame(reranked_candidates, schema=['user_id', 'candidates_item_ids'])
reranked_candidates

  return dispatch(args[0].__class__)(*args, **kw)


user_id,candidates_item_ids
i64,list[i64]
1319,"[1197, 318, … 2580]"
2492,"[919, 1230, … 1393]"
9,"[589, 2762, … 3702]"
3,"[1270, 1136, … 3361]"
2897,"[1198, 858, … 3751]"
…,…
1828,"[1196, 593, … 1639]"
6020,"[260, 1252, … 1200]"
5499,"[2997, 593, … 2763]"
2620,"[1196, 260, … 3555]"


оценим метрики для кандидатов по отдельности

In [75]:
ndcg_list = []
hitrate_list = []

for y_rec, y_rel in w2v_candidates.join(grouped_df, 'user_id').select('candidates', 'test_item_ids').rows():
    ndcg_list.append(user_ndcg(y_rel, y_rec))
    hitrate_list.append(user_hitrate(y_rel, y_rec))
print(f'NDCG@{TOP_K} = {np.mean(ndcg_list):.4f}, Hitrate@{TOP_K} = {np.mean(hitrate_list):.4f}')

NDCG@20 = 0.1621, Hitrate@20 = 0.5152


In [76]:
ndcg_list = []
hitrate_list = []

for y_rec, y_rel in als_candidates.join(grouped_df, 'user_id').select('candidates', 'test_item_ids').rows():
    ndcg_list.append(user_ndcg(y_rel, y_rec))
    hitrate_list.append(user_hitrate(y_rel, y_rec))
print(f'NDCG@{TOP_K} = {np.mean(ndcg_list):.4f}, Hitrate@{TOP_K} = {np.mean(hitrate_list):.4f}')

NDCG@20 = 0.0730, Hitrate@20 = 0.3185


и попробуем два подхода объединения кандидатов:
1. будем по очереди брать объекты на i-ой категории и соединять в один список
2. возьмем сначала n кандидатов из одного списка, затем из другого и так далее

In [86]:
joined_candidates = (
    pl.concat([als_candidates, w2v_candidates])
    .group_by('user_id')
    .agg(pl.col('candidates'))
)

ndcg_list = []
hitrate_list = []

for candidates, y_rel in (
    joined_candidates
    .join(grouped_df, 'user_id')
    .select('candidates', 'test_item_ids')
    .rows()
):
    y_rec = [item_id for item_ids in list(zip(*candidates)) for item_id in item_ids]
    
    ndcg_list.append(user_ndcg(y_rel, y_rec))
    hitrate_list.append(user_hitrate(y_rel, y_rec))
print(f'NDCG@{TOP_K} = {np.mean(ndcg_list):.4f}, Hitrate@{TOP_K} = {np.mean(hitrate_list):.4f}')

NDCG@20 = 0.1336, Hitrate@20 = 0.5179


In [94]:
joined_candidates = (
    pl.concat([als_candidates, w2v_candidates])
    .group_by('user_id')
    .agg(pl.col('candidates'))
)

ndcg_list = []
hitrate_list = []

for candidates, y_rel in (
    joined_candidates
    .join(grouped_df, 'user_id')
    .select('candidates', 'test_item_ids')
    .rows()
):
    y_rec = [item_id for item_ids in candidates for item_id in item_ids[:TOP_K // len(candidates)]]
    
    ndcg_list.append(user_ndcg(y_rel, y_rec))
    hitrate_list.append(user_hitrate(y_rel, y_rec))
print(f'NDCG@{TOP_K} = {np.mean(ndcg_list):.4f}, Hitrate@{TOP_K} = {np.mean(hitrate_list):.4f}')

NDCG@20 = 0.1153, Hitrate@20 = 0.5230


ну и наконец оценим метрики для кандидатов после ранжирования

In [95]:
ndcg_list = []
hitrate_list = []

for y_rec, y_rel in (
    reranked_candidates
    .join(grouped_df, 'user_id')
    .select('candidates_item_ids', 'test_item_ids')
    .rows()
):
    ndcg_list.append(user_ndcg(y_rel, y_rec))
    hitrate_list.append(user_hitrate(y_rel, y_rec))
print(f'NDCG@{TOP_K} = {np.mean(ndcg_list):.4f}, Hitrate@{TOP_K} = {np.mean(hitrate_list):.4f}')

NDCG@20 = 0.2361, Hitrate@20 = 0.6712


In [96]:
print(f'NDCG@{TOP_K} +{(0.2361 / 0.1336 - 1)* 100:.2f}%')
print(f'Hitrate@{TOP_K} +{(0.6712 / 0.5179 - 1)* 100:.2f}%')

NDCG@20 +76.72%
Hitrate@20 +29.60%
