In [1]:
import optuna
import random

import polars as pl
import pandas as pd
import numpy as np

from typing import List, Any
import scipy.sparse as sp
from tqdm import tqdm

import implicit
import faiss
from gensim.models import Word2Vec

from catboost import Pool, CatBoost, CatBoostClassifier, CatBoostRanker
from sklearn.model_selection import train_test_split

In [2]:
ratings = pd.read_csv(
    'ml-1m/ratings.dat', delimiter='::', header=None, 
    names=['user_id', 'item_id', 'rating', 'timestamp'], 
    engine='python'
)
ratings = pl.from_pandas(ratings).filter(pl.col('rating') >= 4)
ratings

user_id,item_id,rating,timestamp
i64,i64,i64,i64
1,1193,5,978300760
1,3408,4,978300275
1,2355,5,978824291
1,1287,5,978302039
1,2804,5,978300719
1,594,4,978302268
1,919,4,978301368
1,595,5,978824268
1,938,4,978301752
1,2398,4,978302281


In [3]:
grouped_df = (
    ratings
    .groupby('user_id')
    .agg([
        pl.col('item_id').apply(lambda x: x[:-3]).alias('train_item_ids'),
        pl.col('rating').apply(lambda x: x[:-3]).alias('train_ratings'),
        pl.col('item_id').apply(lambda x: x[-3:]).alias('test_item_ids'),
        pl.col('rating').apply(lambda x: x[-3:]).alias('test_ratings'),
    ])
)
grouped_df

user_id,train_item_ids,train_ratings,test_item_ids,test_ratings
i64,list[i64],list[i64],list[i64],list[i64]
1448,"[2987, 571, … 1240]","[4, 4, … 5]","[1242, 1246, 1247]","[4, 5, 4]"
4840,"[588, 1, … 527]","[5, 4, … 5]","[531, 2013, 1097]","[4, 4, 4]"
72,"[3798, 2997, … 3753]","[4, 5, … 4]","[82, 2028, 1304]","[5, 5, 5]"
4808,"[2059, 588, … 3770]","[4, 4, … 5]","[3773, 2046, 2048]","[4, 5, 4]"
4656,"[585, 586, … 1073]","[4, 4, … 4]","[541, 2028, 551]","[5, 4, 4]"
448,"[2617, 2688, … 2763]","[4, 4, … 5]","[2959, 2028, 1097]","[5, 5, 4]"
1784,"[589, 593, … 32]","[5, 4, … 5]","[2916, 527, 2046]","[4, 5, 4]"
4384,"[574, 589, … 1092]","[4, 4, … 4]","[1093, 1094, 1095]","[5, 4, 5]"
5840,"[3791, 587, … 1073]","[4, 4, … 5]","[3763, 1088, 1097]","[4, 5, 5]"
6040,"[573, 589, … 1094]","[4, 4, … 5]","[562, 1096, 1097]","[5, 4, 4]"


In [4]:
median_seq_len = int(grouped_df['train_item_ids'].apply(len).median())
print(f"средняя длина сессии {median_seq_len}")

средняя длина сессии 55


In [5]:
# соберем строчки для разреженной матрицы
rows = []
cols = []
values = []

for user_id, train_item_ids, train_ratings in grouped_df.select('user_id', 'train_item_ids', 'train_ratings').rows():
    rows.extend([user_id] * len(train_item_ids))
    cols.extend(train_item_ids)
    values.extend(train_ratings)

user_item_data = sp.csr_matrix((values, (rows, cols)), dtype=np.float32)
user_item_data

<6041x3953 sparse matrix of type '<class 'numpy.float32'>'
	with 557171 stored elements in Compressed Sparse Row format>

In [6]:
TOP_K = 20


def user_hitrate(y_relevant: List[str], y_preds: List[str], k: int = TOP_K) -> int:
    """
    :param y_rel: relevant items
    :param y_rec: recommended items
    :param k: number of top recommended items
    :return: 1 if top-k recommendations contains at lease one relevant item
    """
    return int(len(set(y_relevant).intersection(y_preds[:k])) > 0)


def user_ndcg(y_rel: List[Any], y_rec: List[Any], k: int = TOP_K) -> float:
    """
    :param y_rel: relevant items
    :param y_rec: recommended items
    :param k: number of top recommended items
    :return: ndcg metric for user recommendations
    """
    dcg = sum([1. / np.log2(idx + 2) for idx, item in enumerate(y_rec[:k]) if item in y_rel])
    idcg = sum([1. / np.log2(idx + 2) for idx, _ in enumerate(zip(y_rel, np.arange(k)))])
    return dcg / idcg

In [7]:
RANDOM_STATE = 42

def set_seed():
    random.seed(RANDOM_STATE)
    np.random.seed(RANDOM_STATE)
    
    
def get_recommendations(user_embs: np.array, item_embs: np.array, k: int = TOP_K):
    # строим индекс объектов
    index = faiss.IndexFlatIP(item_embs.shape[1])
    index.add(item_embs)

    # строим рекомендации с помощью dot-product расстояния
    # с запасом, чтобы после фильтрации просмотренных осталось хотя бы TOP_K
    return index.search(user_embs, k)

## ALS

В качестве первой модели возьмем ALS факторизацию и подберем оптимальные гиперпараметры с помощью библиотеки `optuna`

In [8]:
def objective(trial):
    factors = trial.suggest_int('factors', 8, 128)
    iterations = trial.suggest_int('iterations', 5, 30)
    alpha = trial.suggest_float('alpha', 0.1, 5.0)
    regularization = trial.suggest_float('regularization', 1e-3, 1.0)
        
    print({
        'factors': factors,
        'iterations': iterations,
        'alpha': alpha,
        'regularization': regularization,
    })
    
    set_seed()
    als_model = implicit.als.AlternatingLeastSquares(
        factors=factors,
        iterations=iterations,
        random_state=RANDOM_STATE,
        alpha=alpha,
        regularization=regularization
    )
    als_model.fit(user_item_data)
    
    _, recs = get_recommendations(
        als_model.user_factors,
        als_model.item_factors,
        TOP_K + median_seq_len
    )
    
    ndcg_list = []
    for user_id, user_history, y_rel in grouped_df.select('user_id', 'train_item_ids', 'test_item_ids').rows():
        y_rec = [item_id for item_id in recs[user_id] if item_id not in user_history]
        ndcg_list.append(user_ndcg(y_rel, y_rec))
    mean_ndcg = np.mean(ndcg_list)
    print(f'NDCG@{TOP_K} = {mean_ndcg}')
    return mean_ndcg
    
    
study = optuna.create_study(directions=('maximize',))
study.optimize(objective, n_trials=20)

study.best_params

[I 2023-10-08 13:37:35,190] A new study created in memory with name: no-name-d07c46c8-1233-4f57-b1c8-043049528b60


{'factors': 16, 'iterations': 22, 'alpha': 1.2980733971075535, 'regularization': 0.8237628084522084}


  0%|          | 0/22 [00:00<?, ?it/s]

[I 2023-10-08 13:37:37,603] Trial 0 finished with value: 0.06374625453013871 and parameters: {'factors': 16, 'iterations': 22, 'alpha': 1.2980733971075535, 'regularization': 0.8237628084522084}. Best is trial 0 with value: 0.06374625453013871.


NDCG@20 = 0.06374625453013871
{'factors': 47, 'iterations': 11, 'alpha': 1.2790531729943362, 'regularization': 0.47040171932694835}


  0%|          | 0/11 [00:00<?, ?it/s]

[I 2023-10-08 13:37:40,510] Trial 1 finished with value: 0.0703597635272655 and parameters: {'factors': 47, 'iterations': 11, 'alpha': 1.2790531729943362, 'regularization': 0.47040171932694835}. Best is trial 1 with value: 0.0703597635272655.


NDCG@20 = 0.0703597635272655
{'factors': 45, 'iterations': 12, 'alpha': 3.159540093554007, 'regularization': 0.2893510445441296}


  0%|          | 0/12 [00:00<?, ?it/s]

[I 2023-10-08 13:37:43,803] Trial 2 finished with value: 0.06474666466103732 and parameters: {'factors': 45, 'iterations': 12, 'alpha': 3.159540093554007, 'regularization': 0.2893510445441296}. Best is trial 1 with value: 0.0703597635272655.


NDCG@20 = 0.06474666466103732
{'factors': 83, 'iterations': 23, 'alpha': 4.498373870255871, 'regularization': 0.4905491650310873}


  0%|          | 0/23 [00:00<?, ?it/s]

[I 2023-10-08 13:37:54,989] Trial 3 finished with value: 0.06365754173097307 and parameters: {'factors': 83, 'iterations': 23, 'alpha': 4.498373870255871, 'regularization': 0.4905491650310873}. Best is trial 1 with value: 0.0703597635272655.


NDCG@20 = 0.06365754173097307
{'factors': 60, 'iterations': 27, 'alpha': 0.7301388022596219, 'regularization': 0.22168368806292812}


  0%|          | 0/27 [00:00<?, ?it/s]

[I 2023-10-08 13:38:06,173] Trial 4 finished with value: 0.07699509829571285 and parameters: {'factors': 60, 'iterations': 27, 'alpha': 0.7301388022596219, 'regularization': 0.22168368806292812}. Best is trial 4 with value: 0.07699509829571285.


NDCG@20 = 0.07699509829571285
{'factors': 47, 'iterations': 20, 'alpha': 3.9929837813393463, 'regularization': 0.7996954115150592}


  0%|          | 0/20 [00:00<?, ?it/s]

[I 2023-10-08 13:38:13,209] Trial 5 finished with value: 0.061050239850323554 and parameters: {'factors': 47, 'iterations': 20, 'alpha': 3.9929837813393463, 'regularization': 0.7996954115150592}. Best is trial 4 with value: 0.07699509829571285.


NDCG@20 = 0.061050239850323554
{'factors': 33, 'iterations': 10, 'alpha': 1.1336164429018136, 'regularization': 0.7366039187055663}


  0%|          | 0/10 [00:00<?, ?it/s]

[I 2023-10-08 13:38:15,423] Trial 6 finished with value: 0.06936496412130065 and parameters: {'factors': 33, 'iterations': 10, 'alpha': 1.1336164429018136, 'regularization': 0.7366039187055663}. Best is trial 4 with value: 0.07699509829571285.


NDCG@20 = 0.06936496412130065
{'factors': 117, 'iterations': 9, 'alpha': 4.009204527247484, 'regularization': 0.25928774050090253}


  0%|          | 0/9 [00:00<?, ?it/s]

[I 2023-10-08 13:38:26,272] Trial 7 finished with value: 0.06584217326148942 and parameters: {'factors': 117, 'iterations': 9, 'alpha': 4.009204527247484, 'regularization': 0.25928774050090253}. Best is trial 4 with value: 0.07699509829571285.


NDCG@20 = 0.06584217326148942
{'factors': 126, 'iterations': 15, 'alpha': 0.7353353472171489, 'regularization': 0.7780306212622194}


  0%|          | 0/15 [00:00<?, ?it/s]

[I 2023-10-08 13:38:47,371] Trial 8 finished with value: 0.07150734831370684 and parameters: {'factors': 126, 'iterations': 15, 'alpha': 0.7353353472171489, 'regularization': 0.7780306212622194}. Best is trial 4 with value: 0.07699509829571285.


NDCG@20 = 0.07150734831370684
{'factors': 54, 'iterations': 27, 'alpha': 0.7889284445080836, 'regularization': 0.21551873178732905}


  0%|          | 0/27 [00:00<?, ?it/s]

[I 2023-10-08 13:39:01,185] Trial 9 finished with value: 0.07438689340620668 and parameters: {'factors': 54, 'iterations': 27, 'alpha': 0.7889284445080836, 'regularization': 0.21551873178732905}. Best is trial 4 with value: 0.07699509829571285.


NDCG@20 = 0.07438689340620668
{'factors': 84, 'iterations': 30, 'alpha': 0.14157397311529984, 'regularization': 0.024323717058256505}


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2023-10-08 13:39:17,939] Trial 10 finished with value: 0.07420192417721956 and parameters: {'factors': 84, 'iterations': 30, 'alpha': 0.14157397311529984, 'regularization': 0.024323717058256505}. Best is trial 4 with value: 0.07699509829571285.


NDCG@20 = 0.07420192417721956
{'factors': 72, 'iterations': 29, 'alpha': 2.0948285562661564, 'regularization': 0.08964483499552373}


  0%|          | 0/29 [00:00<?, ?it/s]

[I 2023-10-08 13:39:38,380] Trial 11 finished with value: 0.069575531659918 and parameters: {'factors': 72, 'iterations': 29, 'alpha': 2.0948285562661564, 'regularization': 0.08964483499552373}. Best is trial 4 with value: 0.07699509829571285.


NDCG@20 = 0.069575531659918
{'factors': 61, 'iterations': 26, 'alpha': 0.20939109408878598, 'regularization': 0.22379629301465345}


  0%|          | 0/26 [00:00<?, ?it/s]

[I 2023-10-08 13:40:11,364] Trial 12 finished with value: 0.08066796259561843 and parameters: {'factors': 61, 'iterations': 26, 'alpha': 0.20939109408878598, 'regularization': 0.22379629301465345}. Best is trial 12 with value: 0.08066796259561843.


NDCG@20 = 0.08066796259561843
{'factors': 98, 'iterations': 25, 'alpha': 0.19797414321365217, 'regularization': 0.38565552111256607}


  0%|          | 0/25 [00:00<?, ?it/s]

[I 2023-10-08 13:40:32,872] Trial 13 finished with value: 0.07313270364882624 and parameters: {'factors': 98, 'iterations': 25, 'alpha': 0.19797414321365217, 'regularization': 0.38565552111256607}. Best is trial 12 with value: 0.08066796259561843.


NDCG@20 = 0.07313270364882624
{'factors': 65, 'iterations': 18, 'alpha': 1.8868675069480876, 'regularization': 0.1294102899468391}


  0%|          | 0/18 [00:00<?, ?it/s]

[I 2023-10-08 13:40:59,543] Trial 14 finished with value: 0.06984251232545037 and parameters: {'factors': 65, 'iterations': 18, 'alpha': 1.8868675069480876, 'regularization': 0.1294102899468391}. Best is trial 12 with value: 0.08066796259561843.


NDCG@20 = 0.06984251232545037
{'factors': 26, 'iterations': 25, 'alpha': 0.1551685589558227, 'regularization': 0.001960619806722269}


  0%|          | 0/25 [00:00<?, ?it/s]

[I 2023-10-08 13:41:03,596] Trial 15 finished with value: 0.07899412533580902 and parameters: {'factors': 26, 'iterations': 25, 'alpha': 0.1551685589558227, 'regularization': 0.001960619806722269}. Best is trial 12 with value: 0.08066796259561843.


NDCG@20 = 0.07899412533580902
{'factors': 11, 'iterations': 5, 'alpha': 0.12662983009760548, 'regularization': 0.04104337694633558}


  0%|          | 0/5 [00:00<?, ?it/s]

[I 2023-10-08 13:41:04,701] Trial 16 finished with value: 0.06960281396190923 and parameters: {'factors': 11, 'iterations': 5, 'alpha': 0.12662983009760548, 'regularization': 0.04104337694633558}. Best is trial 12 with value: 0.08066796259561843.


NDCG@20 = 0.06960281396190923
{'factors': 28, 'iterations': 24, 'alpha': 2.6782054025501187, 'regularization': 0.015053800535140188}


  0%|          | 0/24 [00:00<?, ?it/s]

[I 2023-10-08 13:41:08,613] Trial 17 finished with value: 0.06353795046898754 and parameters: {'factors': 28, 'iterations': 24, 'alpha': 2.6782054025501187, 'regularization': 0.015053800535140188}. Best is trial 12 with value: 0.08066796259561843.


NDCG@20 = 0.06353795046898754
{'factors': 28, 'iterations': 17, 'alpha': 1.8466242539531883, 'regularization': 0.1540589051482909}


  0%|          | 0/17 [00:00<?, ?it/s]

[I 2023-10-08 13:41:11,559] Trial 18 finished with value: 0.06482571794262887 and parameters: {'factors': 28, 'iterations': 17, 'alpha': 1.8466242539531883, 'regularization': 0.1540589051482909}. Best is trial 12 with value: 0.08066796259561843.


NDCG@20 = 0.06482571794262887
{'factors': 104, 'iterations': 26, 'alpha': 0.546668755171372, 'regularization': 0.3604277738955223}


  0%|          | 0/26 [00:00<?, ?it/s]

[I 2023-10-08 13:41:31,888] Trial 19 finished with value: 0.07356113356684539 and parameters: {'factors': 104, 'iterations': 26, 'alpha': 0.546668755171372, 'regularization': 0.3604277738955223}. Best is trial 12 with value: 0.08066796259561843.


NDCG@20 = 0.07356113356684539


{'factors': 61,
 'iterations': 26,
 'alpha': 0.20939109408878598,
 'regularization': 0.22379629301465345}

In [8]:
set_seed()
als_model = implicit.als.AlternatingLeastSquares(
    factors=70,
    iterations=50,
    random_state=RANDOM_STATE,
    regularization=0.5,
    alpha=0.7,
)
als_model.fit(user_item_data)

_, als_recs = get_recommendations(
    als_model.user_factors,
    als_model.item_factors,
    TOP_K + median_seq_len
)

ndcg_list = []
hitrate_list = []
for user_id, user_history, y_rel in grouped_df.select('user_id', 'train_item_ids', 'test_item_ids').rows():
    y_rec = [item_id for item_id in als_recs[user_id] if item_id not in user_history]
    ndcg_list.append(user_ndcg(y_rel, y_rec))
    hitrate_list.append(user_hitrate(y_rel, y_rec))
print(f'NDCG@{TOP_K} = {np.mean(ndcg_list):.4f}, Hitrate@{TOP_K} = {np.mean(hitrate_list):.4f}')

  0%|          | 0/50 [00:00<?, ?it/s]

NDCG@20 = 0.0760, Hitrate@20 = 0.3264


## Word2Vec

В качестве второго подхода попробуем использовать сессионные рекомендации и модель w2v, для нее так же подберем оптимальные гиперпараметры с помощью библиотеки `optuna`

In [14]:
def evaluate_model(model):
    ndcg_list = []
    hitrate_list = []
    for train_ids, y_rel in grouped_df.select('train_item_ids', 'test_item_ids').rows():
        model_preds = model.predict_output_word(train_ids[-model.window:], topn=(TOP_K + len(train_ids)))
        if model_preds is None:
            ndcg_list.append(0)
            hitrate_list.append(0)
            continue

        y_rec = [pred[0] for pred in model_preds if pred[0] not in train_ids]
        ndcg_list.append(user_ndcg(y_rel, y_rec))
        hitrate_list.append(user_hitrate(y_rel, y_rec))
    return np.mean(ndcg_list), np.mean(hitrate_list)


def objective(trial):
    sg = trial.suggest_categorical('sg', [0, 1])
    window = trial.suggest_int('window', 1, 10)
    ns_exponent = trial.suggest_float('ns_exponent', -3, 3)
    negative = trial.suggest_int('negative', 3, 20)
    min_count = trial.suggest_int('min_count', 0, 20)
    vector_size = trial.suggest_categorical('vector_size', [16, 32, 64, 128])
    
    print({
        'sg': sg,
        'window_len': window,
        'ns_exponent': ns_exponent,
        'negative': negative,
        'min_count': min_count,
        'vector_size': vector_size,
    })
    
    set_seed()
    model = Word2Vec(
        grouped_df['train_item_ids'].to_list(),
        window=window,
        sg=sg,
        hs=0,
        min_count=min_count,
        vector_size=vector_size,
        negative=negative,
        ns_exponent=ns_exponent,
        seed=RANDOM_STATE,
        epochs=10,
    )
    
    mean_ndcg, mean_hitrate = evaluate_model(model)
    print(f'NDCG@{TOP_K} = {mean_ndcg:.4f}, Hitrate@{TOP_K} = {mean_hitrate:.4f}')
    return mean_ndcg
    
    
study = optuna.create_study(directions=('maximize',))
study.optimize(objective, n_trials=20)

study.best_params

[I 2023-09-28 13:19:04,169] A new study created in memory with name: no-name-12a81c96-ab9c-44df-963f-8477f49709e3


{'sg': 1, 'window_len': 10, 'ns_exponent': -2.8148944733799937, 'negative': 3, 'min_count': 5, 'vector_size': 16}


[I 2023-09-28 13:19:10,203] Trial 0 finished with value: 0.07070304176177117 and parameters: {'sg': 1, 'window': 10, 'ns_exponent': -2.8148944733799937, 'negative': 3, 'min_count': 5, 'vector_size': 16}. Best is trial 0 with value: 0.07070304176177117.


NDCG@20 = 0.07070304176177117
{'sg': 1, 'window_len': 5, 'ns_exponent': -0.07444822537703732, 'negative': 18, 'min_count': 16, 'vector_size': 32}


[I 2023-09-28 13:19:24,263] Trial 1 finished with value: 0.06581562501419552 and parameters: {'sg': 1, 'window': 5, 'ns_exponent': -0.07444822537703732, 'negative': 18, 'min_count': 16, 'vector_size': 32}. Best is trial 0 with value: 0.07070304176177117.


NDCG@20 = 0.06581562501419552
{'sg': 1, 'window_len': 6, 'ns_exponent': 2.499336282479147, 'negative': 9, 'min_count': 4, 'vector_size': 16}


[I 2023-09-28 13:19:32,086] Trial 2 finished with value: 0.0659551796449718 and parameters: {'sg': 1, 'window': 6, 'ns_exponent': 2.499336282479147, 'negative': 9, 'min_count': 4, 'vector_size': 16}. Best is trial 0 with value: 0.07070304176177117.


NDCG@20 = 0.0659551796449718
{'sg': 0, 'window_len': 3, 'ns_exponent': -0.701171263042756, 'negative': 8, 'min_count': 12, 'vector_size': 16}


[I 2023-09-28 13:19:35,564] Trial 3 finished with value: 0.11303485484295084 and parameters: {'sg': 0, 'window': 3, 'ns_exponent': -0.701171263042756, 'negative': 8, 'min_count': 12, 'vector_size': 16}. Best is trial 3 with value: 0.11303485484295084.


NDCG@20 = 0.11303485484295084
{'sg': 1, 'window_len': 10, 'ns_exponent': -2.5879660913130937, 'negative': 3, 'min_count': 15, 'vector_size': 32}


[I 2023-09-28 13:19:44,088] Trial 4 finished with value: 0.04247340719368319 and parameters: {'sg': 1, 'window': 10, 'ns_exponent': -2.5879660913130937, 'negative': 3, 'min_count': 15, 'vector_size': 32}. Best is trial 3 with value: 0.11303485484295084.


NDCG@20 = 0.04247340719368319
{'sg': 0, 'window_len': 8, 'ns_exponent': 0.8731775605983865, 'negative': 6, 'min_count': 16, 'vector_size': 16}


[I 2023-09-28 13:19:47,500] Trial 5 finished with value: 0.047637661721706505 and parameters: {'sg': 0, 'window': 8, 'ns_exponent': 0.8731775605983865, 'negative': 6, 'min_count': 16, 'vector_size': 16}. Best is trial 3 with value: 0.11303485484295084.


NDCG@20 = 0.047637661721706505
{'sg': 0, 'window_len': 3, 'ns_exponent': -1.0306459459890838, 'negative': 13, 'min_count': 6, 'vector_size': 32}


[I 2023-09-28 13:19:51,784] Trial 6 finished with value: 0.1047335420974221 and parameters: {'sg': 0, 'window': 3, 'ns_exponent': -1.0306459459890838, 'negative': 13, 'min_count': 6, 'vector_size': 32}. Best is trial 3 with value: 0.11303485484295084.


NDCG@20 = 0.1047335420974221
{'sg': 0, 'window_len': 9, 'ns_exponent': 2.1445807942766146, 'negative': 10, 'min_count': 12, 'vector_size': 64}


[I 2023-09-28 13:19:57,330] Trial 7 finished with value: 0.018200755266323587 and parameters: {'sg': 0, 'window': 9, 'ns_exponent': 2.1445807942766146, 'negative': 10, 'min_count': 12, 'vector_size': 64}. Best is trial 3 with value: 0.11303485484295084.


NDCG@20 = 0.018200755266323587
{'sg': 1, 'window_len': 7, 'ns_exponent': -0.7512612633370499, 'negative': 10, 'min_count': 7, 'vector_size': 64}


[I 2023-09-28 13:20:11,787] Trial 8 finished with value: 0.05010055886348278 and parameters: {'sg': 1, 'window': 7, 'ns_exponent': -0.7512612633370499, 'negative': 10, 'min_count': 7, 'vector_size': 64}. Best is trial 3 with value: 0.11303485484295084.


NDCG@20 = 0.05010055886348278
{'sg': 0, 'window_len': 3, 'ns_exponent': 1.2948497838755424, 'negative': 14, 'min_count': 17, 'vector_size': 128}


[I 2023-09-28 13:20:17,183] Trial 9 finished with value: 0.054952656323472786 and parameters: {'sg': 0, 'window': 3, 'ns_exponent': 1.2948497838755424, 'negative': 14, 'min_count': 17, 'vector_size': 128}. Best is trial 3 with value: 0.11303485484295084.


NDCG@20 = 0.054952656323472786
{'sg': 0, 'window_len': 1, 'ns_exponent': -1.780995584699475, 'negative': 20, 'min_count': 0, 'vector_size': 128}


[I 2023-09-28 13:20:23,379] Trial 10 finished with value: 0.08164655565404855 and parameters: {'sg': 0, 'window': 1, 'ns_exponent': -1.780995584699475, 'negative': 20, 'min_count': 0, 'vector_size': 128}. Best is trial 3 with value: 0.11303485484295084.


NDCG@20 = 0.08164655565404855
{'sg': 0, 'window_len': 3, 'ns_exponent': -1.0059196568273143, 'negative': 14, 'min_count': 11, 'vector_size': 32}


[I 2023-09-28 13:20:29,251] Trial 11 finished with value: 0.11060084233633524 and parameters: {'sg': 0, 'window': 3, 'ns_exponent': -1.0059196568273143, 'negative': 14, 'min_count': 11, 'vector_size': 32}. Best is trial 3 with value: 0.11303485484295084.


NDCG@20 = 0.11060084233633524
{'sg': 0, 'window_len': 3, 'ns_exponent': -1.1335310925030724, 'negative': 16, 'min_count': 10, 'vector_size': 16}


[I 2023-09-28 13:20:34,155] Trial 12 finished with value: 0.13018802807298602 and parameters: {'sg': 0, 'window': 3, 'ns_exponent': -1.1335310925030724, 'negative': 16, 'min_count': 10, 'vector_size': 16}. Best is trial 12 with value: 0.13018802807298602.


NDCG@20 = 0.13018802807298602
{'sg': 0, 'window_len': 1, 'ns_exponent': 0.08995002818962561, 'negative': 17, 'min_count': 10, 'vector_size': 16}


[I 2023-09-28 13:20:38,030] Trial 13 finished with value: 0.1591000578367207 and parameters: {'sg': 0, 'window': 1, 'ns_exponent': 0.08995002818962561, 'negative': 17, 'min_count': 10, 'vector_size': 16}. Best is trial 13 with value: 0.1591000578367207.


NDCG@20 = 0.1591000578367207
{'sg': 0, 'window_len': 1, 'ns_exponent': 0.42131771744734636, 'negative': 17, 'min_count': 9, 'vector_size': 16}


[I 2023-09-28 13:20:43,501] Trial 14 finished with value: 0.15500329166041557 and parameters: {'sg': 0, 'window': 1, 'ns_exponent': 0.42131771744734636, 'negative': 17, 'min_count': 9, 'vector_size': 16}. Best is trial 13 with value: 0.1591000578367207.


NDCG@20 = 0.15500329166041557
{'sg': 0, 'window_len': 1, 'ns_exponent': 0.6308573564890306, 'negative': 17, 'min_count': 9, 'vector_size': 16}


[I 2023-09-28 13:20:47,982] Trial 15 finished with value: 0.14136445297845335 and parameters: {'sg': 0, 'window': 1, 'ns_exponent': 0.6308573564890306, 'negative': 17, 'min_count': 9, 'vector_size': 16}. Best is trial 13 with value: 0.1591000578367207.


NDCG@20 = 0.14136445297845335
{'sg': 0, 'window_len': 1, 'ns_exponent': 0.21930657529045242, 'negative': 20, 'min_count': 3, 'vector_size': 16}


[I 2023-09-28 13:20:53,254] Trial 16 finished with value: 0.16246831868333328 and parameters: {'sg': 0, 'window': 1, 'ns_exponent': 0.21930657529045242, 'negative': 20, 'min_count': 3, 'vector_size': 16}. Best is trial 16 with value: 0.16246831868333328.


NDCG@20 = 0.16246831868333328
{'sg': 0, 'window_len': 5, 'ns_exponent': 1.5251991806935847, 'negative': 20, 'min_count': 1, 'vector_size': 16}


[I 2023-09-28 13:20:57,952] Trial 17 finished with value: 0.07368306683727002 and parameters: {'sg': 0, 'window': 5, 'ns_exponent': 1.5251991806935847, 'negative': 20, 'min_count': 1, 'vector_size': 16}. Best is trial 16 with value: 0.16246831868333328.


NDCG@20 = 0.07368306683727002
{'sg': 0, 'window_len': 2, 'ns_exponent': 0.019576694493158277, 'negative': 19, 'min_count': 19, 'vector_size': 128}


[I 2023-09-28 13:21:05,576] Trial 18 finished with value: 0.13134996798453677 and parameters: {'sg': 0, 'window': 2, 'ns_exponent': 0.019576694493158277, 'negative': 19, 'min_count': 19, 'vector_size': 128}. Best is trial 16 with value: 0.16246831868333328.


NDCG@20 = 0.13134996798453677
{'sg': 0, 'window_len': 4, 'ns_exponent': -0.07850640884224044, 'negative': 15, 'min_count': 3, 'vector_size': 64}


[I 2023-09-28 13:21:11,470] Trial 19 finished with value: 0.08944864109974976 and parameters: {'sg': 0, 'window': 4, 'ns_exponent': -0.07850640884224044, 'negative': 15, 'min_count': 3, 'vector_size': 64}. Best is trial 16 with value: 0.16246831868333328.


NDCG@20 = 0.08944864109974976


{'sg': 0,
 'window': 1,
 'ns_exponent': 0.21930657529045242,
 'negative': 20,
 'min_count': 3,
 'vector_size': 16}

In [15]:
set_seed()
w2v_model = Word2Vec(
    grouped_df['train_item_ids'].to_list(),
    hs=0,
    seed=RANDOM_STATE,
    epochs=30,
    sg=0,
    window=1,
    ns_exponent=0.2,
    negative=20,
    min_count=3,
    vector_size=16,
)

mean_ndcg, mean_hitrate = evaluate_model(w2v_model)
print(f'NDCG@{TOP_K} = {mean_ndcg:.4f}, Hitrate@{TOP_K} = {mean_hitrate:.4f}')

NDCG@20 = 0.1570, Hitrate@20 = 0.5048


## Ранжирующая модель

Для ранжирования нам хотелось бы учитывать больше признаков для пользователей и объектов, в датасете movielens-1M, к счастью, все это имеется из коробки

Для _пользователей_ есть следующие признаки:
- gender (пол пользователя, категориальный признак)
- age (возраст пользователя, численный признак)
- occupation (род дейтельности, категориальный признак)
- zip_code (почтовый индекс пользователя, категориальный признак)

In [16]:
user_features = pd.read_csv(
    'ml-1m/users.dat', delimiter='::', header=None, 
    names=['user_id', 'gender', 'age', 'occupation', 'zip_code'], 
    engine='python'
)
user_features = pl.from_pandas(user_features)
user_features

user_id,gender,age,occupation,zip_code
i64,str,i64,i64,str
1,"""F""",1,10,"""48067"""
2,"""M""",56,16,"""70072"""
3,"""M""",25,15,"""55117"""
4,"""M""",45,7,"""02460"""
5,"""M""",25,20,"""55455"""
6,"""F""",50,9,"""55117"""
7,"""M""",35,1,"""06810"""
8,"""M""",25,12,"""11413"""
9,"""M""",25,17,"""61614"""
10,"""F""",35,1,"""95370"""


Для _объектов_ есть следующие признаки:
- title (название тайтла, строка)
- genres (жанр тайтла, список категориальных признаков)

In [17]:
item_features = pd.read_csv(
    'ml-1m/movies.dat', delimiter='::', header=None, 
    names=['item_id', 'title', 'genres'],
    engine='python', encoding='latin-1'
)
item_features = pl.from_pandas(item_features)
item_features = item_features.with_columns(pl.col('genres').apply(lambda x: x.split('|')))
item_features

item_id,title,genres
i64,str,list[str]
1,"""Toy Story (199…","[""Animation"", ""Children's"", ""Comedy""]"
2,"""Jumanji (1995)…","[""Adventure"", ""Children's"", ""Fantasy""]"
3,"""Grumpier Old M…","[""Comedy"", ""Romance""]"
4,"""Waiting to Exh…","[""Comedy"", ""Drama""]"
5,"""Father of the …","[""Comedy""]"
6,"""Heat (1995)""","[""Action"", ""Crime"", ""Thriller""]"
7,"""Sabrina (1995)…","[""Comedy"", ""Romance""]"
8,"""Tom and Huck (…","[""Adventure"", ""Children's""]"
9,"""Sudden Death (…","[""Action""]"
10,"""GoldenEye (199…","[""Action"", ""Adventure"", ""Thriller""]"


## Соберем датасет для ранжирования

В качестве **позитивных** объектов мы будем использовать те взаимодействия, где оценка >= 4

В качестве **негативных** объектов мы можем использовать те взаимодействия, где оценка < 4, однако для implicit реакции этот подход не всегда подходит и для простоты мы будем сэмплировать негативные примеры на примере сессионных рекомендаций

In [18]:
ns_exponent = 0.75  # степень популярности объекта
items_set = ratings['item_id'].unique()

# предподсчитаем вероятности в сэмплировании негативных примеров
item_probs = dict()
for item_id, count in ratings.groupby('item_id').count().rows():
    item_probs[item_id] = count / len(ratings)

item_probs = np.array([
    item_probs.get(item_id, 0.)
    for item_id in items_set
])**ns_exponent
item_probs /= np.sum(item_probs)  # normalize so that it's probabilities

In [19]:
set_seed()

n_negatives = 3  # кол-во негативных примеров на один позитивный

ranking_dataset = []
for user_id, pos_ids in ratings.groupby('user_id').agg(pl.col('item_id').alias('item_ids')).rows():
    pos_ids = set(pos_ids)
    # на каждый позитивный объект (len(pos_ids)) сэмплируем n_negatives негативных
    # и еще len(pos_ids) для фильтрации
    n_items_to_sample = min(
        len(item_probs),
        len(pos_ids) * (n_negatives + 1)
    )
    
    neg_ids = [
        item_id
        for item_id in np.random.choice(
            items_set,
            n_items_to_sample,
            replace=False,  # сэмплируем только уникальные объекты
            p=item_probs  # сэмплируем чаще популярные объекты для негативов
        )
        if item_id not in pos_ids
    ]
    
    for item_id in pos_ids:
        ranking_dataset.append([user_id, item_id, 1])
    for item_id in neg_ids:
        ranking_dataset.append([user_id, item_id, 0])
    
ranking_dataset = pl.DataFrame(ranking_dataset, schema=['user_id', 'item_id', 'target'])
ranking_dataset

user_id,item_id,target
i64,i64,i64
3152,3107,1
3152,260,1
3152,3754,1
3152,3852,1
3152,2797,1
3152,3793,1
3152,2161,1
3152,2678,1
3152,2431,1
3152,1276,0


In [20]:
# соединим датасет с признаками пользователей и объектов
ranking_dataset_with_features = (
    ranking_dataset
    .join(user_features, 'user_id')
    .join(item_features, 'item_id')
).drop(['title', 'genres'])  # пока что мы не будем использовать признаки для объектов, вам предлагается попробовать
# вам предлагается попробовать эффективно закодировать эти признаки и еще улучшить метрики

ranking_dataset_with_features

user_id,item_id,target,gender,age,occupation,zip_code
i64,i64,i64,str,i64,i64,str
3152,3107,1,"""M""",18,4,"""61801"""
3152,260,1,"""M""",18,4,"""61801"""
3152,3754,1,"""M""",18,4,"""61801"""
3152,3852,1,"""M""",18,4,"""61801"""
3152,2797,1,"""M""",18,4,"""61801"""
3152,3793,1,"""M""",18,4,"""61801"""
3152,2161,1,"""M""",18,4,"""61801"""
3152,2678,1,"""M""",18,4,"""61801"""
3152,2431,1,"""M""",18,4,"""61801"""
3152,1276,0,"""M""",18,4,"""61801"""


## Обучаем catboost

Большая часть признаков получилась категориальными, чтобы не думать над эффективным кодированием этих признаков мы воспользуемся библиотекой `catboost`

Класс Pool позволяет обернуть наши данные с возможностью указать группу для group-wise ранжирования

In [21]:
train_dataset, test_dataset = train_test_split(
    ranking_dataset_with_features, test_size=0.1, random_state=RANDOM_STATE
)
# отсортируем, чтобы использовать group_id в классе Pool
train_dataset = train_dataset.sort('user_id')
test_dataset = test_dataset.sort('user_id')

cat_features=['user_id', 'item_id', 'gender', 'zip_code', 'occupation']

train_pool = Pool(
    train_dataset.drop(['target']).to_pandas(),
    train_dataset['target'].to_list(),
    # используется для подсчета groupwise метрик, например NDCG
    group_id=train_dataset['user_id'].to_list(),
    cat_features=cat_features
)

test_pool = Pool(
    test_dataset.drop(['target']).to_pandas(),
    test_dataset['target'].to_list(),
    group_id=test_dataset['user_id'].to_list(),
    cat_features=cat_features
)

# обучаем модель классификации, так как она работает сильно быстрее, чем модель ранжирования
# в качестве метрик можно смотреть на NDCG, MAP и AUC
ranking_model = CatBoostClassifier(**{
    'iterations': 300,
    'verbose': False,
    'random_state': RANDOM_STATE,
    'use_best_model': True,
    'custom_metric': [f'NDCG:top={TOP_K}', f'MAP:top={TOP_K}', 'AUC']
})
ranking_model.fit(train_pool, plot=True, eval_set=test_pool)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x291e83fa0>

## Оценим результаты ранжирования

подготовим наши кандидаты в виде списков для каждого пользователя

In [22]:
# сформируем датафрейм с кандидатами из модели als
als_candidates = pl.DataFrame({
    'user_id': grouped_df['user_id'],
    'candidates': [
        [item_id for item_id in als_recs[user_id] if item_id not in user_history][:TOP_K]
        for user_id, user_history in grouped_df.select('user_id', 'train_item_ids').rows()
    ]
})
als_candidates

user_id,candidates
i64,list[i64]
1448,"[454, 1090, 2580]"
4840,"[3114, 2355, … 3100]"
72,"[1265, 1784, … 2268]"
4808,"[1240, 610, … 3396]"
4656,"[1393, 908, … 2916]"
448,"[2336, 3408, … 2987]"
1784,"[2916, 1196, … 541]"
4384,"[2706, 1394, … 1476]"
5840,"[597, 1035, … 3526]"
6040,"[3424, 1225, … 307]"


In [23]:
def get_w2v_candidates(w2v_model, train_ids):
    model_preds = w2v_model.predict_output_word(
        train_ids[-w2v_model.window:], topn=(TOP_K + len(train_ids))
    )
    if model_preds is None:
        return []
    
    return [pred[0] for pred in model_preds if pred[0] not in train_ids][:TOP_K]
    
# сформируем датафрейм с кандидатами из модели w2v
w2v_candidates = pl.DataFrame({
    'user_id': grouped_df['user_id'],
    'candidates': [
        get_w2v_candidates(w2v_model, train_ids)
        for train_ids in grouped_df['train_item_ids'].to_list()
    ]
})
w2v_candidates

user_id,candidates
i64,list[i64]
1448,"[2985, 1242, … 3920]"
4840,"[529, 2797, … 377]"
72,"[3755, 3751, … 1208]"
4808,"[1089, 3911, … 556]"
4656,"[2018, 2019, … 1221]"
448,"[2762, 1028, … 1965]"
1784,"[1968, 353, … 355]"
4384,"[1090, 1097, … 705]"
5840,"[2018, 2019, … 540]"
6040,"[1097, 1090, … 2038]"


In [24]:
def predict_ranks(ranking_model, candidates_with_features):
    if isinstance(ranking_model, CatBoostRanker):
        return ranking_model.predict(candidates_with_features.to_pandas())
    else:
        return ranking_model.predict_proba(candidates_with_features.to_pandas())[:, 1]

candidates_with_features = (
    # объединяем все кандидаты в один список пар (user_id, item_id)
    pl.concat([als_candidates, w2v_candidates])
    .explode('candidates')
    .rename({'candidates': 'item_id'})
    .unique()
    # добавляем признаки пользователей
    .join(user_features, 'user_id')
    # добавляем признаки объектов
    .join(item_features, 'item_id')
).drop(['title', 'genres'])

candidates_with_features = candidates_with_features.with_columns([
    # взвешиваем с помощью модели ранжирования кандидатов
    pl.Series(predict_ranks(ranking_model, candidates_with_features)).alias('rank')
])

candidates_with_features

user_id,item_id,gender,age,occupation,zip_code,rank
i64,i64,str,i64,i64,str,f64
1448,454,"""F""",25,3,"""17522""",0.387932
4840,589,"""F""",45,6,"""63135""",0.276843
4840,2571,"""F""",45,6,"""63135""",0.282576
4840,3255,"""F""",45,6,"""63135""",0.268616
72,1641,"""F""",45,0,"""55122""",0.278103
72,539,"""F""",45,0,"""55122""",0.294356
4656,1219,"""M""",25,17,"""94041""",0.341076
4656,866,"""M""",25,17,"""94041""",0.21189
448,2336,"""M""",25,17,"""80123""",0.190112
448,3408,"""M""",25,17,"""80123""",0.233324


In [25]:
grouped_candidates_with_features = (
    candidates_with_features
    .groupby('user_id')
    .agg([
        pl.col('item_id'),
        pl.col('rank')
    ])
)

reranked_candidates = []
for user_id, item_ids, item_ranks in grouped_candidates_with_features.rows():
    # для каждого пользователя сортируем оценки модели ранжирования и оставляем top-k объектов
    reranked_candidates.append([
        user_id,
        [
            item_ids[ind]
            for ind in np.argsort(item_ranks)[::-1][:TOP_K]
        ]
    ])
    
reranked_candidates = pl.DataFrame(reranked_candidates, schema=['user_id', 'candidates_item_ids'])
reranked_candidates

user_id,candidates_item_ids
i64,list[i64]
3208,"[593, 2762, … 3671]"
3888,"[1196, 1198, … 1080]"
1152,"[2959, 2571, … 70]"
1592,"[1240, 2762, … 1095]"
2640,"[2762, 1580, … 1215]"
808,"[2858, 608, … 2020]"
1808,"[318, 3578, … 1012]"
336,"[3793, 1214, … 1690]"
5128,"[1196, 527, … 1220]"
5896,"[527, 1, … 441]"


оценим метрики для кандидатов по отдельности

In [26]:
ndcg_list = []
hitrate_list = []

for y_rec, y_rel in w2v_candidates.join(grouped_df, 'user_id').select('candidates', 'test_item_ids').rows():
    ndcg_list.append(user_ndcg(y_rel, y_rec))
    hitrate_list.append(user_hitrate(y_rel, y_rec))
print(f'NDCG@{TOP_K} = {np.mean(ndcg_list):.4f}, Hitrate@{TOP_K} = {np.mean(hitrate_list):.4f}')

NDCG@20 = 0.1570, Hitrate@20 = 0.5048


In [27]:
ndcg_list = []
hitrate_list = []

for y_rec, y_rel in als_candidates.join(grouped_df, 'user_id').select('candidates', 'test_item_ids').rows():
    ndcg_list.append(user_ndcg(y_rel, y_rec))
    hitrate_list.append(user_hitrate(y_rel, y_rec))
print(f'NDCG@{TOP_K} = {np.mean(ndcg_list):.4f}, Hitrate@{TOP_K} = {np.mean(hitrate_list):.4f}')

NDCG@20 = 0.0760, Hitrate@20 = 0.3264


и попробуем два подхода объединения кандидатов:
1. будем по очереди брать объекты на i-ой категории и соединять в один список
2. возьмем сначала n кандидатов из одного списка, затем из другого и так далее

In [36]:
joined_candidates = (
    pl.concat([als_candidates, w2v_candidates])
    .groupby('user_id')
    .agg(pl.col('candidates'))
)

ndcg_list = []
hitrate_list = []

for candidates, y_rel in (
    joined_candidates
    .join(grouped_df, 'user_id')
    .select('candidates', 'test_item_ids')
    .rows()
):
    y_rec = [item_id for item_ids in list(zip(*candidates)) for item_id in item_ids]
    
    ndcg_list.append(user_ndcg(y_rel, y_rec))
    hitrate_list.append(user_hitrate(y_rel, y_rec))
print(f'NDCG@{TOP_K} = {np.mean(ndcg_list):.4f}, Hitrate@{TOP_K} = {np.mean(hitrate_list):.4f}')

NDCG@20 = 0.1351, Hitrate@20 = 0.5320


In [39]:
joined_candidates = (
    pl.concat([als_candidates, w2v_candidates])
    .groupby('user_id')
    .agg(pl.col('candidates'))
)

ndcg_list = []
hitrate_list = []

for candidates, y_rel in (
    joined_candidates
    .join(grouped_df, 'user_id')
    .select('candidates', 'test_item_ids')
    .rows()
):
    y_rec = [item_id for item_ids in candidates for item_id in item_ids[:TOP_K // len(candidates)]]
    
    ndcg_list.append(user_ndcg(y_rel, y_rec))
    hitrate_list.append(user_hitrate(y_rel, y_rec))
print(f'NDCG@{TOP_K} = {np.mean(ndcg_list):.4f}, Hitrate@{TOP_K} = {np.mean(hitrate_list):.4f}')

NDCG@20 = 0.1223, Hitrate@20 = 0.5421


ну и наконец оценим метрики для кандидатов после ранжирования

In [35]:
ndcg_list = []
hitrate_list = []

for y_rec, y_rel in (
    reranked_candidates
    .join(grouped_df, 'user_id')
    .select('candidates_item_ids', 'test_item_ids')
    .rows()
):
    ndcg_list.append(user_ndcg(y_rel, y_rec))
    hitrate_list.append(user_hitrate(y_rel, y_rec))
print(f'NDCG@{TOP_K} = {np.mean(ndcg_list):.4f}, Hitrate@{TOP_K} = {np.mean(hitrate_list):.4f}')

NDCG@20 = 0.2405, Hitrate@20 = 0.6706


In [31]:
print(f'NDCG@{TOP_K} +{(0.2496 / 0.1597 - 1)* 100:.2f}%')
print(f'Hitrate@{TOP_K} +{(0.6742 / 0.5099 - 1)* 100:.2f}%')

NDCG@20 +56.29%
Hitrate@20 +32.22%
