In [31]:
import numpy as np
import pandas as pd
from catboost import CatBoostRanker, Pool

# 1. Подготовка данных

In [32]:
# Directory holding the CSV splits (a mounted Drive path in this notebook).
# Change this single constant to run the notebook against another location.
DATA_DIR = "/content/drive/MyDrive/data/data"

# Feature columns removed from BOTH splits before any modelling.
DROPPED_FEATURES = ["feature_0", "feature_73", "feature_74", "feature_75"]


def load_split(file_name):
    """Read one CSV split from ``DATA_DIR`` and drop the excluded features.

    Args:
        file_name: Name of the CSV file inside ``DATA_DIR``.

    Returns:
        A ``pandas.DataFrame`` without the ``DROPPED_FEATURES`` columns.
    """
    return pd.read_csv(f"{DATA_DIR}/{file_name}").drop(columns=DROPPED_FEATURES)


# Both splits go through the same loader, so their preprocessing cannot drift apart.
train_df = load_split("train_df.csv")
test_df = load_split("test_df.csv")

In [33]:
def _to_ranking_arrays(frame):
    """Split a frame into (feature matrix, float32 labels, query ids).

    Every column except ``search_id`` and ``target`` is treated as a feature.
    """
    feature_matrix = frame.drop(columns=["search_id", "target"]).values
    labels = frame["target"].to_numpy(dtype=np.float32)
    query_ids = frame["search_id"].values
    return feature_matrix, labels, query_ids


X_train, y_train, queries_train = _to_ranking_arrays(train_df)
X_test, y_test, queries_test = _to_ranking_arrays(test_df)

In [34]:
# Count the distinct search ids present in the training split.
num_queries = len(np.unique(queries_train))
num_queries

1000

In [36]:
def _make_pool(features, labels, group_ids):
    """Wrap feature, label and query-id arrays into a CatBoost ``Pool``.

    The query ids are passed as ``group_id``.
    """
    return Pool(data=features, label=labels, group_id=group_ids)


train_pool = _make_pool(X_train, y_train, queries_train)
test_pool = _make_pool(X_test, y_test, queries_test)

# 2. Выбор параметров модели

Поскольку целевая переменная (target) является бинарной (например, клик по объекту), в качестве функции потерь для обучения ранжированию целесообразно использовать **CrossEntropy**: её минимизация соответствует оценке максимального правдоподобия (MLE). Так как данные разбиты на группы (запросы), CrossEntropy также стоит считать по группам — в CatBoost для этого уже есть реализация **QueryCrossEntropy** ([reference](https://catboost.ai/en/docs/references/querycrossentropy)).

Обучение будет производиться на GPU (Google Colab T4).

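Для поиска оптимальных параметров будем использовать значение функции потерь на отложенной (валидационной) части обучающей выборки: в `grid_search` передаётся только `train_pool`, тестовая выборка в подборе не участвует.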



In [40]:
# Settings held constant for every candidate tried by the search
# (they are reused later when the final model is built).
parameters = {
    "loss_function": "QueryCrossEntropy",
    "task_type": "GPU",
    "verbose": False,
}

# Candidate values for the hyperparameters being tuned.
params_to_gridsearch = {
    "depth": [2, 4, 6],
    "iterations": [100, 250, 500],
    "learning_rate": [0.05, 0.2, 0.3],
}

model = CatBoostRanker(**parameters)

# Only train_pool is handed to the search, so test_pool plays no part in
# choosing the hyperparameters.
grid_search_result = model.grid_search(
    param_grid=params_to_gridsearch,
    X=train_pool,
    cv=5,
    verbose=False,
)

Default metric period is 5 because QueryCrossEntropy is/are not implemented for GPU
Default metric period is 5 because QueryCrossEntropy is/are not implemented for GPU


bestTest = 0.07049449518
bestIteration = 99


Default metric period is 5 because QueryCrossEntropy is/are not implemented for GPU


bestTest = 0.0704510769
bestIteration = 10


Default metric period is 5 because QueryCrossEntropy is/are not implemented for GPU


bestTest = 0.06948253087
bestIteration = 10


Default metric period is 5 because QueryCrossEntropy is/are not implemented for GPU


bestTest = 0.07034676607
bestIteration = 110


Default metric period is 5 because QueryCrossEntropy is/are not implemented for GPU


bestTest = 0.07045061343
bestIteration = 10


Default metric period is 5 because QueryCrossEntropy is/are not implemented for GPU


bestTest = 0.06948218963
bestIteration = 10


Default metric period is 5 because QueryCrossEntropy is/are not implemented for GPU


bestTest = 0.07034700035
bestIteration = 110


Default metric period is 5 because QueryCrossEntropy is/are not implemented for GPU


bestTest = 0.07045118895
bestIteration = 10


Default metric period is 5 because QueryCrossEntropy is/are not implemented for GPU


bestTest = 0.06948230168
bestIteration = 10


Default metric period is 5 because QueryCrossEntropy is/are not implemented for GPU


bestTest = 0.07127408892
bestIteration = 85


Default metric period is 5 because QueryCrossEntropy is/are not implemented for GPU


bestTest = 0.07174374997
bestIteration = 10


Default metric period is 5 because QueryCrossEntropy is/are not implemented for GPU


bestTest = 0.07209742181
bestIteration = 5


Default metric period is 5 because QueryCrossEntropy is/are not implemented for GPU


bestTest = 0.07127411439
bestIteration = 85


Default metric period is 5 because QueryCrossEntropy is/are not implemented for GPU


bestTest = 0.07174373469
bestIteration = 10


Default metric period is 5 because QueryCrossEntropy is/are not implemented for GPU


bestTest = 0.07209741671
bestIteration = 5


Default metric period is 5 because QueryCrossEntropy is/are not implemented for GPU


bestTest = 0.07127403799
bestIteration = 85


Default metric period is 5 because QueryCrossEntropy is/are not implemented for GPU


bestTest = 0.07174374997
bestIteration = 10


Default metric period is 5 because QueryCrossEntropy is/are not implemented for GPU


bestTest = 0.07209742181
bestIteration = 5


Default metric period is 5 because QueryCrossEntropy is/are not implemented for GPU


bestTest = 0.07206541706
bestIteration = 45


Default metric period is 5 because QueryCrossEntropy is/are not implemented for GPU


bestTest = 0.07418391861
bestIteration = 5


Default metric period is 5 because QueryCrossEntropy is/are not implemented for GPU


bestTest = 0.07458231764
bestIteration = 5


Default metric period is 5 because QueryCrossEntropy is/are not implemented for GPU


bestTest = 0.07206541706
bestIteration = 45


Default metric period is 5 because QueryCrossEntropy is/are not implemented for GPU


bestTest = 0.07418391351
bestIteration = 5


Default metric period is 5 because QueryCrossEntropy is/are not implemented for GPU


bestTest = 0.07458231255
bestIteration = 5


Default metric period is 5 because QueryCrossEntropy is/are not implemented for GPU


bestTest = 0.07206542215
bestIteration = 45


Default metric period is 5 because QueryCrossEntropy is/are not implemented for GPU


bestTest = 0.07418391351
bestIteration = 5


Default metric period is 5 because QueryCrossEntropy is/are not implemented for GPU


bestTest = 0.07458230746
bestIteration = 5


Default metric period is 5 because QueryCrossEntropy is/are not implemented for GPU


Training on fold [0/5]
bestTest = 0.05717346856
bestIteration = 5
Training on fold [1/5]
bestTest = 0.0702573542
bestIteration = 5
Training on fold [2/5]
bestTest = 0.06768926616
bestIteration = 5
Training on fold [3/5]
bestTest = 0.07369067354
bestIteration = 20
Training on fold [4/5]
bestTest = 0.06889054609
bestIteration = 10


Default metric period is 5 because QueryCrossEntropy is/are not implemented for GPU


In [47]:
# Hyperparameter combination stored by the grid search under the "params" key;
# the bare expression on the last line displays it as the cell output.
best_params = grid_search_result["params"]
best_params

{'depth': 2, 'learning_rate': 0.3, 'iterations': 250}

# 3. Обучение базовой модели

In [48]:
# Combine the fixed settings with the tuned hyperparameters (tuned values win
# on key clashes) and additionally track NDCG during training.
final_parameters = {**parameters, **best_params, "custom_metric": "NDCG"}

model = CatBoostRanker(**final_parameters)

# NOTE(review): test_pool is used as the eval_set here. If CatBoost's
# best-iteration selection is active for this fit, the validation scores read
# from the model afterwards are selected on the test data and may be
# optimistic — confirm this is intended or evaluate on a separate hold-out.
model.fit(train_pool, eval_set=test_pool, verbose=False, plot=False)

Default metric period is 5 because QueryCrossEntropy, NDCG is/are not implemented for GPU
Metric NDCG:type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


<catboost.core.CatBoostRanker at 0x7c59c0bb7a90>

In [49]:
# Display the best metric values recorded for the fitted model, keyed by
# dataset ("learn" / "validation") and then by metric name.
model.best_score_

{'learn': {'QueryCrossEntropy': 0.04377642468948926},
 'validation': {'QueryCrossEntropy': 0.05834585705015533,
  'NDCG:type=Base': 0.9335375740060166}}

In [53]:
# Pull the NDCG value recorded for the validation (eval_set) data and report it.
ndcg_on_validation = model.best_score_["validation"]["NDCG:type=Base"]
print(f"NDCG: {ndcg_on_validation}")

NDCG: 0.9335375740060166


## Получаем NDCG $\approx$ 0.9335