In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupShuffleSplit
from catboost import CatBoostRanker, Pool
from copy import deepcopy

# Load the ranking dataset from the working directory; the cells below read its
# 'rank' (target) and 'query_id' (group key) columns.
df = pd.read_csv('intern_task.csv')

Разделим датасет на train и test по группам `query_id`, чтобы все документы одного запроса попадали только в одну из выборок

In [2]:
# Hold out 20% of the queries (groups) for testing. Splitting on query_id keeps
# every row of a query on the same side of the split.
splitter = GroupShuffleSplit(n_splits=1, test_size=.20, random_state=42)
gss = splitter.split(df, groups=df['query_id'])
X_train_inds, X_test_inds = next(gss)

train_data = df.iloc[X_train_inds]
test_data = df.iloc[X_test_inds]

# Every column except the target and the group key is used as a feature.
non_feature_cols = ['rank', 'query_id']

X_train = train_data.drop(columns=non_feature_cols).values
y_train = train_data['rank'].values
queries_train = train_data['query_id'].values

X_test = test_data.drop(columns=non_feature_cols).values
y_test = test_data['rank'].values
queries_test = test_data['query_id'].values

Для задачи ранжирования используем CatBoost:
1) приведем таргеты к [0, 1], разделив их на максимальную релевантность в train
2) создадим структуры CatBoost Pool для train и test
3) зададим параметры модели
4) реализуем функцию обучения модели

In [5]:
# Scale both label vectors by the largest relevance found in the training
# labels; the test labels reuse the train-derived factor.
max_relevance = y_train.max()
y_train, y_test = y_train / max_relevance, y_test / max_relevance

# Wrap each split in a CatBoost Pool; group_id carries the query id of every row.
train = Pool(data=X_train, label=y_train, group_id=queries_train)
test = Pool(data=X_test, label=y_test, group_id=queries_test)

# Base CatBoost settings shared by every run; fit_model copies this dict and
# adds the loss-specific entries on top.
params = dict(
    iterations=3000,
    custom_metric=['NDCG:top=5', 'PFound:top=5', 'AverageGain:top=5'],
    verbose=False,
    random_seed=0,
    thread_count=-1,
)

def fit_model(loss_function, additional_params=None, train_pool=train, test_pool=test):
    """Train a CatBoostRanker with the given loss and return the fitted model.

    Args:
        loss_function: Value stored under 'loss_function'; the same value is
            also stored under 'train_dir'.
        additional_params: Optional overrides applied last, on top of the
            shared ``params`` and the two loss-specific entries.
        train_pool: Pool to fit on (defaults to the ``train`` pool that exists
            when this function is defined).
        test_pool: Pool passed to ``fit`` as ``eval_set`` (defaults to the
            ``test`` pool that exists when this function is defined).

    Returns:
        The fitted CatBoostRanker instance.
    """
    # Work on a deep copy so the shared `params` dict is never modified.
    parameters = dict(
        deepcopy(params),
        loss_function=loss_function,
        train_dir=loss_function,
    )
    if additional_params is not None:
        parameters.update(additional_params)

    ranker = CatBoostRanker(**parameters)
    ranker.fit(train_pool, eval_set=test_pool, plot=True)
    return ranker

Обучение модели и полученные метрики

In [6]:
# Train with the QueryRMSE objective. fit_model already starts from a copy of
# the shared `params`, so they are not passed again as additional_params (that
# argument is meant for per-run overrides only). The pools are passed by
# keyword to make explicit which data is used.
# NOTE(review): the test pool is also the eval_set, so the "best" scores shown
# below are picked on the test data and are likely optimistic. Consider a
# separate validation split for model selection — TODO confirm with the author.
model = fit_model('QueryRMSE', train_pool=train, test_pool=test)
model.best_score_['validation']

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

{'PFound:top=5': 0.6975775578613285,
 'NDCG:top=5;type=Base': 0.5790497505705205,
 'QueryRMSE': 0.17765729379166634,
 'AverageGain:top=5': 0.3385416666666665}