In [133]:
import math
import optuna
import pickle
from typing import List, Tuple

import numpy as np
import torch
from catboost.datasets import msrank_10k
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from torch import Tensor


class Solution:
    def __init__(self, n_estimators: int = 231,
                 lr: float = 0.39127022258826616,
                 ndcg_top_k: int = 10,
                 subsample: float = 0.53005362889185,
                 colsample_bytree: float = 0.6304279014613413,
                 max_depth: int = 9,
                 min_samples_leaf: int = 25):
        self._prepare_data()
        self.num_input_features = self.X_train.shape[1]
        self.num_train_objects = self.X_train.shape[0]
        self.unique_train_groups = np.unique(self.query_ids_train)
        self.num_test_objects = self.X_test.shape[0]
        self.num_features_to_choice = int(colsample_bytree *
                                          self.num_input_features)
        self.num_groups_to_choice = int(
            subsample * len(self.unique_train_groups))

        self.ndcg_top_k = ndcg_top_k
        self.n_estimators = n_estimators
        self.lr = lr
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf

        self.trees = []
        self.best_ndcg = -1

    def _get_data(self) -> List[np.ndarray]:
        try:
            train_df, test_df = msrank_10k()
        except:
            train_df, test_df = msrank_10k()

        X_train = train_df.drop([0, 1], axis=1).values
        y_train = train_df[0].values
        query_ids_train = train_df[1].values.astype(int)

        X_test = test_df.drop([0, 1], axis=1).values
        y_test = test_df[0].values
        query_ids_test = test_df[1].values.astype(int)

        return [X_train, y_train, query_ids_train, X_test, y_test,
                query_ids_test]

    def _prepare_data(self) -> None:
        (X_train, y_train, self.query_ids_train,
         X_test, y_test, self.query_ids_test) = self._get_data()
        # допишите ваш код здесь
        X_train = self._scale_features_in_query_groups(
            X_train, self.query_ids_train
        )
        X_test = self._scale_features_in_query_groups(
            X_test, self.query_ids_test
        )

        self.X_train = torch.from_numpy(X_train).type(torch.FloatTensor)
        self.ys_train = torch.from_numpy(y_train).type(
            torch.FloatTensor).reshape(-1, 1)
        self.X_test = torch.from_numpy(X_test).type(torch.FloatTensor)
        self.ys_test = torch.from_numpy(y_test).type(
            torch.FloatTensor).reshape(-1, 1)

    def _scale_features_in_query_groups(self, inp_feat_array: np.ndarray,
                                        inp_query_ids: np.ndarray) -> \
            np.ndarray:
        # допишите ваш код здесь
        for query_id in np.unique(inp_query_ids):
            mask = inp_query_ids == query_id
            scaler = StandardScaler()
            scaled_part = scaler.fit_transform(inp_feat_array[mask])
            inp_feat_array[mask] = scaled_part
        return inp_feat_array

    def _train_one_tree(self, cur_tree_idx: int,
                        train_preds: torch.FloatTensor
                        ) -> Tuple[DecisionTreeRegressor, np.ndarray]:
        # допишите ваш код здесь
        lambdas = np.zeros((self.num_train_objects, 1))
        groups_to_train_on = np.random.choice(self.unique_train_groups,
                                              size=self.num_groups_to_choice,
                                              replace=False)
        for query_id in groups_to_train_on:
            mask = self.query_ids_train == query_id
            group_y = self.ys_train[mask]
            group_preds = train_preds[mask]
            group_lambdas = self._compute_lambdas(group_y, group_preds)
            lambdas[mask] = group_lambdas.numpy()

        dt = DecisionTreeRegressor(max_depth=self.max_depth,
                                   min_samples_leaf=self.min_samples_leaf,
                                   random_state=cur_tree_idx)

        rows = np.isin(self.query_ids_train, groups_to_train_on)
        cols = np.random.choice(np.arange(self.num_input_features),
                                size=self.num_features_to_choice,
                                replace=False)

        sample_X_train = self.X_train[rows][:, cols].numpy()
        lambdas = lambdas[rows]
        dt.fit(sample_X_train, lambdas)
        return dt, cols

    def fit(self):
        np.random.seed(0)
        # допишите ваш код здесь
        best_ndcg_ind = max_ndcg = 0
        prev_preds = torch.zeros(
            self.num_train_objects, 1).type(torch.FloatTensor)
        valid_preds = torch.zeros(
            self.num_test_objects, 1).type(torch.FloatTensor)

        for idx in range(1, self.n_estimators + 1):
            dt, train_cols = self._train_one_tree(idx, prev_preds)
            self.trees.append((dt, train_cols))
            prev_preds -= self.lr * torch.FloatTensor(dt.predict(
                self.X_train[:, train_cols].numpy())).reshape(-1, 1)
            valid_preds -= self.lr * torch.FloatTensor(dt.predict(
                self.X_test[:, train_cols].numpy())).reshape(-1, 1)
            ndcg = self._calc_data_ndcg(self.query_ids_test, self.ys_test,
                                        valid_preds)

            if ndcg > self.best_ndcg:
                best_ndcg_ind = idx
                self.best_ndcg = ndcg
                
        self.trees = self.trees[:best_ndcg_ind]

    def predict(self, data: torch.FloatTensor) -> torch.FloatTensor:
        preds = torch.zeros(data.shape[0], 1).type(torch.FloatTensor)
        for dt, cols in self.trees:
            tmp_preds = dt.predict(data[:, cols].numpy())
            preds -= self.lr * torch.FloatTensor(tmp_preds).reshape(-1, 1)
        return preds

    def _compute_lambdas(self, y_true: torch.FloatTensor,
                         y_pred: torch.FloatTensor) -> Tensor:
        def compute_ideal_dcg(ys_true: torch.Tensor) -> float:
            ys_true, _ = torch.sort(ys_true, dim=0, descending=True)

            sum_dcg = 0
            for i, y_true in enumerate(ys_true, 1):
                sum_dcg += (2 ** y_true - 1) / math.log2(i + 1)
            return sum_dcg

        def compute_labels_in_batch(y_true):
            rel_diff = y_true - y_true.t()
            pos_pairs = (rel_diff > 0).type(torch.float32)
            neg_pairs = (rel_diff < 0).type(torch.float32)
            Sij = pos_pairs - neg_pairs
            return Sij

        _, rank_order = torch.sort(y_true, descending=True, dim=0)
        rank_order += 1

        pos_pairs_score_diff = 1.0 + torch.exp((y_pred - y_pred.t()))

        Sij = compute_labels_in_batch(y_true)

        gain_diff = torch.pow(2.0, y_true) - torch.pow(2.0, y_true.t())
        decay_diff = (1.0 / torch.log2(rank_order + 1.0)) - (
                1.0 / torch.log2(rank_order.t() + 1.0))
        ideal_dcg = compute_ideal_dcg(y_true)
        N = 1 / (ideal_dcg + 1)
        delta_ndcg = torch.abs(N * gain_diff * decay_diff)

        lambda_update = (0.5 * (
                    1 - Sij) - 1 / pos_pairs_score_diff) * delta_ndcg
        lambda_update = torch.sum(lambda_update, dim=1, keepdim=True)
        return lambda_update

    def _dcg(self, ys_true: torch.Tensor, ys_pred: torch.Tensor,
             k: int) -> float:
        ys_pred, indices = torch.sort(ys_pred, dim=0, descending=True)
        ys_true = ys_true[indices[:k]]

        sum_dcg = 0
        for i, y_true in enumerate(ys_true, 1):
            sum_dcg += (2 ** y_true - 1) / math.log2(i + 1)
        return sum_dcg

    def _ndcg_k(self, ys_true: torch.Tensor, ys_pred: torch.Tensor,
                ndcg_top_k: int) -> float:
        ideal_dcg = self._dcg(ys_true, ys_true, ndcg_top_k)
        case_dcg = self._dcg(ys_true, ys_pred, ndcg_top_k)
        return float(case_dcg / ideal_dcg)

    def _calc_data_ndcg(self, queries_list: np.ndarray,
                        true_labels: torch.FloatTensor,
                        preds: torch.FloatTensor) -> float:
        # допишите ваш код здесь
        unique_queries = np.unique(queries_list)
        ndcgs = []
        for query_id in unique_queries:
            group_y = true_labels[queries_list == query_id]
            y_pred = preds[queries_list == query_id]
            group_dcg = self._ndcg_k(group_y, y_pred, self.ndcg_top_k)
            if np.isnan(group_dcg):
                ndcgs.append(0)
                continue
            ndcgs.append(group_dcg)
        return float(np.mean(ndcgs))

    def save_model(self, path: str):
        state = {
            'trees': self.trees,
            'lr': self.lr,
            'best_ndcg': self.best_ndcg
        }
        f = open(path, 'wb')
        pickle.dump(state, f)

    def load_model(self, path: str):
        f = open(path, 'rb')
        state = pickle.load(f)
        self.trees = state['trees']
        self.lr = state['lr']
        self.best_ndcg = state['best_ndcg']


In [134]:
train_df, test_df = msrank_10k()
X_train = train_df.drop([0, 1], axis=1).values
y_train = train_df[0].values
query_ids_train = train_df[1].values.astype(int)

X_test = test_df.drop([0, 1], axis=1).values
y_test = test_df[0].values
query_ids_test = test_df[1].values.astype(int)

In [138]:
X_test = torch.from_numpy(X_test)
y_test = torch.from_numpy(y_test).reshape(-1, 1)

In [135]:
best_params = {'lr': 0.39127022258826616, 'subsample': 0.53005362889185, 'colsample_bytree': 0.6304279014613413, 'n_estimators': 231, 'max_depth': 9, 'min_samples_leaf': 25}

In [136]:
slt = Solution(**best_params)
slt.fit()

In [132]:
slt.save_model('model.pickle')

In [139]:
valid_preds = slt.predict(X_test)
ndcg = slt._calc_data_ndcg(query_ids_test, y_test, valid_preds)
ndcg

0.405036643242732

In [108]:
def objective(trial, X_test=X_test, y_test=y_test, query_ids_test=query_ids_test):
    param = {
        'lr': trial.suggest_float('lr', 1e-2, 7e-1),
        'subsample': trial.suggest_float('subsample', 5e-1, 75e-2),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 5e-1, 75e-2),
        'n_estimators': trial.suggest_categorical('n_estimators', list(range(100, 300))),
        'max_depth': trial.suggest_categorical('max_depth', list(range(3, 10))),
        'min_samples_leaf': trial.suggest_categorical('min_samples_leaf', list(range(5, 30))),
    }
    
    model = Solution(**param)
    model.fit()
    
    valid_preds = model.predict(X_test)
    ndcg = model._calc_data_ndcg(query_ids_test, y_test, valid_preds)
    return ndcg

In [1]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=200)

print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

Number of finished trials: 200

Best trial: {'lr': 0.39127022258826616, 'subsample': 0.53005362889185, 'colsample_bytree': 0.6304279014613413, 'n_estimators': 231, 'max_depth': 9, 'min_samples_leaf': 25}

### STM solution

In [113]:
import math
import pickle
import random
from typing import List, Tuple

import numpy as np
import torch
from catboost.datasets import msrank_10k
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from tqdm.auto import tqdm


class Solution:
    def __init__(self, n_estimators: int = 100, lr: float = 0.5, ndcg_top_k: int = 10,
                 subsample: float = 0.6, colsample_bytree: float = 0.9,
                 max_depth: int = 7, min_samples_leaf: int = 8):
        self._prepare_data()
        self.num_input_features = self.X_train.shape[1]
        self.num_train_objects = self.X_train.shape[0]
        self.num_test_objects = self.X_test.shape[0]

        self.features_to_choice = int(
            self.num_input_features * colsample_bytree)
        self.objects_to_choice = int(self.num_train_objects * subsample)

        self.ndcg_top_k = ndcg_top_k
        self.n_estimators = n_estimators
        self.lr = lr
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf

        self.trees = None
        self.trees_feat_idxs = None
        self.best_ndcg = -1
        self.best_iter_idx = -1

    def _get_data(self) -> List[np.array]:
        try:
            train_df, test_df = msrank_10k()
        except:
            train_df, test_df = msrank_10k()
        X_train = train_df.drop([0, 1], axis=1).values
        y_train = train_df[0].values
        query_ids_train = train_df[1].values.astype(int)

        X_test = test_df.drop([0, 1], axis=1).values
        y_test = test_df[0].values
        query_ids_test = test_df[1].values.astype(int)

        return [X_train, y_train, query_ids_train, X_test, y_test, query_ids_test]

    def _prepare_data(self) -> None:
        (X_train, y_train, self.query_ids_train,
            X_test, y_test, self.query_ids_test) = self._get_data()

        X_train = self._scale_features_in_query_groups(
            X_train, self.query_ids_train)
        X_test = self._scale_features_in_query_groups(
            X_test, self.query_ids_test)

        self.X_train = torch.FloatTensor(X_train)
        self.X_test = torch.FloatTensor(X_test)

        self.ys_train = torch.FloatTensor(y_train).reshape(-1, 1)
        self.ys_test = torch.FloatTensor(y_test).reshape(-1, 1)

    def _scale_features_in_query_groups(self, inp_feat_array: np.array,
                                        inp_query_ids: np.array) -> np.array:
        for cur_id in np.unique(inp_query_ids):
            mask = inp_query_ids == cur_id
            tmp_array = inp_feat_array[mask]
            scaler = StandardScaler()
            inp_feat_array[mask] = scaler.fit_transform(tmp_array)

        return inp_feat_array

    def _train_one_tree(self, cur_tree_idx: int,
                        train_preds: torch.FloatTensor
                        ) -> Tuple[DecisionTreeRegressor, np.array]:
        lambdas = torch.zeros(self.num_train_objects, 1)
        for cur_id in np.unique(self.query_ids_train):
            train_mask = self.query_ids_train == cur_id
            lambda_update = self._compute_lambdas(
                self.ys_train[train_mask], train_preds[train_mask])
            if any(torch.isnan(lambda_update)):
                lambda_update = torch.zeros_like(lambda_update)
            lambdas[train_mask] = lambda_update

        tree = DecisionTreeRegressor(
            max_depth=self.max_depth, min_samples_leaf=self.min_samples_leaf, random_state=cur_tree_idx)

        this_tree_feats = np.random.choice(
            list(range(self.num_input_features)), self.features_to_choice, replace=False)
        this_tree_objs = np.random.choice(
            list(range(self.num_train_objects)), self.objects_to_choice, replace=False)

        tree.fit(
            self.X_train[this_tree_objs.reshape(-1)
                         ][:, this_tree_feats].numpy(),
            -lambdas[this_tree_objs.reshape(-1), :].numpy()
        )

        return tree, this_tree_feats

    def _calc_data_ndcg(self, queries_list: np.array,
                        true_labels: torch.FloatTensor, preds: torch.FloatTensor) -> float:
        ndcgs = []
        for cur_id in np.unique(queries_list):
            mask = queries_list == cur_id
            cur_ndcg = self._ndcg_k(
                true_labels[mask], preds[mask], self.ndcg_top_k)
            if np.isnan(cur_ndcg):
                ndcgs.append(0)
                continue
            ndcgs.append(cur_ndcg)
        return np.mean(ndcgs)

    def fit(self):
        np.random.seed(0)
        self.trees = []
        self.trees_feat_idxs = []
        self.best_ndcg = -1
        self.best_iter_idx = -1

        train_preds = torch.zeros(self.num_train_objects, 1)
        test_preds = torch.zeros(self.num_test_objects, 1)

        train_ndcgs, test_ndcgs = [], []

        #p_bar = tqdm(range(self.n_estimators))
        for cur_tree_idx in range(self.n_estimators):
            tree, this_tree_feats = self._train_one_tree(
                cur_tree_idx, train_preds)
            self.trees.append(tree)
            self.trees_feat_idxs.append(this_tree_feats)

            cur_tree_train_data = self.X_train[:, this_tree_feats].numpy()
            train_preds += self.lr * \
                torch.FloatTensor(tree.predict(
                    cur_tree_train_data)).reshape(-1, 1)
            train_ndcg = self._calc_data_ndcg(
                self.query_ids_train, self.ys_train, train_preds)

            cur_tree_test_data = self.X_test[:, this_tree_feats].numpy()
            test_preds += self.lr * \
                torch.FloatTensor(tree.predict(
                    cur_tree_test_data)).reshape(-1, 1)
            test_ndcg = self._calc_data_ndcg(
                self.query_ids_test, self.ys_test, test_preds)

            if self.best_ndcg < test_ndcg:
                self.best_ndcg = test_ndcg
                self.best_iter_idx = cur_tree_idx

            train_ndcgs.append(train_ndcg)
            test_ndcgs.append(test_ndcg)
#             p_bar.set_description_str(
#                 f'Test nDCG@{self.ndcg_top_k}={round(test_ndcg, 5)}')

        cut_idx = self.best_iter_idx + 1
        self.trees = self.trees[:cut_idx]
        self.trees_feat_idxs = self.trees_feat_idxs[:cut_idx]

    def predict(self, data: torch.FloatTensor) -> torch.FloatTensor:
        preds = torch.zeros(data.shape[0], 1)
        for cur_tree_idx in range(len(self.trees)):
            tree = self.trees[cur_tree_idx]
            feat_idx = self.trees_feat_idxs[cur_tree_idx]
            tmp_preds = tree.predict(data[:, feat_idx].numpy())
            preds += self.lr * torch.FloatTensor(tmp_preds).reshape(-1, 1)

        return preds

    def _compute_ideal_dcg(self, ys_true: torch.FloatTensor) -> float:
        def dcg(ys_true, ys_pred):
            _, argsort = torch.sort(ys_pred, descending=True, dim=0)
            ys_true_sorted = ys_true[argsort]
            ret = 0
            for i, l in enumerate(ys_true_sorted, 1):
                ret += (2 ** l - 1) / np.log2(1 + i)
            return ret
        ideal_dcg = dcg(ys_true, ys_true)
        return ideal_dcg

    def _compute_lambdas(self, y_true, y_pred):
        # рассчитаем нормировку, IdealDCG
        ideal_dcg = self._compute_ideal_dcg(y_true)
        N = 1 / ideal_dcg

        # рассчитаем порядок документов согласно оценкам релевантности
        _, rank_order = torch.sort(y_true, descending=True, axis=0)
        rank_order += 1

        with torch.no_grad():
            # получаем все попарные разницы скоров в батче
            pos_pairs_score_diff = 1.0 + torch.exp((y_pred - y_pred.t()))

            # поставим разметку для пар, 1 если первый документ релевантнее
            # -1 если второй документ релевантнее
            Sij = self._compute_labels_in_batch(y_true)
            # посчитаем изменение gain из-за перестановок
            gain_diff = self._compute_gain_diff(y_true)

            # посчитаем изменение знаменателей-дискаунтеров
            decay_diff = (1.0 / torch.log2(rank_order + 1.0)) - \
                (1.0 / torch.log2(rank_order.t() + 1.0))
            # посчитаем непосредственное изменение nDCG
            delta_ndcg = torch.abs(N * gain_diff * decay_diff)
            # посчитаем лямбды
            lambda_update = (0.5 * (1 - Sij) - 1 /
                             pos_pairs_score_diff) * delta_ndcg
            lambda_update = torch.sum(lambda_update, dim=1, keepdim=True)

            return lambda_update

    def _compute_labels_in_batch(self, y_true):
        rel_diff = y_true - y_true.t()
        pos_pairs = (rel_diff > 0).type(torch.float32)
        neg_pairs = (rel_diff < 0).type(torch.float32)
        Sij = pos_pairs - neg_pairs
        return Sij

    def _compute_gain_diff(self, y_true):
        gain_diff = torch.pow(2.0, y_true) - torch.pow(2.0, y_true.t())
        return gain_diff

    def _ndcg_k(self, ys_true, ys_pred, ndcg_top_k) -> float:
        def dcg(ys_true, ys_pred):
            _, argsort = torch.sort(ys_pred, descending=True, dim=0)
            argsort = argsort[:ndcg_top_k]
            ys_true_sorted = ys_true[argsort]
            ret = 0
            for i, l in enumerate(ys_true_sorted, 1):
                ret += (2 ** l - 1) / math.log2(1 + i)
            return ret
        ideal_dcg = dcg(ys_true, ys_true)
        pred_dcg = dcg(ys_true, ys_pred)
        return (pred_dcg / ideal_dcg).item()

    def save_model(self, path: str):
        state = {
            'trees': self.trees,
            'trees_feat_idxs': self.trees_feat_idxs,
            'best_ndcg': self.best_ndcg,
            'lr': self.lr
        }
        f = open(path, 'wb')
        pickle.dump(state, f)

    def load_model(self, path: str):
        f = open(path, 'rb')
        state = pickle.load(f)
        self.trees = state['trees']
        self.trees_feat_idxs = state['trees_feat_idxs']
        self.best_ndcg = state['best_ndcg']
        self.lr = state['lr']

In [114]:
def objective(trial, X_test=X_test, y_test=y_test, query_ids_test=query_ids_test):
    param = {
        'lr': trial.suggest_float('lr', 1e-2, 7e-1),
        'subsample': trial.suggest_float('subsample', 5e-1, 75e-2),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 5e-1, 75e-2),
        'n_estimators': trial.suggest_categorical('n_estimators', list(range(100, 300))),
        'max_depth': trial.suggest_categorical('max_depth', list(range(3, 10))),
        'min_samples_leaf': trial.suggest_categorical('min_samples_leaf', list(range(5, 30))),
    }
    
    model = Solution(**param)
    model.fit()
    
    valid_preds = model.predict(X_test)
    ndcg = model._calc_data_ndcg(query_ids_test, y_test, valid_preds)
    return ndcg

In [2]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=200)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

Number of finished trials: 200

Best trial: {'lr': 0.5099655429753616, 'subsample': 0.6540233528557746, 'colsample_bytree': 0.6343128384876874, 'n_estimators': 260, 'max_depth': 3, 'min_samples_leaf': 12}

In [116]:
best_params = study.best_trial.params

In [117]:
slt = Solution(**best_params)
slt.fit()

In [118]:
valid_preds = slt.predict(X_test)
ndcg = slt._calc_data_ndcg(query_ids_test, y_test, valid_preds)
ndcg

0.40230476581879465

In [119]:
slt.save_model('stm.pickle')