In [135]:
import math

import numpy as np
import torch
from catboost.datasets import msrank_10k
from sklearn.preprocessing import StandardScaler
from torch import nn

from typing import List


class ListNet(torch.nn.Module):
    def __init__(self, num_input_features: int, hidden_dim: int):
        super().__init__()
        self.hidden_dim = hidden_dim
        # укажите архитектуру простой модели здесь
        self.model = nn.Sequential(
            nn.Linear(num_input_features, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            nn.Dropout(p=0.3),
            nn.Linear(hidden_dim, 1),
        )

    def forward(self, input_1: torch.Tensor) -> torch.Tensor:
        logits = self.model(input_1)
        return logits


class Solution:
    def __init__(self, n_epochs: int = 20, listnet_hidden_dim: int = 30,
                 lr: float = 0.001, ndcg_top_k: int = 10):
        self._prepare_data()
        self.num_input_features = self.X_train.shape[1]
        self.ndcg_top_k = ndcg_top_k
        self.n_epochs = n_epochs

        self.model = self._create_model(
            self.num_input_features, listnet_hidden_dim
        )
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)

    def _get_data(self) -> List[np.ndarray]:
        train_df, test_df = msrank_10k()

        X_train = train_df.drop([0, 1], axis=1).values
        y_train = train_df[0].values
        query_ids_train = train_df[1].values.astype(int)

        X_test = test_df.drop([0, 1], axis=1).values
        y_test = test_df[0].values
        query_ids_test = test_df[1].values.astype(int)

        return [X_train, y_train, query_ids_train, X_test, y_test, query_ids_test]

    def _prepare_data(self) -> None:
        (X_train, y_train, self.query_ids_train,
            X_test, y_test, self.query_ids_test) = self._get_data()
        # допишите ваш код здесь
        X_train = self._scale_features_in_query_groups(
            X_train, self.query_ids_train
        )
        X_test = self._scale_features_in_query_groups(
            X_test, self.query_ids_test
        )

        self.X_train = torch.from_numpy(X_train).type(torch.FloatTensor)
        self.ys_train = torch.from_numpy(y_train).type(torch.FloatTensor)
        self.X_test = torch.from_numpy(X_test).type(torch.FloatTensor)
        self.ys_test = torch.from_numpy(y_test).type(torch.FloatTensor)

    def _scale_features_in_query_groups(self, inp_feat_array: np.ndarray,
                                        inp_query_ids: np.ndarray) -> np.ndarray:
        # допишите ваш код здесь
        scaler = StandardScaler()
        scaled_arrays = []
        for query_id in np.unique(inp_query_ids):
            scaled_part = scaler.fit_transform(
                inp_feat_array[inp_query_ids == query_id]
            )
            scaled_arrays.append(scaled_part)
        scaled_X = np.concatenate(scaled_arrays, axis=0)
        return scaled_X

    def _create_model(self, listnet_num_input_features: int,
                      listnet_hidden_dim: int) -> torch.nn.Module:
        torch.manual_seed(0)
        # допишите ваш код здесь
        net = ListNet(
            listnet_num_input_features,
            listnet_hidden_dim
        )
        return net

    def fit(self) -> List[float]:
        # допишите ваш код здесь
        val_ndcgs = []
        for _ in range(self.n_epochs):
            self._train_one_epoch()
            val_ndcgs.append(self._eval_test_set())
        return val_ndcgs

    def _calc_loss(self, batch_ys: torch.FloatTensor,
                   batch_pred: torch.FloatTensor) -> torch.FloatTensor:
        # допишите ваш код здесь
        P_y_i = torch.softmax(batch_ys, dim=0)
        P_z_i = torch.softmax(batch_pred, dim=0)
        return -torch.sum(P_y_i * torch.log(P_z_i))

    def _train_one_epoch(self) -> None:
        self.model.train()
        # допишите ваш код здесь
        unique_queries = np.unique(self.query_ids_train)
        np.random.shuffle(unique_queries)
        batch_size = 16
        
        losses = []
        for query_id in unique_queries:
            group_X = self.X_train[self.query_ids_train == query_id]
            group_y = self.ys_train[self.query_ids_train == query_id]
        
            idx = torch.randperm(len(group_X))
            group_X = group_X[idx]
            group_y = group_y[idx]
            
            cur_batch = 0
            group_losses = []
            for it in range(len(group_X) // batch_size):
                batch_X = group_X[cur_batch: cur_batch + batch_size]
                batch_y = group_y[cur_batch: cur_batch + batch_size]
                cur_batch += batch_size

                self.optimizer.zero_grad()
                if len(batch_X) > 0:
                    preds = self.model(batch_X)
                    loss = self._calc_loss(batch_y, preds)
                    loss.backward()
                    self.optimizer.step()
                    group_losses.append(loss.item())     
            losses.append(np.mean(group_losses))
        print(np.mean(losses))

    def _eval_test_set(self) -> float:
        with torch.no_grad():
            self.model.eval()
            unique_queries = np.unique(self.query_ids_test)
            ndcgs = []
            # допишите ваш код здесь
            for query_id in unique_queries:
                batch_X = self.X_test[self.query_ids_test == query_id]
                batch_y = self.ys_test[self.query_ids_test == query_id]
                y_pred = self.model(batch_X)
                group_dcg = self._ndcg_k(batch_y, y_pred,
                                         self.ndcg_top_k).item()
                ndcgs.append(group_dcg)
            return np.mean(ndcgs)

    def _dcg(self, ys_true: torch.Tensor, ys_pred: torch.Tensor,
             k: int) -> float:
        ys_pred, indices = torch.sort(ys_pred, dim=0, descending=True)
        ys_true = ys_true[indices]
        sum_dcg = i = 0
        k = min(len(ys_true), k)
        while i < k:
            sum_dcg += (2**ys_true[i] - 1) / math.log2(i + 2)
            i += 1
        return sum_dcg

    def _ndcg_k(self, ys_true: torch.Tensor, ys_pred: torch.Tensor,
                ndcg_top_k: int) -> float:
        case_dcg = self._dcg(ys_true, ys_pred, ndcg_top_k)
        ideal_dcg = self._dcg(ys_true, ys_true, ndcg_top_k)
        return case_dcg / ideal_dcg


In [8]:
train_df, test_df = msrank_10k()
X_train = train_df.drop([0, 1], axis=1).values
y_train = train_df[0].values
query_ids_train = train_df[1].values.astype(int)

X_test = test_df.drop([0, 1], axis=1).values
y_test = test_df[0].values
query_ids_test = test_df[1].values.astype(int)

In [13]:
query_ids_train

array([   1,    1,    1, ..., 1291, 1291, 1291])

In [14]:
np.unique(query_ids_train)

array([   1,   16,   31,   46,   61,   76,   91,  106,  121,  136,  151,
        166,  181,  196,  211,  226,  241,  256,  271,  286,  301,  316,
        331,  346,  361,  376,  391,  406,  421,  436,  451,  466,  481,
        496,  511,  526,  541,  556,  571,  586,  601,  616,  631,  646,
        661,  676,  691,  706,  721,  736,  751,  766,  781,  796,  811,
        826,  841,  856,  871,  886,  901,  916,  931,  946,  961,  976,
        991, 1006, 1021, 1036, 1051, 1066, 1081, 1096, 1111, 1126, 1141,
       1156, 1171, 1186, 1201, 1216, 1231, 1246, 1261, 1276, 1291])

In [15]:
type(query_ids_train)

numpy.ndarray

In [16]:
len(np.unique(query_ids_train))

87

In [17]:
np.unique(query_ids_test)

array([  13,   28,   43,   58,   73,   88,  103,  118,  133,  148,  163,
        178,  193,  208,  223,  238,  253,  268,  283,  298,  313,  328,
        343,  358,  373,  388,  403,  418,  433,  448,  463,  478,  493,
        508,  523,  538,  553,  568,  583,  598,  613,  628,  643,  658,
        673,  688,  703,  718,  733,  748,  763,  778,  793,  808,  823,
        838,  853,  868,  883,  898,  913,  928,  943,  958,  973,  988,
       1003, 1018, 1033, 1048, 1063, 1078, 1093, 1108, 1123, 1138, 1153,
       1168, 1183, 1198, 1213, 1228, 1243, 1258, 1273, 1288, 1303, 1318])

In [18]:
X_train[query_ids_train == 1]

array([[ 3. ,  3. ,  0. , ...,  0. ,  0. ,  0. ],
       [ 3. ,  0. ,  3. , ...,  0. ,  0. ,  0. ],
       [ 3. ,  0. ,  2. , ...,  0. ,  0. ,  0. ],
       ...,
       [ 3. ,  3. ,  3. , ...,  0. ,  0. ,  0. ],
       [ 2. ,  0. ,  2. , ...,  0. , 20. , 42.6],
       [ 3. ,  0. ,  3. , ...,  0. ,  0. ,  0. ]])

In [19]:
X_train.shape

(10000, 136)

In [20]:
scaler = StandardScaler()

In [21]:
scaled_arrays = []
for query_id in np.unique(query_ids_train):
    scaled_part = scaler.fit_transform(X_train[query_ids_train == query_id])
    scaled_arrays.append(scaled_part)
scaled_X_train = np.concatenate(scaled_arrays, axis=0)

In [25]:
len(scaled_arrays)

87

In [48]:
scaled_X_train.shape

(10000, 136)

In [26]:
scaler.fit_transform(X_train[query_ids_train == 1])

array([[ 0.31606376,  4.81705177, -2.17593133, ..., -0.11175774,
        -0.19593518, -0.26622504],
       [ 0.31606376, -0.23497813,  0.61705515, ..., -0.11175774,
        -0.19593518, -0.26622504],
       [ 0.31606376, -0.23497813, -0.31394034, ..., -0.11175774,
        -0.19593518, -0.26622504],
       ...,
       [ 0.31606376,  4.81705177,  0.61705515, ..., -0.11175774,
        -0.19593518, -0.26622504],
       [-1.28284703, -0.23497813, -0.31394034, ..., -0.11175774,
        -0.07383065,  1.91992331],
       [ 0.31606376, -0.23497813,  0.61705515, ..., -0.11175774,
        -0.19593518, -0.26622504]])

In [26]:
num_feats = scaled_X_train.shape[1]

In [27]:
model = ListNet(num_feats, 30)

In [28]:
model.model[0]

Linear(in_features=136, out_features=30, bias=True)

In [136]:
slt = Solution()
ndcg = slt.fit()

44.51519703156818
44.362592853806
44.361422310050465
44.36141970095368
44.36142043643974
44.36142064093061
44.36142046404123
44.361420283872846
44.36142021936348
44.36142023813642
44.36142013571689
44.36141967963632
44.36141946228502
44.36141967221082
44.361548836064834
44.36142570305946
44.361419449574015
44.36151713591236
44.36143686646612
44.36141970358302


In [95]:
slt = Solution()
ndcg = slt.fit()

59.41490664804119
55.98594588229711
55.15114100902588
54.8052499867205
54.66720719340032
54.602868208426166
54.5532938654952
54.53017386360008
54.512070408476326
54.50202122953688


In [101]:
slt = Solution()
ndcg = slt.fit()

56.8426117002964
55.25518344359836
54.918763940533005
54.74302530699763
54.65126907673461
54.59346435261869
54.54624840077655
54.525771462005274
54.51158265804808
54.49835771581308


In [131]:
ndcg

[0.24954275394239547,
 0.15069964409551836,
 0.08458630363880233,
 0.18039440060965717,
 0.1065493718593974,
 0.2226325341193429,
 0.2316562063166533,
 0.20712113733911378,
 0.22576327942608093,
 0.24929165612609888]

In [137]:
ndcg

[0.2158617188159207,
 0.2620331101564013,
 0.20208310877734964,
 0.1973035002698783,
 0.10771733356258748,
 0.22949789759745312,
 0.21834875833751125,
 0.13115210725333204,
 0.19694742060859094,
 0.17601861133748156,
 0.24427126616832207,
 0.21280985363674434,
 0.17869614843617787,
 0.25245563264682214,
 0.19216064166870306,
 0.23919985989447345,
 0.12716322232418778,
 0.299408981280232,
 0.16034921606875618,
 0.1770966466600922]