In [1]:
import math

import numpy as np
import torch
from catboost.datasets import msrank_10k
from sklearn.preprocessing import StandardScaler
from torch import nn

from typing import List


class ListNet(torch.nn.Module):
    """Small feed-forward scorer that maps each document to one logit.

    The network is a single hidden layer (linear -> batch norm -> ReLU ->
    dropout) followed by a linear head with one output unit.

    Args:
        num_input_features: Width of the feature vector of one document.
        hidden_dim: Number of units in the hidden layer.
    """

    def __init__(self, num_input_features: int, hidden_dim: int):
        super().__init__()
        self.hidden_dim = hidden_dim

        # Layers are instantiated in forward order so that seeded parameter
        # initialisation consumes the RNG in a fixed sequence.
        hidden_block = (
            nn.Linear(num_input_features, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            nn.Dropout(p=0.3),
        )
        scoring_head = nn.Linear(hidden_dim, 1)
        self.model = nn.Sequential(*hidden_block, scoring_head)

    def forward(self, input_1: torch.Tensor) -> torch.Tensor:
        """Return the raw scores of shape ``(rows, 1)`` for ``input_1``."""
        return self.model(input_1)


class Solution:
    """Train and evaluate a ListNet ranker on the ``msrank_10k`` dataset.

    On construction the data is loaded, features are standardized inside
    every query group, and the model plus an Adam optimizer are created.
    ``fit`` then trains query by query and reports test NDCG per epoch.

    Args:
        n_epochs: Number of passes over the training queries.
        listnet_hidden_dim: Hidden layer width passed to ``ListNet``.
        lr: Learning rate of the Adam optimizer.
        ndcg_top_k: Cut-off ``k`` used for NDCG@k on the test set.
    """

    def __init__(self, n_epochs: int = 5, listnet_hidden_dim: int = 30,
                 lr: float = 0.001, ndcg_top_k: int = 10):
        self._prepare_data()
        self.num_input_features = self.X_train.shape[1]
        self.ndcg_top_k = ndcg_top_k
        self.n_epochs = n_epochs

        self.model = self._create_model(
            self.num_input_features, listnet_hidden_dim
        )
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)

    def _get_data(self) -> List[np.ndarray]:
        """Load the dataset and split it into features, labels and query ids.

        Column 0 of each frame is used as the relevance label and column 1
        as the query id; all remaining columns are features.

        Returns:
            ``[X_train, y_train, query_ids_train, X_test, y_test,
            query_ids_test]``.
        """
        train_df, test_df = msrank_10k()

        X_train = train_df.drop([0, 1], axis=1).values
        y_train = train_df[0].values
        query_ids_train = train_df[1].values.astype(int)

        X_test = test_df.drop([0, 1], axis=1).values
        y_test = test_df[0].values
        query_ids_test = test_df[1].values.astype(int)

        return [X_train, y_train, query_ids_train, X_test, y_test,
                query_ids_test]

    def _prepare_data(self) -> None:
        """Load data, scale features per query and store float tensors.

        Sets ``X_train``, ``ys_train``, ``X_test``, ``ys_test`` (float
        tensors) and ``query_ids_train`` / ``query_ids_test`` (NumPy arrays).
        """
        (X_train, y_train, self.query_ids_train,
         X_test, y_test, self.query_ids_test) = self._get_data()

        X_train = self._scale_features_in_query_groups(
            X_train, self.query_ids_train
        )
        X_test = self._scale_features_in_query_groups(
            X_test, self.query_ids_test
        )

        self.X_train = torch.from_numpy(X_train).type(torch.FloatTensor)
        self.ys_train = torch.from_numpy(y_train).type(torch.FloatTensor)
        self.X_test = torch.from_numpy(X_test).type(torch.FloatTensor)
        self.ys_test = torch.from_numpy(y_test).type(torch.FloatTensor)

    def _scale_features_in_query_groups(self, inp_feat_array: np.ndarray,
                                        inp_query_ids: np.ndarray) -> \
            np.ndarray:
        """Standardize features independently inside each query group.

        Args:
            inp_feat_array: Feature matrix with one row per document.
            inp_query_ids: Query id of every row of ``inp_feat_array``.

        Returns:
            A new float64 array of the same shape; the input array is left
            untouched.
        """
        # Work on a float copy: the caller's array is not mutated and an
        # integer input cannot truncate the scaled values on assignment.
        scaled = np.array(inp_feat_array, dtype=np.float64)
        for query_id in np.unique(inp_query_ids):
            mask = inp_query_ids == query_id
            scaled[mask] = StandardScaler().fit_transform(scaled[mask])
        return scaled

    def _create_model(self, listnet_num_input_features: int,
                      listnet_hidden_dim: int) -> torch.nn.Module:
        """Build the ListNet model with a fixed torch seed."""
        # Seeding right before construction makes the initial weights
        # repeatable.
        torch.manual_seed(0)
        net = ListNet(
            listnet_num_input_features,
            listnet_hidden_dim
        )
        return net

    def fit(self) -> List[float]:
        """Train for ``n_epochs`` epochs.

        Returns:
            The mean test-set NDCG@k measured after every epoch.
        """
        val_ndcgs = []
        for _ in range(self.n_epochs):
            self._train_one_epoch()
            val_ndcgs.append(self._eval_test_set())
        return val_ndcgs

    def _calc_loss(self, batch_ys: torch.FloatTensor,
                   batch_pred: torch.FloatTensor) -> torch.FloatTensor:
        """ListNet top-one loss: KL(softmax(labels) || softmax(scores)).

        Args:
            batch_ys: Relevance labels of the documents of one query.
            batch_pred: Predicted scores for the same documents.

        Returns:
            Scalar tensor with the divergence.
        """
        # Staying in log space avoids log(0) = -inf (and NaN gradients) when
        # a predicted probability underflows to zero.
        log_p_true = torch.log_softmax(batch_ys, dim=0)
        log_p_pred = torch.log_softmax(batch_pred, dim=0)
        return torch.sum(torch.exp(log_p_true) * (log_p_true - log_p_pred))

    def _train_one_epoch(self) -> None:
        """Run one optimizer step per training query, in shuffled order.

        The shuffle uses NumPy's global RNG, so seed it externally if a
        repeatable query order is needed.
        """
        self.model.train()
        unique_queries = np.unique(self.query_ids_train)
        np.random.shuffle(unique_queries)

        for query_id in unique_queries:
            mask = self.query_ids_train == query_id
            group_X = self.X_train[mask]
            group_y = self.ys_train[mask]

            # A one-document list carries no ranking signal (the loss is
            # identically zero) and batch norm cannot compute batch
            # statistics from a single row in training mode.
            if group_X.shape[0] < 2:
                continue

            self.optimizer.zero_grad()
            preds = self.model(group_X).reshape(-1)
            loss = self._calc_loss(group_y.reshape(-1), preds)
            loss.backward()
            self.optimizer.step()

    def _eval_test_set(self) -> float:
        """Return the mean NDCG@``ndcg_top_k`` over all test queries.

        Queries whose NDCG is undefined contribute 0 to the mean.
        """
        self.model.eval()
        ndcgs = []
        with torch.no_grad():
            for query_id in np.unique(self.query_ids_test):
                mask = self.query_ids_test == query_id
                y_pred = self.model(self.X_test[mask]).reshape(-1)
                group_ndcg = float(
                    self._ndcg_k(self.ys_test[mask], y_pred, self.ndcg_top_k)
                )
                ndcgs.append(0.0 if math.isnan(group_ndcg) else group_ndcg)
        return float(np.mean(ndcgs)) if ndcgs else 0.0

    def _dcg(self, ys_true: torch.Tensor, ys_pred: torch.Tensor,
             k: int) -> torch.Tensor:
        """Discounted cumulative gain of the top-``k`` predicted documents.

        Uses the exponential gain ``2**rel - 1`` and the ``log2(rank + 1)``
        discount.

        Args:
            ys_true: Relevance labels.
            ys_pred: Scores defining the ranking; any shape with the same
                number of elements as ``ys_true`` (e.g. ``(n,)`` or
                ``(n, 1)``).
            k: Number of top-ranked documents to include.

        Returns:
            Zero-dimensional tensor with the DCG value.
        """
        # Flatten so that (n, 1) model outputs and (n,) vectors behave alike.
        ys_true = ys_true.reshape(-1)
        ys_pred = ys_pred.reshape(-1)

        order = torch.argsort(ys_pred, dim=0, descending=True)
        top_true = ys_true[order[:k]]
        if not top_true.is_floating_point():
            top_true = top_true.to(torch.get_default_dtype())

        gains = 2 ** top_true - 1
        # Ranks are 1-based, hence log2(2), log2(3), ...
        discounts = torch.log2(
            torch.arange(2, top_true.numel() + 2, dtype=torch.float64,
                         device=top_true.device)
        ).to(top_true.dtype)
        return torch.sum(gains / discounts)

    def _ndcg_k(self, ys_true: torch.Tensor, ys_pred: torch.Tensor,
                ndcg_top_k: int) -> torch.Tensor:
        """Normalized DCG@k of ``ys_pred`` against ``ys_true``.

        Returns:
            Zero-dimensional tensor; 0 when the ideal DCG is 0 (for example
            when every label is zero), instead of NaN from 0 / 0.
        """
        ideal_dcg = self._dcg(ys_true, ys_true, ndcg_top_k)
        if float(ideal_dcg) == 0.0:
            return torch.zeros_like(ideal_dcg)
        case_dcg = self._dcg(ys_true, ys_pred, ndcg_top_k)
        return case_dcg / ideal_dcg

In [2]:
# Load both splits, then peel off features, labels (column 0) and query ids
# (column 1) for train and test in lock-step.
train_df, test_df = msrank_10k()

X_train, X_test = (
    frame.drop([0, 1], axis=1).values for frame in (train_df, test_df)
)
y_train, y_test = (frame[0].values for frame in (train_df, test_df))
query_ids_train, query_ids_test = (
    frame[1].values.astype(int) for frame in (train_df, test_df)
)

In [3]:
# Inspect the per-row query ids taken from column 1 of the training frame.
query_ids_train

array([   1,    1,    1, ..., 1291, 1291, 1291])

In [4]:
# List the distinct query ids of the training split (np.unique returns them sorted).
np.unique(query_ids_train)

array([   1,   16,   31,   46,   61,   76,   91,  106,  121,  136,  151,
        166,  181,  196,  211,  226,  241,  256,  271,  286,  301,  316,
        331,  346,  361,  376,  391,  406,  421,  436,  451,  466,  481,
        496,  511,  526,  541,  556,  571,  586,  601,  616,  631,  646,
        661,  676,  691,  706,  721,  736,  751,  766,  781,  796,  811,
        826,  841,  856,  871,  886,  901,  916,  931,  946,  961,  976,
        991, 1006, 1021, 1036, 1051, 1066, 1081, 1096, 1111, 1126, 1141,
       1156, 1171, 1186, 1201, 1216, 1231, 1246, 1261, 1276, 1291])

In [5]:
# Check which container type holds the query ids.
type(query_ids_train)

numpy.ndarray

In [6]:
# Count the distinct training queries, i.e. the number of ranking groups.
len(np.unique(query_ids_train))

87

In [7]:
# Select the feature rows that belong to query id 1 via a boolean mask.
X_train[query_ids_train == 1]

array([[ 3. ,  3. ,  0. , ...,  0. ,  0. ,  0. ],
       [ 3. ,  0. ,  3. , ...,  0. ,  0. ,  0. ],
       [ 3. ,  0. ,  2. , ...,  0. ,  0. ,  0. ],
       ...,
       [ 3. ,  3. ,  3. , ...,  0. ,  0. ,  0. ],
       [ 2. ,  0. ,  2. , ...,  0. , 20. , 42.6],
       [ 3. ,  0. ,  3. , ...,  0. ,  0. ,  0. ]])

In [8]:
# Shape of the raw training feature matrix.
X_train.shape

(10000, 136)

In [9]:
# A single scaler object shared by the cells below; every fit_transform call
# on it re-estimates the statistics from the rows it is given.
scaler = StandardScaler()

In [10]:
# Standardize the features of every query group independently.
# Each scaled block is written back through its own row mask, so
# scaled_X_train keeps the row order of X_train even if the rows of one query
# are not stored contiguously or the queries are not sorted by id
# (concatenating the blocks would silently reorder rows in that case).
scaled_arrays = []
scaled_X_train = np.empty(X_train.shape, dtype=np.float64)
for query_id in np.unique(query_ids_train):
    group_mask = query_ids_train == query_id
    scaled_part = scaler.fit_transform(X_train[group_mask])
    scaled_arrays.append(scaled_part)  # kept for per-group inspection
    scaled_X_train[group_mask] = scaled_part

In [11]:
# Number of scaled blocks collected; one block is appended per unique query id.
len(scaled_arrays)

87

In [12]:
# Shape of the scaled matrix, for comparison with X_train.shape.
scaled_X_train.shape

(10000, 136)

In [13]:
# Standardize only the rows of query id 1 to eyeball the per-group result.
scaler.fit_transform(X_train[query_ids_train == 1])

array([[ 0.31606376,  4.81705177, -2.17593133, ..., -0.11175774,
        -0.19593518, -0.26622504],
       [ 0.31606376, -0.23497813,  0.61705515, ..., -0.11175774,
        -0.19593518, -0.26622504],
       [ 0.31606376, -0.23497813, -0.31394034, ..., -0.11175774,
        -0.19593518, -0.26622504],
       ...,
       [ 0.31606376,  4.81705177,  0.61705515, ..., -0.11175774,
        -0.19593518, -0.26622504],
       [-1.28284703, -0.23497813, -0.31394034, ..., -0.11175774,
        -0.07383065,  1.91992331],
       [ 0.31606376, -0.23497813,  0.61705515, ..., -0.11175774,
        -0.19593518, -0.26622504]])

In [14]:
# Number of feature columns (size of the second axis of the scaled matrix).
num_feats = scaled_X_train.shape[1]

In [15]:
# Build the full pipeline (data loading happens inside the constructor) and
# train for 10 epochs; fit() returns one test-set NDCG value per epoch.
slt = Solution(n_epochs=10)
ndcg = slt.fit()

In [16]:
# Per-epoch test NDCG values returned by fit().
ndcg

[0.4154603543098677,
 0.4288841356438669,
 0.42195730832066725,
 0.42791208531707525,
 0.43076728957451205,
 0.4199077663308179,
 0.421083140313964,
 0.4342134243978018,
 0.4209770988672972,
 0.4373986226591197]