In [96]:
import math

import numpy as np
import torch
from catboost.datasets import msrank_10k
from sklearn.preprocessing import StandardScaler
from torch import nn

from typing import List


class ListNet(torch.nn.Module):
    def __init__(self, num_input_features: int, hidden_dim: int):
        super().__init__()
        self.hidden_dim = hidden_dim
        # укажите архитектуру простой модели здесь
        self.model = nn.Sequential(
                        nn.Linear(num_input_features, 2*hidden_dim),
                        nn.BatchNorm1d(2*hidden_dim),
                        nn.ReLU(),
                        nn.Linear(2*hidden_dim, hidden_dim),
                        nn.BatchNorm1d(hidden_dim),
                        nn.ReLU(),
                        nn.Linear(hidden_dim, 1),
        )
        self.out_activation = nn.Sigmoid()

    def forward(self, input_1: torch.Tensor) -> torch.Tensor:
        logits = self.model(input_1)
        return logits


class Solution:
    def __init__(self, n_epochs: int = 5, listnet_hidden_dim: int = 30,
                 lr: float = 0.001, ndcg_top_k: int = 10):
        self._prepare_data()
        self.num_input_features = self.scaled_X_train.shape[1]
        self.ndcg_top_k = ndcg_top_k
        self.n_epochs = n_epochs

        self.model = self._create_model(
            self.num_input_features, listnet_hidden_dim
        )
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)

    def _get_data(self) -> List[np.ndarray]:
        train_df, test_df = msrank_10k()

        X_train = train_df.drop([0, 1], axis=1).values
        y_train = train_df[0].values
        query_ids_train = train_df[1].values.astype(int)

        X_test = test_df.drop([0, 1], axis=1).values
        y_test = test_df[0].values
        query_ids_test = test_df[1].values.astype(int)

        return [X_train, y_train, query_ids_train, X_test, y_test, query_ids_test]

    def _prepare_data(self) -> None:
        (X_train, y_train, self.query_ids_train,
            X_test, y_test, self.query_ids_test) = self._get_data()
        
        scaled_X_train = self._scale_features_in_query_groups(
                                X_train, self.query_ids_train
                        )
        scaled_X_test = self._scale_features_in_query_groups(
                                X_test, self.query_ids_test
                        )
        
        self.scaled_X_train = torch.from_numpy(scaled_X_train)
        self.y_train = torch.from_numpy(y_train)
        self.scaled_X_test = torch.from_numpy(scaled_X_test)
        self.y_test = torch.from_numpy(y_test)

    def _scale_features_in_query_groups(self, inp_feat_array: np.ndarray,
                                        inp_query_ids: np.ndarray) -> np.ndarray:
        scaler = StandardScaler()
        scaled_arrays = []
        for query_id in np.unique(inp_query_ids):
            scaled_part = scaler.fit_transform(inp_feat_array[inp_query_ids == query_id])
            scaled_arrays.append(scaled_part)
        scaled_X = np.concatenate(scaled_arrays, axis=0)
        return scaled_X

    def _create_model(self, listnet_num_input_features: int,
                      listnet_hidden_dim: int) -> torch.nn.Module:
        torch.manual_seed(0)
        net = ListNet(
            listnet_num_input_features,
            listnet_hidden_dim
        )
        return net

    def fit(self) -> List[float]:
        val_ndcgs = []
        for _ in range(self.n_epochs):
            self._train_one_epoch()
            val_ndcgs.append(self._eval_test_set())
        return val_ndcgs

    def _calc_loss(self, batch_ys: torch.FloatTensor,
                   batch_pred: torch.FloatTensor) -> torch.FloatTensor:
        P_y_i = torch.softmax(batch_ys, dim=0)
        P_z_i = torch.softmax(batch_pred, dim=0)
        return -torch.sum(P_y_i * torch.log(P_z_i))

    def _train_one_epoch(self) -> None:
        self.model.train()
        # допишите ваш код здесь
        np.random.shuffle(self.query_ids_train)
        
        for query_id in self.query_ids_train:
            self.optimizer.zero_grad()
            batch_X = self.scaled_X_train[self.query_ids_train == query_id].to(torch.float32)
            batch_y = self.y_train[self.query_ids_train == query_id]
            preds = self.model(batch_X)
            loss = self._calc_loss(batch_y, preds)
            loss.backward()
            self.optimizer.step()
        

    def _eval_test_set(self) -> float:
        with torch.no_grad():
            self.model.eval()
            ndcgs = []
            for query_id in self.query_ids_test:
                batch_X = self.scaled_X_test[self.query_ids_test == query_id].to(torch.float32)
                batch_y = self.y_test[self.query_ids_test == query_id]
                y_pred = self.model(batch_X)
                group_dcg = self._ndcg_k(batch_y, y_pred, self.ndcg_top_k).item()
                ndcgs.append(group_dcg)
            return np.mean(ndcgs)

    def _dcg(self, ys_true: torch.Tensor, ys_pred: torch.Tensor,
             k: int) -> float:
        ys_pred, indices = torch.sort(ys_pred, descending=True)
        ys_true = ys_true[indices]

        sum_dcg = i = 0
        k = min(len(ys_true), k)
        while i < k:
            sum_dcg += (2**ys_true[i] - 1) / math.log2(i + 2)
            i += 1
        return sum_dcg

    def _ndcg_k(self, ys_true: torch.Tensor, ys_pred: torch.Tensor,
                ndcg_top_k: int) -> float:
        case_dcg = self._dcg(ys_true, ys_pred, ndcg_top_k)
        ideal_dcg = self._dcg(ys_true, ys_true, ndcg_top_k)
        return case_dcg / ideal_dcg

In [2]:
train_df, test_df = msrank_10k()
X_train = train_df.drop([0, 1], axis=1).values
y_train = train_df[0].values
query_ids_train = train_df[1].values.astype(int)

X_test = test_df.drop([0, 1], axis=1).values
y_test = test_df[0].values
query_ids_test = test_df[1].values.astype(int)

In [61]:
type(query_ids_train)

numpy.ndarray

In [58]:
len(np.unique(query_ids_train))

87

In [9]:
np.unique(query_ids_test)

array([  13,   28,   43,   58,   73,   88,  103,  118,  133,  148,  163,
        178,  193,  208,  223,  238,  253,  268,  283,  298,  313,  328,
        343,  358,  373,  388,  403,  418,  433,  448,  463,  478,  493,
        508,  523,  538,  553,  568,  583,  598,  613,  628,  643,  658,
        673,  688,  703,  718,  733,  748,  763,  778,  793,  808,  823,
        838,  853,  868,  883,  898,  913,  928,  943,  958,  973,  988,
       1003, 1018, 1033, 1048, 1063, 1078, 1093, 1108, 1123, 1138, 1153,
       1168, 1183, 1198, 1213, 1228, 1243, 1258, 1273, 1288, 1303, 1318])

In [21]:
X_train[query_ids_train == 1]

array([[ 3. ,  3. ,  0. , ...,  0. ,  0. ,  0. ],
       [ 3. ,  0. ,  3. , ...,  0. ,  0. ,  0. ],
       [ 3. ,  0. ,  2. , ...,  0. ,  0. ,  0. ],
       ...,
       [ 3. ,  3. ,  3. , ...,  0. ,  0. ,  0. ],
       [ 2. ,  0. ,  2. , ...,  0. , 20. , 42.6],
       [ 3. ,  0. ,  3. , ...,  0. ,  0. ,  0. ]])

In [27]:
X_train.shape

(10000, 136)

In [25]:
scaler = StandardScaler()

In [47]:
scaled_arrays = []
for query_id in np.unique(query_ids_train):
    scaled_part = scaler.fit_transform(X_train[query_ids_train == query_id])
    scaled_arrays.append(scaled_part)
scaled_X_train = np.concatenate(scaled_arrays, axis=0)

In [48]:
scaled_X_train.shape

(10000, 136)

In [26]:
scaler.fit_transform(X_train[query_ids_train == 1])

array([[ 0.31606376,  4.81705177, -2.17593133, ..., -0.11175774,
        -0.19593518, -0.26622504],
       [ 0.31606376, -0.23497813,  0.61705515, ..., -0.11175774,
        -0.19593518, -0.26622504],
       [ 0.31606376, -0.23497813, -0.31394034, ..., -0.11175774,
        -0.19593518, -0.26622504],
       ...,
       [ 0.31606376,  4.81705177,  0.61705515, ..., -0.11175774,
        -0.19593518, -0.26622504],
       [-1.28284703, -0.23497813, -0.31394034, ..., -0.11175774,
        -0.07383065,  1.91992331],
       [ 0.31606376, -0.23497813,  0.61705515, ..., -0.11175774,
        -0.19593518, -0.26622504]])

In [82]:
num_feats = scaled_X_train.shape[1]

In [83]:
model = ListNet(num_feats, 30)

In [85]:
model.model[0]

Linear(in_features=136, out_features=60, bias=True)

In [97]:
slt = Solution()
ndcg = slt.fit()

<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>


In [98]:
ndcg

[0.13444121040412244,
 0.13444121040412244,
 0.13444121040412244,
 0.13444121040412244,
 0.13444121040412244]