# Импортируем необходимые библиотеки

In [1]:
!pip install pytorch-lifestream
!pip install comet_ml

Collecting pytorch-lifestream
  Downloading pytorch-lifestream-0.6.0.tar.gz (163 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.4/163.4 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting duckdb (from pytorch-lifestream)
  Downloading duckdb-1.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (966 bytes)
Collecting hydra-core>=1.1.2 (from pytorch-lifestream)
  Downloading hydra_core-1.3.2-py3-none-any.whl.metadata (5.5 kB)
Collecting omegaconf (from pytorch-lifestream)
  Downloading omegaconf-2.3.0-py3-none-any.whl.metadata (3.9 kB)
Collecting antlr4-python3-runtime==4.9.* (from hydra-core>=1.1.2->pytorch-lifestream)
  Downloading antlr4-python3-runtime-4.9.3.tar.gz (117 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.0/117.0 kB[0m [31m7

In [2]:
# data preprocessing
import os
import numpy as np
import pandas as pd
import pickle

# misc
from tqdm import tqdm
from functools import partial

# logging
import comet_ml

# classical ML
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from catboost import CatBoostClassifier

# basic deep learning libs
import torch
import pytorch_lightning as pl
import torchmetrics

# ptls
from ptls.nn import TrxEncoder, RnnSeqEncoder, TransformerEncoder, GptEncoder, Head
from ptls.frames import PtlsDataModule
from ptls.frames.coles import CoLESModule
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames.cpc import CpcModule
from ptls.frames.cpc import CpcDataset
from ptls.frames.gpt import GptDataset
from ptls.frames.supervised import SeqToTargetDataset, SequenceToTarget
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.datasets import inference_data_loader
from ptls.frames.inference_module import InferenceModule
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.preprocessing import PandasDataPreprocessor
from ptls.data_load.utils import collate_feature_dict
from ptls.frames.inference_module import InferenceModule
from ptls.frames.coles.losses.softmax_loss import SoftmaxLoss

In [3]:
def seed_everything(seed):
    """Seed every random number generator this notebook may draw from.

    Seeds Python's built-in ``random`` module, NumPy's global generator and
    PyTorch (CPU generator and all CUDA devices), then switches cuDNN to
    deterministic mode with auto-tuning disabled.

    Args:
        seed (int): Seed value shared by all generators.
    """
    # Local import: the notebook's import cell does not bring `random` into scope.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [5]:
# Log in to Comet (the experiment-tracking service imported above under "logging").
# NOTE(review): presumably reads the API key from config/environment or prompts for it — confirm against comet_ml docs.
comet_ml.login()

In [6]:
from pytorch_lightning.loggers import CometLogger

---

**SWIN1D_Encoder (orig. implementation by Yukara Ikemiya):**

In [7]:
#------------------------------------------------------------------------------------------------------------
# Based on https://github.com/yukara-ikemiya/Swin-Transformer-1d/tree/main and adapted to pytorch-lifestream
#------------------------------------------------------------------------------------------------------------

import torch
import torch.nn as nn
from ptls.data_load.padded_batch import PaddedBatch


def drop_path(x, drop_prob: float = 0., training: bool = False, scale_by_keep: bool = True):
    """Stochastic depth: randomly zero whole samples of a residual branch.

    Adapted from timm/models/layers/drop.py. One Bernoulli draw is made per
    element of the leading (batch) dimension and broadcast over all remaining
    dimensions, so a sample's branch output is either kept or dropped entirely.

    Args:
        x: Input tensor; the first dimension is treated as the per-sample axis.
        drop_prob (float): Probability of dropping a sample.
        training (bool): When False the input is returned untouched.
        scale_by_keep (bool): When True (and the keep probability is positive),
            kept samples are divided by the keep probability.

    Returns:
        `x` itself when inactive, otherwise `x` multiplied by the per-sample mask.
    """
    if not training or drop_prob == 0.:
        return x

    survival = 1 - drop_prob
    # (batch, 1, 1, ...) so the mask broadcasts over every non-batch dimension
    per_sample_shape = (x.shape[0],) + (1,) * (x.ndim - 1)
    keep_mask = x.new_empty(per_sample_shape).bernoulli_(survival)
    if scale_by_keep and survival > 0.0:
        keep_mask.div_(survival)
    return x * keep_mask


class DropPath(nn.Module):
    # adapted from timm/models/layers/drop.py
    """Module wrapper around `drop_path` (stochastic depth per sample).

    Args:
        drop_prob (float | None): Probability of dropping a sample's branch.
            ``None`` means "disabled" and is stored as 0.
        scale_by_keep (bool): Forwarded to `drop_path`.
    """

    def __init__(self, drop_prob=None, scale_by_keep=True):
        super(DropPath, self).__init__()
        # Normalize None to 0. so that a training-mode forward never evaluates `1 - None`.
        self.drop_prob = 0. if drop_prob is None else drop_prob
        self.scale_by_keep = scale_by_keep

    def forward(self, x):
        """Apply `drop_path` using this module's settings and its current train/eval mode."""
        return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)

    def extra_repr(self) -> str:
        return f'drop_prob={self.drop_prob}'


class Mlp(nn.Module):
    """Position-wise feed-forward network: Linear -> activation -> dropout -> Linear -> dropout.

    Args:
        in_features (int): Size of the last input dimension.
        hidden_features (int, optional): Width of the hidden layer; falls back to
            `in_features` when not given (or falsy).
        out_features (int, optional): Size of the last output dimension; falls back to
            `in_features` when not given (or falsy).
        act_layer: Callable returning the activation module. Default: nn.GELU
        drop (float): Dropout probability; the same Dropout module is applied after the
            activation and after the output projection. Default: 0.0
    """

    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        hidden_width = hidden_features if hidden_features else in_features
        output_width = out_features if out_features else in_features
        self.fc1 = nn.Linear(in_features, hidden_width)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_width, output_width)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))


def window_partition(x, window_size):
    """Cut a batch of sequences into consecutive, non-overlapping windows.

    Args:
        x: tensor of shape (B, L, C); L must be a multiple of `window_size`
        window_size (int): number of positions per window

    Returns:
        windows: tensor of shape (num_windows*B, window_size, C), ordered
        batch-major (all windows of sample 0 first, then sample 1, ...)
    """
    batch, length, channels = x.shape
    n_windows = length // window_size
    grouped = x.view(batch, n_windows, window_size, channels)
    return grouped.contiguous().view(-1, window_size, channels)


def window_reverse(windows, window_size, L):
    """Merge windows back into full-length sequences (inverse of `window_partition`).

    Args:
        windows: (num_windows*B, window_size, C)
        window_size (int): Window size
        L (int): Length of data; must be a positive multiple of `window_size`

    Returns:
        x: (B, L, C)

    Raises:
        ValueError: if `L` is not a positive multiple of `window_size`, or the number
            of windows is not a multiple of the number of windows per sequence.
    """
    if L <= 0 or L % window_size != 0:
        raise ValueError(f'L ({L}) must be a positive multiple of window_size ({window_size})')

    # Integer arithmetic throughout: avoids float rounding and silently wrong shapes.
    windows_per_seq = L // window_size
    total_windows = windows.shape[0]
    if total_windows % windows_per_seq != 0:
        raise ValueError(
            f'number of windows ({total_windows}) is not a multiple of windows per sequence ({windows_per_seq})')

    B = total_windows // windows_per_seq
    x = windows.reshape(B, windows_per_seq, window_size, -1)
    return x.reshape(B, L, -1)


class WindowAttention(nn.Module):
    r""" Window based multi-head self attention (W-MSA) module with relative position bias.
    It supports both of shifted and non-shifted window.

    Args:
        dim (int): Number of input channels. Must be divisible by `num_heads`.
        window_size (int): The width of the window.
        num_heads (int): Number of attention heads.
        qkv_bias (bool, optional):  If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
        proj_drop (float, optional): Dropout ratio of output. Default: 0.0

    Raises:
        ValueError: if `dim` is not divisible by `num_heads`.
    """

    def __init__(self, dim: int, window_size: int, num_heads: int,
                 qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.):

        super().__init__()
        # Fail at construction time instead of with an opaque reshape error inside forward().
        if dim % num_heads != 0:
            raise ValueError(f'dim ({dim}) must be divisible by num_heads ({num_heads})')

        self.dim = dim
        self.window_size = window_size
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim ** -0.5

        # define a parameter table of relative position bias
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros(2 * window_size - 1, num_heads))  # 2*window_size - 1, nH

        # get pair-wise relative position index for each token inside the window
        coords_w = torch.arange(self.window_size)
        relative_coords = coords_w[:, None] - coords_w[None, :]  # W, W
        relative_coords[:, :] += self.window_size - 1  # shift to start from 0

        # relative_position_index | example (window_size = 3)
        # [2, 1, 0]
        # [3, 2, 1]
        # [4, 3, 2]
        self.register_buffer("relative_position_index", relative_coords)  # (W, W): range of 0 -- 2*(W-1)

        # NOTE: qkv and proj are created before the bias table is initialised; keep this
        # order so that a fixed seed reproduces the same initial weights.
        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        torch.nn.init.trunc_normal_(self.relative_position_bias_table, std=.02)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, mask_add=None, mask_mult=None):
        """
        Args:
            x: input features with shape of (num_windows*B, W, C)
            mask_add: additive (0/-inf) mask added to the attention logits before softmax,
                broadcastable to (B, num_windows, num_heads, W, W); its dim 1 is read as
                num_windows. None disables it.
            mask_mult: multiplicative mask applied to the attention weights after softmax,
                broadcastable to (num_windows*B, num_heads, W, W). None disables it.

        Returns:
            tensor of shape (num_windows*B, W, C)
        """
        B_, N, C = x.shape
        qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)

        q = q * self.scale
        attn = (q @ k.transpose(-2, -1))

        relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
            self.window_size, self.window_size, -1)  # W, W, nH
        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, W, W
        attn = attn + relative_position_bias.unsqueeze(0)

        if mask_add is not None:
            nW = mask_add.shape[1]
            attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask_add
            attn = attn.view(-1, self.num_heads, N, N)
        attn = self.softmax(attn)
        if mask_mult is not None:
            # zero the weights the caller marked invalid (softmax alone cannot produce exact zeros rows)
            attn = attn * mask_mult

        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)

        x = self.proj(x)
        x = self.proj_drop(x)
        return x

    def extra_repr(self) -> str:
        return f'dim={self.dim}, window_size={self.window_size}, num_heads={self.num_heads}'


class SwinTransformerBlock(nn.Module):
    r""" Swin Transformer Block (1D) for padded batches of variable-length sequences.

    The block zeroes padding positions, pads the sequence axis with zeros up to a multiple
    of `window_size`, applies (optionally shifted) window attention and an MLP, each with a
    residual connection, and returns a payload with the same length as its input.

    Args:
        dim (int): Number of input channels.
        num_heads (int): Number of attention heads.
        window_size (int): Window size.
        shift_size (int): Shift size for SW-MSA.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float, optional): Stochastic depth rate. Default: 0.0
        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
        decoder (bool, optional): Flag that shows whether this block is decoder-like (hence, attn_mask should prevent from seeing future tokens). True => decoder-like; False => encoder-like. Default: False
        start_end_fusion (bool, optional): Flag that shows if the last and the first half-windows should merge (True) or not (False).
    """

    def __init__(self, dim, num_heads, window_size=7, shift_size=0,
                 mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0.,
                 act_layer=nn.GELU, norm_layer=nn.LayerNorm,
                 decoder=False, start_end_fusion=True):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.window_size = window_size
        self.shift_size = shift_size
        self.mlp_ratio = mlp_ratio
        assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"

        self.norm1 = norm_layer(dim)
        self.attn = WindowAttention(
            dim, window_size=self.window_size, num_heads=num_heads,
            qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)

        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)

        # Kept only so that the `attn_mask` attribute keeps existing; the real masks are
        # rebuilt from `seq_lens` on every forward pass.
        attn_mask = None
        self.register_buffer("attn_mask", attn_mask)

        self.decoder = decoder
        self.start_end_fusion = start_end_fusion

    def forward(self, x):
        """
        Args:
            x: batch object exposing `payload` of shape (B, L, C) and `seq_lens` of shape (B,)
               (a `PaddedBatch`).

        Returns:
            PaddedBatch whose payload has shape (B, L, C) and whose `seq_lens` are the input's.
        """
        seq_lens = x.seq_lens
        x = x.payload

        B, L, C = x.shape
        # Remember the incoming length: the window-alignment padding added below is
        # stripped again before returning, so the block preserves the sequence length.
        input_len = L

        # define seq_len_mask: 1. for positions < seq_len, 0. for padding
        mask = torch.arange(L, device=x.device)[None, :] + torch.ones((B, L), device=x.device)  # 1-based positions
        mask[mask > seq_lens[:, None]] = 0.
        mask[mask > 0.] = 1.
        mask = mask[:, :, None]

        # make new max seq_len `L` divisible by `self.window_size` by adding 'zero' samples
        num_samples_to_add = self.window_size - (L % self.window_size)

        if num_samples_to_add < self.window_size:  # == window_size means L is already aligned
            additional_samples = torch.zeros((B, num_samples_to_add, C), device=x.device)
            x = torch.cat((x, additional_samples), dim=1)
            mask_additional_samples = torch.zeros((B, num_samples_to_add, mask.shape[2]), device=mask.device)
            mask = torch.cat((mask, mask_additional_samples), dim=1)
            L += num_samples_to_add

        # zero out padding transactions
        x = x * mask

        assert L >= self.window_size, f'input length ({L}) must be >= window size ({self.window_size})'
        assert L % self.window_size == 0, f'input length ({L}) must be divisible by window size ({self.window_size})'

        shortcut = x
        x = self.norm1(x)

        # shift
        if self.shift_size > 0:
            shifted_x = torch.roll(x, shifts=-self.shift_size, dims=1)  # cyclic shift
            if not self.start_end_fusion:
                shifted_x[:, -self.shift_size:] = 0.  # zero out the wrapped-around (invalid) embs
            mask = torch.roll(mask, shifts=-self.shift_size, dims=1)  # cyclic shift of the mask
            if not self.start_end_fusion:
                mask[:, -self.shift_size:] = 0.
        else:
            shifted_x = x

        # partition
        x_windows = window_partition(shifted_x, self.window_size)  # nW*B, window_size, C
        mask = window_partition(mask, self.window_size)  # nW*B, window_size, 1

        # calculate attn_mask: 1. where both query and key positions are valid
        attn_mask = (mask @ mask.transpose(-2, -1))  # nW*B, window_size, window_size

        if self.decoder:
            # lower-triangular (incl. diagonal): a position may not attend to later ones
            no_look_ahead_attn_mask = 1. - torch.triu(torch.ones_like(attn_mask), diagonal=1)
            attn_mask *= no_look_ahead_attn_mask

        # 0/1 copy, applied multiplicatively after softmax
        attn_mask_real = attn_mask.clone().detach()
        attn_mask_real = attn_mask_real.view(attn_mask_real.shape[0], self.window_size, self.window_size).unsqueeze(1).expand(-1, self.num_heads, -1, -1)  # B*nW, nH, window_size, window_size

        # additive 0/-inf version, applied before softmax
        attn_mask[attn_mask == 0.] = -torch.inf
        attn_mask[attn_mask == 1.] = 0.
        # Force the diagonal to 0 so that every softmax row keeps one finite logit (an all -inf
        # row would yield NaN); rows of invalid queries are zeroed afterwards by `mask_mult`.
        attn_mask[:, torch.arange(attn_mask.shape[-1]), torch.arange(attn_mask.shape[-1])] = 0.
        attn_mask = attn_mask.view(B, attn_mask.shape[0] // B, self.window_size, self.window_size).unsqueeze(2).expand(-1, -1, self.num_heads, -1, -1)  # B, nW, nH, window_size, window_size

        # W-MSA/SW-MSA
        attn_windows = self.attn(x_windows, mask_add=attn_mask, mask_mult=attn_mask_real)  # nW*B, window_size, C

        # merge windows
        shifted_x = window_reverse(attn_windows, self.window_size, L)  # (B, L, C)

        # reverse the cyclic shift
        if self.shift_size > 0:
            x = torch.roll(shifted_x, shifts=self.shift_size, dims=1)  # cyclic shift
            if not self.start_end_fusion:
                x[:, :self.shift_size] = 0.  # zero out invalid embs
        else:
            x = shifted_x

        x = shortcut + self.drop_path(x)

        # FFN
        x = x + self.drop_path(self.mlp(self.norm2(x)))

        # Strip the window-alignment padding. Valid positions are unaffected: every block
        # re-masks and re-pads its own input, so the tail never carried information.
        if L != input_len:
            x = x[:, :input_len].contiguous()

        return PaddedBatch(x, seq_lens)

    def extra_repr(self) -> str:
        return f"dim={self.dim}, num_heads={self.num_heads}, " \
               f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}"


class SwinTransformerLayer(nn.Module):
    """ A basic Swin Transformer layer for one stage.

    Builds `depth` blocks that alternate between non-shifted windows (even block index)
    and windows shifted by `window_size // 2` (odd block index), and applies them in order.

    Args:
        dim (int): Number of input channels.
        depth (int): Number of blocks.
        num_heads (int): Number of attention heads.
        window_size (int): Local window size.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float | list[float] | tuple[float], optional): Stochastic depth rate. A list/tuple is
            indexed by block position (one rate per block); a float is shared by all blocks. Default: 0.0
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
        decoder (bool, optional): Flag that shows whether blocks in this layer are decoder-like. True => decoder-like; False => encoder-like. Default: False
        start_end_fusion (bool, optional): Flag that shows if the last and the first half-windows should merge (True) or not (False).
    """

    def __init__(
        self,
        dim: int,
        depth: int,
        num_heads: int,
        window_size: int,
        mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0.,
        drop_path=0., norm_layer=nn.LayerNorm,
        decoder=False, start_end_fusion=True
    ):
        super().__init__()
        self.dim = dim
        self.depth = depth
        self.num_heads = num_heads
        self.window_size = window_size

        # Tuples are accepted as well as lists (the documented type is a tuple); previously a
        # tuple was passed whole to each block and failed on `drop_path > 0.`.
        per_block_drop_path = isinstance(drop_path, (list, tuple))

        # build blocks
        self.blocks = nn.ModuleList([
            SwinTransformerBlock(dim=dim,
                                 num_heads=num_heads, window_size=window_size,
                                 shift_size=0 if (i % 2 == 0) else window_size // 2,
                                 mlp_ratio=mlp_ratio,
                                 qkv_bias=qkv_bias, qk_scale=qk_scale,
                                 drop=drop, attn_drop=attn_drop,
                                 drop_path=drop_path[i] if per_block_drop_path else drop_path,
                                 norm_layer=norm_layer,
                                 decoder=decoder,
                                 start_end_fusion=start_end_fusion)
            for i in range(depth)])

    def forward(self, x):
        """Pass `x` through every block in order and return the last block's output."""
        for blk in self.blocks:
            x = blk(x)
        return x

    def extra_repr(self) -> str:
        return f"dim={self.dim}, depth={self.depth}, num_heads={self.num_heads}, window_size={self.window_size}"


class SwinTransformerBackbone(nn.Module):
    """ Swin Transformer Backbone: a stack of `len(depths)` stages applied in sequence.

    Args:
        dim (int): Number of input channels.
        depths (list[int]): Numbers of blocks in stages.
        num_heads (int | list[int]): Number of attention heads in W-MSA layers; an int is shared
            by all stages, a sequence provides one value per stage.
        start_window_size (int): Local window size of stage 1.
        window_size_mult (int): the number by which the `window_size` is being multiplied when moving to another stage
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float | list[float], optional): Stochastic depth rate. Passed unchanged to every stage,
            so a list is indexed by block position *within* each stage. Default: 0.0
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
        decoder (bool, optional): Flag that shows whether blocks in this backbone are decoder-like. True => decoder-like; False => encoder-like. Default: False
        start_end_fusion (bool, optional): Flag that shows if the last and the first half-windows should merge (True) or not (False).

    Raises:
        ValueError: if `num_heads` is a sequence with fewer entries than `depths`.
    """
    def __init__(
        self,
        dim: int,
        depths: list[int],
        num_heads,
        start_window_size: int,
        window_size_mult: int = 1,
        mlp_ratio=4.,
        qkv_bias=True,
        qk_scale=None,
        drop=0.,
        attn_drop=0.,
        drop_path=0.,
        norm_layer=nn.LayerNorm,
        decoder=False,
        start_end_fusion=True
    ):
        super().__init__()
        self.dim = dim
        self.depths = depths

        if isinstance(num_heads, int):
            self.num_heads = [num_heads] * len(depths)
        else:
            if len(num_heads) < len(depths):
                raise ValueError(
                    f'num_heads has {len(num_heads)} entries but {len(depths)} stages were requested')
            self.num_heads = num_heads

        # Window size of stage i is start_window_size * window_size_mult ** i.
        self.window_sizes = [start_window_size]
        for _ in range(len(self.depths) - 1):
            self.window_sizes.append(self.window_sizes[-1] * window_size_mult)

        # build model
        self.backbone = nn.ModuleList([
            SwinTransformerLayer(dim=self.dim,
                                 depth=self.depths[i],
                                 num_heads=self.num_heads[i],
                                 window_size=self.window_sizes[i],
                                 mlp_ratio=mlp_ratio,
                                 qkv_bias=qkv_bias,
                                 qk_scale=qk_scale,
                                 drop=drop,
                                 attn_drop=attn_drop,
                                 drop_path=drop_path,
                                 norm_layer=norm_layer,
                                 decoder=decoder,
                                 start_end_fusion=start_end_fusion)
            for i in range(len(self.depths))])

    def forward(self, x):
        """Pass `x` through every stage in order; with no stages the input is returned unchanged."""
        for layer in self.backbone:
            x = layer(x)
        return x

In [8]:
def change_to_enc(swin_model):
    """Switch every block of `swin_model` to encoder mode by setting `decoder = False`.

    Walks `swin_model.backbone` (stages) and each stage's `blocks`; modifies them in place.
    """
    for stage in swin_model.backbone:
        for block in stage.blocks:
            block.decoder = False

def change_to_dec(swin_model):
    """Switch every block of `swin_model` to decoder mode by setting `decoder = True`.

    Walks `swin_model.backbone` (stages) and each stage's `blocks`; modifies them in place.
    """
    for stage in swin_model.backbone:
        for block in stage.blocks:
            block.decoder = True

# Эксперименты.

**Данные:**

In [9]:
# Transactions table, read straight from the dataset URL as a gzip-compressed CSV.
path_data = "https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/transactions_train.csv.gz?download=true"
data = pd.read_csv(path_data, compression="gzip")
# Bare last expression: the notebook renders a preview of the frame.
data

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,33172,6,4,71.463
1,33172,6,35,45.017
2,33172,8,11,13.887
3,33172,9,11,15.983
4,33172,10,11,21.341
...,...,...,...,...
26450572,43300,727,25,7.602
26450573,43300,727,15,3.709
26450574,43300,727,1,6.448
26450575,43300,727,11,24.669


In [10]:
# Per-client labels (columns `client_id` and `bins`, see the preview below), read from the dataset URL.
path_target = "https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/train_target.csv?download=true"
target = pd.read_csv(path_target)
# Bare last expression: the notebook renders a preview of the frame.
target

Unnamed: 0,client_id,bins
0,24662,2
1,1046,0
2,34089,2
3,34848,1
4,47076,3
...,...,...
29995,14303,1
29996,22301,2
29997,25731,0
29998,16820,3


In [11]:
# Hold out 10% of the clients for testing, stratified by the label column `bins`;
# the fixed random_state keeps the split reproducible across runs.
target_train, target_test = train_test_split(target, test_size=0.1, stratify=target["bins"], random_state=42)

In [12]:
# Keep only the transactions of the clients in each split (inner join on `client_id`).
trx_data_train = pd.merge(data, target_train["client_id"], on="client_id", how="inner")
trx_data_test = pd.merge(data, target_test["client_id"], on="client_id", how="inner")

---

**Квантизация непрерывных признаков (опциональный шаг, нужен только для GPT; если он используется, его нужно выполнить до создания и обучения `PandasDataPreprocessor` ниже):**

In [11]:
def digitize(input_array: np.array, q_count: int = 1, bins: np.array = None):
    """Discretize an array, either with freshly fitted quantile edges or with given edges.

    Parameters
    ----------
    input_array (np.array): Values to discretize.
    q_count (int): Number of quantile buckets to fit. Ignored when `bins` is given.
    bins (np.array):
        Bucket edges to apply. If None, the edges are computed from `input_array`
        as its i/q_count quantiles for i = 1 .. q_count - 1. Default: None

    Returns
    -------
    out_array (np.array of ints): bucket index of every element of `input_array`
    bins (np.array of floats):
        The fitted edges. Returned (as the second tuple element) only when the
        input parameter `bins` is None; otherwise `out_array` is returned alone.
    """
    # "Apply" mode: edges supplied by the caller, nothing to fit.
    if bins is not None:
        return np.digitize(input_array, bins)

    # "Fit" mode: derive the edges from the data, then apply them.
    quantile_levels = [step / q_count for step in range(1, q_count)]
    edges = np.quantile(input_array, q=quantile_levels, axis=0)
    return np.digitize(input_array, edges), edges

In [12]:
# Number of quantile buckets used to discretize numeric features (passed to `digitize` as `q_count`).
BINS_NUM = 128

In [13]:
# Columns to discretize. Each listed column is overwritten with its bucket indices.
numeric_features = ["amount_rur"]

for feat in numeric_features:
    # Fit the quantile edges on the training split only ...
    trx_data_train[feat], bins = digitize(trx_data_train[feat], q_count=BINS_NUM)
    # ... and reuse the same edges for the test split.
    trx_data_test[feat] = digitize(trx_data_test[feat], bins=bins)

In [14]:
import gc

# Run a garbage-collection pass; the number shown as the cell output is gc.collect()'s return value.
gc.collect()

147

---

In [13]:
# Preprocessor configuration: `client_id` is the id column, `trans_date` the event-time column
# (event_time_transformation="none" — presumably used as-is, confirm against ptls docs),
# `small_group` is declared categorical and `amount_rur` numerical.
# NOTE(review): if the optional quantization step was run, `amount_rur` already holds integer
# bucket indices yet is still declared numerical here — confirm this is intended.
preprocessor = PandasDataPreprocessor(
    col_id="client_id",
    col_event_time="trans_date",
    event_time_transformation="none",
    cols_category=["small_group"],
    cols_numerical=["amount_rur"],
    return_records=False,
)

In [14]:
# Fit the preprocessor on the training transactions only, then apply the fitted preprocessor to the test split.
data_train = preprocessor.fit_transform(trx_data_train)
data_test = preprocessor.transform(trx_data_test)

In [15]:
# Turn each target frame into a label Series named "target", ordered by client_id and
# re-indexed 0..n-1. Chained, non-mutating calls are used because both frames are slices
# produced by train_test_split (in-place edits on them can trigger SettingWithCopyWarning).
# NOTE(review): the labels are later matched to data_train / data_test by position, which
# presumably relies on the preprocessor emitting one row per client ordered by client_id — confirm.
# NOTE: this cell rebinds target_train / target_test from DataFrame to Series, so it cannot
# be re-run without re-running the split cell first.
target_train = (
    target_train
    .rename(columns={"bins": "target"})
    .sort_values(by="client_id")["target"]
    .reset_index(drop=True)
)
target_test = (
    target_test
    .rename(columns={"bins": "target"})
    .sort_values(by="client_id")["target"]
    .reset_index(drop=True)
)

In [16]:
# Convert both preprocessed frames into lists of per-row dicts (orient="records").
# NOTE: the names are rebound from DataFrame to list, so this cell cannot be re-run
# without re-running the fit/transform cell first.
data_train = data_train.to_dict(orient="records")
data_test = data_test.to_dict(orient="records")

---

**SWIN-RNN Seq Encoder:**

In [17]:
from ptls.nn.seq_encoder.rnn_encoder import RnnEncoder
from ptls.nn.seq_encoder.containers import SeqEncoderContainer


class SWIN_RNN_SeqEncoder(SeqEncoderContainer):
    """SeqEncoderContainer with SWIN transformer backbone for features hierarchic fusion and RnnEncoder for feature aggregation.

    The forward pass is `trx_encoder` -> SWIN backbone -> `RnnEncoder`.

    Parameters
        trx_encoder:
            TrxEncoder object
        input_size:
            input_size parameter for RnnEncoder; also used as the channel dimension of the SWIN backbone.
            If None: input_size = trx_encoder.output_size
            Set input_size explicitly or use None if your trx_encoder object has output_size attribute
        is_reduce_sequence:
            False - returns PaddedBatch with all transactions embeddings
            True - returns one embedding per sequence (as produced by RnnEncoder)
        swin_depths: Numbers of blocks in stages (SWIN backbone). None or empty => no SWIN stages.
        swin_num_heads: Number of attention heads in W-MSA layers (SWIN backbone).
        swin_start_window_size: Local window size of stage 1 (SWIN backbone).
        swin_window_size_mult (int): the number by which the `window_size` is being multiplied when moving to another stage (SWIN backbone).
        swin_drop: Dropout rate (SWIN backbone). Default: 0.0
        swin_attn_drop: Attention dropout rate (SWIN backbone). Default: 0.0
        swin_drop_path: Stochastic depth rate (SWIN backbone). Default: 0.0
        swin_decoder: Flag that shows whether blocks in SWIN backbone are decoder-like. True => decoder-like; False => encoder-like. Default: False
        swin_start_end_fusion: Flag that shows if the last and the first half-windows should merge (True) or not (False). Must be False for CPC and GPT.
        **rnn_seq_encoder_params:
            RnnEncoder params
    """
    def __init__(self,
                 trx_encoder=None,
                 input_size=None,
                 is_reduce_sequence=True,
                 swin_depths=None,
                 swin_num_heads=4,
                 swin_start_window_size=4,
                 swin_window_size_mult=1,
                 swin_drop=0.,
                 swin_attn_drop=0.,
                 swin_drop_path=0.,
                 swin_decoder=False,
                 swin_start_end_fusion=True,
                 **rnn_seq_encoder_params
                 ):
        super().__init__(
            trx_encoder=trx_encoder,
            seq_encoder_cls=RnnEncoder,
            input_size=input_size,
            seq_encoder_params=rnn_seq_encoder_params,
            is_reduce_sequence=is_reduce_sequence,
        )
        # The SWIN backbone sits between trx_encoder and the RNN, so its width must be the RNN's
        # input width: honour an explicit `input_size`, fall back to trx_encoder.output_size.
        swin_dim = input_size if input_size is not None else trx_encoder.output_size
        # None replaces the former mutable `[]` default; copy so instances never share a list
        # with each other or with the caller.
        depths = [] if swin_depths is None else list(swin_depths)

        self.swin_fusion = SwinTransformerBackbone(
                               dim=swin_dim,
                               depths=depths,
                               num_heads=swin_num_heads,
                               start_window_size=swin_start_window_size,
                               window_size_mult=swin_window_size_mult,
                               drop=swin_drop,
                               attn_drop=swin_attn_drop,
                               drop_path=swin_drop_path,
                               decoder=swin_decoder,
                               start_end_fusion=swin_start_end_fusion
                              )

    def forward(self, x, names=None, seq_len=None, h_0=None):
        """Encode a batch: transaction embeddings -> SWIN fusion -> RNN.

        `names` and `seq_len` are accepted but not used by this method; `h_0` is passed
        to the RNN encoder as its second positional argument.
        """
        x = self.trx_encoder(x)
        x = self.swin_fusion(x)
        x = self.seq_encoder(x, h_0)
        return x

**Convolution Aggregator Class:**

In [18]:
from ptls.data_load.padded_batch import PaddedBatch
import torch.nn as nn


class ConvAggregator(TrxEncoder):
    """TrxEncoder followed by a bias-free 1-D convolution over the sequence axis.

       Works like nn.Sequential([TrxEncoder, Conv Window Aggregation]): every output
       step is a learned mix of `agg_samples` consecutive transaction embeddings.

       The input is whatever `TrxEncoder.forward` accepts. The output is a `PaddedBatch`
       whose payload has shape (B, L + agg_samples - 1, T), where
       B means batch_size,
       L means the padded sequence length produced by TrxEncoder,
       T means `TrxEncoder.output_size` (the convolution keeps the channel count).
       Each sequence length grows by `agg_samples - 1`, because the convolution pads
       `agg_samples - 1` zero steps on both sides of the sequence.

       Parameters
        agg_samples (int):
            The number of transactions in a sliding aggregation window (conv kernel size).

        use_window_attention (bool):
            Reserved for an attention layer over the window. NOT implemented yet:
            passing True only emits a warning and the layer behaves as with False.

        embeddings:
            You can find info about this param in TrxEncoder desc.

        numeric_values:
            You can find info about this param in TrxEncoder desc.

        custom_embeddings:
            You can find info about this param in TrxEncoder desc.

        time_values:
            Accepted but not forwarded to TrxEncoder; a non-None value only emits a warning.

        embeddings_noise:
            You can find info about this param in TrxEncoder desc.

        emb_dropout:
            You can find info about this param in TrxEncoder desc.

        spatial_dropout:
            You can find info about this param in TrxEncoder desc.

        use_batch_norm:
            You can find info about this param in TrxEncoder desc.
            Note that the default here is False.

        use_batch_norm_with_lens:
            You can find info about this param in TrxEncoder desc.

        orthogonal_init:
            You can find info about this param in TrxEncoder desc.

        linear_projection_size:
            You can find info about this param in TrxEncoder desc.

        out_of_index:
            You can find info about this param in TrxEncoder desc.

        norm_embeddings:
            Keep default value for this parameter

        clip_replace_value:
            Not used. Keep default value for this parameter

        positions:
            Not used. Keep default value for this parameter
       """

    def __init__(self,
                 agg_samples=3,
                 use_window_attention=False,
                 embeddings=None,
                 numeric_values=None,
                 custom_embeddings=None,
                 time_values=None,
                 embeddings_noise: float = 0,
                 norm_embeddings=None,
                 use_batch_norm=False,
                 use_batch_norm_with_lens=False,
                 clip_replace_value=None,
                 positions=None,
                 emb_dropout=0,
                 spatial_dropout=False,
                 orthogonal_init=False,
                 linear_projection_size=0,
                 out_of_index: str = 'clip',
                ):
        import warnings  # function-scope import keeps this cell self-contained

        super().__init__(
            embeddings=embeddings,
            numeric_values=numeric_values,
            custom_embeddings=custom_embeddings,
            embeddings_noise=embeddings_noise,
            norm_embeddings=norm_embeddings,
            use_batch_norm=use_batch_norm,
            use_batch_norm_with_lens=use_batch_norm_with_lens,
            clip_replace_value=clip_replace_value,
            positions=positions,
            emb_dropout=emb_dropout,
            spatial_dropout=spatial_dropout,
            orthogonal_init=orthogonal_init,
            linear_projection_size=linear_projection_size,
            out_of_index=out_of_index,
        )

        if time_values is not None:
            # The argument has never been passed on to TrxEncoder; say so
            # instead of dropping it silently.
            warnings.warn(
                "ConvAggregator: `time_values` is not forwarded to TrxEncoder and has no effect.",
                stacklevel=2,
            )

        self.agg_samples = agg_samples

        channels = super().output_size

        # Conv1d works on (B, T, L). Padding by `agg_samples - 1` on both sides
        # makes the output `agg_samples - 1` steps longer than the input;
        # bias=False keeps all-zero (masked) windows at exactly zero.
        self.conv = nn.Conv1d(
            in_channels=channels,
            out_channels=channels,
            kernel_size=self.agg_samples,
            padding=(self.agg_samples - 1),
            bias=False,
        )

        self.use_window_attention = use_window_attention
        if self.use_window_attention:
            warnings.warn(
                "ConvAggregator: `use_window_attention=True` is not implemented yet; "
                "the flag is stored but no attention layer is applied.",
                stacklevel=2,
            )

    def forward(self, pb: PaddedBatch):
        """Encode `pb` with TrxEncoder, zero the padded steps and convolve over time.

        Returns a PaddedBatch with payload (B, L + agg_samples - 1, T) and
        sequence lengths increased by `agg_samples - 1`.
        """
        embeds = super().forward(pb)
        payload = embeds.payload
        seq_lens = embeds.seq_lens

        # 1 for real steps (index < sequence length), 0 for padding; shape (B, L, 1).
        step_index = torch.arange(payload.shape[1], device=payload.device)
        mask = (step_index[None, :] < seq_lens[:, None]).to(payload.dtype)[:, :, None]

        # Padded steps must not leak into the aggregation windows.
        masked_embeds = payload * mask

        # (B, L, T) -> (B, T, L) for Conv1d, then back to (B, L', T).
        agg_embeds = self.conv(masked_embeds.transpose(1, 2)).transpose(1, 2)

        new_seq_lens = seq_lens + self.agg_samples - 1

        return PaddedBatch(agg_embeds, new_seq_lens)

In [21]:
# Re-seed before building the smoke-test encoder so reruns are comparable
# (seed_everything comes from an earlier cell; presumably it seeds the
# Python/NumPy/torch RNGs — confirm against its definition).
seed_everything(0)

In [22]:
# Device string used by the smoke-test cells that follow (first CUDA GPU).
device = "cuda:0"

In [25]:
# Smoke-test setup: a plain TrxEncoder feeding the SWIN + GRU sequence encoder.
trx_encoder_params = {
    "embeddings_noise": 0.003,
    "numeric_values": {"amount_rur": "log"},
    "embeddings": {
        "trans_date": {"in": 800, "out": 16},
        "small_group": {"in": 250, "out": 16},
    },
    "linear_projection_size": 64,
}
trx_encoder = TrxEncoder(**trx_encoder_params).to(device)

# `swin_*` keys configure the SWIN stage; the remaining keys (hidden_size,
# type) are passed on by SWIN_RNN_SeqEncoder as RNN encoder parameters.
swin_rnn_params = {
    "swin_depths": [2, 2, 6, 2],
    "swin_num_heads": [2, 4, 8, 16],
    "swin_start_window_size": 4,
    "swin_window_size_mult": 2,
    "swin_drop": 0.1,
    "swin_attn_drop": 0.1,
    "swin_drop_path": 0.1,
    "swin_decoder": True,
    "swin_start_end_fusion": False,
    "hidden_size": 512,
    "type": "gru",
}
seq_encoder = SWIN_RNN_SeqEncoder(trx_encoder=trx_encoder, **swin_rnn_params).to(device)

In [27]:
# Smoke test: push every training batch through the full encoder once.
# eval() is called on the whole encoder (it recurses into trx_encoder), so the
# SWIN dropout / drop-path layers are disabled too, not only the TrxEncoder.
seq_encoder.eval()

train_loader = inference_data_loader(data_train, num_workers=0, batch_size=64)

# Inference only: no_grad avoids building an autograd graph for every batch.
with torch.no_grad():
    for batch in tqdm(train_loader):
        batch = batch.to(device)
        embeds = seq_encoder(batch)

422it [01:17,  5.45it/s]


---

**Train sequence lengths check:**

In [30]:
# Plain TrxEncoder, used below only to read the training sequence lengths.
trx_encoder_params = {
    "embeddings_noise": 0.003,
    "numeric_values": {"amount_rur": "log"},
    "embeddings": {
        "trans_date": {"in": 800, "out": 16},
        "small_group": {"in": 250, "out": 16},
    },
    "linear_projection_size": 64,
}

trx_encoder = TrxEncoder(**trx_encoder_params)
# Last expression of the cell: moves the module to the GPU and shows its repr.
trx_encoder.to("cuda")

TrxEncoder(
  (embeddings): ModuleDict(
    (trans_date): NoisyEmbedding(
      800, 16, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
    (small_group): NoisyEmbedding(
      250, 16, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
  )
  (custom_embeddings): ModuleDict(
    (amount_rur): LogScaler()
  )
  (custom_embedding_batch_norm): RBatchNorm(
    (bn): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (linear_projection_head): Linear(in_features=33, out_features=64, bias=True)
)

In [31]:
# Collect the length of every training sequence and derive the slice-length cap.
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=64)

trx_encoder.eval()

seq_lens = []

# Only the lengths are needed, so run the encoder without autograd bookkeeping.
with torch.no_grad():
    for batch in tqdm(train_loader):
        embeds_batch = trx_encoder(batch.to("cuda"))
        seq_lens.append(embeds_batch.seq_lens.cpu().numpy())

seq_lens = np.concatenate(seq_lens)

# Cap = 70% of the 75th percentile of the training sequence lengths.
threshold = int(np.quantile(seq_lens, 0.75) * 0.7)

print("Max Length:", threshold)

422it [00:02, 209.81it/s]

Max Length: 683





---

# SWIN Aggregation

- **COLES:**

In [47]:
# import gc

# gc.collect()
# torch.cuda.empty_cache()

In [19]:
# Re-seed before the CoLES experiment so data sampling and weight init are
# repeatable (seed_everything comes from an earlier cell — confirm what it seeds).
seed_everything(0)

**DataLoaders:**

In [20]:
def _coles_dataset(records):
    """Build the CoLES dataset used for both the train and the validation split.

    Sequences shorter than 30 events are filtered out; each remaining sequence
    is cut into 5 random slices of 30..683 events (683 is the "Max Length"
    printed by the sequence-length check above).
    """
    memory_dataset = MemoryMapDataset(
        data=records,
        i_filters=[SeqLenFilter(min_seq_len=30)],
    )
    slice_sampler = SampleSlices(
        split_count=5,
        cnt_min=30,
        cnt_max=683,
    )
    return ColesDataset(memory_dataset, splitter=slice_sampler)


data = PtlsDataModule(
    train_data=_coles_dataset(data_train),
    train_num_workers=4,
    train_batch_size=32,
    valid_data=_coles_dataset(data_test),
    valid_num_workers=4,
    valid_batch_size=32
)

**Модель:**

In [21]:
# Number of training epochs; also used as T_max of the cosine LR schedule below.
N_EPOCHS = 20

In [22]:
# Transaction-encoder settings; the last two keys are ConvAggregator arguments.
trx_encoder_params = {
    "embeddings_noise": 0.003,
    "numeric_values": {"amount_rur": "log"},
    "embeddings": {
        "trans_date": {"in": 800, "out": 16},
        "small_group": {"in": 250, "out": 16},
    },
    "linear_projection_size": 64,
    "agg_samples": 3,  # 3, 5, 7, 9
    "use_window_attention": False,
}

#trx_encoder = TrxEncoder(**trx_encoder_params)
trx_encoder = ConvAggregator(**trx_encoder_params)

# `swin_*` keys configure the SWIN stage; hidden_size / type go to the RNN encoder.
swin_rnn_params = {
    "swin_depths": [2, 2, 6, 2],
    "swin_num_heads": [2, 4, 8, 16],
    "swin_start_window_size": 4,
    "swin_window_size_mult": 2,
    "swin_drop": 0.1,
    "swin_attn_drop": 0.1,
    "swin_drop_path": 0.1,
    "swin_decoder": False,
    "swin_start_end_fusion": False,
    "hidden_size": 512,
    "type": "gru",
}
seq_encoder = SWIN_RNN_SeqEncoder(trx_encoder=trx_encoder, **swin_rnn_params)

# Adam with weight decay; cosine annealing from 3e-3 down to 1e-6 over N_EPOCHS.
coles = CoLESModule(
    seq_encoder=seq_encoder,
    #loss=SoftmaxLoss(),
    optimizer_partial=partial(torch.optim.Adam, lr=3e-3, weight_decay=5e-4),
    lr_scheduler_partial=partial(
        torch.optim.lr_scheduler.CosineAnnealingLR,
        T_max=N_EPOCHS,
        eta_min=1e-6,
    ),
)

**Обучение:**

In [23]:
# Comet experiment that receives the training metrics.
logger = CometLogger(
    project_name="EvS_SSL",
    experiment_name="CoLES_SWIN_agg (w/ conv_agg, 3trx)",
)

# Single-GPU trainer running for N_EPOCHS epochs.
trainer = pl.Trainer(logger=logger,
                     max_epochs=N_EPOCHS,
                     accelerator="gpu",
                     devices=1,
                     enable_progress_bar=True)

In [24]:
# Run CoLES pre-training; metrics go to Comet through the trainer's logger.
trainer.fit(coles, data)

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/askoro/evs-ssl/691f007f6de14fd7811a52366541fb43

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/kaggle/working' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

  self.pid = os.fork()


Training: |          | 0/? [00:00<?, ?it/s]

The following arguments were not expected: --md5 --explicit
Run with --help for more information.

  self.pid = os.fork()
  self.pid = os.fork()


Validation: |          | 0/? [00:00<?, ?it/s]

  self.pid = os.fork()


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : CoLES_SWIN_agg (w/ conv_agg, 3trx)
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/askoro/evs-ssl/691f007f6de14fd7811a52366541fb43
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     loss [2025]             : (12.434327125549316, 256.6272277832031)
[1;38;5;39mCOMET INFO:[0m     seq_len [337]           : (321.9937438964844, 396.76251220703125)
[1;38;5;39mCOMET INFO:[0m     valid/recall_top_k [20] : (0.7303524613380432, 0.9741855263710022)
[1;38;5;39mCOMET INFO:[0m   Others:
[1;38;5;3

In [25]:
# Display the metric values recorded by the trainer at the end of the run.
trainer.logged_metrics

{'loss': tensor(14.5103),
 'seq_len': tensor(344.6250),
 'valid/recall_top_k': tensor(0.9742)}

In [56]:
# Persist the encoder weights (state_dict only) in the working directory.
torch.save(seq_encoder.state_dict(), "coles_enc_win_agg.pt")

**Измерим качество на тесте (catboost поверх эмбеддингов):**

In [26]:
# Take the trained encoder out of the CoLES module and place it on the GPU.
device = "cuda:0"
encoder = coles.seq_encoder.to(device)

# Last expression of the cell: show the module structure.
encoder

SWIN_RNN_SeqEncoder(
  (trx_encoder): ConvAggregator(
    (embeddings): ModuleDict(
      (trans_date): NoisyEmbedding(
        800, 16, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (small_group): NoisyEmbedding(
        250, 16, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
    )
    (custom_embeddings): ModuleDict(
      (amount_rur): LogScaler()
    )
    (linear_projection_head): Linear(in_features=33, out_features=64, bias=True)
    (conv): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(2,), bias=False)
  )
  (seq_encoder): RnnEncoder(
    (rnn): GRU(64, 512, batch_first=True)
    (reducer): LastStepEncoder()
  )
  (swin_fusion): SwinTransformerBackbone(
    (backbone): ModuleList(
      (0): SwinTransformerLayer(
        dim=64, depth=2, num_heads=2, window_size=4
        (blocks): ModuleList(
          (0): SwinTransformerBlock(
            dim=64, num_heads=2, window_size=4, shift_size=0, mlp_ratio=4.0
            (no

In [27]:
from tqdm import tqdm

# Re-seed before the downstream evaluation so it is repeatable
# (seed_everything comes from an earlier cell — confirm what it seeds).
seed_everything(0)

In [28]:
# Embed every training sequence with the frozen encoder.
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=32)
encoder.eval()

# Gather per-batch arrays and join them once: concatenating inside the loop
# re-copies all previous embeddings on every iteration.
train_embed_chunks = []

# Inference only: no_grad avoids keeping an autograd graph for each batch.
with torch.no_grad():
    for batch in tqdm(train_loader):
        train_embeds_batch = encoder(batch.to(device))
        train_embed_chunks.append(train_embeds_batch.cpu().numpy())

train_embeds = np.concatenate(train_embed_chunks, axis=0)

train_embeds

844it [01:34,  8.94it/s]


array([[ 0.99999994, -0.9999983 ,  0.9999999 , ...,  0.07011775,
        -0.99997383,  0.9089313 ],
       [ 0.99999994, -0.99999917,  0.99999994, ...,  0.6910359 ,
        -0.999971  ,  0.27540633],
       [ 1.        , -0.9999976 ,  0.99999994, ...,  0.6189189 ,
        -0.9999723 , -0.13676685],
       ...,
       [ 1.        , -0.99999875,  1.        , ..., -0.8627192 ,
        -0.9999666 , -0.50936544],
       [ 1.        , -0.99999726,  0.99999994, ..., -0.00435414,
        -0.99995846,  0.28849235],
       [ 1.        , -0.9999988 ,  1.        , ...,  0.262862  ,
        -0.99998564, -0.26877946]], dtype=float32)

In [29]:
# Embed every test sequence with the frozen encoder.
test_loader = inference_data_loader(data_test, num_workers=0, batch_size=32)
encoder.eval()

# Gather per-batch arrays and join them once: concatenating inside the loop
# re-copies all previous embeddings on every iteration.
test_embed_chunks = []

# Inference only: no_grad avoids keeping an autograd graph for each batch.
with torch.no_grad():
    for batch in tqdm(test_loader):
        test_embeds_batch = encoder(batch.to(device))
        test_embed_chunks.append(test_embeds_batch.cpu().numpy())

test_embeds = np.concatenate(test_embed_chunks, axis=0)

test_embeds

94it [00:09,  9.70it/s]


array([[ 0.99999994, -0.999999  ,  1.        , ...,  0.548387  ,
        -0.99997306,  0.94794637],
       [ 0.99999994, -0.99999917,  0.9999998 , ...,  0.40317556,
        -0.9999732 , -0.59722704],
       [ 1.        , -0.9999987 ,  1.        , ...,  0.7880338 ,
        -0.9999577 , -0.18983312],
       ...,
       [ 0.99999976, -0.9999977 ,  1.        , ...,  0.704449  ,
        -0.9999575 , -0.76203877],
       [ 0.99999994, -0.9999965 ,  1.        , ...,  0.7353311 ,
        -0.99998206, -0.19870564],
       [ 1.        , -0.99999946,  0.99999994, ...,  0.7427735 ,
        -0.999995  , -0.76541483]], dtype=float32)

In [30]:
# Multi-class CatBoost on GPU 0 with a fixed seed, trained on the embeddings.
catboost_params = {
    "loss_function": 'MultiClass',
    "task_type": "GPU",
    "devices": '0',
    "random_state": 0,
}
clf = CatBoostClassifier(**catboost_params)

clf.fit(train_embeds, target_train, plot_file="catboost_log.html")



Learning rate set to 0.12714
0:	learn: 1.2954761	total: 16s	remaining: 4h 26m 50s
1:	learn: 1.2301989	total: 16s	remaining: 2h 13m 22s
2:	learn: 1.1790849	total: 16s	remaining: 1h 28m 53s
3:	learn: 1.1373166	total: 16.1s	remaining: 1h 6m 38s
4:	learn: 1.1046293	total: 16.1s	remaining: 53m 17s
5:	learn: 1.0768421	total: 16.1s	remaining: 44m 24s
6:	learn: 1.0533809	total: 16.1s	remaining: 38m 2s
7:	learn: 1.0330715	total: 16.1s	remaining: 33m 16s
8:	learn: 1.0160692	total: 16.1s	remaining: 29m 34s
9:	learn: 1.0013699	total: 16.1s	remaining: 26m 36s
10:	learn: 0.9889747	total: 16.1s	remaining: 24m 10s
11:	learn: 0.9783094	total: 16.1s	remaining: 22m 9s
12:	learn: 0.9684621	total: 16.2s	remaining: 20m 26s
13:	learn: 0.9594464	total: 16.2s	remaining: 18m 58s
14:	learn: 0.9520535	total: 16.2s	remaining: 17m 42s
15:	learn: 0.9449457	total: 16.2s	remaining: 16m 35s
16:	learn: 0.9390123	total: 16.2s	remaining: 15m 36s
17:	learn: 0.9335706	total: 16.2s	remaining: 14m 44s
18:	learn: 0.9289168	tot

<catboost.core.CatBoostClassifier at 0x7c8f9af12920>

In [31]:
# Class probabilities feed ROC-AUC; hard labels feed accuracy.
test_proba = clf.predict_proba(test_embeds)
test_pred = clf.predict(test_embeds)

In [32]:
# Accuracy on hard labels.
test_accuracy = accuracy_score(target_test, test_pred)
print("Accuracy:", test_accuracy)

# One-vs-rest ROC-AUC, weighted by class support.
test_roc_auc = roc_auc_score(
    target_test,
    test_proba,
    average="weighted",
    multi_class="ovr",
)
print("ROC-AUC:", test_roc_auc)

Accuracy: 0.5866666666666667
ROC-AUC: 0.8418938072777914


In [4]:
# ROC-AUC of the three ConvAgg (3 trx) runs; mean and std go into the summary below.
arr = np.array([
    0.8418938072777914,
    0.8433480466040607,
    0.8435011133618263,
])

(np.mean(arr), np.std(arr))

(0.8429143224145594, 0.0007243137977252656)

- COLES embeds + Catboost:
  - Accuracy: `0.6133333333333333`, `0.606`, `0.5933333333333334`, avg: `0.6042 +- 0.0083`
  - ROC-AUC: `0.8490542004456147`, `0.848260886697585`, `0.8472952867923927`, avg: `0.8482 +- 0.0007`

---

<!-- - COLES embeds w/ SWIN_Agg seq_enc + Catboost:
  - Accuracy: `0.593`, `0.5906666666666667`, `0.5983333333333334`, avg: `0.594 +- 0.0032`
  - ROC-AUC: `0.8430090651959065`, `0.8425861754433439`, `0.8393266734968265`, avg: `0.8416 +- 0.0016` -->

- COLES embeds w/ SWIN_Agg seq_enc + Catboost:
  - Accuracy: `0.5993333333333334`, `0.5916666666666667`, `0.5993333333333334`, avg: `0.5968 +- 0.0036`
  - ROC-AUC: `0.8435234374762616`, `0.8419468094194725`, `0.8457581938901253`, avg: `0.8437 +- 0.0016`

---

- COLES embeds w/ SWIN_Agg seq_enc & ConvAgg (3 trx) + Catboost:
  - Accuracy: `0.5866666666666667`, `0.5933333333333334`, `0.598`, avg: `0.5927 +- 0.0047`
  - ROC-AUC: `0.8418938072777914`, `0.8433480466040607`, `0.8435011133618263`, avg: `0.8429 +- 0.0007`

---

<!-- - COLES embeds w/ SWIN_Agg seq_enc + ConvAgg (3 trx) + Catboost:
  - Accuracy: `0.5903333333333334`, `0.5883333333333334`, `0.5846666666666667`, avg: `0.5878 +- 0.0023`
  - ROC-AUC: `0.8423263115862589`, `0.8404208025778663`, `0.8384572507278364`, avg: `0.8404 +- 0.0016`

--- -->

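**Вывод:** для CoLES добавление SWIN-блока перед RNN-энкодером (SWIN_Agg) ухудшает качество: ROC-AUC падает с `0.8482` до `0.8437`, accuracy — с `0.6042` до `0.5968` (изменение accuracy лежит в пределах разброса между запусками). При добавлении свёрточной агрегации (ConvAgg, 3 trx) качество становится ещё немного хуже: ROC-AUC `0.8429`, accuracy `0.5927`.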

**Конфигурация, лучшая по метрикам:**

- COLES embeds w/ SWIN_Agg seq_enc + Catboost:
  - Accuracy: `0.5993333333333334`, `0.5916666666666667`, `0.5993333333333334`, avg: `0.5968 +- 0.0036`
  - ROC-AUC: `0.8435234374762616`, `0.8419468094194725`, `0.8457581938901253`, avg: `0.8437 +- 0.0016`

---

**Train sequence lengths check:**

In [19]:
# Plain TrxEncoder, used below only to read the training sequence lengths.
agg_encoder_params = {
    "embeddings_noise": 0.003,
    "numeric_values": {"amount_rur": "log"},
    "embeddings": {
        "trans_date": {"in": 800, "out": 16},
        "small_group": {"in": 250, "out": 16},
    },
    "linear_projection_size": 260,
}

trx_encoder = TrxEncoder(**agg_encoder_params)
# Last expression of the cell: moves the module to the GPU and shows its repr.
trx_encoder.to("cuda")

TrxEncoder(
  (embeddings): ModuleDict(
    (trans_date): NoisyEmbedding(
      800, 16, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
    (small_group): NoisyEmbedding(
      250, 16, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
  )
  (custom_embeddings): ModuleDict(
    (amount_rur): LogScaler()
  )
  (custom_embedding_batch_norm): RBatchNorm(
    (bn): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (linear_projection_head): Linear(in_features=33, out_features=260, bias=True)
)

In [21]:
# Collect the length of every training sequence and derive the CPC length bound.
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)

trx_encoder.eval()

seq_lens = []

# Only the lengths are needed, so run the encoder without autograd bookkeeping.
with torch.no_grad():
    for batch in tqdm(train_loader):
        embeds_batch = trx_encoder(batch.to("cuda"))
        seq_lens.append(embeds_batch.seq_lens.cpu().numpy())

seq_lens = np.concatenate(seq_lens)

# Bound = 60th percentile of the training sequence lengths.
threshold = int(np.quantile(seq_lens, 0.6))

print("Max Length:", threshold)

211it [00:01, 112.30it/s]

Max Length: 863





---

- **CPC modeling:**

---

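**Скорректируем класс CpcModule так, чтобы при работе CPC не было утечек данных (data leaks):** после свёрточной агрегации соседние эмбеддинги строятся по пересекающимся транзакциям, поэтому предсказываемые шаги дополнительно сдвигаются на `agg_samples - 1`.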

In [19]:
import torch
from torch import nn as nn
from torch.nn import functional as F
from ptls.data_load.padded_batch import PaddedBatch


class CPC_ShiftedLoss(nn.Module):
    """CPC contrastive loss whose prediction targets are moved `shift` steps further.

    For forward step ``i`` (1-based) the context vector at position ``t`` is
    scored against the base embedding at position ``t + i + shift`` (positive)
    and against ``n_negatives`` base embeddings sampled from the non-padded
    positions of the whole batch (negatives).

    Both inputs only need ``.payload`` and ``.seq_lens`` attributes:
    base payload is (B, L, E), mapped-context payload is (B, L_ctx, E, n_steps).
    """

    def __init__(self, n_negatives=None, n_forward_steps=None, shift=0):
        super().__init__()
        self.n_negatives = n_negatives
        self.n_forward_steps = n_forward_steps
        self.shift = shift

    @staticmethod
    def _length_mask(seq_lens, batch_size, max_seq_len, device):
        """Float mask (B, L): 1.0 where the position index is below the sequence length."""
        positions = torch.arange(max_seq_len, device=device).unsqueeze(0).expand(batch_size, -1)
        return (positions < seq_lens.unsqueeze(1)).float()

    def _get_preds(self, base_embeddings, mapped_ctx_embeddings):
        """Return per-step lists of positive scores (B, L') and negative scores (B, L', n_negatives)."""
        base = base_embeddings.payload
        batch_size, max_seq_len, emb_size = base.shape
        _, _, _, n_forward_steps = mapped_ctx_embeddings.payload.shape
        device = mapped_ctx_embeddings.payload.device

        # Context may be longer than the base sequence; keep the common prefix.
        context = mapped_ctx_embeddings.payload[:, :max_seq_len, :, :]
        len_mask = self._length_mask(mapped_ctx_embeddings.seq_lens, batch_size, max_seq_len, device)

        # Every non-padded position of every sequence is a negative candidate
        # for every row, hence the same flattened mask repeated B times.
        candidates = base.reshape(batch_size * max_seq_len, emb_size)
        sampling_weights = (
            len_mask.unsqueeze(0)
            .expand(batch_size, *len_mask.shape)
            .clone()
            .reshape(batch_size, -1)
        )
        sample_ids = torch.multinomial(sampling_weights, self.n_negatives)
        neg_samples_t = candidates[sample_ids].transpose(-2, -1)

        # Zero the context vectors that sit on padded positions.
        trimmed_ctx = context * len_mask[:, :, None, None]

        positive_preds, neg_preds = [], []
        for step in range(1, n_forward_steps + 1):
            offset = step + self.shift
            ctx_step = trimmed_ctx[:, 0:(max_seq_len - offset), :, step - 1]
            target_step = base[:, offset:max_seq_len]

            positive_preds.append((ctx_step * target_step).sum(dim=-1))
            neg_preds.append(torch.matmul(ctx_step, neg_samples_t))

        return positive_preds, neg_preds

    def forward(self, embeddings, _):
        """Mean over forward steps of the mean negative log-softmax of the positive score."""
        base_embeddings, _, mapped_ctx_embeddings = embeddings
        positive_preds, neg_preds = self._get_preds(base_embeddings, mapped_ctx_embeddings)

        step_losses = [
            -F.log_softmax(torch.cat([pos.unsqueeze(-1), neg], dim=-1), dim=-1)[:, :, 0].mean()
            for pos, neg in zip(positive_preds, neg_preds)
        ]
        return torch.stack(step_losses).mean()

    def cpc_accuracy(self, embeddings, _):
        """Share of non-padded target positions whose positive score beats all negatives."""
        base_embeddings, _, mapped_ctx_embeddings = embeddings
        positive_preds, neg_preds = self._get_preds(base_embeddings, mapped_ctx_embeddings)

        batch_size, max_seq_len, _ = base_embeddings.payload.shape
        device = mapped_ctx_embeddings.payload.device
        len_mask = self._length_mask(mapped_ctx_embeddings.seq_lens, batch_size, max_seq_len, device)

        total, accurate = 0, 0
        for step, (pos, neg) in enumerate(zip(positive_preds, neg_preds), start=1):
            # Mask aligned with the target positions of this forward step.
            step_mask = len_mask[:, (self.shift + step):max_seq_len]
            total += step_mask.sum().item()

            beats_all = (pos.unsqueeze(-1) > neg).sum(dim=-1) == self.n_negatives
            accurate += (beats_all * step_mask).sum().item()
        return accurate / total

In [20]:
import torch

from ptls.frames.abs_module import ABSModule
from ptls.frames.cpc.metrics.cpc_accuracy import CpcAccuracy
from ptls.nn.seq_encoder import RnnSeqEncoder
from ptls.data_load.padded_batch import PaddedBatch


class CpcModule(ABSModule):
    """Contrastive Predictive Coding ([CPC](https://arxiv.org/abs/1807.03748))
    for an encoder made of `trx_encoder`, `swin_fusion` and `seq_encoder` stages.

    The input sequence is encoded by `seq_encoder.trx_encoder` into per-step
    embeddings `z`. `seq_encoder.swin_fusion` followed by `seq_encoder.seq_encoder`
    turns `z` into per-step context vectors.
    One linear predictor per forward step maps a context vector to a prediction
    of a future `z`; `CPC_ShiftedLoss` scores it against sampled negatives.

    Parameters
        validation_metric:
            Keep None. CpcAccuracy built on the module's loss is used by default
        seq_encoder:
            Encoder exposing `trx_encoder`, `swin_fusion` and `seq_encoder` sub-modules.
            Side effect: `seq_encoder.seq_encoder.is_reduce_sequence` is set to False.
        head:
            Not used
        n_negatives:
            Number of negative samples drawn per sequence by the loss.
        n_forward_steps:
            Number of future steps predicted; one linear predictor is created per step.
        shift:
            Extra offset between a context position and its prediction targets:
            'none' (or None) - no extra offset;
            'add' - `seq_encoder.trx_encoder.agg_samples - 1` steps;
            non-negative int - that many steps.
            Any other value raises ValueError.
        optimizer_partial:
            optimizer init partial. Network parameters are missed.
        lr_scheduler_partial:
            scheduler init partial. Optimizer are missed.

    """
    def __init__(self, validation_metric=None,
                       seq_encoder=None,
                       head=None,
                       n_negatives=40, n_forward_steps=6, shift='none',
                       optimizer_partial=None,
                       lr_scheduler_partial=None):

        self.save_hyperparameters('n_negatives', 'n_forward_steps')

        # Resolve `shift` to a number of steps. Unknown values are rejected:
        # silently falling back to 0 would bring back the overlap between
        # context and targets that the shift is meant to remove.
        if shift == 'add':
            shift_steps = seq_encoder.trx_encoder.agg_samples - 1
        elif shift is None or shift == 'none':
            shift_steps = 0
        elif isinstance(shift, int) and not isinstance(shift, bool) and shift >= 0:
            shift_steps = shift
        else:
            raise ValueError(
                f"shift must be 'none', 'add' or a non-negative int, got {shift!r}"
            )

        loss = CPC_ShiftedLoss(n_negatives=n_negatives, n_forward_steps=n_forward_steps, shift=shift_steps)

        if validation_metric is None:
            validation_metric = CpcAccuracy(loss)

        # CPC needs one context vector per step, not a single reduced embedding.
        seq_encoder.seq_encoder.is_reduce_sequence = False

        super().__init__(validation_metric,
                         seq_encoder,
                         loss,
                         optimizer_partial,
                         lr_scheduler_partial)

        # Predictors map a context vector back into the trx-embedding space.
        linear_size = self.seq_encoder.trx_encoder.output_size
        embedding_size = self.seq_encoder.embedding_size
        self._linears = torch.nn.ModuleList([torch.nn.Linear(embedding_size, linear_size)
                                             for _ in range(loss.n_forward_steps)])

    @property
    def metric_name(self):
        return 'cpc_accuracy'

    @property
    def is_requires_reduced_sequence(self):
        return False

    def shared_step(self, x, y):
        """Return ((base, context, mapped context), y) for the loss and the metric.

        The mapped-context payload stacks the output of every linear predictor
        along a new last axis (dim=3), one slice per forward step.
        """
        encoder = self._seq_encoder

        base_embeddings = encoder.trx_encoder(x)
        context_embeddings = encoder.seq_encoder(encoder.swin_fusion(base_embeddings))

        mapped_payload = torch.stack(
            [linear(context_embeddings.payload) for linear in self._linears],
            dim=3,
        )
        mapped_ctx_embeddings = PaddedBatch(mapped_payload, context_embeddings.seq_lens)

        return (base_embeddings, context_embeddings, mapped_ctx_embeddings), y

---

In [21]:
# Re-seed before the CPC experiment so data sampling and weight init are
# repeatable (seed_everything comes from an earlier cell — confirm what it seeds).
seed_everything(0)

**DataLoaders:**

In [22]:
def _cpc_dataset(records):
    """Build the CPC dataset used for both the train and the validation split.

    min_len=863 is the "Max Length" printed by the sequence-length check above.
    """
    return CpcDataset(
        MemoryMapDataset(data=records),
        min_len=863,
        max_len=904
    )


data = PtlsDataModule(
    train_data=_cpc_dataset(data_train),
    train_num_workers=4,
    train_batch_size=32,
    valid_data=_cpc_dataset(data_test),
    valid_num_workers=4,
    valid_batch_size=32
)

**Модель:**

In [23]:
# Number of CPC training epochs (passed to the trainer as max_epochs below).
N_EPOCHS = 20

In [24]:
# Transaction-encoder settings; the last two keys are ConvAggregator arguments.
trx_encoder_params = {
    "embeddings_noise": 0.003,
    "numeric_values": {"amount_rur": "log"},
    "embeddings": {
        "trans_date": {"in": 800, "out": 128},
        "small_group": {"in": 250, "out": 128},
    },
    "linear_projection_size": 272,
    "agg_samples": 3,  # 3, 5, 7
    "use_window_attention": False,
}

#trx_encoder = TrxEncoder(**trx_encoder_params)
trx_encoder = ConvAggregator(**trx_encoder_params)

# `swin_*` keys configure the SWIN stage; hidden_size / type go to the RNN encoder.
swin_rnn_params = {
    "swin_depths": [2, 2, 6, 2],
    "swin_num_heads": [2, 4, 8, 16],
    "swin_start_window_size": 4,
    "swin_window_size_mult": 2,
    "swin_drop": 0.1,
    "swin_attn_drop": 0.1,
    "swin_drop_path": 0.1,
    "swin_decoder": True,
    "swin_start_end_fusion": False,
    "hidden_size": 512,
    "type": "gru",
}
seq_encoder = SWIN_RNN_SeqEncoder(trx_encoder=trx_encoder, **swin_rnn_params)

# shift='add' moves the prediction targets by agg_samples - 1 extra steps.
cpc = CpcModule(
    seq_encoder=seq_encoder,
    n_forward_steps=6,
    n_negatives=40,
    shift='add',  # 'none' / 'add'
    optimizer_partial=partial(torch.optim.Adam, lr=1e-3),
    lr_scheduler_partial=partial(
        torch.optim.lr_scheduler.StepLR,
        step_size=5,
        gamma=0.5,
    ),
)

**Обучение:**

In [25]:
# Comet experiment that receives the training metrics.
logger = CometLogger(
    project_name="EvS_SSL",
    experiment_name="CPC_modeling_SWIN_agg (w/ conv_agg, 3trx)",
)

# Single-GPU trainer running for N_EPOCHS epochs.
trainer = pl.Trainer(logger=logger,
                     max_epochs=N_EPOCHS,
                     accelerator="gpu",
                     devices=1,
                     enable_progress_bar=True)

In [26]:
# Run CPC pre-training; metrics go to Comet through the trainer's logger.
trainer.fit(cpc, data)

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/askoro/evs-ssl/27d0b7749e7948ad9679c03c5258e06c

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/kaggle/working' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

  self.pid = os.fork()


Training: |          | 0/? [00:00<?, ?it/s]

The following arguments were not expected: --md5 --explicit
Run with --help for more information.

  self.pid = os.fork()
  self.pid = os.fork()


Validation: |          | 0/? [00:00<?, ?it/s]

  self.pid = os.fork()


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : CPC_modeling_SWIN_agg (w/ conv_agg, 3trx)
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/askoro/evs-ssl/27d0b7749e7948ad9679c03c5258e06c
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     loss [2025]             : (0.23935692012310028, 4.475934982299805)
[1;38;5;39mCOMET INFO:[0m     seq_len [337]           : (799.71875, 857.46875)
[1;38;5;39mCOMET INFO:[0m     valid/cpc_accuracy [20] : (0.9094287157058716, 0.9480034708976746)
[1;38;5;39mCOMET INFO:[0m   Others:
[1;38;5;39mCOMET I

In [27]:
# Display the metric values recorded by the trainer at the end of the run.
trainer.logged_metrics

{'loss': tensor(0.4193),
 'seq_len': tensor(828.7084),
 'valid/cpc_accuracy': tensor(0.9478)}

In [36]:
# Disabled checkpoint export: when enabled, writes the seq_encoder weights to a local .pt file.
#torch.save(seq_encoder.state_dict(), "cpc_enc_win_agg_trx20.pt")

**Измерим качество на тесте (catboost поверх эмбеддингов):**

In [9]:
# Disabled download of a saved encoder checkpoint from Google Drive; the recorded
# output of this cell comes from a run in which the command was active.
# !gdown "https://drive.google.com/uc?export=download&id=1iuJJfsZpvco2VAgEMxnLlv8LW9cAcUEY" -O "cpc_enc_win_agg_trx20.pt"

Downloading...
From (original): https://drive.google.com/uc?export=download&id=1iuJJfsZpvco2VAgEMxnLlv8LW9cAcUEY
From (redirected): https://drive.google.com/uc?export=download&id=1iuJJfsZpvco2VAgEMxnLlv8LW9cAcUEY&confirm=t&uuid=c3968f85-d4c1-450a-a997-c3c531e5d2fc
To: /kaggle/working/cpc_enc_win_agg_trx20.pt
100%|██████████████████████████████████████| 44.8M/44.8M [00:00<00:00, 75.6MB/s]


In [29]:
# Disabled checkpoint restore; the recorded "<All keys matched successfully>" output
# comes from a run in which these lines were active.
# NOTE(review): weights_only=False lets torch.load unpickle arbitrary objects; a plain
# state_dict presumably loads with weights_only=True too — confirm before re-enabling.
#state_dict = torch.load("./cpc_enc_win_agg_trx20.pt", weights_only=False)

#seq_encoder.load_state_dict(state_dict)

<All keys matched successfully>

In [30]:
import gc

# Run the Python garbage collector first so that objects that are no longer
# reachable are dropped before the CUDA cache is cleared.
gc.collect()

# Ask PyTorch to release the cached CUDA memory it is not currently using.
torch.cuda.empty_cache()

In [29]:
# Sequence encoder of the `cpc` model; the cells below use it to embed sequences.
encoder = cpc.seq_encoder

# Fall back to the CPU when no CUDA device is visible, instead of failing inside .to().
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Last expression of the cell: moves the encoder and displays its module structure.
encoder.to(device)

SWIN_RNN_SeqEncoder(
  (trx_encoder): ConvAggregator(
    (embeddings): ModuleDict(
      (trans_date): NoisyEmbedding(
        800, 128, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (small_group): NoisyEmbedding(
        250, 128, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
    )
    (custom_embeddings): ModuleDict(
      (amount_rur): LogScaler()
    )
    (linear_projection_head): Linear(in_features=257, out_features=272, bias=True)
    (conv): Conv1d(272, 272, kernel_size=(3,), stride=(1,), padding=(2,), bias=False)
  )
  (seq_encoder): RnnEncoder(
    (rnn): GRU(272, 512, batch_first=True)
    (reducer): LastStepEncoder()
  )
  (swin_fusion): SwinTransformerBackbone(
    (backbone): ModuleList(
      (0): SwinTransformerLayer(
        dim=272, depth=2, num_heads=2, window_size=4
        (blocks): ModuleList(
          (0): SwinTransformerBlock(
            dim=272, num_heads=2, window_size=4, shift_size=0, mlp_ratio=4.0
      

In [31]:
# NOTE(review): the flag name suggests the inner encoder will return one vector per
# sequence instead of per-step outputs — confirm against the encoder class.
encoder.seq_encoder.is_reduce_sequence = True
# NOTE(review): change_to_enc is defined outside this chunk; presumably it switches the
# SWIN fusion block into the mode used for embedding extraction — confirm.
change_to_enc(encoder.swin_fusion)

In [46]:
# Disabled call. NOTE(review): presumably the counterpart of change_to_enc for the
# same SWIN fusion block — confirm before re-enabling.
# change_to_dec(encoder.swin_fusion)

In [33]:
# NOTE(review): tqdm is already imported in the notebook's import cell; this
# re-import is redundant.
from tqdm import tqdm

# Fix the random seed before computing embeddings (seed_everything comes from
# outside this chunk).
seed_everything(0)

In [34]:
# Embed every training sequence with the encoder in evaluation mode.
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=16)
encoder.eval()

train_embed_chunks = []
# Inference only: no_grad() stops PyTorch from recording an autograd graph for
# each forward pass, which would otherwise hold extra memory per batch.
with torch.no_grad():
    for batch in tqdm(train_loader):
        train_embed_chunks.append(encoder(batch.to(device)).detach().cpu().numpy())

# Concatenate once at the end: concatenating inside the loop copies the whole
# accumulated array on every batch (quadratic in the number of batches).
# An empty loader yields None, as in the original cell.
train_embeds = np.concatenate(train_embed_chunks, axis=0) if train_embed_chunks else None

train_embeds

1688it [03:49,  7.34it/s]


array([[ 0.50206935,  0.5502154 ,  0.21881583, ..., -0.47690132,
        -0.00952161,  0.17128213],
       [ 0.58404934,  0.5412656 ,  0.28523195, ..., -0.6154343 ,
         0.42086115,  0.56868637],
       [-0.08817012,  0.6637562 , -0.27675313, ..., -0.5988691 ,
         0.18293908, -0.01829041],
       ...,
       [ 0.22068627,  0.2929967 , -0.5602638 , ..., -0.45553845,
         0.02662687,  0.20506336],
       [ 0.561322  , -0.01245022,  0.1761291 , ..., -0.5325867 ,
         0.12356527, -0.00991007],
       [ 0.27731666,  0.34375715,  0.00902353, ..., -0.708862  ,
         0.15119435,  0.33666897]], dtype=float32)

In [35]:
# Embed every test sequence with the encoder in evaluation mode.
test_loader = inference_data_loader(data_test, num_workers=0, batch_size=16)
encoder.eval()

test_embed_chunks = []
# Inference only: no_grad() stops PyTorch from recording an autograd graph for
# each forward pass, which would otherwise hold extra memory per batch.
with torch.no_grad():
    for batch in tqdm(test_loader):
        test_embed_chunks.append(encoder(batch.to(device)).detach().cpu().numpy())

# Concatenate once at the end: concatenating inside the loop copies the whole
# accumulated array on every batch (quadratic in the number of batches).
# An empty loader yields None, as in the original cell.
test_embeds = np.concatenate(test_embed_chunks, axis=0) if test_embed_chunks else None

test_embeds

188it [00:23,  7.85it/s]


array([[ 0.4124494 ,  0.16320945, -0.06075403, ..., -0.7835697 ,
         0.23924844,  0.18599585],
       [ 0.48923135,  0.5581738 , -0.06504323, ..., -0.7392166 ,
         0.21464413,  0.11534584],
       [ 0.46545008,  0.14906222, -0.07318704, ..., -0.46150145,
         0.26937625,  0.15991728],
       ...,
       [ 0.4191622 ,  0.11884288,  0.10891388, ..., -0.7017476 ,
         0.04949887,  0.27504486],
       [ 0.25224152,  0.28723678, -0.0979318 , ..., -0.6043854 ,
         0.5650005 ,  0.22921675],
       [ 0.1502997 ,  0.03001654, -0.00657668, ..., -0.5688363 ,
         0.5408161 ,  0.42914337]], dtype=float32)

In [36]:
# Multiclass CatBoost classifier trained on GPU device 0 with a fixed random seed.
clf = CatBoostClassifier(loss_function='MultiClass', task_type="GPU", devices='0', random_state=0)

# Fit on the training embeddings; plot_file asks CatBoost to save the training
# plot to the given HTML file.
clf.fit(train_embeds, target_train, plot_file="catboost_log.html")



Learning rate set to 0.12714
0:	learn: 1.3084508	total: 17s	remaining: 4h 42m 33s
1:	learn: 1.2512947	total: 17s	remaining: 2h 21m 13s
2:	learn: 1.2066016	total: 17s	remaining: 1h 34m 6s
3:	learn: 1.1698312	total: 17s	remaining: 1h 10m 33s
4:	learn: 1.1421062	total: 17s	remaining: 56m 25s
5:	learn: 1.1173501	total: 17s	remaining: 46m 59s
6:	learn: 1.0959187	total: 17s	remaining: 40m 16s
7:	learn: 1.0780401	total: 17s	remaining: 35m 13s
8:	learn: 1.0628451	total: 17.1s	remaining: 31m 17s
9:	learn: 1.0501766	total: 17.1s	remaining: 28m 9s
10:	learn: 1.0388875	total: 17.1s	remaining: 25m 35s
11:	learn: 1.0278922	total: 17.1s	remaining: 23m 26s
12:	learn: 1.0184868	total: 17.1s	remaining: 21m 37s
13:	learn: 1.0106912	total: 17.1s	remaining: 20m 4s
14:	learn: 1.0037609	total: 17.1s	remaining: 18m 43s
15:	learn: 0.9979188	total: 17.1s	remaining: 17m 33s
16:	learn: 0.9923271	total: 17.1s	remaining: 16m 30s
17:	learn: 0.9873362	total: 17.1s	remaining: 15m 35s
18:	learn: 0.9830680	total: 17.2s	

<catboost.core.CatBoostClassifier at 0x7b72d691ae60>

In [37]:
# Predicted class labels (for accuracy) and per-class probabilities (for ROC-AUC)
# on the test embeddings.
test_pred = clf.predict(test_embeds)
test_proba = clf.predict_proba(test_embeds)

In [38]:
# Share of test samples whose predicted label matches the target.
test_accuracy = accuracy_score(target_test, test_pred)
print("Accuracy:", test_accuracy)

# One-vs-rest ROC-AUC computed from class probabilities, weighted across classes.
test_roc_auc = roc_auc_score(
    target_test,
    test_proba,
    average="weighted",
    multi_class="ovr",
)
print("ROC-AUC:", test_roc_auc)

Accuracy: 0.57
ROC-AUC: 0.8227406048367972


In [1]:
import numpy as np

In [40]:
# Metric values from three separate runs, aggregated into "mean ± std".
arr = np.array(
    [
        0.8227406048367972,
        0.8311162237991545,
        0.8247213858065967,
    ]
)

# np.std uses ddof=0 by default, i.e. the population standard deviation.
np.mean(arr), np.std(arr)

(0.8261927381475161, 0.0035741112213618035)

- CPC context embeds + Catboost:
   - `Accuracy: 0.5773333333333334`, `0.5686666666666667`, `0.5826666666666667`, avg: `0.5762 +- 0.0058`
   - `ROC-AUC: 0.830123007110738`, `0.8271157616313021`, `0.8343491131233265`, avg: `0.8305 +- 0.003`

---

<!-- - CPC context embeds w/ SWIN_Agg seq_enc + Catboost:
  - Accuracy: `0.581`, `0.5646666666666667`, `0.5726666666666667`, avg: `0.5727 +- 0.0067`
  - ROC-AUC: `0.8291802131654565`, `0.8185063563468156`, `0.8215564372620066`, avg: `0.8231 +- 0.0045`
  
---
-->

- CPC context embeds w/ SWIN_Agg seq_enc + Catboost:
   - `Accuracy: 0.59`, `0.5726666666666667`, `0.5736666666666667`, avg: `0.5788 +- 0.0079`
   - `ROC-AUC: 0.8291468943509576`, `0.8264141264285515`, `0.8222129523643`, avg: `0.8259 +- 0.0029`

<!-- ---

- CPC context embeds w/ SWIN_Agg seq_enc (w/ look-ahead mask) + Catboost:
  - Accuracy: `0.579`, `0.554`, `0.569`, avg: `0.5673 +- 0.0103`
  - ROC-AUC: `0.8309040879417245`, `0.817863458479023`, `0.8264009551174533`, avg: `0.8251 +- 0.0054` -->

---

- CPC context embeds w/ SWIN_Agg seq_enc (w/ look-ahead mask) + Catboost:
  - Accuracy: `0.5806666666666667`, `0.5723333333333334`, `0.5636666666666666`, avg: `0.5722 +- 0.0069`
  - ROC-AUC: `0.8274773162162774`, `0.8273407125045609`, `0.8207535939209638`, avg: `0.8252 +- 0.0031`

---

- CPC context embeds w/ SWIN_Agg seq_enc & ConvAgg (3 trx) + Catboost:
  - Accuracy: `0.57`, `0.5756666666666667`, `0.5756666666666667`, avg: `0.5738 +- 0.0027`
  - ROC-AUC: `0.8227406048367972`, `0.8311162237991545`, `0.8247213858065967`, avg: `0.8262 +- 0.0036`

---

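**Вывод:** Для CPC замена энкодера на SWIN-трансформер даёт лишь небольшой прирост accuracy относительно бейзлайна (`0.5788 ± 0.0079` против `0.5762 ± 0.0058`, разница в пределах разброса между запусками) и снижение ROC-AUC (`0.8259 ± 0.0029` против `0.8305 ± 0.003`). При этом сетап с attn-маской на инференсе демонстрирует худшие результаты по сравнению с вариантом без неё (accuracy `0.5722 ± 0.0069`, ROC-AUC `0.8252 ± 0.0031`).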

Сетап со свёрточной агрегацией и SWIN-энкодером демонстрирует несколько лучший ROC-AUC, чем сетап с обычным транзакционным энкодером и SWIN-энкодером (`0.8262` против `0.8259`), но всё ещё уступает бейзлайну (`0.8305`). По accuracy он уступает и сетапу с обычным транзакционным энкодером и SWIN-энкодером (`0.5738` против `0.5788`), и бейзлайну (`0.5762`).


**Конфигурация, лучшая по accuracy среди вариантов со SWIN-энкодером (она же приведена в итоговой таблице):** 

- CPC context embeds w/ SWIN_Agg seq_enc + Catboost:
   - `Accuracy: 0.59`, `0.5726666666666667`, `0.5736666666666667`, avg: `0.5788 +- 0.0079`
   - `ROC-AUC: 0.8291468943509576`, `0.8264141264285515`, `0.8222129523643`, avg: `0.8259 +- 0.0029`

---

# Итоги.

| Method|Accuracy|ROC-AUC|
| --- |:---:|:---:|
| **Flattened Sequences**                   | 0.4921 ± 0.005        | 0.76 ± 0.0012   |
| **GRU (+ MLP)**                           | 0.6066 ± 0.0019       | 0.8479 ± 0.0013 |
| **CoLES**                                 | 0.6042 ± 0.0083       | 0.8482 ± 0.0007 |
| **CoLES embeds w/ SWIN Agg encoder**      | 0.5968 ± 0.0036       | 0.8437 ± 0.0016 |
| **CPC Modeling**                          | 0.5762 ± 0.0058       | 0.8305 ± 0.003  |
| **CPC Modeling w/ SWIN Agg encoder**      | 0.5788 ± 0.0079       | 0.8259 ± 0.0029 |
| **GPT2**                                  | 0.6146 ± 0.0075       | 0.852 ± 0.0029  |