# Импортируем необходимые библиотеки

In [1]:
!pip install pytorch-lifestream
!pip install comet_ml

Collecting pytorch-lifestream
  Downloading pytorch-lifestream-0.6.0.tar.gz (163 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.4/163.4 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting hydra-core>=1.1.2 (from pytorch-lifestream)
  Downloading hydra_core-1.3.2-py3-none-any.whl.metadata (5.5 kB)
Downloading hydra_core-1.3.2-py3-none-any.whl (154 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.5/154.5 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pytorch-lifestream
  Building wheel for pytorch-lifestream (pyproject.toml) ... [?25l[?25hdone
  Created wheel for pytorch-lifestream: filename=pytorch_lifestream-0.6.0-py3-none-any.whl size=274670 sha256=ac168cfbfeabbaa3d77eb90a3ddca67b1e0822f1a9fb2ff2b

In [2]:
# data preprocessing
import os
import numpy as np 
import pandas as pd
import pickle

# misc
from tqdm import tqdm
from functools import partial

# logging
import comet_ml 

# classical ML
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from catboost import CatBoostClassifier

# basic deep learning libs
import torch
import pytorch_lightning as pl
import torchmetrics

# ptls
from ptls.nn import TrxEncoder, RnnSeqEncoder, TransformerEncoder, GptEncoder, Head
from ptls.frames import PtlsDataModule
from ptls.frames.coles import CoLESModule
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames.cpc import CpcModule
from ptls.frames.cpc import CpcDataset
from ptls.frames.gpt import GptDataset
from ptls.frames.supervised import SeqToTargetDataset, SequenceToTarget
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.datasets import inference_data_loader
from ptls.frames.inference_module import InferenceModule
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.preprocessing import PandasDataPreprocessor
from ptls.data_load.utils import collate_feature_dict
from ptls.frames.inference_module import InferenceModule
from ptls.frames.coles.losses.softmax_loss import SoftmaxLoss

In [3]:
def seed_everything(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [5]:
comet_ml.login()

In [6]:
from pytorch_lightning.loggers import CometLogger

---

**Time2Vec:**

In [7]:
import torch
from ptls.data_load.padded_batch import PaddedBatch
from ptls.nn.trx_encoder.batch_norm import RBatchNorm, RBatchNormWithLens
from ptls.nn.trx_encoder.noisy_embedding import NoisyEmbedding
from ptls.nn.trx_encoder.trx_encoder_base import TrxEncoderBase
import torch.nn as nn


class Time2Vec(nn.Module):
    def __init__(self, k, interval=86400):
        super(Time2Vec, self).__init__()
        self.k = k
        self.w = nn.Parameter(torch.randn(k))
        self.b = nn.Parameter(torch.randn(k))
        self.w0 = nn.Parameter(torch.randn(1))
        self.b0 = nn.Parameter(torch.randn(1))
        self.interval = interval
        
    def forward(self, event_time, t0):
        t0_ = torch.zeros_like(event_time)
        time_diff=None
        if type(t0)!=int:
            first_column = t0[:, 0].unsqueeze(1)
            t0_ = first_column.expand(-1, t0.size(1))
        time_diff = (event_time - t0_)/self.interval
        v1 = self.w0 * time_diff.unsqueeze(-1) + self.b0
        v2 = torch.cos(self.w * time_diff.unsqueeze(-1) + self.b)
        
        return torch.cat([v1, v2], -1)

        
class TrxEncoderT2V(TrxEncoderBase):
    def __init__(self,
                 embeddings=None,
                 numeric_values=None,
                 custom_embeddings=None,
                 time_values=None,
                 embeddings_noise: float = 0,
                 norm_embeddings=None,
                 use_batch_norm=True,
                 use_batch_norm_with_lens=False,
                 clip_replace_value=None,
                 positions=None,
                 emb_dropout=0,
                 spatial_dropout=False,
                 orthogonal_init=False,
                 linear_projection_size=0,
                 out_of_index: str = 'clip',
                 k=2,
                 time_col='event_time'
                 ):
        if clip_replace_value is not None:
            warnings.warn('`clip_replace_value` attribute is deprecated. Always "clip to max" used. '
                          'Use `out_of_index="assert"` to avoid categorical values clip', DeprecationWarning)

        if positions is not None:
            warnings.warn('`positions` is deprecated. positions is not used', UserWarning)

        if embeddings is None:
            embeddings = {}
        if custom_embeddings is None:
            custom_embeddings = {}
        if time_values is None:
            time_values = {}

        noisy_embeddings = {}
        for emb_name, emb_props in embeddings.items():
            if emb_props.get('disabled', False):
                continue
            if emb_props['in'] == 0 or emb_props['out'] == 0:
                continue
            noisy_embeddings[emb_name] = NoisyEmbedding(
                num_embeddings=emb_props['in'],
                embedding_dim=emb_props['out'],
                padding_idx=0,
                max_norm=1 if norm_embeddings else None,
                noise_scale=embeddings_noise,
                dropout=emb_dropout,
                spatial_dropout=spatial_dropout,
            )

        super().__init__(
            embeddings=noisy_embeddings,
            numeric_values=numeric_values,
            custom_embeddings=custom_embeddings,
            out_of_index=out_of_index,
        )

        custom_embedding_size = self.custom_embedding_size
        if use_batch_norm and custom_embedding_size > 0:
            # :TODO: Should we use Batch norm with not-numerical custom embeddings?
            if use_batch_norm_with_lens:
                self.custom_embedding_batch_norm = RBatchNormWithLens(custom_embedding_size)
            else:
                self.custom_embedding_batch_norm = RBatchNorm(custom_embedding_size)
        else:
            self.custom_embedding_batch_norm = None
        
        self.k = k
        self.time2vec_days = Time2Vec(k=self.k)
        self.time_col = time_col
        
        if linear_projection_size > 0:
            self.linear_projection_head = torch.nn.Linear(super().output_size+k+1, linear_projection_size)
        else:
            self.linear_projection_head = None
            

        if orthogonal_init:
            for n, p in self.named_parameters():
                if n.startswith('embeddings.') and n.endswith('.weight'):
                    torch.nn.init.orthogonal_(p.data[1:])
                if n == 'linear_projection_head.weight':
                    torch.nn.init.orthogonal_(p.data)

    def forward(self, x: PaddedBatch):
        processed_embeddings = []
        processed_custom_embeddings = []

        for field_name in self.embeddings.keys():
            processed_embeddings.append(self.get_category_embeddings(x, field_name))
        
        for field_name in self.custom_embeddings.keys():
            processed_custom_embeddings.append(self.get_custom_embeddings(x, field_name))

        if len(processed_custom_embeddings):
            processed_custom_embeddings = torch.cat(processed_custom_embeddings, dim=2)
            if self.custom_embedding_batch_norm is not None:
                processed_custom_embeddings = PaddedBatch(processed_custom_embeddings, x.seq_lens)
                processed_custom_embeddings = self.custom_embedding_batch_norm(processed_custom_embeddings)
                processed_custom_embeddings = processed_custom_embeddings.payload
            processed_embeddings.append(processed_custom_embeddings)

        out = torch.cat(processed_embeddings, dim=2)

        time_encoded_days = self.time2vec_days(x.payload[self.time_col], x.payload[self.time_col])
        out = torch.cat((out, time_encoded_days), dim=2)

        if self.linear_projection_head is not None:
            out = self.linear_projection_head(out)
        return PaddedBatch(out, x.seq_lens)

    @property
    def output_size(self):
        """Returns hidden size of output representation
        """
        if self.linear_projection_head is not None:
            return self.linear_projection_head.out_features
        return super().output_size + self.k + 1

---

**SWIN1D_Encoder (orig. implementation by Yukara Ikemiya):**

In [8]:
#------------------------------------------------------------------------------------------------------------
# Based on https://github.com/yukara-ikemiya/Swin-Transformer-1d/tree/main and adapted to pytorch-lifestream
#------------------------------------------------------------------------------------------------------------

import torch
import torch.nn as nn
from ptls.data_load.padded_batch import PaddedBatch


def drop_path(x, drop_prob: float = 0., training: bool = False, scale_by_keep: bool = True):
    # copied from timm/models/layers/drop.py
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
    the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
    changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
    'survival rate' as the argument.

    """
    if drop_prob == 0. or not training:
        return x
    keep_prob = 1 - drop_prob
    shape = (x.shape[0],) + (1,) * (x.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
    if keep_prob > 0.0 and scale_by_keep:
        random_tensor.div_(keep_prob)
    return x * random_tensor


class DropPath(nn.Module):
    # copied from timm/models/layers/drop.py
    """Drop paths (Stochastic Depth) per sample  (when applied in main path of residual blocks).
    """

    def __init__(self, drop_prob=None, scale_by_keep=True):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob
        self.scale_by_keep = scale_by_keep

    def forward(self, x):
        return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)


class Mlp(nn.Module):
    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x


def window_partition(x, window_size):
    """
    Args:
        x: (B, L, C)
        window_size (int): window size

    Returns:
        windows: (num_windows*B, window_size, C)
    """
    B, L, C = x.shape
    x = x.view(B, L // window_size, window_size, C)
    windows = x.contiguous().view(-1, window_size, C)
    return windows


def window_reverse(windows, window_size, L):
    """
    Args:
        windows: (num_windows*B, window_size, C)
        window_size (int): Window size
        L (int): Length of data

    Returns:
        x: (B, L, C)
    """
    B = int(windows.shape[0] / (L / window_size))
    x = windows.view(B, L // window_size, window_size, -1)
    x = x.contiguous().view(B, L, -1)
    return x


class WindowAttention(nn.Module):
    r""" Window based multi-head self attention (W-MSA) module with relative position bias.
    It supports both of shifted and non-shifted window.

    Args:
        dim (int): Number of input channels.
        window_size (int): The width of the window.
        num_heads (int): Number of attention heads.
        qkv_bias (bool, optional):  If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
    """

    def __init__(self, dim: int, window_size: int, num_heads: int,
                 qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.):

        super().__init__()
        self.dim = dim
        self.window_size = window_size
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim ** -0.5

        # define a parameter table of relative position bias
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros(2 * window_size - 1, num_heads))  # 2*window_size - 1, nH

        # get pair-wise relative position index for each token inside the window
        coords_w = torch.arange(self.window_size)
        relative_coords = coords_w[:, None] - coords_w[None, :]  # W, W
        relative_coords[:, :] += self.window_size - 1  # shift to start from 0

        # relative_position_index | example
        # [2, 1, 0]
        # [3, 2, 1]
        # [4, 3, 2]
        self.register_buffer("relative_position_index", relative_coords)  # (W, W): range of 0 -- 2*(W-1)

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        torch.nn.init.trunc_normal_(self.relative_position_bias_table, std=.02)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, mask_add, mask_mult):
        """
        Args:
            x: input features with shape of (num_windows*B, W, C)
            mask: (0/-inf) mask with shape of (num_windows, W, W) or None
        """
        B_, N, C = x.shape
        qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)

        q = q * self.scale
        attn = (q @ k.transpose(-2, -1))

        relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
            self.window_size, self.window_size, -1)  # W, W, nH
        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, W, W
        attn = attn + relative_position_bias.unsqueeze(0)

        nW = mask_add.shape[1]
        attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask_add
        attn = attn.view(-1, self.num_heads, N, N)
        attn = self.softmax(attn)
        attn = attn * mask_mult
        
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
        
        x = self.proj(x)
        x = self.proj_drop(x)
        return x

    def extra_repr(self) -> str:
        return f'dim={self.dim}, window_size={self.window_size}, num_heads={self.num_heads}'


class SwinTransformerBlock(nn.Module):
    r""" Swin Transformer Block.

    Args:
        dim (int): Number of input channels.
        num_heads (int): Number of attention heads.
        window_size (int): Window size.
        shift_size (int): Shift size for SW-MSA.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float, optional): Stochastic depth rate. Default: 0.0
        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
        decoder (bool, optional): Flag that shows whether this block is decoder-like (hence, attn_mask should prevent from seeing future tokens). True => decoder-like; False => encoder-like. Default: False
        start_end_fusion (bool, optional): Flag that shows if the last and the first half-windows should merge (True) or not (False).
    """

    def __init__(self, dim, num_heads, window_size=7, shift_size=0,
                 mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0.,
                 act_layer=nn.GELU, norm_layer=nn.LayerNorm,
                 decoder=False, start_end_fusion=True):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.window_size = window_size
        self.shift_size = shift_size
        self.mlp_ratio = mlp_ratio
        assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"

        self.norm1 = norm_layer(dim)
        self.attn = WindowAttention(
            dim, window_size=self.window_size, num_heads=num_heads,
            qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)

        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)

        attn_mask = None
        self.register_buffer("attn_mask", attn_mask)

        self.decoder = decoder
        self.start_end_fusion = start_end_fusion

    def forward(self, x):
        seq_lens = x.seq_lens
        x = x.payload
        
        B, L, C = x.shape

        # define seq_len_mask
        mask = torch.arange(L, device=x.device)[None, :] + torch.ones((B, L), device=x.device)
        mask[mask > seq_lens[:, None]] = 0.
        mask[mask > 0.] = 1.
        mask = mask[:, :, None]

        # make new max seq_len `L` divisible by `self.window_size` by adding 'zero' samples
        num_samples_to_add = self.window_size - (L % self.window_size)
        
        if num_samples_to_add < self.window_size:
            additional_samples = torch.zeros((B, num_samples_to_add, C), device=x.device)
            x = torch.cat((x, additional_samples), dim=1)
            mask_additional_samples = torch.zeros((B, num_samples_to_add, mask.shape[2]), device=mask.device)
            mask = torch.cat((mask, mask_additional_samples), dim=1)
            L += num_samples_to_add

        # zero out padding transactions
        x = x * mask
        
        assert L >= self.window_size, f'input length ({L}) must be >= window size ({self.window_size})'
        assert L % self.window_size == 0, f'input length ({L}) must be divisible by window size ({self.window_size})'

        shortcut = x
        x = self.norm1(x)

        # shift
        if self.shift_size > 0:
            shifted_x = torch.roll(x, shifts=-self.shift_size, dims=1) # cyclic shift 
            if not self.start_end_fusion:
                shifted_x[:, -self.shift_size:] = 0. # zero out invalid embs
            mask = torch.roll(mask, shifts=-self.shift_size, dims=1) # cyclic shift of the mask
            if not self.start_end_fusion:
                mask[:, -self.shift_size:] = 0.
        else:
            shifted_x = x
        
        # partition
        x_windows = window_partition(shifted_x, self.window_size)  # nW*B, window_size, C
        mask = window_partition(mask, self.window_size) # nW*B, window_size, 1
        
        # calculate attn_mask
        attn_mask = (mask @ mask.transpose(-2, -1)) # nW*B, window_size, window_size
        
        if self.decoder:
            no_look_ahead_attn_mask = 1. - torch.triu(torch.ones_like(attn_mask), diagonal=1)
            attn_mask *= no_look_ahead_attn_mask
        
        attn_mask_real = attn_mask.clone().detach()
        attn_mask_real = attn_mask_real.view(attn_mask_real.shape[0], self.window_size, self.window_size).unsqueeze(1).expand(-1, self.num_heads, -1, -1) # B*nW, nH, window_size, window_size
        
        attn_mask[attn_mask == 0.] = -torch.inf
        attn_mask[attn_mask == 1.] = 0.
        attn_mask[:, torch.arange(attn_mask.shape[-1]), torch.arange(attn_mask.shape[-1])] = 0.
        attn_mask = attn_mask.view(B, attn_mask.shape[0] // B, self.window_size, self.window_size).unsqueeze(2).expand(-1, -1, self.num_heads, -1, -1) # B, nW, nH, window_size, window_size
        
        # W-MSA/SW-MSA
        attn_windows = self.attn(x_windows, mask_add=attn_mask, mask_mult=attn_mask_real)  # nW*B, window_size, C
        
        # merge windows
        shifted_x = window_reverse(attn_windows, self.window_size, L)  # (B, L, C)

        # reverse zero-padding shift
        if self.shift_size > 0:
            x = torch.roll(shifted_x, shifts=self.shift_size, dims=1) # cyclic shift
            if not self.start_end_fusion:
                x[:, :self.shift_size] = 0. # zero out invalid embs
        else:
            x = shifted_x

        x = shortcut + self.drop_path(x)

        # FFN
        x = x + self.drop_path(self.mlp(self.norm2(x)))
        
        return PaddedBatch(x, seq_lens)

    def extra_repr(self) -> str:
        return f"dim={self.dim}, num_heads={self.num_heads}, " \
               f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}"


class SwinTransformerLayer(nn.Module):
    """ A basic Swin Transformer layer for one stage.

    Args:
        dim (int): Number of input channels.
        depth (int): Number of blocks.
        num_heads (int): Number of attention heads.
        window_size (int): Local window size.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
        decoder (bool, optional): Flag that shows whether blocks in this layer are decoder-like. True => decoder-like; False => encoder-like. Default: False
        start_end_fusion (bool, optional): Flag that shows if the last and the first half-windows should merge (True) or not (False).
    """

    def __init__(
        self,
        dim: int,
        depth: int,
        num_heads: int,
        window_size: int,
        mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0.,
        drop_path=0., norm_layer=nn.LayerNorm,
        decoder=False, start_end_fusion=True
    ):
        super().__init__()
        self.dim = dim
        self.depth = depth
        self.num_heads = num_heads
        self.window_size = window_size

        # build blocks
        self.blocks = nn.ModuleList([
            SwinTransformerBlock(dim=dim,
                                 num_heads=num_heads, window_size=window_size,
                                 shift_size=0 if (i % 2 == 0) else window_size // 2,
                                 mlp_ratio=mlp_ratio,
                                 qkv_bias=qkv_bias, qk_scale=qk_scale,
                                 drop=drop, attn_drop=attn_drop,
                                 drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
                                 norm_layer=norm_layer,
                                 decoder=decoder,
                                 start_end_fusion=start_end_fusion)
            for i in range(depth)])

    def forward(self, x):
        for blk in self.blocks:
            x = blk(x)
        return x

    def extra_repr(self) -> str:
        return f"dim={self.dim}, depth={self.depth}, num_heads={self.num_heads}, window_size={self.window_size}"


class SwinTransformerBackbone(nn.Module):
    """ Swin Transformer Backbone (4 stages as in orig. 2D impl.).

    Args:
        dim (int): Number of input channels.
        depths (list[int]): Numbers of blocks in stages.
        num_heads (int): Number of attention heads in W-MSA layers.
        start_window_size (int): Local window size of stage 1.
        window_size_mult (int): the number by which the `window_size` is being multiplied when moving to another stage
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
        decoder (bool, optional): Flag that shows whether blocks in this backbone are decoder-like. True => decoder-like; False => encoder-like. Default: False
        start_end_fusion (bool, optional): Flag that shows if the last and the first half-windows should merge (True) or not (False).
    """
    def __init__(
        self,
        dim: int,
        depths: list[int],
        num_heads,
        start_window_size: int,
        window_size_mult: int = 1,
        mlp_ratio=4.,
        qkv_bias=True,
        qk_scale=None,
        drop=0.,
        attn_drop=0.,
        drop_path=0.,
        norm_layer=nn.LayerNorm,
        decoder=False,
        start_end_fusion=True
    ):
        super().__init__()
        self.dim = dim
        self.depths = depths
        
        if type(num_heads) == int:
            self.num_heads = [num_heads] * len(depths)
        else:
            self.num_heads = num_heads
        
        self.window_sizes = [start_window_size]
        
        for i in range(len(self.depths) - 1):
            self.window_sizes += [self.window_sizes[-1] * window_size_mult]

        # build model
        self.backbone = nn.ModuleList([
            SwinTransformerLayer(dim=self.dim,
                                 depth=self.depths[i],
                                 num_heads=self.num_heads[i],
                                 window_size=self.window_sizes[i],
                                 mlp_ratio=mlp_ratio,
                                 qkv_bias=qkv_bias,
                                 qk_scale=qk_scale,
                                 drop=drop,
                                 attn_drop=attn_drop,
                                 drop_path=drop_path,
                                 norm_layer=norm_layer,
                                 decoder=decoder,
                                 start_end_fusion=start_end_fusion)
            for i in range(len(self.depths))])

    def forward(self, x):
        for layer in self.backbone:
            x = layer(x)
        return x

In [9]:
def change_to_enc(swin_model):
    for i in range(len(swin_model.backbone)):
        for j in range(len(swin_model.backbone[i].blocks)):
            swin_model.backbone[i].blocks[j].decoder = False

def change_to_dec(swin_model):
    for i in range(len(swin_model.backbone)):
        for j in range(len(swin_model.backbone[i].blocks)):
            swin_model.backbone[i].blocks[j].decoder = True

# Эксперименты.

**Данные:**

In [10]:
path_data = "https://huggingface.co/datasets/dllllb/rosbank-churn/resolve/main/train.csv.gz?download=true"
data = pd.read_csv(path_data, compression="gzip")
data

Unnamed: 0,PERIOD,cl_id,MCC,channel_type,currency,TRDATETIME,amount,trx_category,target_flag,target_sum
0,01/10/2017,0,5200,,810,21OCT17:00:00:00,5023.00,POS,0,0.0
1,01/10/2017,0,6011,,810,12OCT17:12:24:07,20000.00,DEPOSIT,0,0.0
2,01/12/2017,0,5921,,810,05DEC17:00:00:00,767.00,POS,0,0.0
3,01/10/2017,0,5411,,810,21OCT17:00:00:00,2031.00,POS,0,0.0
4,01/10/2017,0,6012,,810,24OCT17:13:14:24,36562.00,C2C_OUT,0,0.0
...,...,...,...,...,...,...,...,...,...,...
490508,01/04/2017,10176,6011,type1,810,24APR17:14:05:26,600.00,WD_ATM_ROS,1,405.0
490509,01/06/2017,10171,5411,type1,810,06JUN17:00:00:00,132.00,POS,0,0.0
490510,01/02/2017,10167,5541,type1,810,03FEB17:00:00:00,1000.00,POS,1,280428.2
490511,01/06/2017,10163,5941,type1,810,08JUN17:00:00:00,100.00,POS,0,0.0


In [11]:
target = data.groupby(by="cl_id").first().reset_index()[["cl_id", "target_flag"]]
target

Unnamed: 0,cl_id,target_flag
0,0,0
1,1,0
2,5,1
3,9,0
4,10,0
...,...,...
4995,10210,1
4996,10212,0
4997,10213,0
4998,10214,0


In [12]:
data.drop(columns=["PERIOD", "target_flag", "target_sum"], inplace=True)

In [13]:
target_train, target_test = train_test_split(target, test_size=0.1, stratify=target["target_flag"], random_state=42)

In [14]:
trx_data_train = pd.merge(data, target_train["cl_id"], on="cl_id", how="inner")
trx_data_test = pd.merge(data, target_test["cl_id"], on="cl_id", how="inner")

In [15]:
trx_data_train["channel_type"] = trx_data_train["channel_type"].fillna("none")
trx_data_test["channel_type"] = trx_data_test["channel_type"].fillna("none")

In [16]:
month2num = {"JAN": "/01/", "FEB": "/02/", "MAR": "/03/", "APR": "/04/", "MAY": "/05/", "JUN": "/06/",
             "JUL": "/07/", "AUG": "/08/", "SEP": "/09/", "OCT": "/10/", "NOV": "/11/", "DEC": "/12/"}

trx_data_train["TRDATETIME"] = trx_data_train["TRDATETIME"].map(lambda x: x[0:2] + month2num[x[2:5]] + x[5:7] + " " + x[8:])
trx_data_test["TRDATETIME"] = trx_data_test["TRDATETIME"].map(lambda x: x[0:2] + month2num[x[2:5]] + x[5:7] + " " + x[8:])

trx_data_train["TRDATETIME"] = pd.to_datetime(trx_data_train["TRDATETIME"],format='%d/%m/%y %H:%M:%S')
trx_data_test["TRDATETIME"] = pd.to_datetime(trx_data_test["TRDATETIME"],format='%d/%m/%y %H:%M:%S')

In [17]:
chtype2num = {"none": 0, "type1": 1, "type2": 2, "type3": 3, "type4": 4, "type5": 5}

trx_data_train["channel_type"] = trx_data_train["channel_type"].map(lambda x: chtype2num[x])
trx_data_test["channel_type"] = trx_data_test["channel_type"].map(lambda x: chtype2num[x])

In [18]:
trxcat2num = {"POS": 0, "DEPOSIT": 1, "WD_ATM_ROS": 2, "WD_ATM_PARTNER": 3, 
              "C2C_IN": 4, "WD_ATM_OTHER": 5, "C2C_OUT": 6, "BACK_TRX": 7,
              "CAT": 8, "CASH_ADV": 9}

trx_data_train["trx_category"] = trx_data_train["trx_category"].map(lambda x: trxcat2num[x])
trx_data_test["trx_category"] = trx_data_test["trx_category"].map(lambda x: trxcat2num[x])

---

**Квантизация непрерывных признаков (опциональный шаг, нужен только для GPT):**

In [17]:
def digitize(input_array: np.array, q_count: int = 1, bins: np.array = None):
    """Quantile-based discretization function.

    Parameters:
    -------
    input_array (np.array): Input array.
    q_count (int): Amount of quantiles. Used only if input parameter `bins` is None.
    bins (np.array):
        If None, then calculate bins as quantiles of input array,
        otherwise only apply bins to input_array. Default: None

    Returns
    -------
    out_array (np.array of ints): discretized input_array
    bins (np.array of floats):
        Returned only if input parameter `bins` is None.
    """

    if bins is None:
        return_bins = True
        bins = np.quantile(input_array, q=[i / q_count for i in range(1, q_count)], axis=0)
    else:
        return_bins = False

    out_array = np.digitize(input_array, bins)

    if return_bins:
        return out_array, bins
    else:
        return out_array

In [18]:
BINS_NUM = 128

In [19]:
numeric_features = ["amount"]

for feat in numeric_features:
    trx_data_train[feat], bins = digitize(trx_data_train[feat], q_count=BINS_NUM)
    trx_data_test[feat] = digitize(trx_data_test[feat], bins=bins)

In [20]:
import gc

gc.collect()

147

---

In [19]:
preprocessor = PandasDataPreprocessor(
    col_id="cl_id",
    col_event_time="TRDATETIME",
    event_time_transformation="dt_to_timestamp",
    cols_category=["MCC", "channel_type", "currency", "trx_category"],
    cols_numerical=["amount"],
    return_records=False,
)

In [20]:
data_train = preprocessor.fit_transform(trx_data_train)
data_test = preprocessor.transform(trx_data_test)

In [21]:
target_train.rename(columns={"target_flag": "target"}, inplace=True)
target_test.rename(columns={"target_flag": "target"}, inplace=True)
target_train.sort_values(by="cl_id", inplace=True)
target_test.sort_values(by="cl_id", inplace=True)
target_train = target_train["target"]
target_test = target_test["target"]
target_train.reset_index(drop=True, inplace=True)
target_test.reset_index(drop=True, inplace=True)

In [22]:
data_train = data_train.to_dict(orient="records")
data_test = data_test.to_dict(orient="records")

---

**Определение бинов для time diff'ов (в часах) (опциональный шаг, нужен только для TD-GPT):**

In [25]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)
SECONDS_IN_HOUR = 3600
TIME_DIFF_BINS = 256

time_diffs = []

for batch in tqdm(train_loader):
    timestamps = batch.payload['event_time']
    timestamps_prev = torch.cat([timestamps[:, 0].unsqueeze(1), timestamps[:, :-1]], dim=1)
    batch.payload['time_diff'] = (timestamps - timestamps_prev) // SECONDS_IN_HOUR
    batch.payload['time_diff'][:, 0] = -1

    mask = torch.arange(batch.payload['time_diff'].shape[1], device=batch.device)[None, :] + torch.ones((batch.seq_lens.shape[0], batch.payload['time_diff'].shape[1]), device=batch.device)
    mask[mask > batch.seq_lens[:, None]] = 0.
    mask[mask > 0.] = 1.
    mask = mask.bool()

    batch.payload['time_diff'][~mask] = -1
    
    time_diffs += [batch.payload['time_diff'][batch.payload['time_diff'] != -1].numpy()]
    
time_diffs = np.concatenate(time_diffs)

time_diff_bins = np.quantile(time_diffs, q=[(i / TIME_DIFF_BINS) for i in range(1, TIME_DIFF_BINS)], axis=0)

36it [00:00, 98.54it/s] 


In [26]:
time_diff_bins

array([  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   1.,   2.,   3.,
         4.,   5.,   6.,   7.,   7.,   8.,   9.,   

In [27]:
time_diff_bins = list(set(time_diff_bins.tolist()))
time_diff_bins.sort()
time_diff_bins = torch.tensor(time_diff_bins, dtype=torch.int)
time_diff_bins

tensor([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
         14,  15,  16,  17,  18,  20,  22,  24,  26,  31,  35,  38,  44,  48,
         54,  62,  72,  82,  96, 114, 120, 144, 168, 216, 300, 458],
       dtype=torch.int32)

In [28]:
TIME_DIFF_BINS_NUM = len(time_diff_bins)

TIME_DIFF_BINS_NUM

40

**Тест:**

In [29]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)
SECONDS_IN_HOUR = 3600

for batch in tqdm(train_loader):
    timestamps = batch.payload['event_time']
    timestamps_prev = torch.cat([timestamps[:, 0].unsqueeze(1), timestamps[:, :-1]], dim=1)
    batch.payload['time_diff'] = (timestamps - timestamps_prev) // SECONDS_IN_HOUR
    batch.payload['time_diff'][:, 0] = -1

    mask = torch.arange(batch.payload['time_diff'].shape[1], device=batch.device)[None, :] + torch.ones((batch.seq_lens.shape[0], batch.payload['time_diff'].shape[1]), device=batch.device)
    mask[mask > batch.seq_lens[:, None]] = 0.
    mask[mask > 0.] = 1.
    mask = mask.bool()

    batch.payload['time_diff'][~mask] = -1

    print(torch.bucketize(batch.payload['time_diff'], time_diff_bins, right=True))

20it [00:00, 93.69it/s]

tensor([[ 0, 37,  1,  ...,  0,  0,  0],
        [ 0,  1, 22,  ...,  0,  0,  0],
        [ 0, 12, 37,  ...,  0,  0,  0],
        ...,
        [ 0, 22,  1,  ...,  0,  0,  0],
        [ 0,  1,  1,  ...,  0,  0,  0],
        [ 0,  1, 22,  ...,  0,  0,  0]])
tensor([[ 0,  1,  1,  ...,  0,  0,  0],
        [ 0,  1, 22,  ...,  0,  0,  0],
        [ 0,  1, 22,  ...,  0,  0,  0],
        ...,
        [ 0,  2,  1,  ...,  0,  0,  0],
        [ 0,  1,  1,  ...,  0,  0,  0],
        [ 0,  1, 39,  ...,  0,  0,  0]])
tensor([[ 0, 22,  1,  ...,  0,  0,  0],
        [ 0,  1, 22,  ...,  0,  0,  0],
        [ 0, 22,  1,  ...,  0,  0,  0],
        ...,
        [ 0, 11, 21,  ...,  0,  0,  0],
        [ 0,  1, 22,  ...,  0,  0,  0],
        [ 0,  1, 32,  ...,  0,  0,  0]])
tensor([[ 0, 22, 17,  ...,  0,  0,  0],
        [ 0, 40, 39,  ...,  0,  0,  0],
        [ 0, 28,  1,  ...,  0,  0,  0],
        ...,
        [ 0,  1, 31,  ...,  0,  0,  0],
        [ 0, 40, 39,  ...,  0,  0,  0],
        [ 0, 19,  6,  ...

36it [00:00, 95.76it/s]

tensor([[ 0,  1, 28,  ...,  0,  0,  0],
        [ 0,  1, 15,  ...,  0,  0,  0],
        [ 0,  1,  1,  ...,  0,  0,  0],
        ...,
        [ 0,  1, 11,  ...,  0,  0,  0],
        [ 0, 14, 40,  ...,  0,  0,  0],
        [ 0,  1, 12,  ...,  0,  0,  0]])
tensor([[ 0,  8,  1,  ...,  0,  0,  0],
        [ 0, 21, 39,  ...,  0,  0,  0],
        [ 0,  1,  1,  ...,  0,  0,  0],
        ...,
        [ 0,  1,  1,  ...,  0,  0,  0],
        [ 0,  1, 15,  ...,  0,  0,  0],
        [ 0,  1,  1,  ...,  0,  0,  0]])
tensor([[ 0, 17,  8,  ...,  0,  0,  0],
        [ 0, 22, 28,  ...,  0,  0,  0],
        [ 0,  1, 10,  ...,  0,  0,  0],
        ...,
        [ 0,  1,  1,  ...,  0,  0,  0],
        [ 0,  1, 18,  ...,  0,  0,  0],
        [ 0,  1,  1,  ...,  0,  0,  0]])
tensor([[ 0, 22,  1,  ...,  0,  0,  0],
        [ 0,  1,  1,  ...,  0,  0,  0],
        [ 0,  1,  1,  ...,  0,  0,  0],
        ...,
        [ 0,  1, 10,  ...,  0,  0,  0],
        [ 0,  1,  1,  ...,  0,  0,  0],
        [ 0,  1, 17,  ...




---

**SWIN-RNN Seq Encoder:**

In [23]:
from ptls.nn.seq_encoder.rnn_encoder import RnnEncoder
from ptls.nn.seq_encoder.containers import SeqEncoderContainer


class SWIN_RNN_SeqEncoder(SeqEncoderContainer):
    """SeqEncoderContainer with SWIN transformer backbone for features hierarchic fusion and RnnEncoder for feature aggregation.
    
    Parameters
        trx_encoder:
            TrxEncoder object
        input_size:
            input_size parameter for RnnEncoder
            If None: input_size = trx_encoder.output_size
            Set input_size explicitly or use None if your trx_encoder object has output_size attribute
        is_reduce_sequence:
            False - returns PaddedBatch with all transactions embeddings
            True - returns one embedding for sequence based on CLS token
        swin_depths: Numbers of blocks in stages (SWIN backbone).
        swin_num_heads: Number of attention heads in W-MSA layers (SWIN backbone).
        swin_start_window_size: Local window size of stage 1 (SWIN backbone).
        swin_window_size_mult (int): the number by which the `window_size` is being multiplied when moving to another stage (SWIN backbone).
        swin_drop: Dropout rate (SWIN backbone). Default: 0.0
        swin_attn_drop: Attention dropout rate (SWIN backbone). Default: 0.0
        swin_drop_path: Stochastic depth rate (SWIN backbone). Default: 0.0
        swin_decoder: Flag that shows whether blocks in SWIN backbone are decoder-like. True => decoder-like; False => encoder-like. Default: False
        swin_start_end_fusion: Flag that shows if the last and the first half-windows should merge (True) or not (False). Must be False for CPC and GPT.
        **rnn_seq_encoder_params:
            RnnEncoder params
    """
    def __init__(self,
                 trx_encoder=None,
                 input_size=None,
                 is_reduce_sequence=True,
                 swin_depths=[],
                 swin_num_heads=4,
                 swin_start_window_size=4,
                 swin_window_size_mult=1,
                 swin_drop=0.,
                 swin_attn_drop=0.,
                 swin_drop_path=0.,
                 swin_decoder=False,
                 swin_start_end_fusion=True,
                 **rnn_seq_encoder_params
                 ):
        super().__init__(
            trx_encoder=trx_encoder,
            seq_encoder_cls=RnnEncoder,
            input_size=input_size,
            seq_encoder_params=rnn_seq_encoder_params,
            is_reduce_sequence=is_reduce_sequence,
        )
        self.swin_fusion = SwinTransformerBackbone(
                               dim=trx_encoder.output_size,
                               depths=swin_depths,
                               num_heads=swin_num_heads,
                               start_window_size=swin_start_window_size,
                               window_size_mult=swin_window_size_mult,
                               drop=swin_drop,
                               attn_drop=swin_attn_drop,
                               drop_path=swin_drop_path,
                               decoder=swin_decoder,
                               start_end_fusion=swin_start_end_fusion 
                              )

    def forward(self, x, names=None, seq_len=None, h_0=None):
        x = self.trx_encoder(x)
        x = self.swin_fusion(x)
        x = self.seq_encoder(x, h_0)
        return x

---

**SWIN Seq Encoder:**

In [24]:
from ptls.data_load.padded_batch import PaddedBatch
from ptls.nn.seq_encoder.abs_seq_encoder import AbsSeqEncoder
from ptls.nn.seq_encoder.containers import SeqEncoderContainer


class SWIN_Encoder(AbsSeqEncoder):
    def __init__(self,
                 dim=0,
                 depths=[],
                 num_heads=4,
                 start_window_size=4,
                 window_size_mult=1,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 decoder=False,
                 start_end_fusion=True,
                 is_reduce_sequence=False
                 ):
        super().__init__(is_reduce_sequence=is_reduce_sequence)
        self.dim = dim
        self.swin_fusion = SwinTransformerBackbone(
                               dim=dim,
                               depths=depths,
                               num_heads=num_heads,
                               start_window_size=start_window_size,
                               window_size_mult=window_size_mult,
                               drop=drop,
                               attn_drop=attn_drop,
                               drop_path=drop_path,
                               decoder=decoder,
                               start_end_fusion=start_end_fusion 
                              )

    @property
    def embedding_size(self):
        return self.dim

    def forward(self, x):
        x = self.swin_fusion(x)

        if self.is_reduce_sequence:
            x = x.payload.sum(dim=1) / x.seq_lens.unsqueeze(-1)
        
        return x


class SWIN_SeqEncoder(torch.nn.Module):
    def __init__(self,
                 trx_encoder,
                 depths=[],
                 num_heads=4,
                 start_window_size=4,
                 window_size_mult=1,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 decoder=False,
                 start_end_fusion=True,
                 is_reduce_sequence=False
                 ):
        super().__init__()
        self.trx_encoder = trx_encoder
        self.seq_encoder = SWIN_Encoder(
            dim=trx_encoder.output_size,
            depths=depths,
            num_heads=num_heads,
            start_window_size=start_window_size,
            window_size_mult=window_size_mult,
            drop=drop,
            attn_drop=attn_drop,
            drop_path=drop_path,
            decoder=decoder,
            start_end_fusion=start_end_fusion,
            is_reduce_sequence=is_reduce_sequence
        )
        
    def forward(self, x):
        x = self.trx_encoder(x)
        x = self.seq_encoder(x)
        return x

    @property
    def embedding_size(self):
        return self.seq_encoder.embedding_size

In [25]:
from ptls.data_load.padded_batch import PaddedBatch
import torch.nn as nn


class ConvAggregator(TrxEncoderT2V):
    """The NN layer, a combination of TrxEncoder and Conv Layer (a window of #`agg_samples` transactions) 
       (works like nn.Sequential([TrxEncoder, Conv Window Aggregation])).
       
       The types of the input and output are `PaddedBatch` of shapes (B, L, T) and (B, L', T) respectively, where 
       B means batch_size,
       L/L' means the max length of a sequence of transactions in a batch (the length is the same as #trx)
       T means the dimension of a single transaction.

       Parameters
        agg_samples (int):
            The number of transactions in a sliding aggregation window (conv layer).

        use_window_attention (bool):
            If True, the attention layer will be applied to transactions in a sliding window before pooling.

        k (int):
            Number of periodic components in T2V time embeddings

        time_col (str):
            Name of the time column in data
            
        embeddings:
            You can find info about this param in TrxEncoder desc.
        
        numeric_values:
            You can find info about this param in TrxEncoder desc.

        embeddings_noise:
            You can find info about this param in TrxEncoder desc.
            
        emb_dropout:
            You can find info about this param in TrxEncoder desc.
            
        spatial_dropout:
            You can find info about this param in TrxEncoder desc.

        use_batch_norm:
            You can find info about this param in TrxEncoder desc.

        orthogonal_init:
            You can find info about this param in TrxEncoder desc.
            
        linear_projection_size:
            You can find info about this param in TrxEncoder desc.

        out_of_index:
            You can find info about this param in TrxEncoder desc.

        norm_embeddings:
            Keep default value for this parameter
        
        clip_replace_value:
            Not used. Keep default value for this parameter
        
        positions: 
            Not used. Keep default value for this parameter
       """

    def __init__(self,
                 agg_samples=3,
                 use_window_attention=False,
                 embeddings=None,
                 numeric_values=None,
                 custom_embeddings=None,
                 time_values=None,
                 embeddings_noise: float = 0,
                 norm_embeddings=None,
                 use_batch_norm=False,
                 use_batch_norm_with_lens=False,
                 clip_replace_value=None,
                 positions=None,
                 emb_dropout=0,
                 spatial_dropout=False,
                 orthogonal_init=False,
                 linear_projection_size=0,
                 out_of_index: str = 'clip',
                 k=2,
                 time_col='event_time'
                ):
        
        super().__init__(
            embeddings=embeddings,
            numeric_values=numeric_values,
            custom_embeddings=custom_embeddings,
            embeddings_noise=embeddings_noise,
            norm_embeddings=norm_embeddings,
            use_batch_norm=use_batch_norm,
            use_batch_norm_with_lens=use_batch_norm_with_lens,
            clip_replace_value=clip_replace_value,
            positions=positions,
            emb_dropout=emb_dropout,
            spatial_dropout=spatial_dropout,
            orthogonal_init=orthogonal_init,
            linear_projection_size=linear_projection_size,
            out_of_index=out_of_index,
            k=k,
            time_col=time_col
        )

        self.agg_samples = agg_samples

        channels = super().output_size

        self.conv = nn.Conv1d(in_channels=channels, out_channels=channels, kernel_size=self.agg_samples, padding=(self.agg_samples - 1), bias=False) # (B, T, L)

        self.use_window_attention = use_window_attention
        if self.use_window_attention:
            pass # Not Implemented

    def forward(self, pb: PaddedBatch):
        embeds = super().forward(pb)

        mask = torch.arange(embeds.payload.shape[1], device=embeds.device)[None, :] + torch.ones((embeds.seq_lens.shape[0], embeds.payload.shape[1]), device=embeds.device)
        mask[mask > embeds.seq_lens[:, None]] = 0.
        mask[mask > 0.] = 1.
        mask = mask[:, :, None]
    
        masked_embeds = embeds.payload * mask
    
        if self.use_window_attention:
            pass # Not Implemented
    
        agg_embeds = torch.transpose(self.conv(torch.transpose(masked_embeds, 1, 2)), 1, 2)

        new_seq_lens = embeds.seq_lens + self.agg_samples - 1

        return PaddedBatch(agg_embeds, new_seq_lens)

**Test:**

In [27]:
seed_everything(0)

In [28]:
device = "cuda:0"

In [31]:
trx_encoder_params = dict(
    embeddings={
        "MCC": {"in": 342, "out": 8},
        "channel_type": {"in": 7, "out": 8},
        "currency": {"in": 60, "out": 8},
        "trx_category": {"in": 11, "out": 8}            
    },
    numeric_values={"amount": "log"},
    embeddings_noise=0.003,
    linear_projection_size=64,
    k=7,
    time_col="event_time"
)

trx_encoder = TrxEncoderT2V(**trx_encoder_params).to(device)

seq_encoder = SWIN_RNN_SeqEncoder(
    trx_encoder=trx_encoder,
    swin_depths=[2, 2, 6, 2],
    swin_num_heads=[2, 4, 8, 16],
    swin_start_window_size=4,
    swin_window_size_mult=2,
    swin_drop=0.1,
    swin_attn_drop=0.1,
    swin_drop_path=0.1,
    swin_decoder=True,
    swin_start_end_fusion=False,
    hidden_size=512,
    type="gru").to(device)

# seq_encoder = SWIN_SeqEncoder(
#     trx_encoder=trx_encoder,
#     depths=[2, 2, 6, 2],
#     num_heads=[2, 4, 8, 16],
#     start_window_size=4,
#     window_size_mult=2,
#     drop=0.1,
#     attn_drop=0.1,
#     drop_path=0.1,
#     decoder=True,
#     start_end_fusion=False,
#     is_reduce_sequence=True).to(device)

In [33]:
from ptls.data_load.padded_batch import PaddedBatch

trx_encoder.eval()

train_loader = inference_data_loader(data_train, num_workers=0, batch_size=32)

for i, batch in tqdm(enumerate(train_loader)):
    batch = batch.to(device)
    embeds = seq_encoder(batch)
    
    if i == 0:
        #print(batch.payload)
        #print(batch.seq_lens)
        #print()
        #print(masked_embeds.payload[0, 4])
        #print()
        #print(embeds_batch.seq_lens)
        print(embeds)
        #print(embeds.shape)
        #print(embeds.seq_lens)

5it [00:00, 24.26it/s]

tensor([[ 0.3058,  0.0266, -0.4723,  ..., -0.2322, -0.5250, -0.6277],
        [ 0.4279,  0.6304, -0.9828,  ..., -0.9140, -0.9946, -0.8880],
        [ 0.2657,  0.6070, -0.9835,  ..., -0.8823, -0.9938, -0.9012],
        ...,
        [ 0.3871,  0.7265, -0.9873,  ..., -0.9603, -0.9937, -0.8249],
        [ 0.1095, -0.0019,  0.1778,  ...,  0.1959, -0.1310, -0.1935],
        [ 0.1517,  0.0067,  0.1989,  ..., -0.2095, -0.4819, -0.4876]],
       device='cuda:0', grad_fn=<IndexBackward0>)


141it [00:05, 25.29it/s]


In [34]:
# import gc
# #swin_backbone.cpu()
# #del swin_backbone
# del embeds
# del batch
# gc.collect()
# torch.cuda.empty_cache()

---

**Train sequences lengths check:**

In [None]:
trx_encoder_params = dict(
    embeddings={
        "MCC": {"in": 342, "out": 8},
        "channel_type": {"in": 7, "out": 8},
        "currency": {"in": 60, "out": 8},
        "trx_category": {"in": 11, "out": 8}            
    },
    numeric_values={"amount": "log"},
    embeddings_noise=0.003,
    linear_projection_size=64,
    k=7,
    time_col="event_time"
)

trx_encoder = TrxEncoderT2V(**trx_encoder_params)
trx_encoder.to("cuda")

In [23]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)

trx_encoder.eval()

seq_lens = []

for batch in tqdm(train_loader):
    embeds_batch = trx_encoder(batch.to("cuda"))
    seq_lens += [embeds_batch.seq_lens.detach().cpu().numpy()]

seq_lens = np.concatenate(seq_lens)

threshold = int(np.quantile(seq_lens, 0.75) * 0.7)

print("Max Length:", threshold)

36it [00:00, 48.00it/s]

Max Length: 100





---

# SWIN Aggregation 

- **COLES:**

In [202]:
seed_everything(0)

**DataLoaders:**

In [203]:
data = PtlsDataModule(
    train_data=ColesDataset(
        MemoryMapDataset(
            data=data_train,
            i_filters=[SeqLenFilter(min_seq_len=10)],
        ),
        splitter=SampleSlices(
            split_count=5,
            cnt_min=5,
            cnt_max=100,
        ),
    ),
    train_num_workers=4,
    train_batch_size=128,
    valid_data=ColesDataset(
        MemoryMapDataset(
            data=data_test,
            i_filters=[SeqLenFilter(min_seq_len=10)],
        ),
        splitter=SampleSlices(
            split_count=5,
            cnt_min=5,
            cnt_max=100,
        ),
    ),
    valid_num_workers=4,
    valid_batch_size=128
)

**Модель:**

In [204]:
N_EPOCHS = 20

In [205]:
trx_encoder_params = dict(
    embeddings={
        "MCC": {"in": 342, "out": 8},
        "channel_type": {"in": 7, "out": 8},
        "currency": {"in": 60, "out": 8},
        "trx_category": {"in": 11, "out": 8}            
    },
    numeric_values={"amount": "log"},
    embeddings_noise=0.003,
    linear_projection_size=64,
    k=7,
    time_col="event_time",
    agg_samples=3,
    use_window_attention=False
)

#trx_encoder = TrxEncoderT2V(**trx_encoder_params)
trx_encoder = ConvAggregator(**trx_encoder_params)

# seq_encoder = SWIN_SeqEncoder(
#     trx_encoder=trx_encoder,
#     depths=[2, 2, 6, 2],
#     num_heads=4,
#     start_window_size=4,
#     window_size_mult=2,
#     drop=0.1,
#     attn_drop=0.1,
#     drop_path=0.1,
#     decoder=False,
#     start_end_fusion=True,
#     is_reduce_sequence=True
# )

seq_encoder = SWIN_RNN_SeqEncoder(
    trx_encoder=trx_encoder,
    swin_depths=[2, 2, 6, 2],
    swin_num_heads=[2, 4, 8, 16],
    swin_start_window_size=4, # 2, 4
    swin_window_size_mult=2,
    swin_drop=0.1,
    swin_attn_drop=0.1,
    swin_drop_path=0.1,
    swin_decoder=False,
    swin_start_end_fusion=False,
    hidden_size=512,
    type="gru"
)

coles = CoLESModule(
    seq_encoder=seq_encoder,
    #loss=SoftmaxLoss(),
    optimizer_partial=partial(torch.optim.Adam, lr=1e-3, weight_decay=0.),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.CosineAnnealingLR, T_max=N_EPOCHS, eta_min=1e-6)
)

**Обучение:**

In [206]:
logger = CometLogger(project_name="evs-ssl-rb", experiment_name="CoLES_SWIN_agg (w/ conv_agg, 5trx)")

trainer = pl.Trainer(
    logger=logger,
    max_epochs=N_EPOCHS,
    accelerator="gpu",
    devices=1,
    enable_progress_bar=True
)

In [207]:
trainer.fit(coles, data)

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/askoro/evs-ssl-rb/632953b300fc4d5389bb5e4e2768aaae

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/kaggle/working' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]


The number of training batches (33) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.



Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : CoLES_SWIN_agg (w/ conv_agg, 5trx)
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/askoro/evs-ssl-rb/632953b300fc4d5389bb5e4e2768aaae
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     loss [79]               : (80.70063781738281, 808.0537109375)
[1;38;5;39mCOMET INFO:[0m     seq_len [13]            : (36.43281173706055, 41.673439025878906)
[1;38;5;39mCOMET INFO:[0m     valid/recall_top_k [20] : (0.08885176479816437, 0.6555304527282715)
[1;38;5;39mCOMET INFO:[0m   Others:
[1;38;5;3

In [208]:
trainer.logged_metrics

{'loss': tensor(76.2719),
 'seq_len': tensor(34.2066),
 'valid/recall_top_k': tensor(0.6555)}

In [28]:
torch.save(seq_encoder.state_dict(), "coles_enc_baseline_rosbank.pt")

**Измерим качество на тесте (catboost поверх эмбеддингов):**

In [None]:
# !wget "https://drive.google.com/uc?export=download&id=1Mn8o9IPT4Zzg3946orbw1MVZwpkrBoNb" -O "coles_enc_baseline.pt"

In [209]:
encoder = coles.seq_encoder

# state_dict = torch.load("./coles_enc_baseline.pt")
# encoder.load_state_dict(state_dict)

device = "cuda:0"

encoder.to(device)

SWIN_RNN_SeqEncoder(
  (trx_encoder): ConvAggregator(
    (embeddings): ModuleDict(
      (MCC): NoisyEmbedding(
        342, 8, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (channel_type): NoisyEmbedding(
        7, 8, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (currency): NoisyEmbedding(
        60, 8, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (trx_category): NoisyEmbedding(
        11, 8, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
    )
    (custom_embeddings): ModuleDict(
      (amount): LogScaler()
    )
    (time2vec_days): Time2Vec()
    (linear_projection_head): Linear(in_features=41, out_features=64, bias=True)
    (conv): Conv1d(64, 64, kernel_size=(5,), stride=(1,), padding=(4,), bias=False)
  )
  (seq_encoder): RnnEncoder(
    (rnn): GRU(64, 512, batch_first=True)
    (reducer): LastStepEncoder()
  )
  (swin_fusion): SwinTransformerBackbone(
    (backbone

In [69]:
# change_to_enc(encoder.swin_fusion)

In [210]:
from tqdm import tqdm

seed_everything(0)

In [211]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)
encoder.eval()
train_embeds = None

for i, batch in tqdm(enumerate(train_loader)):
    train_embeds_batch = encoder(batch.to(device))
    if i == 0:
        train_embeds = train_embeds_batch.detach().cpu().numpy()
    else:
        train_embeds = np.concatenate([train_embeds, train_embeds_batch.detach().cpu().numpy()], axis=0)
    
train_embeds

36it [00:04,  7.80it/s]


array([[ 0.3654307 , -0.9336183 ,  0.9923521 , ..., -0.79270506,
        -0.9202323 , -0.846506  ],
       [ 0.22435653, -0.9190185 ,  0.9950493 , ..., -0.4321019 ,
        -0.9257525 , -0.80878735],
       [ 0.7760851 , -0.9392842 ,  0.99143946, ..., -0.63314885,
        -0.90790373, -0.81972027],
       ...,
       [-0.0429655 , -0.9298562 ,  0.9901741 , ..., -0.23477188,
        -0.90675217, -0.70827776],
       [ 0.21107441, -0.93539536,  0.9918051 , ..., -0.08189818,
        -0.91077983, -0.6599331 ],
       [ 0.07144204, -0.94521785,  0.9964512 , ..., -0.01008347,
        -0.9235658 , -0.559582  ]], dtype=float32)

In [212]:
test_loader = inference_data_loader(data_test, num_workers=0, batch_size=128)
encoder.eval()
test_embeds = None

for i, batch in tqdm(enumerate(test_loader)):
    test_embeds_batch = encoder(batch.to(device))
    if i == 0:
        test_embeds = test_embeds_batch.detach().cpu().numpy()
    else:
        test_embeds = np.concatenate([test_embeds, test_embeds_batch.detach().cpu().numpy()], axis=0)
    
test_embeds

4it [00:00,  8.00it/s]


array([[-0.06601859, -0.9046768 ,  0.9926913 , ..., -0.41738456,
        -0.89603454, -0.70903504],
       [ 0.52761394, -0.9205212 ,  0.9918173 , ..., -0.76959485,
        -0.857632  , -0.8636213 ],
       [-0.03815325, -0.91488475,  0.9963867 , ..., -0.21135972,
        -0.89220965, -0.69404054],
       ...,
       [ 0.7193384 , -0.9260903 ,  0.9931525 , ..., -0.20386115,
        -0.91107005, -0.7313736 ],
       [ 0.01250206, -0.92826676,  0.9942206 , ..., -0.39163008,
        -0.89881516, -0.63099337],
       [-0.1448749 , -0.9290574 ,  0.9948817 , ..., -0.11638625,
        -0.88336426, -0.5805926 ]], dtype=float32)

In [213]:
import gc

gc.collect()

torch.cuda.empty_cache()

In [214]:
clf = CatBoostClassifier(loss_function='MultiClass', task_type="GPU", devices='0', random_state=0)

clf.fit(train_embeds, target_train, plot_file="catboost_log.html")



Learning rate set to 0.088214
0:	learn: 0.6656387	total: 10.1ms	remaining: 10.1s
1:	learn: 0.6426715	total: 16.6ms	remaining: 8.3s
2:	learn: 0.6224552	total: 23.1ms	remaining: 7.67s
3:	learn: 0.6051181	total: 29.6ms	remaining: 7.38s
4:	learn: 0.5890547	total: 36.1ms	remaining: 7.19s
5:	learn: 0.5758434	total: 42.5ms	remaining: 7.04s
6:	learn: 0.5633223	total: 49.1ms	remaining: 6.96s
7:	learn: 0.5520632	total: 55.7ms	remaining: 6.91s
8:	learn: 0.5426978	total: 62.2ms	remaining: 6.85s
9:	learn: 0.5331597	total: 68.8ms	remaining: 6.82s
10:	learn: 0.5244493	total: 75.2ms	remaining: 6.76s
11:	learn: 0.5172855	total: 81.8ms	remaining: 6.73s
12:	learn: 0.5101961	total: 88.4ms	remaining: 6.71s
13:	learn: 0.5043178	total: 94.7ms	remaining: 6.67s
14:	learn: 0.4987792	total: 101ms	remaining: 6.63s
15:	learn: 0.4940862	total: 107ms	remaining: 6.61s
16:	learn: 0.4893876	total: 114ms	remaining: 6.58s
17:	learn: 0.4850416	total: 120ms	remaining: 6.54s
18:	learn: 0.4808889	total: 127ms	remaining: 6.56

<catboost.core.CatBoostClassifier at 0x7ea95632cbb0>

In [215]:
test_pred = clf.predict(test_embeds)
test_proba = clf.predict_proba(test_embeds)[:, 1]

In [None]:
print("Accuracy:", accuracy_score(target_test, test_pred))
print("ROC-AUC:", roc_auc_score(target_test, test_proba))

In [77]:
arr = np.array([0.8147836363342021, 0.7974777808356672, 0.8094089459455085])

arr.mean(), arr.std()

(0.8072234543717927, 0.00723212457019052)

<!-- - COLES embeds + Catboost:
  - `Accuracy: 0.736`, `0.72`, `0.722`, avg: `0.726 +- 0.0071` 
  -  `ROC-AUC: 0.8099107995661394`, `0.8041475773421184`, `0.8088423370189894`, avg: `0.8076 +- 0.0025`

---

- COLES embeds w/ SWIN_Agg seq_enc + Catboost:
  - Accuracy: `0.746`, `0.744`, `0.734`, avg: `0.7413 +- 0.0052`
  - ROC-AUC: `0.8197050395816807`, `0.8041961438215345`, `0.8093603794660925`, avg: `0.8111 +- 0.0064`

---

- COLES embeds w/ SWIN_Agg seq_enc (trained on InfoNCE loss) + Catboost:
  - Accuracy: `0.728`, `0.736`, `0.704`, avg: `0.7227 +- 0.0136`
  - ROC-AUC: `0.803888556118567`, `0.8046494309627494`, `0.7928153988117401`, avg: `0.8005 +- 0.0054`

--- -->

<!-- - COLES embeds w/ SWIN_Agg seq_enc + w/ ConvAgg (3 trx) + Catboost:
  - Accuracy: `0.762`, `0.716`, `0.74`, avg: `0.7393 +- 0.0188`
  - ROC-AUC: `0.8183128005050913`, `0.7948875686001521`, `0.8069158666688252`, avg: `0.8067 +- 0.0096`

---

- COLES embeds w/ SWIN_Agg seq_enc + w/ ConvAgg (5 trx) + Catboost:
  - Accuracy: `0.742`, `0.724`, `0.746`, avg: `0.7373 +- 0.0096`
  - ROC-AUC: `0.8101698207896909`, `0.7846724190963398`, `0.8073691538100403`, avg: `0.8007 +- 0.0114`

---

- COLES embeds w/ SWIN_Agg seq_enc + Catboost (no start-end fusion):
  - Accuracy: `0.736`, `0.734`, `0.726`, avg: `0.732 +- 0.0043`
  - ROC-AUC: `0.8147350698547863`, `0.8079519515630311`, `0.8073043985041525`, avg: `0.81 +- 0.0034`

---

- COLES embeds w/ SWIN_Agg seq_enc (No RNN for feat agg) + Catboost:
  - Accuracy: `0.694`
  - ROC-AUC: `0.7710576160334137` -->

<!-- **Вывод:** Неудовлетворительные результаты, SWIN стоит использовать только в связке с RNN (усреднение эмбеддингов не работает). -->

<!-- - COLES embeds w/ SWIN_Agg seq_enc (trained on InfoNCE loss) + Catboost:
  - Accuracy: `0.718`, `0.726`, `0.702`, avg: `0.7153 +- 0.01`
  - ROC-AUC: `0.8016383092389634`, `0.7930744200352916`, `0.8027229606125851`, avg: `0.7991 +- 0.0043`

---

- COLES embeds w/ SWIN_Agg seq_enc (decoder-like training) + Catboost:
  - Accuracy: `0.734`, `0.734`, `0.722`, avg: `0.73 +- 0.0057`
  - ROC-AUC: `0.8147836363342021`, `0.7974777808356672`, `0.8094089459455085`, avg: `0.8072 +- 0.0072` -->

<!-- ---

**Вывод:** для CoLES агрегация свёртками в целом приводит к повышению качества, причём больше это проявляется по accuracy. Лучший результат достигается при агрегации свёрточным слоем с ядром свёртки размера 7, после чего результат становится лишь хуже.

**Лучший результат:**  

- COLES embeds + ConvAgg (7 trx) + Catboost:
  - Accuracy: `0.758`, `0.728`, `0.732`, avg: `0.7393 +- 0.0133`
  - ROC-AUC: `0.8269576338411229`, `0.7956646322708065`, `0.8140389503164915`, avg: `0.8122 +- 0.0128` -->

---

- COLES embeds + Catboost:
  - `Accuracy: 0.736`, `0.72`, `0.722`, avg: `0.726 +- 0.0071` 
  -  `ROC-AUC: 0.8099107995661394`, `0.8041475773421184`, `0.8088423370189894`, avg: `0.8076 +- 0.0025`

---

- COLES embeds w/ SWIN_Agg seq_enc (num_heads is multiplied by factor 2 on each stage) + Catboost:
  - Accuracy: `0.756`, `0.738`, `0.734`, avg: `0.7427 +- 0.0096`
  - ROC-AUC: `0.8205468585582232`, `0.8086156934483819`, `0.8079195739100873`, avg: `0.8124 +- 0.0058`

---

- COLES embeds w/ SWIN_Agg seq_enc (smaller window sizes) + Catboost:
  - Accuracy: `0.746`, `0.732`, `0.724`, avg: `0.734 +- 0.0091`
  - ROC-AUC: `0.8117725146104158`, `0.8022372958184262`, `0.795000890385456`, avg: `0.803 +- 0.0069`

---

- COLES embeds w/ SWIN_Agg seq_enc (trained on InfoNCE loss) + Catboost:
  - Accuracy: `0.72`, `0.722`, `0.708`, avg: `0.7167 +- 0.0062`
  - ROC-AUC: `0.8057340823363714`, `0.7915202926939827`, `0.7877321072995418`, avg: `0.795 +- 0.0077`

---

- COLES embeds w/ SWIN_Agg seq_enc & ConvAgg (3 trx) + Catboost:
  - Accuracy: `0.756`, `0.762`, `0.74`, avg: `0.7527 +- 0.0093`
  - ROC-AUC: `0.8149131469459778`, `0.812209612925159`, `0.8027553382655291`, avg: `0.81 +- 0.0052`

---

- COLES embeds w/ SWIN_Agg seq_enc (smaller window sizes) & ConvAgg (3 trx) + Catboost:
  - Accuracy: `0.764`, `0.726`, `0.76`, avg: `0.75 +- 0.017`
  - ROC-AUC: `0.8225380842142753`, `0.7952113451295915`, `0.8167586731637823`, avg: `0.8115 +- 0.0118`

**Вывод:** CoLES с SWIN-трансформером в качестве seq_encoder'а демонстрирует значительно лучшее качество, чем обычный CoLES (с RNN), - как по accuracy, так и по ROC-AUC.

В качестве лоссов пробовались ContrastiveLoss и InfoNCE Loss. Обучать такой CoLES нужно всё же на стандартный Contrastive Loss, так как при обучении на InfoNCE Loss качество становится даже хуже, чем у бейзлайна.  

По умолчанию стартовое окно покрывает 4 транзакции, также пробовался сетап с размером окна в 2 транзакции, он демонстрировал качество хуже, чем при большом размере окна.

Также пробовалась архитектура (Conv Aggregator в качестве trx_enc +  SWIN + RNN SeqEncoder). Размер свёртки Conv Aggregator'а брался как минимальный (3) из рассматриваемых, чтобы не увеличивать слишком сильно за счёт свёртки - и далее - SWIN-трансформера - receptive field. Такой вариант продемонстрировал качество, значительно лучшее, чем у SWIN-трансформера с обычным trx_encoder'ом, как в случае маленьких окон, так и в случае размера окна по умолчанию. Сетап COLES embeds w/ SWIN_Agg seq_enc & ConvAgg (3 trx) + Catboost оказался лучшим подходом по accuracy в целом.

**Лучший результат:**

- COLES embeds w/ SWIN_Agg seq_enc & ConvAgg (3 trx) + Catboost:
  - Accuracy: `0.756`, `0.762`, `0.74`, avg: `0.7527 +- 0.0093`
  - ROC-AUC: `0.8149131469459778`, `0.812209612925159`, `0.8027553382655291`, avg: `0.81 +- 0.0052`

---

**Train sequences lengths check:**

In [41]:
trx_encoder_params = dict(
    embeddings={
        "MCC": {"in": 342, "out": 32},
        "channel_type": {"in": 7, "out": 32},
        "currency": {"in": 60, "out": 32},
        "trx_category": {"in": 11, "out": 32}            
    },
    numeric_values={"amount": "log"},
    embeddings_noise=0.003,
    linear_projection_size=192,
    k=31,
    time_col="event_time"
)

trx_encoder = TrxEncoderT2V(**trx_encoder_params)
trx_encoder.to("cuda")

TrxEncoderT2V(
  (embeddings): ModuleDict(
    (MCC): NoisyEmbedding(
      342, 32, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
    (channel_type): NoisyEmbedding(
      7, 32, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
    (currency): NoisyEmbedding(
      60, 32, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
    (trx_category): NoisyEmbedding(
      11, 32, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
  )
  (custom_embeddings): ModuleDict(
    (amount): LogScaler()
  )
  (custom_embedding_batch_norm): RBatchNorm(
    (bn): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (time2vec_days): Time2Vec()
  (linear_projection_head): Linear(in_features=161, out_features=192, bias=True)
)

In [43]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)

trx_encoder.eval()

seq_lens = []

for batch in tqdm(train_loader):
    embeds_batch = trx_encoder(batch.to("cuda"))
    seq_lens += [embeds_batch.seq_lens.detach().cpu().numpy()]

seq_lens = np.concatenate(seq_lens)

threshold = int(np.quantile(seq_lens, 0.6))

print("Max Length:", threshold)

36it [00:00, 82.96it/s]

Max Length: 83





---

- **CPC modeling:**

---

**Скорректируем класс CpcModule так, чтобы при работе CPC не было даталиков:**

In [26]:
import torch
from torch import nn as nn
from torch.nn import functional as F
from ptls.data_load.padded_batch import PaddedBatch


class CPC_ShiftedLoss(nn.Module):
    def __init__(self, n_negatives=None, n_forward_steps=None, shift=0):
        super().__init__()
        self.n_negatives = n_negatives
        self.n_forward_steps = n_forward_steps
        self.shift = shift

    def _get_preds(self, base_embeddings, mapped_ctx_embeddings):
        batch_size, max_seq_len, emb_size = base_embeddings.payload.shape
        _, _, _, n_forward_steps = mapped_ctx_embeddings.payload.shape
        seq_lens = mapped_ctx_embeddings.seq_lens
        device = mapped_ctx_embeddings.payload.device

        # num_additional_samples = mapped_ctx_embeddings.payload.shape[1] - max_seq_len
        # if num_additional_samples > 0:
        #     additional_samples = torch.zeros((batch_size, num_additional_samples, emb_size), device=device)
        #     base_embeddings = PaddedBatch(torch.cat((base_embeddings.payload, additional_samples), dim=1), base_embeddings.seq_lens)
        #     max_seq_len += num_additional_samples               
        
        #mapped_ctx_embeddings = mapped_ctx_embeddings.payload
            
        mapped_ctx_embeddings = mapped_ctx_embeddings.payload[:, :max_seq_len, :, :]

        len_mask = torch.arange(max_seq_len).unsqueeze(0).expand(batch_size, -1).to(device)
        len_mask = (len_mask < seq_lens.unsqueeze(1).expand(-1, max_seq_len)).float()
        
        possible_negatives = base_embeddings.payload.reshape(batch_size * max_seq_len, emb_size)

        mask = len_mask.unsqueeze(0).expand(batch_size, *len_mask.shape).clone()

        mask = mask.reshape(batch_size, -1)
        sample_ids = torch.multinomial(mask, self.n_negatives)
        neg_samples = possible_negatives[sample_ids]

        positive_preds, neg_preds = [], []
        len_mask_exp = len_mask.unsqueeze(-1).unsqueeze(-1).to(device).expand(-1, -1, emb_size, n_forward_steps)
        trimmed_mce = mapped_ctx_embeddings.mul(len_mask_exp)  # zero context vectors by sequence lengths
        for i in range(1, n_forward_steps + 1):
            ce_i = trimmed_mce[:, 0:(max_seq_len - i - self.shift), :, i - 1]
            be_i = base_embeddings.payload[:, (i + self.shift):max_seq_len]

            positive_pred_i = ce_i.mul(be_i).sum(axis=-1)
            positive_preds.append(positive_pred_i)

            neg_pred_i = ce_i.matmul(neg_samples.transpose(-2, -1))
            neg_preds.append(neg_pred_i)

        return positive_preds, neg_preds

    def forward(self, embeddings, _):
        base_embeddings, _, mapped_ctx_embeddings = embeddings
        device = mapped_ctx_embeddings.payload.device
        positive_preds, neg_preds = self._get_preds(base_embeddings, mapped_ctx_embeddings)

        step_losses = []
        for positive_pred_i, neg_pred_i in zip(positive_preds, neg_preds):
            step_loss = -F.log_softmax(torch.cat([positive_pred_i.unsqueeze(-1), neg_pred_i], dim=-1), dim=-1)[:, :, 0].mean()
            step_losses.append(step_loss)

        loss = torch.stack(step_losses).mean()
        return loss

    def cpc_accuracy(self, embeddings, _):
        base_embeddings, _, mapped_ctx_embeddings = embeddings
        positive_preds, neg_preds = self._get_preds(base_embeddings, mapped_ctx_embeddings)

        batch_size, max_seq_len, emb_size = base_embeddings.payload.shape
        #max_seq_len = mapped_ctx_embeddings.payload.shape[1]
        seq_lens = mapped_ctx_embeddings.seq_lens
        device = mapped_ctx_embeddings.payload.device

        len_mask = torch.arange(max_seq_len).unsqueeze(0).expand(batch_size, -1).to(device)
        len_mask = (len_mask < seq_lens.unsqueeze(1).expand(-1, max_seq_len)).float()

        total, accurate = 0, 0
        
        for i, (positive_pred_i, neg_pred_i) in enumerate(zip(positive_preds, neg_preds)):
            i_mask = len_mask[:, (self.shift + i + 1):max_seq_len].to(device)
            total += i_mask.sum().item()
            accurate += (((positive_pred_i.unsqueeze(-1).expand(*neg_pred_i.shape) > neg_pred_i) \
                          .sum(dim=-1) == self.n_negatives) * i_mask).sum().item()
        return accurate / total

In [27]:
import torch

from ptls.frames.abs_module import ABSModule
from ptls.frames.cpc.metrics.cpc_accuracy import CpcAccuracy
from ptls.nn.seq_encoder import RnnSeqEncoder
from ptls.data_load.padded_batch import PaddedBatch


class CpcModule(ABSModule):
    """Contrastive Predictive Coding ([CPC](https://arxiv.org/abs/1807.03748))

    Original sequence are encoded by `TrxEncoder`.
    Hidden representation `z` is an embedding for each individual transaction.
    Next `RnnEncoder` used for `context` calculation from `z`.
    Linear predictors are used to predict next trx embedding by context.
    The loss function tends to make future trx embedding and they predict closer.
    Negative sampling are used to avoid trivial solution.

    Parameters
        seq_encoder:
            Model which calculate embeddings for original raw transaction sequences
            `seq_encoder` is trained by `CoLESModule` to get better representations of input sequences
        head:
            Not used
        loss:
            Keep None. CPCLoss used by default
        validation_metric:
            Keep None. CPCAccuracy used by default
        optimizer_partial:
            optimizer init partial. Network parameters are missed.
        lr_scheduler_partial:
            scheduler init partial. Optimizer are missed.

    """
    def __init__(self, validation_metric=None,
                       seq_encoder=None,
                       head=None,
                       n_negatives=40, n_forward_steps=6, shift='none',
                       optimizer_partial=None,
                       lr_scheduler_partial=None):

        self.save_hyperparameters('n_negatives', 'n_forward_steps')

        if shift == 'add':
            loss = CPC_ShiftedLoss(n_negatives=n_negatives, n_forward_steps=n_forward_steps, shift=(seq_encoder.trx_encoder.agg_samples - 1))
        else:
            loss = CPC_ShiftedLoss(n_negatives=n_negatives, n_forward_steps=n_forward_steps, shift=0)

        if validation_metric is None:
            validation_metric = CpcAccuracy(loss)

        seq_encoder.seq_encoder.is_reduce_sequence = False

        super().__init__(validation_metric,
                         seq_encoder,
                         loss,
                         optimizer_partial,
                         lr_scheduler_partial)

        linear_size = self.seq_encoder.trx_encoder.output_size
        embedding_size = self.seq_encoder.embedding_size
        self._linears = torch.nn.ModuleList([torch.nn.Linear(embedding_size, linear_size)
                                             for _ in range(loss.n_forward_steps)])

    @property
    def metric_name(self):
        return 'cpc_accuracy'

    @property
    def is_requires_reduced_sequence(self):
        return False

    def shared_step(self, x, y):
        trx_encoder = self._seq_encoder.trx_encoder
        swin_fusion = self._seq_encoder.swin_fusion
        seq_encoder = self._seq_encoder.seq_encoder

        base_embeddings = trx_encoder(x)
        context_embeddings = seq_encoder(swin_fusion(base_embeddings))
        
        me = []
        for l in self._linears:
            me.append(l(context_embeddings.payload))
        mapped_ctx_embeddings = PaddedBatch(torch.stack(me, dim=3), context_embeddings.seq_lens)

        return (base_embeddings, context_embeddings, mapped_ctx_embeddings), y

---

In [496]:
# import gc

# gc.collect()
# torch.cuda.empty_cache()

In [164]:
seed_everything(42)

**DataLoaders:**

In [165]:
data = PtlsDataModule(
    train_data=CpcDataset(
        MemoryMapDataset(data=data_train),
        min_len=83,             
        max_len=107
    ),
    train_num_workers=4,
    train_batch_size=128,
    valid_data=CpcDataset(
        MemoryMapDataset(data=data_test),
        min_len=83,
        max_len=107
    ),
    valid_num_workers=4,
    valid_batch_size=128
)

**Модель:**

In [166]:
N_EPOCHS = 20

In [167]:
trx_encoder_params = dict(
    embeddings={
        "MCC": {"in": 342, "out": 32}, # 8 / 32
        "channel_type": {"in": 7, "out": 32},
        "currency": {"in": 60, "out": 32},
        "trx_category": {"in": 11, "out": 32}            
    },
    numeric_values={"amount": "log"},
    embeddings_noise=0.003,
    linear_projection_size=192, # 192
    k=31,
    time_col="event_time",
    agg_samples=3, # 3, 5, 7, 9
    use_window_attention=False
)

#trx_encoder = TrxEncoderT2V(**trx_encoder_params)
trx_encoder = ConvAggregator(**trx_encoder_params)

# seq_encoder = SWIN_SeqEncoder(
#     trx_encoder=trx_encoder,
#     depths=[2, 2, 6, 2],
#     num_heads=4,
#     start_window_size=4,
#     window_size_mult=2,
#     drop=0.1,
#     attn_drop=0.1,
#     drop_path=0.1,
#     decoder=True,
#     start_end_fusion=False,
#     is_reduce_sequence=False
# )

seq_encoder = SWIN_RNN_SeqEncoder(
    trx_encoder=trx_encoder,
    swin_depths=[2, 2, 6, 2],
    swin_num_heads=[2, 4, 8, 16],
    swin_start_window_size=2, # 2, 4
    swin_window_size_mult=2,
    swin_drop=0.1,
    swin_attn_drop=0.1,
    swin_drop_path=0.1,
    swin_decoder=True,
    swin_start_end_fusion=False,
    hidden_size=512,
    type="gru"
)


cpc = CpcModule(
    seq_encoder=seq_encoder,
    n_forward_steps=6,
    n_negatives=40,
    shift='add', # 'none' / 'add'
    optimizer_partial=partial(torch.optim.Adam, lr=3e-4), # Adam, 5e-5
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=5, gamma=0.5) # step_size=5
)

In [None]:
# print(sum(p.numel() for p in cpc.parameters() if p.requires_grad))

- **Params in standard CPC: 1720514**

- **Params in CPC with SWIN_RNN SeqEnc: 7060306** => 4x time increase in params

**Обучение:**

In [168]:
logger = CometLogger(project_name="evs-ssl-rb", experiment_name="CPC_modeling_SWIN_agg (w/ conv_agg, 3trx)")

trainer = pl.Trainer(
    logger=logger,
    max_epochs=N_EPOCHS,
    accelerator="gpu",
    devices=1,
    enable_progress_bar=True,
    gradient_clip_val=1.,
    gradient_clip_algorithm="norm"
)

In [169]:
trainer.fit(cpc, data)

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/askoro/evs-ssl-rb/db4fb9b7eadf44ea83b29d58faeacbfc

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/kaggle/working' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]


The number of training batches (36) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.



Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : CPC_modeling_SWIN_agg (w/ conv_agg, 3trx)
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/askoro/evs-ssl-rb/db4fb9b7eadf44ea83b29d58faeacbfc
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     loss [86]               : (1.686346411705017, 3.955394744873047)
[1;38;5;39mCOMET INFO:[0m     seq_len [14]            : (60.2421875, 71.78125)
[1;38;5;39mCOMET INFO:[0m     valid/cpc_accuracy [20] : (0.13515004515647888, 0.5865319967269897)
[1;38;5;39mCOMET INFO:[0m   Others:
[1;38;5;39mCOMET

In [170]:
trainer.logged_metrics

{'loss': tensor(1.9482),
 'seq_len': tensor(60.9500),
 'valid/cpc_accuracy': tensor(0.5776)}

In [82]:
torch.save(seq_encoder.state_dict(), "cpc_enc_baseline_rosbank.pt")

**Измерим качество на тесте (catboost поверх эмбеддингов):**

In [None]:
# !wget "https://drive.google.com/uc?export=download&id=11j6QgNsdOSTK-GRaAJLKObDW7ehS_aqK" -O "cpc_enc_baseline_higher_trx_dim.pt"

In [171]:
encoder = cpc.seq_encoder

# state_dict = torch.load("./cpc_enc_baseline_higher_trx_dim.pt")
# encoder.load_state_dict(state_dict)

device = "cuda:0"

encoder.to(device)

SWIN_RNN_SeqEncoder(
  (trx_encoder): ConvAggregator(
    (embeddings): ModuleDict(
      (MCC): NoisyEmbedding(
        342, 32, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (channel_type): NoisyEmbedding(
        7, 32, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (currency): NoisyEmbedding(
        60, 32, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (trx_category): NoisyEmbedding(
        11, 32, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
    )
    (custom_embeddings): ModuleDict(
      (amount): LogScaler()
    )
    (time2vec_days): Time2Vec()
    (linear_projection_head): Linear(in_features=161, out_features=192, bias=True)
    (conv): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(2,), bias=False)
  )
  (seq_encoder): RnnEncoder(
    (rnn): GRU(192, 512, batch_first=True)
    (reducer): LastStepEncoder()
  )
  (swin_fusion): SwinTransformerBackbone(
    

In [172]:
encoder.seq_encoder.is_reduce_sequence = True
change_to_enc(encoder.swin_fusion)



# change_to_enc(encoder.seq_encoder.swin_fusion)

In [143]:
# change_to_dec(encoder.swin_fusion)

In [173]:
from tqdm import tqdm

seed_everything(42)

In [174]:
import gc

gc.collect()
torch.cuda.empty_cache()

In [175]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=32)
encoder.eval()
train_embeds = None

for i, batch in tqdm(enumerate(train_loader)):
    train_embeds_batch = encoder(batch.to(device))
    if i == 0:
        train_embeds = train_embeds_batch.detach().cpu().numpy()
    else:
        train_embeds = np.concatenate([train_embeds, train_embeds_batch.detach().cpu().numpy()], axis=0)
    
train_embeds

141it [00:07, 18.49it/s]


array([[ 0.4763479 ,  0.12343363, -0.2737443 , ..., -0.4416202 ,
         0.59304714,  0.15195346],
       [-0.04151388,  0.6004812 , -0.40921816, ..., -0.3299001 ,
         0.9126798 ,  0.4803991 ],
       [ 0.24940437,  0.5722427 , -0.61975867, ..., -0.10690073,
         0.8532574 ,  0.39472497],
       ...,
       [ 0.5277181 ,  0.7133706 , -0.69615996, ..., -0.5489958 ,
         0.3946198 , -0.8759226 ],
       [-0.33658493,  0.8712623 , -0.08205242, ..., -0.23512381,
         0.9274619 , -0.706148  ],
       [ 0.962061  ,  0.54293334, -0.8105549 , ..., -0.5115771 ,
         0.9882774 , -0.5916329 ]], dtype=float32)

In [176]:
test_loader = inference_data_loader(data_test, num_workers=0, batch_size=32)
encoder.eval()
test_embeds = None

for i, batch in tqdm(enumerate(test_loader)):
    test_embeds_batch = encoder(batch.to(device))
    if i == 0:
        test_embeds = test_embeds_batch.detach().cpu().numpy()
    else:
        test_embeds = np.concatenate([test_embeds, test_embeds_batch.detach().cpu().numpy()], axis=0)
    
test_embeds

16it [00:00, 17.20it/s]


array([[-0.3532161 ,  0.44846803, -0.56362194, ..., -0.20272706,
         0.82116485,  0.56089455],
       [ 0.37977022,  0.35007468, -0.12814702, ...,  0.08900587,
         0.43118227,  0.18706475],
       [-0.04472409,  0.74503595, -0.05210087, ..., -0.28382245,
         0.946484  ,  0.543487  ],
       ...,
       [ 0.34813413,  0.72305614, -0.42042637, ..., -0.36838108,
         0.9422317 , -0.8351585 ],
       [ 0.22155719,  0.68438727, -0.5866197 , ..., -0.616808  ,
         0.05628479, -0.7080625 ],
       [-0.6057585 ,  0.90392375,  0.05865079, ..., -0.43313178,
         0.933504  , -0.82974684]], dtype=float32)

In [177]:
clf = CatBoostClassifier(loss_function='MultiClass', task_type="GPU", devices='0', random_state=42)

clf.fit(train_embeds, target_train, plot_file="catboost_log.html")



Learning rate set to 0.088214
0:	learn: 0.6660331	total: 11.2ms	remaining: 11.2s
1:	learn: 0.6423336	total: 17.9ms	remaining: 8.91s
2:	learn: 0.6228551	total: 24.4ms	remaining: 8.1s
3:	learn: 0.6049975	total: 31ms	remaining: 7.71s
4:	learn: 0.5897903	total: 37.5ms	remaining: 7.47s
5:	learn: 0.5760603	total: 43.5ms	remaining: 7.2s
6:	learn: 0.5633280	total: 49.5ms	remaining: 7.02s
7:	learn: 0.5520736	total: 55.5ms	remaining: 6.88s
8:	learn: 0.5420958	total: 61.4ms	remaining: 6.76s
9:	learn: 0.5325355	total: 67.5ms	remaining: 6.68s
10:	learn: 0.5252810	total: 73.6ms	remaining: 6.62s
11:	learn: 0.5187271	total: 79.7ms	remaining: 6.56s
12:	learn: 0.5118655	total: 85.7ms	remaining: 6.51s
13:	learn: 0.5056080	total: 91.8ms	remaining: 6.46s
14:	learn: 0.4996784	total: 97.8ms	remaining: 6.42s
15:	learn: 0.4941885	total: 104ms	remaining: 6.38s
16:	learn: 0.4892911	total: 110ms	remaining: 6.35s
17:	learn: 0.4856222	total: 116ms	remaining: 6.31s
18:	learn: 0.4816863	total: 122ms	remaining: 6.28s


<catboost.core.CatBoostClassifier at 0x79ab04c38400>

In [178]:
test_pred = clf.predict(test_embeds)
test_proba = clf.predict_proba(test_embeds)[:, 1]

In [179]:
print("Accuracy:", accuracy_score(target_test, test_pred))
print("ROC-AUC:", roc_auc_score(target_test, test_proba))

Accuracy: 0.746
ROC-AUC: 0.8056207605510677


In [106]:
arr = np.array([])

arr.mean(), arr.std()

(0.8081408212052041, 0.006105603736099413)

<!-- - CPC context embeds w/ Aug + Catboost (dim of trx embeds: 32):
  - `Accuracy: 0.752`, `0.748`, `0.742`, avg: `0.7473 +- 0.0041`
  - `ROC-AUC: 0.8051836622363244`, `0.8137313626135242`, `0.810639296757378`, avg: `0.8099 +- 0.0035`

---

- CPC context embeds w/ SWIN_Agg seq_enc + Catboost:
  - `Accuracy: 0.746`, `0.726`, `0.742`, avg: `0.738 +- 0.0086`
  - `ROC-AUC: 0.8005050913859253`, `0.8013630991889397`, `0.8061711806511145`, avg: `0.8027 +- 0.0025`

---

- CPC context embeds w/ SWIN_Agg seq_enc + Catboost (no look-ahead mask on inference):
  - `Accuracy: 0.75`, `0.734`, `0.744`, avg: `0.7427 +- 0.0066`
  - `ROC-AUC: 0.8118210810898319`, `0.7969759272150362`, `0.8073529649835682`, avg: `0.8054 +- 0.0062`

---

- CPC context embeds w/ SWIN_Agg seq_enc + Catboost (no look-ahead mask on inference + no start-end fusion):
  - `Accuracy: 0.744`, `0.76`, `0.754`, avg: `0.7527 +- 0.0066`
  - `ROC-AUC: 0.8094737012513963`, `0.809117547069013`, `0.807547230901232`, avg: `0.8087 +- 0.0008`

---

- CPC context embeds w/ SWIN_Agg seq_enc + ConvAgg (3 trx) + Catboost:
  - `Accuracy: 0.734`, `0.742`, `0.744`, avg: `0.74 +- 0.0043`
  - `ROC-AUC: 0.8022534846448982`, `0.818442311116867`, `0.8112059056838969`, avg: `0.8106 +- 0.0066`

---

- CPC context embeds w/ SWIN_Agg seq_enc + ConvAgg (5 trx) + Catboost:
  - `Accuracy: 0.728`, `0.724`, `0.73`, avg: `0.7273 +- 0.0025`
  - `ROC-AUC: 0.7972997037444756`, `0.803872367292095`, `0.7975425361415551`, avg: `0.7996 +- 0.003`

---

lr = 5e-5

- `Accuracy: 0.746`, `0.748`, `0.738`, avg: `0.744 +- 0.0043`

- `ROC-AUC: 0.8049893963186608`, `0.8117239481309999`, `0.8046008644833336`, avg: `0.8071 +- 0.0033`

W/ look-ahead mask on inference:

- `Accuracy: 0.756`, `0.736`, `0.726`, avg: `0.7393 +- 0.0125`

- `ROC-AUC: 0.8037266678538473`, `0.8073853426365124`, `0.8133104531252531`, avg: `0.8081 +- 0.0039`

---

**W/ grad_norm_clipping**

- `Accuracy: 0.742`, `0.748`, `0.752`, avg: `0.7473 +- 0.0041`
- `ROC-AUC: 0.8075148532482881`, `0.8072882096776806`, `0.8169205614285021`, avg: `0.8106 +- 0.0045`

W/ look-ahead mask on inference:

- `Accuracy: 0.752`, `0.722`, `0.74`, avg: `0.738 +- 0.8022858622978419`
- `ROC-AUC: 0.8165644072461188`, `0.8055721940716518`, `0.8022858622978419`, avg: `0.8081 +- 0.0061`

---

Smaller Window (4 trx -> 2 trx):

- `Accuracy: 0.74`, ``, ``
- `ROC-AUC: 0.8044389762186138`, ``, ``

W/ look-ahead mask on inference:

- `Accuracy: 0.728`, ``, ``
- `ROC-AUC: 0.8029010377037767`, ``, ``

---

Smaller Window (4 trx -> 2 trx), changed lr scheduler:

- `Accuracy: 0.746`, `0.734`, `0.732`, avg: ``

- `ROC-AUC: 0.8099917436984992`, `0.8003432031212058`, `0.8128895436369817`, avg: ``

W/ look-ahead mask on inference:

- `Accuracy: 0.746`, `0.744`, `0.752`, avg: ``

- `ROC-AUC: 0.8107849961956259`, `0.8039695002509268`, `0.813828495572356`, avg: ``

---

ConvAgg (3 trx):

- `Accuracy: 0.734`, `0.734`, `0.74`, avg: `0.736 +- 0.0028`
- `ROC-AUC: 0.8006184131712293`, `0.8067539784041056`, `0.8040666332097586`, avg: `0.8038 +- 0.0025`

---

ConvAgg (5 trx):

- `Accuracy: 0.724`, `0.718`, `0.728`, avg: `0.7233 +- 0.0041`
- `ROC-AUC: 0.7985786210357612`, `0.7871654983730229`, `0.7939486166647779`, avg: `0.7932 +- 0.0047`

---

No RNN for feat agg:
  - `Accuracy: 0.686`
  - `ROC-AUC: 0.7611338654060966`

**Вывод:** SWIN без RNN не стоит использовать. 

---

**Вывод:** агрегация с помощью свёрток независимо от размера окна заметно улучшает ROC-AUC, при этом accuracy также c увеличением окна несколько вырастает, но не так существенно. Что интересно, с увеличением окна ROC-AUC только увеличивается, хотя, кажется, что рано или поздно результат выйдет на плато и затем начнёт ухудшаться.

**Лучший результат:**

- CPC context embeds + ConvAgg (7 trx) + Catboost:
  - `Accuracy: 0.744`, `0.758`, `0.75`, avg: `0.7507 +- 0.0057`
  - `ROC-AUC: 0.8150912240371695`, `0.82635864726166`, `0.8121610464457432`, avg: `0.8179 +- 0.0061`

---

**Результаты для CPC с меньшей размерностью embed_dim (8):**

- CPC context embeds w/ Aug + Catboost (dim of trx embeds: 8):
  - `Accuracy: 0.754`, `0.742`, `0.744`, avg: `0.7467 +- 0.0052`
  - `ROC-AUC: 0.8175195480079649`, `0.8197697948875686`, `0.8122096129251591`, avg: `0.8165 +- 0.0032` -->

- CPC context embeds w/ Aug + Catboost (dim of trx embeds: 32):
  - `Accuracy: 0.752`, `0.748`, `0.742`, avg: `0.7473 +- 0.0041`
  - `ROC-AUC: 0.8051836622363244`, `0.8137313626135242`, `0.810639296757378`, avg: `0.8099 +- 0.0035`

---

- CPC context embeds w/ SWIN_Agg seq_enc + Catboost:
    - `Accuracy: 0.744`, `0.746`, `0.75`, avg: `0.7467 +- 0.0025`
    - `ROC-AUC: 0.8105745414514903`, `0.8218419646759807`, `0.8159006653607681`, avg: `0.8161 +- 0.0046`

<!-- ---

- Smaller model:
    - `Accuracy: 0.74`, `0.746`, `0.74`, avg: `0.742 +- 0.0028`
    - `ROC-AUC: 0.8124362564957667`, `0.8266986126175713`, `0.8113192274692007`, avg: `0.8168 +- 0.007` -->

---

- CPC context embeds w/ SWIN_Agg seq_enc (w/ look-ahead mask) + Catboost:
    - `Accuracy: 0.744`, `0.746`, `0.738`, avg: `0.7427 +- 0.0034`
    - `ROC-AUC: 0.8105745414514901`, `0.8208058797817748`, `0.8069320554952971`, avg: `0.8128 +- 0.0059`

<!-- ---

- Smaller model:
    - `Accuracy: 0.744`, `0.766`, `0.736`, avg: `0.7487 +- 0.0127`
    - `ROC-AUC: 0.8064625795276101`, `0.8253711288468699`, `0.8082271616130545`, avg: `0.8134 +- 0.0085` -->

---

- CPC context embeds w/ SWIN_Agg seq_enc (smaller windows) + Catboost:
    - `Accuracy: 0.754`, `0.748`, `0.742`, avg: `0.748 +- 0.0049`
    - `ROC-AUC: 0.8113030386427289`, `0.8168881837755582`, `0.8082919169189426`, avg: `0.8122 +- 0.0036`

---

- CPC context embeds w/ SWIN_Agg seq_enc & ConvAgg (3 trx) + Catboost:
    - `Accuracy: 0.746`, `0.736`, `0.734`, avg: `0.7387 +- 0.0052`
    - `ROC-AUC: 0.8182804228521475`, `0.7999546712858785`, `0.806494957180554`, avg: `0.8082 +- 0.0076`

---

- CPC context embeds w/ SWIN_Agg seq_enc (smaller windows) & ConvAgg (3 trx) + Catboost:
    - `Accuracy: 0.72`, `0.742`, `0.746`, avg: `0.736 +- 0.0114`
    - `ROC-AUC: 0.8085509381424941`, `0.8239950785967526`, `0.8056207605510677`, avg: `0.8127 +- 0.0081`

---

**Вывод:** CPC с SWIN-трансформером в качестве seq_encoder'а демонстрирует сравнимое с бейзлайном значение accuracy и гораздо более высокий ROC-AUC по сравнению с ним. На тестовой выборке пробовались сетапы с наличием look-ahead маски, не дающей "смотреть" модели в будущее (это обязательно для обучения, так как CPC должен работать с будущими сэмплами и они не должны при этом попадать в контекст такой модели) и без неё. Без такой маски на тесте оказалось значительно лучше - как по accuracy, так и по ROC-AUC. 

По умолчанию стартовое окно покрывает 4 транзакции, также пробовался сетап с размером окна в 2 транзакции, он демонстрировал accuracy чуть лучше, чем при стандартном размере окна и чем у бейзлайна, но также - достаточно большую просадку по ROC-AUC (при этом такое значение ROC-AUC всё ещё лучше, чем для бейзлайна).

Также пробовалась архитектура (Conv Aggregator в качестве trx_enc +  SWIN + RNN SeqEncoder). Размер свёртки Conv Aggregator'а брался как минимальный (3) из рассматриваемых, чтобы не увеличивать слишком сильно за счёт свёртки - и далее - SWIN-трансформера - receptive field. Такой вариант продемонстрировал качество хуже, чем для SWIN-трансформера с обычным trx_encoder'ом (сильные просадки по accuracy и - почти во всех случаях - ROC-AUC хуже, чем у базового варианта). При этом по ROC-AUC такие методы всё ещё лучше или сравнимы с бейзлайном.   

**Лучшие результаты:**

- CPC context embeds w/ SWIN_Agg seq_enc + Catboost:
    - `Accuracy: 0.744`, `0.746`, `0.75`, avg: `0.7467 +- 0.0025`
    - `ROC-AUC: 0.8105745414514903`, `0.8218419646759807`, `0.8159006653607681`, avg: `0.8161 +- 0.0046`

---

- CPC context embeds w/ SWIN_Agg seq_enc (smaller window size) + Catboost:
    - `Accuracy: 0.754`, `0.748`, `0.742`, avg: `0.748 +- 0.0049`
    - `ROC-AUC: 0.8113030386427289`, `0.8168881837755582`, `0.8082919169189426`, avg: `0.8122 +- 0.0036`

# Итоги.

| Method                                 |    Accuracy           | ROC-AUC         |
|----------------------------------------|-----------------------|-----------------|
| **Flattened Sequences**                | 0.67 ± 0.0046         | 0.7536 ± 0.003  |
| **GRU (+ MLP)**                        | 0.746 ± 0.0076        | 0.8148 ± 0.0037 |
| **CoLES**                              | 0.726 ± 0.0071        | 0.8076 ± 0.0025 |
| **COLES embeds + SWIN Agg encoder & ConvAgg (3 trx)** | 0.7527 ± 0.0093       | 0.81 ± 0.0052 |
| **CPC Modeling (emb_dim=32)**          | 0.747 ± 0.0041        | 0.8099 ± 0.0035 |
| **CPC Modeling (emb_dim=32) w/ SWIN Agg encoder** | 0.7467 ± 0.0025       | 0.8161 ± 0.0046 |
| **CPC Modeling (emb_dim=32) w/ SWIN Agg encoder (smaller win size)** | 0.748 ± 0.0049       | 0.8122 ± 0.0036 |