# Импортируем необходимые библиотеки

In [1]:
!pip install pytorch-lifestream
!pip install comet_ml

Collecting pytorch-lifestream
  Downloading pytorch-lifestream-0.6.0.tar.gz (163 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.4/163.4 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting hydra-core>=1.1.2 (from pytorch-lifestream)
  Downloading hydra_core-1.3.2-py3-none-any.whl.metadata (5.5 kB)
Downloading hydra_core-1.3.2-py3-none-any.whl (154 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.5/154.5 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pytorch-lifestream
  Building wheel for pytorch-lifestream (pyproject.toml) ... [?25l[?25hdone
  Created wheel for pytorch-lifestream: filename=pytorch_lifestream-0.6.0-py3-none-any.whl size=274670 sha256=b88c701efcff790abdd9aeae01dc24d85f9cd25869e6789bd

In [2]:
# data preprocessing
import os
import numpy as np
import pandas as pd
import pickle

# misc
from tqdm import tqdm
from functools import partial

# logging
import comet_ml

# classical ML
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from catboost import CatBoostClassifier

# basic deep learning libs
import torch
import pytorch_lightning as pl
import torchmetrics

# ptls
from ptls.nn import TrxEncoder, RnnSeqEncoder, TransformerEncoder, GptEncoder, Head
from ptls.frames import PtlsDataModule
from ptls.frames.coles import CoLESModule
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames.cpc import CpcModule
from ptls.frames.cpc import CpcDataset
from ptls.frames.gpt import GptDataset
from ptls.frames.supervised import SeqToTargetDataset, SequenceToTarget
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.datasets import inference_data_loader
from ptls.frames.inference_module import InferenceModule
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.preprocessing import PandasDataPreprocessor
from ptls.data_load.utils import collate_feature_dict
from ptls.frames.inference_module import InferenceModule

In [3]:
def seed_everything(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [5]:
comet_ml.login()

In [6]:
from pytorch_lightning.loggers import CometLogger

---

**Time2Vec:**

In [7]:
import torch
from ptls.data_load.padded_batch import PaddedBatch
from ptls.nn.trx_encoder.batch_norm import RBatchNorm, RBatchNormWithLens
from ptls.nn.trx_encoder.noisy_embedding import NoisyEmbedding
from ptls.nn.trx_encoder.trx_encoder_base import TrxEncoderBase
import torch.nn as nn


class Time2Vec(nn.Module):
    def __init__(self, k, interval=86400):
        super(Time2Vec, self).__init__()
        self.k = k
        self.w = nn.Parameter(torch.randn(k))
        self.b = nn.Parameter(torch.randn(k))
        self.w0 = nn.Parameter(torch.randn(1))
        self.b0 = nn.Parameter(torch.randn(1))
        self.interval = interval
        
    def forward(self, event_time, t0):
        t0_ = torch.zeros_like(event_time)
        time_diff=None
        if type(t0)!=int:
            first_column = t0[:, 0].unsqueeze(1)
            t0_ = first_column.expand(-1, t0.size(1))
        time_diff = (event_time - t0_)/self.interval
        v1 = self.w0 * time_diff.unsqueeze(-1) + self.b0
        v2 = torch.cos(self.w * time_diff.unsqueeze(-1) + self.b)
        
        return torch.cat([v1, v2], -1)

        
class TrxEncoderT2V(TrxEncoderBase):
    def __init__(self,
                 embeddings=None,
                 numeric_values=None,
                 custom_embeddings=None,
                 time_values=None,
                 embeddings_noise: float = 0,
                 norm_embeddings=None,
                 use_batch_norm=True,
                 use_batch_norm_with_lens=False,
                 clip_replace_value=None,
                 positions=None,
                 emb_dropout=0,
                 spatial_dropout=False,
                 orthogonal_init=False,
                 linear_projection_size=0,
                 out_of_index: str = 'clip',
                 k=2,
                 time_col='event_time'
                 ):
        if clip_replace_value is not None:
            warnings.warn('`clip_replace_value` attribute is deprecated. Always "clip to max" used. '
                          'Use `out_of_index="assert"` to avoid categorical values clip', DeprecationWarning)

        if positions is not None:
            warnings.warn('`positions` is deprecated. positions is not used', UserWarning)

        if embeddings is None:
            embeddings = {}
        if custom_embeddings is None:
            custom_embeddings = {}
        if time_values is None:
            time_values = {}

        noisy_embeddings = {}
        for emb_name, emb_props in embeddings.items():
            if emb_props.get('disabled', False):
                continue
            if emb_props['in'] == 0 or emb_props['out'] == 0:
                continue
            noisy_embeddings[emb_name] = NoisyEmbedding(
                num_embeddings=emb_props['in'],
                embedding_dim=emb_props['out'],
                padding_idx=0,
                max_norm=1 if norm_embeddings else None,
                noise_scale=embeddings_noise,
                dropout=emb_dropout,
                spatial_dropout=spatial_dropout,
            )

        super().__init__(
            embeddings=noisy_embeddings,
            numeric_values=numeric_values,
            custom_embeddings=custom_embeddings,
            out_of_index=out_of_index,
        )

        custom_embedding_size = self.custom_embedding_size
        if use_batch_norm and custom_embedding_size > 0:
            # :TODO: Should we use Batch norm with not-numerical custom embeddings?
            if use_batch_norm_with_lens:
                self.custom_embedding_batch_norm = RBatchNormWithLens(custom_embedding_size)
            else:
                self.custom_embedding_batch_norm = RBatchNorm(custom_embedding_size)
        else:
            self.custom_embedding_batch_norm = None
        
        self.k = k
        self.time2vec_days = Time2Vec(k=self.k)
        self.time_col = time_col
        
        if linear_projection_size > 0:
            self.linear_projection_head = torch.nn.Linear(super().output_size+k+1, linear_projection_size)
        else:
            self.linear_projection_head = None
            

        if orthogonal_init:
            for n, p in self.named_parameters():
                if n.startswith('embeddings.') and n.endswith('.weight'):
                    torch.nn.init.orthogonal_(p.data[1:])
                if n == 'linear_projection_head.weight':
                    torch.nn.init.orthogonal_(p.data)

    def forward(self, x: PaddedBatch):
        processed_embeddings = []
        processed_custom_embeddings = []

        for field_name in self.embeddings.keys():
            processed_embeddings.append(self.get_category_embeddings(x, field_name))
        
        for field_name in self.custom_embeddings.keys():
            processed_custom_embeddings.append(self.get_custom_embeddings(x, field_name))

        if len(processed_custom_embeddings):
            processed_custom_embeddings = torch.cat(processed_custom_embeddings, dim=2)
            if self.custom_embedding_batch_norm is not None:
                processed_custom_embeddings = PaddedBatch(processed_custom_embeddings, x.seq_lens)
                processed_custom_embeddings = self.custom_embedding_batch_norm(processed_custom_embeddings)
                processed_custom_embeddings = processed_custom_embeddings.payload
            processed_embeddings.append(processed_custom_embeddings)

        out = torch.cat(processed_embeddings, dim=2)

        time_encoded_days = self.time2vec_days(x.payload[self.time_col], x.payload[self.time_col])
        out = torch.cat((out, time_encoded_days), dim=2)

        if self.linear_projection_head is not None:
            out = self.linear_projection_head(out)
        return PaddedBatch(out, x.seq_lens)

    @property
    def output_size(self):
        """Returns hidden size of output representation
        """
        if self.linear_projection_head is not None:
            return self.linear_projection_head.out_features
        return super().output_size + self.k + 1

# Эксперименты.

**Данные:**

In [8]:
path_data = "https://huggingface.co/datasets/dllllb/rosbank-churn/resolve/main/train.csv.gz?download=true"
data = pd.read_csv(path_data, compression="gzip")
data

Unnamed: 0,PERIOD,cl_id,MCC,channel_type,currency,TRDATETIME,amount,trx_category,target_flag,target_sum
0,01/10/2017,0,5200,,810,21OCT17:00:00:00,5023.00,POS,0,0.0
1,01/10/2017,0,6011,,810,12OCT17:12:24:07,20000.00,DEPOSIT,0,0.0
2,01/12/2017,0,5921,,810,05DEC17:00:00:00,767.00,POS,0,0.0
3,01/10/2017,0,5411,,810,21OCT17:00:00:00,2031.00,POS,0,0.0
4,01/10/2017,0,6012,,810,24OCT17:13:14:24,36562.00,C2C_OUT,0,0.0
...,...,...,...,...,...,...,...,...,...,...
490508,01/04/2017,10176,6011,type1,810,24APR17:14:05:26,600.00,WD_ATM_ROS,1,405.0
490509,01/06/2017,10171,5411,type1,810,06JUN17:00:00:00,132.00,POS,0,0.0
490510,01/02/2017,10167,5541,type1,810,03FEB17:00:00:00,1000.00,POS,1,280428.2
490511,01/06/2017,10163,5941,type1,810,08JUN17:00:00:00,100.00,POS,0,0.0


In [9]:
target = data.groupby(by="cl_id").first().reset_index()[["cl_id", "target_flag"]]
target

Unnamed: 0,cl_id,target_flag
0,0,0
1,1,0
2,5,1
3,9,0
4,10,0
...,...,...
4995,10210,1
4996,10212,0
4997,10213,0
4998,10214,0


In [10]:
data.drop(columns=["PERIOD", "target_flag", "target_sum"], inplace=True)

In [11]:
target_train, target_test = train_test_split(target, test_size=0.1, stratify=target["target_flag"], random_state=42)

In [12]:
trx_data_train = pd.merge(data, target_train["cl_id"], on="cl_id", how="inner")
trx_data_test = pd.merge(data, target_test["cl_id"], on="cl_id", how="inner")

In [13]:
trx_data_train["channel_type"] = trx_data_train["channel_type"].fillna("none")
trx_data_test["channel_type"] = trx_data_test["channel_type"].fillna("none")

In [14]:
month2num = {"JAN": "/01/", "FEB": "/02/", "MAR": "/03/", "APR": "/04/", "MAY": "/05/", "JUN": "/06/",
             "JUL": "/07/", "AUG": "/08/", "SEP": "/09/", "OCT": "/10/", "NOV": "/11/", "DEC": "/12/"}

trx_data_train["TRDATETIME"] = trx_data_train["TRDATETIME"].map(lambda x: x[0:2] + month2num[x[2:5]] + x[5:7] + " " + x[8:])
trx_data_test["TRDATETIME"] = trx_data_test["TRDATETIME"].map(lambda x: x[0:2] + month2num[x[2:5]] + x[5:7] + " " + x[8:])

trx_data_train["TRDATETIME"] = pd.to_datetime(trx_data_train["TRDATETIME"],format='%d/%m/%y %H:%M:%S')
trx_data_test["TRDATETIME"] = pd.to_datetime(trx_data_test["TRDATETIME"],format='%d/%m/%y %H:%M:%S')

In [15]:
chtype2num = {"none": 0, "type1": 1, "type2": 2, "type3": 3, "type4": 4, "type5": 5}

trx_data_train["channel_type"] = trx_data_train["channel_type"].map(lambda x: chtype2num[x])
trx_data_test["channel_type"] = trx_data_test["channel_type"].map(lambda x: chtype2num[x])

In [16]:
trxcat2num = {"POS": 0, "DEPOSIT": 1, "WD_ATM_ROS": 2, "WD_ATM_PARTNER": 3, 
              "C2C_IN": 4, "WD_ATM_OTHER": 5, "C2C_OUT": 6, "BACK_TRX": 7,
              "CAT": 8, "CASH_ADV": 9}

trx_data_train["trx_category"] = trx_data_train["trx_category"].map(lambda x: trxcat2num[x])
trx_data_test["trx_category"] = trx_data_test["trx_category"].map(lambda x: trxcat2num[x])

---

**Квантизация непрерывных признаков (опциональный шаг, нужен только для GPT):**

In [17]:
def digitize(input_array: np.array, q_count: int = 1, bins: np.array = None):
    """Quantile-based discretization function.

    Parameters:
    -------
    input_array (np.array): Input array.
    q_count (int): Amount of quantiles. Used only if input parameter `bins` is None.
    bins (np.array):
        If None, then calculate bins as quantiles of input array,
        otherwise only apply bins to input_array. Default: None

    Returns
    -------
    out_array (np.array of ints): discretized input_array
    bins (np.array of floats):
        Returned only if input parameter `bins` is None.
    """

    if bins is None:
        return_bins = True
        bins = np.quantile(input_array, q=[i / q_count for i in range(1, q_count)], axis=0)
    else:
        return_bins = False

    out_array = np.digitize(input_array, bins)

    if return_bins:
        return out_array, bins
    else:
        return out_array

In [18]:
BINS_NUM = 128

In [19]:
numeric_features = ["amount"]

for feat in numeric_features:
    trx_data_train[feat], bins = digitize(trx_data_train[feat], q_count=BINS_NUM)
    trx_data_test[feat] = digitize(trx_data_test[feat], bins=bins)

In [20]:
import gc

gc.collect()

147

---

In [17]:
preprocessor = PandasDataPreprocessor(
    col_id="cl_id",
    col_event_time="TRDATETIME",
    event_time_transformation="dt_to_timestamp",
    cols_category=["MCC", "channel_type", "currency", "trx_category"],
    cols_numerical=["amount"],
    return_records=False,
)

In [18]:
data_train = preprocessor.fit_transform(trx_data_train)
data_test = preprocessor.transform(trx_data_test)

In [19]:
target_train.rename(columns={"target_flag": "target"}, inplace=True)
target_test.rename(columns={"target_flag": "target"}, inplace=True)
target_train.sort_values(by="cl_id", inplace=True)
target_test.sort_values(by="cl_id", inplace=True)
target_train = target_train["target"]
target_test = target_test["target"]
target_train.reset_index(drop=True, inplace=True)
target_test.reset_index(drop=True, inplace=True)

In [20]:
data_train = data_train.to_dict(orient="records")
data_test = data_test.to_dict(orient="records")

---

**Определение бинов для time diff'ов (в часах) (опциональный шаг, нужен только для TD-GPT):**

In [25]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)
SECONDS_IN_HOUR = 3600
TIME_DIFF_BINS = 256

time_diffs = []

for batch in tqdm(train_loader):
    timestamps = batch.payload['event_time']
    timestamps_prev = torch.cat([timestamps[:, 0].unsqueeze(1), timestamps[:, :-1]], dim=1)
    batch.payload['time_diff'] = (timestamps - timestamps_prev) // SECONDS_IN_HOUR
    batch.payload['time_diff'][:, 0] = -1

    mask = torch.arange(batch.payload['time_diff'].shape[1], device=batch.device)[None, :] + torch.ones((batch.seq_lens.shape[0], batch.payload['time_diff'].shape[1]), device=batch.device)
    mask[mask > batch.seq_lens[:, None]] = 0.
    mask[mask > 0.] = 1.
    mask = mask.bool()

    batch.payload['time_diff'][~mask] = -1
    
    time_diffs += [batch.payload['time_diff'][batch.payload['time_diff'] != -1].numpy()]
    
time_diffs = np.concatenate(time_diffs)

time_diff_bins = np.quantile(time_diffs, q=[(i / TIME_DIFF_BINS) for i in range(1, TIME_DIFF_BINS)], axis=0)

36it [00:00, 98.54it/s] 


In [26]:
time_diff_bins

array([  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   1.,   2.,   3.,
         4.,   5.,   6.,   7.,   7.,   8.,   9.,   

In [27]:
time_diff_bins = list(set(time_diff_bins.tolist()))
time_diff_bins.sort()
time_diff_bins = torch.tensor(time_diff_bins, dtype=torch.int)
time_diff_bins

tensor([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
         14,  15,  16,  17,  18,  20,  22,  24,  26,  31,  35,  38,  44,  48,
         54,  62,  72,  82,  96, 114, 120, 144, 168, 216, 300, 458],
       dtype=torch.int32)

In [28]:
TIME_DIFF_BINS_NUM = len(time_diff_bins)

TIME_DIFF_BINS_NUM

40

**Тест:**

In [29]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)
SECONDS_IN_HOUR = 3600

for batch in tqdm(train_loader):
    timestamps = batch.payload['event_time']
    timestamps_prev = torch.cat([timestamps[:, 0].unsqueeze(1), timestamps[:, :-1]], dim=1)
    batch.payload['time_diff'] = (timestamps - timestamps_prev) // SECONDS_IN_HOUR
    batch.payload['time_diff'][:, 0] = -1

    mask = torch.arange(batch.payload['time_diff'].shape[1], device=batch.device)[None, :] + torch.ones((batch.seq_lens.shape[0], batch.payload['time_diff'].shape[1]), device=batch.device)
    mask[mask > batch.seq_lens[:, None]] = 0.
    mask[mask > 0.] = 1.
    mask = mask.bool()

    batch.payload['time_diff'][~mask] = -1

    print(torch.bucketize(batch.payload['time_diff'], time_diff_bins, right=True))

20it [00:00, 93.69it/s]

tensor([[ 0, 37,  1,  ...,  0,  0,  0],
        [ 0,  1, 22,  ...,  0,  0,  0],
        [ 0, 12, 37,  ...,  0,  0,  0],
        ...,
        [ 0, 22,  1,  ...,  0,  0,  0],
        [ 0,  1,  1,  ...,  0,  0,  0],
        [ 0,  1, 22,  ...,  0,  0,  0]])
tensor([[ 0,  1,  1,  ...,  0,  0,  0],
        [ 0,  1, 22,  ...,  0,  0,  0],
        [ 0,  1, 22,  ...,  0,  0,  0],
        ...,
        [ 0,  2,  1,  ...,  0,  0,  0],
        [ 0,  1,  1,  ...,  0,  0,  0],
        [ 0,  1, 39,  ...,  0,  0,  0]])
tensor([[ 0, 22,  1,  ...,  0,  0,  0],
        [ 0,  1, 22,  ...,  0,  0,  0],
        [ 0, 22,  1,  ...,  0,  0,  0],
        ...,
        [ 0, 11, 21,  ...,  0,  0,  0],
        [ 0,  1, 22,  ...,  0,  0,  0],
        [ 0,  1, 32,  ...,  0,  0,  0]])
tensor([[ 0, 22, 17,  ...,  0,  0,  0],
        [ 0, 40, 39,  ...,  0,  0,  0],
        [ 0, 28,  1,  ...,  0,  0,  0],
        ...,
        [ 0,  1, 31,  ...,  0,  0,  0],
        [ 0, 40, 39,  ...,  0,  0,  0],
        [ 0, 19,  6,  ...

36it [00:00, 95.76it/s]

tensor([[ 0,  1, 28,  ...,  0,  0,  0],
        [ 0,  1, 15,  ...,  0,  0,  0],
        [ 0,  1,  1,  ...,  0,  0,  0],
        ...,
        [ 0,  1, 11,  ...,  0,  0,  0],
        [ 0, 14, 40,  ...,  0,  0,  0],
        [ 0,  1, 12,  ...,  0,  0,  0]])
tensor([[ 0,  8,  1,  ...,  0,  0,  0],
        [ 0, 21, 39,  ...,  0,  0,  0],
        [ 0,  1,  1,  ...,  0,  0,  0],
        ...,
        [ 0,  1,  1,  ...,  0,  0,  0],
        [ 0,  1, 15,  ...,  0,  0,  0],
        [ 0,  1,  1,  ...,  0,  0,  0]])
tensor([[ 0, 17,  8,  ...,  0,  0,  0],
        [ 0, 22, 28,  ...,  0,  0,  0],
        [ 0,  1, 10,  ...,  0,  0,  0],
        ...,
        [ 0,  1,  1,  ...,  0,  0,  0],
        [ 0,  1, 18,  ...,  0,  0,  0],
        [ 0,  1,  1,  ...,  0,  0,  0]])
tensor([[ 0, 22,  1,  ...,  0,  0,  0],
        [ 0,  1,  1,  ...,  0,  0,  0],
        [ 0,  1,  1,  ...,  0,  0,  0],
        ...,
        [ 0,  1, 10,  ...,  0,  0,  0],
        [ 0,  1,  1,  ...,  0,  0,  0],
        [ 0,  1, 17,  ...




---

**DenseCNN Aggregator Class:**

In [21]:
#---------------------------------------------------------------------------------------------------
# A modified version of https://github.com/pytorch/vision/blob/main/torchvision/models/densenet.py
#---------------------------------------------------------------------------------------------------

from collections import OrderedDict
from functools import partial
from typing import Any, Optional

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor


class DenseLayer(nn.Module):
    def __init__(self,
                 num_input_features: int,
                 growth_rate: int,
                 bn_size: int,
                 drop_rate: float) -> None:
        super().__init__()
        
        min_kernel_size = 3
        padding_for_min_ks = 1
        
        self.norm1 = nn.BatchNorm1d(num_input_features)
        self.relu1 = nn.ReLU(inplace=True)
        self.conv1 = nn.Conv1d(in_channels=num_input_features, \
                               out_channels=(bn_size * growth_rate),
                               kernel_size=1, bias=False)
        
        self.norm2 = nn.BatchNorm1d(bn_size * growth_rate)
        self.relu2 = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv1d(in_channels=(bn_size * growth_rate), 
                               out_channels=growth_rate,
                               kernel_size=min_kernel_size,
                               padding=padding_for_min_ks, bias=False)
        
        self.drop_rate = float(drop_rate)

    def bn_function(self, inputs: list[Tensor]) -> Tensor:
        concated_features = torch.cat(inputs, dim=1)
        bottleneck_output = self.conv1(self.relu1(self.norm1(concated_features)))
        return bottleneck_output

    def forward(self, input: Tensor) -> Tensor:
        if isinstance(input, Tensor):
            prev_features = [input]
        else:
            prev_features = input
            
        bottleneck_output = self.bn_function(prev_features)
        new_features = self.conv2(self.relu2(self.norm2(bottleneck_output)))
        
        if self.drop_rate > 0:
            new_features = F.dropout(new_features, p=self.drop_rate, training=self.training)
            
        return new_features


class DenseBlock(nn.ModuleDict):
    """DenseBlock class, based on
    "Densely Connected Convolutional Networks" (https://arxiv.org/pdf/1608.06993.pdf).

    Args:
        num_layers (int) - how many DenseLayers are in a block
        num_input_features (int) - the number of input features
        growth_rate (int) - how many filters to add by each DenseLayer (`k` in paper)
        bn_size (int) - multiplicative factor for features num in bottleneck layers
          (i.e. bn_size * k features in the bottleneck layer)
        drop_rate (float) - dropout rate after each dense layer
    """
    
    def __init__(self,
                 num_layers: int,
                 num_input_features: int,
                 bn_size: int,
                 growth_rate: int,
                 drop_rate: float) -> None:
        super().__init__()
        
        for i in range(num_layers):
            layer = DenseLayer(
                num_input_features + i * growth_rate,
                growth_rate=growth_rate,
                bn_size=bn_size,
                drop_rate=drop_rate
            )
            self.add_module("denselayer%d" % (i + 1), layer)

        for m in self.modules():
            if isinstance(m, nn.Conv1d):
                nn.init.kaiming_normal_(m.weight)
            elif isinstance(m, nn.BatchNorm1d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def forward(self, init_features: Tensor) -> Tensor:
        features = [init_features]
        for name, layer in self.items():
            new_features = layer(features)
            features.append(new_features)
        return torch.cat(features, 1)

In [22]:
from ptls.data_load.padded_batch import PaddedBatch
import torch.nn as nn


class DenseCNN_Aggregator(TrxEncoderT2V):
    """The NN layer, a combination of TrxEncoder and DenseBlock (based on "Densely Connected Convolutional Networks" (https://arxiv.org/pdf/1608.06993.pdf))
       (works like nn.Sequential([TrxEncoder, DenseBlock])).
       
       The types of the input and output are `PaddedBatch` of shapes (B, L, T) and (B, L', T) respectively, where 
       B means batch_size,
       L/L' means the max length of a sequence of transactions in a batch (the length is the same as #trx)
       T means the dimension of a single transaction.

       Parameters

       proj_channels (int):
            The number of channels after the first conv layer

       dense_layers_num (int):
            How many DenseLayers are in DenseBlock
       
       dense_block_drop_rate (float):
            Dropout rate after each DenseLayer in DenseBlock

        k (int):
            Number of periodic components in T2V time embeddings

        time_col (str):
            Name of the time column in data
            
        embeddings:
            You can find info about this param in TrxEncoder desc.
        
        numeric_values:
            You can find info about this param in TrxEncoder desc.

        embeddings_noise:
            You can find info about this param in TrxEncoder desc.
            
        emb_dropout:
            You can find info about this param in TrxEncoder desc.
            
        spatial_dropout:
            You can find info about this param in TrxEncoder desc.

        use_batch_norm:
            You can find info about this param in TrxEncoder desc.

        orthogonal_init:
            You can find info about this param in TrxEncoder desc.
            
        linear_projection_size:
            You can find info about this param in TrxEncoder desc.

        out_of_index:
            You can find info about this param in TrxEncoder desc.

        norm_embeddings:
            Keep default value for this parameter
        
        clip_replace_value:
            Not used. Keep default value for this parameter
        
        positions: 
            Not used. Keep default value for this parameter
       """

    def __init__(self,
                 proj_channels,
                 dense_layers_num,
                 dense_block_drop_rate=0.,
                 embeddings=None,
                 numeric_values=None,
                 custom_embeddings=None,
                 time_values=None,
                 embeddings_noise: float = 0,
                 norm_embeddings=None,
                 use_batch_norm=False,
                 use_batch_norm_with_lens=False,
                 clip_replace_value=None,
                 positions=None,
                 emb_dropout=0,
                 spatial_dropout=False,
                 orthogonal_init=False,
                 linear_projection_size=0,
                 out_of_index: str = 'clip',
                 k=2,
                 time_col='event_time'
                ):
        
        super().__init__(
            embeddings=embeddings,
            numeric_values=numeric_values,
            custom_embeddings=custom_embeddings,
            embeddings_noise=embeddings_noise,
            norm_embeddings=norm_embeddings,
            use_batch_norm=use_batch_norm,
            use_batch_norm_with_lens=use_batch_norm_with_lens,
            clip_replace_value=clip_replace_value,
            positions=positions,
            emb_dropout=emb_dropout,
            spatial_dropout=spatial_dropout,
            orthogonal_init=orthogonal_init,
            linear_projection_size=linear_projection_size,
            out_of_index=out_of_index,
            k=k,
            time_col=time_col
        )

        input_channels = super().output_size

        bn_size = 4

        self.proj_channels = proj_channels
        self.growth_rate = proj_channels
        self.dense_layers_num = dense_layers_num

        min_kernel_size = 3
        padding_for_min_ks = 1

        self.conv = nn.Conv1d(in_channels=input_channels, out_channels=proj_channels, kernel_size=min_kernel_size, padding=padding_for_min_ks, bias=False) # (B, T, L)
       
        self.dense_block = DenseBlock(num_layers=dense_layers_num, num_input_features=proj_channels, bn_size=bn_size, growth_rate=self.growth_rate, drop_rate=dense_block_drop_rate) # (B, T, L)

        self.norm = nn.BatchNorm1d(self.output_size)

        nn.init.kaiming_normal_(self.conv.weight)
        nn.init.constant_(self.norm.weight, 1)
        nn.init.constant_(self.norm.bias, 0)

    def forward(self, pb: PaddedBatch):
        embeds = super().forward(pb)

        mask = torch.arange(embeds.payload.shape[1], device=embeds.device)[None, :] + torch.ones((embeds.seq_lens.shape[0], embeds.payload.shape[1]), device=embeds.device)
        mask[mask > embeds.seq_lens[:, None]] = 0.
        mask[mask > 0.] = 1.
        mask = mask[:, :, None]
    
        masked_embeds = embeds.payload * mask
    
        agg_embeds = torch.transpose(self.norm(self.dense_block(self.conv(torch.transpose(masked_embeds, 1, 2)))), 1, 2)

        new_seq_lens = torch.minimum(embeds.seq_lens + self.dense_layers_num + 1, torch.tensor(embeds.payload.shape[1]))

        return PaddedBatch(agg_embeds, new_seq_lens)

    @property
    def output_size(self):
        """Returns hidden size of output representation
        """
        return self.proj_channels + self.growth_rate * self.dense_layers_num

**Test:**

In [287]:
# seed_everything(0)

In [288]:
# device = "cuda:0"

In [289]:
# trx_encoder_params = dict(
#     embeddings={
#         "MCC": {"in": 342, "out": 8},
#         "channel_type": {"in": 7, "out": 8},
#         "currency": {"in": 60, "out": 8},
#         "trx_category": {"in": 11, "out": 8}            
#     },
#     numeric_values={"amount": "log"},
#     embeddings_noise=0.003,
#     k=7,
#     time_col="event_time",
#     proj_channels=128,
#     dense_layers_num=5,
#     dense_block_drop_rate=0.1
# )

# trx_encoder = DenseCNN_Aggregator(**trx_encoder_params).to(device)

In [None]:
# trx_encoder.eval()

# train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)

# for i, batch in tqdm(enumerate(train_loader)):
#     batch = batch.to(device)
#     embeds_batch = trx_encoder(batch)

#     if i == 0:
#         print(batch.payload["event_time"].shape)
#     #     print(batch.seq_lens)
#     #     print()
#         print(embeds_batch.payload)
#     #     print()
#     #     print(embeds_batch.seq_lens)

---

**Train sequences lengths check:**

In [66]:
agg_encoder_params = dict(
    embeddings={
        "MCC": {"in": 342, "out": 8},
        "channel_type": {"in": 7, "out": 8},
        "currency": {"in": 60, "out": 8},
        "trx_category": {"in": 11, "out": 8}            
    },
    numeric_values={"amount": "log"},
    embeddings_noise=0.003,
    k=7,
    time_col="event_time"
)

trx_encoder = TrxEncoderT2V(**agg_encoder_params)
trx_encoder.to("cuda")

TrxEncoderT2V(
  (embeddings): ModuleDict(
    (MCC): NoisyEmbedding(
      342, 8, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
    (channel_type): NoisyEmbedding(
      7, 8, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
    (currency): NoisyEmbedding(
      60, 8, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
    (trx_category): NoisyEmbedding(
      11, 8, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
  )
  (custom_embeddings): ModuleDict(
    (amount): LogScaler()
  )
  (custom_embedding_batch_norm): RBatchNorm(
    (bn): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (time2vec_days): Time2Vec()
)

In [27]:
# train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)

# trx_encoder.eval()

# min_len = np.inf
# max_len = 0

# for batch in tqdm(train_loader):
#     embeds_batch = trx_encoder(batch.to("cuda"))
#     seq_lens = embeds_batch.seq_lens
#     min_len = min(min_len, seq_lens.min())
#     max_len = max(max_len, seq_lens.max())

# print("Min Length:", min_len.item())
# print("Max Length:", max_len.item())

In [67]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)

trx_encoder.eval()

seq_lens = []

for batch in tqdm(train_loader):
    embeds_batch = trx_encoder(batch.to("cuda"))
    seq_lens += [embeds_batch.seq_lens.detach().cpu().numpy()]

seq_lens = np.concatenate(seq_lens)

threshold = int(np.quantile(seq_lens, 0.75) * 0.7)

print("Max Length:", threshold)

36it [00:00, 94.05it/s]

Max Length: 100





---

# Aggregation with DenseCNN (DenseNet-like architecture) 

- **COLES:**

In [52]:
seed_everything(42)

**DataLoaders:**

In [53]:
data = PtlsDataModule(
    train_data=ColesDataset(
        MemoryMapDataset(
            data=data_train,
            i_filters=[SeqLenFilter(min_seq_len=10)],
        ),
        splitter=SampleSlices(
            split_count=5,
            cnt_min=5,
            cnt_max=100,
        ),
    ),
    train_num_workers=4,
    train_batch_size=128,
    valid_data=ColesDataset(
        MemoryMapDataset(
            data=data_test,
            i_filters=[SeqLenFilter(min_seq_len=10)],
        ),
        splitter=SampleSlices(
            split_count=5,
            cnt_min=5,
            cnt_max=100,
        ),
    ),
    valid_num_workers=4,
    valid_batch_size=128
)

**Модель:**

In [54]:
N_EPOCHS = 20

In [55]:
trx_encoder_params = dict(
    embeddings={
        "MCC": {"in": 342, "out": 8},
        "channel_type": {"in": 7, "out": 8},
        "currency": {"in": 60, "out": 8},
        "trx_category": {"in": 11, "out": 8}            
    },
    numeric_values={"amount": "log"},
    embeddings_noise=0.003,
    k=7,
    time_col="event_time",
    proj_channels=64,
    dense_layers_num=1,
    dense_block_drop_rate=0.1
)

trx_encoder = DenseCNN_Aggregator(**trx_encoder_params)

seq_encoder = RnnSeqEncoder(
    trx_encoder=trx_encoder,
    hidden_size=512,
    type="gru"
)

coles = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=1e-3, weight_decay=0),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.CosineAnnealingLR, T_max=N_EPOCHS, eta_min=5e-6)
)

**Обучение:**

In [56]:
logger = CometLogger(project_name="evs-ssl-rb", experiment_name="CoLES_DenseCNN_Agg (1 layer)")

trainer = pl.Trainer(
    logger=logger,
    max_epochs=N_EPOCHS,
    accelerator="gpu",
    devices=1,
    enable_progress_bar=True
)

In [57]:
trainer.fit(coles, data)

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/askoro/evs-ssl-rb/afa180c35d454ac7a8362486b7093442

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/kaggle/working' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]


The number of training batches (33) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.



Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : CoLES_DenseCNN_Agg (1 layer)
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/askoro/evs-ssl-rb/afa180c35d454ac7a8362486b7093442
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     loss [79]               : (70.04817199707031, 859.7939453125)
[1;38;5;39mCOMET INFO:[0m     seq_len [13]            : (34.45624923706055, 40.66250228881836)
[1;38;5;39mCOMET INFO:[0m     valid/recall_top_k [20] : (0.2171206772327423, 0.7317383289337158)
[1;38;5;39mCOMET INFO:[0m   Others:
[1;38;5;39mCOMET 

In [58]:
trainer.logged_metrics

{'loss': tensor(64.1187),
 'seq_len': tensor(39.6951),
 'valid/recall_top_k': tensor(0.7159)}

In [28]:
torch.save(seq_encoder.state_dict(), "coles_enc_baseline_rosbank.pt")

**Измерим качество на тесте (catboost поверх эмбеддингов):**

In [None]:
# !wget "https://drive.google.com/uc?export=download&id=1Mn8o9IPT4Zzg3946orbw1MVZwpkrBoNb" -O "coles_enc_baseline.pt"

In [59]:
encoder = coles.seq_encoder

# state_dict = torch.load("./coles_enc_baseline.pt")
# encoder.load_state_dict(state_dict)

device = "cuda:0"

encoder.to(device)

RnnSeqEncoder(
  (trx_encoder): DenseCNN_Aggregator(
    (embeddings): ModuleDict(
      (MCC): NoisyEmbedding(
        342, 8, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (channel_type): NoisyEmbedding(
        7, 8, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (currency): NoisyEmbedding(
        60, 8, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (trx_category): NoisyEmbedding(
        11, 8, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
    )
    (custom_embeddings): ModuleDict(
      (amount): LogScaler()
    )
    (time2vec_days): Time2Vec()
    (conv): Conv1d(41, 64, kernel_size=(3,), stride=(1,), padding=(1,), bias=False)
    (dense_block): DenseBlock(
      (denselayer1): DenseLayer(
        (norm1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu1): ReLU(inplace=True)
        (conv1): Conv1d(64, 256, kernel_size=(1,), s

In [60]:
from tqdm import tqdm

seed_everything(42)

In [61]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)
encoder.eval()
train_embeds = None

for i, batch in tqdm(enumerate(train_loader)):
    train_embeds_batch = encoder(batch.to(device))
    if i == 0:
        train_embeds = train_embeds_batch.detach().cpu().numpy()
    else:
        train_embeds = np.concatenate([train_embeds, train_embeds_batch.detach().cpu().numpy()], axis=0)
    
train_embeds

36it [00:01, 19.85it/s]


array([[ 0.05949309, -0.08892065, -0.01769528, ...,  0.875472  ,
         0.22099932, -0.22991695],
       [-0.83557796, -0.5995957 , -0.3430144 , ...,  0.9048624 ,
        -0.6807391 ,  0.06205634],
       [-0.868395  , -0.13603222, -0.7285609 , ...,  0.9234001 ,
        -0.25151637,  0.78120786],
       ...,
       [ 0.7374374 ,  0.29916635,  0.32984397, ...,  0.8630713 ,
        -0.35124606,  0.60122293],
       [ 0.37274325, -0.13891849,  0.46158648, ...,  0.89905214,
        -0.1954736 ,  0.897119  ],
       [ 0.42812103,  0.09186149, -0.16399436, ...,  0.8267914 ,
         0.3275809 ,  0.65187263]], dtype=float32)

In [62]:
test_loader = inference_data_loader(data_test, num_workers=0, batch_size=128)
encoder.eval()
test_embeds = None

for i, batch in tqdm(enumerate(test_loader)):
    test_embeds_batch = encoder(batch.to(device))
    if i == 0:
        test_embeds = test_embeds_batch.detach().cpu().numpy()
    else:
        test_embeds = np.concatenate([test_embeds, test_embeds_batch.detach().cpu().numpy()], axis=0)
    
test_embeds

4it [00:00, 19.50it/s]


array([[-0.6903378 , -0.19370937, -0.20487432, ...,  0.88484186,
        -0.28482738,  0.688611  ],
       [ 0.40075997, -0.18749076,  0.5418193 , ...,  0.9322701 ,
        -0.29215172, -0.14940706],
       [-0.6353707 , -0.04505442, -0.4751425 , ...,  0.9210559 ,
        -0.53061754,  0.8833193 ],
       ...,
       [-0.8191282 ,  0.55276114, -0.62225276, ...,  0.90859765,
        -0.7753523 ,  0.44585475],
       [-0.3002964 , -0.4725951 ,  0.35450333, ...,  0.9049379 ,
        -0.03551856,  0.73202586],
       [ 0.6593918 , -0.4636611 , -0.09153992, ...,  0.9941689 ,
        -0.1830289 ,  0.9625902 ]], dtype=float32)

In [63]:
clf = CatBoostClassifier(loss_function='MultiClass', task_type="GPU", devices='0', random_state=42)

clf.fit(train_embeds, target_train, plot_file="catboost_log.html")

Learning rate set to 0.088214
0:	learn: 0.6714208	total: 9.5ms	remaining: 9.49s
1:	learn: 0.6517580	total: 15.8ms	remaining: 7.86s
2:	learn: 0.6352885	total: 22.1ms	remaining: 7.35s
3:	learn: 0.6214633	total: 28.5ms	remaining: 7.09s
4:	learn: 0.6082342	total: 34.8ms	remaining: 6.93s
5:	learn: 0.5968507	total: 41.1ms	remaining: 6.81s
6:	learn: 0.5864965	total: 47.3ms	remaining: 6.71s
7:	learn: 0.5772906	total: 53.7ms	remaining: 6.66s
8:	learn: 0.5689520	total: 60ms	remaining: 6.6s
9:	learn: 0.5615295	total: 66.1ms	remaining: 6.55s
10:	learn: 0.5542320	total: 72.3ms	remaining: 6.5s
11:	learn: 0.5487542	total: 78.7ms	remaining: 6.48s
12:	learn: 0.5430442	total: 85.8ms	remaining: 6.51s
13:	learn: 0.5375713	total: 92.5ms	remaining: 6.51s
14:	learn: 0.5326196	total: 99.3ms	remaining: 6.52s
15:	learn: 0.5276191	total: 106ms	remaining: 6.53s
16:	learn: 0.5238071	total: 113ms	remaining: 6.52s
17:	learn: 0.5194832	total: 120ms	remaining: 6.52s
18:	learn: 0.5149892	total: 126ms	remaining: 6.53s
1

<catboost.core.CatBoostClassifier at 0x7c2d29bd2f20>

In [64]:
test_pred = clf.predict(test_embeds)
test_proba = clf.predict_proba(test_embeds)[:, 1]

In [65]:
print("Accuracy:", accuracy_score(target_test, test_pred))
print("ROC-AUC:", roc_auc_score(target_test, test_proba))

Accuracy: 0.736
ROC-AUC: 0.8074015314629841


In [67]:
arr = np.array([0.8165644072461188, 0.8039371225979829, 0.8074015314629841])

arr.mean(), arr.std()

(0.8093010204356953, 0.005327170804116732)

- COLES embeds + Catboost:
  - `Accuracy: 0.736`, `0.72`, `0.722`, avg: `0.726 +- 0.0071` 
  -  `ROC-AUC: 0.8099107995661394`, `0.8041475773421184`, `0.8088423370189894`, avg: `0.8076 +- 0.0025`

---

- COLES embeds + DenseCNN_Agg (1 layer) + Catboost:
  - Accuracy: `0.732`, `0.72`, `0.736`, avg: `0.7293 +- 0.0068`
  - ROC-AUC: `0.8165644072461188`, `0.8039371225979829`, `0.8074015314629841`, avg: `0.8093 +- 0.0053`

---

- COLES embeds + DenseCNN_Agg (2 layers) + Catboost:
  - Accuracy: `0.75`, `0.738`, `0.712`, avg: `0.7333 +- 0.0159`
  - ROC-AUC: `0.8158035324019361`, `0.8051027181039646`, `0.7930420423823478`, avg: `0.8046 +- 0.0093`

---

- COLES embeds + DenseCNN_Agg (4 layers) + Catboost:
  - Accuracy: `0.722`, `0.738`, `0.738`, avg: `0.7327 +- 0.0075`
  - ROC-AUC: `0.8036942902009033`, `0.787165498373023`, `0.8095870230367`, avg: `0.8001 +- 0.0095`

---

**Вывод:** С увеличением глубины DenseBlock'а ROC-AUC падает (при глубине 1 он максимален, лучше, чем в случае бейзлайна, далее - быстро убывает). Accuracy напротив увеличивается с увеличением глубины блока (при глубине в 4 слоя метрика несколько проседает относительно предыдущего сетапа, но незначительно).

---

**Train sequences lengths check:**

In [27]:
trx_encoder_params = dict(
    embeddings={
        "MCC": {"in": 342, "out": 8},
        "channel_type": {"in": 7, "out": 8},
        "currency": {"in": 60, "out": 8},
        "trx_category": {"in": 11, "out": 8}            
    },
    numeric_values={"amount": "log"},
    embeddings_noise=0.003,
    k=7,
    time_col="event_time"
)

trx_encoder = TrxEncoderT2V(**trx_encoder_params)
trx_encoder.to("cuda")

TrxEncoderT2V(
  (embeddings): ModuleDict(
    (MCC): NoisyEmbedding(
      342, 8, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
    (channel_type): NoisyEmbedding(
      7, 8, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
    (currency): NoisyEmbedding(
      60, 8, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
    (trx_category): NoisyEmbedding(
      11, 8, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
  )
  (custom_embeddings): ModuleDict(
    (amount): LogScaler()
  )
  (custom_embedding_batch_norm): RBatchNorm(
    (bn): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (time2vec_days): Time2Vec()
)

In [30]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)

trx_encoder.eval()

seq_lens = []

for batch in tqdm(train_loader):
    embeds_batch = trx_encoder(batch.to("cuda"))
    seq_lens += [embeds_batch.seq_lens.detach().cpu().numpy()]

seq_lens = np.concatenate(seq_lens)

threshold = int(np.quantile(seq_lens, 0.6))

print("Max Length:", threshold)

36it [00:00, 77.01it/s]

Max Length: 107





---

- **CPC modeling:**

---

**Скорректируем класс CpcModule так, чтобы при работе CPC не было даталиков:**

In [24]:
import torch
from torch import nn as nn
from torch.nn import functional as F


class CPC_ShiftedLoss(nn.Module):
    def __init__(self, n_negatives=None, n_forward_steps=None, shift=None):
        super().__init__()
        self.n_negatives = n_negatives
        self.n_forward_steps = n_forward_steps
        self.shift = shift

    def _get_preds(self, base_embeddings, mapped_ctx_embeddings):
        batch_size, max_seq_len, emb_size = base_embeddings.payload.shape
        _, _, _, n_forward_steps = mapped_ctx_embeddings.payload.shape
        seq_lens = mapped_ctx_embeddings.seq_lens
        device = mapped_ctx_embeddings.payload.device

        len_mask = torch.arange(max_seq_len).unsqueeze(0).expand(batch_size, -1).to(device)
        len_mask = (len_mask < seq_lens.unsqueeze(1).expand(-1, max_seq_len)).float()
        
        possible_negatives = base_embeddings.payload.reshape(batch_size * max_seq_len, emb_size)

        mask = len_mask.unsqueeze(0).expand(batch_size, *len_mask.shape).clone()

        mask = mask.reshape(batch_size, -1)
        sample_ids = torch.multinomial(mask, self.n_negatives)
        neg_samples = possible_negatives[sample_ids]

        positive_preds, neg_preds = [], []
        len_mask_exp = len_mask.unsqueeze(-1).unsqueeze(-1).to(device).expand(-1, -1, emb_size, n_forward_steps)
        trimmed_mce = mapped_ctx_embeddings.payload.mul(len_mask_exp)  # zero context vectors by sequence lengths
        for i in range(1, n_forward_steps + 1):
            ce_i = trimmed_mce[:, 0:(max_seq_len - i - self.shift), :, i - 1]
            be_i = base_embeddings.payload[:, (i + self.shift):max_seq_len]

            positive_pred_i = ce_i.mul(be_i).sum(axis=-1)
            positive_preds.append(positive_pred_i)

            neg_pred_i = ce_i.matmul(neg_samples.transpose(-2, -1))
            neg_preds.append(neg_pred_i)

        return positive_preds, neg_preds

    def forward(self, embeddings, _):
        base_embeddings, _, mapped_ctx_embeddings = embeddings
        device = mapped_ctx_embeddings.payload.device
        positive_preds, neg_preds = self._get_preds(base_embeddings, mapped_ctx_embeddings)

        step_losses = []
        for positive_pred_i, neg_pred_i in zip(positive_preds, neg_preds):
            step_loss = -F.log_softmax(torch.cat([positive_pred_i.unsqueeze(-1), neg_pred_i], dim=-1), dim=-1)[:, :, 0].mean()
            step_losses.append(step_loss)

        loss = torch.stack(step_losses).mean()
        return loss

    def cpc_accuracy(self, embeddings, _):
        base_embeddings, _, mapped_ctx_embeddings = embeddings
        positive_preds, neg_preds = self._get_preds(base_embeddings, mapped_ctx_embeddings)

        batch_size, max_seq_len, emb_size = base_embeddings.payload.shape
        seq_lens = mapped_ctx_embeddings.seq_lens
        device = mapped_ctx_embeddings.payload.device

        len_mask = torch.arange(max_seq_len).unsqueeze(0).expand(batch_size, -1).to(device)
        len_mask = (len_mask < seq_lens.unsqueeze(1).expand(-1, max_seq_len)).float()

        total, accurate = 0, 0
        
        for i, (positive_pred_i, neg_pred_i) in enumerate(zip(positive_preds, neg_preds)):
            i_mask = len_mask[:, (self.shift + i + 1):max_seq_len].to(device)
            total += i_mask.sum().item()
            accurate += (((positive_pred_i.unsqueeze(-1).expand(*neg_pred_i.shape) > neg_pred_i) \
                          .sum(dim=-1) == self.n_negatives) * i_mask).sum().item()
        return accurate / total

In [25]:
import torch

from ptls.frames.abs_module import ABSModule
from ptls.frames.cpc.metrics.cpc_accuracy import CpcAccuracy
from ptls.nn.seq_encoder import RnnSeqEncoder
from ptls.data_load.padded_batch import PaddedBatch


class CpcModule(ABSModule):
    """Contrastive Predictive Coding ([CPC](https://arxiv.org/abs/1807.03748))

    Original sequence are encoded by `TrxEncoder`.
    Hidden representation `z` is an embedding for each individual transaction.
    Next `RnnEncoder` used for `context` calculation from `z`.
    Linear predictors are used to predict next trx embedding by context.
    The loss function tends to make future trx embedding and they predict closer.
    Negative sampling are used to avoid trivial solution.

    Parameters
        seq_encoder:
            Model which calculate embeddings for original raw transaction sequences
            `seq_encoder` is trained by `CoLESModule` to get better representations of input sequences
        head:
            Not used
        loss:
            Keep None. CPCLoss used by default
        validation_metric:
            Keep None. CPCAccuracy used by default
        optimizer_partial:
            optimizer init partial. Network parameters are missed.
        lr_scheduler_partial:
            scheduler init partial. Optimizer are missed.

    """
    def __init__(self, validation_metric=None,
                       seq_encoder=None,
                       head=None,
                       n_negatives=40, n_forward_steps=6,
                       optimizer_partial=None,
                       lr_scheduler_partial=None):

        self.save_hyperparameters('n_negatives', 'n_forward_steps')

        min_kernel_size = 3

        loss = CPC_ShiftedLoss(n_negatives=n_negatives, n_forward_steps=n_forward_steps, shift=(min_kernel_size - 1) * (1 + seq_encoder.trx_encoder.dense_layers_num))

        if validation_metric is None:
            validation_metric = CpcAccuracy(loss)

        if seq_encoder is not None and not isinstance(seq_encoder, RnnSeqEncoder):
            raise NotImplementedError(f'Only rnn encoder supported in CpcModule. Found {type(seq_encoder)}')

        seq_encoder.seq_encoder.is_reduce_sequence = False

        super().__init__(validation_metric,
                         seq_encoder,
                         loss,
                         optimizer_partial,
                         lr_scheduler_partial)

        linear_size = self.seq_encoder.trx_encoder.output_size
        embedding_size = self.seq_encoder.embedding_size
        self._linears = torch.nn.ModuleList([torch.nn.Linear(embedding_size, linear_size)
                                             for _ in range(loss.n_forward_steps)])

    @property
    def metric_name(self):
        return 'cpc_accuracy'

    @property
    def is_requires_reduced_sequence(self):
        return False

    def shared_step(self, x, y):
        trx_encoder = self._seq_encoder.trx_encoder
        seq_encoder = self._seq_encoder.seq_encoder

        base_embeddings = trx_encoder(x)
        context_embeddings = seq_encoder(base_embeddings)

        me = []
        for l in self._linears:
            me.append(l(context_embeddings.payload))
        mapped_ctx_embeddings = PaddedBatch(torch.stack(me, dim=3), context_embeddings.seq_lens)

        return (base_embeddings, context_embeddings, mapped_ctx_embeddings), y

---

In [163]:
seed_everything(42)

**DataLoaders:**

In [164]:
data = PtlsDataModule(
    train_data=CpcDataset(
        MemoryMapDataset(data=data_train),
        min_len=83,             
        max_len=107
    ),
    train_num_workers=4,
    train_batch_size=128,
    valid_data=CpcDataset(
        MemoryMapDataset(data=data_test),
        min_len=83,
        max_len=107
    ),
    valid_num_workers=4,
    valid_batch_size=128
)

**Модель:**

In [165]:
N_EPOCHS = 20

In [166]:
trx_encoder_params = dict(
    embeddings={
        "MCC": {"in": 342, "out": 32},
        "channel_type": {"in": 7, "out": 32},
        "currency": {"in": 60, "out": 32},
        "trx_category": {"in": 11, "out": 32}            
    },
    numeric_values={"amount": "log"},
    embeddings_noise=0.003,
    k=31,
    time_col="event_time",
    proj_channels=164,
    dense_layers_num=4,
    dense_block_drop_rate=0.1
)

trx_encoder = DenseCNN_Aggregator(**trx_encoder_params)

seq_encoder = RnnSeqEncoder(
    trx_encoder=trx_encoder,
    hidden_size=1024,
    type="gru"
)

cpc = CpcModule(
    seq_encoder=seq_encoder,
    n_forward_steps=6,
    n_negatives=40,
    optimizer_partial=partial(torch.optim.Adam, lr=5e-4),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=5, gamma=0.5)
)

**Обучение:**

In [167]:
logger = CometLogger(project_name="evs-ssl-rb", experiment_name="CPC_modeling_DenseCNN_Agg (4 layers, emb_dim=32)")

trainer = pl.Trainer(
    logger=logger,
    max_epochs=N_EPOCHS,
    accelerator="gpu",
    devices=1,
    enable_progress_bar=True
)

In [None]:
trainer.fit(cpc, data)

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/askoro/evs-ssl-rb/be02b07e7a7242718959b9cfe5e1c52b

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/kaggle/working' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]


The number of training batches (36) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.



Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

In [171]:
trainer.logged_metrics

{'loss': tensor(1.9101),
 'seq_len': tensor(65.6500),
 'valid/cpc_accuracy': tensor(0.5127)}

In [82]:
torch.save(seq_encoder.state_dict(), "cpc_enc_baseline_rosbank.pt")

**Измерим качество на тесте (catboost поверх эмбеддингов):**

In [None]:
# !wget "https://drive.google.com/uc?export=download&id=11j6QgNsdOSTK-GRaAJLKObDW7ehS_aqK" -O "cpc_enc_baseline_higher_trx_dim.pt"

In [172]:
encoder = cpc.seq_encoder

# state_dict = torch.load("./cpc_enc_baseline_higher_trx_dim.pt")
# encoder.load_state_dict(state_dict)

device = "cuda:0"

encoder.to(device)

RnnSeqEncoder(
  (trx_encoder): DenseCNN_Aggregator(
    (embeddings): ModuleDict(
      (MCC): NoisyEmbedding(
        342, 32, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (channel_type): NoisyEmbedding(
        7, 32, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (currency): NoisyEmbedding(
        60, 32, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (trx_category): NoisyEmbedding(
        11, 32, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
    )
    (custom_embeddings): ModuleDict(
      (amount): LogScaler()
    )
    (time2vec_days): Time2Vec()
    (conv): Conv1d(161, 164, kernel_size=(3,), stride=(1,), padding=(1,), bias=False)
    (dense_block): DenseBlock(
      (denselayer1): DenseLayer(
        (norm1): BatchNorm1d(164, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu1): ReLU(inplace=True)
        (conv1): Conv1d(164, 656, kernel_size

In [173]:
encoder.seq_encoder.is_reduce_sequence = True

In [174]:
from tqdm import tqdm

seed_everything(42)

In [175]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)
encoder.eval()
train_embeds = None

for i, batch in tqdm(enumerate(train_loader)):
    train_embeds_batch = encoder(batch.to(device))
    if i == 0:
        train_embeds = train_embeds_batch.detach().cpu().numpy()
    else:
        train_embeds = np.concatenate([train_embeds, train_embeds_batch.detach().cpu().numpy()], axis=0)
    
train_embeds

36it [00:06,  5.92it/s]


array([[-0.55423135, -0.4713629 ,  0.30485657, ..., -0.10632832,
         0.03440668, -0.18463232],
       [-0.6506817 , -0.43017188,  0.29139104, ..., -0.13631427,
         0.03676366, -0.25781617],
       [-0.65476453, -0.43191764,  0.29567474, ..., -0.13044037,
         0.04983218, -0.28245366],
       ...,
       [-0.6618838 , -0.3888579 ,  0.26701033, ..., -0.11278969,
         0.02218593, -0.3129979 ],
       [-0.6784419 , -0.37370533,  0.28008053, ..., -0.11247957,
         0.0271778 , -0.30572087],
       [-0.532878  , -0.4575818 ,  0.32681522, ..., -0.09228283,
         0.01946597, -0.1969865 ]], dtype=float32)

In [176]:
test_loader = inference_data_loader(data_test, num_workers=0, batch_size=128)
encoder.eval()
test_embeds = None

for i, batch in tqdm(enumerate(test_loader)):
    test_embeds_batch = encoder(batch.to(device))
    if i == 0:
        test_embeds = test_embeds_batch.detach().cpu().numpy()
    else:
        test_embeds = np.concatenate([test_embeds, test_embeds_batch.detach().cpu().numpy()], axis=0)
    
test_embeds

4it [00:00,  5.70it/s]


array([[-0.6484833 , -0.4312482 ,  0.27898225, ..., -0.12874314,
         0.03663243, -0.25170347],
       [-0.6611932 , -0.4059619 ,  0.28330123, ..., -0.10255931,
         0.04168191, -0.28185248],
       [-0.6595912 , -0.4155776 ,  0.28338367, ..., -0.12324023,
         0.0392938 , -0.26033437],
       ...,
       [-0.66761315, -0.3831195 ,  0.26771703, ..., -0.11199436,
         0.04495418, -0.3198047 ],
       [-0.66423106, -0.38383272,  0.26377693, ..., -0.11287197,
         0.01912919, -0.3171218 ],
       [ 0.00360363,  0.16999151,  0.02141134, ..., -0.27861762,
         0.01531055, -0.09115385]], dtype=float32)

In [177]:
clf = CatBoostClassifier(loss_function='MultiClass', task_type="GPU", devices='0', random_state=42)

clf.fit(train_embeds, target_train, plot_file="catboost_log.html")



Learning rate set to 0.088214
0:	learn: 0.6654016	total: 12.8ms	remaining: 12.8s
1:	learn: 0.6414507	total: 21.1ms	remaining: 10.5s
2:	learn: 0.6211504	total: 29.7ms	remaining: 9.88s
3:	learn: 0.6034350	total: 38.9ms	remaining: 9.67s
4:	learn: 0.5875963	total: 47.7ms	remaining: 9.49s
5:	learn: 0.5739781	total: 56.4ms	remaining: 9.35s
6:	learn: 0.5621629	total: 64.7ms	remaining: 9.17s
7:	learn: 0.5509868	total: 73.4ms	remaining: 9.1s
8:	learn: 0.5419222	total: 82.2ms	remaining: 9.06s
9:	learn: 0.5332639	total: 90.9ms	remaining: 9s
10:	learn: 0.5257256	total: 100ms	remaining: 8.99s
11:	learn: 0.5191152	total: 109ms	remaining: 8.95s
12:	learn: 0.5131732	total: 118ms	remaining: 8.93s
13:	learn: 0.5073500	total: 126ms	remaining: 8.89s
14:	learn: 0.5021015	total: 135ms	remaining: 8.88s
15:	learn: 0.4976393	total: 144ms	remaining: 8.86s
16:	learn: 0.4936615	total: 153ms	remaining: 8.86s
17:	learn: 0.4898188	total: 162ms	remaining: 8.85s
18:	learn: 0.4858349	total: 171ms	remaining: 8.83s
19:	l

<catboost.core.CatBoostClassifier at 0x7b56a93e6aa0>

In [178]:
test_pred = clf.predict(test_embeds)
test_proba = clf.predict_proba(test_embeds)[:, 1]

In [179]:
print("Accuracy:", accuracy_score(target_test, test_pred))
print("ROC-AUC:", roc_auc_score(target_test, test_proba))

Accuracy: 0.754
ROC-AUC: 0.8203202149876155


In [181]:
arr = np.array([0.8077900632983115, 0.811141150378009, 0.8203202149876155])

arr.mean(), arr.std()

(0.8130838095546453, 0.005296641630736041)

- CPC context embeds w/ Aug + Catboost (dim of trx embeds: 32):
  - `Accuracy: 0.752`, `0.748`, `0.742`, avg: `0.7473 +- 0.0041`
  - `ROC-AUC: 0.8051836622363244`, `0.8137313626135242`, `0.810639296757378`, avg: `0.8099 +- 0.0035`

---

- CPC context embeds + DenseCNN_Agg (1 layer) + Catboost:
  - Accuracy: `0.754`, `0.732`, `0.744`, avg: `0.7433 +- 0.009`
  - ROC-AUC: `0.813828495572356`, `0.8179242686697641`, `0.8243350439526639`, avg: `0.8187 +- 0.0043`

---

- CPC context embeds + DenseCNN_Agg (2 layers) + Catboost:
  - Accuracy: `0.75`, `0.746`, `0.742`, avg: `0.746 +- 0.0033`
  - ROC-AUC: `0.8118534587427757`, `0.8190412976963299`, `0.8084861828366061`, avg: `0.8131 +- 0.0044` 

---

- CPC context embeds + DenseCNN_Agg (4 layers) + Catboost:
  - Accuracy: `0.734`, `0.754`, `0.754`, avg: `0.7473 +- 0.0094`
  - ROC-AUC: `0.8077900632983115`, `0.811141150378009`, `0.8203202149876155`, avg: `0.8131 +- 0.0053`

---

**Вывод:** при глубине DenseBlock'а 1 для CPC accuracy - хуже, чем для бейзлайна, ROC-AUC - наоборот, значительно выше. С повышением глубины DenseBlock'а accuracy растёт - становится сравнимым с бейзлайном, ROC-AUC - становится хуже относительно конфигурации с глубиной DenseBlock'а 1, но всё ещё лучше, чем в случае бейзлайна.

# Итоги.

| Method                                     |    Accuracy           | ROC-AUC         |
|--------------------------------------------|-----------------------|-----------------|
| **Flattened Sequences**                    | 0.67 ± 0.0046         | 0.7536 ± 0.003  |
| **GRU (+ MLP)**                            | 0.746 ± 0.0076        | 0.8148 ± 0.0037 |
| **CoLES**                                  | 0.726 ± 0.0071        | 0.8076 ± 0.0025 |
| **COLES embeds + DenseCNN_Agg (1 layer)**  | 0.7293 ± 0.0068       | 0.8093 ± 0.0053 |
| **COLES embeds + DenseCNN_Agg (2 layers)** | 0.7333 ± 0.0159       | 0.8046 ± 0.0093 |
| **COLES embeds + DenseCNN_Agg (4 layers)** | 0.7327 ± 0.0075       | 0.8001 ± 0.0095 |
| **CPC Modeling**                           | 0.747 ± 0.0041        | 0.8099 ± 0.0035 |
| **CPC Modeling + DenseCNN_Agg (1 layer)**  | 0.7433 ± 0.009        | 0.8187 ± 0.0043 |
| **CPC Modeling + DenseCNN_Agg (2 layers)** | 0.746 ± 0.0033        | 0.8131 ± 0.0044 |
| **CPC Modeling + DenseCNN_Agg (4 layers)** | 0.7473 ± 0.0094       | 0.8131 ± 0.0053 |
| **TD-GPT**                                 | 0.73 ± 0.0049         | 0.7949 ± 0.0065 |