# Импортируем необходимые библиотеки

In [2]:
!pip install pytorch-lifestream
!pip install comet_ml



In [3]:
# data preprocessing
import os
import numpy as np
import pandas as pd
import pickle

# misc
from tqdm import tqdm
from functools import partial

# logging
import comet_ml

# classical ML
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from catboost import CatBoostClassifier

# basic deep learning libs
import torch
import pytorch_lightning as pl
import torchmetrics

# ptls
from ptls.nn import TrxEncoder, RnnSeqEncoder, TransformerEncoder, GptEncoder, Head
from ptls.frames import PtlsDataModule
from ptls.frames.coles import CoLESModule
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames.cpc import CpcModule
from ptls.frames.cpc import CpcDataset
from ptls.frames.gpt import GptDataset
from ptls.frames.supervised import SeqToTargetDataset, SequenceToTarget
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.datasets import inference_data_loader
from ptls.frames.inference_module import InferenceModule
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.preprocessing import PandasDataPreprocessor
from ptls.data_load.utils import collate_feature_dict
from ptls.frames.inference_module import InferenceModule

In [4]:
def seed_everything(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [6]:
comet_ml.login()

In [7]:
from pytorch_lightning.loggers import CometLogger

---

**Time2Vec:**

In [8]:
import torch
from ptls.data_load.padded_batch import PaddedBatch
from ptls.nn.trx_encoder.batch_norm import RBatchNorm, RBatchNormWithLens
from ptls.nn.trx_encoder.noisy_embedding import NoisyEmbedding
from ptls.nn.trx_encoder.trx_encoder_base import TrxEncoderBase
import torch.nn as nn


class Time2Vec(nn.Module):
    def __init__(self, k, interval=86400):
        super(Time2Vec, self).__init__()
        self.k = k
        self.w = nn.Parameter(torch.randn(k))
        self.b = nn.Parameter(torch.randn(k))
        self.w0 = nn.Parameter(torch.randn(1))
        self.b0 = nn.Parameter(torch.randn(1))
        self.interval = interval
        
    def forward(self, event_time, t0):
        t0_ = torch.zeros_like(event_time)
        time_diff=None
        if type(t0)!=int:
            first_column = t0[:, 0].unsqueeze(1)
            t0_ = first_column.expand(-1, t0.size(1))
        time_diff = (event_time - t0_)/self.interval
        v1 = self.w0 * time_diff.unsqueeze(-1) + self.b0
        v2 = torch.cos(self.w * time_diff.unsqueeze(-1) + self.b)
        
        return torch.cat([v1, v2], -1)

        
class TrxEncoderT2V(TrxEncoderBase):
    def __init__(self,
                 embeddings=None,
                 numeric_values=None,
                 custom_embeddings=None,
                 time_values=None,
                 embeddings_noise: float = 0,
                 norm_embeddings=None,
                 use_batch_norm=True,
                 use_batch_norm_with_lens=False,
                 clip_replace_value=None,
                 positions=None,
                 emb_dropout=0,
                 spatial_dropout=False,
                 orthogonal_init=False,
                 linear_projection_size=0,
                 out_of_index: str = 'clip',
                 k=2,
                 time_col='event_time'
                 ):
        if clip_replace_value is not None:
            warnings.warn('`clip_replace_value` attribute is deprecated. Always "clip to max" used. '
                          'Use `out_of_index="assert"` to avoid categorical values clip', DeprecationWarning)

        if positions is not None:
            warnings.warn('`positions` is deprecated. positions is not used', UserWarning)

        if embeddings is None:
            embeddings = {}
        if custom_embeddings is None:
            custom_embeddings = {}
        if time_values is None:
            time_values = {}

        noisy_embeddings = {}
        for emb_name, emb_props in embeddings.items():
            if emb_props.get('disabled', False):
                continue
            if emb_props['in'] == 0 or emb_props['out'] == 0:
                continue
            noisy_embeddings[emb_name] = NoisyEmbedding(
                num_embeddings=emb_props['in'],
                embedding_dim=emb_props['out'],
                padding_idx=0,
                max_norm=1 if norm_embeddings else None,
                noise_scale=embeddings_noise,
                dropout=emb_dropout,
                spatial_dropout=spatial_dropout,
            )

        super().__init__(
            embeddings=noisy_embeddings,
            numeric_values=numeric_values,
            custom_embeddings=custom_embeddings,
            out_of_index=out_of_index,
        )

        custom_embedding_size = self.custom_embedding_size
        if use_batch_norm and custom_embedding_size > 0:
            # :TODO: Should we use Batch norm with not-numerical custom embeddings?
            if use_batch_norm_with_lens:
                self.custom_embedding_batch_norm = RBatchNormWithLens(custom_embedding_size)
            else:
                self.custom_embedding_batch_norm = RBatchNorm(custom_embedding_size)
        else:
            self.custom_embedding_batch_norm = None
        
        self.k = k
        self.time2vec_days = Time2Vec(k=self.k)
        self.time_col = time_col
        
        if linear_projection_size > 0:
            self.linear_projection_head = torch.nn.Linear(super().output_size+k+1, linear_projection_size)
        else:
            self.linear_projection_head = None
            

        if orthogonal_init:
            for n, p in self.named_parameters():
                if n.startswith('embeddings.') and n.endswith('.weight'):
                    torch.nn.init.orthogonal_(p.data[1:])
                if n == 'linear_projection_head.weight':
                    torch.nn.init.orthogonal_(p.data)

    def forward(self, x: PaddedBatch):
        processed_embeddings = []
        processed_custom_embeddings = []

        for field_name in self.embeddings.keys():
            processed_embeddings.append(self.get_category_embeddings(x, field_name))
        
        for field_name in self.custom_embeddings.keys():
            processed_custom_embeddings.append(self.get_custom_embeddings(x, field_name))

        if len(processed_custom_embeddings):
            processed_custom_embeddings = torch.cat(processed_custom_embeddings, dim=2)
            if self.custom_embedding_batch_norm is not None:
                processed_custom_embeddings = PaddedBatch(processed_custom_embeddings, x.seq_lens)
                processed_custom_embeddings = self.custom_embedding_batch_norm(processed_custom_embeddings)
                processed_custom_embeddings = processed_custom_embeddings.payload
            processed_embeddings.append(processed_custom_embeddings)

        out = torch.cat(processed_embeddings, dim=2)

        time_encoded_days = self.time2vec_days(x.payload[self.time_col], x.payload[self.time_col])
        out = torch.cat((out, time_encoded_days), dim=2)

        if self.linear_projection_head is not None:
            out = self.linear_projection_head(out)
        return PaddedBatch(out, x.seq_lens)

    @property
    def output_size(self):
        """Returns hidden size of output representation
        """
        if self.linear_projection_head is not None:
            return self.linear_projection_head.out_features
        return super().output_size + self.k + 1

# Эксперименты.

**Данные:**

In [9]:
path_data = "https://huggingface.co/datasets/dllllb/rosbank-churn/resolve/main/train.csv.gz?download=true"
data = pd.read_csv(path_data, compression="gzip")
data

Unnamed: 0,PERIOD,cl_id,MCC,channel_type,currency,TRDATETIME,amount,trx_category,target_flag,target_sum
0,01/10/2017,0,5200,,810,21OCT17:00:00:00,5023.00,POS,0,0.0
1,01/10/2017,0,6011,,810,12OCT17:12:24:07,20000.00,DEPOSIT,0,0.0
2,01/12/2017,0,5921,,810,05DEC17:00:00:00,767.00,POS,0,0.0
3,01/10/2017,0,5411,,810,21OCT17:00:00:00,2031.00,POS,0,0.0
4,01/10/2017,0,6012,,810,24OCT17:13:14:24,36562.00,C2C_OUT,0,0.0
...,...,...,...,...,...,...,...,...,...,...
490508,01/04/2017,10176,6011,type1,810,24APR17:14:05:26,600.00,WD_ATM_ROS,1,405.0
490509,01/06/2017,10171,5411,type1,810,06JUN17:00:00:00,132.00,POS,0,0.0
490510,01/02/2017,10167,5541,type1,810,03FEB17:00:00:00,1000.00,POS,1,280428.2
490511,01/06/2017,10163,5941,type1,810,08JUN17:00:00:00,100.00,POS,0,0.0


In [10]:
target = data.groupby(by="cl_id").first().reset_index()[["cl_id", "target_flag"]]
target

Unnamed: 0,cl_id,target_flag
0,0,0
1,1,0
2,5,1
3,9,0
4,10,0
...,...,...
4995,10210,1
4996,10212,0
4997,10213,0
4998,10214,0


In [11]:
data.drop(columns=["PERIOD", "target_flag", "target_sum"], inplace=True)

In [12]:
target_train, target_test = train_test_split(target, test_size=0.1, stratify=target["target_flag"], random_state=42)

In [13]:
trx_data_train = pd.merge(data, target_train["cl_id"], on="cl_id", how="inner")
trx_data_test = pd.merge(data, target_test["cl_id"], on="cl_id", how="inner")

In [14]:
trx_data_train["channel_type"] = trx_data_train["channel_type"].fillna("none")
trx_data_test["channel_type"] = trx_data_test["channel_type"].fillna("none")

In [15]:
month2num = {"JAN": "/01/", "FEB": "/02/", "MAR": "/03/", "APR": "/04/", "MAY": "/05/", "JUN": "/06/",
             "JUL": "/07/", "AUG": "/08/", "SEP": "/09/", "OCT": "/10/", "NOV": "/11/", "DEC": "/12/"}

trx_data_train["TRDATETIME"] = trx_data_train["TRDATETIME"].map(lambda x: x[0:2] + month2num[x[2:5]] + x[5:7] + " " + x[8:])
trx_data_test["TRDATETIME"] = trx_data_test["TRDATETIME"].map(lambda x: x[0:2] + month2num[x[2:5]] + x[5:7] + " " + x[8:])

trx_data_train["TRDATETIME"] = pd.to_datetime(trx_data_train["TRDATETIME"],format='%d/%m/%y %H:%M:%S')
trx_data_test["TRDATETIME"] = pd.to_datetime(trx_data_test["TRDATETIME"],format='%d/%m/%y %H:%M:%S')

In [16]:
chtype2num = {"none": 0, "type1": 1, "type2": 2, "type3": 3, "type4": 4, "type5": 5}

trx_data_train["channel_type"] = trx_data_train["channel_type"].map(lambda x: chtype2num[x])
trx_data_test["channel_type"] = trx_data_test["channel_type"].map(lambda x: chtype2num[x])

In [17]:
trxcat2num = {"POS": 0, "DEPOSIT": 1, "WD_ATM_ROS": 2, "WD_ATM_PARTNER": 3, 
              "C2C_IN": 4, "WD_ATM_OTHER": 5, "C2C_OUT": 6, "BACK_TRX": 7,
              "CAT": 8, "CASH_ADV": 9}

trx_data_train["trx_category"] = trx_data_train["trx_category"].map(lambda x: trxcat2num[x])
trx_data_test["trx_category"] = trx_data_test["trx_category"].map(lambda x: trxcat2num[x])

---

**Квантизация непрерывных признаков (опциональный шаг, нужен только для GPT):**

In [17]:
def digitize(input_array: np.array, q_count: int = 1, bins: np.array = None):
    """Quantile-based discretization function.

    Parameters:
    -------
    input_array (np.array): Input array.
    q_count (int): Amount of quantiles. Used only if input parameter `bins` is None.
    bins (np.array):
        If None, then calculate bins as quantiles of input array,
        otherwise only apply bins to input_array. Default: None

    Returns
    -------
    out_array (np.array of ints): discretized input_array
    bins (np.array of floats):
        Returned only if input parameter `bins` is None.
    """

    if bins is None:
        return_bins = True
        bins = np.quantile(input_array, q=[i / q_count for i in range(1, q_count)], axis=0)
    else:
        return_bins = False

    out_array = np.digitize(input_array, bins)

    if return_bins:
        return out_array, bins
    else:
        return out_array

In [18]:
BINS_NUM = 128

In [19]:
numeric_features = ["amount"]

for feat in numeric_features:
    trx_data_train[feat], bins = digitize(trx_data_train[feat], q_count=BINS_NUM)
    trx_data_test[feat] = digitize(trx_data_test[feat], bins=bins)

In [20]:
import gc

gc.collect()

147

---

In [18]:
preprocessor = PandasDataPreprocessor(
    col_id="cl_id",
    col_event_time="TRDATETIME",
    event_time_transformation="dt_to_timestamp",
    cols_category=["MCC", "channel_type", "currency", "trx_category"],
    cols_numerical=["amount"],
    return_records=False,
)

In [19]:
data_train = preprocessor.fit_transform(trx_data_train)
data_test = preprocessor.transform(trx_data_test)

In [20]:
target_train.rename(columns={"target_flag": "target"}, inplace=True)
target_test.rename(columns={"target_flag": "target"}, inplace=True)
target_train.sort_values(by="cl_id", inplace=True)
target_test.sort_values(by="cl_id", inplace=True)
target_train = target_train["target"]
target_test = target_test["target"]
target_train.reset_index(drop=True, inplace=True)
target_test.reset_index(drop=True, inplace=True)

In [21]:
data_train = data_train.to_dict(orient="records")
data_test = data_test.to_dict(orient="records")

---

**Определение бинов для time diff'ов (в часах) (опциональный шаг, нужен только для TD-GPT):**

In [25]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)
SECONDS_IN_HOUR = 3600
TIME_DIFF_BINS = 256

time_diffs = []

for batch in tqdm(train_loader):
    timestamps = batch.payload['event_time']
    timestamps_prev = torch.cat([timestamps[:, 0].unsqueeze(1), timestamps[:, :-1]], dim=1)
    batch.payload['time_diff'] = (timestamps - timestamps_prev) // SECONDS_IN_HOUR
    batch.payload['time_diff'][:, 0] = -1

    mask = torch.arange(batch.payload['time_diff'].shape[1], device=batch.device)[None, :] + torch.ones((batch.seq_lens.shape[0], batch.payload['time_diff'].shape[1]), device=batch.device)
    mask[mask > batch.seq_lens[:, None]] = 0.
    mask[mask > 0.] = 1.
    mask = mask.bool()

    batch.payload['time_diff'][~mask] = -1
    
    time_diffs += [batch.payload['time_diff'][batch.payload['time_diff'] != -1].numpy()]
    
time_diffs = np.concatenate(time_diffs)

time_diff_bins = np.quantile(time_diffs, q=[(i / TIME_DIFF_BINS) for i in range(1, TIME_DIFF_BINS)], axis=0)

36it [00:00, 98.54it/s] 


In [26]:
time_diff_bins

array([  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   1.,   2.,   3.,
         4.,   5.,   6.,   7.,   7.,   8.,   9.,   

In [27]:
time_diff_bins = list(set(time_diff_bins.tolist()))
time_diff_bins.sort()
time_diff_bins = torch.tensor(time_diff_bins, dtype=torch.int)
time_diff_bins

tensor([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
         14,  15,  16,  17,  18,  20,  22,  24,  26,  31,  35,  38,  44,  48,
         54,  62,  72,  82,  96, 114, 120, 144, 168, 216, 300, 458],
       dtype=torch.int32)

In [28]:
TIME_DIFF_BINS_NUM = len(time_diff_bins)

TIME_DIFF_BINS_NUM

40

**Тест:**

In [29]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)
SECONDS_IN_HOUR = 3600

for batch in tqdm(train_loader):
    timestamps = batch.payload['event_time']
    timestamps_prev = torch.cat([timestamps[:, 0].unsqueeze(1), timestamps[:, :-1]], dim=1)
    batch.payload['time_diff'] = (timestamps - timestamps_prev) // SECONDS_IN_HOUR
    batch.payload['time_diff'][:, 0] = -1

    mask = torch.arange(batch.payload['time_diff'].shape[1], device=batch.device)[None, :] + torch.ones((batch.seq_lens.shape[0], batch.payload['time_diff'].shape[1]), device=batch.device)
    mask[mask > batch.seq_lens[:, None]] = 0.
    mask[mask > 0.] = 1.
    mask = mask.bool()

    batch.payload['time_diff'][~mask] = -1

    print(torch.bucketize(batch.payload['time_diff'], time_diff_bins, right=True))

20it [00:00, 93.69it/s]

tensor([[ 0, 37,  1,  ...,  0,  0,  0],
        [ 0,  1, 22,  ...,  0,  0,  0],
        [ 0, 12, 37,  ...,  0,  0,  0],
        ...,
        [ 0, 22,  1,  ...,  0,  0,  0],
        [ 0,  1,  1,  ...,  0,  0,  0],
        [ 0,  1, 22,  ...,  0,  0,  0]])
tensor([[ 0,  1,  1,  ...,  0,  0,  0],
        [ 0,  1, 22,  ...,  0,  0,  0],
        [ 0,  1, 22,  ...,  0,  0,  0],
        ...,
        [ 0,  2,  1,  ...,  0,  0,  0],
        [ 0,  1,  1,  ...,  0,  0,  0],
        [ 0,  1, 39,  ...,  0,  0,  0]])
tensor([[ 0, 22,  1,  ...,  0,  0,  0],
        [ 0,  1, 22,  ...,  0,  0,  0],
        [ 0, 22,  1,  ...,  0,  0,  0],
        ...,
        [ 0, 11, 21,  ...,  0,  0,  0],
        [ 0,  1, 22,  ...,  0,  0,  0],
        [ 0,  1, 32,  ...,  0,  0,  0]])
tensor([[ 0, 22, 17,  ...,  0,  0,  0],
        [ 0, 40, 39,  ...,  0,  0,  0],
        [ 0, 28,  1,  ...,  0,  0,  0],
        ...,
        [ 0,  1, 31,  ...,  0,  0,  0],
        [ 0, 40, 39,  ...,  0,  0,  0],
        [ 0, 19,  6,  ...

36it [00:00, 95.76it/s]

tensor([[ 0,  1, 28,  ...,  0,  0,  0],
        [ 0,  1, 15,  ...,  0,  0,  0],
        [ 0,  1,  1,  ...,  0,  0,  0],
        ...,
        [ 0,  1, 11,  ...,  0,  0,  0],
        [ 0, 14, 40,  ...,  0,  0,  0],
        [ 0,  1, 12,  ...,  0,  0,  0]])
tensor([[ 0,  8,  1,  ...,  0,  0,  0],
        [ 0, 21, 39,  ...,  0,  0,  0],
        [ 0,  1,  1,  ...,  0,  0,  0],
        ...,
        [ 0,  1,  1,  ...,  0,  0,  0],
        [ 0,  1, 15,  ...,  0,  0,  0],
        [ 0,  1,  1,  ...,  0,  0,  0]])
tensor([[ 0, 17,  8,  ...,  0,  0,  0],
        [ 0, 22, 28,  ...,  0,  0,  0],
        [ 0,  1, 10,  ...,  0,  0,  0],
        ...,
        [ 0,  1,  1,  ...,  0,  0,  0],
        [ 0,  1, 18,  ...,  0,  0,  0],
        [ 0,  1,  1,  ...,  0,  0,  0]])
tensor([[ 0, 22,  1,  ...,  0,  0,  0],
        [ 0,  1,  1,  ...,  0,  0,  0],
        [ 0,  1,  1,  ...,  0,  0,  0],
        ...,
        [ 0,  1, 10,  ...,  0,  0,  0],
        [ 0,  1,  1,  ...,  0,  0,  0],
        [ 0,  1, 17,  ...




---

**Convolution Aggregator Class:**

In [22]:
from ptls.data_load.padded_batch import PaddedBatch
import torch.nn as nn


class ConvAggregator(TrxEncoderT2V):
    """The NN layer, a combination of TrxEncoder and Conv Layer (a window of #`agg_samples` transactions) 
       (works like nn.Sequential([TrxEncoder, Conv Window Aggregation])).
       
       The types of the input and output are `PaddedBatch` of shapes (B, L, T) and (B, L', T) respectively, where 
       B means batch_size,
       L/L' means the max length of a sequence of transactions in a batch (the length is the same as #trx)
       T means the dimension of a single transaction.

       Parameters
        agg_samples (int):
            The number of transactions in a sliding aggregation window (conv layer).

        use_window_attention (bool):
            If True, the attention layer will be applied to transactions in a sliding window before pooling.

        k (int):
            Number of periodic components in T2V time embeddings

        time_col (str):
            Name of the time column in data
            
        embeddings:
            You can find info about this param in TrxEncoder desc.
        
        numeric_values:
            You can find info about this param in TrxEncoder desc.

        embeddings_noise:
            You can find info about this param in TrxEncoder desc.
            
        emb_dropout:
            You can find info about this param in TrxEncoder desc.
            
        spatial_dropout:
            You can find info about this param in TrxEncoder desc.

        use_batch_norm:
            You can find info about this param in TrxEncoder desc.

        orthogonal_init:
            You can find info about this param in TrxEncoder desc.
            
        linear_projection_size:
            You can find info about this param in TrxEncoder desc.

        out_of_index:
            You can find info about this param in TrxEncoder desc.

        norm_embeddings:
            Keep default value for this parameter
        
        clip_replace_value:
            Not used. Keep default value for this parameter
        
        positions: 
            Not used. Keep default value for this parameter
       """

    def __init__(self,
                 agg_samples=3,
                 use_window_attention=False,
                 embeddings=None,
                 numeric_values=None,
                 custom_embeddings=None,
                 time_values=None,
                 embeddings_noise: float = 0,
                 norm_embeddings=None,
                 use_batch_norm=False,
                 use_batch_norm_with_lens=False,
                 clip_replace_value=None,
                 positions=None,
                 emb_dropout=0,
                 spatial_dropout=False,
                 orthogonal_init=False,
                 linear_projection_size=0,
                 out_of_index: str = 'clip',
                 k=2,
                 time_col='event_time'
                ):
        
        super().__init__(
            embeddings=embeddings,
            numeric_values=numeric_values,
            custom_embeddings=custom_embeddings,
            embeddings_noise=embeddings_noise,
            norm_embeddings=norm_embeddings,
            use_batch_norm=use_batch_norm,
            use_batch_norm_with_lens=use_batch_norm_with_lens,
            clip_replace_value=clip_replace_value,
            positions=positions,
            emb_dropout=emb_dropout,
            spatial_dropout=spatial_dropout,
            orthogonal_init=orthogonal_init,
            linear_projection_size=linear_projection_size,
            out_of_index=out_of_index,
            k=k,
            time_col=time_col
        )

        self.agg_samples = agg_samples

        channels = super().output_size

        self.conv = nn.Conv1d(in_channels=channels, out_channels=channels, kernel_size=self.agg_samples, padding=(self.agg_samples - 1), bias=False) # (B, T, L)

        self.use_window_attention = use_window_attention
        if self.use_window_attention:
            pass # Not Implemented

    def forward(self, pb: PaddedBatch):
        embeds = super().forward(pb)

        mask = torch.arange(embeds.payload.shape[1], device=embeds.device)[None, :] + torch.ones((embeds.seq_lens.shape[0], embeds.payload.shape[1]), device=embeds.device)
        mask[mask > embeds.seq_lens[:, None]] = 0.
        mask[mask > 0.] = 1.
        mask = mask[:, :, None]
    
        masked_embeds = embeds.payload * mask
    
        if self.use_window_attention:
            pass # Not Implemented
    
        agg_embeds = torch.transpose(self.conv(torch.transpose(masked_embeds, 1, 2)), 1, 2)

        new_seq_lens = embeds.seq_lens + self.agg_samples - 1

        return PaddedBatch(agg_embeds, new_seq_lens)

**Test:**

In [114]:
# seed_everything(0)

In [115]:
# device = "cuda:0"

In [124]:
# agg_encoder_params = dict(
#     embeddings={
#         "MCC": {"in": 342, "out": 8},
#         "channel_type": {"in": 7, "out": 8},
#         "currency": {"in": 60, "out": 8},
#         "trx_category": {"in": 11, "out": 8}            
#     },
#     numeric_values={"amount": "log"},
#     embeddings_noise=0.003,
#     k=7,
#     time_col="event_time",
#     agg_samples=3,
#     use_window_attention=False
# )

# trx_encoder = ConvAggregator(**agg_encoder_params).to(device)

In [125]:
# trx_encoder.eval()

# train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)

# for i, batch in tqdm(enumerate(train_loader)):
#     batch = batch.to(device)
#     embeds_batch = trx_encoder(batch)

#     # if i == 0:
#     #     # print(batch.payload)
#     #     print(batch.seq_lens)
#     #     print()
#     #     print(embeds_batch.payload[31, 2])
#     #     print()
#     #     print(embeds_batch.payload.shape)
#     #     print()
#     #     print(embeds_batch.seq_lens)

36it [00:00, 83.04it/s]


---

**Train sequences lengths check:**

In [24]:
agg_encoder_params = dict(
    embeddings={
        "MCC": {"in": 342, "out": 8},
        "channel_type": {"in": 7, "out": 8},
        "currency": {"in": 60, "out": 8},
        "trx_category": {"in": 11, "out": 8}            
    },
    numeric_values={"amount": "log"},
    embeddings_noise=0.003,
    k=7,
    time_col="event_time",
    agg_samples=9, # 3, 5, 7, 9
    use_window_attention=False
)

trx_encoder = ConvAggregator(**agg_encoder_params)
trx_encoder.to("cuda")

TrxEncoderT2V(
  (embeddings): ModuleDict(
    (MCC): NoisyEmbedding(
      342, 8, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
    (channel_type): NoisyEmbedding(
      7, 8, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
    (currency): NoisyEmbedding(
      60, 8, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
    (trx_category): NoisyEmbedding(
      11, 8, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
  )
  (custom_embeddings): ModuleDict(
    (amount): LogScaler()
  )
  (custom_embedding_batch_norm): RBatchNorm(
    (bn): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (time2vec_days): Time2Vec()
)

In [27]:
# train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)

# trx_encoder.eval()

# min_len = np.inf
# max_len = 0

# for batch in tqdm(train_loader):
#     embeds_batch = trx_encoder(batch.to("cuda"))
#     seq_lens = embeds_batch.seq_lens
#     min_len = min(min_len, seq_lens.min())
#     max_len = max(max_len, seq_lens.max())

# print("Min Length:", min_len.item())
# print("Max Length:", max_len.item())

In [25]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)

trx_encoder.eval()

seq_lens = []

for batch in tqdm(train_loader):
    embeds_batch = trx_encoder(batch.to("cuda"))
    seq_lens += [embeds_batch.seq_lens.detach().cpu().numpy()]

seq_lens = np.concatenate(seq_lens)

threshold = int(np.quantile(seq_lens, 0.75) * 0.7)

print("Max Length:", threshold)

36it [00:00, 54.68it/s]

Max Length: 100





---

# Convolution Aggregation (Mean Pooling) 

- **COLES:**

In [98]:
seed_everything(42)

**DataLoaders:**

In [99]:
data = PtlsDataModule(
    train_data=ColesDataset(
        MemoryMapDataset(
            data=data_train,
            i_filters=[SeqLenFilter(min_seq_len=10)],
        ),
        splitter=SampleSlices(
            split_count=5,
            cnt_min=5,
            cnt_max=106,
        ),
    ),
    train_num_workers=4,
    train_batch_size=128,
    valid_data=ColesDataset(
        MemoryMapDataset(
            data=data_test,
            i_filters=[SeqLenFilter(min_seq_len=10)],
        ),
        splitter=SampleSlices(
            split_count=5,
            cnt_min=5,
            cnt_max=106,
        ),
    ),
    valid_num_workers=4,
    valid_batch_size=128
)

**Модель:**

In [100]:
N_EPOCHS = 20

In [101]:
agg_encoder_params = dict(
    embeddings={
        "MCC": {"in": 342, "out": 8},
        "channel_type": {"in": 7, "out": 8},
        "currency": {"in": 60, "out": 8},
        "trx_category": {"in": 11, "out": 8}            
    },
    numeric_values={"amount": "log"},
    embeddings_noise=0.003,
    k=7,
    time_col="event_time",
    agg_samples=9, # 3, 5, 7, 9
    use_window_attention=False
)

trx_encoder = ConvAggregator(**agg_encoder_params)

seq_encoder = RnnSeqEncoder(
    trx_encoder=trx_encoder,
    hidden_size=512,
    type="gru"
)

coles = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=1e-3, weight_decay=0),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.CosineAnnealingLR, T_max=N_EPOCHS, eta_min=5e-6)
)

**Обучение:**

In [102]:
logger = CometLogger(project_name="evs-ssl-rb", experiment_name="CoLES_ConvAgg (9 trx)")

trainer = pl.Trainer(
    logger=logger,
    max_epochs=N_EPOCHS,
    accelerator="gpu",
    devices=1,
    enable_progress_bar=True
)

In [103]:
trainer.fit(coles, data)

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/askoro/evs-ssl-rb/1a3313e328864cbeaca5da2da6383cdb

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/kaggle/working' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]


The number of training batches (33) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.



Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : CoLES_ConvAgg (9 trx)
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/askoro/evs-ssl-rb/1a3313e328864cbeaca5da2da6383cdb
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     loss [79]               : (82.41187286376953, 833.46533203125)
[1;38;5;39mCOMET INFO:[0m     seq_len [13]            : (37.62343978881836, 44.15937423706055)
[1;38;5;39mCOMET INFO:[0m     valid/recall_top_k [20] : (0.37701478600502014, 0.6986173391342163)
[1;38;5;39mCOMET INFO:[0m   Others:
[1;38;5;39mCOMET INFO:

In [104]:
trainer.logged_metrics

{'loss': tensor(74.5748),
 'seq_len': tensor(36.1508),
 'valid/recall_top_k': tensor(0.6986)}

In [28]:
torch.save(seq_encoder.state_dict(), "coles_enc_baseline_rosbank.pt")

**Измерим качество на тесте (catboost поверх эмбеддингов):**

In [None]:
# !wget "https://drive.google.com/uc?export=download&id=1Mn8o9IPT4Zzg3946orbw1MVZwpkrBoNb" -O "coles_enc_baseline.pt"

In [105]:
encoder = coles.seq_encoder

# state_dict = torch.load("./coles_enc_baseline.pt")
# encoder.load_state_dict(state_dict)

device = "cuda:0"

encoder.to(device)

RnnSeqEncoder(
  (trx_encoder): ConvAggregator(
    (embeddings): ModuleDict(
      (MCC): NoisyEmbedding(
        342, 8, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (channel_type): NoisyEmbedding(
        7, 8, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (currency): NoisyEmbedding(
        60, 8, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (trx_category): NoisyEmbedding(
        11, 8, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
    )
    (custom_embeddings): ModuleDict(
      (amount): LogScaler()
    )
    (time2vec_days): Time2Vec()
    (conv): Conv1d(41, 41, kernel_size=(9,), stride=(1,), padding=(8,), bias=False)
  )
  (seq_encoder): RnnEncoder(
    (rnn): GRU(41, 512, batch_first=True)
    (reducer): LastStepEncoder()
  )
)

In [106]:
from tqdm import tqdm

seed_everything(42)

In [107]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)
encoder.eval()
train_embeds = None

for i, batch in tqdm(enumerate(train_loader)):
    train_embeds_batch = encoder(batch.to(device))
    if i == 0:
        train_embeds = train_embeds_batch.detach().cpu().numpy()
    else:
        train_embeds = np.concatenate([train_embeds, train_embeds_batch.detach().cpu().numpy()], axis=0)
    
train_embeds

36it [00:01, 20.52it/s]


array([[-1.0824701e-03,  3.6156092e-02,  4.9616605e-01, ...,
        -6.6765922e-01, -6.6863620e-01, -6.6652589e-02],
       [-2.2565003e-01,  1.8386401e-01,  9.2669201e-01, ...,
        -8.5067677e-01, -8.9130658e-01, -2.4642639e-01],
       [-4.0571715e-04, -9.7591341e-02,  7.8610992e-01, ...,
        -8.8209867e-01, -8.8305283e-01, -2.8033805e-01],
       ...,
       [ 5.4789573e-01, -1.3142693e-01,  7.6335484e-01, ...,
        -7.8219056e-01, -8.5901862e-01, -3.4716463e-01],
       [ 8.0102092e-01, -3.9637855e-01,  8.5910261e-01, ...,
        -4.8154429e-01, -8.7772948e-01, -3.8154456e-01],
       [ 7.0200098e-01, -2.9782823e-01,  7.8033710e-01, ...,
        -5.8043742e-01, -6.5850538e-01, -3.9665264e-01]], dtype=float32)

In [108]:
test_loader = inference_data_loader(data_test, num_workers=0, batch_size=128)
encoder.eval()
test_embeds = None

for i, batch in tqdm(enumerate(test_loader)):
    test_embeds_batch = encoder(batch.to(device))
    if i == 0:
        test_embeds = test_embeds_batch.detach().cpu().numpy()
    else:
        test_embeds = np.concatenate([test_embeds, test_embeds_batch.detach().cpu().numpy()], axis=0)
    
test_embeds

4it [00:00, 20.71it/s]


array([[-0.40181643,  0.19573915,  0.8372935 , ..., -0.8428872 ,
        -0.8171789 , -0.14034553],
       [-0.12110938, -0.02542056,  0.677119  , ..., -0.8467641 ,
        -0.8511904 , -0.05373751],
       [ 0.19229183,  0.05307745,  0.9078966 , ..., -0.75695515,
        -0.86165637, -0.2595576 ],
       ...,
       [ 0.71326053, -0.6491342 ,  0.78142303, ..., -0.7998652 ,
        -0.85036147, -0.39598182],
       [ 0.81370497, -0.4632678 ,  0.8471019 , ..., -0.65986204,
        -0.8839792 , -0.2833767 ],
       [ 0.04184141, -0.3152523 ,  0.8863836 , ..., -0.8096399 ,
        -0.8407475 , -0.26678282]], dtype=float32)

In [109]:
clf = CatBoostClassifier(loss_function='MultiClass', task_type="GPU", devices='0', random_state=42)

clf.fit(train_embeds, target_train, plot_file="catboost_log.html")

Learning rate set to 0.088214
0:	learn: 0.6720247	total: 9.87ms	remaining: 9.86s
1:	learn: 0.6530918	total: 16.6ms	remaining: 8.28s
2:	learn: 0.6372498	total: 23.2ms	remaining: 7.72s
3:	learn: 0.6226935	total: 29.8ms	remaining: 7.42s
4:	learn: 0.6096742	total: 35.7ms	remaining: 7.11s
5:	learn: 0.5986925	total: 41.8ms	remaining: 6.92s
6:	learn: 0.5878902	total: 47.8ms	remaining: 6.78s
7:	learn: 0.5788606	total: 53.7ms	remaining: 6.66s
8:	learn: 0.5708397	total: 59.7ms	remaining: 6.58s
9:	learn: 0.5638613	total: 65.7ms	remaining: 6.5s
10:	learn: 0.5575060	total: 71.8ms	remaining: 6.45s
11:	learn: 0.5515281	total: 77.7ms	remaining: 6.4s
12:	learn: 0.5455856	total: 83.8ms	remaining: 6.36s
13:	learn: 0.5409140	total: 89.9ms	remaining: 6.33s
14:	learn: 0.5368172	total: 96ms	remaining: 6.3s
15:	learn: 0.5322783	total: 102ms	remaining: 6.26s
16:	learn: 0.5281403	total: 108ms	remaining: 6.23s
17:	learn: 0.5242839	total: 114ms	remaining: 6.21s
18:	learn: 0.5207573	total: 120ms	remaining: 6.21s
1

<catboost.core.CatBoostClassifier at 0x7aea4024f2e0>

In [110]:
test_pred = clf.predict(test_embeds)
test_proba = clf.predict_proba(test_embeds)[:, 1]

In [111]:
print("Accuracy:", accuracy_score(target_test, test_pred))
print("ROC-AUC:", roc_auc_score(target_test, test_proba))

Accuracy: 0.716
ROC-AUC: 0.7953894222207831


In [113]:
arr = np.array([0.8068349225364653, 0.7961502970649658, 0.7953894222207831])

arr.mean(), arr.std()

(0.7994582139407381, 0.00522536154486522)

- COLES embeds + Catboost:
  - `Accuracy: 0.736`, `0.72`, `0.722`, avg: `0.726 +- 0.0071` 
  -  `ROC-AUC: 0.8099107995661394`, `0.8041475773421184`, `0.8088423370189894`, avg: `0.8076 +- 0.0025`

---

- COLES embeds + ConvAgg (3 trx) + Catboost:
  - Accuracy: `0.74`, `0.732`, `0.724`, avg: `0.732 +- 0.0065`
  - ROC-AUC: `0.8146379368959544`, `0.8009583785271407`, `0.8000032377652944`, avg: `0.8051 +- 0.0067`

---

- COLES embeds + ConvAgg (5 trx) + Catboost:
  - Accuracy: `0.746`, `0.73`, `0.706`, avg: `0.7273 +- 0.0164`
  - ROC-AUC: `0.8167262955108383`, `0.7980605785886579`, `0.7902251865762251`, avg: `0.8017 +- 0.0111`

---

- COLES embeds + ConvAgg (7 trx) + Catboost:
  - Accuracy: `0.758`, `0.728`, `0.732`, avg: `0.7393 +- 0.0133`
  - ROC-AUC: `0.8269576338411229`, `0.7956646322708065`, `0.8140389503164915`, avg: `0.8122 +- 0.0128`

---

- COLES embeds + ConvAgg (9 trx) + Catboost:
  - Accuracy: `0.724`, `0.722`, `0.716`, avg: `0.7207 +- 0.0034`
  - ROC-AUC: `0.8068349225364653`, `0.7961502970649658`, `0.7953894222207831`, avg: `0.7995 +- 0.0052`

---

**Вывод:** для CoLES агрегация свёртками в целом приводит к повышению качества, причём больше это проявляется по accuracy. Лучший результат достигается при агрегации свёрточным слоем с ядром свёртки размера 7, после чего результат становится лишь хуже.

**Лучший результат:**  

- COLES embeds + ConvAgg (7 trx) + Catboost:
  - Accuracy: `0.758`, `0.728`, `0.732`, avg: `0.7393 +- 0.0133`
  - ROC-AUC: `0.8269576338411229`, `0.7956646322708065`, `0.8140389503164915`, avg: `0.8122 +- 0.0128`

---

**Train sequences lengths check:**

In [74]:
agg_encoder_params = dict(
    embeddings={
        "MCC": {"in": 342, "out": 8},
        "channel_type": {"in": 7, "out": 8},
        "currency": {"in": 60, "out": 8},
        "trx_category": {"in": 11, "out": 8}            
    },
    numeric_values={"amount": "log"},
    embeddings_noise=0.003,
    k=7,
    time_col="event_time",
    agg_samples=9, # 3, 5, 7, 9
    use_window_attention=False
)

trx_encoder = ConvAggregator(**agg_encoder_params)
trx_encoder.to("cuda")

ConvAggregator(
  (embeddings): ModuleDict(
    (MCC): NoisyEmbedding(
      342, 8, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
    (channel_type): NoisyEmbedding(
      7, 8, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
    (currency): NoisyEmbedding(
      60, 8, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
    (trx_category): NoisyEmbedding(
      11, 8, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
  )
  (custom_embeddings): ModuleDict(
    (amount): LogScaler()
  )
  (time2vec_days): Time2Vec()
  (conv): Conv1d(41, 41, kernel_size=(9,), stride=(1,), padding=(8,), bias=False)
)

In [76]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)

trx_encoder.eval()

seq_lens = []

for batch in tqdm(train_loader):
    embeds_batch = trx_encoder(batch.to("cuda"))
    seq_lens += [embeds_batch.seq_lens.detach().cpu().numpy()]

seq_lens = np.concatenate(seq_lens)

threshold = int(np.quantile(seq_lens, 0.6))

print("Max Length:", threshold)

36it [00:00, 103.49it/s]

Max Length: 91





---

- **CPC modeling:**

---

**Скорректируем класс CpcModule так, чтобы при работе CPC не было даталиков:**

In [25]:
import torch
from torch import nn as nn
from torch.nn import functional as F


class CPC_ShiftedLoss(nn.Module):
    def __init__(self, n_negatives=None, n_forward_steps=None, shift=None):
        super().__init__()
        self.n_negatives = n_negatives
        self.n_forward_steps = n_forward_steps
        self.shift = shift

    def _get_preds(self, base_embeddings, mapped_ctx_embeddings):
        batch_size, max_seq_len, emb_size = base_embeddings.payload.shape
        _, _, _, n_forward_steps = mapped_ctx_embeddings.payload.shape
        seq_lens = mapped_ctx_embeddings.seq_lens
        device = mapped_ctx_embeddings.payload.device

        len_mask = torch.arange(max_seq_len).unsqueeze(0).expand(batch_size, -1).to(device)
        len_mask = (len_mask < seq_lens.unsqueeze(1).expand(-1, max_seq_len)).float()
        
        possible_negatives = base_embeddings.payload.reshape(batch_size * max_seq_len, emb_size)

        mask = len_mask.unsqueeze(0).expand(batch_size, *len_mask.shape).clone()

        mask = mask.reshape(batch_size, -1)
        sample_ids = torch.multinomial(mask, self.n_negatives)
        neg_samples = possible_negatives[sample_ids]

        positive_preds, neg_preds = [], []
        len_mask_exp = len_mask.unsqueeze(-1).unsqueeze(-1).to(device).expand(-1, -1, emb_size, n_forward_steps)
        trimmed_mce = mapped_ctx_embeddings.payload.mul(len_mask_exp)  # zero context vectors by sequence lengths
        for i in range(1, n_forward_steps + 1):
            ce_i = trimmed_mce[:, 0:(max_seq_len - i - self.shift), :, i - 1]
            be_i = base_embeddings.payload[:, (i + self.shift):max_seq_len]

            positive_pred_i = ce_i.mul(be_i).sum(axis=-1)
            positive_preds.append(positive_pred_i)

            neg_pred_i = ce_i.matmul(neg_samples.transpose(-2, -1))
            neg_preds.append(neg_pred_i)

        return positive_preds, neg_preds

    def forward(self, embeddings, _):
        base_embeddings, _, mapped_ctx_embeddings = embeddings
        device = mapped_ctx_embeddings.payload.device
        positive_preds, neg_preds = self._get_preds(base_embeddings, mapped_ctx_embeddings)

        step_losses = []
        for positive_pred_i, neg_pred_i in zip(positive_preds, neg_preds):
            step_loss = -F.log_softmax(torch.cat([positive_pred_i.unsqueeze(-1), neg_pred_i], dim=-1), dim=-1)[:, :, 0].mean()
            step_losses.append(step_loss)

        loss = torch.stack(step_losses).mean()
        return loss

    def cpc_accuracy(self, embeddings, _):
        base_embeddings, _, mapped_ctx_embeddings = embeddings
        positive_preds, neg_preds = self._get_preds(base_embeddings, mapped_ctx_embeddings)

        batch_size, max_seq_len, emb_size = base_embeddings.payload.shape
        seq_lens = mapped_ctx_embeddings.seq_lens
        device = mapped_ctx_embeddings.payload.device

        len_mask = torch.arange(max_seq_len).unsqueeze(0).expand(batch_size, -1).to(device)
        len_mask = (len_mask < seq_lens.unsqueeze(1).expand(-1, max_seq_len)).float()

        total, accurate = 0, 0
        
        for i, (positive_pred_i, neg_pred_i) in enumerate(zip(positive_preds, neg_preds)):
            i_mask = len_mask[:, (self.shift + i + 1):max_seq_len].to(device)
            total += i_mask.sum().item()
            accurate += (((positive_pred_i.unsqueeze(-1).expand(*neg_pred_i.shape) > neg_pred_i) \
                          .sum(dim=-1) == self.n_negatives) * i_mask).sum().item()
        return accurate / total

In [26]:
import torch

from ptls.frames.abs_module import ABSModule
from ptls.frames.cpc.metrics.cpc_accuracy import CpcAccuracy
from ptls.nn.seq_encoder import RnnSeqEncoder
from ptls.data_load.padded_batch import PaddedBatch


class CpcModule(ABSModule):
    """Contrastive Predictive Coding ([CPC](https://arxiv.org/abs/1807.03748))

    Original sequence are encoded by `TrxEncoder`.
    Hidden representation `z` is an embedding for each individual transaction.
    Next `RnnEncoder` used for `context` calculation from `z`.
    Linear predictors are used to predict next trx embedding by context.
    The loss function tends to make future trx embedding and they predict closer.
    Negative sampling are used to avoid trivial solution.

    Parameters
        seq_encoder:
            Model which calculate embeddings for original raw transaction sequences
            `seq_encoder` is trained by `CoLESModule` to get better representations of input sequences
        head:
            Not used
        loss:
            Keep None. CPCLoss used by default
        validation_metric:
            Keep None. CPCAccuracy used by default
        optimizer_partial:
            optimizer init partial. Network parameters are missed.
        lr_scheduler_partial:
            scheduler init partial. Optimizer are missed.

    """
    def __init__(self, validation_metric=None,
                       seq_encoder=None,
                       head=None,
                       n_negatives=40, n_forward_steps=6,
                       optimizer_partial=None,
                       lr_scheduler_partial=None):

        self.save_hyperparameters('n_negatives', 'n_forward_steps')

        loss = CPC_ShiftedLoss(n_negatives=n_negatives, n_forward_steps=n_forward_steps, shift=(seq_encoder.trx_encoder.agg_samples - 1))

        if validation_metric is None:
            validation_metric = CpcAccuracy(loss)

        if seq_encoder is not None and not isinstance(seq_encoder, RnnSeqEncoder):
            raise NotImplementedError(f'Only rnn encoder supported in CpcModule. Found {type(seq_encoder)}')

        seq_encoder.seq_encoder.is_reduce_sequence = False

        super().__init__(validation_metric,
                         seq_encoder,
                         loss,
                         optimizer_partial,
                         lr_scheduler_partial)

        linear_size = self.seq_encoder.trx_encoder.output_size
        embedding_size = self.seq_encoder.embedding_size
        self._linears = torch.nn.ModuleList([torch.nn.Linear(embedding_size, linear_size)
                                             for _ in range(loss.n_forward_steps)])

    @property
    def metric_name(self):
        return 'cpc_accuracy'

    @property
    def is_requires_reduced_sequence(self):
        return False

    def shared_step(self, x, y):
        trx_encoder = self._seq_encoder.trx_encoder
        seq_encoder = self._seq_encoder.seq_encoder

        base_embeddings = trx_encoder(x)
        context_embeddings = seq_encoder(base_embeddings)

        me = []
        for l in self._linears:
            me.append(l(context_embeddings.payload))
        mapped_ctx_embeddings = PaddedBatch(torch.stack(me, dim=3), context_embeddings.seq_lens)

        return (base_embeddings, context_embeddings, mapped_ctx_embeddings), y

---

In [107]:
seed_everything(42)

**DataLoaders:**

In [108]:
data = PtlsDataModule(
    train_data=CpcDataset(
        MemoryMapDataset(data=data_train),
        min_len=91,             
        max_len=115
    ),
    train_num_workers=4,
    train_batch_size=128,
    valid_data=CpcDataset(
        MemoryMapDataset(data=data_test),
        min_len=91,
        max_len=115
    ),
    valid_num_workers=4,
    valid_batch_size=128
)

**Модель:**

In [109]:
N_EPOCHS = 20

In [110]:
agg_encoder_params = dict(
    embeddings={
        "MCC": {"in": 342, "out": 8}, # 8 / 32
        "channel_type": {"in": 7, "out": 8},
        "currency": {"in": 60, "out": 8},
        "trx_category": {"in": 11, "out": 8}            
    },
    numeric_values={"amount": "log"},
    embeddings_noise=0.003,
    k=7,
    time_col="event_time",
    agg_samples=9, # 3, 5, 7, 9
    use_window_attention=False
)

trx_encoder = ConvAggregator(**agg_encoder_params)

seq_encoder = RnnSeqEncoder(
    trx_encoder=trx_encoder,
    hidden_size=512,
    type="gru"
)

cpc = CpcModule(
    seq_encoder=seq_encoder,
    n_forward_steps=6,
    n_negatives=40,
    optimizer_partial=partial(torch.optim.Adam, lr=5e-4),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=5, gamma=0.5)
)

**Обучение:**

In [111]:
logger = CometLogger(project_name="evs-ssl-rb", experiment_name="CPC_modeling_ConvAgg (9 trx, emb_dim=8)")

trainer = pl.Trainer(
    logger=logger,
    max_epochs=N_EPOCHS,
    accelerator="gpu",
    devices=1,
    enable_progress_bar=True
)

In [112]:
trainer.fit(cpc, data)

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/askoro/evs-ssl-rb/5de9c2140d304bcca55f8007fc01fb8d

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/kaggle/working' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]


The number of training batches (36) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.



Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : CPC_modeling_ConvAgg (9 trx, emb_dim=8)
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/askoro/evs-ssl-rb/5de9c2140d304bcca55f8007fc01fb8d
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     loss [86]               : (2.025022506713867, 4.775966644287109)
[1;38;5;39mCOMET INFO:[0m     seq_len [14]            : (63.234375, 72.09375)
[1;38;5;39mCOMET INFO:[0m     valid/cpc_accuracy [20] : (0.07966876775026321, 0.31242746114730835)
[1;38;5;39mCOMET INFO:[0m   Others:
[1;38;5;39mCOMET I

In [113]:
trainer.logged_metrics

{'loss': tensor(2.0746),
 'seq_len': tensor(77.6500),
 'valid/cpc_accuracy': tensor(0.3099)}

In [82]:
torch.save(seq_encoder.state_dict(), "cpc_enc_baseline_rosbank.pt")

**Измерим качество на тесте (catboost поверх эмбеддингов):**

In [None]:
# !wget "https://drive.google.com/uc?export=download&id=11j6QgNsdOSTK-GRaAJLKObDW7ehS_aqK" -O "cpc_enc_baseline_higher_trx_dim.pt"

In [114]:
encoder = cpc.seq_encoder

# state_dict = torch.load("./cpc_enc_baseline_higher_trx_dim.pt")
# encoder.load_state_dict(state_dict)

device = "cuda:0"

encoder.to(device)

RnnSeqEncoder(
  (trx_encoder): ConvAggregator(
    (embeddings): ModuleDict(
      (MCC): NoisyEmbedding(
        342, 8, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (channel_type): NoisyEmbedding(
        7, 8, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (currency): NoisyEmbedding(
        60, 8, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (trx_category): NoisyEmbedding(
        11, 8, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
    )
    (custom_embeddings): ModuleDict(
      (amount): LogScaler()
    )
    (time2vec_days): Time2Vec()
    (conv): Conv1d(41, 41, kernel_size=(9,), stride=(1,), padding=(8,), bias=False)
  )
  (seq_encoder): RnnEncoder(
    (rnn): GRU(41, 512, batch_first=True)
    (reducer): LastStepEncoder()
  )
)

In [115]:
encoder.seq_encoder.is_reduce_sequence = True

In [116]:
from tqdm import tqdm

seed_everything(42)

In [117]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)
encoder.eval()
train_embeds = None

for i, batch in tqdm(enumerate(train_loader)):
    train_embeds_batch = encoder(batch.to(device))
    if i == 0:
        train_embeds = train_embeds_batch.detach().cpu().numpy()
    else:
        train_embeds = np.concatenate([train_embeds, train_embeds_batch.detach().cpu().numpy()], axis=0)
    
train_embeds

36it [00:01, 21.46it/s]


array([[ 0.04243162,  0.15609364,  0.27093837, ..., -0.4615391 ,
        -0.58185345, -0.23241714],
       [-0.1241576 ,  0.35649574,  0.5112675 , ..., -0.5452488 ,
        -0.8509828 , -0.8381243 ],
       [-0.0584416 ,  0.36499965,  0.39332452, ..., -0.517932  ,
        -0.82081395, -0.80145216],
       ...,
       [-0.15166363,  0.327662  ,  0.40192905, ..., -0.5891854 ,
        -0.8803088 , -0.9036705 ],
       [-0.22926734,  0.2918287 ,  0.5277493 , ..., -0.5973353 ,
        -0.90279895, -0.92507726],
       [ 0.02838782,  0.13193196,  0.3179472 , ..., -0.5031968 ,
        -0.5541788 , -0.68690056]], dtype=float32)

In [118]:
test_loader = inference_data_loader(data_test, num_workers=0, batch_size=128)
encoder.eval()
test_embeds = None

for i, batch in tqdm(enumerate(test_loader)):
    test_embeds_batch = encoder(batch.to(device))
    if i == 0:
        test_embeds = test_embeds_batch.detach().cpu().numpy()
    else:
        test_embeds = np.concatenate([test_embeds, test_embeds_batch.detach().cpu().numpy()], axis=0)
    
test_embeds

4it [00:00, 21.48it/s]


array([[-0.06497598,  0.35207307,  0.41517317, ..., -0.52950007,
        -0.84227145, -0.8239114 ],
       [-0.00577283,  0.39577085,  0.45625144, ..., -0.5420173 ,
        -0.7677192 , -0.6899389 ],
       [-0.16261129,  0.2588004 ,  0.45851144, ..., -0.49653676,
        -0.8533968 , -0.8260375 ],
       ...,
       [-0.04368812,  0.47836012,  0.48234987, ..., -0.58338803,
        -0.8439634 , -0.86121476],
       [-0.13340726,  0.26682648,  0.37253833, ..., -0.5738828 ,
        -0.87495565, -0.89667094],
       [-0.132509  ,  0.41543597,  0.61546636, ..., -0.6384193 ,
        -0.8939025 , -0.9177663 ]], dtype=float32)

In [119]:
clf = CatBoostClassifier(loss_function='MultiClass', task_type="GPU", devices='0', random_state=42)

clf.fit(train_embeds, target_train, plot_file="catboost_log.html")

Learning rate set to 0.088214
0:	learn: 0.6660318	total: 11.2ms	remaining: 11.2s
1:	learn: 0.6413828	total: 17.8ms	remaining: 8.86s
2:	learn: 0.6209871	total: 24.3ms	remaining: 8.08s
3:	learn: 0.6036022	total: 30.9ms	remaining: 7.68s
4:	learn: 0.5880183	total: 37.1ms	remaining: 7.39s
5:	learn: 0.5742883	total: 43.1ms	remaining: 7.15s
6:	learn: 0.5625952	total: 49.2ms	remaining: 6.98s
7:	learn: 0.5519072	total: 55.4ms	remaining: 6.87s
8:	learn: 0.5429165	total: 61.2ms	remaining: 6.74s
9:	learn: 0.5352071	total: 67ms	remaining: 6.63s
10:	learn: 0.5272737	total: 72.7ms	remaining: 6.53s
11:	learn: 0.5208120	total: 78.5ms	remaining: 6.46s
12:	learn: 0.5150545	total: 84.3ms	remaining: 6.4s
13:	learn: 0.5098019	total: 90ms	remaining: 6.34s
14:	learn: 0.5050376	total: 95.8ms	remaining: 6.29s
15:	learn: 0.5003020	total: 102ms	remaining: 6.26s
16:	learn: 0.4967468	total: 108ms	remaining: 6.22s
17:	learn: 0.4928163	total: 114ms	remaining: 6.19s
18:	learn: 0.4894197	total: 120ms	remaining: 6.18s
1

<catboost.core.CatBoostClassifier at 0x7e08dec4c190>

In [120]:
test_pred = clf.predict(test_embeds)
test_proba = clf.predict_proba(test_embeds)[:, 1]

In [121]:
print("Accuracy:", accuracy_score(target_test, test_pred))
print("ROC-AUC:", roc_auc_score(target_test, test_proba))

Accuracy: 0.726
ROC-AUC: 0.8058959706010912


In [123]:
# arr = np.array([0.8040018779038709, 0.8120962911398553, 0.8058959706010912])

# arr.mean(), arr.std()

(0.8073313798816057, 0.003456894681710768)

- CPC context embeds w/ Aug + Catboost (dim of trx embeds: 32):
  - `Accuracy: 0.752`, `0.748`, `0.742`, avg: `0.7473 +- 0.0041`
  - `ROC-AUC: 0.8051836622363244`, `0.8137313626135242`, `0.810639296757378`, avg: `0.8099 +- 0.0035`

---

- CPC context embeds + ConvAgg (3 trx) + Catboost:
  - `Accuracy: 0.742`, `0.742`, `0.74`, avg: `0.7413 +- 0.0009`
  - `ROC-AUC: 0.8095546453837562`, `0.8224733289083874`, `0.8150588463842257`, avg: `0.8157 +- 0.0053`

---

- CPC context embeds + ConvAgg (5 trx) + Catboost:
  - `Accuracy: 0.748`, `0.746`, `0.75`, avg: `0.748 +- 0.0016`
  - `ROC-AUC: 0.8122258017516311`, `0.8209677680464943`, `0.815884476534296`, avg: `0.8164 +- 0.0036`

---

- CPC context embeds + ConvAgg (7 trx) + Catboost:
  - `Accuracy: 0.744`, `0.758`, `0.75`, avg: `0.7507 +- 0.0057`
  - `ROC-AUC: 0.8150912240371695`, `0.82635864726166`, `0.8121610464457432`, avg: `0.8179 +- 0.0061`

---

- CPC context embeds + ConvAgg (9 trx) + Catboost:
  - `Accuracy: 0.748`, `0.756`, `0.736`, avg: `0.7467 +- 0.0082`
  - `ROC-AUC: 0.809635589516116`, `0.8277670751647213`, `0.8150912240371696`, avg: `0.8175 +- 0.0076`

**Вывод:** агрегация с помощью свёрток независимо от размера окна заметно улучшает ROC-AUC, при этом accuracy также c увеличением окна несколько вырастает, но не так существенно. Что интересно, с увеличением окна ROC-AUC только увеличивается, хотя, кажется, что рано или поздно результат выйдет на плато и затем начнёт ухудшаться.

**Лучший результат:**

- CPC context embeds + ConvAgg (7 trx) + Catboost:
  - `Accuracy: 0.744`, `0.758`, `0.75`, avg: `0.7507 +- 0.0057`
  - `ROC-AUC: 0.8150912240371695`, `0.82635864726166`, `0.8121610464457432`, avg: `0.8179 +- 0.0061`

---

**Результаты для CPC с меньшей размерностью embed_dim (8):**

- CPC context embeds w/ Aug + Catboost (dim of trx embeds: 8):
  - `Accuracy: 0.754`, `0.742`, `0.744`, avg: `0.7467 +- 0.0052`
  - `ROC-AUC: 0.8175195480079649`, `0.8197697948875686`, `0.8122096129251591`, avg: `0.8165 +- 0.0032`

---

- CPC context embeds + ConvAgg (3 trx) + Catboost:
  - `Accuracy: 0.736`, `0.71`, `0.72`, avg: `0.722 +- 0.0107`
  - `ROC-AUC: 0.8160787424519595`, `0.7893995564261546`, `0.801767819850739`, avg: `0.8024 +- 0.0109`

---

- CPC context embeds + ConvAgg (5 trx) + Catboost:
  - `Accuracy: 0.74`, `0.718`, `0.724`, avg: `0.7273 +- 0.0093`
  - `ROC-AUC: 0.819510773664017`, `0.8017192533713231`, `0.8029819818361367`, avg: `0.8081 +- 0.0081`

---

- CPC context embeds + ConvAgg (7 trx) + Catboost:
  - `Accuracy: 0.738`, `0.73`, `0.72`, avg: `0.7293 +- 0.0074`
  - `ROC-AUC: 0.8169691279079179`, `0.8045199203509739`, `0.7989185863916725`, avg: `0.8068 +- 0.0075`

---

- CPC context embeds + ConvAgg (9 trx) + Catboost:
  - `Accuracy: 0.718`, `0.728`, `0.726`, avg: `0.724 +- 0.0043`
  - `ROC-AUC: 0.8040018779038709`, `0.8120962911398553`, `0.8058959706010912`, avg: `0.8073 +- 0.0035`

**Вывод:** при такой размерности эмбеддингов результаты при агрегации транзакций свёрткой выходят хуже, чем без неё.

**Лучший результат:**

- CPC context embeds + ConvAgg (7 trx) + Catboost:
  - `Accuracy: 0.738`, `0.73`, `0.72`, avg: `0.7293 +- 0.0074`
  - `ROC-AUC: 0.8169691279079179`, `0.8045199203509739`, `0.7989185863916725`, avg: `0.8068 +- 0.0075`

---

- **GPT:**

In [126]:
seed_everything(42)

**DataLoaders:**

In [127]:
data = PtlsDataModule(
    train_data=GptDataset(
        MemoryMapDataset(data=data_train),
        min_len=1000, # 85
        max_len=1200 # 105
    ),
    train_num_workers=4,
    train_batch_size=64,
    valid_data=GptDataset(
        MemoryMapDataset(data=data_test),
        min_len=1000,
        max_len=1200
    ),
    valid_num_workers=4,
    valid_batch_size=64
)

**Модель:**

In [128]:
from torchmetrics import MeanMetric
from typing import Tuple, Dict, List, Union
from torch import nn
import torch.nn.functional as F 
from ptls.nn.seq_encoder.abs_seq_encoder import AbsSeqEncoder
from ptls.nn import PBL2Norm
from ptls.data_load.padded_batch import PaddedBatch


class MeanPooling(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, pb: PaddedBatch):
        payload = pb.payload # (B, T, H)
        mask = pb.seq_len_mask.bool()
        pb_mean = payload.sum(dim=1) / mask.float().sum(dim=1, keepdim=True)
        return pb_mean


class StatPooling(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, pb: PaddedBatch):
        payload = pb.payload # (B, T, H)
        mask = pb.seq_len_mask.bool()
        inf_mask = torch.zeros_like(mask, device=mask.device).float()
        inf_mask[~mask] = -torch.inf
        
        pb_mean = payload.sum(dim=1) / mask.float().sum(dim=1, keepdim=True)
        pb_max = torch.max(payload + inf_mask.unsqueeze(-1), dim=1)[0]
        pb_stat = torch.cat((pb_mean, pb_max), dim=1)
        return pb_stat


class GPTHead(torch.nn.Module):   
    def __init__(self, input_size, n_classes, hidden_size=64, drop_p=0.1):
        super().__init__()
        self.head = nn.Sequential(
            nn.Linear(input_size, hidden_size, bias=True),
            nn.GELU(),
            nn.Dropout(drop_p),
            nn.Linear(hidden_size, n_classes)
        )
    def forward(self, x):
        x = self.head(x)
        return x


class ConvAgg_TD_GPT_PretrainModule(pl.LightningModule):
    def __init__(self,
                 trx_encoder: torch.nn.Module,
                 seq_encoder: AbsSeqEncoder,
                 time_diffs_boundaries: torch.tensor,
                 time_diffs_bins_num: int,
                 head_hidden_size: int = 64,
                 total_steps: int = 64000,
                 seed_seq_len: int = 16,
                 max_lr: float = 0.00005,
                 weight_decay: float = 0.0,
                 pct_start: float = 0.1,
                 norm_predict: bool = False
                 ):

        super().__init__()
        self.save_hyperparameters(ignore=['trx_encoder', 'seq_encoder'])

        self.trx_encoder = trx_encoder
        self._seq_encoder = seq_encoder
        self._seq_encoder.is_reduce_sequence = False

        self.head = nn.ModuleDict()
        for col_name, noisy_emb in self.trx_encoder.embeddings.items():
            self.head[col_name] = GPTHead(input_size=self._seq_encoder.embedding_size, hidden_size=head_hidden_size, n_classes=noisy_emb.num_embeddings)

        self.head['time_diff'] = GPTHead(input_size=self._seq_encoder.embedding_size, hidden_size=head_hidden_size, n_classes=time_diffs_bins_num) 

        if self.hparams.norm_predict:
            self.fn_norm_predict = PBL2Norm()

        self.loss = nn.CrossEntropyLoss(ignore_index=0)

        self.train_gpt_loss = MeanMetric()
        self.valid_gpt_loss = MeanMetric()

        self.time_diffs_boundaries = time_diffs_boundaries
        self.SECONDS_IN_HOUR = 3600

        self.shift = trx_encoder.agg_samples

    def forward(self, batch: PaddedBatch):
        z_trx = self.trx_encoder(batch) 
        out = self._seq_encoder(z_trx)
        if self.hparams.norm_predict:
            out = self.fn_norm_predict(out)
        return out

    def loss_gpt(self, logits, labels):
        loss = 0
        for col_name, head in self.head.items():
            y_pred = head(logits[:, self.hparams.seed_seq_len:-self.shift, :])
            y_pred = y_pred.view(-1, y_pred.size(-1))

            y_true = labels[col_name][:, self.hparams.seed_seq_len+1:]
            y_true = torch.flatten(y_true.long())
            
            loss += self.loss(y_pred, y_true)
            
        return loss

    def training_step(self, batch, batch_idx):
        out = self.forward(batch)  # PB: B, T, H
        out = out.payload if isinstance(out, PaddedBatch) else out
        labels = batch.payload

        timestamps = labels['event_time']
        timestamps_prev = torch.cat([timestamps[:, 0].unsqueeze(1), timestamps[:, :-1]], dim=1)
        labels['time_diff'] = (timestamps - timestamps_prev) // self.SECONDS_IN_HOUR
        labels['time_diff'][:, 0] = -1

        mask = torch.arange(labels['time_diff'].shape[1], device=batch.device)[None, :] + torch.ones((batch.seq_lens.shape[0], labels['time_diff'].shape[1]), device=batch.device)
        mask[mask > batch.seq_lens[:, None]] = 0.
        mask[mask > 0.] = 1.
        mask = mask.bool()

        labels['time_diff'][~mask] = -1

        labels['time_diff'] = torch.bucketize(labels['time_diff'], self.time_diffs_boundaries.to(batch.device), right=True)
        
        loss_gpt = self.loss_gpt(out, labels)
        self.train_gpt_loss(loss_gpt)
        self.log(f'loss', loss_gpt, sync_dist=True)
        return loss_gpt

    def validation_step(self, batch, batch_idx):
        out = self.forward(batch)  # PB: B, T, H
        out = out.payload if isinstance(out, PaddedBatch) else out
        labels = batch.payload

        timestamps = labels['event_time']
        timestamps_prev = torch.cat([timestamps[:, 0].unsqueeze(1), timestamps[:, :-1]], dim=1)
        labels['time_diff'] = (timestamps - timestamps_prev) // self.SECONDS_IN_HOUR
        labels['time_diff'][:, 0] = -1

        mask = torch.arange(labels['time_diff'].shape[1], device=batch.device)[None, :] + torch.ones((batch.seq_lens.shape[0], labels['time_diff'].shape[1]), device=batch.device)
        mask[mask > batch.seq_lens[:, None]] = 0.
        mask[mask > 0.] = 1.
        mask = mask.bool()

        labels['time_diff'][~mask] = -1

        labels['time_diff'] = torch.bucketize(labels['time_diff'], self.time_diffs_boundaries.to(batch.device), right=True)
        
        loss_gpt = self.loss_gpt(out, labels)
        self.valid_gpt_loss(loss_gpt)

    def on_training_epoch_end(self):
        self.log('train loss (by epochs)', self.train_gpt_loss, prog_bar=True, logger=True, sync_dist=True, rank_zero_only=True)

    def on_validation_epoch_end(self):
        self.log('val loss (by epochs)', self.valid_gpt_loss, prog_bar=True, logger=True, sync_dist=True, rank_zero_only=True)

    def configure_optimizers(self):
        optim = torch.optim.NAdam(self.parameters(),
                                  lr=self.hparams.max_lr,
                                  weight_decay=self.hparams.weight_decay
                                 )
        
        scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer=optim,
            max_lr=self.hparams.max_lr,
            total_steps=self.hparams.total_steps,
            pct_start=self.hparams.pct_start,
            anneal_strategy='cos',
            cycle_momentum=False,
            div_factor=25.0,
            final_div_factor=10000.0,
            three_phase=False
        )
        
        scheduler = {'scheduler': scheduler, 'interval': 'step'}
        return [optim], [scheduler]
    
    @property
    def seq_encoder(self):
        return GPTInferenceModule(pretrained_model=self)


class GPTInferenceModule(torch.nn.Module):
    def __init__(self, pretrained_model):
        super().__init__()
        self.model = pretrained_model
        self.model.is_reduce_sequence = False
        self.mean_pooling = MeanPooling()
        self.stat_pooling = StatPooling()

    def forward(self, batch, eval_strategy="mean"):
        z_trx = self.model.trx_encoder(batch)
        out = self.model._seq_encoder(z_trx)
        out = out if isinstance(out, PaddedBatch) else PaddedBatch(out, batch.seq_lens)

        if eval_strategy == "mean":
            out = self.mean_pooling(out)
        elif eval_strategy == "stat":
            out = self.stat_pooling(out)

        if self.model.hparams.norm_predict:
            out = out / (out.pow(2).sum(dim=-1, keepdim=True) + 1e-9).pow(0.5)
        return out

Multilabel-подход для обучения здесь не применим, так как окна транзакций пересекаются.

In [36]:
# class WinAgg_TD_GPT_MultiLabelPretrainModule(GptPretrainWithTimeDiffsModule):
#     def __init__(self,
#                  trx_encoder: torch.nn.Module,
#                  seq_encoder: AbsSeqEncoder,
#                  time_diffs_boundaries: torch.tensor,
#                  time_diffs_bins_num: int,
#                  head_hidden_size: int = 64,
#                  total_steps: int = 64000,
#                  seed_seq_len: int = 16,
#                  max_lr: float = 0.00005,
#                  weight_decay: float = 0.0,
#                  pct_start: float = 0.1,
#                  norm_predict: bool = False
#                  ):
#         super().__init__(
#             trx_encoder=trx_encoder,
#             seq_encoder=seq_encoder,
#             time_diffs_boundaries=time_diffs_boundaries,
#             time_diffs_bins_num=time_diffs_bins_num,
#             head_hidden_size=head_hidden_size,
#             total_steps=total_steps,
#             seed_seq_len=seed_seq_len,
#             max_lr=max_lr,
#             weight_decay=weight_decay,
#             pct_start=pct_start,
#             norm_predict=norm_predict
#         )
#         self.agg_samples = trx_encoder.agg_samples
#         self.loss = nn.MultiLabelSoftMarginLoss()

#     def loss_gpt(self, logits, labels):
#         loss = 0
        
#         for col_name, head in self.head.items():
#             pred = head(logits[:, (self.hparams.seed_seq_len // self.agg_samples):-1, :])

#             ohe_labels = torch.zeros((pred.shape[0] * pred.shape[1], pred.shape[2]), device=pred.device)
            
#             for shift in range(self.agg_samples):
#                 y_true = labels[col_name][:, (self.hparams.seed_seq_len + self.agg_samples + shift)::self.agg_samples]
#                 y_true = torch.flatten(y_true.long())
#                 ohe_labels_part = F.one_hot(y_true, num_classes=pred.shape[2])
                
#                 if ohe_labels_part.shape[0] < pred.shape[0] * pred.shape[1]:
#                     padding = torch.zeros((pred.shape[0], 1, pred.shape[2]), device=ohe_labels_part.device)
#                     ohe_labels_part = torch.cat((ohe_labels_part.reshape(pred.shape[0], pred.shape[1] - 1, pred.shape[2]), padding), dim=1).reshape(pred.shape[0] * pred.shape[1], pred.shape[2])
                
#                 ohe_labels += ohe_labels_part

#             ohe_labels[ohe_labels > 1] = 1
            
#             pred = pred.reshape(-1, pred.size(-1))

#             loss += self.loss(pred, ohe_labels)
                
#         return loss

In [129]:
N_EPOCHS = 20

In [130]:
agg_encoder_params = dict(
    embeddings_noise=0.003,
    embeddings={
        "MCC": {"in": 342, "out": 16},
        "channel_type": {"in": 7, "out": 16},
        "currency": {"in": 60, "out": 16},
        "trx_category": {"in": 11, "out": 16},
        "amount": {"in": BINS_NUM, "out": 16}
    },
    k=15,
    time_col="event_time",
    agg_samples=5, # 3, 5, 7, 9
    use_window_attention=False
)

trx_encoder = ConvAggregator(**agg_encoder_params)

seq_encoder = GptEncoder(
    n_embd=trx_encoder.output_size,
    n_layer=6,
    n_head=6,
    n_inner=256,
    activation_function="gelu_new",
    resid_pdrop=0.1,
    embd_pdrop=0.1,
    attn_pdrop=0.1,
    n_positions=2048,
    use_positional_encoding=True,
    use_start_random_shift=True,
    is_reduce_sequence=False
)

gpt = ConvAgg_TD_GPT_PretrainModule(
    trx_encoder=trx_encoder,
    seq_encoder=seq_encoder,
    time_diffs_boundaries=time_diff_bins,
    time_diffs_bins_num=(TIME_DIFF_BINS_NUM + 1), # (boundaries num) + (1 before the first boundary (OOD)) 
    head_hidden_size=256,
    total_steps=(N_EPOCHS * 71), # num_epochs * num_steps_per_epoch
    seed_seq_len=16,
    max_lr=3e-3,
    weight_decay=3e-4,
    pct_start=0.1,
    norm_predict=False
)

**Обучение:**

In [131]:
logger = CometLogger(project_name="evs-ssl-rb", experiment_name="TD-GPT_modeling_ConvAgg (5 trx)")

trainer = pl.Trainer(
    logger=logger,
    max_epochs=N_EPOCHS,
    accelerator="gpu",
    devices=1,
    enable_progress_bar=True
)

In [132]:
trainer.fit(gpt, data)

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/askoro/evs-ssl-rb/0e2d61824a0d45eeb74d9666dfa0bcbe

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/kaggle/working' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : TD-GPT_modeling_ConvAgg (5 trx)
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/askoro/evs-ssl-rb/0e2d61824a0d45eeb74d9666dfa0bcbe
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     loss [170]                : (9.303742408752441, 22.696876525878906)
[1;38;5;39mCOMET INFO:[0m     val loss (by epochs) [20] : (9.664616584777832, 10.663762092590332)
[1;38;5;39mCOMET INFO:[0m   Others:
[1;38;5;39mCOMET INFO:[0m     Name : TD-GPT_modeling_ConvAgg (5 trx)
[1;38;5;39mCOMET INFO:[0m   Para

In [133]:
trainer.logged_metrics

{'loss': tensor(9.3622), 'val loss (by epochs)': tensor(9.6646)}

In [134]:
encoder = gpt.seq_encoder

In [135]:
torch.save(encoder.state_dict(), "gpt_baseline_rosbank.pt")

**Измерим качество на тесте (catboost поверх эмбеддингов):**

In [102]:
# import gdown

# gdown.download("https://drive.google.com/uc?export=download&id=1YBstN7hpEIREo7zORmPoEZ_0NyBgfjm6", "gpt_baseline_NAdam.pt")

Downloading...
From (original): https://drive.google.com/uc?export=download&id=1YBstN7hpEIREo7zORmPoEZ_0NyBgfjm6
From (redirected): https://drive.google.com/uc?export=download&id=1YBstN7hpEIREo7zORmPoEZ_0NyBgfjm6&confirm=t&uuid=b0f44bc3-b84b-425c-968f-016e419987af
To: /kaggle/working/gpt_baseline_NAdam.pt
100%|██████████| 34.7M/34.7M [00:00<00:00, 83.5MB/s]


'gpt_baseline_NAdam.pt'

In [135]:
# state_dict = torch.load("./gpt_baseline_NAdam.pt")
# encoder.load_state_dict(state_dict)

device = "cuda:0"

encoder.to(device)

GPTInferenceModule(
  (model): ConvAgg_TD_GPT_PretrainModule(
    (trx_encoder): ConvAggregator(
      (embeddings): ModuleDict(
        (MCC): NoisyEmbedding(
          342, 16, padding_idx=0
          (dropout): Dropout(p=0, inplace=False)
        )
        (channel_type): NoisyEmbedding(
          7, 16, padding_idx=0
          (dropout): Dropout(p=0, inplace=False)
        )
        (currency): NoisyEmbedding(
          60, 16, padding_idx=0
          (dropout): Dropout(p=0, inplace=False)
        )
        (trx_category): NoisyEmbedding(
          11, 16, padding_idx=0
          (dropout): Dropout(p=0, inplace=False)
        )
        (amount): NoisyEmbedding(
          128, 16, padding_idx=0
          (dropout): Dropout(p=0, inplace=False)
        )
      )
      (custom_embeddings): ModuleDict()
      (time2vec_days): Time2Vec()
      (conv): Conv1d(96, 96, kernel_size=(5,), stride=(1,), padding=(4,), bias=False)
    )
    (_seq_encoder): GptEncoder(
      (transf): GPT2Model(
 

In [136]:
from tqdm import tqdm

seed_everything(42)

In [137]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=8)
encoder.eval()
train_embeds = None

for i, batch in tqdm(enumerate(train_loader)):
    train_embeds_batch = encoder(batch.to(device), eval_strategy="stat")
    if i == 0:
        train_embeds = train_embeds_batch.detach().cpu().numpy()
    else:
        train_embeds = np.concatenate([train_embeds, train_embeds_batch.detach().cpu().numpy()], axis=0)
    
train_embeds


0it [00:00, ?it/s][A
12it [00:00, 114.22it/s][A
25it [00:00, 118.85it/s][A
38it [00:00, 120.07it/s][A
51it [00:00, 120.08it/s][A
64it [00:00, 121.09it/s][A
77it [00:00, 122.45it/s][A
90it [00:00, 123.90it/s][A
103it [00:00, 122.60it/s][A
116it [00:00, 122.76it/s][A
129it [00:01, 120.99it/s][A
142it [00:01, 122.64it/s][A
155it [00:01, 123.46it/s][A
168it [00:01, 124.45it/s][A
181it [00:01, 125.40it/s][A
194it [00:01, 123.97it/s][A
207it [00:01, 124.54it/s][A
220it [00:01, 124.38it/s][A
233it [00:01, 124.76it/s][A
246it [00:01, 124.47it/s][A
259it [00:02, 123.72it/s][A
272it [00:02, 123.36it/s][A
285it [00:02, 123.23it/s][A
298it [00:02, 123.13it/s][A
311it [00:02, 122.05it/s][A
324it [00:02, 121.30it/s][A
337it [00:02, 121.09it/s][A
350it [00:02, 121.66it/s][A
363it [00:02, 122.33it/s][A
376it [00:03, 122.19it/s][A
389it [00:03, 122.36it/s][A
402it [00:03, 121.89it/s][A
415it [00:03, 121.93it/s][A
428it [00:03, 122.25it/s][A
441it [00:03, 122.64it/s][

array([[ 1.5884214e+01, -4.8447590e+01,  1.4018628e+02, ...,
        -2.2660360e-01,  2.3102083e+00,  8.3402950e-01],
       [ 1.0266776e+00, -5.9573584e+00,  1.3915579e+00, ...,
         7.7217716e-01,  1.8475810e+00,  1.5579300e+00],
       [-9.8106174e+00, -5.3755460e+00,  1.9814354e+00, ...,
         1.1072708e+00,  1.0432576e+00,  8.2801002e-01],
       ...,
       [ 2.0165081e+00, -6.2000442e-01,  2.2387276e+00, ...,
         9.4827384e-01,  3.2454547e-01,  1.1747845e+00],
       [ 1.5602154e+00, -4.5810530e-01,  5.7604975e-01, ...,
         8.2882744e-01,  8.6435431e-01,  7.9788679e-01],
       [ 2.4433768e+00, -1.5139753e+00,  2.4830797e+00, ...,
         4.0681270e-01,  9.6036822e-01,  7.8774691e-02]], dtype=float32)

In [138]:
test_loader = inference_data_loader(data_test, num_workers=0, batch_size=8)
encoder.eval()
test_embeds = None

for i, batch in tqdm(enumerate(test_loader)):
    test_embeds_batch = encoder(batch.to(device), eval_strategy="stat")
    if i == 0:
        test_embeds = test_embeds_batch.detach().cpu().numpy()
    else:
        test_embeds = np.concatenate([test_embeds, test_embeds_batch.detach().cpu().numpy()], axis=0)
    
test_embeds


0it [00:00, ?it/s][A
12it [00:00, 117.69it/s][A
25it [00:00, 123.55it/s][A
38it [00:00, 124.25it/s][A
63it [00:00, 125.45it/s][A


array([[  0.24835576,  -1.2513056 ,  -0.19981168, ...,   2.0086992 ,
          0.09088672,   0.42892492],
       [  1.9209932 , -18.529623  ,   7.582492  , ...,   0.7915506 ,
          2.456178  ,   0.78894067],
       [  0.22586714,  -0.5074745 ,   0.5620227 , ...,   1.409787  ,
          1.6394886 ,   0.23832254],
       ...,
       [ -6.433305  , -12.775384  ,  29.386278  , ...,   0.98347455,
          1.4513155 ,   0.0846824 ],
       [ 11.718573  ,   1.4053626 ,   7.3001223 , ...,   0.46314222,
          1.3273515 ,   0.27789086],
       [  0.7055639 ,  -0.5502548 ,   1.1038648 , ...,   1.2276317 ,
          0.9907024 ,   0.6389714 ]], dtype=float32)

In [139]:
clf = CatBoostClassifier(loss_function='MultiClass', task_type="GPU", devices='0', random_state=42)

clf.fit(train_embeds, target_train, plot_file="catboost_log.html")

Learning rate set to 0.088214
0:	learn: 0.6700894	total: 8.23ms	remaining: 8.23s
1:	learn: 0.6501389	total: 13.4ms	remaining: 6.66s
2:	learn: 0.6326785	total: 18ms	remaining: 5.97s
3:	learn: 0.6174113	total: 22.7ms	remaining: 5.66s
4:	learn: 0.6051445	total: 27.5ms	remaining: 5.47s
5:	learn: 0.5939488	total: 32.2ms	remaining: 5.33s
6:	learn: 0.5830368	total: 36.9ms	remaining: 5.23s
7:	learn: 0.5742141	total: 41.6ms	remaining: 5.16s
8:	learn: 0.5651418	total: 46.3ms	remaining: 5.1s
9:	learn: 0.5571471	total: 50.9ms	remaining: 5.04s
10:	learn: 0.5500663	total: 55.5ms	remaining: 4.99s
11:	learn: 0.5439377	total: 60.2ms	remaining: 4.96s
12:	learn: 0.5382574	total: 64.9ms	remaining: 4.93s
13:	learn: 0.5326428	total: 69.8ms	remaining: 4.91s
14:	learn: 0.5283147	total: 74.7ms	remaining: 4.9s
15:	learn: 0.5239742	total: 79.8ms	remaining: 4.91s
16:	learn: 0.5191905	total: 85ms	remaining: 4.91s
17:	learn: 0.5151025	total: 90ms	remaining: 4.91s
18:	learn: 0.5115392	total: 94.5ms	remaining: 4.88s


<catboost.core.CatBoostClassifier at 0x7cdc14dce050>

In [140]:
test_pred = clf.predict(test_embeds)
test_proba = clf.predict_proba(test_embeds)[:, 1]

In [141]:
print("Accuracy:", accuracy_score(target_test, test_pred))
print("ROC-AUC:", roc_auc_score(target_test, test_proba))

Accuracy: 0.712
ROC-AUC: 0.7892862346408509


In [143]:
arr = np.array([0.7745705913778311, 0.793430574217675, 0.7892862346408509])

arr.mean(), arr.std()

(0.7857624667454525, 0.008092689841879226)

- TD-GPT embeds + Catboost:
  - `Accuracy: 0.736`, `0.724`, `0.73`, avg: `0.73 +- 0.0049`
  - `ROC-AUC: 0.8034352689773517`, `0.787618785514238`, `0.7936410289618107`, avg: `0.7949 +- 0.0065`

---

- TD-GPT embeds + ConvAgg (3 trx window) + Catboost:
  - `Accuracy: 0.688`, `0.722`, `0.756`, avg: `0.722 +- 0.0278`
  - `ROC-AUC: 0.7640640429975231`, `0.8008774343947808`, `0.8040018779038709`, avg: `0.7896 +- 0.0181`

---

- TD-GPT embeds + ConvAgg (5 trx window) + Catboost:
  - `Accuracy: 0.722`, `0.712`, `0.712`, avg: `0.7153 +- 0.0047`
  - `ROC-AUC: 0.7745705913778311`, `0.793430574217675`, `0.7892862346408509`, avg: `0.7858 +- 0.0081`

---


**Вывод:** Агрегация со свёрточным слоем ухудшает качество (по метрикам) по сравнению с бейзлайном, причём чем больше размер ядра свёртки, тем хуже результаты. При этом по отдельным сидам при окне размера 3 метрики лучше, чем у бейзлайна. Результаты при окне размера 3 вышли достаточно нестабильными, такой результат сложно объяснить.

**Лучший результат:**

- TD-GPT embeds + ConvAgg (3 trx window) + Catboost:
  - `Accuracy: 0.688`, `0.722`, `0.756`, avg: `0.722 +- 0.0278`
  - `ROC-AUC: 0.7640640429975231`, `0.8008774343947808`, `0.8040018779038709`, avg: `0.7896 +- 0.0181`

# Итоги.

| Method                                 |    Accuracy           | ROC-AUC         |
|----------------------------------------|-----------------------|-----------------|
| **Flattened Sequences**                | 0.67 ± 0.0046         | 0.7536 ± 0.003  |
| **GRU (+ MLP)**                        | 0.746 ± 0.0076        | 0.8148 ± 0.0037 |
| **CoLES**                              | 0.726 ± 0.0071        | 0.8076 ± 0.0025 |
| **COLES embeds + ConvAgg (7 trx)**     | 0.7393 ± 0.0133       | 0.8122 ± 0.0128 |
| **CPC Modeling (emb_dim=32)**          | 0.747 ± 0.0041        | 0.8099 ± 0.0035 |
| **CPC Modeling (emb_dim=32) + ConvAgg**| 0.7507 ± 0.0057       | 0.8179 ± 0.0061 |
| **CPC Modeling (emb_dim=8)**           | 0.747 ± 0.0052        | 0.8165 ± 0.0032 |
| **CPC Modeling (emb_dim=8) + ConvAgg** | 0.7293 ± 0.0074       | 0.8068 ± 0.0075 |
| **TD-GPT**                             | 0.73 ± 0.0049         | 0.7949 ± 0.0065 |
| **TD-GPT + ConvAgg**                   | 0.722 ± 0.0278        | 0.7896 ± 0.0181 |