# Импортируем необходимые библиотеки

In [1]:
!pip install pytorch-lifestream
!pip install comet_ml

Collecting pytorch-lifestream
  Downloading pytorch-lifestream-0.6.0.tar.gz (163 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.4/163.4 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting hydra-core>=1.1.2 (from pytorch-lifestream)
  Downloading hydra_core-1.3.2-py3-none-any.whl.metadata (5.5 kB)
Downloading hydra_core-1.3.2-py3-none-any.whl (154 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.5/154.5 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pytorch-lifestream
  Building wheel for pytorch-lifestream (pyproject.toml) ... [?25l[?25hdone
  Created wheel for pytorch-lifestream: filename=pytorch_lifestream-0.6.0-py3-none-any.whl size=274639 sha256=1211f32cbe41dcd8ca89176b9020cd993a314bb5e399958166eef797396063ca
  

In [2]:
# data preprocessing
import os
import numpy as np
import pandas as pd
import pickle

# misc
from tqdm import tqdm
from functools import partial

# logging
import comet_ml

# classical ML
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from catboost import CatBoostClassifier

# basic deep learning libs
import torch
import pytorch_lightning as pl
import torchmetrics

# ptls
from ptls.nn import TrxEncoder, RnnSeqEncoder, TransformerEncoder, GptEncoder, Head
from ptls.frames import PtlsDataModule
from ptls.frames.coles import CoLESModule
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames.cpc import CpcModule
from ptls.frames.cpc import CpcDataset
from ptls.frames.gpt import GptDataset
from ptls.frames.supervised import SeqToTargetDataset, SequenceToTarget
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.datasets import inference_data_loader
from ptls.frames.inference_module import InferenceModule
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.preprocessing import PandasDataPreprocessor
from ptls.data_load.utils import collate_feature_dict
from ptls.frames.inference_module import InferenceModule

In [3]:
def seed_everything(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [5]:
comet_ml.login()

In [6]:
from pytorch_lightning.loggers import CometLogger

---

**Time2Vec:**

In [7]:
import torch
from ptls.data_load.padded_batch import PaddedBatch
from ptls.nn.trx_encoder.batch_norm import RBatchNorm, RBatchNormWithLens
from ptls.nn.trx_encoder.noisy_embedding import NoisyEmbedding
from ptls.nn.trx_encoder.trx_encoder_base import TrxEncoderBase
import torch.nn as nn


class Time2Vec(nn.Module):
    def __init__(self, k, interval=86400):
        super(Time2Vec, self).__init__()
        self.k = k
        self.w = nn.Parameter(torch.randn(k))
        self.b = nn.Parameter(torch.randn(k))
        self.w0 = nn.Parameter(torch.randn(1))
        self.b0 = nn.Parameter(torch.randn(1))
        self.interval = interval
        
    def forward(self, event_time, t0):
        t0_ = torch.zeros_like(event_time)
        time_diff=None
        if type(t0)!=int:
            first_column = t0[:, 0].unsqueeze(1)
            t0_ = first_column.expand(-1, t0.size(1))
        time_diff = (event_time - t0_)/self.interval
        v1 = self.w0 * time_diff.unsqueeze(-1) + self.b0
        v2 = torch.cos(self.w * time_diff.unsqueeze(-1) + self.b)
        
        return torch.cat([v1, v2], -1)

        
class TrxEncoderT2V(TrxEncoderBase):
    def __init__(self,
                 embeddings=None,
                 numeric_values=None,
                 custom_embeddings=None,
                 time_values=None,
                 embeddings_noise: float = 0,
                 norm_embeddings=None,
                 use_batch_norm=True,
                 use_batch_norm_with_lens=False,
                 clip_replace_value=None,
                 positions=None,
                 emb_dropout=0,
                 spatial_dropout=False,
                 orthogonal_init=False,
                 linear_projection_size=0,
                 out_of_index: str = 'clip',
                 k=2,
                 time_col='event_time'
                 ):
        if clip_replace_value is not None:
            warnings.warn('`clip_replace_value` attribute is deprecated. Always "clip to max" used. '
                          'Use `out_of_index="assert"` to avoid categorical values clip', DeprecationWarning)

        if positions is not None:
            warnings.warn('`positions` is deprecated. positions is not used', UserWarning)

        if embeddings is None:
            embeddings = {}
        if custom_embeddings is None:
            custom_embeddings = {}
        if time_values is None:
            time_values = {}

        noisy_embeddings = {}
        for emb_name, emb_props in embeddings.items():
            if emb_props.get('disabled', False):
                continue
            if emb_props['in'] == 0 or emb_props['out'] == 0:
                continue
            noisy_embeddings[emb_name] = NoisyEmbedding(
                num_embeddings=emb_props['in'],
                embedding_dim=emb_props['out'],
                padding_idx=0,
                max_norm=1 if norm_embeddings else None,
                noise_scale=embeddings_noise,
                dropout=emb_dropout,
                spatial_dropout=spatial_dropout,
            )

        super().__init__(
            embeddings=noisy_embeddings,
            numeric_values=numeric_values,
            custom_embeddings=custom_embeddings,
            out_of_index=out_of_index,
        )

        custom_embedding_size = self.custom_embedding_size
        if use_batch_norm and custom_embedding_size > 0:
            # :TODO: Should we use Batch norm with not-numerical custom embeddings?
            if use_batch_norm_with_lens:
                self.custom_embedding_batch_norm = RBatchNormWithLens(custom_embedding_size)
            else:
                self.custom_embedding_batch_norm = RBatchNorm(custom_embedding_size)
        else:
            self.custom_embedding_batch_norm = None
        
        self.k = k
        self.time2vec_days = Time2Vec(k=self.k)
        self.time_col = time_col
        
        if linear_projection_size > 0:
            self.linear_projection_head = torch.nn.Linear(super().output_size+k+1, linear_projection_size)
        else:
            self.linear_projection_head = None
            

        if orthogonal_init:
            for n, p in self.named_parameters():
                if n.startswith('embeddings.') and n.endswith('.weight'):
                    torch.nn.init.orthogonal_(p.data[1:])
                if n == 'linear_projection_head.weight':
                    torch.nn.init.orthogonal_(p.data)

    def forward(self, x: PaddedBatch):
        processed_embeddings = []
        processed_custom_embeddings = []

        for field_name in self.embeddings.keys():
            processed_embeddings.append(self.get_category_embeddings(x, field_name))
        
        for field_name in self.custom_embeddings.keys():
            processed_custom_embeddings.append(self.get_custom_embeddings(x, field_name))

        if len(processed_custom_embeddings):
            processed_custom_embeddings = torch.cat(processed_custom_embeddings, dim=2)
            if self.custom_embedding_batch_norm is not None:
                processed_custom_embeddings = PaddedBatch(processed_custom_embeddings, x.seq_lens)
                processed_custom_embeddings = self.custom_embedding_batch_norm(processed_custom_embeddings)
                processed_custom_embeddings = processed_custom_embeddings.payload
            processed_embeddings.append(processed_custom_embeddings)

        out = torch.cat(processed_embeddings, dim=2)

        time_encoded_days = self.time2vec_days(x.payload[self.time_col], x.payload[self.time_col])
        out = torch.cat((out, time_encoded_days), dim=2)

        if self.linear_projection_head is not None:
            out = self.linear_projection_head(out)
        return PaddedBatch(out, x.seq_lens)

    @property
    def output_size(self):
        """Returns hidden size of output representation
        """
        if self.linear_projection_head is not None:
            return self.linear_projection_head.out_features
        return super().output_size + self.k + 1

# Эксперименты.

**Данные:**

In [8]:
path_data = "https://huggingface.co/datasets/dllllb/rosbank-churn/resolve/main/train.csv.gz?download=true"
data = pd.read_csv(path_data, compression="gzip")
data

Unnamed: 0,PERIOD,cl_id,MCC,channel_type,currency,TRDATETIME,amount,trx_category,target_flag,target_sum
0,01/10/2017,0,5200,,810,21OCT17:00:00:00,5023.00,POS,0,0.0
1,01/10/2017,0,6011,,810,12OCT17:12:24:07,20000.00,DEPOSIT,0,0.0
2,01/12/2017,0,5921,,810,05DEC17:00:00:00,767.00,POS,0,0.0
3,01/10/2017,0,5411,,810,21OCT17:00:00:00,2031.00,POS,0,0.0
4,01/10/2017,0,6012,,810,24OCT17:13:14:24,36562.00,C2C_OUT,0,0.0
...,...,...,...,...,...,...,...,...,...,...
490508,01/04/2017,10176,6011,type1,810,24APR17:14:05:26,600.00,WD_ATM_ROS,1,405.0
490509,01/06/2017,10171,5411,type1,810,06JUN17:00:00:00,132.00,POS,0,0.0
490510,01/02/2017,10167,5541,type1,810,03FEB17:00:00:00,1000.00,POS,1,280428.2
490511,01/06/2017,10163,5941,type1,810,08JUN17:00:00:00,100.00,POS,0,0.0


In [9]:
target = data.groupby(by="cl_id").first().reset_index()[["cl_id", "target_flag"]]
target

Unnamed: 0,cl_id,target_flag
0,0,0
1,1,0
2,5,1
3,9,0
4,10,0
...,...,...
4995,10210,1
4996,10212,0
4997,10213,0
4998,10214,0


In [10]:
data.drop(columns=["PERIOD", "target_flag", "target_sum"], inplace=True)

In [11]:
target_train, target_test = train_test_split(target, test_size=0.1, stratify=target["target_flag"], random_state=42)

In [12]:
trx_data_train = pd.merge(data, target_train["cl_id"], on="cl_id", how="inner")
trx_data_test = pd.merge(data, target_test["cl_id"], on="cl_id", how="inner")

In [13]:
trx_data_train["channel_type"] = trx_data_train["channel_type"].fillna("none")
trx_data_test["channel_type"] = trx_data_test["channel_type"].fillna("none")

In [14]:
month2num = {"JAN": "/01/", "FEB": "/02/", "MAR": "/03/", "APR": "/04/", "MAY": "/05/", "JUN": "/06/",
             "JUL": "/07/", "AUG": "/08/", "SEP": "/09/", "OCT": "/10/", "NOV": "/11/", "DEC": "/12/"}

trx_data_train["TRDATETIME"] = trx_data_train["TRDATETIME"].map(lambda x: x[0:2] + month2num[x[2:5]] + x[5:7] + " " + x[8:])
trx_data_test["TRDATETIME"] = trx_data_test["TRDATETIME"].map(lambda x: x[0:2] + month2num[x[2:5]] + x[5:7] + " " + x[8:])

trx_data_train["TRDATETIME"] = pd.to_datetime(trx_data_train["TRDATETIME"],format='%d/%m/%y %H:%M:%S')
trx_data_test["TRDATETIME"] = pd.to_datetime(trx_data_test["TRDATETIME"],format='%d/%m/%y %H:%M:%S')

In [15]:
chtype2num = {"none": 0, "type1": 1, "type2": 2, "type3": 3, "type4": 4, "type5": 5}

trx_data_train["channel_type"] = trx_data_train["channel_type"].map(lambda x: chtype2num[x])
trx_data_test["channel_type"] = trx_data_test["channel_type"].map(lambda x: chtype2num[x])

In [16]:
trxcat2num = {"POS": 0, "DEPOSIT": 1, "WD_ATM_ROS": 2, "WD_ATM_PARTNER": 3, 
              "C2C_IN": 4, "WD_ATM_OTHER": 5, "C2C_OUT": 6, "BACK_TRX": 7,
              "CAT": 8, "CASH_ADV": 9}

trx_data_train["trx_category"] = trx_data_train["trx_category"].map(lambda x: trxcat2num[x])
trx_data_test["trx_category"] = trx_data_test["trx_category"].map(lambda x: trxcat2num[x])

---

**Квантизация непрерывных признаков (опциональный шаг, нужен только для GPT):**

In [17]:
def digitize(input_array: np.array, q_count: int = 1, bins: np.array = None):
    """Quantile-based discretization function.

    Parameters:
    -------
    input_array (np.array): Input array.
    q_count (int): Amount of quantiles. Used only if input parameter `bins` is None.
    bins (np.array):
        If None, then calculate bins as quantiles of input array,
        otherwise only apply bins to input_array. Default: None

    Returns
    -------
    out_array (np.array of ints): discretized input_array
    bins (np.array of floats):
        Returned only if input parameter `bins` is None.
    """

    if bins is None:
        return_bins = True
        bins = np.quantile(input_array, q=[i / q_count for i in range(1, q_count)], axis=0)
    else:
        return_bins = False

    out_array = np.digitize(input_array, bins)

    if return_bins:
        return out_array, bins
    else:
        return out_array

In [18]:
BINS_NUM = 128

In [19]:
numeric_features = ["amount"]

for feat in numeric_features:
    trx_data_train[feat], bins = digitize(trx_data_train[feat], q_count=BINS_NUM)
    trx_data_test[feat] = digitize(trx_data_test[feat], bins=bins)

In [20]:
import gc

gc.collect()

147

---

In [17]:
preprocessor = PandasDataPreprocessor(
    col_id="cl_id",
    col_event_time="TRDATETIME",
    event_time_transformation="dt_to_timestamp",
    cols_category=["MCC", "channel_type", "currency", "trx_category"],
    cols_numerical=["amount"],
    return_records=False,
)

In [18]:
data_train = preprocessor.fit_transform(trx_data_train)
data_test = preprocessor.transform(trx_data_test)

In [19]:
target_train.rename(columns={"target_flag": "target"}, inplace=True)
target_test.rename(columns={"target_flag": "target"}, inplace=True)
target_train.sort_values(by="cl_id", inplace=True)
target_test.sort_values(by="cl_id", inplace=True)
target_train = target_train["target"]
target_test = target_test["target"]
target_train.reset_index(drop=True, inplace=True)
target_test.reset_index(drop=True, inplace=True)

In [20]:
data_train = data_train.to_dict(orient="records")
data_test = data_test.to_dict(orient="records")

---

**Window Aggregator Class:**

In [21]:
from ptls.data_load.padded_batch import PaddedBatch


class WinAggregator(TrxEncoderT2V):
    """The NN layer, a combination of TrxEncoder and Mean Aggregation within a window of #`agg_samples` transactions 
       (works like nn.Sequential([TrxEncoder, Mean Window Aggregation])).
       It is assumed that any two different windows do not overlap here.
       
       The types of the input and output are `PaddedBatch` of shapes (B, L, T) and (B, L', T) respectively, where 
       B means batch_size,
       L/L' means the max length of a sequence of transactions in a batch (the length is the same as #trx)
       T means the dimension of a single transaction.

       Parameters
        agg_samples (int):
            The number of transactions in a sliding aggregation window.

        use_window_attention (bool):
            If True, the attention layer will be applied to transactions in a sliding window before pooling.

        k (int):
            Number of periodic components in T2V time embeddings

        time_col (str):
            Name of the time column in data
            
        embeddings:
            You can find info about this param in TrxEncoder desc.
        
        numeric_values:
            You can find info about this param in TrxEncoder desc.

        embeddings_noise:
            You can find info about this param in TrxEncoder desc.
            
        emb_dropout:
            You can find info about this param in TrxEncoder desc.
            
        spatial_dropout:
            You can find info about this param in TrxEncoder desc.

        use_batch_norm:
            You can find info about this param in TrxEncoder desc.

        orthogonal_init:
            You can find info about this param in TrxEncoder desc.
            
        linear_projection_size:
            You can find info about this param in TrxEncoder desc.

        out_of_index:
            You can find info about this param in TrxEncoder desc.

        norm_embeddings:
            Keep default value for this parameter
        
        clip_replace_value:
            Not used. Keep default value for this parameter
        
        positions: 
            Not used. Keep default value for this parameter
       """

    def __init__(self,
                 agg_samples=3,
                 use_window_attention=False,
                 embeddings=None,
                 numeric_values=None,
                 custom_embeddings=None,
                 time_values=None,
                 embeddings_noise: float = 0,
                 norm_embeddings=None,
                 use_batch_norm=False,
                 use_batch_norm_with_lens=False,
                 clip_replace_value=None,
                 positions=None,
                 emb_dropout=0,
                 spatial_dropout=False,
                 orthogonal_init=False,
                 linear_projection_size=0,
                 out_of_index: str = 'clip',
                 k=2,
                 time_col='event_time'
                ):

        
        super().__init__(
            embeddings=embeddings,
            numeric_values=numeric_values,
            custom_embeddings=custom_embeddings,
            embeddings_noise=embeddings_noise,
            norm_embeddings=norm_embeddings,
            use_batch_norm=use_batch_norm,
            use_batch_norm_with_lens=use_batch_norm_with_lens,
            clip_replace_value=clip_replace_value,
            positions=positions,
            emb_dropout=emb_dropout,
            spatial_dropout=spatial_dropout,
            orthogonal_init=orthogonal_init,
            linear_projection_size=linear_projection_size,
            out_of_index=out_of_index,
            k=k,
            time_col=time_col
        )

        self.agg_samples = agg_samples

        self.use_window_attention = use_window_attention
        if self.use_window_attention:
            pass # Not Implemented

    def forward(self, pb: PaddedBatch):
        embeds = super().forward(pb)

        mask = torch.arange(embeds.payload.shape[1], device=embeds.device)[None, :] + torch.ones((embeds.seq_lens.shape[0], embeds.payload.shape[1]), device=embeds.device)
        mask[mask > embeds.seq_lens[:, None]] = 0.
        mask[mask > 0.] = 1.
        mask = mask[:, :, None]
    
        masked_embeds = embeds.payload * mask
    
        num_samples_to_add = self.agg_samples - (masked_embeds.shape[1] % self.agg_samples)  
        if num_samples_to_add > 0:
            additional_samples = torch.zeros((masked_embeds.shape[0], num_samples_to_add, masked_embeds.shape[2]), device=masked_embeds.device)
            masked_embeds = torch.cat((masked_embeds, additional_samples), dim=1)
            mask_additional_samples = torch.zeros((mask.shape[0], num_samples_to_add, mask.shape[2]), device=mask.device)
            mask = torch.cat((mask, mask_additional_samples), dim=1)
    
        masked_embeds = torch.reshape(masked_embeds, (masked_embeds.shape[0], masked_embeds.shape[1] // self.agg_samples, self.agg_samples, masked_embeds.shape[2]))
        mask = torch.reshape(mask, (mask.shape[0], mask.shape[1] // self.agg_samples, self.agg_samples, mask.shape[2]))

        if self.use_window_attention:
            pass # Not Implemented
        
        mask = torch.sum(mask, dim=2)
        mask[mask == 0.] = 1.
    
        mean_embeds = torch.sum(masked_embeds, dim=2) / mask

        new_seq_lens = embeds.seq_lens // self.agg_samples
        div_mod_seq_lens = ((embeds.seq_lens % self.agg_samples) > 0).int()
        new_seq_lens += div_mod_seq_lens

        return PaddedBatch(mean_embeds, new_seq_lens)

---

# Sliding Window Aggregation (Mean Pooling) 

- **COLES:**

In [78]:
seed_everything(30)

**DataLoaders:**

In [79]:
data = PtlsDataModule(
    train_data=ColesDataset(
        MemoryMapDataset(
            data=data_train,
            i_filters=[SeqLenFilter(min_seq_len=10)],
        ),
        splitter=SampleSlices(
            split_count=5,
            cnt_min=15,
            cnt_max=150,
        ),
    ),
    train_num_workers=4,
    train_batch_size=128,
    valid_data=ColesDataset(
        MemoryMapDataset(
            data=data_test,
            i_filters=[SeqLenFilter(min_seq_len=10)],
        ),
        splitter=SampleSlices(
            split_count=5,
            cnt_min=15,
            cnt_max=150,
        ),
    ),
    valid_num_workers=4,
    valid_batch_size=128
)

**Модель:**

In [80]:
N_EPOCHS = 20

In [81]:
agg_encoder_params = dict(
    embeddings={
        "MCC": {"in": 342, "out": 8},
        "channel_type": {"in": 7, "out": 8},
        "currency": {"in": 60, "out": 8},
        "trx_category": {"in": 11, "out": 8}            
    },
    numeric_values={"amount": "log"},
    embeddings_noise=0.003,
    k=7,
    time_col="event_time",
    agg_samples=10, # 3, 5, 10
    use_window_attention=False
)

trx_encoder = WinAggregator(**agg_encoder_params)

seq_encoder = RnnSeqEncoder(
    trx_encoder=trx_encoder,
    hidden_size=512,
    type="gru"
)

coles = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=1e-3, weight_decay=0),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.CosineAnnealingLR, T_max=N_EPOCHS, eta_min=5e-6)
)

**Обучение:**

In [82]:
logger = CometLogger(project_name="evs-ssl-rb", experiment_name="CoLES_WinAgg (10 trx)")

trainer = pl.Trainer(
    logger=logger,
    max_epochs=N_EPOCHS,
    accelerator="gpu",
    devices=1,
    enable_progress_bar=True
)

In [83]:
trainer.fit(coles, data)

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/askoro/evs-ssl-rb/c3ebf667bfd14c72a9245f7772d3df8e

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/kaggle/working' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]


The number of training batches (33) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.



Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : CoLES_WinAgg (10 trx)
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/askoro/evs-ssl-rb/c3ebf667bfd14c72a9245f7772d3df8e
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     loss [79]               : (56.88361358642578, 728.8323974609375)
[1;38;5;39mCOMET INFO:[0m     seq_len [13]            : (48.70624923706055, 56.78750228881836)
[1;38;5;39mCOMET INFO:[0m     valid/recall_top_k [20] : (0.4856342673301697, 0.8206671476364136)
[1;38;5;39mCOMET INFO:[0m   Others:
[1;38;5;39mCOMET INFO

In [84]:
trainer.logged_metrics

{'loss': tensor(59.2878),
 'seq_len': tensor(55.1967),
 'valid/recall_top_k': tensor(0.8122)}

In [28]:
torch.save(seq_encoder.state_dict(), "coles_enc_baseline_rosbank.pt")

**Измерим качество на тесте (catboost поверх эмбеддингов):**

In [None]:
# !wget "https://drive.google.com/uc?export=download&id=1Mn8o9IPT4Zzg3946orbw1MVZwpkrBoNb" -O "coles_enc_baseline.pt"

In [85]:
encoder = coles.seq_encoder

# state_dict = torch.load("./coles_enc_baseline.pt")
# encoder.load_state_dict(state_dict)

device = "cuda:0"

encoder.to(device)

RnnSeqEncoder(
  (trx_encoder): WinAggregator(
    (embeddings): ModuleDict(
      (MCC): NoisyEmbedding(
        342, 8, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (channel_type): NoisyEmbedding(
        7, 8, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (currency): NoisyEmbedding(
        60, 8, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (trx_category): NoisyEmbedding(
        11, 8, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
    )
    (custom_embeddings): ModuleDict(
      (amount): LogScaler()
    )
    (time2vec_days): Time2Vec()
  )
  (seq_encoder): RnnEncoder(
    (rnn): GRU(41, 512, batch_first=True)
    (reducer): LastStepEncoder()
  )
)

In [86]:
from tqdm import tqdm

seed_everything(30)

In [87]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)
encoder.eval()
train_embeds = None

for i, batch in tqdm(enumerate(train_loader)):
    train_embeds_batch = encoder(batch.to(device))
    if i == 0:
        train_embeds = train_embeds_batch.detach().cpu().numpy()
    else:
        train_embeds = np.concatenate([train_embeds, train_embeds_batch.detach().cpu().numpy()], axis=0)
    
train_embeds

36it [00:00, 59.31it/s]


array([[-0.60833985, -0.511413  , -0.09896265, ...,  0.1076753 ,
        -0.01422375, -0.28425995],
       [-0.9892348 , -0.56318617,  0.9520664 , ...,  0.9001066 ,
         0.12255293,  0.33234027],
       [-0.84397465, -0.40318084,  0.5685012 , ...,  0.8353386 ,
         0.04327878,  0.6504765 ],
       ...,
       [-0.88698775, -0.22863524,  0.8202358 , ...,  0.12695561,
         0.01108486, -0.2747371 ],
       [-0.92584634,  0.4159664 , -0.33733493, ..., -0.76366967,
         0.03447534, -0.6255679 ],
       [-0.57684284,  0.01687094, -0.26798904, ...,  0.11736609,
         0.01469811, -0.43869486]], dtype=float32)

In [88]:
test_loader = inference_data_loader(data_test, num_workers=0, batch_size=128)
encoder.eval()
test_embeds = None

for i, batch in tqdm(enumerate(test_loader)):
    test_embeds_batch = encoder(batch.to(device))
    if i == 0:
        test_embeds = test_embeds_batch.detach().cpu().numpy()
    else:
        test_embeds = np.concatenate([test_embeds, test_embeds_batch.detach().cpu().numpy()], axis=0)
    
test_embeds

4it [00:00, 58.76it/s]


array([[-0.960636  , -0.23163263,  0.9502865 , ...,  0.91831595,
         0.0973716 ,  0.34862608],
       [-0.74289703, -0.552289  ,  0.6497263 , ...,  0.36723498,
        -0.03615671,  0.7167583 ],
       [-0.9683374 , -0.57284385,  0.98294455, ...,  0.89663374,
         0.082857  ,  0.21520062],
       ...,
       [-0.6565343 ,  0.09395197, -0.7778541 , ...,  0.14457308,
        -0.03706352, -0.54188603],
       [-0.85760665,  0.02938094, -0.46376103, ..., -0.17878065,
        -0.0320862 , -0.47163618],
       [-0.99402833,  0.10789418, -0.38567895, ..., -0.6211662 ,
         0.10599243, -0.3955111 ]], dtype=float32)

In [89]:
clf = CatBoostClassifier(loss_function='MultiClass', task_type="GPU", devices='0', random_state=30)

clf.fit(train_embeds, target_train, plot_file="catboost_log.html")

Learning rate set to 0.088214
0:	learn: 0.6696746	total: 9.13ms	remaining: 9.12s
1:	learn: 0.6490142	total: 15.4ms	remaining: 7.67s
2:	learn: 0.6323775	total: 21.7ms	remaining: 7.21s
3:	learn: 0.6176477	total: 28.2ms	remaining: 7.01s
4:	learn: 0.6037361	total: 34.9ms	remaining: 6.93s
5:	learn: 0.5919549	total: 41.7ms	remaining: 6.9s
6:	learn: 0.5811617	total: 48.6ms	remaining: 6.89s
7:	learn: 0.5718378	total: 55.3ms	remaining: 6.85s
8:	learn: 0.5627696	total: 61.2ms	remaining: 6.74s
9:	learn: 0.5550774	total: 67.3ms	remaining: 6.66s
10:	learn: 0.5481952	total: 73.8ms	remaining: 6.64s
11:	learn: 0.5420741	total: 80ms	remaining: 6.58s
12:	learn: 0.5358254	total: 86.2ms	remaining: 6.54s
13:	learn: 0.5303205	total: 92.4ms	remaining: 6.51s
14:	learn: 0.5252356	total: 98.6ms	remaining: 6.47s
15:	learn: 0.5212352	total: 105ms	remaining: 6.43s
16:	learn: 0.5170278	total: 111ms	remaining: 6.4s
17:	learn: 0.5131427	total: 117ms	remaining: 6.38s
18:	learn: 0.5097282	total: 123ms	remaining: 6.35s


<catboost.core.CatBoostClassifier at 0x7992df8fbe80>

In [90]:
test_pred = clf.predict(test_embeds)
test_proba = clf.predict_proba(test_embeds)[:, 1]

In [91]:
print("Accuracy:", accuracy_score(target_test, test_pred))
print("ROC-AUC:", roc_auc_score(target_test, test_proba))

Accuracy: 0.726
ROC-AUC: 0.7927830211587963


- COLES embeds + Catboost:
  - `Accuracy: 0.7328 +- 0.0194`
  -  `ROC-AUC: 0.8057 +- 0.0088`

---

- COLES embeds + WinAgg (3 trx) + Catboost:
  - Accuracy: `0.7272 +- 0.0093`
  - ROC-AUC: `0.7992 +- 0.0078`

---

- COLES embeds + WinAgg (5 trx) + Catboost:
  - Accuracy: `0.7192 +- 0.0079`
  - ROC-AUC: `0.7977 +- 0.0045`

---

- COLES embeds + WinAgg (10 trx) + Catboost:
  - Accuracy: `0.7176 +- 0.0108`
  - ROC-AUC: `0.7896 +- 0.0057`

**Вывод:** для CoLES - чем больше агрегирующее окно, тем хуже результаты. Также результаты очень просели для окна из 10 транзакций => предполагая, что та же тенденция сохраняется и для других моделей, дальше не используем окно такого размера.

---

- **CPC modeling:**

In [134]:
seed_everything(222)

**DataLoaders:**

In [135]:
data = PtlsDataModule(
    train_data=CpcDataset(
        MemoryMapDataset(data=data_train),
        min_len=85,             
        max_len=105
    ),
    train_num_workers=4,
    train_batch_size=128,
    valid_data=CpcDataset(
        MemoryMapDataset(data=data_test),
        min_len=85,
        max_len=105
    ),
    valid_num_workers=4,
    valid_batch_size=128
)

**Модель:**

In [136]:
N_EPOCHS = 20

In [137]:
agg_encoder_params = dict(
    embeddings={
        "MCC": {"in": 342, "out": 32}, # 8 / 16
        "channel_type": {"in": 7, "out": 32},
        "currency": {"in": 60, "out": 32},
        "trx_category": {"in": 11, "out": 32}            
    },
    numeric_values={"amount": "log"},
    embeddings_noise=0.003,
    k=31,
    time_col="event_time",
    agg_samples=6, # 3, 6
    use_window_attention=False
)

trx_encoder = WinAggregator(**agg_encoder_params)

seq_encoder = RnnSeqEncoder(
    trx_encoder=trx_encoder,
    hidden_size=512,
    type="gru"
)

cpc = CpcModule(
    seq_encoder=seq_encoder,
    n_forward_steps=1, # 2, 1
    n_negatives=40,
    optimizer_partial=partial(torch.optim.Adam, lr=5e-4),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=5, gamma=0.5)
)

**Обучение:**

In [138]:
logger = CometLogger(project_name="evs-ssl-rb", experiment_name="CPC_modeling_WinAgg")

trainer = pl.Trainer(
    logger=logger,
    max_epochs=N_EPOCHS,
    accelerator="gpu",
    devices=1,
    enable_progress_bar=True
)

In [139]:
trainer.fit(cpc, data)

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/askoro/evs-ssl-rb/7cd8ca555d9b4e5090ace2a598899e5d

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/kaggle/working' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]


The number of training batches (36) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.



Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : CPC_modeling_WinAgg
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/askoro/evs-ssl-rb/7cd8ca555d9b4e5090ace2a598899e5d
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     loss [86]               : (1.4521113634109497, 4.163649082183838)
[1;38;5;39mCOMET INFO:[0m     seq_len [14]            : (61.9375, 68.0078125)
[1;38;5;39mCOMET INFO:[0m     valid/cpc_accuracy [20] : (0.4018036425113678, 0.6952918171882629)
[1;38;5;39mCOMET INFO:[0m   Others:
[1;38;5;39mCOMET INFO:[0m     Name : C

In [140]:
trainer.logged_metrics

{'loss': tensor(1.9523),
 'seq_len': tensor(58.2000),
 'valid/cpc_accuracy': tensor(0.6821)}

In [82]:
torch.save(seq_encoder.state_dict(), "cpc_enc_baseline_rosbank.pt")

**Измерим качество на тесте (catboost поверх эмбеддингов):**

In [None]:
# !wget "https://drive.google.com/uc?export=download&id=11j6QgNsdOSTK-GRaAJLKObDW7ehS_aqK" -O "cpc_enc_baseline_higher_trx_dim.pt"

In [141]:
encoder = cpc.seq_encoder

# state_dict = torch.load("./cpc_enc_baseline_higher_trx_dim.pt")
# encoder.load_state_dict(state_dict)

device = "cuda:0"

encoder.to(device)

RnnSeqEncoder(
  (trx_encoder): WinAggregator(
    (embeddings): ModuleDict(
      (MCC): NoisyEmbedding(
        342, 32, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (channel_type): NoisyEmbedding(
        7, 32, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (currency): NoisyEmbedding(
        60, 32, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (trx_category): NoisyEmbedding(
        11, 32, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
    )
    (custom_embeddings): ModuleDict(
      (amount): LogScaler()
    )
    (time2vec_days): Time2Vec()
  )
  (seq_encoder): RnnEncoder(
    (rnn): GRU(161, 512, batch_first=True)
    (reducer): LastStepEncoder()
  )
)

In [142]:
encoder.seq_encoder.is_reduce_sequence = True

In [143]:
from tqdm import tqdm

seed_everything(222)

In [144]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)
encoder.eval()
train_embeds = None

for i, batch in tqdm(enumerate(train_loader)):
    train_embeds_batch = encoder(batch.to(device))
    if i == 0:
        train_embeds = train_embeds_batch.detach().cpu().numpy()
    else:
        train_embeds = np.concatenate([train_embeds, train_embeds_batch.detach().cpu().numpy()], axis=0)
    
train_embeds

36it [00:00, 48.88it/s]


array([[-0.17229109,  0.07615294,  0.07387932, ...,  0.0483517 ,
        -0.01161854, -0.1190909 ],
       [ 0.17989685,  0.64318675, -0.45537663, ...,  0.58666295,
         0.37570205,  0.31505302],
       [ 0.29434347,  0.6613744 , -0.04085184, ...,  0.03814859,
         0.65302455,  0.18656495],
       ...,
       [ 0.55100775,  0.37672985,  0.15035829, ...,  0.58635485,
         0.37677187, -0.04708023],
       [ 0.16981274,  0.62859243, -0.3382272 , ...,  0.4807086 ,
         0.25789702,  0.11205652],
       [-0.03994576, -0.19041565,  0.13152125, ...,  0.3004274 ,
         0.02740382,  0.3298819 ]], dtype=float32)

In [145]:
test_loader = inference_data_loader(data_test, num_workers=0, batch_size=128)
encoder.eval()
test_embeds = None

for i, batch in tqdm(enumerate(test_loader)):
    test_embeds_batch = encoder(batch.to(device))
    if i == 0:
        test_embeds = test_embeds_batch.detach().cpu().numpy()
    else:
        test_embeds = np.concatenate([test_embeds, test_embeds_batch.detach().cpu().numpy()], axis=0)
    
test_embeds

4it [00:00, 48.91it/s]


array([[-0.1913497 ,  0.65974444, -0.18446562, ...,  0.46404174,
         0.48654094,  0.58440626],
       [-0.07537233,  0.5729245 ,  0.10845622, ...,  0.13441351,
         0.37286583, -0.30425724],
       [ 0.18643904,  0.69629174, -0.14590877, ...,  0.22245212,
         0.33003503,  0.10039242],
       ...,
       [ 0.18698406,  0.761091  ,  0.12840372, ...,  0.4177893 ,
         0.56784433,  0.01461408],
       [ 0.17908724,  0.46401167,  0.19052406, ...,  0.3157004 ,
         0.39695886, -0.2815557 ],
       [-0.04101846,  0.55337816,  0.09129538, ...,  0.54729146,
         0.4544642 ,  0.12178972]], dtype=float32)

In [146]:
clf = CatBoostClassifier(loss_function='MultiClass', task_type="GPU", devices='0', random_state=222)

clf.fit(train_embeds, target_train, plot_file="catboost_log.html")

Learning rate set to 0.088214
0:	learn: 0.6662335	total: 9.71ms	remaining: 9.71s
1:	learn: 0.6430436	total: 16.5ms	remaining: 8.22s
2:	learn: 0.6233235	total: 22.8ms	remaining: 7.59s
3:	learn: 0.6065900	total: 29.4ms	remaining: 7.33s
4:	learn: 0.5912355	total: 36.1ms	remaining: 7.18s
5:	learn: 0.5782384	total: 42.8ms	remaining: 7.09s
6:	learn: 0.5666390	total: 49.6ms	remaining: 7.03s
7:	learn: 0.5558644	total: 56.4ms	remaining: 7s
8:	learn: 0.5465631	total: 63.1ms	remaining: 6.95s
9:	learn: 0.5381661	total: 69.6ms	remaining: 6.89s
10:	learn: 0.5302764	total: 76.4ms	remaining: 6.87s
11:	learn: 0.5228521	total: 82.9ms	remaining: 6.83s
12:	learn: 0.5167877	total: 89.7ms	remaining: 6.81s
13:	learn: 0.5111705	total: 96.4ms	remaining: 6.79s
14:	learn: 0.5061233	total: 103ms	remaining: 6.78s
15:	learn: 0.5008920	total: 110ms	remaining: 6.78s
16:	learn: 0.4959553	total: 117ms	remaining: 6.78s
17:	learn: 0.4916247	total: 124ms	remaining: 6.78s
18:	learn: 0.4873912	total: 131ms	remaining: 6.77s


<catboost.core.CatBoostClassifier at 0x7f651a25f940>

In [147]:
test_pred = clf.predict(test_embeds)
test_proba = clf.predict_proba(test_embeds)[:, 1]

In [148]:
print("Accuracy:", accuracy_score(target_test, test_pred))
print("ROC-AUC:", roc_auc_score(target_test, test_proba))

Accuracy: 0.758
ROC-AUC: 0.8204173479464474


In [150]:
arr = np.array([0.7942562043677452, 0.8178109468844603, 0.8084214275307184, 0.8204173479464474])

arr.mean(), arr.std()

(0.8102264816823428, 0.01024311775791973)

- CPC context embeds + Catboost:
  - `Accuracy: 0.748 +- 0.0033`
  - `ROC-AUC: 0.81 +- 0.0048`

---

- CPC context embeds + WinAgg (3 trx) + Catboost:
  - `Accuracy: 0.7504 +- 0.0093`
  - `ROC-AUC: 0.8136 +- 0.0087`

---

- CPC context embeds + WinAgg (5 trx) + Catboost:
  - `Accuracy: 0.7352 +- 0.0047`
  - `ROC-AUC: 0.8086 +- 0.0055`

---

- CPC context embeds (2 forward steps) + WinAgg (3 trx) + Catboost:
  - `Accuracy: 0.7475 +- 0.0086`
  - `ROC-AUC: 0.8099 +- 0.0069`

---

- CPC context embeds (1 forward step) + WinAgg (6 trx) + Catboost:
  - `Accuracy: 0.748 +- 0.0062` (`max: 0.758` - выше, чем во всех остальных случаях)
  - `ROC-AUC: 0.8102 +- 0.0102` (`max: 0.8204173479464474` - выше, чем во всех остальных случаях)


**Вывод:** оптимальный размер окна - 3 - при таком сжатии  информации качество в среднем лучше, дальнейшее сжатие информации (при бОльших размерах окна) лишь ухудшает результаты.

---

- **GPT:**

In [158]:
seed_everything(222)

**DataLoaders:**

In [159]:
data = PtlsDataModule(
    train_data=GptDataset(
        MemoryMapDataset(data=data_train),
        min_len=1000, # 85
        max_len=1200 # 105
    ),
    train_num_workers=4,
    train_batch_size=64,
    valid_data=GptDataset(
        MemoryMapDataset(data=data_test),
        min_len=1000,
        max_len=1200
    ),
    valid_num_workers=4,
    valid_batch_size=64
)

**Модель:**

In [160]:
from torchmetrics import MeanMetric
from typing import Tuple, Dict, List, Union
from torch import nn
import torch.nn.functional as F 
from ptls.nn.seq_encoder.abs_seq_encoder import AbsSeqEncoder
from ptls.nn import PBL2Norm
from ptls.data_load.padded_batch import PaddedBatch


class MeanPooling(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, pb: PaddedBatch):
        payload = pb.payload # (B, T, H)
        mask = pb.seq_len_mask.bool()
        pb_mean = payload.sum(dim=1) / mask.float().sum(dim=1, keepdim=True)
        return pb_mean


class StatPooling(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, pb: PaddedBatch):
        payload = pb.payload # (B, T, H)
        mask = pb.seq_len_mask.bool()
        inf_mask = torch.zeros_like(mask, device=mask.device).float()
        inf_mask[~mask] = -torch.inf
        
        pb_mean = payload.sum(dim=1) / mask.float().sum(dim=1, keepdim=True)
        pb_max = torch.max(payload + inf_mask.unsqueeze(-1), dim=1)[0]
        pb_stat = torch.cat((pb_mean, pb_max), dim=1)
        return pb_stat


class GPTHead(torch.nn.Module):   
    def __init__(self, input_size, n_classes, hidden_size=64, drop_p=0.1):
        super().__init__()
        self.head = nn.Sequential(
            nn.Linear(input_size, hidden_size, bias=True),
            nn.GELU(),
            nn.Dropout(drop_p),
            nn.Linear(hidden_size, n_classes)
        )
    def forward(self, x):
        x = self.head(x)
        return x


class GptPretrainModule(pl.LightningModule):
    """GPT2 Language model

    Sequence transactions are encoded by `trx_encoder`.
    Then `seq_encoder` encodes the given sequence 
    (we actually use NN to modify sequence transactions representations,
    then (during inference) we calculate the mean of these encoded transactions to get the representation of the whole sequence).
    After this we use heads to predict the classes of features of the future transaction.

    Parameters
    ----------
    trx_encoder:
        Module for transform dict with feature sequences to sequence of transaction representations
    seq_encoder:
        Module for sequence processing. Generally this is transformer based encoder. Rnn is also possible
        Should work without sequence reduction
    head_hidden_size:
        Hidden size of heads for feature prediction
    seed_seq_len:
         Size of starting sequence without loss 
    total_steps:
        total_steps expected in OneCycle lr scheduler
    max_lr:
        max_lr of OneCycle lr scheduler
    weight_decay:
        weight_decay of Adam optimizer
    pct_start:
        % of total_steps when lr increase
    norm_predict:
        use l2 norm for transformer output or not
    """

    def __init__(self,
                 trx_encoder: torch.nn.Module,
                 seq_encoder: AbsSeqEncoder,
                 head_hidden_size: int = 64,
                 total_steps: int = 64000,
                 seed_seq_len: int = 16,
                 max_lr: float = 0.00005,
                 weight_decay: float = 0.0,
                 pct_start: float = 0.1,
                 norm_predict: bool = False
                 ):

        super().__init__()
        self.save_hyperparameters(ignore=['trx_encoder', 'seq_encoder'])

        self.trx_encoder = trx_encoder
        self._seq_encoder = seq_encoder
        self._seq_encoder.is_reduce_sequence = False

        self.head = nn.ModuleDict()
        for col_name, noisy_emb in self.trx_encoder.embeddings.items():
            self.head[col_name] = GPTHead(input_size=self._seq_encoder.embedding_size, hidden_size=head_hidden_size, n_classes=noisy_emb.num_embeddings)

        if self.hparams.norm_predict:
            self.fn_norm_predict = PBL2Norm()

        self.loss = nn.CrossEntropyLoss(ignore_index=0)

        self.train_gpt_loss = MeanMetric()
        self.valid_gpt_loss = MeanMetric()

    def forward(self, batch: PaddedBatch):
        z_trx = self.trx_encoder(batch) 
        out = self._seq_encoder(z_trx)
        if self.hparams.norm_predict:
            out = self.fn_norm_predict(out)
        return out

    def loss_gpt(self, logits, labels):
        loss = 0
        for col_name, head in self.head.items():
            y_pred = head(logits[:, self.hparams.seed_seq_len:-1, :])
            y_pred = y_pred.view(-1, y_pred.size(-1))

            y_true = labels[col_name][:, self.hparams.seed_seq_len+1:]
            y_true = torch.flatten(y_true.long())
            
            loss += self.loss(y_pred, y_true)
            
        return loss

    def training_step(self, batch, batch_idx):
        out = self.forward(batch)  # PB: B, T, H
        out = out.payload if isinstance(out, PaddedBatch) else out
        labels = batch.payload
        
        loss_gpt = self.loss_gpt(out, labels)
        self.train_gpt_loss(loss_gpt)
        self.log(f'loss', loss_gpt, sync_dist=True)
        return loss_gpt

    def validation_step(self, batch, batch_idx):
        out = self.forward(batch)  # PB: B, T, H
        out = out.payload if isinstance(out, PaddedBatch) else out
        labels = batch.payload
        
        loss_gpt = self.loss_gpt(out, labels)
        self.valid_gpt_loss(loss_gpt)

    def on_training_epoch_end(self):
        self.log('train loss (by epochs)', self.train_gpt_loss, prog_bar=True, logger=True, sync_dist=True, rank_zero_only=True)

    def on_validation_epoch_end(self):
        self.log('val loss (by epochs)', self.valid_gpt_loss, prog_bar=True, logger=True, sync_dist=True, rank_zero_only=True)

    def configure_optimizers(self):
        optim = torch.optim.NAdam(self.parameters(),
                                  lr=self.hparams.max_lr,
                                  weight_decay=self.hparams.weight_decay
                                 )
        
        scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer=optim,
            max_lr=self.hparams.max_lr,
            total_steps=self.hparams.total_steps,
            pct_start=self.hparams.pct_start,
            anneal_strategy='cos',
            cycle_momentum=False,
            div_factor=25.0,
            final_div_factor=10000.0,
            three_phase=False
        )
        
        scheduler = {'scheduler': scheduler, 'interval': 'step'}
        return [optim], [scheduler]
    
    @property
    def seq_encoder(self):
        return GPTInferenceModule(pretrained_model=self)


class GPTInferenceModule(torch.nn.Module):
    def __init__(self, pretrained_model):
        super().__init__()
        self.model = pretrained_model
        self.model.is_reduce_sequence = False
        self.mean_pooling = MeanPooling()
        self.stat_pooling = StatPooling()

    def forward(self, batch, eval_strategy="mean"):
        z_trx = self.model.trx_encoder(batch)
        out = self.model._seq_encoder(z_trx)
        out = out if isinstance(out, PaddedBatch) else PaddedBatch(out, batch.seq_lens)

        if eval_strategy == "mean":
            out = self.mean_pooling(out)
        elif eval_strategy == "stat":
            out = self.stat_pooling(out)

        if self.model.hparams.norm_predict:
            out = out / (out.pow(2).sum(dim=-1, keepdim=True) + 1e-9).pow(0.5)
        return out

In [161]:
class WinAggGPTPretrainModule(GptPretrainModule):
    def __init__(self,
                 trx_encoder: torch.nn.Module,
                 seq_encoder: AbsSeqEncoder,
                 head_hidden_size: int = 64,
                 total_steps: int = 64000,
                 seed_seq_len: int = 16,
                 max_lr: float = 0.00005,
                 weight_decay: float = 0.0,
                 pct_start: float = 0.1,
                 norm_predict: bool = False
                 ):
        super().__init__(
            trx_encoder=trx_encoder,
            seq_encoder=seq_encoder,
            head_hidden_size=head_hidden_size,
            total_steps=total_steps,
            seed_seq_len=seed_seq_len,
            max_lr=max_lr,
            weight_decay=weight_decay,
            pct_start=pct_start,
            norm_predict=norm_predict
        )
        self.agg_samples = trx_encoder.agg_samples

    def loss_gpt(self, logits, labels):
        loss = 0
        
        for col_name, head in self.head.items():
            out = head(logits[:, self.hparams.seed_seq_len:-1, :])
            
            y_true = labels[col_name][:, ((self.hparams.seed_seq_len + 1) * self.agg_samples)::self.agg_samples]
            y_true = torch.flatten(y_true.long())
            
            if y_true.shape[0] < out.shape[0] * out.shape[1]:
                pred = out[:, :-1, :]
                pred = pred.reshape(-1, pred.size(-1))
            else:
                pred = out.reshape(-1, out.size(-1))
                    
            loss += self.loss(pred, y_true)
                
        return loss


class WinAggGPTPretrainModule_MultiLabel(GptPretrainModule):
    def __init__(self,
                 trx_encoder: torch.nn.Module,
                 seq_encoder: AbsSeqEncoder,
                 head_hidden_size: int = 64,
                 total_steps: int = 64000,
                 seed_seq_len: int = 16,
                 max_lr: float = 0.00005,
                 weight_decay: float = 0.0,
                 pct_start: float = 0.1,
                 norm_predict: bool = False
                 ):
        super().__init__(
            trx_encoder=trx_encoder,
            seq_encoder=seq_encoder,
            head_hidden_size=head_hidden_size,
            total_steps=total_steps,
            seed_seq_len=seed_seq_len,
            max_lr=max_lr,
            weight_decay=weight_decay,
            pct_start=pct_start,
            norm_predict=norm_predict
        )
        self.agg_samples = trx_encoder.agg_samples
        self.loss = nn.MultiLabelSoftMarginLoss()

    def loss_gpt(self, logits, labels):
        loss = 0
        
        for col_name, head in self.head.items():
            pred = head(logits[:, self.hparams.seed_seq_len:-1, :])

            ohe_labels = torch.zeros((pred.shape[0] * pred.shape[1], pred.shape[2]), device=pred.device)
            
            for shift in range(self.agg_samples):
                y_true = labels[col_name][:, ((self.hparams.seed_seq_len + 1) * self.agg_samples + shift)::self.agg_samples]
                y_true = torch.flatten(y_true.long())
                ohe_labels_part = F.one_hot(y_true, num_classes=pred.shape[2])
                
                if ohe_labels_part.shape[0] < pred.shape[0] * pred.shape[1]:
                    padding = torch.zeros((pred.shape[0], 1, pred.shape[2]), device=ohe_labels_part.device)
                    ohe_labels_part = torch.cat((ohe_labels_part.reshape(pred.shape[0], pred.shape[1] - 1, pred.shape[2]), padding), dim=1).reshape(pred.shape[0] * pred.shape[1], pred.shape[2])
                
                ohe_labels += ohe_labels_part

            ohe_labels[ohe_labels > 1] = 1
            
            pred = pred.reshape(-1, pred.size(-1))

            loss += self.loss(pred, ohe_labels)
                
        return loss

In [162]:
N_EPOCHS = 20

In [163]:
agg_encoder_params = dict(
    embeddings_noise=0.003,
    embeddings={
        "MCC": {"in": 342, "out": 16},
        "channel_type": {"in": 7, "out": 16},
        "currency": {"in": 60, "out": 16},
        "trx_category": {"in": 11, "out": 16},
        "amount": {"in": BINS_NUM, "out": 16}
    },
    k=15,
    time_col="event_time",
    agg_samples=5, # 3, 5, 10
    use_window_attention=False
)

trx_encoder = WinAggregator(**agg_encoder_params)

seq_encoder = GptEncoder(
    n_embd=trx_encoder.output_size,
    n_layer=6,
    n_head=6,
    n_inner=512,
    activation_function="gelu_new",
    resid_pdrop=0.1,
    embd_pdrop=0.1,
    attn_pdrop=0.1,
    n_positions=2048,
    use_positional_encoding=True,
    use_start_random_shift=True,
    is_reduce_sequence=False
)

gpt = WinAggGPTPretrainModule_MultiLabel(
    trx_encoder=trx_encoder,
    seq_encoder=seq_encoder,
    head_hidden_size=512,
    total_steps=(N_EPOCHS * 71), # num_epochs * num_steps_per_epoch
    seed_seq_len=16,
    max_lr=3e-3,
    weight_decay=3e-4,
    pct_start=0.1,
    norm_predict=False
)

**Обучение:**

In [164]:
logger = CometLogger(project_name="evs-ssl-rb", experiment_name="GPT_modeling_WinAgg (multilabel, 5 trx)")

trainer = pl.Trainer(
    logger=logger,
    max_epochs=N_EPOCHS,
    accelerator="gpu",
    devices=1,
    enable_progress_bar=True
)

In [165]:
trainer.fit(gpt, data)

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/kaggle/working' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.
[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/askoro/evs-ssl-rb/023c4195f71940bbb9b238a3bc43da91



Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : GPT_modeling_WinAgg (multilabel, 5 trx)
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/askoro/evs-ssl-rb/023c4195f71940bbb9b238a3bc43da91
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     loss [170]                : (0.03635042905807495, 3.478447198867798)
[1;38;5;39mCOMET INFO:[0m     val loss (by epochs) [20] : (0.0783764198422432, 0.11562386155128479)
[1;38;5;39mCOMET INFO:[0m   Others:
[1;38;5;39mCOMET INFO:[0m     Name : GPT_modeling_WinAgg (multilabel, 5 trx)
[1;38;5;39mCOM

In [166]:
trainer.logged_metrics

{'loss': tensor(0.0740), 'val loss (by epochs)': tensor(0.0784)}

In [167]:
encoder = gpt.seq_encoder

In [135]:
torch.save(encoder.state_dict(), "gpt_baseline_rosbank.pt")

**Измерим качество на тесте (catboost поверх эмбеддингов):**

In [102]:
# import gdown

# gdown.download("https://drive.google.com/uc?export=download&id=1YBstN7hpEIREo7zORmPoEZ_0NyBgfjm6", "gpt_baseline_NAdam.pt")

Downloading...
From (original): https://drive.google.com/uc?export=download&id=1YBstN7hpEIREo7zORmPoEZ_0NyBgfjm6
From (redirected): https://drive.google.com/uc?export=download&id=1YBstN7hpEIREo7zORmPoEZ_0NyBgfjm6&confirm=t&uuid=b0f44bc3-b84b-425c-968f-016e419987af
To: /kaggle/working/gpt_baseline_NAdam.pt
100%|██████████| 34.7M/34.7M [00:00<00:00, 83.5MB/s]


'gpt_baseline_NAdam.pt'

In [168]:
# state_dict = torch.load("./gpt_baseline_NAdam.pt")
# encoder.load_state_dict(state_dict)

device = "cuda:0"

encoder.to(device)

GPTInferenceModule(
  (model): WinAggGPTPretrainModule_MultiLabel(
    (trx_encoder): WinAggregator(
      (embeddings): ModuleDict(
        (MCC): NoisyEmbedding(
          342, 16, padding_idx=0
          (dropout): Dropout(p=0, inplace=False)
        )
        (channel_type): NoisyEmbedding(
          7, 16, padding_idx=0
          (dropout): Dropout(p=0, inplace=False)
        )
        (currency): NoisyEmbedding(
          60, 16, padding_idx=0
          (dropout): Dropout(p=0, inplace=False)
        )
        (trx_category): NoisyEmbedding(
          11, 16, padding_idx=0
          (dropout): Dropout(p=0, inplace=False)
        )
        (amount): NoisyEmbedding(
          128, 16, padding_idx=0
          (dropout): Dropout(p=0, inplace=False)
        )
      )
      (custom_embeddings): ModuleDict()
      (time2vec_days): Time2Vec()
    )
    (_seq_encoder): GptEncoder(
      (transf): GPT2Model(
        (wte): Embedding(4, 96)
        (wpe): Embedding(2048, 96)
        (drop): 

In [169]:
from tqdm import tqdm

seed_everything(222)

In [170]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=8)
encoder.eval()
train_embeds = None

for i, batch in tqdm(enumerate(train_loader)):
    train_embeds_batch = encoder(batch.to(device), eval_strategy="stat")
    if i == 0:
        train_embeds = train_embeds_batch.detach().cpu().numpy()
    else:
        train_embeds = np.concatenate([train_embeds, train_embeds_batch.detach().cpu().numpy()], axis=0)
    
train_embeds


0it [00:00, ?it/s][A
11it [00:00, 107.11it/s][A
24it [00:00, 116.16it/s][A
36it [00:00, 117.59it/s][A
48it [00:00, 118.14it/s][A
60it [00:00, 118.70it/s][A
72it [00:00, 118.12it/s][A
84it [00:00, 117.74it/s][A
96it [00:00, 117.83it/s][A
109it [00:00, 119.85it/s][A
121it [00:01, 119.69it/s][A
134it [00:01, 120.58it/s][A
147it [00:01, 120.60it/s][A
160it [00:01, 120.97it/s][A
173it [00:01, 121.09it/s][A
186it [00:01, 120.08it/s][A
199it [00:01, 118.80it/s][A
211it [00:01, 117.78it/s][A
223it [00:01, 115.12it/s][A
235it [00:02, 110.94it/s][A
247it [00:02, 108.29it/s][A
258it [00:02, 97.99it/s] [A
268it [00:02, 98.11it/s][A
278it [00:02, 90.21it/s][A
288it [00:02, 90.62it/s][A
298it [00:02, 92.26it/s][A
309it [00:02, 96.92it/s][A
320it [00:02, 98.95it/s][A
331it [00:03, 100.05it/s][A
342it [00:03, 101.79it/s][A
353it [00:03, 101.02it/s][A
364it [00:03, 100.31it/s][A
375it [00:03, 102.35it/s][A
386it [00:03, 102.13it/s][A
397it [00:03, 102.10it/s][A
409it

array([[-25.822071  ,  22.912663  , -25.888826  , ...,  -0.19523391,
         -0.7979302 ,   1.3875624 ],
       [ -0.59063154,   0.7692774 ,  -0.637765  , ...,   0.50447834,
          1.371344  ,   1.2060875 ],
       [ -2.4906378 ,   2.3601081 ,  -2.9877772 , ...,   0.40426877,
          0.9248251 ,   0.66799915],
       ...,
       [ -0.2188473 ,   0.30268428,  -0.25666034, ...,   0.33622214,
          1.1035528 ,   1.1789192 ],
       [ -0.13735084,   0.13351558,  -0.11687493, ...,   0.4347997 ,
          1.0213431 ,   0.95732176],
       [ -1.1434615 ,   1.1937857 ,  -1.1824086 , ...,   0.3767729 ,
          1.6559262 ,   0.80085385]], dtype=float32)

In [171]:
test_loader = inference_data_loader(data_test, num_workers=0, batch_size=8)
encoder.eval()
test_embeds = None

for i, batch in tqdm(enumerate(test_loader)):
    test_embeds_batch = encoder(batch.to(device), eval_strategy="stat")
    if i == 0:
        test_embeds = test_embeds_batch.detach().cpu().numpy()
    else:
        test_embeds = np.concatenate([test_embeds, test_embeds_batch.detach().cpu().numpy()], axis=0)
    
test_embeds


0it [00:00, ?it/s][A
12it [00:00, 116.86it/s][A
25it [00:00, 119.50it/s][A
38it [00:00, 120.82it/s][A
63it [00:00, 121.03it/s][A


array([[-0.18037276,  0.35385185, -0.16759218, ...,  0.5003021 ,
         1.5556029 ,  1.058405  ],
       [-2.3018196 ,  2.3530893 , -2.2054327 , ...,  0.5673783 ,
         1.1191154 , -0.11645429],
       [-0.24052781,  0.31528875, -0.20900111, ...,  0.5904805 ,
         1.4916421 ,  0.9909107 ],
       ...,
       [-6.38664   ,  6.560318  , -6.7763    , ...,  0.27152824,
         0.62476355,  0.06293894],
       [-1.5524892 ,  1.7268795 , -1.7173613 , ...,  0.36882314,
         1.553221  , -0.3531581 ],
       [-0.07585675,  0.16789986, -0.13623281, ...,  0.54330325,
         1.8575021 ,  1.3175907 ]], dtype=float32)

In [172]:
clf = CatBoostClassifier(loss_function='MultiClass', task_type="GPU", devices='0', random_state=222)

clf.fit(train_embeds, target_train, plot_file="catboost_log.html")

Learning rate set to 0.088214
0:	learn: 0.6698056	total: 8.31ms	remaining: 8.3s
1:	learn: 0.6503327	total: 13.5ms	remaining: 6.75s
2:	learn: 0.6335206	total: 18.6ms	remaining: 6.2s
3:	learn: 0.6189962	total: 23.7ms	remaining: 5.9s
4:	learn: 0.6068010	total: 28.9ms	remaining: 5.76s
5:	learn: 0.5951043	total: 34.1ms	remaining: 5.64s
6:	learn: 0.5857027	total: 39.2ms	remaining: 5.56s
7:	learn: 0.5767852	total: 44.4ms	remaining: 5.5s
8:	learn: 0.5694263	total: 49.9ms	remaining: 5.49s
9:	learn: 0.5617789	total: 55ms	remaining: 5.44s
10:	learn: 0.5558262	total: 60ms	remaining: 5.39s
11:	learn: 0.5496730	total: 65.3ms	remaining: 5.37s
12:	learn: 0.5445872	total: 70.2ms	remaining: 5.33s
13:	learn: 0.5401463	total: 75.3ms	remaining: 5.3s
14:	learn: 0.5359452	total: 80.7ms	remaining: 5.3s
15:	learn: 0.5322547	total: 85.8ms	remaining: 5.27s
16:	learn: 0.5285830	total: 90.9ms	remaining: 5.25s
17:	learn: 0.5252331	total: 95.9ms	remaining: 5.23s
18:	learn: 0.5222176	total: 101ms	remaining: 5.21s
19:

<catboost.core.CatBoostClassifier at 0x7b2c47929210>

In [173]:
test_pred = clf.predict(test_embeds)
test_proba = clf.predict_proba(test_embeds)[:, 1]

In [174]:
print("Accuracy:", accuracy_score(target_test, test_pred))
print("ROC-AUC:", roc_auc_score(target_test, test_proba))

Accuracy: 0.708
ROC-AUC: 0.7841543766492367


- GPT embeds + Catboost:
  - `Accuracy: 0.7304 +- 0.0101`
  - `ROC-AUC: 0.7957 +- 0.0091`

---

**Пришлось уменьшить количество warmup-эпох, так как процесс обучения был крайне нестабилен.**

- GPT embeds + WinAgg (single trx pred, 3 trx window) + Catboost:
  - `Accuracy: 0.7188 +- 0.0156`
  - `ROC-AUC: 0.7899 +- 0.0142`

---

- GPT embeds + WinAgg (single trx pred, 5 trx window) + Catboost:
  - `Accuracy: 0.7125 +- 0.0107`
  - `ROC-AUC: 0.781 +- 0.0025`

---

- GPT embeds + WinAgg (multilabel pred, 3 trx window) + Catboost:
  - `Accuracy: 0.727 +- 0.0122`
  - `ROC-AUC: 0.7946 +- 0.0156`

--- 

- GPT embeds + WinAgg (multilabel pred, 5 trx window) + Catboost:
  - `Accuracy: 0.7135 +- 0.0074`
  - `ROC-AUC: 0.7921 +- 0.0051`

# Итоги.

| Method                                |    Accuracy           | ROC-AUC         |
|---------------------------------------|-----------------------|-----------------|
| **Flattened Sequences**               | 0.67 ± 0.0046         | 0.7536 ± 0.003  |
| **GRU (+ MLP)**                       | 0.746 ± 0.0076        | 0.8148 ± 0.0037 |
| **CoLES**                             | 0.733 ± 0.019         | 0.8057 ± 0.0088 |
| **CPC Modeling**                      | 0.748 ± 0.003         | 0.81 ± 0.0048   |
| **GPT2**                              | 0.73 ± 0.01           | 0.7957 ± 0.0091 |
| **CoLES w/ WinAgg** (best setup)      | 0.727 ± 0.009         | 0.7992 ± 0.0078 |
| **CPC w/ WinAgg** (best setup)        | 0.748 ± 0.006         | 0.8102 ± 0.0102 |
| **GPT w/ WinAgg** (1 next trx pred)   | 0.7188 ± 0.016        | 0.7899 ± 0.0142 |
| **GPT w/ WinAgg** (multilabel pred)   | 0.727 ± 0.012         | 0.7946 ± 0.0156 |

