# Импортируем необходимые библиотеки

In [1]:
!pip install pytorch-lifestream
!pip install comet_ml
!pip install torch_scatter

Collecting pytorch-lifestream
  Downloading pytorch-lifestream-0.6.0.tar.gz (163 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.4/163.4 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting hydra-core>=1.1.2 (from pytorch-lifestream)
  Downloading hydra_core-1.3.2-py3-none-any.whl.metadata (5.5 kB)
Downloading hydra_core-1.3.2-py3-none-any.whl (154 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.5/154.5 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pytorch-lifestream
  Building wheel for pytorch-lifestream (pyproject.toml) ... [?25l[?25hdone
  Created wheel for pytorch-lifestream: filename=pytorch_lifestream-0.6.0-py3-none-any.whl size=274670 sha256=3e35389a74c4e4371bd87296497fb6d1fdf87dda58636bb98ac31536d9ee9d78
  

In [2]:
# data preprocessing
import os
import numpy as np
import pandas as pd
import pickle

# misc
from tqdm import tqdm
from functools import partial

# logging
import comet_ml

# classical ML
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from catboost import CatBoostClassifier

# basic deep learning libs
import torch
from torch.utils.data import DataLoader
import pytorch_lightning as pl
import torchmetrics

# ptls
from ptls.nn import TrxEncoder, RnnSeqEncoder, TransformerEncoder, GptEncoder, Head
from ptls.frames import PtlsDataModule
from ptls.frames.coles import CoLESModule
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames.cpc import CpcModule
from ptls.frames.cpc import CpcDataset
from ptls.frames.gpt import GptDataset
from ptls.frames.supervised import SeqToTargetDataset, SequenceToTarget
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.datasets import inference_data_loader
from ptls.frames.inference_module import InferenceModule
from ptls.data_load.iterable_processing import SeqLenFilter, ToTorch, FilterNonArray, ISeqLenLimit
from ptls.data_load import IterableChain, padded_collate_wo_target
from ptls.data_load.filter_dataset import FilterDataset
from ptls.preprocessing import PandasDataPreprocessor
from ptls.data_load.utils import collate_feature_dict
from ptls.frames.inference_module import InferenceModule

# torch_scatter
import torch_scatter

In [3]:
def seed_everything(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [5]:
comet_ml.login()

In [6]:
from pytorch_lightning.loggers import CometLogger

---

**Time2Vec:**

In [7]:
import torch
from ptls.data_load.padded_batch import PaddedBatch
from ptls.nn.trx_encoder.batch_norm import RBatchNorm, RBatchNormWithLens
from ptls.nn.trx_encoder.noisy_embedding import NoisyEmbedding
from ptls.nn.trx_encoder.trx_encoder_base import TrxEncoderBase
import torch.nn as nn


class Time2Vec(nn.Module):
    def __init__(self, k, interval=86400):
        super(Time2Vec, self).__init__()
        self.k = k
        self.w = nn.Parameter(torch.randn(k))
        self.b = nn.Parameter(torch.randn(k))
        self.w0 = nn.Parameter(torch.randn(1))
        self.b0 = nn.Parameter(torch.randn(1))
        self.interval = interval
        
    def forward(self, event_time, t0):
        t0_ = torch.zeros_like(event_time)
        time_diff=None
        if type(t0)!=int:
            first_column = t0[:, 0].unsqueeze(1)
            t0_ = first_column.expand(-1, t0.size(1))
        time_diff = (event_time - t0_)/self.interval
        v1 = self.w0 * time_diff.unsqueeze(-1) + self.b0
        v2 = torch.cos(self.w * time_diff.unsqueeze(-1) + self.b)
        
        return torch.cat([v1, v2], -1)

        
class TrxEncoderT2V(TrxEncoderBase):
    def __init__(self,
                 embeddings=None,
                 numeric_values=None,
                 custom_embeddings=None,
                 time_values=None,
                 embeddings_noise: float = 0,
                 norm_embeddings=None,
                 use_batch_norm=True,
                 use_batch_norm_with_lens=False,
                 clip_replace_value=None,
                 positions=None,
                 emb_dropout=0,
                 spatial_dropout=False,
                 orthogonal_init=False,
                 linear_projection_size=0,
                 out_of_index: str = 'clip',
                 k=2,
                 time_col='event_time'
                 ):
        if clip_replace_value is not None:
            warnings.warn('`clip_replace_value` attribute is deprecated. Always "clip to max" used. '
                          'Use `out_of_index="assert"` to avoid categorical values clip', DeprecationWarning)

        if positions is not None:
            warnings.warn('`positions` is deprecated. positions is not used', UserWarning)

        if embeddings is None:
            embeddings = {}
        if custom_embeddings is None:
            custom_embeddings = {}
        if time_values is None:
            time_values = {}

        noisy_embeddings = {}
        for emb_name, emb_props in embeddings.items():
            if emb_props.get('disabled', False):
                continue
            if emb_props['in'] == 0 or emb_props['out'] == 0:
                continue
            noisy_embeddings[emb_name] = NoisyEmbedding(
                num_embeddings=emb_props['in'],
                embedding_dim=emb_props['out'],
                padding_idx=0,
                max_norm=1 if norm_embeddings else None,
                noise_scale=embeddings_noise,
                dropout=emb_dropout,
                spatial_dropout=spatial_dropout,
            )

        super().__init__(
            embeddings=noisy_embeddings,
            numeric_values=numeric_values,
            custom_embeddings=custom_embeddings,
            out_of_index=out_of_index,
        )

        custom_embedding_size = self.custom_embedding_size
        if use_batch_norm and custom_embedding_size > 0:
            # :TODO: Should we use Batch norm with not-numerical custom embeddings?
            if use_batch_norm_with_lens:
                self.custom_embedding_batch_norm = RBatchNormWithLens(custom_embedding_size)
            else:
                self.custom_embedding_batch_norm = RBatchNorm(custom_embedding_size)
        else:
            self.custom_embedding_batch_norm = None
        
        self.k = k
        self.time2vec_days = Time2Vec(k=self.k)
        self.time_col = time_col
        
        if linear_projection_size > 0:
            self.linear_projection_head = torch.nn.Linear(super().output_size+k+1, linear_projection_size)
        else:
            self.linear_projection_head = None
            

        if orthogonal_init:
            for n, p in self.named_parameters():
                if n.startswith('embeddings.') and n.endswith('.weight'):
                    torch.nn.init.orthogonal_(p.data[1:])
                if n == 'linear_projection_head.weight':
                    torch.nn.init.orthogonal_(p.data)

    def forward(self, x: PaddedBatch):
        processed_embeddings = []
        processed_custom_embeddings = []

        for field_name in self.embeddings.keys():
            processed_embeddings.append(self.get_category_embeddings(x, field_name))
        
        for field_name in self.custom_embeddings.keys():
            processed_custom_embeddings.append(self.get_custom_embeddings(x, field_name))

        if len(processed_custom_embeddings):
            processed_custom_embeddings = torch.cat(processed_custom_embeddings, dim=2)
            if self.custom_embedding_batch_norm is not None:
                processed_custom_embeddings = PaddedBatch(processed_custom_embeddings, x.seq_lens)
                processed_custom_embeddings = self.custom_embedding_batch_norm(processed_custom_embeddings)
                processed_custom_embeddings = processed_custom_embeddings.payload
            processed_embeddings.append(processed_custom_embeddings)

        out = torch.cat(processed_embeddings, dim=2)

        time_encoded_days = self.time2vec_days(x.payload[self.time_col], x.payload[self.time_col])
        out = torch.cat((out, time_encoded_days), dim=2)

        if self.linear_projection_head is not None:
            out = self.linear_projection_head(out)
        return PaddedBatch(out, x.seq_lens)

    @property
    def output_size(self):
        """Returns hidden size of output representation
        """
        if self.linear_projection_head is not None:
            return self.linear_projection_head.out_features
        return super().output_size + self.k + 1

# Эксперименты.

**Данные:**

In [133]:
path_data = "https://huggingface.co/datasets/dllllb/rosbank-churn/resolve/main/train.csv.gz?download=true"
data = pd.read_csv(path_data, compression="gzip")
data

Unnamed: 0,PERIOD,cl_id,MCC,channel_type,currency,TRDATETIME,amount,trx_category,target_flag,target_sum
0,01/10/2017,0,5200,,810,21OCT17:00:00:00,5023.00,POS,0,0.0
1,01/10/2017,0,6011,,810,12OCT17:12:24:07,20000.00,DEPOSIT,0,0.0
2,01/12/2017,0,5921,,810,05DEC17:00:00:00,767.00,POS,0,0.0
3,01/10/2017,0,5411,,810,21OCT17:00:00:00,2031.00,POS,0,0.0
4,01/10/2017,0,6012,,810,24OCT17:13:14:24,36562.00,C2C_OUT,0,0.0
...,...,...,...,...,...,...,...,...,...,...
490508,01/04/2017,10176,6011,type1,810,24APR17:14:05:26,600.00,WD_ATM_ROS,1,405.0
490509,01/06/2017,10171,5411,type1,810,06JUN17:00:00:00,132.00,POS,0,0.0
490510,01/02/2017,10167,5541,type1,810,03FEB17:00:00:00,1000.00,POS,1,280428.2
490511,01/06/2017,10163,5941,type1,810,08JUN17:00:00:00,100.00,POS,0,0.0


In [134]:
target = data.groupby(by="cl_id").first().reset_index()[["cl_id", "target_flag"]]
target

Unnamed: 0,cl_id,target_flag
0,0,0
1,1,0
2,5,1
3,9,0
4,10,0
...,...,...
4995,10210,1
4996,10212,0
4997,10213,0
4998,10214,0


In [135]:
data.drop(columns=["PERIOD", "target_flag", "target_sum"], inplace=True)

In [136]:
target_train, target_test = train_test_split(target, test_size=0.1, stratify=target["target_flag"], random_state=42)

In [137]:
trx_data_train = pd.merge(data, target_train["cl_id"], on="cl_id", how="inner")
trx_data_test = pd.merge(data, target_test["cl_id"], on="cl_id", how="inner")

In [138]:
trx_data_train["channel_type"] = trx_data_train["channel_type"].fillna("none")
trx_data_test["channel_type"] = trx_data_test["channel_type"].fillna("none")

In [139]:
month2num = {"JAN": "/01/", "FEB": "/02/", "MAR": "/03/", "APR": "/04/", "MAY": "/05/", "JUN": "/06/",
             "JUL": "/07/", "AUG": "/08/", "SEP": "/09/", "OCT": "/10/", "NOV": "/11/", "DEC": "/12/"}

trx_data_train["TRDATETIME"] = trx_data_train["TRDATETIME"].map(lambda x: x[0:2] + month2num[x[2:5]] + x[5:7] + " " + x[8:])
trx_data_test["TRDATETIME"] = trx_data_test["TRDATETIME"].map(lambda x: x[0:2] + month2num[x[2:5]] + x[5:7] + " " + x[8:])

trx_data_train["TRDATETIME"] = pd.to_datetime(trx_data_train["TRDATETIME"],format='%d/%m/%y %H:%M:%S')
trx_data_test["TRDATETIME"] = pd.to_datetime(trx_data_test["TRDATETIME"],format='%d/%m/%y %H:%M:%S')

In [140]:
chtype2num = {"none": 0, "type1": 1, "type2": 2, "type3": 3, "type4": 4, "type5": 5}

trx_data_train["channel_type"] = trx_data_train["channel_type"].map(lambda x: chtype2num[x])
trx_data_test["channel_type"] = trx_data_test["channel_type"].map(lambda x: chtype2num[x])

In [141]:
trxcat2num = {"POS": 0, "DEPOSIT": 1, "WD_ATM_ROS": 2, "WD_ATM_PARTNER": 3, 
              "C2C_IN": 4, "WD_ATM_OTHER": 5, "C2C_OUT": 6, "BACK_TRX": 7,
              "CAT": 8, "CASH_ADV": 9}

trx_data_train["trx_category"] = trx_data_train["trx_category"].map(lambda x: trxcat2num[x])
trx_data_test["trx_category"] = trx_data_test["trx_category"].map(lambda x: trxcat2num[x])

---

**Квантизация непрерывных признаков (опциональный шаг, нужен только для GPT):**

In [17]:
def digitize(input_array: np.array, q_count: int = 1, bins: np.array = None):
    """Quantile-based discretization function.

    Parameters:
    -------
    input_array (np.array): Input array.
    q_count (int): Amount of quantiles. Used only if input parameter `bins` is None.
    bins (np.array):
        If None, then calculate bins as quantiles of input array,
        otherwise only apply bins to input_array. Default: None

    Returns
    -------
    out_array (np.array of ints): discretized input_array
    bins (np.array of floats):
        Returned only if input parameter `bins` is None.
    """

    if bins is None:
        return_bins = True
        bins = np.quantile(input_array, q=[i / q_count for i in range(1, q_count)], axis=0)
    else:
        return_bins = False

    out_array = np.digitize(input_array, bins)

    if return_bins:
        return out_array, bins
    else:
        return out_array

In [18]:
BINS_NUM = 128

In [19]:
numeric_features = ["amount"]

for feat in numeric_features:
    trx_data_train[feat], bins = digitize(trx_data_train[feat], q_count=BINS_NUM)
    trx_data_test[feat] = digitize(trx_data_test[feat], bins=bins)

In [20]:
import gc

gc.collect()

147

---

In [142]:
preprocessor = PandasDataPreprocessor(
    col_id="cl_id",
    col_event_time="TRDATETIME",
    event_time_transformation="dt_to_timestamp",
    cols_category=["MCC", "channel_type", "currency", "trx_category"],
    cols_numerical=["amount"],
    return_records=False,
)

In [143]:
data_train = preprocessor.fit_transform(trx_data_train)
data_test = preprocessor.transform(trx_data_test)

In [144]:
target_train.rename(columns={"target_flag": "target"}, inplace=True)
target_test.rename(columns={"target_flag": "target"}, inplace=True)
target_train.sort_values(by="cl_id", inplace=True)
target_test.sort_values(by="cl_id", inplace=True)
target_train = target_train["target"]
target_test = target_test["target"]
target_train.reset_index(drop=True, inplace=True)
target_test.reset_index(drop=True, inplace=True)

In [145]:
data_train = data_train.to_dict(orient="records")
data_test = data_test.to_dict(orient="records")

---

**AddNulls (Add 'No Trx' Tokens for missing days to make a uniform time grid)** 

**(Если нет ни одного timestamp'а, относящегося к данному дню, то мы его добавляем; иначе ничего не делаем):**

In [146]:
from ptls.data_load.iterable_processing_dataset import IterableProcessingDataset

class AddNulls(IterableProcessingDataset):
    def __init__(self, null_cat, numeric_values, col_time, col_id):
        super().__init__()
        self.null_cat = null_cat
        self.numeric_values = numeric_values
        self.col_time = col_time
        self.col_id = col_id
        self.seconds_in_day = 86400

    def __iter__(self):
        for rec in self._src:
            features = rec[0] if type(rec) is tuple else rec
            et = np.array(features[self.col_time])
    
            # Границы диапазона
            start, end = et[0] // self.seconds_in_day * self.seconds_in_day, (et[-1] // self.seconds_in_day + 1) * self.seconds_in_day

            # Массив полного диапазона
            complete = np.arange(start, end + 1, self.seconds_in_day)
            et_unique_days = np.unique(et // self.seconds_in_day * self.seconds_in_day)
            complete = np.setdiff1d(complete, et_unique_days)
            et_unique = np.unique(et)
            complete = np.sort(np.concatenate([complete, et_unique]))
            
            # Находим уникальные значения и их количество
            unique_vals, counts = np.unique(et, return_counts=True)

            # Для повторяющихся значений (если count > 1) создаём массив дополнительных копий (кроме первой)
            extra_list = [np.full(c - 1, val) for val, c in zip(unique_vals, counts) if c > 1]
            if extra_list:
                extra = np.concatenate(extra_list)
            else:
                extra = np.array([])  # Если повторов нет, оставляем пустым

            # Объединяем полный диапазон и дополнительные копии, затем сортируем
            new_event_time = np.sort(np.concatenate([complete, extra]))

            # Определяем, какие значения отсутствовали в исходном event_time (они появятся ровно по одному)
            missing_values = np.setdiff1d(complete, unique_vals)
            
            # Флаг: True – значение вставлено, False – значение присутствовало в исходном массиве
            inserted_mask = np.in1d(new_event_time, missing_values)

            # Обновляем event_time
            features[self.col_time] = torch.tensor(new_event_time)

            # Для остальных фич: создаём новый массив того же размера, где на позициях вставленных значений ставим `null_cat`,
            # а на остальных восстанавливаем исходные значения в порядке сортировки.
            for key, arr in features.items():
                if (key == self.col_time) | (key == self.col_id):
                    continue
                if key in self.numeric_values:
                    orig_arr = np.array(arr.to(torch.float32))
                    new_arr = np.full(new_event_time.shape, 1e-10)
                    # Индексы, где значения НЕ вставлены
                    non_inserted_idx = np.where(~inserted_mask)[0]
                    
                    new_arr[non_inserted_idx] = orig_arr
                    features[key] = torch.tensor(new_arr, dtype=torch.float32)
                else:
                    orig_arr = np.array(arr)
                    new_arr = np.full(new_event_time.shape, self.null_cat)
                    # Индексы, где значения НЕ вставлены
                    non_inserted_idx = np.where(~inserted_mask)[0]
                    
                    new_arr[non_inserted_idx] = orig_arr
                    features[key] = torch.tensor(new_arr, dtype=torch.int32)
            yield features

---

**Time Aggregator Class:**

In [147]:
from ptls.data_load.padded_batch import PaddedBatch


class TimeAggregator(TrxEncoderT2V):
    """The NN layer, a combination of TrxEncoder and Mean Aggregation (by `n_days` days) 
       (works like nn.Sequential([TrxEncoder, Mean Aggregation])).
       `n_days` can be between 0 and 1 as well.
       
       The types of the input and output are `PaddedBatch` of shapes (B, L, T) and (B, L', T) respectively, where 
       B is batch_size,
       L/L' is the max length of a sequence of transactions in a batch (the length is the same as #trx)
       T is the dimensionality of a single transaction.

       Parameters
        n_days (float):
            The number of days used for data aggregation.

        use_window_attention (bool):
            If True, the attention layer will be applied to transactions in a sliding window before pooling.

        k (int):
            Number of periodic components in T2V time embeddings

        time_col (str):
            Name of the time column in data
            
        embeddings:
            You can find info about this param in TrxEncoder desc.
        
        numeric_values:
            You can find info about this param in TrxEncoder desc.

        embeddings_noise:
            You can find info about this param in TrxEncoder desc.
            
        emb_dropout:
            You can find info about this param in TrxEncoder desc.
            
        spatial_dropout:
            You can find info about this param in TrxEncoder desc.

        use_batch_norm:
            You can find info about this param in TrxEncoder desc.

        orthogonal_init:
            You can find info about this param in TrxEncoder desc.
            
        linear_projection_size:
            You can find info about this param in TrxEncoder desc.

        out_of_index:
            You can find info about this param in TrxEncoder desc.

        norm_embeddings:
            Keep default value for this parameter
        
        clip_replace_value:
            Not used. Keep default value for this parameter
        
        positions: 
            Not used. Keep default value for this parameter
       """

    def __init__(self,
                 n_days=1,
                 use_window_attention=False,
                 embeddings=None,
                 numeric_values=None,
                 custom_embeddings=None,
                 time_values=None,
                 embeddings_noise: float = 0,
                 norm_embeddings=None,
                 use_batch_norm=False,
                 use_batch_norm_with_lens=False,
                 clip_replace_value=None,
                 positions=None,
                 emb_dropout=0,
                 spatial_dropout=False,
                 orthogonal_init=False,
                 linear_projection_size=0,
                 out_of_index: str = 'clip',
                 k=2,
                 time_col='event_time'
                ):

        
        super().__init__(
            embeddings=embeddings,
            numeric_values=numeric_values,
            custom_embeddings=custom_embeddings,
            embeddings_noise=embeddings_noise,
            norm_embeddings=norm_embeddings,
            use_batch_norm=use_batch_norm,
            use_batch_norm_with_lens=use_batch_norm_with_lens,
            clip_replace_value=clip_replace_value,
            positions=positions,
            emb_dropout=emb_dropout,
            spatial_dropout=spatial_dropout,
            orthogonal_init=orthogonal_init,
            linear_projection_size=linear_projection_size,
            out_of_index=out_of_index,
            k=k,
            time_col=time_col
        )

        self.n_days = n_days
        self.seconds_in_day = 86400
        
        self.use_window_attention = use_window_attention

        self.numeric_feats = list(numeric_values.keys())
        
        if self.use_window_attention:
            pass # Not Implemented

    def forward(self, pb: PaddedBatch):
        for key in pb.payload.keys():
            if key in self.numeric_feats:
                pb.payload[key] = pb.payload[key].to(torch.float32)
            else:
                pb.payload[key] = pb.payload[key].to(torch.int32)
        embeds = super().forward(pb)

        timestamps = pb.payload["event_time"] // (self.seconds_in_day * self.n_days)
    
        mask = torch.arange(embeds.payload.shape[1], device=embeds.device)[None, :] + torch.ones((embeds.seq_lens.shape[0], embeds.payload.shape[1]), device=embeds.device)
        mask[mask > embeds.seq_lens[:, None]] = 0.
        mask[mask > 0.] = 1.
        mask = mask[:, :, None]
        
        masked_embeds = embeds.payload * mask

        agg_embeds = []
        seq_lens = []
        max_seq_len = 0
        for j in range(masked_embeds.shape[0]):
            curr_seq_embeds = masked_embeds[j, :, :]
            curr_timestamps = timestamps[j, :]
    
            unique_timestamps, idx = torch.unique_consecutive(curr_timestamps, return_inverse=True) 
            
            curr_agg_seq = torch_scatter.scatter(curr_seq_embeds, idx[:, None], dim=0, reduce="mean")
            if unique_timestamps[-1].item() == 0:
                curr_agg_seq = curr_agg_seq[:-1, :]
    
            agg_embeds += [curr_agg_seq]
            seq_lens += [curr_agg_seq.shape[0]]
            max_seq_len = max(max_seq_len, curr_agg_seq.shape[0])
    
        for j in range(len(agg_embeds)):
            if max_seq_len - agg_embeds[j].shape[0] > 0:
                agg_embeds[j] = torch.cat([agg_embeds[j], torch.zeros((max_seq_len - agg_embeds[j].shape[0], agg_embeds[j].shape[1]), device=agg_embeds[j].device)], dim=0)[None, :, :]
            else:
                agg_embeds[j] = agg_embeds[j][None, :, :]
    
        agg_embeds = torch.cat(agg_embeds, dim=0)
        
        seq_lens = torch.tensor(seq_lens, device=agg_embeds.device).int()
        
        return PaddedBatch(agg_embeds, seq_lens)

In [107]:
# seed_everything(0)

In [23]:
# device = "cuda:0"

In [105]:
# agg_encoder_params = dict(
#     embeddings={
#         "MCC": {"in": 342, "out": 8},
#         "channel_type": {"in": 7, "out": 8},
#         "currency": {"in": 60, "out": 8},
#         "trx_category": {"in": 11, "out": 8}            
#     },
#     numeric_values={"amount": "log"},
#     embeddings_noise=0.003,
#     k=7,
#     time_col="event_time"
# )

# trx_encoder = TrxEncoderT2V(**agg_encoder_params).to(device)

In [85]:
# from ptls.data_load.padded_batch import PaddedBatch


# train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)
# SECONDS_IN_DAY = 86400
# DAYS = 1
# trx_encoder.eval()

# for i, batch in tqdm(enumerate(train_loader)):
#     batch = batch.to(device)
        
#     embeds = trx_encoder(batch)

#     timestamps = batch.payload["event_time"] // (SECONDS_IN_DAY * DAYS)
    
#     mask = torch.arange(embeds.payload.shape[1], device=embeds.device)[None, :] + torch.ones((embeds.seq_lens.shape[0], embeds.payload.shape[1]), device=embeds.device)
#     mask[mask > embeds.seq_lens[:, None]] = 0.
#     mask[mask > 0.] = 1.
#     mask = mask[:, :, None]
    
#     masked_embeds = embeds.payload * mask

#     agg_embeds = []
#     seq_lens = []
#     max_seq_len = 0
#     for j in range(masked_embeds.shape[0]):
#         curr_seq_embeds = masked_embeds[j, :, :]
#         curr_timestamps = timestamps[j, :]

#         unique_timestamps, counts = torch.unique_consecutive(curr_timestamps, return_counts=True) 
        
#         curr_agg_seq = []
#         start = 0
#         for it, count in enumerate(counts):
#             if unique_timestamps[it] == 0:
#                 break
#             curr_agg_seq += [torch.mean(curr_seq_embeds[start:(start + count)], dim=0, keepdims=True)]
#             start += count
#         curr_agg_seq = torch.cat(curr_agg_seq, dim=0)

#         agg_embeds += [curr_agg_seq]
#         seq_lens += [curr_agg_seq.shape[0]]
#         max_seq_len = max(max_seq_len, curr_agg_seq.shape[0])

#     for j in range(len(agg_embeds)):
#         if max_seq_len - agg_embeds[j].shape[0] > 0:
#             agg_embeds[j] = torch.cat([agg_embeds[j], torch.zeros((max_seq_len - agg_embeds[j].shape[0], agg_embeds[j].shape[1]), device=agg_embeds[j].device)], dim=0)[None, :, :]
#         else:
#             agg_embeds[j] = agg_embeds[j][None, :, :]

#     agg_embeds = torch.cat(agg_embeds, dim=0)
#     seq_lens = torch.tensor(seq_lens, device=agg_embeds.device).int()
    
#     out1 = PaddedBatch(agg_embeds, seq_lens)

36it [00:20,  1.78it/s]


In [106]:
# from ptls.data_load.padded_batch import PaddedBatch


# train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)
# SECONDS_IN_DAY = 86400
# DAYS = 1
# trx_encoder.eval()

# for i, batch in tqdm(enumerate(train_loader)):
#     batch = batch.to(device)
        
#     embeds = trx_encoder(batch)

#     timestamps = batch.payload["event_time"] // (SECONDS_IN_DAY * DAYS)
    
#     mask = torch.arange(embeds.payload.shape[1], device=embeds.device)[None, :] + torch.ones((embeds.seq_lens.shape[0], embeds.payload.shape[1]), device=embeds.device)
#     mask[mask > embeds.seq_lens[:, None]] = 0.
#     mask[mask > 0.] = 1.
#     mask = mask[:, :, None]
    
#     masked_embeds = embeds.payload * mask

#     agg_embeds = []
#     seq_lens = []
#     max_seq_len = 0
#     for j in range(masked_embeds.shape[0]):
#         curr_seq_embeds = masked_embeds[j, :, :]
#         curr_timestamps = timestamps[j, :]

#         unique_timestamps, idx = torch.unique_consecutive(curr_timestamps, return_inverse=True) 

#         curr_agg_seq = torch_scatter.scatter(curr_seq_embeds, idx[:, None], dim=0, reduce="mean")
#         if unique_timestamps[-1].item() == 0:
#             curr_agg_seq = curr_agg_seq[:-1, :]

#         agg_embeds += [curr_agg_seq]
#         seq_lens += [curr_agg_seq.shape[0]]
#         max_seq_len = max(max_seq_len, curr_agg_seq.shape[0])

#     for j in range(len(agg_embeds)):
#         if max_seq_len - agg_embeds[j].shape[0] > 0:
#             agg_embeds[j] = torch.cat([agg_embeds[j], torch.zeros((max_seq_len - agg_embeds[j].shape[0], agg_embeds[j].shape[1]), device=agg_embeds[j].device)], dim=0)[None, :, :]
#         else:
#             agg_embeds[j] = agg_embeds[j][None, :, :]

#     agg_embeds = torch.cat(agg_embeds, dim=0)
#     seq_lens = torch.tensor(seq_lens, device=agg_embeds.device).int()
    
#     out = PaddedBatch(agg_embeds, seq_lens)

36it [00:02, 15.10it/s]


In [88]:
# from ptls.data_load.padded_batch import PaddedBatch


# train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)
# SECONDS_IN_DAY = 86400
# DAYS = 1
# trx_encoder.eval()

# for i, batch in tqdm(enumerate(train_loader)):
#     batch = batch.to(device)
        
#     embeds = trx_encoder(batch)

#     timestamps = batch.payload["event_time"] // (SECONDS_IN_DAY * DAYS)
    
#     mask = torch.arange(embeds.payload.shape[1], device=embeds.device)[None, :] + torch.ones((embeds.seq_lens.shape[0], embeds.payload.shape[1]), device=embeds.device)
#     mask[mask > embeds.seq_lens[:, None]] = 0.
#     mask[mask > 0.] = 1.
#     mask = mask[:, :, None]
    
#     masked_embeds = embeds.payload * mask

#     #out_size = torch.unique_consecutive(timestamps[torch.argmax(embeds.seq_lens)]).shape[0]
#     #out = torch.zeros((masked_embeds.shape[0], out_size, masked_embeds.shape[2]), device=masked_embeds.device)

#     unique_timestamps, idx = torch.unique_consecutive(timestamps, return_inverse=True)
    
#     idx -= idx.min(dim=1, keepdims=True)[0]
    
#     # print()
#     # print(unique_timestamps)
#     # print(idx)

#     out = torch_scatter.scatter(masked_embeds, idx[:, :, None], dim=1, reduce="mean")

36it [00:00, 91.07it/s]


In [24]:
# agg_encoder_params = dict(
#     embeddings={
#         "MCC": {"in": 342, "out": 8},
#         "channel_type": {"in": 7, "out": 8},
#         "currency": {"in": 60, "out": 8},
#         "trx_category": {"in": 11, "out": 8}            
#     },
#     numeric_values={"amount": "log"},
#     embeddings_noise=0.003,
#     k=7,
#     time_col="event_time",
#     n_days=1,
#     use_window_attention=False
# )

# trx_encoder = TimeAggregator(**agg_encoder_params).to(device)

In [None]:
# trx_encoder.eval()

# train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)

# for i, batch in tqdm(enumerate(train_loader)):
#     trx_encoder(batch.to(device))

---

**Train sequences lengths check:**

In [23]:
agg_encoder_params = dict(
    embeddings={
        "MCC": {"in": 342, "out": 8},
        "channel_type": {"in": 7, "out": 8},
        "currency": {"in": 60, "out": 8},
        "trx_category": {"in": 11, "out": 8}            
    },
    numeric_values={"amount": "log"},
    embeddings_noise=0.003,
    k=7,
    time_col="event_time",
    n_days=1,
    use_window_attention=False
)

trx_encoder = TimeAggregator(**agg_encoder_params)
trx_encoder.to("cuda")

TimeAggregator(
  (embeddings): ModuleDict(
    (MCC): NoisyEmbedding(
      342, 8, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
    (channel_type): NoisyEmbedding(
      7, 8, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
    (currency): NoisyEmbedding(
      60, 8, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
    (trx_category): NoisyEmbedding(
      11, 8, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
  )
  (custom_embeddings): ModuleDict(
    (amount): LogScaler()
  )
  (time2vec_days): Time2Vec()
)

In [24]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)

trx_encoder.eval()

seq_lens = []

for batch in tqdm(train_loader):
    embeds_batch = trx_encoder(batch.to("cuda"))
    seq_lens += [embeds_batch.seq_lens.detach().cpu().numpy()]

seq_lens = np.concatenate(seq_lens)

threshold = int(np.quantile(seq_lens, 0.75) * 0.7)

print("Max Length:", threshold)

36it [00:02, 14.09it/s]

Max Length: 40





# Aggregation By Days (Mean Pooling) 

- **COLES:**

In [153]:
seed_everything(42)

**DataLoaders:**

In [154]:
data = PtlsDataModule(
    train_data=ColesDataset(
        MemoryMapDataset(
            data=data_train,
            i_filters=[
                       SeqLenFilter(min_seq_len=10), 
                       # AddNulls(null_cat=500, numeric_values=['amount'], col_time='event_time', col_id='cl_id')
                      ],
        ),
        splitter=SampleSlices(
            split_count=5,
            cnt_min=5,
            cnt_max=40,
        ),
    ),
    train_num_workers=4,
    train_batch_size=128,
    valid_data=ColesDataset(
        MemoryMapDataset(
            data=data_test,
            i_filters=[
                       SeqLenFilter(min_seq_len=10),
                       # AddNulls(null_cat=500, numeric_values=['amount'], col_time='event_time', col_id='cl_id')
                      ],
        ),
        splitter=SampleSlices(
            split_count=5,
            cnt_min=5,
            cnt_max=40,
        ),
    ),
    valid_num_workers=4,
    valid_batch_size=128
)

**Модель:**

In [155]:
N_EPOCHS = 20

In [156]:
agg_encoder_params = dict(
    embeddings={
        "MCC": {"in": 501, "out": 8},
        "channel_type": {"in": 501, "out": 8},
        "currency": {"in": 501, "out": 8},
        "trx_category": {"in": 501, "out": 8}
        #"MCC": {"in": 342, "out": 8},
        #"channel_type": {"in": 7, "out": 8},
        #"currency": {"in": 60, "out": 8},
        #"trx_category": {"in": 11, "out": 8} 
    },
    numeric_values={"amount": "log"},
    embeddings_noise=0.003,
    k=7,
    time_col="event_time",
    n_days=1, #0.5, 1, 2
    use_window_attention=False
)

trx_encoder = TimeAggregator(**agg_encoder_params)

seq_encoder = RnnSeqEncoder(
    trx_encoder=trx_encoder,
    hidden_size=512,
    type="gru"
)

coles = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=1e-3, weight_decay=0),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.CosineAnnealingLR, T_max=N_EPOCHS, eta_min=5e-6)
)

**Обучение:**

In [157]:
logger = CometLogger(project_name="evs-ssl-rb", experiment_name="CoLES_AggByDays (1 day, w/ AddNulls)")

trainer = pl.Trainer(
    logger=logger,
    max_epochs=N_EPOCHS,
    accelerator="gpu",
    devices=1,
    enable_progress_bar=True
)

In [158]:
trainer.fit(coles, data)

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/askoro/evs-ssl-rb/c95f05245b7d48ad8399f84e45db8d1c

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/kaggle/working' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]


The number of training batches (33) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.



Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : CoLES_AggByDays (1 day, w/ AddNulls)
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/askoro/evs-ssl-rb/c95f05245b7d48ad8399f84e45db8d1c
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     loss [79]               : (120.61343383789062, 1006.6783447265625)
[1;38;5;39mCOMET INFO:[0m     seq_len [13]            : (21.676563262939453, 22.9375)
[1;38;5;39mCOMET INFO:[0m     valid/recall_top_k [20] : (0.1075400859117508, 0.2554173469543457)
[1;38;5;39mCOMET INFO:[0m   Others:
[1;38;5;39mCO

In [159]:
trainer.logged_metrics

{'loss': tensor(123.7174),
 'seq_len': tensor(22.4885),
 'valid/recall_top_k': tensor(0.2552)}

In [110]:
torch.save(seq_encoder.state_dict(), "coles_enc_aggbydays.pt")

**Измерим качество на тесте (catboost поверх эмбеддингов):**

In [None]:
# !wget "https://drive.google.com/uc?export=download&id=1Mn8o9IPT4Zzg3946orbw1MVZwpkrBoNb" -O "coles_enc_baseline.pt"

In [160]:
encoder = coles.seq_encoder

# state_dict = torch.load("./coles_enc_baseline.pt")
# encoder.load_state_dict(state_dict)

device = "cuda:0"

encoder.to(device)

RnnSeqEncoder(
  (trx_encoder): TimeAggregator(
    (embeddings): ModuleDict(
      (MCC): NoisyEmbedding(
        501, 8, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (channel_type): NoisyEmbedding(
        501, 8, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (currency): NoisyEmbedding(
        501, 8, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (trx_category): NoisyEmbedding(
        501, 8, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
    )
    (custom_embeddings): ModuleDict(
      (amount): LogScaler()
    )
    (time2vec_days): Time2Vec()
  )
  (seq_encoder): RnnEncoder(
    (rnn): GRU(41, 512, batch_first=True)
    (reducer): LastStepEncoder()
  )
)

In [161]:
from tqdm import tqdm

seed_everything(42)

In [162]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)
encoder.eval()
train_embeds = None

for i, batch in tqdm(enumerate(train_loader)):
    train_embeds_batch = encoder(batch.to(device))
    if i == 0:
        train_embeds = train_embeds_batch.detach().cpu().numpy()
    else:
        train_embeds = np.concatenate([train_embeds, train_embeds_batch.detach().cpu().numpy()], axis=0)
    
train_embeds

36it [00:03, 11.17it/s]


array([[ 0.93622625, -0.05178909,  0.19087344, ..., -0.96121603,
        -0.5278019 , -0.44119808],
       [ 0.9292055 , -0.20126076, -0.14644495, ..., -0.9664763 ,
        -0.7405364 , -0.65024126],
       [ 0.84990907, -0.1402293 , -0.09719023, ..., -0.9411611 ,
        -0.6330215 , -0.6273654 ],
       ...,
       [ 0.77353257,  0.01652216, -0.3708247 , ..., -0.91125375,
        -0.5722551 , -0.5379528 ],
       [ 0.761541  , -0.04619621, -0.41280004, ..., -0.9330558 ,
        -0.7478856 , -0.7312275 ],
       [ 0.636961  ,  0.06830669,  0.08214493, ..., -0.8779054 ,
         0.0106684 , -0.597877  ]], dtype=float32)

In [163]:
test_loader = inference_data_loader(data_test, num_workers=0, batch_size=128)
encoder.eval()
test_embeds = None

for i, batch in tqdm(enumerate(test_loader)):
    test_embeds_batch = encoder(batch.to(device))
    if i == 0:
        test_embeds = test_embeds_batch.detach().cpu().numpy()
    else:
        test_embeds = np.concatenate([test_embeds, test_embeds_batch.detach().cpu().numpy()], axis=0)
    
test_embeds

4it [00:00, 10.26it/s]


array([[ 0.8990631 , -0.16598615, -0.06594966, ..., -0.9662215 ,
        -0.68612313, -0.6832476 ],
       [ 0.8141912 , -0.23818657, -0.24754775, ..., -0.9328777 ,
        -0.69514453, -0.5208521 ],
       [ 0.9409772 , -0.10718692, -0.2756635 , ..., -0.9543508 ,
        -0.7285551 , -0.48252144],
       ...,
       [ 0.6498947 , -0.08217648, -0.22362508, ..., -0.90359235,
        -0.5280303 , -0.67600864],
       [ 0.74664253, -0.04872967, -0.27261952, ..., -0.8921901 ,
        -0.6093328 , -0.5399925 ],
       [ 0.66754556, -0.11340289, -0.01845337, ..., -0.93476105,
        -0.65968406, -0.85052055]], dtype=float32)

In [164]:
clf = CatBoostClassifier(loss_function='MultiClass', task_type="GPU", devices='0', random_state=42)

clf.fit(train_embeds, target_train, plot_file="catboost_log.html")

Learning rate set to 0.088214
0:	learn: 0.6666784	total: 9.93ms	remaining: 9.91s
1:	learn: 0.6437919	total: 16.4ms	remaining: 8.2s
2:	learn: 0.6251373	total: 22.9ms	remaining: 7.6s
3:	learn: 0.6082981	total: 29.3ms	remaining: 7.3s
4:	learn: 0.5926914	total: 35.8ms	remaining: 7.13s
5:	learn: 0.5789293	total: 42.5ms	remaining: 7.03s
6:	learn: 0.5665876	total: 49ms	remaining: 6.95s
7:	learn: 0.5554615	total: 55.3ms	remaining: 6.85s
8:	learn: 0.5462097	total: 61.2ms	remaining: 6.74s
9:	learn: 0.5368301	total: 67.2ms	remaining: 6.65s
10:	learn: 0.5295009	total: 72.9ms	remaining: 6.55s
11:	learn: 0.5220891	total: 79ms	remaining: 6.5s
12:	learn: 0.5155396	total: 84.9ms	remaining: 6.45s
13:	learn: 0.5097399	total: 91.7ms	remaining: 6.46s
14:	learn: 0.5048092	total: 97.5ms	remaining: 6.4s
15:	learn: 0.4993747	total: 103ms	remaining: 6.35s
16:	learn: 0.4949210	total: 110ms	remaining: 6.34s
17:	learn: 0.4904137	total: 116ms	remaining: 6.34s
18:	learn: 0.4869137	total: 122ms	remaining: 6.32s
19:	l

<catboost.core.CatBoostClassifier at 0x7ce61325dc60>

In [165]:
test_pred = clf.predict(test_embeds)
test_proba = clf.predict_proba(test_embeds)[:, 1]

In [166]:
print("Accuracy:", accuracy_score(target_test, test_pred))
print("ROC-AUC:", roc_auc_score(target_test, test_proba))

Accuracy: 0.736
ROC-AUC: 0.8149455245989218


In [168]:
a = np.array([0.8205954250376389, 0.7973482702238915, 0.8149455245989218])

a.mean(), a.std()

(0.8109630732868175, 0.009899578798229952)

- COLES embeds + Catboost:
  - `Accuracy: 0.736`, `0.72`, `0.722`, avg: `0.726 +- 0.0071` 
  -  `ROC-AUC: 0.8099107995661394`, `0.8041475773421184`, `0.8088423370189894`, avg: `0.8076 +- 0.0025`

---

- COLES embeds (w/ Aggregation by Days, 1 day) + Catboost:
  - `Accuracy: 0.732`, `0.764`, `0.73`, avg: `0.742 +- 0.0156`
  - `ROC-AUC: 0.8078386297777274`, `0.8116915704780561`, `0.8152207346489453`, avg: `0.8116 +- 0.003`

---

- COLES embeds (w/ Aggregation by Days, 0.5 days) + Catboost:
  - `Accuracy: 0.744`, `0.73`, `0.726`, avg: `0.7333 +- 0.0077`
  - `ROC-AUC: 0.8204011591199754`, `0.8003593919476777`, `0.8147836363342023`, avg: `0.8118 +- 0.0084`

---

- COLES embeds (w/ Aggregation by Days, 2 days) + Catboost:
  - `Accuracy: 0.73`, `0.736`, `0.742`, avg: `0.736 +- 0.0049`
  - `ROC-AUC: 0.8121610464457432`, `0.8020106522478185`, `0.8208544462611904`, avg: `0.8117 +- 0.0077`

---

- COLES embeds (w/ Aggregation by Days, 1 day; w/ 'No Trx' Indicators) + Catboost:
  - `Accuracy: 0.732`, `0.72`, `0.736`, avg: `0.7293 +- 0.0068`
  - `ROC-AUC: 0.8205954250376389`, `0.7973482702238915`, `0.8149455245989218`, avg: `0.811 +- 0.0099`

---

**Логика + Вывод:** для CoLES агрегация по времени приводит к повышению качества - все результаты с агрегацией по времени лучше, чем бейзлайн. Наилучший результат по accuracy достигается при агрегации по 1 дню. У этого результата относительно большая дисперсия, тем не менее возьмём именно его как лучший, так как (mean - std) здесь сравним с таким же показателем для других конфигураций. По ROC-AUC наилучший результат - при агрегации по полдня дням, но он незначительно лучше результата для агрегации по 1 дню (и по 2 дням); при этом accuracy для агрегации по полдня всё же ощутимо хуже, чем при агрегации по 1 дню. Результаты для агрегации по 2 дням в целом немного лучше, чем для агрегации по полдня, но всё ещё хуже, чем агрегация по 1 дню.

Лучшие результаты - при агрегации по 1 дню => будем далее экспериментировать с этой конфигурацией. В частности, попробуем добавить в дни, в которые не было ни одной транзакции специальные транзакции-индикаторы, означающие отсутствие в этот день транзакций. 

Как оказалось, для конфигурации c транзакциями-индикаторами результаты - хотя они и лучше бейзлайна - оказались значительно хуже, чем для обычной агрегации по 1 дню.

**Лучший по метрикам результат:**

- COLES embeds (w/ Aggregation by Days, 1 day) + Catboost:
  - `Accuracy: 0.732`, `0.764`, `0.73`, avg: `0.742 +- 0.0156`
  - `ROC-AUC: 0.8078386297777274`, `0.8116915704780561`, `0.8152207346489453`, avg: `0.8116 +- 0.003`

---

**Train sequences lengths check:**

In [23]:
agg_encoder_params = dict(
    embeddings={
        "MCC": {"in": 342, "out": 8},
        "channel_type": {"in": 7, "out": 8},
        "currency": {"in": 60, "out": 8},
        "trx_category": {"in": 11, "out": 8}            
    },
    numeric_values={"amount": "log"},
    embeddings_noise=0.003,
    k=7,
    time_col="event_time",
    n_days=2,
    use_window_attention=False
)

trx_encoder = TimeAggregator(**agg_encoder_params)
trx_encoder.to("cuda")

TimeAggregator(
  (embeddings): ModuleDict(
    (MCC): NoisyEmbedding(
      342, 8, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
    (channel_type): NoisyEmbedding(
      7, 8, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
    (currency): NoisyEmbedding(
      60, 8, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
    (trx_category): NoisyEmbedding(
      11, 8, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
  )
  (custom_embeddings): ModuleDict(
    (amount): LogScaler()
  )
  (time2vec_days): Time2Vec()
)

In [25]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)

trx_encoder.eval()

seq_lens = []

for batch in tqdm(train_loader):
    embeds_batch = trx_encoder(batch.to("cuda"))
    seq_lens += [embeds_batch.seq_lens.detach().cpu().numpy()]

seq_lens = np.concatenate(seq_lens)

threshold = int(np.quantile(seq_lens, 0.6))

print("Max Length:", threshold)

36it [00:02, 16.66it/s]

Max Length: 32





---

- **CPC modeling:**

In [148]:
seed_everything(42)

**DataLoaders:**

In [149]:
data = PtlsDataModule(
    train_data=CpcDataset(
        MemoryMapDataset(
            data=data_train,
            i_filters=[AddNulls(null_cat=500, numeric_values=['amount'], col_time='event_time', col_id='cl_id')]
        ),
        min_len=27,             
        max_len=32
    ),
    train_num_workers=4,
    train_batch_size=128,
    valid_data=CpcDataset(
        MemoryMapDataset(
            data=data_test,
            i_filters=[AddNulls(null_cat=500, numeric_values=['amount'], col_time='event_time', col_id='cl_id')]
        ),
        min_len=27,
        max_len=32
    ),
    valid_num_workers=4,
    valid_batch_size=128
)

**Модель:**

In [150]:
N_EPOCHS = 20

In [151]:
agg_encoder_params = dict(
    embeddings={
        "MCC": {"in": 501, "out": 8},
        "channel_type": {"in": 501, "out": 8},
        "currency": {"in": 501, "out": 8},
        "trx_category": {"in": 501, "out": 8}
        # "MCC": {"in": 342, "out": 32}, # 8 / 16
        # "channel_type": {"in": 7, "out": 32},
        # "currency": {"in": 60, "out": 32},
        # "trx_category": {"in": 11, "out": 32}            
    },
    numeric_values={"amount": "log"},
    embeddings_noise=0.003,
    k=7, # 31
    time_col="event_time",
    n_days=2, #0.5, 1, 2
    use_window_attention=False
)

trx_encoder = TimeAggregator(**agg_encoder_params)

seq_encoder = RnnSeqEncoder(
    trx_encoder=trx_encoder,
    hidden_size=512,
    type="gru"
)

cpc = CpcModule(
    seq_encoder=seq_encoder,
    n_forward_steps=6,
    n_negatives=40,
    optimizer_partial=partial(torch.optim.Adam, lr=5e-4),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=5, gamma=0.5)
)

**Обучение:**

In [152]:
logger = CometLogger(project_name="evs-ssl-rb", experiment_name="CPC_modeling_AggByDays (2 days, w/ AddNulls, emb_dim=8)")

trainer = pl.Trainer(
    logger=logger,
    max_epochs=N_EPOCHS,
    accelerator="gpu",
    devices=1,
    enable_progress_bar=True
)

In [153]:
trainer.fit(cpc, data)

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/askoro/evs-ssl-rb/339996686db4414ebdefc3536505075e

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/kaggle/working' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]


The number of training batches (36) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.



Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : CPC_modeling_AggByDays (2 days, w/ AddNulls, emb_dim=8)
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/askoro/evs-ssl-rb/339996686db4414ebdefc3536505075e
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     loss [86]               : (1.1672450304031372, 4.324618816375732)
[1;38;5;39mCOMET INFO:[0m     seq_len [14]            : (28.0859375, 28.921875)
[1;38;5;39mCOMET INFO:[0m     valid/cpc_accuracy [20] : (0.216631218791008, 0.6367470026016235)
[1;38;5;39mCOMET INFO:[0m   Others:
[1

In [154]:
trainer.logged_metrics

{'loss': tensor(1.0395),
 'seq_len': tensor(29.2000),
 'valid/cpc_accuracy': tensor(0.6132)}

In [82]:
torch.save(seq_encoder.state_dict(), "cpc_enc_baseline_rosbank.pt")

**Измерим качество на тесте (catboost поверх эмбеддингов):**

In [None]:
# !wget "https://drive.google.com/uc?export=download&id=11j6QgNsdOSTK-GRaAJLKObDW7ehS_aqK" -O "cpc_enc_baseline_higher_trx_dim.pt"

In [155]:
encoder = cpc.seq_encoder

# state_dict = torch.load("./cpc_enc_baseline_higher_trx_dim.pt")
# encoder.load_state_dict(state_dict)

device = "cuda:0"

encoder.to(device)

RnnSeqEncoder(
  (trx_encoder): TimeAggregator(
    (embeddings): ModuleDict(
      (MCC): NoisyEmbedding(
        501, 8, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (channel_type): NoisyEmbedding(
        501, 8, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (currency): NoisyEmbedding(
        501, 8, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (trx_category): NoisyEmbedding(
        501, 8, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
    )
    (custom_embeddings): ModuleDict(
      (amount): LogScaler()
    )
    (time2vec_days): Time2Vec()
  )
  (seq_encoder): RnnEncoder(
    (rnn): GRU(41, 512, batch_first=True)
    (reducer): LastStepEncoder()
  )
)

In [156]:
encoder.seq_encoder.is_reduce_sequence = True

In [157]:
from tqdm import tqdm

seed_everything(42)

In [158]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)
encoder.eval()
train_embeds = None

for i, batch in tqdm(enumerate(train_loader)):
    train_embeds_batch = encoder(batch.to(device))
    if i == 0:
        train_embeds = train_embeds_batch.detach().cpu().numpy()
    else:
        train_embeds = np.concatenate([train_embeds, train_embeds_batch.detach().cpu().numpy()], axis=0)
    
train_embeds

36it [00:02, 12.24it/s]


array([[ 0.9541381 ,  0.99403846,  0.10655117, ..., -0.916811  ,
        -0.8311571 , -0.38786554],
       [ 0.99218386,  0.99758536,  0.03814796, ..., -0.9777452 ,
        -0.97589546, -0.49847263],
       [ 0.9881791 ,  0.997941  ,  0.3342201 , ..., -0.9748289 ,
        -0.9562503 , -0.52611   ],
       ...,
       [ 0.98834205,  0.99861336,  0.29198986, ..., -0.9694647 ,
        -0.9505587 , -0.50703603],
       [ 0.9918332 ,  0.99803543,  0.11386814, ..., -0.97684854,
        -0.97800344, -0.54774195],
       [-0.96684486, -0.8031287 ,  0.9564984 , ..., -0.48132893,
         0.9631571 ,  0.3243851 ]], dtype=float32)

In [159]:
test_loader = inference_data_loader(data_test, num_workers=0, batch_size=128)
encoder.eval()
test_embeds = None

for i, batch in tqdm(enumerate(test_loader)):
    test_embeds_batch = encoder(batch.to(device))
    if i == 0:
        test_embeds = test_embeds_batch.detach().cpu().numpy()
    else:
        test_embeds = np.concatenate([test_embeds, test_embeds_batch.detach().cpu().numpy()], axis=0)
    
test_embeds

4it [00:00, 11.90it/s]


array([[ 0.991809  ,  0.9979612 ,  0.06920517, ..., -0.97705305,
        -0.97472566, -0.5017541 ],
       [ 0.98711056,  0.99756336,  0.32260552, ..., -0.97143745,
        -0.96632314, -0.51865774],
       [ 0.99033576,  0.9980214 ,  0.17980914, ..., -0.97829676,
        -0.96514994, -0.47930518],
       ...,
       [ 0.9888353 ,  0.99810064,  0.27188414, ..., -0.97468996,
        -0.956347  , -0.5483373 ],
       [ 0.9876623 ,  0.9987104 ,  0.28125852, ..., -0.96727455,
        -0.946679  , -0.47381157],
       [ 0.9904298 ,  0.99856895,  0.11408179, ..., -0.9774277 ,
        -0.9760469 , -0.5442818 ]], dtype=float32)

In [160]:
clf = CatBoostClassifier(loss_function='MultiClass', task_type="GPU", devices='0', random_state=42)

clf.fit(train_embeds, target_train, plot_file="catboost_log.html")

Learning rate set to 0.088214
0:	learn: 0.6656268	total: 10.1ms	remaining: 10.1s
1:	learn: 0.6421251	total: 17ms	remaining: 8.51s
2:	learn: 0.6220905	total: 23.9ms	remaining: 7.95s
3:	learn: 0.6050007	total: 30ms	remaining: 7.46s
4:	learn: 0.5901621	total: 36.2ms	remaining: 7.2s
5:	learn: 0.5766946	total: 42.5ms	remaining: 7.04s
6:	learn: 0.5657447	total: 48.8ms	remaining: 6.92s
7:	learn: 0.5552907	total: 55ms	remaining: 6.82s
8:	learn: 0.5468687	total: 61.1ms	remaining: 6.73s
9:	learn: 0.5387932	total: 67.3ms	remaining: 6.67s
10:	learn: 0.5316554	total: 73.4ms	remaining: 6.6s
11:	learn: 0.5256854	total: 79.4ms	remaining: 6.54s
12:	learn: 0.5197792	total: 85.8ms	remaining: 6.51s
13:	learn: 0.5150962	total: 92.1ms	remaining: 6.49s
14:	learn: 0.5104508	total: 98.1ms	remaining: 6.44s
15:	learn: 0.5061779	total: 104ms	remaining: 6.41s
16:	learn: 0.5026820	total: 110ms	remaining: 6.37s
17:	learn: 0.4998763	total: 116ms	remaining: 6.34s
18:	learn: 0.4967796	total: 123ms	remaining: 6.33s
19:	

<catboost.core.CatBoostClassifier at 0x7fb52ac184c0>

In [161]:
test_pred = clf.predict(test_embeds)
test_proba = clf.predict_proba(test_embeds)[:, 1]

In [162]:
print("Accuracy:", accuracy_score(target_test, test_pred))
print("ROC-AUC:", roc_auc_score(target_test, test_proba))

Accuracy: 0.712
ROC-AUC: 0.7912450826439592


In [164]:
arr = np.array([0.8062683136099463, 0.795454177526671, 0.7912450826439592])

arr.mean(), arr.std()

(0.7976558579268588, 0.006327712309240838)

- CPC context embeds w/ Aug + Catboost (dim of trx embeds: 32):
  - `Accuracy: 0.752`, `0.748`, `0.742`, avg: `0.7473 +- 0.0041`
  - `ROC-AUC: 0.8051836622363244`, `0.8137313626135242`, `0.810639296757378`, avg: `0.8099 +- 0.0035`

---

- CPC context embeds (w/ Aggregation by Days, 1 day) + Catboost:
  - `Accuracy: 0.728`, `0.752`, `0.756`, avg: `0.7453 +- 0.0124`
  - `ROC-AUC: 0.806559712486442`, `0.8169529390814461`, `0.8197050395816806`, avg: `0.8144 +- 0.0057`

---

- CPC context embeds (w/ Aggregation by Days, half a day) + Catboost:
  - `Accuracy: 0.748`, `0.74`, `0.728`, avg: `0.7387 +- 0.0082`
  - `ROC-AUC: 0.8183128005050914`, `0.8161920642372634`, `0.8074177202894561`, avg: `0.814 +- 0.0047`

---

- CPC context embeds (w/ Aggregation by Days, 2 days) + Catboost:
  - `Accuracy: 0.736`, `0.752`, `0.744`, avg: `0.744 +- 0.0065`
  - `ROC-AUC: 0.8183775558109794`, `0.8223761959495557`, `0.8203525926405595`, avg: `0.8204 +- 0.0016`

---

- CPC context embeds (w/ Aggregation by Days, 2 days; w/ 'No Trx' Indicators) + Catboost:
  - `Accuracy: 0.73`, `0.722`, `0.736`, avg: `0.7293 +- 0.0057`
  - `ROC-AUC: 0.809117547069013`, `0.7998413495005747`, `0.8104774084926584`, avg: `0.8065 +- 0.0047`

**Логика + Вывод:** для CPC агрегация по времени приводит к ухудшению accuracy, но значимо улучшает ROC-AUC. Чем больше дней учтено в окне для агрегации, тем лучше выходит ROC-AUC (предполагается, что всё же с определённого количества дней результат будет становиться лишь хуже).

Лучшие результаты - при агрегации по 2 дням - наилучший ROC-AUC и не очень большие просадки по accuracy => будем далее экспериментировать с этой конфигурацией. Попробуем добавить в дни, в которые не было ни одной транзакции специальные транзакции-индикаторы, означающие отсутствие в этот день транзакций. 

Как оказалось, для конфигурации c транзакциями-индикаторами результаты оказались значительно хуже по метрикам, чем для обычной агрегации по 2 дням, и даже хуже, чем бейзлайн (особенно сильно просел accuracy).

**Лучший по метрикам результат:**

- CPC context embeds (w/ Aggregation by Days, 2 days) + Catboost:
  - `Accuracy: 0.736`, `0.752`, `0.744`, avg: `0.744 +- 0.0065`
  - `ROC-AUC: 0.8183775558109794`, `0.8223761959495557`, `0.8203525926405595`, avg: `0.8204 +- 0.0016`

---

**Результаты для CPC с меньшей размерностью embed_dim (8):**

- CPC context embeds w/ Aug + Catboost (dim of trx embeds: 8):
  - `Accuracy: 0.754`, `0.742`, `0.744`, avg: `0.7467 +- 0.0052`
  - `ROC-AUC: 0.8175195480079649`, `0.8197697948875686`, `0.8122096129251591`, avg: `0.8165 +- 0.0032`

---

- CPC context embeds (w/ Aggregation by Days, 1 day) + Catboost (dim of trx embeds: 8):
  - `Accuracy: 0.74`, `0.73`, `0.73`, avg: `0.7333 +- 0.0047`
  - `ROC-AUC: 0.8071748878923767`, `0.8047141862686373`, `0.80226967347137`, avg: `0.8047 +- 0.002`

---

- CPC context embeds (w/ Aggregation by Days, half a day) + Catboost (dim of trx embeds: 8):
  - `Accuracy: 0.748`, `0.728`, `0.722`, avg: `0.7327 +- 0.0111`
  - `ROC-AUC: 0.8096517783425878`, `0.8050865292774927`, `0.8056531382040117`, avg: `0.8068 +- 0.002`

---

- CPC context embeds (w/ Aggregation by Days, 2 days) + Catboost (dim of trx embeds: 8):
  - `Accuracy: 0.74`, `0.734`, `0.736`, avg: `0.7367 +- 0.0025`
  - `ROC-AUC: 0.8262129478234124`, `0.8055721940716517`, `0.8071586990659048`, avg: `0.813 +- 0.0094`

---

- CPC context embeds (w/ Aggregation by Days, 2 days; w/ 'No Trx' Indicators) + Catboost (dim of trx embeds: 8):
  - `Accuracy: 0.736`, `0.72`, `0.712`, avg: `0.7227 +- 0.01`
  - `ROC-AUC: 0.8062683136099463`, `0.795454177526671`, `0.7912450826439592`, avg: `0.7977 +- 0.0063`

**Логика + Вывод:** для CPC с эмбеддингами транзакций меньшей размерности агрегация по времени приводит к ухудшению как accuracy, так и ROC-AUC.

Лучшие результаты - при агрегации по 2 дням - наилучшие здесь ROC-AUC и accuracy для агрегации по времени => будем далее экспериментировать с этой конфигурацией. Попробуем добавить в дни, в которые не было ни одной транзакции специальные транзакции-индикаторы, означающие отсутствие в этот день транзакций. 

Как оказалось, для конфигурации c транзакциями-индикаторами результаты оказались значительно хуже по метрикам, чем для обычной агрегации по 2 дням, и хуже, чем бейзлайн.

**Лучший по метрикам результат:**

- CPC context embeds (w/ Aggregation by Days, 2 days) + Catboost (dim of trx embeds: 8):
  - `Accuracy: 0.74`, `0.734`, `0.736`, avg: `0.7367 +- 0.0025`
  - `ROC-AUC: 0.8262129478234124`, `0.8055721940716517`, `0.8071586990659048`, avg: `0.813 +- 0.0094`

---

# Итоги

| Method|Accuracy|ROC-AUC|
| --- |:---:|:---:|
| **Flattened Sequences**               | 0.67 ± 0.0046         | 0.7536 ± 0.003  |
| **GRU (+ MLP)**                       | 0.746 ± 0.0076        | 0.8148 ± 0.0037 |
| **CoLES**                             | 0.726 ± 0.0071        | 0.8076 ± 0.0025 |
| **COLES embeds + AggByDays (1 day)**  | 0.742 ± 0.0156        | 0.8116 ± 0.003  |
| **CPC Modeling (emb_dim=32)**         | 0.747 ± 0.0041        | 0.8099 ± 0.0035 |
| **CPC Modeling (emb_dim=32) + AggByDays (2 days)** | 0.744 ± 0.0065        | 0.8204 ± 0.0016 |
| **CPC Modeling (emb_dim=8)**          | 0.747 ± 0.0052        | 0.8165 ± 0.0032 |
| **CPC Modeling (emb_dim=8) + AggByDays (2 days)**  | 0.7367 ± 0.0025        | 0.813 ± 0.0094 |
| **TD-GPT**                            | 0.73 ± 0.0049         | 0.7949 ± 0.0065 |
| **TD-GPT + AggByDays** |   -      | - |


---

`'-'` means that method is not applicable here.