# Импортируем необходимые библиотеки

In [1]:
!pip install pytorch-lifestream
!pip install comet_ml
!pip install torch_scatter

Collecting pytorch-lifestream
  Downloading pytorch-lifestream-0.6.0.tar.gz (163 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.4/163.4 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting hydra-core>=1.1.2 (from pytorch-lifestream)
  Downloading hydra_core-1.3.2-py3-none-any.whl.metadata (5.5 kB)
Downloading hydra_core-1.3.2-py3-none-any.whl (154 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.5/154.5 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pytorch-lifestream
  Building wheel for pytorch-lifestream (pyproject.toml) ... [?25l[?25hdone
  Created wheel for pytorch-lifestream: filename=pytorch_lifestream-0.6.0-py3-none-any.whl size=274670 sha256=b87dc4fcb8d28a8c95585dbba1155a96d722797a4f1c6fbe0e1e88449455b3bd
 

In [2]:
# data preprocessing
import os
import numpy as np
import pandas as pd
import pickle

# misc
from tqdm import tqdm
from functools import partial

# logging
import comet_ml

# classical ML
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from catboost import CatBoostClassifier

# basic deep learning libs
import torch
from torch.utils.data import DataLoader
import pytorch_lightning as pl
import torchmetrics

# ptls
from ptls.nn import TrxEncoder, RnnSeqEncoder, TransformerEncoder, GptEncoder, Head
from ptls.frames import PtlsDataModule
from ptls.frames.coles import CoLESModule
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames.cpc import CpcModule
from ptls.frames.cpc import CpcDataset
from ptls.frames.gpt import GptDataset
from ptls.frames.supervised import SeqToTargetDataset, SequenceToTarget
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.datasets import inference_data_loader
from ptls.frames.inference_module import InferenceModule
from ptls.data_load.iterable_processing import SeqLenFilter, ToTorch, FilterNonArray, ISeqLenLimit
from ptls.data_load import IterableChain, padded_collate_wo_target
from ptls.data_load.filter_dataset import FilterDataset
from ptls.preprocessing import PandasDataPreprocessor
from ptls.data_load.utils import collate_feature_dict
from ptls.frames.inference_module import InferenceModule

# torch_scatter
import torch_scatter

In [3]:
def seed_everything(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [5]:
comet_ml.login()

In [6]:
from pytorch_lightning.loggers import CometLogger

# Эксперименты.

**Данные:**

In [101]:
path_data = "https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/transactions_train.csv.gz?download=true"
data = pd.read_csv(path_data, compression="gzip")
data

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,33172,6,4,71.463
1,33172,6,35,45.017
2,33172,8,11,13.887
3,33172,9,11,15.983
4,33172,10,11,21.341
...,...,...,...,...
26450572,43300,727,25,7.602
26450573,43300,727,15,3.709
26450574,43300,727,1,6.448
26450575,43300,727,11,24.669


In [102]:
path_target = "https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/train_target.csv?download=true"
target = pd.read_csv(path_target)
target

Unnamed: 0,client_id,bins
0,24662,2
1,1046,0
2,34089,2
3,34848,1
4,47076,3
...,...,...
29995,14303,1
29996,22301,2
29997,25731,0
29998,16820,3


In [103]:
target_train, target_test = train_test_split(target, test_size=0.1, stratify=target["bins"], random_state=42)

In [104]:
trx_data_train = pd.merge(data, target_train["client_id"], on="client_id", how="inner")
trx_data_test = pd.merge(data, target_test["client_id"], on="client_id", how="inner")

---

**Квантизация непрерывных признаков (опциональный шаг, нужен только для GPT):**

In [11]:
def digitize(input_array: np.array, q_count: int = 1, bins: np.array = None):
    """Quantile-based discretization function.

    Parameters:
    -------
    input_array (np.array): Input array.
    q_count (int): Amount of quantiles. Used only if input parameter `bins` is None.
    bins (np.array):
        If None, then calculate bins as quantiles of input array,
        otherwise only apply bins to input_array. Default: None

    Returns
    -------
    out_array (np.array of ints): discretized input_array
    bins (np.array of floats):
        Returned only if input parameter `bins` is None.
    """

    if bins is None:
        return_bins = True
        bins = np.quantile(input_array, q=[i / q_count for i in range(1, q_count)], axis=0)
    else:
        return_bins = False

    out_array = np.digitize(input_array, bins)

    if return_bins:
        return out_array, bins
    else:
        return out_array

In [12]:
BINS_NUM = 128

In [13]:
numeric_features = ["amount_rur"]

for feat in numeric_features:
    trx_data_train[feat], bins = digitize(trx_data_train[feat], q_count=BINS_NUM)
    trx_data_test[feat] = digitize(trx_data_test[feat], bins=bins)

In [14]:
import gc

gc.collect()

147

---

In [105]:
preprocessor = PandasDataPreprocessor(
    col_id="client_id",
    col_event_time="trans_date",
    event_time_transformation="none",
    cols_category=["small_group"],
    cols_numerical=["amount_rur"],
    return_records=False,
)

In [106]:
data_train = preprocessor.fit_transform(trx_data_train)
data_test = preprocessor.transform(trx_data_test)

In [107]:
target_train.rename(columns={"bins": "target"}, inplace=True)
target_test.rename(columns={"bins": "target"}, inplace=True)
target_train.sort_values(by="client_id", inplace=True)
target_test.sort_values(by="client_id", inplace=True)
target_train = target_train["target"]
target_test = target_test["target"]
target_train.reset_index(drop=True, inplace=True)
target_test.reset_index(drop=True, inplace=True)

In [108]:
data_train = data_train.to_dict(orient="records")
data_test = data_test.to_dict(orient="records")

---

**AddNulls (Add 'No Trx' Tokens for missing days to make a uniform time grid)** 

**(Если нет ни одного timestamp'а, относящегося к данному дню, то мы его добавляем; иначе ничего не делаем):**

In [109]:
from ptls.data_load.iterable_processing_dataset import IterableProcessingDataset

class AddNulls(IterableProcessingDataset):
    def __init__(self, null_cat, numeric_values, col_time, col_id):
        super().__init__()
        self.null_cat = null_cat
        self.numeric_values = numeric_values
        self.col_time = col_time
        self.col_id = col_id

    def __iter__(self):
        for rec in self._src:
            features = rec[0] if type(rec) is tuple else rec
            et = np.array(features[self.col_time])
    
            # Границы диапазона
            start, end = et[0], et[-1]

            # Массив полного диапазона
            complete = np.arange(start, end + 1)

            # Находим уникальные значения и их количество
            unique_vals, counts = np.unique(et, return_counts=True)

            # Для повторяющихся значений (если count > 1) создаём массив дополнительных копий (кроме первой)
            extra_list = [np.full(c - 1, val) for val, c in zip(unique_vals, counts) if c > 1]
            if extra_list:
                extra = np.concatenate(extra_list)
            else:
                extra = np.array([])  # Если повторов нет, оставляем пустым

            # Объединяем полный диапазон и дополнительные копии, затем сортируем
            new_event_time = np.sort(np.concatenate([complete, extra]))

            # Определяем, какие значения отсутствовали в исходном event_time (они появятся ровно по одному)
            missing_values = np.setdiff1d(complete, unique_vals)
            # Флаг: True – значение вставлено, False – значение присутствовало в исходном массиве
            inserted_mask = np.in1d(new_event_time, missing_values)

            # Обновляем event_time
            features[self.col_time] = torch.tensor(new_event_time, dtype=torch.int32)

            # Для остальных фич: создаём новый массив того же размера, где на позициях вставленных значений ставим 300,
            # а на остальных восстанавливаем исходные значения в порядке сортировки.
            for key, arr in features.items():
                if (key == self.col_time) | (key == self.col_id):
                    continue
                orig_arr = np.array(arr)
                if key in self.numeric_values:
                    new_arr = np.full(new_event_time.shape, 0)
                    # Индексы, где значения НЕ вставлены
                    non_inserted_idx = np.where(~inserted_mask)[0]
                    new_arr[non_inserted_idx] = orig_arr
                    features[key] = torch.tensor(new_arr, dtype=torch.float32)
                else:
                    new_arr = np.full(new_event_time.shape, self.null_cat)
                    # Индексы, где значения НЕ вставлены
                    non_inserted_idx = np.where(~inserted_mask)[0]
                    new_arr[non_inserted_idx] = orig_arr
                    features[key] = torch.tensor(new_arr, dtype=torch.int32)
            yield features

---

**Time Aggregator Class:**

In [110]:
from ptls.data_load.padded_batch import PaddedBatch


class TimeAggregator(TrxEncoder):
    """The NN layer, a combination of TrxEncoder and Mean Aggregation (by `n_days` days) 
       (works like nn.Sequential([TrxEncoder, Mean Aggregation])).
       `n_days` can be `int` only.
       
       The types of the input and output are `PaddedBatch` of shapes (B, L, T) and (B, L', T) respectively, where 
       B is batch_size,
       L/L' is the max length of a sequence of transactions in a batch (the length is the same as #trx)
       T is the dimensionality of a single transaction.

       Parameters
        n_days (float):
            The number of days used for data aggregation.

        use_window_attention (bool):
            If True, the attention layer will be applied to transactions in a sliding window before pooling.
            
        embeddings:
            You can find info about this param in TrxEncoder desc.
        
        numeric_values:
            You can find info about this param in TrxEncoder desc.

        embeddings_noise:
            You can find info about this param in TrxEncoder desc.
            
        emb_dropout:
            You can find info about this param in TrxEncoder desc.
            
        spatial_dropout:
            You can find info about this param in TrxEncoder desc.

        use_batch_norm:
            You can find info about this param in TrxEncoder desc.

        orthogonal_init:
            You can find info about this param in TrxEncoder desc.
            
        linear_projection_size:
            You can find info about this param in TrxEncoder desc.

        out_of_index:
            You can find info about this param in TrxEncoder desc.

        norm_embeddings:
            Keep default value for this parameter
        
        clip_replace_value:
            Not used. Keep default value for this parameter
        
        positions: 
            Not used. Keep default value for this parameter
       """

    def __init__(self,
                 n_days=1,
                 use_window_attention=False,
                 embeddings=None,
                 numeric_values=None,
                 custom_embeddings=None,
                 time_values=None,
                 embeddings_noise: float = 0,
                 norm_embeddings=None,
                 use_batch_norm=False,
                 use_batch_norm_with_lens=False,
                 clip_replace_value=None,
                 positions=None,
                 emb_dropout=0,
                 spatial_dropout=False,
                 orthogonal_init=False,
                 linear_projection_size=0,
                 out_of_index: str = 'clip',
                ):

        
        super().__init__(
            embeddings=embeddings,
            numeric_values=numeric_values,
            custom_embeddings=custom_embeddings,
            embeddings_noise=embeddings_noise,
            norm_embeddings=norm_embeddings,
            use_batch_norm=use_batch_norm,
            use_batch_norm_with_lens=use_batch_norm_with_lens,
            clip_replace_value=clip_replace_value,
            positions=positions,
            emb_dropout=emb_dropout,
            spatial_dropout=spatial_dropout,
            orthogonal_init=orthogonal_init,
            linear_projection_size=linear_projection_size,
            out_of_index=out_of_index
        )

        self.n_days = n_days
        
        self.use_window_attention = use_window_attention

        self.numeric_feats = list(numeric_values.keys())
        
        if self.use_window_attention:
            pass # Not Implemented

    def forward(self, pb: PaddedBatch):
        for key in pb.payload.keys():
            if key in self.numeric_feats:
                pb.payload[key] = pb.payload[key].to(torch.float32)
            else:
                pb.payload[key] = pb.payload[key].to(torch.int32)
        embeds = super().forward(pb)

        timestamps = pb.payload["event_time"] // self.n_days
    
        mask = torch.arange(embeds.payload.shape[1], device=embeds.device)[None, :] + torch.ones((embeds.seq_lens.shape[0], embeds.payload.shape[1]), device=embeds.device)
        mask[mask > embeds.seq_lens[:, None]] = 0.
        mask[mask > 0.] = 1.
        mask = mask[:, :, None]
        
        masked_embeds = embeds.payload * mask

        agg_embeds = []
        seq_lens = []
        max_seq_len = 0
        for j in range(masked_embeds.shape[0]):
            curr_seq_embeds = masked_embeds[j, :, :]
            curr_timestamps = timestamps[j, :]
            
            unique_timestamps, idx = torch.unique_consecutive(curr_timestamps, return_inverse=True) 
            
            curr_agg_seq = torch_scatter.scatter(curr_seq_embeds, idx[:, None], dim=0, reduce="mean")
            if unique_timestamps[-1].item() == 0:
                curr_agg_seq = curr_agg_seq[:-1, :]
    
            agg_embeds += [curr_agg_seq]
            seq_lens += [curr_agg_seq.shape[0]]
            max_seq_len = max(max_seq_len, curr_agg_seq.shape[0])
    
        for j in range(len(agg_embeds)):
            if max_seq_len - agg_embeds[j].shape[0] > 0:
                agg_embeds[j] = torch.cat([agg_embeds[j], torch.zeros((max_seq_len - agg_embeds[j].shape[0], agg_embeds[j].shape[1]), device=agg_embeds[j].device)], dim=0)[None, :, :]
            else:
                agg_embeds[j] = agg_embeds[j][None, :, :]
    
        agg_embeds = torch.cat(agg_embeds, dim=0)
        
        seq_lens = torch.tensor(seq_lens, device=agg_embeds.device).int()
        
        return PaddedBatch(agg_embeds, seq_lens)

In [45]:
# seed_everything(0)

In [46]:
# device = "cuda:0"

In [47]:
# trx_encoder_params = dict(
#     embeddings_noise=0.003,
#     numeric_values={"amount_rur": "log"},
#     embeddings={
#         "trans_date": {"in": 800, "out": 16},
#         "small_group": {"in": 250, "out": 16},
#     },
# )

# trx_encoder = TrxEncoder(**trx_encoder_params).to(device)

In [None]:
# from ptls.data_load.padded_batch import PaddedBatch


# train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)
# agg_samples = 5
# use_attention = False
# trx_encoder.eval()

# for i, batch in tqdm(enumerate(train_loader)):
#     batch = batch.to(device)
        
#     embeds = trx_encoder(batch)
    
#     mask = torch.arange(embeds.payload.shape[1], device=embeds.device)[None, :] + torch.ones((embeds.seq_lens.shape[0], embeds.payload.shape[1]), device=embeds.device)
#     mask[mask > embeds.seq_lens[:, None]] = 0.
#     mask[mask > 0.] = 1.
#     mask = mask[:, :, None]
    
#     masked_embeds = embeds.payload * mask
    
#     num_samples_to_add = agg_samples - (masked_embeds.shape[1] % agg_samples)  
#     if num_samples_to_add > 0:
#         additional_samples = torch.zeros((masked_embeds.shape[0], num_samples_to_add, masked_embeds.shape[2]), device=masked_embeds.device)
#         masked_embeds = torch.cat((masked_embeds, additional_samples), dim=1)

#         mask_additional_samples = torch.zeros((mask.shape[0], num_samples_to_add, mask.shape[2]), device=mask.device)
#         mask = torch.cat((mask, mask_additional_samples), dim=1)
    
#     masked_embeds = torch.reshape(masked_embeds, (masked_embeds.shape[0], masked_embeds.shape[1] // agg_samples, agg_samples, masked_embeds.shape[2]))
#     mask = torch.reshape(mask, (mask.shape[0], mask.shape[1] // agg_samples, agg_samples, mask.shape[2]))

#     mask = torch.sum(mask, dim=2)
#     mask[mask == 0.] = 1.
    
#     mean_embeds = torch.sum(masked_embeds, dim=2) / mask

#     new_seq_lens = embeds.seq_lens // agg_samples
#     div_mod_seq_lens = ((embeds.seq_lens % agg_samples) > 0).int()
#     new_seq_lens += div_mod_seq_lens

#     out = PaddedBatch(mean_embeds, new_seq_lens)

#     if i == 0:
#         print(out.payload)

In [54]:
# seed_everything(0)

In [55]:
# agg_encoder_params = dict(
#     embeddings_noise=0.003,
#     numeric_values={"amount_rur": "log"},
#     embeddings={
#         "trans_date": {"in": 800, "out": 16},
#         "small_group": {"in": 250, "out": 16},
#     },
#     n_days=1,
#     use_window_attention=False
# )

# trx_encoder = TimeAggregator(**agg_encoder_params)

In [56]:
# device = "cuda:0"

# trx_encoder.to(device)

TimeAggregator(
  (embeddings): ModuleDict(
    (trans_date): NoisyEmbedding(
      800, 16, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
    (small_group): NoisyEmbedding(
      250, 16, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
  )
  (custom_embeddings): ModuleDict(
    (amount_rur): LogScaler()
  )
)

In [None]:
# train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)
# trx_encoder.eval()

# for i, batch in tqdm(enumerate(train_loader)):
#     batch = batch.to(device)    
#     embeds = trx_encoder(batch)

#     if i == 0:
#         print(embeds.payload)
#         print(embeds.seq_lens)
#         #print(batch.payload)

---

**Train sequences lengths check:**

In [122]:
agg_encoder_params = dict(
    embeddings_noise=0.003,
    numeric_values={"amount_rur": "log"},
    embeddings={
        "trans_date": {"in": 800, "out": 16},
        "small_group": {"in": 250, "out": 16},
    },
    n_days=1, # 1, 2
    use_window_attention=False
)

trx_encoder = TimeAggregator(**agg_encoder_params)
trx_encoder.to("cuda")

TimeAggregator(
  (embeddings): ModuleDict(
    (trans_date): NoisyEmbedding(
      800, 16, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
    (small_group): NoisyEmbedding(
      250, 16, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
  )
  (custom_embeddings): ModuleDict(
    (amount_rur): LogScaler()
  )
)

In [123]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)

trx_encoder.eval()

seq_lens = []

for batch in tqdm(train_loader):
    embeds_batch = trx_encoder(batch.to("cuda"))
    seq_lens += [embeds_batch.seq_lens.detach().cpu().numpy()]

seq_lens = np.concatenate(seq_lens)

threshold = int(np.quantile(seq_lens, 0.75) * 0.7)

print("Max Length:", threshold)

211it [00:14, 14.60it/s]

Max Length: 368





---

# Aggregation By Days (Mean Pooling) 

- **COLES:**

In [192]:
seed_everything(17)

**DataLoaders:**

In [193]:
data = PtlsDataModule(
    train_data=ColesDataset(
        MemoryMapDataset(
            data=data_train,
            i_filters=[
                       SeqLenFilter(min_seq_len=30), 
                       AddNulls(null_cat=1000, numeric_values=['amount_rur'], col_time='trans_date', col_id='client_id')
                      ],
        ),
        splitter=SampleSlices(
            split_count=5,
            cnt_min=30,
            cnt_max=368,
        ),
    ),
    train_num_workers=4,
    train_batch_size=128,
    valid_data=ColesDataset(
        MemoryMapDataset(
            data=data_test,
            i_filters=[
                       SeqLenFilter(min_seq_len=30),
                       AddNulls(null_cat=1000, numeric_values=['amount_rur'], col_time='trans_date', col_id='client_id')
                      ],
        ),
        splitter=SampleSlices(
            split_count=5,
            cnt_min=30,
            cnt_max=368,
        ),
    ),
    valid_num_workers=4,
    valid_batch_size=128
)

**Модель:**

In [194]:
N_EPOCHS = 20

In [195]:
agg_encoder_params = dict(
    embeddings_noise=0.003,
    numeric_values={"amount_rur": "log"},
    embeddings={
        "trans_date": {"in": 1001, "out": 16},
        "small_group": {"in": 1001, "out": 16},
        #"trans_date": {"in": 800, "out": 16},
        #"small_group": {"in": 250, "out": 16},
    },
    n_days=1, # 1, 2
    use_window_attention=False
)

trx_encoder = TimeAggregator(**agg_encoder_params)

seq_encoder = RnnSeqEncoder(
    trx_encoder=trx_encoder,
    hidden_size=512,
    type="gru"
)

coles = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=3e-3, weight_decay=5e-4),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.CosineAnnealingLR, T_max=N_EPOCHS, eta_min=1e-6)
)

**Обучение:**

In [196]:
logger = CometLogger(project_name="EvS_SSL", experiment_name="CoLES_AggByDays (1 day, w/ AddNulls)")

trainer = pl.Trainer(
    logger=logger,
    max_epochs=N_EPOCHS,
    accelerator="gpu",
    devices=1,
    enable_progress_bar=True
)

In [None]:
trainer.fit(coles, data)

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/askoro/evs-ssl/ece7f6aab4cb4ba985b666aa49f9e0fa

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/kaggle/working' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

In [198]:
trainer.logged_metrics

{'loss': tensor(146.8837),
 'seq_len': tensor(201.2017),
 'valid/recall_top_k': tensor(0.7715)}

In [56]:
torch.save(seq_encoder.state_dict(), "coles_enc_win_agg.pt")

**Измерим качество на тесте (catboost поверх эмбеддингов):**

In [199]:
encoder = coles.seq_encoder

device = "cuda:0"

encoder.to(device)

RnnSeqEncoder(
  (trx_encoder): TimeAggregator(
    (embeddings): ModuleDict(
      (trans_date): NoisyEmbedding(
        1001, 16, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (small_group): NoisyEmbedding(
        1001, 16, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
    )
    (custom_embeddings): ModuleDict(
      (amount_rur): LogScaler()
    )
  )
  (seq_encoder): RnnEncoder(
    (rnn): GRU(33, 512, batch_first=True)
    (reducer): LastStepEncoder()
  )
)

In [200]:
from tqdm import tqdm

seed_everything(17)

In [201]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)
encoder.eval()
train_embeds = None

for i, batch in tqdm(enumerate(train_loader)):
    train_embeds_batch = encoder(batch.to(device))
    if i == 0:
        train_embeds = train_embeds_batch.detach().cpu().numpy()
    else:
        train_embeds = np.concatenate([train_embeds, train_embeds_batch.detach().cpu().numpy()], axis=0)
    
train_embeds


0it [00:00, ?it/s][A
1it [00:00,  5.14it/s][A
2it [00:00,  6.09it/s][A
3it [00:00,  6.55it/s][A
4it [00:00,  6.85it/s][A
5it [00:00,  7.10it/s][A
6it [00:00,  7.18it/s][A
7it [00:01,  7.29it/s][A
8it [00:01,  7.32it/s][A
9it [00:01,  7.19it/s][A
10it [00:01,  7.11it/s][A
11it [00:01,  7.09it/s][A
12it [00:01,  7.02it/s][A
13it [00:01,  7.07it/s][A
14it [00:01,  7.18it/s][A
15it [00:02,  7.14it/s][A
16it [00:02,  7.11it/s][A
17it [00:02,  7.04it/s][A
18it [00:02,  7.12it/s][A
19it [00:02,  7.21it/s][A
20it [00:02,  7.23it/s][A
21it [00:02,  7.23it/s][A
22it [00:03,  7.24it/s][A
23it [00:03,  7.27it/s][A
24it [00:03,  7.27it/s][A
25it [00:03,  7.29it/s][A
26it [00:03,  7.27it/s][A
27it [00:03,  7.33it/s][A
28it [00:03,  7.28it/s][A
29it [00:04,  7.30it/s][A
30it [00:04,  7.31it/s][A
31it [00:04,  7.31it/s][A
32it [00:04,  7.36it/s][A
33it [00:04,  7.37it/s][A
34it [00:04,  7.39it/s][A
35it [00:04,  7.43it/s][A
36it [00:05,  7.44it/s][A
37it [00:05,  

array([[ 2.8425206e-03, -9.7964609e-01, -3.2377061e-01, ...,
        -7.1829192e-02, -8.8294458e-01, -3.8477625e-03],
       [-1.6972843e-01, -2.6135787e-01, -4.3345577e-01, ...,
         2.9847172e-01, -7.4816060e-01,  1.5940424e-02],
       [-9.1814564e-04,  2.4249214e-03, -5.3966939e-01, ...,
        -3.7906498e-01,  4.1144556e-01, -6.2717512e-02],
       ...,
       [ 2.8373690e-02, -1.4805751e-01, -3.7862214e-01, ...,
        -4.6238199e-01, -3.6332053e-01, -5.8375042e-02],
       [ 4.4077501e-02, -4.3897101e-01, -2.8330559e-01, ...,
        -5.4755992e-01,  4.0678298e-01, -2.4106253e-02],
       [-3.9166901e-02, -1.1854508e-01, -3.2839009e-01, ...,
         2.2816817e-01,  4.1406390e-01, -1.4972411e-02]], dtype=float32)

In [202]:
test_loader = inference_data_loader(data_test, num_workers=0, batch_size=128)
encoder.eval()
test_embeds = None

for i, batch in tqdm(enumerate(test_loader)):
    test_embeds_batch = encoder(batch.to(device))
    if i == 0:
        test_embeds = test_embeds_batch.detach().cpu().numpy()
    else:
        test_embeds = np.concatenate([test_embeds, test_embeds_batch.detach().cpu().numpy()], axis=0)
    
test_embeds


0it [00:00, ?it/s][A
1it [00:00,  6.68it/s][A
2it [00:00,  7.02it/s][A
3it [00:00,  7.14it/s][A
4it [00:00,  7.18it/s][A
5it [00:00,  7.09it/s][A
6it [00:00,  7.21it/s][A
7it [00:00,  7.28it/s][A
8it [00:01,  7.34it/s][A
9it [00:01,  7.38it/s][A
10it [00:01,  7.39it/s][A
11it [00:01,  7.42it/s][A
12it [00:01,  7.36it/s][A
13it [00:01,  7.37it/s][A
14it [00:01,  7.38it/s][A
15it [00:02,  7.33it/s][A
16it [00:02,  7.35it/s][A
17it [00:02,  7.31it/s][A
18it [00:02,  7.33it/s][A
19it [00:02,  7.41it/s][A
20it [00:02,  7.46it/s][A
21it [00:02,  7.38it/s][A
22it [00:03,  7.41it/s][A
24it [00:03,  7.47it/s][A


array([[-0.06609811, -0.09023617, -0.32359967, ...,  0.12252191,
        -0.00967149,  0.00635439],
       [ 0.05263222, -0.22831608, -0.29678375, ..., -0.21249671,
        -0.75594705, -0.01233044],
       [-0.00463578, -0.21982203, -0.46017352, ..., -0.54700077,
        -0.4663102 ,  0.0288155 ],
       ...,
       [-0.02561516, -0.96200335, -0.2771003 , ...,  0.01120308,
        -0.6018382 , -0.03068198],
       [-0.11381032, -0.15281488, -0.39538053, ..., -0.1878948 ,
         0.44572863, -0.00134987],
       [ 0.00823167, -0.09243694, -0.33361372, ...,  0.16006725,
        -0.49312705,  0.02546082]], dtype=float32)

In [203]:
clf = CatBoostClassifier(loss_function='MultiClass', task_type="GPU", devices='0', random_state=17)

clf.fit(train_embeds, target_train, plot_file="catboost_log.html")

Learning rate set to 0.12714
0:	learn: 1.3079071	total: 13.8ms	remaining: 13.8s
1:	learn: 1.2501163	total: 24.1ms	remaining: 12s
2:	learn: 1.2030255	total: 34.2ms	remaining: 11.4s
3:	learn: 1.1652435	total: 44.5ms	remaining: 11.1s
4:	learn: 1.1332253	total: 55.1ms	remaining: 11s
5:	learn: 1.1077017	total: 65.2ms	remaining: 10.8s
6:	learn: 1.0858117	total: 75.3ms	remaining: 10.7s
7:	learn: 1.0670127	total: 85.6ms	remaining: 10.6s
8:	learn: 1.0495838	total: 95.8ms	remaining: 10.6s
9:	learn: 1.0340195	total: 107ms	remaining: 10.5s
10:	learn: 1.0213372	total: 116ms	remaining: 10.5s
11:	learn: 1.0100749	total: 127ms	remaining: 10.4s
12:	learn: 1.0001852	total: 137ms	remaining: 10.4s
13:	learn: 0.9907035	total: 147ms	remaining: 10.4s
14:	learn: 0.9830870	total: 157ms	remaining: 10.3s
15:	learn: 0.9759142	total: 167ms	remaining: 10.3s
16:	learn: 0.9692036	total: 178ms	remaining: 10.3s
17:	learn: 0.9632307	total: 188ms	remaining: 10.3s
18:	learn: 0.9576424	total: 198ms	remaining: 10.2s
19:	lea

<catboost.core.CatBoostClassifier at 0x7fc293fd5480>

In [204]:
test_pred = clf.predict(test_embeds)
test_proba = clf.predict_proba(test_embeds)

In [205]:
print("Accuracy:", accuracy_score(target_test, test_pred))
print("ROC-AUC:", roc_auc_score(target_test, test_proba, average="weighted", multi_class="ovr"))

Accuracy: 0.591
ROC-AUC: 0.8412818485451155


In [121]:
arr = np.array([0.8374069458382453, 0.8371004593414186, 0.8348023797489404])

arr.mean(), arr.std()

(0.8364365949762015, 0.0011623189607972097)

- COLES embeds + Catboost:
  - `Accuracy: 0.6133333333333333`, `0.606`, `0.5933333333333334`, avg: `0.6042 +- 0.0083`
  -  `ROC-AUC: 0.8490542004456147`, `0.848260886697585`, `0.8472952867923927`, avg: `0.8482 +- 0.0007`

---

- COLES embeds (w/ Aggregation by Days, 1 day) + Catboost:
  - `Accuracy: 0.5903333333333334`, `0.5976666666666667`, `0.5976666666666667`, avg: `0.5952 +- 0.0035`
  - `ROC-AUC: 0.8359934171032082`, `0.8401389801353204`, `0.8431170146372144`, avg: `0.8397 +- 0.0029`

---

- COLES embeds (w/ Aggregation by Days, 2 days) + Catboost:
  - `Accuracy: 0.5796666666666667`, `0.5846666666666667`, `0.5756666666666667`, avg: `0.58 +- 0.0037`
  - `ROC-AUC: 0.8374069458382453`, `0.8371004593414186`, `0.8348023797489404`, avg: `0.8364 +- 0.0012`

---

- COLES embeds (w/ Aggregation by Days, 1 day; w/ 'No Trx' Indicators) + Catboost:
  - `Accuracy: 0.6`, `0.591`, `0.5986666666666667`, avg: `0.5966 +- 0.004`
  - `ROC-AUC: 0.8440655920805434`, `0.8412818485451155`, `0.8421997227414773`, avg: `0.8425 +- 0.0012`

---

**Логика + Вывод:** для CoLES агрегация по времени приводит к худшему, чем в случае бейзлайна, качеству. При этом с увеличением периода времени, учитываемого при агрегации, качество лишь ухудшается.

Так как для агрегации по одному дню результаты лучше, чем для агрегации по двум дням, будем далее экспериментировать с этой конфигурацией (агрегация по одному дню). В частности, попробуем добавить в дни, в которые не было ни одной транзакции специальные транзакции-индикаторы, означающие отсутствие в этот день транзакций.

Неожиданно (во всех остальных случаях - иначе) для конфигурации c транзакциями-индикаторами результаты оказались несколько лучше, чем для обычной агрегации по 1 дню, как по accuracy, так и по ROC-AUC, хотя эта конфигурация всё ещё хуже, чем бейзлайн. 

**Лучший по метрикам результат:**

- COLES embeds (w/ Aggregation by Days, 1 day; w/ 'No Trx' Indicators) + Catboost:
  - `Accuracy: 0.6`, `0.591`, `0.5986666666666667`, avg: `0.5966 +- 0.004`
  - `ROC-AUC: 0.8440655920805434`, `0.8412818485451155`, `0.8421997227414773`, avg: `0.8425 +- 0.0012`

---

**Train sequences lengths check:**

In [17]:
agg_encoder_params = dict(
    embeddings_noise=0.003,
    numeric_values={"amount_rur": "log"},
    embeddings={
        "trans_date": {"in": 800, "out": 16},
        "small_group": {"in": 250, "out": 16},
    },
    n_days=2, # 1, 2
    use_window_attention=False
)

trx_encoder = TimeAggregator(**agg_encoder_params)
trx_encoder.to("cuda")

TimeAggregator(
  (embeddings): ModuleDict(
    (trans_date): NoisyEmbedding(
      800, 16, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
    (small_group): NoisyEmbedding(
      250, 16, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
  )
  (custom_embeddings): ModuleDict(
    (amount_rur): LogScaler()
  )
)

In [18]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)

trx_encoder.eval()

seq_lens = []

for batch in tqdm(train_loader):
    embeds_batch = trx_encoder(batch.to("cuda"))
    seq_lens += [embeds_batch.seq_lens.detach().cpu().numpy()]

seq_lens = np.concatenate(seq_lens)

threshold = int(np.quantile(seq_lens, 0.6))

print("Max Length:", threshold)

211it [00:13, 15.17it/s]

Max Length: 315





---

- **CPC modeling:**

In [111]:
seed_everything(42)

**DataLoaders:**

In [112]:
data = PtlsDataModule(
    train_data=CpcDataset(
        MemoryMapDataset(
            data=data_train,
            i_filters=[AddNulls(null_cat=1000, numeric_values=['amount_rur'], col_time='trans_date', col_id='client_id')]
        ),
        min_len=309,             
        max_len=315
    ),
    train_num_workers=4,
    train_batch_size=64,
    valid_data=CpcDataset(
        MemoryMapDataset(
            data=data_test,
            i_filters=[AddNulls(null_cat=1000, numeric_values=['amount_rur'], col_time='trans_date', col_id='client_id')]
        ),
        min_len=309,
        max_len=315
    ),
    valid_num_workers=4,
    valid_batch_size=64
)

**Модель:**

In [113]:
N_EPOCHS = 20

In [114]:
agg_encoder_params = dict(
    embeddings_noise=0.003,
    numeric_values={"amount_rur": "log"},
    embeddings={
        "trans_date": {"in": 1001, "out": 128},
        "small_group": {"in": 1001, "out": 128},
        # "trans_date": {"in": 800, "out": 128},
        # "small_group": {"in": 250, "out": 128},
    },
    n_days=2, # 1, 2
    use_window_attention=False
)

trx_encoder = TimeAggregator(**agg_encoder_params)

seq_encoder = RnnSeqEncoder(
    trx_encoder=trx_encoder,
    hidden_size=512,
    type="gru"
)

cpc = CpcModule(
    seq_encoder=seq_encoder,
    n_forward_steps=6, 
    n_negatives=40,
    optimizer_partial=partial(torch.optim.Adam, lr=2e-3),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=5, gamma=0.5)
)

**Обучение:**

In [115]:
logger = CometLogger(project_name="EvS_SSL", experiment_name="CPC_modeling_AggByDays (2 days, w/ AddNulls)")

trainer = pl.Trainer(
    logger=logger,
    max_epochs=N_EPOCHS,
    accelerator="gpu",
    devices=1,
    enable_progress_bar=True
)

In [116]:
trainer.fit(cpc, data)

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/kaggle/working' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.
[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/askoro/evs-ssl/f716dea771964e42a1fc562fbc4b37fc



Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : CPC_modeling_AggByDays (2 days, w/ AddNulls)
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/askoro/evs-ssl/f716dea771964e42a1fc562fbc4b37fc
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     loss [1012]             : (0.7066872119903564, 4.526556491851807)
[1;38;5;39mCOMET INFO:[0m     seq_len [168]           : (311.4375, 312.53125)
[1;38;5;39mCOMET INFO:[0m     valid/cpc_accuracy [20] : (0.8983393907546997, 0.929221510887146)
[1;38;5;39mCOMET INFO:[0m   Others:
[1;38;5;39mCOMET I

In [117]:
trainer.logged_metrics

{'loss': tensor(0.9989),
 'seq_len': tensor(311.6964),
 'valid/cpc_accuracy': tensor(0.9291)}

In [23]:
# torch.save(seq_encoder.state_dict(), "cpc_enc_win_agg_trx20.pt")

**Измерим качество на тесте (catboost поверх эмбеддингов):**

In [118]:
encoder = cpc.seq_encoder

device = "cuda:0"

encoder.to(device)

RnnSeqEncoder(
  (trx_encoder): TimeAggregator(
    (embeddings): ModuleDict(
      (trans_date): NoisyEmbedding(
        1001, 128, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (small_group): NoisyEmbedding(
        1001, 128, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
    )
    (custom_embeddings): ModuleDict(
      (amount_rur): LogScaler()
    )
  )
  (seq_encoder): RnnEncoder(
    (rnn): GRU(257, 512, batch_first=True)
    (reducer): LastStepEncoder()
  )
)

In [119]:
encoder.seq_encoder.is_reduce_sequence = True

In [120]:
from tqdm import tqdm

seed_everything(42)

In [121]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)
encoder.eval()
train_embeds = None

for i, batch in tqdm(enumerate(train_loader)):
    train_embeds_batch = encoder(batch.to(device))
    if i == 0:
        train_embeds = train_embeds_batch.detach().cpu().numpy()
    else:
        train_embeds = np.concatenate([train_embeds, train_embeds_batch.detach().cpu().numpy()], axis=0)
    
train_embeds

211it [00:28,  7.48it/s]


array([[ 9.0661354e-02, -1.7940442e-04,  1.6187808e-01, ...,
        -2.0322989e-01,  1.5438804e-01, -2.1484785e-01],
       [-1.2730356e-01,  5.7423402e-02,  7.5044572e-02, ...,
        -1.6158232e-01,  1.1380931e-01, -3.0741990e-01],
       [ 6.1412532e-02,  2.1402998e-01,  1.4462718e-01, ...,
        -3.4001883e-02, -4.6531048e-02, -1.4534314e-01],
       ...,
       [ 2.3595154e-01,  6.6408962e-02,  2.1761590e-01, ...,
        -1.2518129e-01,  1.5884288e-01, -3.3319229e-01],
       [-5.9891578e-02, -1.4634946e-02,  2.2151016e-01, ...,
        -2.3773111e-01,  2.8775817e-02, -2.1939257e-01],
       [-1.8175900e-01, -1.3543776e-01,  1.9350383e-01, ...,
        -3.6563453e-01,  1.4757746e-01, -2.3649485e-01]], dtype=float32)

In [122]:
test_loader = inference_data_loader(data_test, num_workers=0, batch_size=128)
encoder.eval()
test_embeds = None

for i, batch in tqdm(enumerate(test_loader)):
    test_embeds_batch = encoder(batch.to(device))
    if i == 0:
        test_embeds = test_embeds_batch.detach().cpu().numpy()
    else:
        test_embeds = np.concatenate([test_embeds, test_embeds_batch.detach().cpu().numpy()], axis=0)
    
test_embeds

24it [00:02,  8.03it/s]


array([[-0.09716944, -0.06535138,  0.10782406, ..., -0.15078679,
         0.18098472, -0.3724438 ],
       [ 0.19423717,  0.15957253,  0.08630638, ..., -0.05126814,
        -0.01902957,  0.07876328],
       [ 0.00109657,  0.05739365,  0.18357062, ..., -0.15846033,
         0.08779249, -0.1989849 ],
       ...,
       [-0.05791983,  0.05395125,  0.19589399, ..., -0.07685129,
         0.11897571, -0.18030733],
       [-0.03931396,  0.08975551,  0.12729435, ..., -0.01959309,
         0.17313237, -0.34923705],
       [ 0.06401593,  0.14231046,  0.16555986, ..., -0.08124476,
         0.24138758, -0.16685297]], dtype=float32)

In [123]:
clf = CatBoostClassifier(loss_function='MultiClass', task_type="GPU", devices='0', random_state=42)

clf.fit(train_embeds, target_train, plot_file="catboost_log.html")

Learning rate set to 0.12714
0:	learn: 1.3205184	total: 13.9ms	remaining: 13.9s
1:	learn: 1.2692111	total: 24.3ms	remaining: 12.1s
2:	learn: 1.2302591	total: 34.5ms	remaining: 11.5s
3:	learn: 1.1976002	total: 44.6ms	remaining: 11.1s
4:	learn: 1.1706399	total: 54.7ms	remaining: 10.9s
5:	learn: 1.1489706	total: 64.7ms	remaining: 10.7s
6:	learn: 1.1286913	total: 75.4ms	remaining: 10.7s
7:	learn: 1.1114361	total: 85.8ms	remaining: 10.6s
8:	learn: 1.0972407	total: 96.7ms	remaining: 10.7s
9:	learn: 1.0851062	total: 107ms	remaining: 10.6s
10:	learn: 1.0738839	total: 117ms	remaining: 10.5s
11:	learn: 1.0641899	total: 127ms	remaining: 10.5s
12:	learn: 1.0551678	total: 138ms	remaining: 10.5s
13:	learn: 1.0479664	total: 148ms	remaining: 10.4s
14:	learn: 1.0412188	total: 158ms	remaining: 10.4s
15:	learn: 1.0347198	total: 167ms	remaining: 10.3s
16:	learn: 1.0292612	total: 176ms	remaining: 10.2s
17:	learn: 1.0240231	total: 187ms	remaining: 10.2s
18:	learn: 1.0189375	total: 198ms	remaining: 10.2s
19:

<catboost.core.CatBoostClassifier at 0x78b62c0b4490>

In [124]:
test_pred = clf.predict(test_embeds)
test_proba = clf.predict_proba(test_embeds)

In [125]:
print("Accuracy:", accuracy_score(target_test, test_pred))
print("ROC-AUC:", roc_auc_score(target_test, test_proba, average="weighted", multi_class="ovr"))

Accuracy: 0.5526666666666666
ROC-AUC: 0.8083604141160274


In [127]:
arr = np.array([0.8119719125015539, 0.8194989765156216, 0.8083604141160274])

arr.mean(), arr.std()

(0.813277101044401, 0.004640009406003254)

- CPC context embeds + Catboost:
   - `Accuracy: 0.5773333333333334`, `0.5686666666666667`, `0.5826666666666667`, avg: `0.5762 +- 0.0058`
   - ` ROC-AUC: 0.830123007110738`, `0.8271157616313021`, `0.8343491131233265`, avg: `0.8305 +- 0.003`

---

- CPC context embeds (w/ Aggregation by Days, 1 day) + Catboost:
  - `Accuracy: 0.5566666666666666`, `0.5633333333333334`, `0.5666666666666667`, avg: `0.5622 +- 0.0042`
  - `ROC-AUC: 0.8170450532014047`, `0.8193299016595438`, `0.8193262261512632`, avg: `0.8186 +- 0.0011`

---

- CPC context embeds (w/ Aggregation by Days, 2 days) + Catboost:
  - `Accuracy: 0.561`, `0.571`, `0.5726666666666667`, avg: `0.5682 +- 0.0052`
  - `ROC-AUC: 0.8206436640112535`, `0.8229407334548793`, `0.8270720554974593`, avg: `0.8236 +- 0.0027`

---

- CPC context embeds (w/ Aggregation by Days, 2 days; w/ 'No Trx' Indicators) + Catboost:
  - `Accuracy: 0.5573333333333333`, `0.5723333333333334`, `0.5526666666666666`, avg: `0.5608 +- 0.0084`
  - `ROC-AUC: 0.8119719125015539`, `0.8194989765156216`, `0.8083604141160274`, avg: `0.8133 +- 0.0046`

---

**Логика + Вывод:** для CPC агрегация по времени также приводит к худшему, чем в случае бейзлайна, качеству. При этом, в отличие от CoLES, с увеличением периода времени, учитываемого при агрегации, качество становится лучше (хотя далее эксперименты не проводились, предполагается, что с увеличением учитываемого при агрегации периода времени значения метрик в итоге выйдут на плато, а затем начнут ухудшаться - из-за слишком сильного сжатия информации).

Так как для агрегации по двум дням результаты значительно лучше, чем для агрегации по одному дню, будем далее экспериментировать с этой конфигурацией (агрегация по двум дням). В частности, попробуем добавить в дни, в которые не было ни одной транзакции специальные транзакции-индикаторы, означающие отсутствие в этот день транзакций.

Получили, что для конфигурации c транзакциями-индикаторами результаты оказались значительно хуже, чем для обычной агрегации по 2 дням, как по accuracy, так и по ROC-AUC. 

**Лучший по метрикам результат:**

- CPC context embeds (w/ Aggregation by Days, 2 days) + Catboost:
  - `Accuracy: 0.561`, `0.571`, `0.5726666666666667`, avg: `0.5682 +- 0.0052`
  - `ROC-AUC: 0.8206436640112535`, `0.8229407334548793`, `0.8270720554974593`, avg: `0.8236 +- 0.0027`

---

# Итоги.

| Method|Accuracy|ROC-AUC|
| --- |:---:|:---:|
| **Flattened Sequences**                   | 0.4921 ± 0.005        | 0.76 ± 0.0012   |
| **GRU (+ MLP)**                           | 0.6066 ± 0.0019       | 0.8479 ± 0.0013 |
| **CoLES**                                 | 0.6042 ± 0.0083       | 0.8482 ± 0.0007 |
| **CoLES embeds + AggByDays (1 day, w/ Null Tokens)** | 0.5966 ± 0.004       | 0.8425 ± 0.0012 |
| **CPC Modeling**                          | 0.5762 ± 0.0058       | 0.8305 ± 0.003  |
| **CPC Modeling + AggByDays (2 days)**     | 0.5682 ± 0.0052       | 0.8236 ± 0.0027 |
| **GPT2**                                  | 0.6146 ± 0.0075       | 0.852 ± 0.0029  |
| **GPT2 + AggByDays**                      | -                     | -               |