# Импортируем необходимые библиотеки

In [1]:
!pip install pytorch-lifestream
!pip install comet_ml

Collecting pytorch-lifestream
  Downloading pytorch-lifestream-0.6.0.tar.gz (163 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.4/163.4 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting duckdb (from pytorch-lifestream)
  Downloading duckdb-1.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (966 bytes)
Collecting hydra-core>=1.1.2 (from pytorch-lifestream)
  Downloading hydra_core-1.3.2-py3-none-any.whl.metadata (5.5 kB)
Collecting omegaconf (from pytorch-lifestream)
  Downloading omegaconf-2.3.0-py3-none-any.whl.metadata (3.9 kB)
Collecting antlr4-python3-runtime==4.9.* (from hydra-core>=1.1.2->pytorch-lifestream)
  Downloading antlr4-python3-runtime-4.9.3.tar.gz (117 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.0/117.0 kB[0m [31m6

In [2]:
# data preprocessing
import os
import numpy as np
import pandas as pd
import pickle

# misc
from tqdm import tqdm
from functools import partial

# logging
import comet_ml

# classical ML
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from catboost import CatBoostClassifier

# basic deep learning libs
import torch
import pytorch_lightning as pl
import torchmetrics

# ptls
from ptls.nn import TrxEncoder, RnnSeqEncoder, TransformerEncoder, GptEncoder, Head
from ptls.frames import PtlsDataModule
from ptls.frames.coles import CoLESModule
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames.cpc import CpcModule
from ptls.frames.cpc import CpcDataset
from ptls.frames.gpt import GptDataset
from ptls.frames.supervised import SeqToTargetDataset, SequenceToTarget
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.datasets import inference_data_loader
from ptls.frames.inference_module import InferenceModule
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.preprocessing import PandasDataPreprocessor
from ptls.data_load.utils import collate_feature_dict
from ptls.frames.inference_module import InferenceModule

In [3]:
def seed_everything(seed):
    """Seed every random number generator this notebook can draw from.

    Seeds Python's built-in ``random`` module, NumPy's global generator and
    PyTorch (CPU and every CUDA device), then configures cuDNN for
    deterministic algorithm selection with autotuning switched off.

    Parameters
    ----------
    seed (int): Seed value shared by all generators.
    """
    import random  # local import: `random` is not part of the notebook's import cell

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Trade cuDNN speed for run-to-run reproducibility.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [5]:
comet_ml.login()

In [6]:
from pytorch_lightning.loggers import CometLogger

# Эксперименты.

**Данные:**

In [7]:
# Transaction log: a gzip-compressed CSV read directly from the Hugging Face dataset URL.
path_data = "https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/transactions_train.csv.gz?download=true"
data = pd.read_csv(path_data, compression="gzip")
# Last expression of the cell: renders the frame as the cell output.
data

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,33172,6,4,71.463
1,33172,6,35,45.017
2,33172,8,11,13.887
3,33172,9,11,15.983
4,33172,10,11,21.341
...,...,...,...,...
26450572,43300,727,25,7.602
26450573,43300,727,15,3.709
26450574,43300,727,1,6.448
26450575,43300,727,11,24.669


In [8]:
# Per-client labels (`bins` column), read from the same Hugging Face dataset.
path_target = "https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/train_target.csv?download=true"
target = pd.read_csv(path_target)
# Last expression of the cell: renders the frame as the cell output.
target

Unnamed: 0,client_id,bins
0,24662,2
1,1046,0
2,34089,2
3,34848,1
4,47076,3
...,...,...
29995,14303,1
29996,22301,2
29997,25731,0
29998,16820,3


In [9]:
# Hold out 10% of the labelled clients, stratified on `bins`; random_state fixes the split.
target_train, target_test = train_test_split(target, test_size=0.1, stratify=target["bins"], random_state=42)

In [10]:
# Keep only the transactions of clients that belong to each split (inner join on client_id).
# Only the `client_id` column of the target frames is joined, so no label column is added.
trx_data_train = pd.merge(data, target_train["client_id"], on="client_id", how="inner")
trx_data_test = pd.merge(data, target_test["client_id"], on="client_id", how="inner")

---

**Квантизация непрерывных признаков (опциональный шаг, нужен только для GPT):**

In [11]:
def digitize(input_array: np.array, q_count: int = 1, bins: np.array = None):
    """Map values to integer bucket ids using quantile bin edges.

    Parameters
    ----------
    input_array (np.array): Values to discretize.
    q_count (int): Number of quantile buckets; the ``q_count - 1`` inner quantiles
        become the bin edges. Only consulted when `bins` is None.
    bins (np.array): Precomputed bin edges. When given, they are applied as is;
        when None, edges are estimated from `input_array`. Default: None

    Returns
    -------
    out_array (np.array of ints): bucket id of every element of `input_array`
    bins (np.array of floats): the estimated edges; returned as the second item
        of a tuple only when the `bins` argument was None.
    """
    fit_bins = bins is None
    if fit_bins:
        quantile_levels = [step / q_count for step in range(1, q_count)]
        bins = np.quantile(input_array, q=quantile_levels, axis=0)

    bucket_ids = np.digitize(input_array, bins)

    return (bucket_ids, bins) if fit_bins else bucket_ids

In [12]:
BINS_NUM = 128

In [13]:
numeric_features = ["amount_rur"]

# Estimate the quantile bin edges on the training split only, then apply the same
# edges to the test split.
# NOTE: each column is overwritten with its bucket ids, so re-running this cell would
# quantize already-quantized values.
for feat in numeric_features:
    trx_data_train[feat], bins = digitize(trx_data_train[feat], q_count=BINS_NUM)
    trx_data_test[feat] = digitize(trx_data_test[feat], bins=bins)

In [14]:
import gc

gc.collect()

147

---

In [11]:
# ptls preprocessor configured with client_id as the sequence id, trans_date as the
# event time, one categorical column and one numerical column.
# NOTE(review): "none" presumably leaves trans_date untransformed — confirm in the ptls docs.
# return_records=False: the result is later converted with `.to_dict(orient="records")`,
# so a DataFrame is returned here.
preprocessor = PandasDataPreprocessor(
    col_id="client_id",
    col_event_time="trans_date",
    event_time_transformation="none",
    cols_category=["small_group"],
    cols_numerical=["amount_rur"],
    return_records=False,
)

In [12]:
# Fit the preprocessor on the training transactions only; the test split is transformed
# with the already fitted preprocessor.
data_train = preprocessor.fit_transform(trx_data_train)
data_test = preprocessor.transform(trx_data_test)

In [13]:
# Reduce each label frame to a Series named "target", ordered by client_id and
# re-indexed from 0.
# NOTE(review): presumably this ordering is what lines the labels up with the rows of the
# preprocessed sequences — verify against the preprocessor output.
target_train = (
    target_train
    .rename(columns={"bins": "target"})
    .sort_values(by="client_id")["target"]
    .reset_index(drop=True)
)
target_test = (
    target_test
    .rename(columns={"bins": "target"})
    .sort_values(by="client_id")["target"]
    .reset_index(drop=True)
)

In [14]:
# Convert each preprocessed frame into a list of per-row dicts.
data_train = data_train.to_dict(orient="records")
data_test = data_test.to_dict(orient="records")

---

**DenseCNN Aggregator Class:**

In [15]:
#---------------------------------------------------------------------------------------------------
# A modified version of https://github.com/pytorch/vision/blob/main/torchvision/models/densenet.py
#---------------------------------------------------------------------------------------------------

from collections import OrderedDict
from functools import partial
from typing import Any, Optional

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor


class DenseLayer(nn.Module):
    """One 1-D DenseNet layer: BN -> ReLU -> 1x1 conv -> BN -> ReLU -> width-3 conv.

    Args:
        num_input_features (int) - channels of the (concatenated) input
        growth_rate (int) - channels produced by this layer
        bn_size (int) - the bottleneck (1x1 conv) emits `bn_size * growth_rate` channels
        drop_rate (float) - dropout probability applied to the layer output (0 disables it)
    """

    def __init__(self,
                 num_input_features: int,
                 growth_rate: int,
                 bn_size: int,
                 drop_rate: float) -> None:
        super().__init__()

        bottleneck_channels = bn_size * growth_rate

        # Bottleneck: pointwise convolution over the concatenated inputs.
        self.norm1 = nn.BatchNorm1d(num_input_features)
        self.relu1 = nn.ReLU(inplace=True)
        self.conv1 = nn.Conv1d(num_input_features, bottleneck_channels,
                               kernel_size=1, bias=False)

        # Width-3 convolution; padding=1 keeps the sequence length unchanged.
        self.norm2 = nn.BatchNorm1d(bottleneck_channels)
        self.relu2 = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv1d(bottleneck_channels, growth_rate,
                               kernel_size=3, padding=1, bias=False)

        self.drop_rate = float(drop_rate)

    def bn_function(self, inputs: list[Tensor]) -> Tensor:
        """Concatenate `inputs` along the channel axis and run the bottleneck on them."""
        stacked = torch.cat(inputs, dim=1)
        return self.conv1(self.relu1(self.norm1(stacked)))

    def forward(self, input: Tensor) -> Tensor:
        """Compute the new feature maps from a tensor or a list of tensors."""
        prev_features = [input] if isinstance(input, Tensor) else input

        hidden = self.bn_function(prev_features)
        new_features = self.conv2(self.relu2(self.norm2(hidden)))

        if self.drop_rate > 0:
            new_features = F.dropout(new_features, p=self.drop_rate, training=self.training)

        return new_features


class DenseBlock(nn.ModuleDict):
    """Stack of densely connected 1-D layers, after
    "Densely Connected Convolutional Networks" (https://arxiv.org/pdf/1608.06993.pdf).

    Every layer receives the concatenation of the block input and the outputs of
    all preceding layers; the block returns the concatenation of all of them.

    Args:
        num_layers (int) - number of DenseLayers in the block
        num_input_features (int) - channels of the block input
        bn_size (int) - bottleneck width multiplier (bottleneck has bn_size * k channels)
        growth_rate (int) - channels added by each DenseLayer (`k` in the paper)
        drop_rate (float) - dropout rate after each dense layer
    """

    def __init__(self,
                 num_layers: int,
                 num_input_features: int,
                 bn_size: int,
                 growth_rate: int,
                 drop_rate: float) -> None:
        super().__init__()

        for layer_idx in range(num_layers):
            # Layer `layer_idx` sees the block input plus `layer_idx` earlier outputs.
            self.add_module(
                "denselayer%d" % (layer_idx + 1),
                DenseLayer(
                    num_input_features + layer_idx * growth_rate,
                    growth_rate=growth_rate,
                    bn_size=bn_size,
                    drop_rate=drop_rate
                ),
            )

        # Re-initialise all convolutions and batch norms of the block.
        for module in self.modules():
            if isinstance(module, nn.BatchNorm1d):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)
            elif isinstance(module, nn.Conv1d):
                nn.init.kaiming_normal_(module.weight)

    def forward(self, init_features: Tensor) -> Tensor:
        """Run all layers and return every feature map concatenated on the channel axis."""
        features = [init_features]
        for layer in self.values():
            features.append(layer(features))
        return torch.cat(features, 1)

In [16]:
from ptls.data_load.padded_batch import PaddedBatch
import torch.nn as nn


class DenseCNN_Aggregator(TrxEncoder):
    """TrxEncoder followed by a DenseNet-style 1-D convolutional aggregator
    (based on "Densely Connected Convolutional Networks", https://arxiv.org/pdf/1608.06993.pdf).

    The forward pass is: TrxEncoder -> zero out padded positions -> Conv1d (width 3)
    -> DenseBlock -> BatchNorm1d.

    Input and output are `PaddedBatch` objects. The input payload is (B, L, ...) as
    expected by TrxEncoder; the output payload is (B, L, output_size), where
    B is the batch size, L is the padded sequence length (unchanged, because every
    convolution either has width 1 or width 3 with padding 1) and
    `output_size = proj_channels * (1 + dense_layers_num)`.
    The output `seq_lens` are the input lengths increased by `dense_layers_num + 1`
    and capped at L.

    Parameters

    proj_channels (int):
        The number of channels after the first conv layer. It is also used as the
        growth rate of the DenseBlock.

    dense_layers_num (int):
        How many DenseLayers are in DenseBlock

    dense_block_drop_rate (float):
        Dropout rate after each DenseLayer in DenseBlock

    embeddings, numeric_values, custom_embeddings, embeddings_noise, emb_dropout,
    spatial_dropout, use_batch_norm, use_batch_norm_with_lens, orthogonal_init,
    linear_projection_size, out_of_index:
        Forwarded unchanged to TrxEncoder; see the TrxEncoder description.
        Note that `use_batch_norm` defaults to False here.

    norm_embeddings:
        Forwarded to TrxEncoder. Keep default value for this parameter

    clip_replace_value:
        Forwarded to TrxEncoder. Not used. Keep default value for this parameter

    positions:
        Forwarded to TrxEncoder. Not used. Keep default value for this parameter

    time_values:
        Not forwarded to TrxEncoder and has no effect; a warning is emitted when a
        non-None value is passed.
    """

    def __init__(self,
                 proj_channels,
                 dense_layers_num,
                 dense_block_drop_rate=0.,
                 embeddings=None,
                 numeric_values=None,
                 custom_embeddings=None,
                 time_values=None,
                 embeddings_noise: float = 0,
                 norm_embeddings=None,
                 use_batch_norm=False,
                 use_batch_norm_with_lens=False,
                 clip_replace_value=None,
                 positions=None,
                 emb_dropout=0,
                 spatial_dropout=False,
                 orthogonal_init=False,
                 linear_projection_size=0,
                 out_of_index: str = 'clip'
                ):

        if time_values is not None:
            # The argument was silently dropped before; make that visible to the caller.
            import warnings
            warnings.warn(
                "`time_values` is not forwarded to TrxEncoder and has no effect.",
                UserWarning,
                stacklevel=2,
            )

        super().__init__(
            embeddings=embeddings,
            numeric_values=numeric_values,
            custom_embeddings=custom_embeddings,
            embeddings_noise=embeddings_noise,
            norm_embeddings=norm_embeddings,
            use_batch_norm=use_batch_norm,
            use_batch_norm_with_lens=use_batch_norm_with_lens,
            clip_replace_value=clip_replace_value,
            positions=positions,
            emb_dropout=emb_dropout,
            spatial_dropout=spatial_dropout,
            orthogonal_init=orthogonal_init,
            linear_projection_size=linear_projection_size,
            out_of_index=out_of_index
        )

        # Width of the parent's output; `self.output_size` is overridden below and
        # describes the aggregator output instead.
        input_channels = super().output_size

        bn_size = 4

        self.proj_channels = proj_channels
        self.growth_rate = proj_channels
        self.dense_layers_num = dense_layers_num

        min_kernel_size = 3
        padding_for_min_ks = 1

        # All convolutional modules operate on (B, channels, L).
        self.conv = nn.Conv1d(in_channels=input_channels,
                              out_channels=proj_channels,
                              kernel_size=min_kernel_size,
                              padding=padding_for_min_ks,
                              bias=False)

        self.dense_block = DenseBlock(num_layers=dense_layers_num,
                                      num_input_features=proj_channels,
                                      bn_size=bn_size,
                                      growth_rate=self.growth_rate,
                                      drop_rate=dense_block_drop_rate)

        self.norm = nn.BatchNorm1d(self.output_size)

        nn.init.kaiming_normal_(self.conv.weight)
        nn.init.constant_(self.norm.weight, 1)
        nn.init.constant_(self.norm.bias, 0)

    def forward(self, pb: PaddedBatch):
        """Encode transactions and aggregate them with the convolutional stack.

        Returns a `PaddedBatch` with payload (B, L, output_size) and sequence
        lengths extended by `dense_layers_num + 1`, capped at L.
        """
        embeds = super().forward(pb)
        payload, seq_lens = embeds.payload, embeds.seq_lens
        max_len = payload.shape[1]

        # 1 at real positions, 0 at padding, so padded steps enter the convolutions as zeros.
        positions = torch.arange(max_len, device=payload.device)
        valid = positions[None, :] < seq_lens[:, None]
        masked_embeds = payload * valid[:, :, None].to(payload.dtype)

        # Conv1d / BatchNorm1d expect (B, channels, L); transpose in and back out.
        hidden = self.conv(torch.transpose(masked_embeds, 1, 2))
        hidden = self.norm(self.dense_block(hidden))
        agg_embeds = torch.transpose(hidden, 1, 2)

        # Clamp on the tensor itself instead of building a CPU tensor on every call.
        new_seq_lens = torch.clamp(seq_lens + (self.dense_layers_num + 1), max=max_len)

        return PaddedBatch(agg_embeds, new_seq_lens)

    @property
    def output_size(self):
        """Returns hidden size of output representation
        """
        return self.proj_channels + self.growth_rate * self.dense_layers_num

In [19]:
# seed_everything(0)

In [20]:
# device = "cuda:0"

In [21]:
# trx_encoder_params = dict(
#     embeddings_noise=0.003,
#     numeric_values={"amount_rur": "log"},
#     embeddings={
#         "trans_date": {"in": 800, "out": 16},
#         "small_group": {"in": 250, "out": 16},
#     },
#     proj_channels=128,
#     dense_layers_num=5,
#     dense_block_drop_rate=0.1
# )

# trx_encoder = DenseCNN_Aggregator(**trx_encoder_params).to(device)

In [None]:
# trx_encoder

In [None]:
# trx_encoder.eval()

# train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)

# for i, batch in tqdm(enumerate(train_loader)):
#     batch = batch.to(device)
#     embeds_batch = trx_encoder(batch)

#     # if i == 0:
#     #     # print(batch.payload)
#     #     print(batch.seq_lens)
#     #     print()
#     #     print(embeds_batch.payload[31, 2])
#     #     print()
#     #     print(embeds_batch.payload.shape)
#     #     print()
#     #     print(embeds_batch.seq_lens)

---

**Train sequence length check:**

In [17]:
# Plain TrxEncoder with the same embedding / numeric settings as the models trained later.
# It is only used in the next cell to read the per-client sequence lengths.
agg_encoder_params = dict(
    embeddings_noise=0.003,
    numeric_values={"amount_rur": "log"},
    embeddings={
        "trans_date": {"in": 800, "out": 16},
        "small_group": {"in": 250, "out": 16},
    }
)

trx_encoder = TrxEncoder(**agg_encoder_params)
# Last expression of the cell: moves the module to the GPU and displays its repr.
trx_encoder.to("cuda")

TrxEncoder(
  (embeddings): ModuleDict(
    (trans_date): NoisyEmbedding(
      800, 16, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
    (small_group): NoisyEmbedding(
      250, 16, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
  )
  (custom_embeddings): ModuleDict(
    (amount_rur): LogScaler()
  )
  (custom_embedding_batch_norm): RBatchNorm(
    (bn): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
)

In [18]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)

trx_encoder.eval()

seq_lens = []

# Only the sequence lengths are needed, so skip autograd bookkeeping for the forward passes.
with torch.no_grad():
    for batch in tqdm(train_loader):
        embeds_batch = trx_encoder(batch.to("cuda"))
        seq_lens.append(embeds_batch.seq_lens.detach().cpu().numpy())

seq_lens = np.concatenate(seq_lens)

# Maximum slice length: 70% of the 75th percentile of the training sequence lengths.
threshold = int(np.quantile(seq_lens, 0.75) * 0.7)

print("Max Length:", threshold)

211it [00:01, 107.35it/s]

Max Length: 683





---

# Aggregation with DenseCNN (DenseNet-like architecture) 

- **COLES:**

In [19]:
seed_everything(0)

**DataLoaders:**

In [20]:
def _coles_dataset(records):
    """Build the CoLES dataset used for both splits: sequences shorter than 30 events
    are filtered out and 5 slices of 30..683 events are sampled per sequence."""
    return ColesDataset(
        MemoryMapDataset(
            data=records,
            i_filters=[SeqLenFilter(min_seq_len=30)],
        ),
        splitter=SampleSlices(
            split_count=5,
            cnt_min=30,
            cnt_max=683,
        ),
    )


# Train and validation loaders share the same dataset recipe, worker count and batch size.
data = PtlsDataModule(
    train_data=_coles_dataset(data_train),
    train_num_workers=4,
    train_batch_size=64,
    valid_data=_coles_dataset(data_test),
    valid_num_workers=4,
    valid_batch_size=64
)

**Модель:**

In [21]:
N_EPOCHS = 20

In [22]:
# TrxEncoder settings plus the three DenseCNN_Aggregator-specific arguments
# (proj_channels, dense_layers_num, dense_block_drop_rate).
trx_encoder_params = dict(
    embeddings_noise=0.003,
    numeric_values={"amount_rur": "log"},
    embeddings={
        "trans_date": {"in": 800, "out": 16},
        "small_group": {"in": 250, "out": 16},
    },
    proj_channels=64,
    dense_layers_num=1,
    dense_block_drop_rate=0.1
)

trx_encoder = DenseCNN_Aggregator(**trx_encoder_params)

# GRU sequence encoder on top of the aggregated transaction embeddings.
seq_encoder = RnnSeqEncoder(
    trx_encoder=trx_encoder,
    hidden_size=512,
    type="gru"
)

# CoLES module; the optimizer and scheduler are passed as partials.
# NOTE(review): presumably the module supplies the parameters / optimizer to them — confirm in ptls.
# The cosine schedule spans N_EPOCHS (T_max=N_EPOCHS).
coles = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=3e-3, weight_decay=5e-4),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.CosineAnnealingLR, T_max=N_EPOCHS, eta_min=1e-6)
)

**Обучение:**

In [23]:
# Comet experiment logger for this run (project and experiment names are set here).
logger = CometLogger(project_name="EvS_SSL", experiment_name="CoLES_DenseCNN_Agg (1 layer)")

# Single-GPU Lightning trainer running for N_EPOCHS epochs.
trainer = pl.Trainer(
    logger=logger,
    max_epochs=N_EPOCHS,
    accelerator="gpu",
    devices=1,
    enable_progress_bar=True
)

In [24]:
trainer.fit(coles, data)

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/askoro/evs-ssl/7e472c965d3c40cf92ab1b55ea8231c3

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/kaggle/working' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

  self.pid = os.fork()


Training: |          | 0/? [00:00<?, ?it/s]

The following arguments were not expected: --md5 --explicit
Run with --help for more information.

  self.pid = os.fork()
  self.pid = os.fork()


Validation: |          | 0/? [00:00<?, ?it/s]

  self.pid = os.fork()


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : CoLES_DenseCNN_Agg (1 layer)
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/askoro/evs-ssl/7e472c965d3c40cf92ab1b55ea8231c3
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     loss [1012]             : (27.952375411987305, 754.5553588867188)
[1;38;5;39mCOMET INFO:[0m     seq_len [168]           : (325.015625, 381.5031433105469)
[1;38;5;39mCOMET INFO:[0m     valid/recall_top_k [20] : (0.9327602982521057, 0.9703243374824524)
[1;38;5;39mCOMET INFO:[0m   Others:
[1;38;5;39mCOMET INFO:

In [25]:
trainer.logged_metrics

{'loss': tensor(32.8477),
 'seq_len': tensor(346.4679),
 'valid/recall_top_k': tensor(0.9703)}

In [56]:
# Persist the weights of the trained sequence encoder.
# NOTE(review): the file name says "win_agg" although this section trains the DenseCNN
# aggregator — confirm the name is intended.
torch.save(seq_encoder.state_dict(), "coles_enc_win_agg.pt")

**Измерим качество на тесте (catboost поверх эмбеддингов):**

In [26]:
encoder = coles.seq_encoder

device = "cuda:0"

encoder.to(device)

RnnSeqEncoder(
  (trx_encoder): DenseCNN_Aggregator(
    (embeddings): ModuleDict(
      (trans_date): NoisyEmbedding(
        800, 16, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (small_group): NoisyEmbedding(
        250, 16, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
    )
    (custom_embeddings): ModuleDict(
      (amount_rur): LogScaler()
    )
    (conv): Conv1d(33, 64, kernel_size=(3,), stride=(1,), padding=(1,), bias=False)
    (dense_block): DenseBlock(
      (denselayer1): DenseLayer(
        (norm1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu1): ReLU(inplace=True)
        (conv1): Conv1d(64, 256, kernel_size=(1,), stride=(1,), bias=False)
        (norm2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu2): ReLU(inplace=True)
        (conv2): Conv1d(256, 64, kernel_size=(3,), stride=(1,), padding=(1,), bias=False)
      )
  

In [27]:
from tqdm import tqdm

seed_everything(0)

In [28]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)
encoder.eval()

# Inference only: no autograd graph, and the per-batch results are collected in a list
# and concatenated once instead of re-copying the growing array on every batch.
train_embeds_chunks = []
with torch.no_grad():
    for batch in tqdm(train_loader):
        train_embeds_batch = encoder(batch.to(device))
        train_embeds_chunks.append(train_embeds_batch.detach().cpu().numpy())

# Stays None when the loader yields no batches.
train_embeds = np.concatenate(train_embeds_chunks, axis=0) if train_embeds_chunks else None

train_embeds

211it [00:27,  7.79it/s]


array([[ 0.15396439, -0.00824383,  0.9824757 , ...,  0.22254878,
         0.10625609, -0.12774293],
       [-0.02088743, -0.01378264,  0.99124944, ...,  0.01794863,
         0.6512662 , -0.41989076],
       [ 0.33791104, -0.01696401,  0.985445  , ..., -0.72933286,
        -0.35175878,  0.33619013],
       ...,
       [-0.1617634 , -0.00712722,  0.98667777, ...,  0.3526152 ,
        -0.6998682 ,  0.21967801],
       [-0.5796866 , -0.02088821,  0.9805125 , ..., -0.2539551 ,
        -0.52717936,  0.36146232],
       [ 0.01590348, -0.02165814,  0.996348  , ..., -0.02953346,
        -0.80922467, -0.3922859 ]], dtype=float32)

In [29]:
test_loader = inference_data_loader(data_test, num_workers=0, batch_size=128)
encoder.eval()

# Inference only: no autograd graph, and the per-batch results are collected in a list
# and concatenated once instead of re-copying the growing array on every batch.
test_embeds_chunks = []
with torch.no_grad():
    for batch in tqdm(test_loader):
        test_embeds_batch = encoder(batch.to(device))
        test_embeds_chunks.append(test_embeds_batch.detach().cpu().numpy())

# Stays None when the loader yields no batches.
test_embeds = np.concatenate(test_embeds_chunks, axis=0) if test_embeds_chunks else None

test_embeds

24it [00:02,  8.48it/s]


array([[ 0.13005133, -0.01585356,  0.9950963 , ...,  0.13523263,
        -0.6571497 , -0.32572022],
       [-0.25035775,  0.01244894,  0.9629091 , ..., -0.34351724,
         0.46843624,  0.07756703],
       [-0.18567179, -0.00506412,  0.95833033, ..., -0.09388069,
         0.81089926, -0.31123817],
       ...,
       [-0.06738189, -0.02238828,  0.9931964 , ..., -0.51948065,
         0.26539767, -0.00247834],
       [-0.14859807, -0.01070064,  0.99205923, ..., -0.48512596,
        -0.34368044,  0.01964684],
       [-0.03908173, -0.01525925,  0.9880635 , ..., -0.08620451,
         0.88298523, -0.45523974]], dtype=float32)

In [30]:
# Multi-class CatBoost classifier trained on GPU device 0 with a fixed random_state.
clf = CatBoostClassifier(loss_function='MultiClass', task_type="GPU", devices='0', random_state=0)

# Fit on the frozen encoder embeddings of the training clients.
# NOTE(review): plot_file presumably writes the training plot to this HTML file — confirm in the CatBoost docs.
clf.fit(train_embeds, target_train, plot_file="catboost_log.html")



Learning rate set to 0.12714
0:	learn: 1.2951100	total: 14.9s	remaining: 4h 8m 37s
1:	learn: 1.2294829	total: 14.9s	remaining: 2h 4m 16s
2:	learn: 1.1784094	total: 15s	remaining: 1h 22m 49s
3:	learn: 1.1362391	total: 15s	remaining: 1h 2m 6s
4:	learn: 1.1014551	total: 15s	remaining: 49m 40s
5:	learn: 1.0728994	total: 15s	remaining: 41m 22s
6:	learn: 1.0487682	total: 15s	remaining: 35m 27s
7:	learn: 1.0281107	total: 15s	remaining: 31m
8:	learn: 1.0108543	total: 15s	remaining: 27m 33s
9:	learn: 0.9955181	total: 15s	remaining: 24m 47s
10:	learn: 0.9817943	total: 15s	remaining: 22m 32s
11:	learn: 0.9702491	total: 15.1s	remaining: 20m 39s
12:	learn: 0.9603687	total: 15.1s	remaining: 19m 3s
13:	learn: 0.9512436	total: 15.1s	remaining: 17m 41s
14:	learn: 0.9428155	total: 15.1s	remaining: 16m 30s
15:	learn: 0.9360982	total: 15.1s	remaining: 15m 28s
16:	learn: 0.9293070	total: 15.1s	remaining: 14m 33s
17:	learn: 0.9240031	total: 15.1s	remaining: 13m 44s
18:	learn: 0.9188138	total: 15.1s	remainin

<catboost.core.CatBoostClassifier at 0x789aa9e55810>

In [31]:
# Class predictions (for accuracy) and class probabilities (for ROC-AUC) on the held-out clients.
test_pred = clf.predict(test_embeds)
test_proba = clf.predict_proba(test_embeds)

In [32]:
# Accuracy on predicted classes; ROC-AUC is one-vs-rest over the class probabilities,
# averaged with class-support weights.
print("Accuracy:", accuracy_score(target_test, test_pred))
print("ROC-AUC:", roc_auc_score(target_test, test_proba, average="weighted", multi_class="ovr"))

Accuracy: 0.6016666666666667
ROC-AUC: 0.8500901151589281


In [3]:
arr = np.array([0.8500901151589281, 0.8503190639685215, 0.8467990227300057])

arr.mean(), arr.std()

(0.8490694006191518, 0.0016081181981840342)

- COLES embeds + Catboost:
  - Accuracy: `0.6133333333333333`, `0.606`, `0.5933333333333334`, avg: `0.6042 +- 0.0083`
  - ROC-AUC: `0.8490542004456147`, `0.848260886697585`, `0.8472952867923927`, avg: `0.8482 +- 0.0007`

---

- COLES embeds + DenseCNN_Agg (1 layer) + Catboost:
  - Accuracy: `0.6016666666666667`, `0.5996666666666667`, `0.607`, avg: `0.6028 +- 0.0031`
  - ROC-AUC: `0.8500901151589281`, `0.8503190639685215`, `0.8467990227300057`, avg: `0.8491 +- 0.0016`

---

- COLES embeds + DenseCNN_Agg (2 layers) + Catboost:
  - Accuracy: `0.6016666666666667`, `0.6083333333333333`, `0.5966666666666667`, avg: `0.6022 +- 0.0048`
  - ROC-AUC: `0.8509578051936223`, `0.8468051481367529`, `0.8454724366680251`, avg: `0.8477 +- 0.0023`

---

- COLES embeds + DenseCNN_Agg (4 layers) + Catboost:
  - Accuracy: `0.6013333333333334`, `0.6`, `0.6016666666666667`, avg: `0.601 +- 0.0007`
  - ROC-AUC: `0.8463081214073236`, `0.8436575166251925`, `0.8474182937271871`, avg: `0.8458 +- 0.0016`

---

**Train sequence length check:**

In [17]:
# Plain TrxEncoder with the same embedding / numeric settings as the trained models.
# It is only used in the next cell to read the per-client sequence lengths.
agg_encoder_params = dict(
    embeddings_noise=0.003,
    numeric_values={"amount_rur": "log"},
    embeddings={
        "trans_date": {"in": 800, "out": 16},
        "small_group": {"in": 250, "out": 16},
    }
)

trx_encoder = TrxEncoder(**agg_encoder_params)
# Last expression of the cell: moves the module to the GPU and displays its repr.
trx_encoder.to("cuda")

TrxEncoder(
  (embeddings): ModuleDict(
    (trans_date): NoisyEmbedding(
      800, 16, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
    (small_group): NoisyEmbedding(
      250, 16, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
  )
  (custom_embeddings): ModuleDict(
    (amount_rur): LogScaler()
  )
  (custom_embedding_batch_norm): RBatchNorm(
    (bn): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
)

In [19]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)

trx_encoder.eval()

seq_lens = []

# Only the sequence lengths are needed, so skip autograd bookkeeping for the forward passes.
with torch.no_grad():
    for batch in tqdm(train_loader):
        embeds_batch = trx_encoder(batch.to("cuda"))
        seq_lens.append(embeds_batch.seq_lens.detach().cpu().numpy())

seq_lens = np.concatenate(seq_lens)

# Maximum length: the 60th percentile of the training sequence lengths.
threshold = int(np.quantile(seq_lens, 0.6))

print("Max Length:", threshold)

211it [00:01, 122.10it/s]

Max Length: 863





---

- **CPC modeling:**

---

**Скорректируем класс CpcModule так, чтобы при работе CPC не было утечек данных (data leaks):**

In [17]:
import torch
from torch import nn as nn
from torch.nn import functional as F


class CPC_ShiftedLoss(nn.Module):
    """CPC loss whose prediction targets are pushed `shift` positions further ahead.

    For forward step ``i`` (1-based) the mapped context vector at position ``t`` is
    scored against the base embedding at position ``t + i + shift`` (the positive)
    and against ``n_negatives`` base embeddings sampled from the non-padded
    positions of the whole batch (the negatives).

    The `embeddings` argument of `forward` / `cpc_accuracy` is a triple
    ``(base_embeddings, _, mapped_ctx_embeddings)`` whose first and last items expose
    `.payload` and `.seq_lens`; payloads are (B, L, E) and (B, L, E, n_steps).

    Args:
        n_negatives (int): Number of negatives sampled per batch row.
        n_forward_steps (int): Stored only; the number of steps actually used is
            read from the last dimension of the mapped context payload.
        shift (int): Extra offset between context and target positions.
            ``None`` is treated as 0 (no extra offset).
    """

    def __init__(self, n_negatives=None, n_forward_steps=None, shift=None):
        super().__init__()
        self.n_negatives = n_negatives
        self.n_forward_steps = n_forward_steps
        # A None shift used to fail with a TypeError in the slicing arithmetic below.
        self.shift = 0 if shift is None else shift

    @staticmethod
    def _length_mask(seq_lens, max_seq_len, device):
        """Return a (B, max_seq_len) float mask: 1 at real positions, 0 at padding."""
        positions = torch.arange(max_seq_len, device=device).unsqueeze(0)
        return (positions < seq_lens.to(device).unsqueeze(1)).float()

    def _get_preds(self, base_embeddings, mapped_ctx_embeddings):
        """Return per-step lists of positive scores (B, T_i) and negative scores (B, T_i, n_negatives).

        ``T_i = max(L - i - shift, 0)``: a step that does not fit into the padded
        length yields empty tensors instead of mismatched slices.
        """
        batch_size, max_seq_len, emb_size = base_embeddings.payload.shape
        _, _, _, n_forward_steps = mapped_ctx_embeddings.payload.shape
        device = mapped_ctx_embeddings.payload.device

        len_mask = self._length_mask(mapped_ctx_embeddings.seq_lens, max_seq_len, device)

        # Negatives are drawn from every non-padded position of the batch;
        # each batch row samples from the same pool.
        possible_negatives = base_embeddings.payload.reshape(batch_size * max_seq_len, emb_size)
        sampling_weights = len_mask.unsqueeze(0).expand(batch_size, *len_mask.shape).clone()
        sampling_weights = sampling_weights.reshape(batch_size, -1)
        sample_ids = torch.multinomial(sampling_weights, self.n_negatives)
        neg_samples = possible_negatives[sample_ids]

        # Zero the context vectors that sit on padded positions.
        len_mask_exp = len_mask.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, emb_size, n_forward_steps)
        trimmed_mce = mapped_ctx_embeddings.payload.mul(len_mask_exp)

        positive_preds, neg_preds = [], []
        for i in range(1, n_forward_steps + 1):
            offset = i + self.shift
            # Clamp at 0: a negative slice end would silently select the wrong span.
            n_valid = max(max_seq_len - offset, 0)

            ce_i = trimmed_mce[:, :n_valid, :, i - 1]
            be_i = base_embeddings.payload[:, offset:offset + n_valid]

            positive_preds.append(ce_i.mul(be_i).sum(axis=-1))
            neg_preds.append(ce_i.matmul(neg_samples.transpose(-2, -1)))

        return positive_preds, neg_preds

    def forward(self, embeddings, _):
        """Mean over forward steps of the negative log-probability of the positive.

        Steps without any (context, target) pair are skipped; if every step is
        empty, a zero that is still attached to the autograd graph is returned.
        """
        base_embeddings, _, mapped_ctx_embeddings = embeddings
        positive_preds, neg_preds = self._get_preds(base_embeddings, mapped_ctx_embeddings)

        step_losses = []
        for positive_pred_i, neg_pred_i in zip(positive_preds, neg_preds):
            if positive_pred_i.numel() == 0:
                # The mean of an empty tensor is NaN and would poison the total loss.
                continue
            logits = torch.cat([positive_pred_i.unsqueeze(-1), neg_pred_i], dim=-1)
            step_losses.append(-F.log_softmax(logits, dim=-1)[:, :, 0].mean())

        if not step_losses:
            return mapped_ctx_embeddings.payload.sum() * 0.0

        return torch.stack(step_losses).mean()

    def cpc_accuracy(self, embeddings, _):
        """Share of non-padded target positions where the positive outscores all negatives.

        Returns 0.0 when there is no target position to evaluate.
        """
        base_embeddings, _, mapped_ctx_embeddings = embeddings
        positive_preds, neg_preds = self._get_preds(base_embeddings, mapped_ctx_embeddings)

        _, max_seq_len, _ = base_embeddings.payload.shape
        device = mapped_ctx_embeddings.payload.device
        len_mask = self._length_mask(mapped_ctx_embeddings.seq_lens, max_seq_len, device)

        total, accurate = 0, 0

        for i, (positive_pred_i, neg_pred_i) in enumerate(zip(positive_preds, neg_preds)):
            # Mask of the target positions of step i + 1.
            i_mask = len_mask[:, (self.shift + i + 1):max_seq_len]
            total += i_mask.sum().item()

            beats_negative = positive_pred_i.unsqueeze(-1).expand(*neg_pred_i.shape) > neg_pred_i
            beats_all = beats_negative.sum(dim=-1) == self.n_negatives
            accurate += (beats_all * i_mask).sum().item()

        return accurate / total if total > 0 else 0.0

In [18]:
import torch

from ptls.frames.abs_module import ABSModule
from ptls.frames.cpc.metrics.cpc_accuracy import CpcAccuracy
from ptls.nn.seq_encoder import RnnSeqEncoder
from ptls.data_load.padded_batch import PaddedBatch


class CpcModule(ABSModule):
    """Contrastive Predictive Coding ([CPC](https://arxiv.org/abs/1807.03748))

    Notebook-local variant that shadows `ptls.frames.cpc.CpcModule` imported at
    the top of the notebook: the loss is always a `CPC_ShiftedLoss` whose
    `shift` is derived from `seq_encoder.trx_encoder.dense_layers_num`.

    The original sequence is encoded by `seq_encoder.trx_encoder`.
    The hidden representation `z` is an embedding of each individual transaction.
    Next, `seq_encoder.seq_encoder` calculates the `context` from `z`.
    One linear predictor per forward step maps the context to a predicted
    transaction embedding.
    The loss pulls each prediction towards the actual future transaction embedding.
    Negative sampling is used to avoid the trivial solution.

    Parameters
        validation_metric:
            Keep None. `CpcAccuracy` built on the module's loss is used by default.
        seq_encoder:
            Required. An `RnnSeqEncoder` whose `trx_encoder` exposes
            `dense_layers_num` and `output_size`.
        head:
            Not used
        n_negatives:
            Number of negatives, forwarded to `CPC_ShiftedLoss`.
        n_forward_steps:
            Number of future steps to predict, forwarded to `CPC_ShiftedLoss`;
            one linear predictor is created per step.
        optimizer_partial:
            optimizer init partial. Network parameters are missed.
        lr_scheduler_partial:
            scheduler init partial. Optimizer is missed.

    Raises
        ValueError: `seq_encoder` is None.
        NotImplementedError: `seq_encoder` is not an `RnnSeqEncoder`.

    """
    def __init__(self, validation_metric=None,
                       seq_encoder=None,
                       head=None,
                       n_negatives=40, n_forward_steps=6,
                       optimizer_partial=None,
                       lr_scheduler_partial=None):

        # Validate first: everything below dereferences `seq_encoder`, so the
        # default `None` used to fail with an opaque AttributeError.
        if seq_encoder is None:
            raise ValueError('CpcModule requires `seq_encoder`, got None')
        if not isinstance(seq_encoder, RnnSeqEncoder):
            raise NotImplementedError(f'Only rnn encoder supported in CpcModule. Found {type(seq_encoder)}')

        # (1 + dense_layers_num) stacked convolutions, each widening the window
        # by (kernel_size - 1) positions.
        # NOTE(review): presumably the shift keeps prediction targets outside
        # the encoder's receptive field — confirm against CPC_ShiftedLoss.
        min_kernel_size = 3
        shift = (min_kernel_size - 1) * (1 + seq_encoder.trx_encoder.dense_layers_num)

        loss = CPC_ShiftedLoss(n_negatives=n_negatives, n_forward_steps=n_forward_steps, shift=shift)

        if validation_metric is None:
            validation_metric = CpcAccuracy(loss)

        # CPC needs the per-step RNN outputs, not a single pooled vector.
        seq_encoder.seq_encoder.is_reduce_sequence = False

        super().__init__(validation_metric,
                         seq_encoder,
                         loss,
                         optimizer_partial,
                         lr_scheduler_partial)

        # Called after the base initializer (the order used in the Lightning
        # docs) so that nothing set here can be reset by `super().__init__()`.
        self.save_hyperparameters('n_negatives', 'n_forward_steps')

        linear_size = self.seq_encoder.trx_encoder.output_size
        embedding_size = self.seq_encoder.embedding_size
        self._linears = torch.nn.ModuleList([torch.nn.Linear(embedding_size, linear_size)
                                             for _ in range(loss.n_forward_steps)])

    @property
    def metric_name(self):
        """Name under which the validation metric is reported."""
        return 'cpc_accuracy'

    @property
    def is_requires_reduced_sequence(self):
        """CPC works on per-step outputs, so the sequence must not be reduced."""
        return False

    def shared_step(self, x, y):
        """Encode a batch and build the per-step context predictions.

        Returns:
            ``((base_embeddings, context_embeddings, mapped_ctx_embeddings), y)``
            where ``mapped_ctx_embeddings`` stacks the output of every linear
            predictor along a new dimension 3 and keeps the context `seq_lens`.
        """
        trx_encoder = self._seq_encoder.trx_encoder
        seq_encoder = self._seq_encoder.seq_encoder

        base_embeddings = trx_encoder(x)
        context_embeddings = seq_encoder(base_embeddings)

        # One projection of the context per forward step.
        mapped = torch.stack(
            [linear(context_embeddings.payload) for linear in self._linears],
            dim=3,
        )
        mapped_ctx_embeddings = PaddedBatch(mapped, context_embeddings.seq_lens)

        return (base_embeddings, context_embeddings, mapped_ctx_embeddings), y

---

In [35]:
# Fix the seed before the CPC data module and model are built.
# NOTE(review): assumes seed_everything (defined/imported earlier in the notebook) seeds every RNG in use.
seed_everything(17)

**DataLoaders:**

In [36]:
# Train and validation splits share the same CpcDataset length bounds.
_cpc_len_bounds = {"min_len": 863, "max_len": 904}

data = PtlsDataModule(
    train_data=CpcDataset(MemoryMapDataset(data=data_train), **_cpc_len_bounds),
    train_num_workers=4,
    train_batch_size=64,
    valid_data=CpcDataset(MemoryMapDataset(data=data_test), **_cpc_len_bounds),
    valid_num_workers=4,
    valid_batch_size=64,
)

**Модель:**

In [37]:
# Number of training epochs; passed to pl.Trainer(max_epochs=...) below.
N_EPOCHS = 20

In [38]:
# Settings of the DenseCNN_Aggregator transaction encoder (2 dense layers).
trx_encoder_params = {
    "embeddings_noise": 0.003,
    "numeric_values": {"amount_rur": "log"},
    "embeddings": {
        "trans_date": {"in": 800, "out": 64},
        "small_group": {"in": 250, "out": 64},
    },
    "proj_channels": 192,
    "dense_layers_num": 2,
    "dense_block_drop_rate": 0.1,
}

trx_encoder = DenseCNN_Aggregator(**trx_encoder_params)

# GRU on top of the transaction encoder.
seq_encoder = RnnSeqEncoder(trx_encoder=trx_encoder, hidden_size=1024, type="gru")

# CPC module: Adam with lr=2e-3, learning rate halved every 5 scheduler steps.
cpc = CpcModule(
    seq_encoder=seq_encoder,
    n_forward_steps=6,
    n_negatives=40,
    optimizer_partial=partial(torch.optim.Adam, lr=2e-3),
    lr_scheduler_partial=partial(
        torch.optim.lr_scheduler.StepLR,
        step_size=5,
        gamma=0.5,
    ),
)

**Обучение:**

In [39]:
# Comet experiment that receives the training metrics.
logger = CometLogger(
    project_name="EvS_SSL",
    experiment_name="CPC_modeling_DenseCNN_Agg (2 layers)",
)

# Single-GPU trainer running for N_EPOCHS epochs.
trainer = pl.Trainer(
    max_epochs=N_EPOCHS,
    accelerator="gpu",
    devices=1,
    enable_progress_bar=True,
    logger=logger,
)

In [40]:
# Train the CPC module on the data module built above.
trainer.fit(cpc, data)

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/askoro/evs-ssl/7b35035b943c46b1a4b2ebbffc889f3d

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/kaggle/working' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]


os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock.



Training: |          | 0/? [00:00<?, ?it/s]

The following arguments were not expected: --md5 --explicit
Run with --help for more information.


os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock.


os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock.



Validation: |          | 0/? [00:00<?, ?it/s]


os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock.



Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : CPC_modeling_DenseCNN_Agg (2 layers)
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/askoro/evs-ssl/7b35035b943c46b1a4b2ebbffc889f3d
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     loss [1012]             : (0.2705286741256714, 8.171150207519531)
[1;38;5;39mCOMET INFO:[0m     seq_len [168]           : (804.890625, 848.046875)
[1;38;5;39mCOMET INFO:[0m     valid/cpc_accuracy [20] : (0.8951330184936523, 0.9632934927940369)
[1;38;5;39mCOMET INFO:[0m   Others:
[1;38;5;39mCOMET INFO:

In [41]:
# Show the metrics held by the trainer after fitting (loss, seq_len, valid/cpc_accuracy).
trainer.logged_metrics

{'loss': tensor(0.3506),
 'seq_len': tensor(827.8393),
 'valid/cpc_accuracy': tensor(0.9633)}

In [39]:
# Save only the weights (state_dict) of the sequence encoder that was passed to CpcModule.
torch.save(seq_encoder.state_dict(), "cpc_enc_dense_cnn.pt")

**Измерим качество на тесте (catboost поверх эмбеддингов):**

In [42]:
# Reuse the encoder held by the trained CPC module for embedding extraction.
encoder = cpc.seq_encoder

# Same device as before when CUDA is present; falls back to CPU instead of failing otherwise.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

encoder.to(device)

RnnSeqEncoder(
  (trx_encoder): DenseCNN_Aggregator(
    (embeddings): ModuleDict(
      (trans_date): NoisyEmbedding(
        800, 64, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (small_group): NoisyEmbedding(
        250, 64, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
    )
    (custom_embeddings): ModuleDict(
      (amount_rur): LogScaler()
    )
    (conv): Conv1d(129, 192, kernel_size=(3,), stride=(1,), padding=(1,), bias=False)
    (dense_block): DenseBlock(
      (denselayer1): DenseLayer(
        (norm1): BatchNorm1d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu1): ReLU(inplace=True)
        (conv1): Conv1d(192, 768, kernel_size=(1,), stride=(1,), bias=False)
        (norm2): BatchNorm1d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu2): ReLU(inplace=True)
        (conv2): Conv1d(768, 192, kernel_size=(3,), stride=(1,), padding=(1,), bias=False)
     

In [43]:
# `encoder` is `cpc.seq_encoder`, so this flips the flag on the trained module in place.
# NOTE(review): presumably makes the RNN return one embedding per sequence, as needed for the CatBoost features — confirm in ptls.
encoder.seq_encoder.is_reduce_sequence = True

In [44]:
from tqdm import tqdm

# Re-seed before the embedding extraction and the downstream classifier.
# NOTE(review): assumes seed_everything (defined/imported earlier in the notebook) seeds every RNG in use.
seed_everything(17)

In [45]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=32)
encoder.eval()

# Pure inference: no_grad avoids building an autograd graph for every batch.
train_embed_chunks = []
with torch.no_grad():
    for batch in tqdm(train_loader):
        train_embeds_batch = encoder(batch.to(device))
        train_embed_chunks.append(train_embeds_batch.cpu().numpy())

# Concatenate once at the end instead of re-copying the growing array on every batch.
# An empty loader keeps the original result of None.
train_embeds = np.concatenate(train_embed_chunks, axis=0) if train_embed_chunks else None

train_embeds

844it [02:06,  6.69it/s]


array([[-0.19236198, -0.9257678 , -0.9993739 , ..., -0.8364658 ,
        -0.7633165 ,  0.71915025],
       [-0.18395188, -0.9231096 , -0.9997367 , ..., -0.8411336 ,
        -0.76595587,  0.83083963],
       [-0.23846495, -0.92959446, -0.99837893, ..., -0.8200336 ,
        -0.7949756 ,  0.6901717 ],
       ...,
       [-0.2590136 , -0.9057442 , -0.9990099 , ..., -0.84757847,
        -0.7735652 ,  0.67060584],
       [-0.68857145, -0.834737  , -0.99949414, ..., -0.85303825,
        -0.75626755,  0.81188965],
       [-0.21464929, -0.90709597, -0.995453  , ..., -0.92244774,
        -0.7693458 ,  0.7733142 ]], dtype=float32)

In [46]:
test_loader = inference_data_loader(data_test, num_workers=0, batch_size=32)
encoder.eval()

# Pure inference: no_grad avoids building an autograd graph for every batch.
test_embed_chunks = []
with torch.no_grad():
    for batch in tqdm(test_loader):
        test_embeds_batch = encoder(batch.to(device))
        test_embed_chunks.append(test_embeds_batch.cpu().numpy())

# Concatenate once at the end instead of re-copying the growing array on every batch.
# An empty loader keeps the original result of None.
test_embeds = np.concatenate(test_embed_chunks, axis=0) if test_embed_chunks else None

test_embeds

94it [00:12,  7.65it/s]


array([[-0.56903213, -0.81663054, -0.99929726, ..., -0.86705196,
        -0.7523994 ,  0.79794776],
       [-0.13644867, -0.9385376 , -0.9989561 , ..., -0.82510287,
        -0.7734186 ,  0.6148582 ],
       [-0.55897814, -0.8921891 , -0.999957  , ..., -0.83955836,
        -0.7684724 ,  0.89551634],
       ...,
       [-0.51438695, -0.9029528 , -0.99988884, ..., -0.8451826 ,
        -0.75559133,  0.82137614],
       [-0.500284  , -0.8018694 , -0.99998313, ..., -0.8518412 ,
        -0.79223144,  0.8303297 ],
       [-0.26025862, -0.9096141 , -0.9997764 , ..., -0.8601666 ,
        -0.7632251 ,  0.9004124 ]], dtype=float32)

In [47]:
# Multi-class CatBoost on GPU 0 with a fixed seed, trained on the encoder embeddings.
catboost_params = {
    "loss_function": 'MultiClass',
    "task_type": "GPU",
    "devices": '0',
    "random_state": 17,
}
clf = CatBoostClassifier(**catboost_params)

clf.fit(train_embeds, target_train, plot_file="catboost_log.html")



Learning rate set to 0.12714
0:	learn: 1.3312173	total: 22ms	remaining: 22s
1:	learn: 1.2896230	total: 39.6ms	remaining: 19.8s
2:	learn: 1.2541429	total: 57.6ms	remaining: 19.1s
3:	learn: 1.2262351	total: 74.4ms	remaining: 18.5s
4:	learn: 1.2020231	total: 92ms	remaining: 18.3s
5:	learn: 1.1818020	total: 109ms	remaining: 18.1s
6:	learn: 1.1624173	total: 127ms	remaining: 18s
7:	learn: 1.1470054	total: 143ms	remaining: 17.7s
8:	learn: 1.1332275	total: 160ms	remaining: 17.6s
9:	learn: 1.1201670	total: 176ms	remaining: 17.4s
10:	learn: 1.1096930	total: 192ms	remaining: 17.2s
11:	learn: 1.0996943	total: 208ms	remaining: 17.1s
12:	learn: 1.0910289	total: 224ms	remaining: 17s
13:	learn: 1.0823558	total: 241ms	remaining: 16.9s
14:	learn: 1.0748636	total: 257ms	remaining: 16.9s
15:	learn: 1.0671636	total: 274ms	remaining: 16.8s
16:	learn: 1.0608045	total: 291ms	remaining: 16.8s
17:	learn: 1.0546594	total: 307ms	remaining: 16.7s
18:	learn: 1.0486549	total: 324ms	remaining: 16.7s
19:	learn: 1.0433

<catboost.core.CatBoostClassifier at 0x787cbf1d70a0>

In [48]:
# Class predictions (used for accuracy) and per-class probabilities (used for ROC-AUC) on the test embeddings.
test_pred = clf.predict(test_embeds)
test_proba = clf.predict_proba(test_embeds)

In [49]:
# Accuracy on hard labels.
test_accuracy = accuracy_score(target_test, test_pred)
print("Accuracy:", test_accuracy)

# Weighted one-vs-rest ROC-AUC on class probabilities.
test_roc_auc = roc_auc_score(
    target_test,
    test_proba,
    average="weighted",
    multi_class="ovr",
)
print("ROC-AUC:", test_roc_auc)

Accuracy: 0.5483333333333333
ROC-AUC: 0.8062473261548366


In [56]:
# ROC-AUC of the three "DenseCNN_Agg (2 layers)" runs, copied from the run outputs.
arr = np.array([
    0.8165423388188646,
    0.8062473261548366,
    0.8103653312204478,
])

# Mean and (population) standard deviation across the runs.
(np.mean(arr), np.std(arr))

(0.8110516653980496, 0.004230848015125139)

- CPC context embeds + Catboost:
  - Accuracy: `0.5773333333333334`, `0.5686666666666667`, `0.5826666666666667`, avg: `0.5762 +- 0.0058`
  - ROC-AUC: `0.830123007110738`, `0.8271157616313021`, `0.8343491131233265`, avg: `0.8305 +- 0.003`

---

- CPC context embeds + DenseCNN_Agg (1 layer) + Catboost:
  - Accuracy: `0.5756666666666667`, `0.5776666666666667`, `0.5786666666666667`, avg: `0.5773 +- 0.0012`
  - ROC-AUC: `0.8256578030654587`, `0.8262615841369518`, `0.8293920521840458`, avg: `0.8271 +- 0.0016`

---

- CPC context embeds + DenseCNN_Agg (2 layers) + Catboost:
  - Accuracy: `0.5646666666666667`, `0.5483333333333333`, `0.553333333333333`, avg: `0.5554 +- 0.0068`
  - ROC-AUC: `0.8165423388188646`, `0.8062473261548366`, `0.8103653312204478`, avg: `0.8111 +- 0.0042` 

---

- CPC context embeds + DenseCNN_Agg (4 layers) + Catboost:
  - Accuracy: `0.534`, `0.522`, `0.525`, avg: `0.527 +- 0.0051`
  - ROC-AUC: `0.787001935319396`, `0.7903974485036415`, `0.7886996919115188`, avg: `0.7887 +- 0.0014`

# Итоги.

| Method|Accuracy|ROC-AUC|
| --- |:---:|:---:|
| **Flattened Sequences**                    | 0.4921 ± 0.005        | 0.76 ± 0.0012   |
| **GRU (+ MLP)**                            | 0.6066 ± 0.0019       | 0.8479 ± 0.0013 |
| **CoLES**                                  | 0.6042 ± 0.0083       | 0.8482 ± 0.0007 |
| **CoLES embeds + DenseCNN_Agg (1 layer)**  | 0.6028 ± 0.0031       | 0.8491 ± 0.0016 |
| **CoLES embeds + DenseCNN_Agg (2 layers)** | 0.6022 ± 0.0048       | 0.8477 ± 0.0023 |
| **CoLES embeds + DenseCNN_Agg (4 layers)** | 0.601 ± 0.0007        | 0.8458 ± 0.0016 |
| **CPC Modeling**                           | 0.5762 ± 0.0058       | 0.8305 ± 0.003  |
| **CPC Modeling + DenseCNN_Agg (1 layer)**  | 0.5773 ± 0.0012       | 0.8271 ± 0.0016 |
| **CPC Modeling + DenseCNN_Agg (2 layers)** | 0.5554 ± 0.0068       | 0.8111 ± 0.0042 |
| **CPC Modeling + DenseCNN_Agg (4 layers)** | 0.527 ± 0.0051        | 0.7887 ± 0.0014 |
| **GPT2**                                   | 0.6146 ± 0.0075       | 0.852 ± 0.0029  |