# Импортируем необходимые библиотеки

In [1]:
!pip install pytorch-lifestream
!pip install comet_ml

Collecting pytorch-lifestream
  Downloading pytorch-lifestream-0.6.0.tar.gz (163 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.4/163.4 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting hydra-core>=1.1.2 (from pytorch-lifestream)
  Downloading hydra_core-1.3.2-py3-none-any.whl.metadata (5.5 kB)
Downloading hydra_core-1.3.2-py3-none-any.whl (154 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.5/154.5 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pytorch-lifestream
  Building wheel for pytorch-lifestream (pyproject.toml) ... [?25l[?25hdone
  Created wheel for pytorch-lifestream: filename=pytorch_lifestream-0.6.0-py3-none-any.whl size=274670 sha256=5b73d792ebb2d5ec1f9c3ff03737fbe21eddc10a5da1d15941352c105c6e7b11


In [2]:
# data preprocessing
import os
import numpy as np
import pandas as pd
import pickle

# misc
from tqdm import tqdm
from functools import partial

# logging
import comet_ml

# classical ML
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from catboost import CatBoostClassifier

# basic deep learning libs
import torch
import pytorch_lightning as pl
import torchmetrics

# ptls
from ptls.nn import TrxEncoder, RnnSeqEncoder, TransformerEncoder, GptEncoder, Head
from ptls.frames import PtlsDataModule
from ptls.frames.coles import CoLESModule
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames.cpc import CpcModule
from ptls.frames.cpc import CpcDataset
from ptls.frames.gpt import GptDataset
from ptls.frames.supervised import SeqToTargetDataset, SequenceToTarget
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.datasets import inference_data_loader
from ptls.frames.inference_module import InferenceModule
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.preprocessing import PandasDataPreprocessor
from ptls.data_load.utils import collate_feature_dict
from ptls.frames.inference_module import InferenceModule

In [3]:
def seed_everything(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [5]:
comet_ml.login()

In [6]:
from pytorch_lightning.loggers import CometLogger

# Эксперименты.

**Данные:**

In [7]:
path_data = "https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/transactions_train.csv.gz?download=true"
data = pd.read_csv(path_data, compression="gzip")
data

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,33172,6,4,71.463
1,33172,6,35,45.017
2,33172,8,11,13.887
3,33172,9,11,15.983
4,33172,10,11,21.341
...,...,...,...,...
26450572,43300,727,25,7.602
26450573,43300,727,15,3.709
26450574,43300,727,1,6.448
26450575,43300,727,11,24.669


In [8]:
path_target = "https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/train_target.csv?download=true"
target = pd.read_csv(path_target)
target

Unnamed: 0,client_id,bins
0,24662,2
1,1046,0
2,34089,2
3,34848,1
4,47076,3
...,...,...
29995,14303,1
29996,22301,2
29997,25731,0
29998,16820,3


In [9]:
target_train, target_test = train_test_split(target, test_size=0.1, stratify=target["bins"], random_state=42)

In [10]:
trx_data_train = pd.merge(data, target_train["client_id"], on="client_id", how="inner")
trx_data_test = pd.merge(data, target_test["client_id"], on="client_id", how="inner")

---

**Квантизация непрерывных признаков (опциональный шаг, нужен только для GPT):**

In [11]:
def digitize(input_array: np.array, q_count: int = 1, bins: np.array = None):
    """Quantile-based discretization function.

    Parameters:
    -------
    input_array (np.array): Input array.
    q_count (int): Amount of quantiles. Used only if input parameter `bins` is None.
    bins (np.array):
        If None, then calculate bins as quantiles of input array,
        otherwise only apply bins to input_array. Default: None

    Returns
    -------
    out_array (np.array of ints): discretized input_array
    bins (np.array of floats):
        Returned only if input parameter `bins` is None.
    """

    if bins is None:
        return_bins = True
        bins = np.quantile(input_array, q=[i / q_count for i in range(1, q_count)], axis=0)
    else:
        return_bins = False

    out_array = np.digitize(input_array, bins)

    if return_bins:
        return out_array, bins
    else:
        return out_array

In [12]:
BINS_NUM = 128

In [13]:
numeric_features = ["amount_rur"]

for feat in numeric_features:
    trx_data_train[feat], bins = digitize(trx_data_train[feat], q_count=BINS_NUM)
    trx_data_test[feat] = digitize(trx_data_test[feat], bins=bins)

In [14]:
import gc

gc.collect()

147

---

In [15]:
preprocessor = PandasDataPreprocessor(
    col_id="client_id",
    col_event_time="trans_date",
    event_time_transformation="none",
    cols_category=["small_group"],
    cols_numerical=["amount_rur"],
    return_records=False,
)

In [16]:
data_train = preprocessor.fit_transform(trx_data_train)
data_test = preprocessor.transform(trx_data_test)

In [17]:
target_train.rename(columns={"bins": "target"}, inplace=True)
target_test.rename(columns={"bins": "target"}, inplace=True)
target_train.sort_values(by="client_id", inplace=True)
target_test.sort_values(by="client_id", inplace=True)
target_train = target_train["target"]
target_test = target_test["target"]
target_train.reset_index(drop=True, inplace=True)
target_test.reset_index(drop=True, inplace=True)

In [18]:
data_train = data_train.to_dict(orient="records")
data_test = data_test.to_dict(orient="records")

---

**Convolution Aggregator Class:**

In [19]:
from ptls.data_load.padded_batch import PaddedBatch
import torch.nn as nn


class ConvAggregator(TrxEncoder):
    """The NN layer, a combination of TrxEncoder and Conv Layer (a window of #`agg_samples` transactions) 
       (works like nn.Sequential([TrxEncoder, Conv Window Aggregation])).
       
       The types of the input and output are `PaddedBatch` of shapes (B, L, T) and (B, L', T) respectively, where 
       B means batch_size,
       L/L' means the max length of a sequence of transactions in a batch (the length is the same as #trx)
       T means the dimension of a single transaction.

       Parameters
        agg_samples (int):
            The number of transactions in a sliding aggregation window (conv layer).

        use_window_attention (bool):
            If True, the attention layer will be applied to transactions in a sliding window before pooling.
            
        embeddings:
            You can find info about this param in TrxEncoder desc.
        
        numeric_values:
            You can find info about this param in TrxEncoder desc.

        embeddings_noise:
            You can find info about this param in TrxEncoder desc.
            
        emb_dropout:
            You can find info about this param in TrxEncoder desc.
            
        spatial_dropout:
            You can find info about this param in TrxEncoder desc.

        use_batch_norm:
            You can find info about this param in TrxEncoder desc.

        orthogonal_init:
            You can find info about this param in TrxEncoder desc.
            
        linear_projection_size:
            You can find info about this param in TrxEncoder desc.

        out_of_index:
            You can find info about this param in TrxEncoder desc.

        norm_embeddings:
            Keep default value for this parameter
        
        clip_replace_value:
            Not used. Keep default value for this parameter
        
        positions: 
            Not used. Keep default value for this parameter
       """

    def __init__(self,
                 agg_samples=3,
                 use_window_attention=False,
                 embeddings=None,
                 numeric_values=None,
                 custom_embeddings=None,
                 time_values=None,
                 embeddings_noise: float = 0,
                 norm_embeddings=None,
                 use_batch_norm=False,
                 use_batch_norm_with_lens=False,
                 clip_replace_value=None,
                 positions=None,
                 emb_dropout=0,
                 spatial_dropout=False,
                 orthogonal_init=False,
                 linear_projection_size=0,
                 out_of_index: str = 'clip',
                ):
        
        super().__init__(
            embeddings=embeddings,
            numeric_values=numeric_values,
            custom_embeddings=custom_embeddings,
            embeddings_noise=embeddings_noise,
            norm_embeddings=norm_embeddings,
            use_batch_norm=use_batch_norm,
            use_batch_norm_with_lens=use_batch_norm_with_lens,
            clip_replace_value=clip_replace_value,
            positions=positions,
            emb_dropout=emb_dropout,
            spatial_dropout=spatial_dropout,
            orthogonal_init=orthogonal_init,
            linear_projection_size=linear_projection_size,
            out_of_index=out_of_index,
        )

        self.agg_samples = agg_samples

        channels = super().output_size

        self.conv = nn.Conv1d(in_channels=channels, out_channels=channels, kernel_size=self.agg_samples, padding=(self.agg_samples - 1), bias=False) # (B, T, L)

        self.use_window_attention = use_window_attention
        if self.use_window_attention:
            pass # Not Implemented

    def forward(self, pb: PaddedBatch):
        embeds = super().forward(pb)

        mask = torch.arange(embeds.payload.shape[1], device=embeds.device)[None, :] + torch.ones((embeds.seq_lens.shape[0], embeds.payload.shape[1]), device=embeds.device)
        mask[mask > embeds.seq_lens[:, None]] = 0.
        mask[mask > 0.] = 1.
        mask = mask[:, :, None]
    
        masked_embeds = embeds.payload * mask
    
        if self.use_window_attention:
            pass # Not Implemented
    
        agg_embeds = torch.transpose(self.conv(torch.transpose(masked_embeds, 1, 2)), 1, 2)

        new_seq_lens = embeds.seq_lens + self.agg_samples - 1

        return PaddedBatch(agg_embeds, new_seq_lens)

In [17]:
# seed_everything(0)

In [18]:
# device = "cuda:0"

In [20]:
# agg_encoder_params = dict(
#     embeddings_noise=0.003,
#     numeric_values={"amount_rur": "log"},
#     embeddings={
#         "trans_date": {"in": 800, "out": 16},
#         "small_group": {"in": 250, "out": 16},
#     },
#     agg_samples=5,
#     use_window_attention=False
# )

# trx_encoder = ConvAggregator(**agg_encoder_params).to(device)

In [21]:
# trx_encoder.eval()

# train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)

# for i, batch in tqdm(enumerate(train_loader)):
#     batch = batch.to(device)
#     embeds_batch = trx_encoder(batch)

#     # if i == 0:
#     #     # print(batch.payload)
#     #     print(batch.seq_lens)
#     #     print()
#     #     print(embeds_batch.payload[31, 2])
#     #     print()
#     #     print(embeds_batch.payload.shape)
#     #     print()
#     #     print(embeds_batch.seq_lens)

211it [00:02, 84.69it/s] 


---

**Train sequences lengths check:**

In [22]:
agg_encoder_params = dict(
    embeddings_noise=0.003,
    numeric_values={"amount_rur": "log"},
    embeddings={
        "trans_date": {"in": 800, "out": 16},
        "small_group": {"in": 250, "out": 16},
    }
)

trx_encoder = TrxEncoder(**agg_encoder_params)
trx_encoder.to("cuda")

TrxEncoder(
  (embeddings): ModuleDict(
    (trans_date): NoisyEmbedding(
      800, 16, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
    (small_group): NoisyEmbedding(
      250, 16, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
  )
  (custom_embeddings): ModuleDict(
    (amount_rur): LogScaler()
  )
  (custom_embedding_batch_norm): RBatchNorm(
    (bn): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
)

In [23]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)

trx_encoder.eval()

seq_lens = []

for batch in tqdm(train_loader):
    embeds_batch = trx_encoder(batch.to("cuda"))
    seq_lens += [embeds_batch.seq_lens.detach().cpu().numpy()]

seq_lens = np.concatenate(seq_lens)

threshold = int(np.quantile(seq_lens, 0.75) * 0.7)

print("Max Length:", threshold)

211it [00:01, 116.47it/s]

Max Length: 683





---

# Sliding Window Aggregation (Mean Pooling) 

- **COLES:**

In [18]:
seed_everything(42)

**DataLoaders:**

In [19]:
data = PtlsDataModule(
    train_data=ColesDataset(
        MemoryMapDataset(
            data=data_train,
            i_filters=[SeqLenFilter(min_seq_len=30)],
        ),
        splitter=SampleSlices(
            split_count=5,
            cnt_min=30,
            cnt_max=560,
        ),
    ),
    train_num_workers=4,
    train_batch_size=128,
    valid_data=ColesDataset(
        MemoryMapDataset(
            data=data_test,
            i_filters=[SeqLenFilter(min_seq_len=30)],
        ),
        splitter=SampleSlices(
            split_count=5,
            cnt_min=30,
            cnt_max=560,
        ),
    ),
    valid_num_workers=4,
    valid_batch_size=128
)

**Модель:**

In [20]:
N_EPOCHS = 20

In [21]:
agg_encoder_params = dict(
    embeddings_noise=0.003,
    numeric_values={"amount_rur": "log"},
    embeddings={
        "trans_date": {"in": 800, "out": 16},
        "small_group": {"in": 250, "out": 16},
    },
    agg_samples=7, # 3, 5, 7, 9
    use_window_attention=False
)

trx_encoder = ConvAggregator(**agg_encoder_params)

seq_encoder = RnnSeqEncoder(
    trx_encoder=trx_encoder,
    hidden_size=512,
    type="gru"
)

coles = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=3e-3, weight_decay=5e-4),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.CosineAnnealingLR, T_max=N_EPOCHS, eta_min=1e-6)
)

**Обучение:**

In [22]:
logger = CometLogger(project_name="EvS_SSL", experiment_name="CoLES_ConvAgg (7 trx)")

trainer = pl.Trainer(
    logger=logger,
    max_epochs=N_EPOCHS,
    accelerator="gpu",
    devices=1,
    enable_progress_bar=True
)

In [23]:
trainer.fit(coles, data)

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/askoro/evs-ssl/9e9f1e636f4a4d7492e7ef540fd63afa

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/kaggle/working' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : CoLES_ConvAgg (7 trx)
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/askoro/evs-ssl/9e9f1e636f4a4d7492e7ef540fd63afa
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     loss [506]              : (81.32427215576172, 759.7018432617188)
[1;38;5;39mCOMET INFO:[0m     seq_len [84]            : (281.6890563964844, 307.6047058105469)
[1;38;5;39mCOMET INFO:[0m     valid/recall_top_k [20] : (0.7845447659492493, 0.9423479437828064)
[1;38;5;39mCOMET INFO:[0m   Others:
[1;38;5;39mCOMET INFO:[

In [24]:
trainer.logged_metrics

{'loss': tensor(86.9931),
 'seq_len': tensor(297.0533),
 'valid/recall_top_k': tensor(0.9423)}

In [56]:
torch.save(seq_encoder.state_dict(), "coles_enc_win_agg.pt")

**Измерим качество на тесте (catboost поверх эмбеддингов):**

In [25]:
encoder = coles.seq_encoder

device = "cuda:0"

encoder.to(device)

RnnSeqEncoder(
  (trx_encoder): ConvAggregator(
    (embeddings): ModuleDict(
      (trans_date): NoisyEmbedding(
        800, 16, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (small_group): NoisyEmbedding(
        250, 16, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
    )
    (custom_embeddings): ModuleDict(
      (amount_rur): LogScaler()
    )
    (conv): Conv1d(33, 33, kernel_size=(7,), stride=(1,), padding=(6,), bias=False)
  )
  (seq_encoder): RnnEncoder(
    (rnn): GRU(33, 512, batch_first=True)
    (reducer): LastStepEncoder()
  )
)

In [26]:
from tqdm import tqdm

seed_everything(42)

In [27]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)
encoder.eval()
train_embeds = None

for i, batch in tqdm(enumerate(train_loader)):
    train_embeds_batch = encoder(batch.to(device))
    if i == 0:
        train_embeds = train_embeds_batch.detach().cpu().numpy()
    else:
        train_embeds = np.concatenate([train_embeds, train_embeds_batch.detach().cpu().numpy()], axis=0)
    
train_embeds

211it [00:24,  8.75it/s]


array([[ 0.03698849,  0.0065549 ,  0.03586466, ..., -0.07524633,
         0.3811207 , -0.249045  ],
       [ 0.68024206,  0.133578  ,  0.02426561, ..., -0.08792991,
         0.33768442, -0.3966271 ],
       [ 0.01316545, -0.02980805,  0.03586124, ..., -0.13590482,
         0.59702384,  0.00281469],
       ...,
       [-0.08812407, -0.05642426,  0.0234392 , ..., -0.01974014,
         0.44292927, -0.37987572],
       [-0.5504547 , -0.10151711,  0.04181264, ..., -0.08819497,
         0.25297815,  0.11250492],
       [-0.1457776 , -0.10500189,  0.02579749, ..., -0.11508597,
         0.19503188, -0.1200649 ]], dtype=float32)

In [28]:
test_loader = inference_data_loader(data_test, num_workers=0, batch_size=128)
encoder.eval()
test_embeds = None

for i, batch in tqdm(enumerate(test_loader)):
    test_embeds_batch = encoder(batch.to(device))
    if i == 0:
        test_embeds = test_embeds_batch.detach().cpu().numpy()
    else:
        test_embeds = np.concatenate([test_embeds, test_embeds_batch.detach().cpu().numpy()], axis=0)
    
test_embeds

24it [00:02,  9.69it/s]


array([[-0.89095265,  0.04210023,  0.026358  , ..., -0.07034364,
         0.5044866 , -0.32076114],
       [ 0.13615598,  0.214713  ,  0.0209006 , ..., -0.12813726,
         0.26064026,  0.0152501 ],
       [ 0.69760936,  0.06759962,  0.01733929, ..., -0.18130785,
         0.44097042, -0.49486622],
       ...,
       [ 0.4669804 , -0.06284533,  0.02873191, ..., -0.02277629,
         0.26693788, -0.1816809 ],
       [ 0.15836863, -0.01949545,  0.02283837, ..., -0.08913588,
         0.5751037 , -0.36161187],
       [ 0.2480348 , -0.00120775,  0.02129856, ..., -0.14086127,
         0.702488  , -0.4512518 ]], dtype=float32)

In [29]:
clf = CatBoostClassifier(loss_function='MultiClass', task_type="GPU", devices='0', random_state=42)

clf.fit(train_embeds, target_train, plot_file="catboost_log.html")

Learning rate set to 0.12714
0:	learn: 1.2991978	total: 14.1s	remaining: 3h 54m 29s
1:	learn: 1.2348466	total: 14.1s	remaining: 1h 57m 12s
2:	learn: 1.1850406	total: 14.1s	remaining: 1h 18m 7s
3:	learn: 1.1441941	total: 14.1s	remaining: 58m 34s
4:	learn: 1.1101088	total: 14.1s	remaining: 46m 50s
5:	learn: 1.0825827	total: 14.1s	remaining: 39m 1s
6:	learn: 1.0593864	total: 14.1s	remaining: 33m 26s
7:	learn: 1.0387410	total: 14.2s	remaining: 29m 15s
8:	learn: 1.0216364	total: 14.2s	remaining: 25m 59s
9:	learn: 1.0063884	total: 14.2s	remaining: 23m 23s
10:	learn: 0.9938767	total: 14.2s	remaining: 21m 15s
11:	learn: 0.9825193	total: 14.2s	remaining: 19m 28s
12:	learn: 0.9728731	total: 14.2s	remaining: 17m 58s
13:	learn: 0.9638458	total: 14.2s	remaining: 16m 41s
14:	learn: 0.9552195	total: 14.2s	remaining: 15m 34s
15:	learn: 0.9481326	total: 14.2s	remaining: 14m 35s
16:	learn: 0.9413106	total: 14.2s	remaining: 13m 43s
17:	learn: 0.9353958	total: 14.3s	remaining: 12m 57s
18:	learn: 0.9304108

<catboost.core.CatBoostClassifier at 0x7d91a1e196f0>

In [30]:
test_pred = clf.predict(test_embeds)
test_proba = clf.predict_proba(test_embeds)

In [31]:
print("Accuracy:", accuracy_score(target_test, test_pred))
print("ROC-AUC:", roc_auc_score(target_test, test_proba, average="weighted", multi_class="ovr"))

Accuracy: 0.5996666666666667
ROC-AUC: 0.8462534406315487


In [33]:
arr = np.array([0.8459122462737779, 0.8483154534047637, 0.8462534406315487])

arr.mean(), arr.std()

(0.8468270467700302, 0.0010616399672779273)

- COLES embeds + Catboost:
  - `Accuracy: 0.6133333333333333`, `0.606`, `0.5933333333333334`, avg: `0.6042 +- 0.0083`
  -  `ROC-AUC: 0.8490542004456147`, `0.848260886697585`, `0.8472952867923927`, avg: `0.8482 +- 0.0007`

---

- COLES embeds + ConvAgg (3 trx) + Catboost:
  - Accuracy: `0.6`, `0.6023333333333334`, `0.6056666666666667`, avg: `0.6027 +- 0.0023`
  - ROC-AUC: `0.8449018451080985`, `0.8453518001645437`, `0.8479308377093546`, avg: `0.8461 +- 0.0013`

---

- COLES embeds + ConvAgg (5 trx) + Catboost:
  - Accuracy: `0.5963333333333334`, `0.6053333333333333`, `0.605`, avg: `0.6022 +- 0.0042`
  - ROC-AUC: `0.8435980298247711`, `0.8489526718533573`, `0.8445515887986031`, avg: `0.8457 +- 0.0023`

---

- COLES embeds + ConvAgg (7 trx) + Catboost:
  - Accuracy: `0.5996666666666667`, `0.6013333333333334`, `0.5996666666666667`, avg: `0.6002 +- 0.0008`
  - ROC-AUC: `0.8459122462737779`, `0.8483154534047637`, `0.8462534406315487`, avg: `0.8468 +- 0.0011`

---

**Вывод:** для CoLES качество при свёрточной агрегации несколько хуже, чем у бейзлайна. С увеличением размера ядра свёртки в целом убывает, при этом при наибольшем размере ядра (7 транзакций) ROC-AUC выше, чем в остальных случаях. 

**Конфигурация, лучшая по метрикам:**

- COLES embeds + ConvAgg (3 trx) + Catboost:
  - Accuracy: `0.6`, `0.6023333333333334`, `0.6056666666666667`, avg: `0.6027 +- 0.0023`
  - ROC-AUC: `0.8449018451080985`, `0.8453518001645437`, `0.8479308377093546`, avg: `0.8461 +- 0.0013`

---

**Train sequences lengths check:**

In [16]:
agg_encoder_params = dict(
    embeddings_noise=0.003,
    numeric_values={"amount_rur": "log"},
    embeddings={
        "trans_date": {"in": 800, "out": 16},
        "small_group": {"in": 250, "out": 16},
    }
)

trx_encoder = TrxEncoder(**agg_encoder_params)
trx_encoder.to("cuda")

TrxEncoder(
  (embeddings): ModuleDict(
    (trans_date): NoisyEmbedding(
      800, 16, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
    (small_group): NoisyEmbedding(
      250, 16, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
  )
  (custom_embeddings): ModuleDict(
    (amount_rur): LogScaler()
  )
  (custom_embedding_batch_norm): RBatchNorm(
    (bn): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
)

In [17]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)

trx_encoder.eval()

seq_lens = []

for batch in tqdm(train_loader):
    embeds_batch = trx_encoder(batch.to("cuda"))
    seq_lens += [embeds_batch.seq_lens.detach().cpu().numpy()]

seq_lens = np.concatenate(seq_lens)

threshold = int(np.quantile(seq_lens, 0.6))

print("Max Length:", threshold)

211it [00:01, 109.70it/s]

Max Length: 904





---

- **CPC modeling:**

---

**Скорректируем класс CpcModule так, чтобы при работе CPC не было даталиков:**

In [18]:
import torch
from torch import nn as nn
from torch.nn import functional as F


class CPC_ShiftedLoss(nn.Module):
    def __init__(self, n_negatives=None, n_forward_steps=None, shift=None):
        super().__init__()
        self.n_negatives = n_negatives
        self.n_forward_steps = n_forward_steps
        self.shift = shift

    def _get_preds(self, base_embeddings, mapped_ctx_embeddings):
        batch_size, max_seq_len, emb_size = base_embeddings.payload.shape
        _, _, _, n_forward_steps = mapped_ctx_embeddings.payload.shape
        seq_lens = mapped_ctx_embeddings.seq_lens
        device = mapped_ctx_embeddings.payload.device

        len_mask = torch.arange(max_seq_len).unsqueeze(0).expand(batch_size, -1).to(device)
        len_mask = (len_mask < seq_lens.unsqueeze(1).expand(-1, max_seq_len)).float()
        
        possible_negatives = base_embeddings.payload.reshape(batch_size * max_seq_len, emb_size)

        mask = len_mask.unsqueeze(0).expand(batch_size, *len_mask.shape).clone()

        mask = mask.reshape(batch_size, -1)
        sample_ids = torch.multinomial(mask, self.n_negatives)
        neg_samples = possible_negatives[sample_ids]

        positive_preds, neg_preds = [], []
        len_mask_exp = len_mask.unsqueeze(-1).unsqueeze(-1).to(device).expand(-1, -1, emb_size, n_forward_steps)
        trimmed_mce = mapped_ctx_embeddings.payload.mul(len_mask_exp)  # zero context vectors by sequence lengths
        for i in range(1, n_forward_steps + 1):
            ce_i = trimmed_mce[:, 0:(max_seq_len - i - self.shift), :, i - 1]
            be_i = base_embeddings.payload[:, (i + self.shift):max_seq_len]

            positive_pred_i = ce_i.mul(be_i).sum(axis=-1)
            positive_preds.append(positive_pred_i)

            neg_pred_i = ce_i.matmul(neg_samples.transpose(-2, -1))
            neg_preds.append(neg_pred_i)

        return positive_preds, neg_preds

    def forward(self, embeddings, _):
        base_embeddings, _, mapped_ctx_embeddings = embeddings
        device = mapped_ctx_embeddings.payload.device
        positive_preds, neg_preds = self._get_preds(base_embeddings, mapped_ctx_embeddings)

        step_losses = []
        for positive_pred_i, neg_pred_i in zip(positive_preds, neg_preds):
            step_loss = -F.log_softmax(torch.cat([positive_pred_i.unsqueeze(-1), neg_pred_i], dim=-1), dim=-1)[:, :, 0].mean()
            step_losses.append(step_loss)

        loss = torch.stack(step_losses).mean()
        return loss

    def cpc_accuracy(self, embeddings, _):
        base_embeddings, _, mapped_ctx_embeddings = embeddings
        positive_preds, neg_preds = self._get_preds(base_embeddings, mapped_ctx_embeddings)

        batch_size, max_seq_len, emb_size = base_embeddings.payload.shape
        seq_lens = mapped_ctx_embeddings.seq_lens
        device = mapped_ctx_embeddings.payload.device

        len_mask = torch.arange(max_seq_len).unsqueeze(0).expand(batch_size, -1).to(device)
        len_mask = (len_mask < seq_lens.unsqueeze(1).expand(-1, max_seq_len)).float()

        total, accurate = 0, 0
        
        for i, (positive_pred_i, neg_pred_i) in enumerate(zip(positive_preds, neg_preds)):
            i_mask = len_mask[:, (self.shift + i + 1):max_seq_len].to(device)
            total += i_mask.sum().item()
            accurate += (((positive_pred_i.unsqueeze(-1).expand(*neg_pred_i.shape) > neg_pred_i) \
                          .sum(dim=-1) == self.n_negatives) * i_mask).sum().item()
        return accurate / total

In [19]:
import torch

from ptls.frames.abs_module import ABSModule
from ptls.frames.cpc.metrics.cpc_accuracy import CpcAccuracy
from ptls.nn.seq_encoder import RnnSeqEncoder
from ptls.data_load.padded_batch import PaddedBatch


class CpcModule(ABSModule):
    """Contrastive Predictive Coding ([CPC](https://arxiv.org/abs/1807.03748))

    Original sequence are encoded by `TrxEncoder`.
    Hidden representation `z` is an embedding for each individual transaction.
    Next `RnnEncoder` used for `context` calculation from `z`.
    Linear predictors are used to predict next trx embedding by context.
    The loss function tends to make future trx embedding and they predict closer.
    Negative sampling are used to avoid trivial solution.

    Parameters
        seq_encoder:
            Model which calculate embeddings for original raw transaction sequences
            `seq_encoder` is trained by `CoLESModule` to get better representations of input sequences
        head:
            Not used
        loss:
            Keep None. CPCLoss used by default
        validation_metric:
            Keep None. CPCAccuracy used by default
        optimizer_partial:
            optimizer init partial. Network parameters are missed.
        lr_scheduler_partial:
            scheduler init partial. Optimizer are missed.

    """
    def __init__(self, validation_metric=None,
                       seq_encoder=None,
                       head=None,
                       n_negatives=40, n_forward_steps=6,
                       optimizer_partial=None,
                       lr_scheduler_partial=None):

        self.save_hyperparameters('n_negatives', 'n_forward_steps')

        loss = CPC_ShiftedLoss(n_negatives=n_negatives, n_forward_steps=n_forward_steps, shift=(seq_encoder.trx_encoder.agg_samples - 1))

        if validation_metric is None:
            validation_metric = CpcAccuracy(loss)

        if seq_encoder is not None and not isinstance(seq_encoder, RnnSeqEncoder):
            raise NotImplementedError(f'Only rnn encoder supported in CpcModule. Found {type(seq_encoder)}')

        seq_encoder.seq_encoder.is_reduce_sequence = False

        super().__init__(validation_metric,
                         seq_encoder,
                         loss,
                         optimizer_partial,
                         lr_scheduler_partial)

        linear_size = self.seq_encoder.trx_encoder.output_size
        embedding_size = self.seq_encoder.embedding_size
        self._linears = torch.nn.ModuleList([torch.nn.Linear(embedding_size, linear_size)
                                             for _ in range(loss.n_forward_steps)])

    @property
    def metric_name(self):
        return 'cpc_accuracy'

    @property
    def is_requires_reduced_sequence(self):
        return False

    def shared_step(self, x, y):
        trx_encoder = self._seq_encoder.trx_encoder
        seq_encoder = self._seq_encoder.seq_encoder

        base_embeddings = trx_encoder(x)
        context_embeddings = seq_encoder(base_embeddings)

        me = []
        for l in self._linears:
            me.append(l(context_embeddings.payload))
        mapped_ctx_embeddings = PaddedBatch(torch.stack(me, dim=3), context_embeddings.seq_lens)

        return (base_embeddings, context_embeddings, mapped_ctx_embeddings), y

---

In [84]:
seed_everything(17)

**DataLoaders:**

In [85]:
data = PtlsDataModule(
    train_data=CpcDataset(
        MemoryMapDataset(data=data_train),
        min_len=863,
        max_len=904
    ),
    train_num_workers=4,
    train_batch_size=64,
    valid_data=CpcDataset(
        MemoryMapDataset(data=data_test),
        min_len=863,
        max_len=904
    ),
    valid_num_workers=4,
    valid_batch_size=64
)

**Модель:**

In [86]:
N_EPOCHS = 20

In [87]:
agg_encoder_params = dict(
    embeddings_noise=0.003,
    numeric_values={"amount_rur": "log"},
    embeddings={
        "trans_date": {"in": 800, "out": 128},
        "small_group": {"in": 250, "out": 128},
    },
    agg_samples=7, # 3, 5, 7
    use_window_attention=False
)

trx_encoder = ConvAggregator(**agg_encoder_params)

seq_encoder = RnnSeqEncoder(
    trx_encoder=trx_encoder,
    hidden_size=512,
    type="gru"
)

cpc = CpcModule(
    seq_encoder=seq_encoder,
    n_forward_steps=6,
    n_negatives=40,
    optimizer_partial=partial(torch.optim.Adam, lr=2e-3),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=5, gamma=0.5)
)

**Обучение:**

In [88]:
logger = CometLogger(project_name="EvS_SSL", experiment_name="CPC_modeling_ConvAgg (7 trx)")

trainer = pl.Trainer(
    logger=logger,
    max_epochs=N_EPOCHS,
    accelerator="gpu",
    devices=1,
    enable_progress_bar=True
)

In [89]:
trainer.fit(cpc, data)

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/kaggle/working' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.
[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/askoro/evs-ssl/073b670595a44e61bd27efc51efaad91



Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : CPC_modeling_ConvAgg (7 trx)
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/askoro/evs-ssl/073b670595a44e61bd27efc51efaad91
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     loss [1012]             : (0.265452116727829, 4.289204120635986)
[1;38;5;39mCOMET INFO:[0m     seq_len [168]           : (808.203125, 848.625)
[1;38;5;39mCOMET INFO:[0m     valid/cpc_accuracy [20] : (0.9208076000213623, 0.9663130044937134)
[1;38;5;39mCOMET INFO:[0m   Others:
[1;38;5;39mCOMET INFO:[0m     Nam

In [91]:
trainer.logged_metrics

{'loss': tensor(0.3818),
 'seq_len': tensor(819.3751),
 'valid/cpc_accuracy': tensor(0.9662)}

In [23]:
# torch.save(seq_encoder.state_dict(), "cpc_enc_win_agg_trx20.pt")

**Измерим качество на тесте (catboost поверх эмбеддингов):**

In [92]:
encoder = cpc.seq_encoder

device = "cuda:0"

encoder.to(device)

RnnSeqEncoder(
  (trx_encoder): ConvAggregator(
    (embeddings): ModuleDict(
      (trans_date): NoisyEmbedding(
        800, 128, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (small_group): NoisyEmbedding(
        250, 128, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
    )
    (custom_embeddings): ModuleDict(
      (amount_rur): LogScaler()
    )
    (conv): Conv1d(257, 257, kernel_size=(7,), stride=(1,), padding=(6,), bias=False)
  )
  (seq_encoder): RnnEncoder(
    (rnn): GRU(257, 512, batch_first=True)
    (reducer): LastStepEncoder()
  )
)

In [93]:
encoder.seq_encoder.is_reduce_sequence = True

In [94]:
from tqdm import tqdm

seed_everything(17)

In [95]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)
encoder.eval()
train_embeds = None

for i, batch in tqdm(enumerate(train_loader)):
    train_embeds_batch = encoder(batch.to(device))
    if i == 0:
        train_embeds = train_embeds_batch.detach().cpu().numpy()
    else:
        train_embeds = np.concatenate([train_embeds, train_embeds_batch.detach().cpu().numpy()], axis=0)
    
train_embeds

211it [00:30,  7.03it/s]


array([[-0.05324957,  0.11450856, -0.12086479, ..., -0.07501061,
         0.03365382, -0.09520902],
       [-0.09045979,  0.07325636, -0.15996777, ..., -0.0804252 ,
         0.02945825, -0.09366222],
       [ 0.03150362,  0.14453615, -0.05120659, ..., -0.24039234,
         0.13753735, -0.20452476],
       ...,
       [-0.04467927,  0.11737393, -0.11436034, ..., -0.1464082 ,
         0.13431565, -0.12178652],
       [-0.0121008 ,  0.0200711 , -0.14695942, ..., -0.1723734 ,
         0.07244273, -0.1261596 ],
       [ 0.02000915,  0.01037849, -0.06434368, ...,  0.04906905,
         0.16014408, -0.07977976]], dtype=float32)

In [96]:
test_loader = inference_data_loader(data_test, num_workers=0, batch_size=128)
encoder.eval()
test_embeds = None

for i, batch in tqdm(enumerate(test_loader)):
    test_embeds_batch = encoder(batch.to(device))
    if i == 0:
        test_embeds = test_embeds_batch.detach().cpu().numpy()
    else:
        test_embeds = np.concatenate([test_embeds, test_embeds_batch.detach().cpu().numpy()], axis=0)
    
test_embeds

24it [00:03,  7.57it/s]


array([[-0.00633594,  0.01500798, -0.09645905, ..., -0.10396656,
         0.16359031, -0.0927169 ],
       [ 0.02070122,  0.06364661, -0.16149746, ..., -0.15697472,
         0.03835281, -0.17137945],
       [ 0.08664538,  0.1276432 , -0.16208892, ..., -0.04973625,
         0.1019806 , -0.16285062],
       ...,
       [-0.06901073,  0.01682798,  0.02728134, ..., -0.07610157,
         0.04936444, -0.09541061],
       [ 0.00699887,  0.04263111, -0.00087967, ..., -0.15079468,
         0.05048765, -0.21230198],
       [ 0.02064326,  0.10989095, -0.13404979, ...,  0.0484121 ,
         0.01480115, -0.18523376]], dtype=float32)

In [97]:
clf = CatBoostClassifier(loss_function='MultiClass', task_type="GPU", devices='0', random_state=17)

clf.fit(train_embeds, target_train, plot_file="catboost_log.html")



Learning rate set to 0.12714
0:	learn: 1.3094333	total: 17.3ms	remaining: 17.3s
1:	learn: 1.2535525	total: 28.7ms	remaining: 14.3s
2:	learn: 1.2096732	total: 40ms	remaining: 13.3s
3:	learn: 1.1749940	total: 51.1ms	remaining: 12.7s
4:	learn: 1.1472119	total: 62.4ms	remaining: 12.4s
5:	learn: 1.1205456	total: 74.1ms	remaining: 12.3s
6:	learn: 1.0988614	total: 85ms	remaining: 12.1s
7:	learn: 1.0814468	total: 96.4ms	remaining: 12s
8:	learn: 1.0652527	total: 107ms	remaining: 11.8s
9:	learn: 1.0511954	total: 118ms	remaining: 11.7s
10:	learn: 1.0390277	total: 128ms	remaining: 11.5s
11:	learn: 1.0279150	total: 138ms	remaining: 11.4s
12:	learn: 1.0189070	total: 149ms	remaining: 11.3s
13:	learn: 1.0108001	total: 159ms	remaining: 11.2s
14:	learn: 1.0032871	total: 170ms	remaining: 11.1s
15:	learn: 0.9966817	total: 180ms	remaining: 11.1s
16:	learn: 0.9895205	total: 191ms	remaining: 11s
17:	learn: 0.9834917	total: 200ms	remaining: 10.9s
18:	learn: 0.9783106	total: 210ms	remaining: 10.9s
19:	learn: 0

<catboost.core.CatBoostClassifier at 0x79b85b5e9870>

In [98]:
test_pred = clf.predict(test_embeds)
test_proba = clf.predict_proba(test_embeds)

In [99]:
print("Accuracy:", accuracy_score(target_test, test_pred))
print("ROC-AUC:", roc_auc_score(target_test, test_proba, average="weighted", multi_class="ovr"))

Accuracy: 0.5816666666666667
ROC-AUC: 0.8337874711769329


In [101]:
arr = np.array([0.8346637198543245, 0.8337874711769329, 0.829962629518911])

arr.mean(), arr.std()

(0.8328046068500562, 0.0020411727510126487)

- CPC context embeds + Catboost:
   - `Accuracy: 0.5773333333333334`, `0.5686666666666667`, `0.5826666666666667`, avg: `0.5762 +- 0.0058`
   - ` ROC-AUC: 0.830123007110738`, `0.8271157616313021`, `0.8343491131233265`, avg: `0.8305 +- 0.003`

---

- CPC context embeds + ConvAgg (3 trx) + Catboost:
  - Accuracy: `0.5813333333333334`, `0.5816666666666667`, `0.5723333333333334`, avg: `0.5784 +- 0.0043`
  - ROC-AUC: `0.8281683346974047`, `0.8303942706365449`, `0.8299126513367255`, avg: `0.8295 +- 0.001`

---

- CPC context embeds + ConvAgg (5 trx) + Catboost:
  - `Accuracy: 0.579`, `0.587`, `0.589`, avg: `0.585 +- 0.0043`
  - `ROC-AUC: 0.8304985732068079`, `0.8342778789187919`, `0.8303966161955582`, avg: `0.8317 +- 0.0018`

---

- CPC context embeds + ConvAgg (7 trx) + Catboost:
  - `Accuracy: 0.5843333333333334`, `0.5816666666666667`, `0.5826666666666667`, avg: `0.5829 +- 0.0011`
  - `ROC-AUC: 0.8346637198543245`, `0.8337874711769329`, `0.829962629518911`, avg: `0.8328 +- 0.002`

---

**Вывод:** Свёрточная агрегация значительно улучшает качество для CPC. С увеличением размера ядра свёртки качество улучшается (далее - предполагается, что оно выйдет на плато, а затем будет лишь ухудшаться).  


**Конфигурация, лучшая по метрикам:** 

- CPC context embeds + ConvAgg (5 trx) + Catboost:
  - `Accuracy: 0.579`, `0.587`, `0.589`, avg: `0.585 +- 0.0043`
  - `ROC-AUC: 0.8304985732068079`, `0.8342778789187919`, `0.8303966161955582`, avg: `0.8317 +- 0.0018`

---

- **GPT:**

In [53]:
seed_everything(17)

**DataLoaders:**

In [54]:
data = PtlsDataModule(
    train_data=GptDataset(
        MemoryMapDataset(data=data_train),
        min_len=1000,
        max_len=1200
    ),
    train_num_workers=4,
    train_batch_size=16,
    valid_data=GptDataset(
        MemoryMapDataset(data=data_test),
        min_len=1000,
        max_len=1200
    ),
    valid_num_workers=4,
    valid_batch_size=16
)

**Модель:**

In [55]:
from torchmetrics import MeanMetric
from typing import Tuple, Dict, List, Union
from torch import nn
import torch.nn.functional as F 
from ptls.nn.seq_encoder.abs_seq_encoder import AbsSeqEncoder
from ptls.nn import PBL2Norm
from ptls.data_load.padded_batch import PaddedBatch


class MeanPooling(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, pb: PaddedBatch):
        payload = pb.payload # (B, T, H)
        mask = pb.seq_len_mask.bool()
        pb_mean = payload.sum(dim=1) / mask.float().sum(dim=1, keepdim=True)
        return pb_mean


class StatPooling(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, pb: PaddedBatch):
        payload = pb.payload # (B, T, H)
        mask = pb.seq_len_mask.bool()
        inf_mask = torch.zeros_like(mask, device=mask.device).float()
        inf_mask[~mask] = -torch.inf
        
        pb_mean = payload.sum(dim=1) / mask.float().sum(dim=1, keepdim=True)
        pb_max = torch.max(payload + inf_mask.unsqueeze(-1), dim=1)[0]
        pb_stat = torch.cat((pb_mean, pb_max), dim=1)
        return pb_stat


class GPTHead(torch.nn.Module):   
    def __init__(self, input_size, n_classes, hidden_size=64, drop_p=0.1):
        super().__init__()
        self.head = nn.Sequential(
            nn.Linear(input_size, hidden_size, bias=True),
            nn.GELU(),
            nn.Dropout(drop_p),
            nn.Linear(hidden_size, n_classes)
        )
    def forward(self, x):
        x = self.head(x)
        return x


class ConvAgg_GPT_PretrainModule(pl.LightningModule):
    def __init__(self,
                 trx_encoder: torch.nn.Module,
                 seq_encoder: AbsSeqEncoder,
                 head_hidden_size: int = 64,
                 total_steps: int = 64000,
                 seed_seq_len: int = 16,
                 max_lr: float = 0.00005,
                 weight_decay: float = 0.0,
                 pct_start: float = 0.1,
                 norm_predict: bool = False
                 ):

        super().__init__()
        self.save_hyperparameters(ignore=['trx_encoder', 'seq_encoder'])

        self.trx_encoder = trx_encoder
        self._seq_encoder = seq_encoder
        self._seq_encoder.is_reduce_sequence = False

        self.head = nn.ModuleDict()
        for col_name, noisy_emb in self.trx_encoder.embeddings.items():
            self.head[col_name] = GPTHead(input_size=self._seq_encoder.embedding_size, hidden_size=head_hidden_size, n_classes=noisy_emb.num_embeddings)

        if self.hparams.norm_predict:
            self.fn_norm_predict = PBL2Norm()

        self.loss = nn.CrossEntropyLoss(ignore_index=0)

        self.train_gpt_loss = MeanMetric()
        self.valid_gpt_loss = MeanMetric()

        self.shift = trx_encoder.agg_samples

    def forward(self, batch: PaddedBatch):
        z_trx = self.trx_encoder(batch) 
        out = self._seq_encoder(z_trx)
        if self.hparams.norm_predict:
            out = self.fn_norm_predict(out)
        return out

    def loss_gpt(self, logits, labels):
        loss = 0
        for col_name, head in self.head.items():
            y_pred = head(logits[:, self.hparams.seed_seq_len:-self.shift, :])
            y_pred = y_pred.view(-1, y_pred.size(-1))

            y_true = labels[col_name][:, self.hparams.seed_seq_len+1:]
            y_true = torch.flatten(y_true.long())
            
            loss += self.loss(y_pred, y_true)
            
        return loss

    def training_step(self, batch, batch_idx):
        out = self.forward(batch)  # PB: B, T, H
        out = out.payload if isinstance(out, PaddedBatch) else out
        labels = batch.payload

        loss_gpt = self.loss_gpt(out, labels)
        self.train_gpt_loss(loss_gpt)
        self.log(f'loss', loss_gpt, sync_dist=True)
        return loss_gpt

    def validation_step(self, batch, batch_idx):
        out = self.forward(batch)  # PB: B, T, H
        out = out.payload if isinstance(out, PaddedBatch) else out
        labels = batch.payload

        loss_gpt = self.loss_gpt(out, labels)
        self.valid_gpt_loss(loss_gpt)

    def on_training_epoch_end(self):
        self.log('train loss (by epochs)', self.train_gpt_loss, prog_bar=True, logger=True, sync_dist=True, rank_zero_only=True)

    def on_validation_epoch_end(self):
        self.log('val loss (by epochs)', self.valid_gpt_loss, prog_bar=True, logger=True, sync_dist=True, rank_zero_only=True)

    def configure_optimizers(self):
        optim = torch.optim.NAdam(self.parameters(),
                                  lr=self.hparams.max_lr,
                                  weight_decay=self.hparams.weight_decay
                                 )
        
        scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer=optim,
            max_lr=self.hparams.max_lr,
            total_steps=self.hparams.total_steps,
            pct_start=self.hparams.pct_start,
            anneal_strategy='cos',
            cycle_momentum=False,
            div_factor=25.0,
            final_div_factor=10000.0,
            three_phase=False
        )
        
        scheduler = {'scheduler': scheduler, 'interval': 'step'}
        return [optim], [scheduler]
    
    @property
    def seq_encoder(self):
        return GPTInferenceModule(pretrained_model=self)


class GPTInferenceModule(torch.nn.Module):
    def __init__(self, pretrained_model):
        super().__init__()
        self.model = pretrained_model
        self.model.is_reduce_sequence = False
        self.mean_pooling = MeanPooling()
        self.stat_pooling = StatPooling()

    def forward(self, batch, eval_strategy="mean"):
        z_trx = self.model.trx_encoder(batch)
        out = self.model._seq_encoder(z_trx)
        out = out if isinstance(out, PaddedBatch) else PaddedBatch(out, batch.seq_lens)

        if eval_strategy == "mean":
            out = self.mean_pooling(out)
        elif eval_strategy == "stat":
            out = self.stat_pooling(out)

        if self.model.hparams.norm_predict:
            out = out / (out.pow(2).sum(dim=-1, keepdim=True) + 1e-9).pow(0.5)
        return out

Multilabel-подход для обучения здесь не применим, так как окна транзакций пересекаются.

In [116]:
# class WinAgg_GPT_MultiLabelPretrainModule(GptPretrainModule):
#     def __init__(self,
#                  trx_encoder: torch.nn.Module,
#                  seq_encoder: AbsSeqEncoder,
#                  head_hidden_size: int = 64,
#                  total_steps: int = 64000,
#                  seed_seq_len: int = 16,
#                  max_lr: float = 0.00005,
#                  weight_decay: float = 0.0,
#                  pct_start: float = 0.1,
#                  norm_predict: bool = False
#                  ):
#         super().__init__(
#             trx_encoder=trx_encoder,
#             seq_encoder=seq_encoder,
#             head_hidden_size=head_hidden_size,
#             total_steps=total_steps,
#             seed_seq_len=seed_seq_len,
#             max_lr=max_lr,
#             weight_decay=weight_decay,
#             pct_start=pct_start,
#             norm_predict=norm_predict
#         )
#         self.agg_samples = trx_encoder.agg_samples
#         self.loss = nn.MultiLabelSoftMarginLoss()

#     def loss_gpt(self, logits, labels):
#         loss = 0
        
#         for col_name, head in self.head.items():
#             pred = head(logits[:, (self.hparams.seed_seq_len // self.agg_samples):-1, :])

#             ohe_labels = torch.zeros((pred.shape[0] * pred.shape[1], pred.shape[2]), device=pred.device)
            
#             for shift in range(self.agg_samples):
#                 y_true = labels[col_name][:, (self.hparams.seed_seq_len + self.agg_samples + shift)::self.agg_samples]
#                 y_true = torch.flatten(y_true.long())
#                 ohe_labels_part = F.one_hot(y_true, num_classes=pred.shape[2])
                
#                 if ohe_labels_part.shape[0] < pred.shape[0] * pred.shape[1]:
#                     padding = torch.zeros((pred.shape[0], 1, pred.shape[2]), device=ohe_labels_part.device)
#                     ohe_labels_part = torch.cat((ohe_labels_part.reshape(pred.shape[0], pred.shape[1] - 1, pred.shape[2]), padding), dim=1).reshape(pred.shape[0] * pred.shape[1], pred.shape[2])
                
#                 ohe_labels += ohe_labels_part

#             ohe_labels[ohe_labels > 1] = 1
            
#             pred = pred.reshape(-1, pred.size(-1))

#             loss += self.loss(pred, ohe_labels)
                
#         return loss

In [56]:
N_EPOCHS = 20

In [57]:
agg_encoder_params = dict(
    embeddings_noise=0.003,
    embeddings={
        "trans_date": {"in": 730, "out": 64},
        "small_group": {"in": 204, "out": 64},
        "amount_rur": {"in": BINS_NUM, "out": 64}
    },
    agg_samples=5, # 3, 5, 7
    use_window_attention=False
)

trx_encoder = ConvAggregator(**agg_encoder_params)

seq_encoder = GptEncoder(
    n_embd=trx_encoder.output_size,
    n_layer=6,
    n_head=6,
    n_inner=256,
    activation_function="gelu_new",
    resid_pdrop=0.1,
    embd_pdrop=0.1,
    attn_pdrop=0.1,
    n_positions=2048,
    use_positional_encoding=True,
    use_start_random_shift=True,
    is_reduce_sequence=False
)

gpt = ConvAgg_GPT_PretrainModule(
    trx_encoder=trx_encoder,
    seq_encoder=seq_encoder,
    head_hidden_size=256,
    total_steps=(N_EPOCHS * 1688), # num_epochs * num_steps_per_epoch
    seed_seq_len=16,
    max_lr=1e-3,
    weight_decay=0.,
    pct_start=0.1,
    norm_predict=False
)

**Обучение:**

In [58]:
logger = CometLogger(project_name="EvS_SSL", experiment_name="GPT_modeling_ConvAgg (5 trx)")

trainer = pl.Trainer(
    logger=logger,
    max_epochs=N_EPOCHS,
    accelerator="gpu",
    devices=1,
    enable_progress_bar=True
)

In [59]:
trainer.fit(gpt, data)

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/askoro/evs-ssl/6a2fa98b88d0452ead57b955d067f4be

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/kaggle/working' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : GPT_modeling_ConvAgg (5 trx)
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/askoro/evs-ssl/6a2fa98b88d0452ead57b955d067f4be
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     loss [4051]               : (7.1974592208862305, 16.79559326171875)
[1;38;5;39mCOMET INFO:[0m     val loss (by epochs) [20] : (7.706978797912598, 8.111540794372559)
[1;38;5;39mCOMET INFO:[0m   Others:
[1;38;5;39mCOMET INFO:[0m     Name : GPT_modeling_ConvAgg (5 trx)
[1;38;5;39mCOMET INFO:[0m   Parameters:
[

In [60]:
trainer.logged_metrics

{'loss': tensor(7.4389), 'val loss (by epochs)': tensor(7.7070)}

In [61]:
encoder = gpt.seq_encoder

In [30]:
#torch.save(encoder.state_dict(), "gpt_WinAgg_trx10_multilabel.pt")

**Измерим качество на тесте (catboost поверх эмбеддингов):**

In [29]:
# import gdown

# gdown.download("https://drive.google.com/uc?export=download&id=1YBstN7hpEIREo7zORmPoEZ_0NyBgfjm6", "gpt_baseline_NAdam.pt")

Downloading...
From (original): https://drive.google.com/uc?export=download&id=1YBstN7hpEIREo7zORmPoEZ_0NyBgfjm6
From (redirected): https://drive.google.com/uc?export=download&id=1YBstN7hpEIREo7zORmPoEZ_0NyBgfjm6&confirm=t&uuid=be4debcd-1d0b-4619-a15e-892431777c63
To: /kaggle/working/gpt_baseline_NAdam.pt
100%|██████████| 34.7M/34.7M [00:00<00:00, 156MB/s] 


'gpt_baseline_NAdam.pt'

In [62]:
# state_dict = torch.load("./gpt_baseline_NAdam.pt")
# encoder.load_state_dict(state_dict)

device = "cuda:0"

encoder.to(device)

GPTInferenceModule(
  (model): ConvAgg_GPT_PretrainModule(
    (trx_encoder): ConvAggregator(
      (embeddings): ModuleDict(
        (trans_date): NoisyEmbedding(
          730, 64, padding_idx=0
          (dropout): Dropout(p=0, inplace=False)
        )
        (small_group): NoisyEmbedding(
          204, 64, padding_idx=0
          (dropout): Dropout(p=0, inplace=False)
        )
        (amount_rur): NoisyEmbedding(
          128, 64, padding_idx=0
          (dropout): Dropout(p=0, inplace=False)
        )
      )
      (custom_embeddings): ModuleDict()
      (conv): Conv1d(192, 192, kernel_size=(5,), stride=(1,), padding=(4,), bias=False)
    )
    (_seq_encoder): GptEncoder(
      (transf): GPT2Model(
        (wte): Embedding(4, 192)
        (wpe): Embedding(2048, 192)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-5): 6 x GPT2Block(
            (ln_1): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2SdpaAttentio

In [63]:
from tqdm import tqdm

seed_everything(17)

In [64]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=8)
encoder.eval()
train_embeds = None

for i, batch in tqdm(enumerate(train_loader)):
    train_embeds_batch = encoder(batch.to(device), eval_strategy="mean")
    if i == 0:
        train_embeds = train_embeds_batch.detach().cpu().numpy()
    else:
        train_embeds = np.concatenate([train_embeds, train_embeds_batch.detach().cpu().numpy()], axis=0)
    
train_embeds

3375it [01:50, 30.49it/s]


array([[-0.33462667, -0.8954681 , -0.4302902 , ...,  0.31755188,
        -0.4282316 ,  1.0043997 ],
       [-0.62445164, -0.64348966,  0.17183866, ...,  0.47483826,
        -0.06584188,  0.09253756],
       [ 0.03645117, -0.00585962,  0.4776758 , ...,  0.4745791 ,
         0.22592434,  0.12130009],
       ...,
       [-0.14497751, -0.92623764, -0.20247638, ..., -0.3186678 ,
        -0.12830502,  0.71001214],
       [-0.22208676,  0.13177368,  0.2472424 , ...,  0.06017498,
        -0.2755309 ,  0.05550779],
       [-0.81465566, -0.3648326 ,  0.8156164 , ..., -0.1697784 ,
        -0.53425485,  0.56683344]], dtype=float32)

In [65]:
test_loader = inference_data_loader(data_test, num_workers=0, batch_size=1)
encoder.eval()
test_embeds = None

for i, batch in tqdm(enumerate(test_loader)):
    test_embeds_batch = encoder(batch.to(device), eval_strategy="mean")
    if i == 0:
        test_embeds = test_embeds_batch.detach().cpu().numpy()
    else:
        test_embeds = np.concatenate([test_embeds, test_embeds_batch.detach().cpu().numpy()], axis=0)
    
test_embeds

3000it [00:19, 154.49it/s]


array([[-0.44947675, -0.19581969,  0.40659872, ..., -0.04312054,
        -0.03518488,  0.30500835],
       [-0.67480373, -0.03185154,  0.14559251, ...,  0.16728781,
        -0.35244393, -0.0491464 ],
       [-0.33507663,  0.25592807,  0.40558302, ..., -0.05884036,
        -0.02490208, -0.23148002],
       ...,
       [-0.21486415, -0.59190446, -0.16830519, ...,  0.19163302,
        -0.27041575,  0.18258   ],
       [ 0.05898442, -0.06531187,  0.39572296, ...,  0.27670342,
         0.09850958,  0.25114918],
       [-0.38886726, -0.30049315,  0.40989238, ..., -0.29424664,
        -0.15515497, -0.21075957]], dtype=float32)

In [66]:
clf = CatBoostClassifier(loss_function='MultiClass', task_type="GPU", devices='0', random_state=17)

clf.fit(train_embeds, target_train, plot_file="catboost_log.html")

Learning rate set to 0.12714
0:	learn: 1.3193835	total: 10.2ms	remaining: 10.2s
1:	learn: 1.2685855	total: 17.5ms	remaining: 8.73s
2:	learn: 1.2270816	total: 24.6ms	remaining: 8.18s
3:	learn: 1.1948241	total: 31.9ms	remaining: 7.94s
4:	learn: 1.1667674	total: 38.5ms	remaining: 7.66s
5:	learn: 1.1408816	total: 45ms	remaining: 7.46s
6:	learn: 1.1177070	total: 51.7ms	remaining: 7.33s
7:	learn: 1.0991144	total: 58.2ms	remaining: 7.22s
8:	learn: 1.0831135	total: 65ms	remaining: 7.16s
9:	learn: 1.0687182	total: 71.6ms	remaining: 7.09s
10:	learn: 1.0541723	total: 78.3ms	remaining: 7.04s
11:	learn: 1.0410538	total: 85.2ms	remaining: 7.01s
12:	learn: 1.0294052	total: 91.8ms	remaining: 6.97s
13:	learn: 1.0183539	total: 98.5ms	remaining: 6.94s
14:	learn: 1.0078467	total: 105ms	remaining: 6.91s
15:	learn: 0.9991070	total: 112ms	remaining: 6.88s
16:	learn: 0.9902528	total: 119ms	remaining: 6.86s
17:	learn: 0.9833126	total: 125ms	remaining: 6.83s
18:	learn: 0.9757522	total: 132ms	remaining: 6.82s
19

<catboost.core.CatBoostClassifier at 0x7bf16643ded0>

In [67]:
test_pred = clf.predict(test_embeds)
test_proba = clf.predict_proba(test_embeds)

In [68]:
print("Accuracy:", accuracy_score(target_test, test_pred))
print("ROC-AUC:", roc_auc_score(target_test, test_proba, average="weighted", multi_class="ovr"))

Accuracy: 0.6153333333333333
ROC-AUC: 0.852413120169623


In [70]:
arr = np.array([0.8490318453772406, 0.852413120169623, 0.8524314150724707])

arr.mean(), arr.std()

(0.8512921268731115, 0.0015982778244722268)

- GPT embeds + Catboost:
    - `Accuracy: 0.6066666666666667`, `0.6246666666666667`, `0.6123333333333333`, avg: `0.6146 +- 0.0075`
    - `ROC-AUC: 0.8479316455892264`, `0.8544376474342738`, `0.8537725354907774`, avg: `0.852 +- 0.0029`

---

- GPT embeds + ConvAgg (3 trx) + Catboost:
  - `Accuracy: 0.613`, `0.6103333333333333`, `0.615`, avg: `0.6128 +- 0.0019`
  - `ROC-AUC: 0.8506448682792362`, `0.8542724256159796`, `0.8518455633248265`, avg: `0.8523 +- 0.0015`

---

- GPT embeds + ConvAgg (5 trx) + Catboost:
  - `Accuracy: 0.5976666666666667`, `0.6153333333333333`, `0.61`, avg: `0.6077 +- 0.0074`
  - `ROC-AUC: 0.8490318453772406`, `0.852413120169623`, `0.8524314150724707`, avg: `0.8513 +- 0.0016`

---

**Вывод:** При свёрточной агрегации метрики качества становятся хуже, чем у бейзлайна. С увеличением размера ядра свёртки качество становится лишь хуже. 

**Лучший результат - из всех рассмотренных конфигураций:**

- GPT embeds + ConvAgg (3 trx) + Catboost:
  - `Accuracy: 0.613`, `0.6103333333333333`, `0.615`, avg: `0.6128 +- 0.0019`
  - `ROC-AUC: 0.8506448682792362`, `0.8542724256159796`, `0.8518455633248265`, avg: `0.8523 +- 0.0015`

# Итоги.

| Method|Accuracy|ROC-AUC|
| --- |:---:|:---:|
| **Flattened Sequences**                   | 0.4921 ± 0.005        | 0.76 ± 0.0012   |
| **GRU (+ MLP)**                           | 0.6066 ± 0.0019       | 0.8479 ± 0.0013 |
| **CoLES**                                 | 0.6042 ± 0.0083       | 0.8482 ± 0.0007 |
| **COLES embeds + ConvAgg (3 trx)**        | 0.6027 ± 0.0023       | 0.8461 ± 0.0013 |
| **CPC Modeling**                          | 0.5762 ± 0.0058       | 0.8305 ± 0.003  |
| **CPC Modeling + ConvAgg (5 trx)**        | 0.585 ± 0.0043        | 0.8317 ± 0.0018 |
| **GPT2**                                  | 0.6146 ± 0.0075       | 0.852 ± 0.0029  |
| **GPT2 + ConvAgg ()**                     | 0.6128 ± 0.0019       | 0.8523 ± 0.0015 |