# Импортируем необходимые библиотеки

In [1]:
!pip install pytorch-lifestream
!pip install comet_ml

Collecting pytorch-lifestream
  Downloading pytorch-lifestream-0.6.0.tar.gz (163 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.4/163.4 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting hydra-core>=1.1.2 (from pytorch-lifestream)
  Downloading hydra_core-1.3.2-py3-none-any.whl.metadata (5.5 kB)
Downloading hydra_core-1.3.2-py3-none-any.whl (154 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.5/154.5 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pytorch-lifestream
  Building wheel for pytorch-lifestream (pyproject.toml) ... [?25l[?25hdone
  Created wheel for pytorch-lifestream: filename=pytorch_lifestream-0.6.0-py3-none-any.whl size=274670 sha256=0be7a7953d034c21abb806e6512267fe92c0fc10cbaeeb9c4ea09c98e26fcf06
  

In [2]:
# data preprocessing
import os
import numpy as np
import pandas as pd
import pickle

# misc
from tqdm import tqdm
from functools import partial

# logging
import comet_ml

# classical ML
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from catboost import CatBoostClassifier

# basic deep learning libs
import torch
import pytorch_lightning as pl
import torchmetrics

# ptls
from ptls.nn import TrxEncoder, RnnSeqEncoder, TransformerEncoder, GptEncoder, Head
from ptls.frames import PtlsDataModule
from ptls.frames.coles import CoLESModule
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames.cpc import CpcModule
from ptls.frames.cpc import CpcDataset
from ptls.frames.gpt import GptDataset
from ptls.frames.supervised import SeqToTargetDataset, SequenceToTarget
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.datasets import inference_data_loader
from ptls.frames.inference_module import InferenceModule
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.preprocessing import PandasDataPreprocessor
from ptls.data_load.utils import collate_feature_dict
from ptls.frames.inference_module import InferenceModule

In [3]:
def seed_everything(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [5]:
comet_ml.login()

In [6]:
from pytorch_lightning.loggers import CometLogger

# Эксперименты.

**Данные:**

In [7]:
path_data = "https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/transactions_train.csv.gz?download=true"
data = pd.read_csv(path_data, compression="gzip")
data

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,33172,6,4,71.463
1,33172,6,35,45.017
2,33172,8,11,13.887
3,33172,9,11,15.983
4,33172,10,11,21.341
...,...,...,...,...
26450572,43300,727,25,7.602
26450573,43300,727,15,3.709
26450574,43300,727,1,6.448
26450575,43300,727,11,24.669


In [8]:
path_target = "https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/train_target.csv?download=true"
target = pd.read_csv(path_target)
target

Unnamed: 0,client_id,bins
0,24662,2
1,1046,0
2,34089,2
3,34848,1
4,47076,3
...,...,...
29995,14303,1
29996,22301,2
29997,25731,0
29998,16820,3


In [9]:
target_train, target_test = train_test_split(target, test_size=0.1, stratify=target["bins"], random_state=42)

In [10]:
trx_data_train = pd.merge(data, target_train["client_id"], on="client_id", how="inner")
trx_data_test = pd.merge(data, target_test["client_id"], on="client_id", how="inner")

---

**Квантизация непрерывных признаков (опциональный шаг, нужен только для GPT):**

In [11]:
def digitize(input_array: np.array, q_count: int = 1, bins: np.array = None):
    """Quantile-based discretization function.

    Parameters:
    -------
    input_array (np.array): Input array.
    q_count (int): Amount of quantiles. Used only if input parameter `bins` is None.
    bins (np.array):
        If None, then calculate bins as quantiles of input array,
        otherwise only apply bins to input_array. Default: None

    Returns
    -------
    out_array (np.array of ints): discretized input_array
    bins (np.array of floats):
        Returned only if input parameter `bins` is None.
    """

    if bins is None:
        return_bins = True
        bins = np.quantile(input_array, q=[i / q_count for i in range(1, q_count)], axis=0)
    else:
        return_bins = False

    out_array = np.digitize(input_array, bins)

    if return_bins:
        return out_array, bins
    else:
        return out_array

In [12]:
BINS_NUM = 128

In [13]:
numeric_features = ["amount_rur"]

for feat in numeric_features:
    trx_data_train[feat], bins = digitize(trx_data_train[feat], q_count=BINS_NUM)
    trx_data_test[feat] = digitize(trx_data_test[feat], bins=bins)

In [14]:
import gc

gc.collect()

147

---

In [15]:
preprocessor = PandasDataPreprocessor(
    col_id="client_id",
    col_event_time="trans_date",
    event_time_transformation="none",
    cols_category=["small_group"],
    cols_numerical=["amount_rur"],
    return_records=False,
)

In [16]:
data_train = preprocessor.fit_transform(trx_data_train)
data_test = preprocessor.transform(trx_data_test)

In [17]:
target_train.rename(columns={"bins": "target"}, inplace=True)
target_test.rename(columns={"bins": "target"}, inplace=True)
target_train.sort_values(by="client_id", inplace=True)
target_test.sort_values(by="client_id", inplace=True)
target_train = target_train["target"]
target_test = target_test["target"]
target_train.reset_index(drop=True, inplace=True)
target_test.reset_index(drop=True, inplace=True)

In [18]:
data_train = data_train.to_dict(orient="records")
data_test = data_test.to_dict(orient="records")

---

**Window Aggregator Class:**

In [19]:
from ptls.data_load.padded_batch import PaddedBatch


class WinAggregator(TrxEncoder):
    """The NN layer, a combination of TrxEncoder and Mean Aggregation within a window of #`agg_samples` transactions 
       (works like nn.Sequential([TrxEncoder, Mean Window Aggregation])).
       It is assumed that any two different windows do not overlap here.
       
       The types of the input and output are `PaddedBatch` of shapes (B, L, T) and (B, L', T) respectively, where 
       B means batch_size,
       L/L' means the max length of a sequence of transactions in a batch (the length is the same as #trx)
       T means the dimension of a single transaction.

       Parameters
        agg_samples (int):
            The number of transactions in a sliding aggregation window.

        use_window_attention (bool):
            If True, the attention layer will be applied to transactions in a sliding window before pooling.
            
        embeddings:
            You can find info about this param in TrxEncoder desc.
        
        numeric_values:
            You can find info about this param in TrxEncoder desc.

        embeddings_noise:
            You can find info about this param in TrxEncoder desc.
            
        emb_dropout:
            You can find info about this param in TrxEncoder desc.
            
        spatial_dropout:
            You can find info about this param in TrxEncoder desc.

        use_batch_norm:
            You can find info about this param in TrxEncoder desc.

        orthogonal_init:
            You can find info about this param in TrxEncoder desc.
            
        linear_projection_size:
            You can find info about this param in TrxEncoder desc.

        out_of_index:
            You can find info about this param in TrxEncoder desc.

        norm_embeddings:
            Keep default value for this parameter
        
        clip_replace_value:
            Not used. Keep default value for this parameter
        
        positions: 
            Not used. Keep default value for this parameter
       """

    def __init__(self,
                 agg_samples=3,
                 use_window_attention=False,
                 embeddings=None,
                 numeric_values=None,
                 custom_embeddings=None,
                 embeddings_noise: float = 0,
                 norm_embeddings=None,
                 use_batch_norm=False,
                 use_batch_norm_with_lens=False,
                 clip_replace_value=None,
                 positions=None,
                 emb_dropout=0,
                 spatial_dropout=False,
                 orthogonal_init=False,
                 linear_projection_size=0,
                 out_of_index: str = 'clip',
                ):
        
        super().__init__(
            embeddings=embeddings,
            numeric_values=numeric_values,
            custom_embeddings=custom_embeddings,
            embeddings_noise=embeddings_noise,
            norm_embeddings=norm_embeddings,
            use_batch_norm=use_batch_norm,
            use_batch_norm_with_lens=use_batch_norm_with_lens,
            clip_replace_value=clip_replace_value,
            positions=positions,
            emb_dropout=emb_dropout,
            spatial_dropout=spatial_dropout,
            orthogonal_init=orthogonal_init,
            linear_projection_size=linear_projection_size,
            out_of_index=out_of_index,
        )

        self.agg_samples = agg_samples

        self.use_window_attention = use_window_attention
        if self.use_window_attention:
            pass # Not Implemented

    def forward(self, pb: PaddedBatch):
        embeds = super().forward(pb)

        mask = torch.arange(embeds.payload.shape[1], device=embeds.device)[None, :] + torch.ones((embeds.seq_lens.shape[0], embeds.payload.shape[1]), device=embeds.device)
        mask[mask > embeds.seq_lens[:, None]] = 0.
        mask[mask > 0.] = 1.
        mask = mask[:, :, None]
    
        masked_embeds = embeds.payload * mask
    
        num_samples_to_add = self.agg_samples - (masked_embeds.shape[1] % self.agg_samples)  
        if num_samples_to_add < self.agg_samples:
            additional_samples = torch.zeros((masked_embeds.shape[0], num_samples_to_add, masked_embeds.shape[2]), device=masked_embeds.device)
            masked_embeds = torch.cat((masked_embeds, additional_samples), dim=1)
            mask_additional_samples = torch.zeros((mask.shape[0], num_samples_to_add, mask.shape[2]), device=mask.device)
            mask = torch.cat((mask, mask_additional_samples), dim=1)
    
        masked_embeds = torch.reshape(masked_embeds, (masked_embeds.shape[0], masked_embeds.shape[1] // self.agg_samples, self.agg_samples, masked_embeds.shape[2]))
        mask = torch.reshape(mask, (mask.shape[0], mask.shape[1] // self.agg_samples, self.agg_samples, mask.shape[2]))

        if self.use_window_attention:
            pass # Not Implemented
        
        mask = torch.sum(mask, dim=2)
        mask[mask == 0.] = 1.
    
        mean_embeds = torch.sum(masked_embeds, dim=2) / mask

        new_seq_lens = embeds.seq_lens // self.agg_samples
        div_mod_seq_lens = ((embeds.seq_lens % self.agg_samples) > 0).int()
        new_seq_lens += div_mod_seq_lens

        return PaddedBatch(mean_embeds, new_seq_lens)

In [45]:
# seed_everything(0)

In [46]:
# device = "cuda:0"

In [47]:
# trx_encoder_params = dict(
#     embeddings_noise=0.003,
#     numeric_values={"amount_rur": "log"},
#     embeddings={
#         "trans_date": {"in": 800, "out": 16},
#         "small_group": {"in": 250, "out": 16},
#     },
# )

# trx_encoder = TrxEncoder(**trx_encoder_params).to(device)

In [None]:
# from ptls.data_load.padded_batch import PaddedBatch


# train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)
# agg_samples = 5
# use_attention = False
# trx_encoder.eval()

# for i, batch in tqdm(enumerate(train_loader)):
#     batch = batch.to(device)
        
#     embeds = trx_encoder(batch)
    
#     mask = torch.arange(embeds.payload.shape[1], device=embeds.device)[None, :] + torch.ones((embeds.seq_lens.shape[0], embeds.payload.shape[1]), device=embeds.device)
#     mask[mask > embeds.seq_lens[:, None]] = 0.
#     mask[mask > 0.] = 1.
#     mask = mask[:, :, None]
    
#     masked_embeds = embeds.payload * mask
    
#     num_samples_to_add = agg_samples - (masked_embeds.shape[1] % agg_samples)  
#     if num_samples_to_add > 0:
#         additional_samples = torch.zeros((masked_embeds.shape[0], num_samples_to_add, masked_embeds.shape[2]), device=masked_embeds.device)
#         masked_embeds = torch.cat((masked_embeds, additional_samples), dim=1)

#         mask_additional_samples = torch.zeros((mask.shape[0], num_samples_to_add, mask.shape[2]), device=mask.device)
#         mask = torch.cat((mask, mask_additional_samples), dim=1)
    
#     masked_embeds = torch.reshape(masked_embeds, (masked_embeds.shape[0], masked_embeds.shape[1] // agg_samples, agg_samples, masked_embeds.shape[2]))
#     mask = torch.reshape(mask, (mask.shape[0], mask.shape[1] // agg_samples, agg_samples, mask.shape[2]))

#     mask = torch.sum(mask, dim=2)
#     mask[mask == 0.] = 1.
    
#     mean_embeds = torch.sum(masked_embeds, dim=2) / mask

#     new_seq_lens = embeds.seq_lens // agg_samples
#     div_mod_seq_lens = ((embeds.seq_lens % agg_samples) > 0).int()
#     new_seq_lens += div_mod_seq_lens

#     out = PaddedBatch(mean_embeds, new_seq_lens)

#     if i == 0:
#         print(out.payload)

In [17]:
# seed_everything(0)

In [18]:
# agg_encoder_params = dict(
#     embeddings_noise=0.003,
#     numeric_values={"amount_rur": "log"},
#     embeddings={
#         "trans_date": {"in": 800, "out": 16},
#         "small_group": {"in": 250, "out": 16},
#     },
#     agg_samples=5,
#     use_window_attention=False
# )

# trx_encoder = WinAggregator(**agg_encoder_params)

In [19]:
# device = "cuda:0"

# trx_encoder.to(device)

WinAggregator(
  (embeddings): ModuleDict(
    (trans_date): NoisyEmbedding(
      800, 16, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
    (small_group): NoisyEmbedding(
      250, 16, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
  )
  (custom_embeddings): ModuleDict(
    (amount_rur): LogScaler()
  )
)

In [None]:
# train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)
# trx_encoder.eval()

# for i, batch in tqdm(enumerate(train_loader)):
#     batch = batch.to(device)    
#     embeds = trx_encoder(batch)

#     if i == 0:
#         print(embeds.payload)

---

**Train sequences lengths check:**

In [64]:
agg_encoder_params = dict(
    embeddings_noise=0.003,
    numeric_values={"amount_rur": "log"},
    embeddings={
        "trans_date": {"in": 800, "out": 16},
        "small_group": {"in": 250, "out": 16},
    },
    agg_samples=7, # 3, 5, 7, 9
    use_window_attention=False
)

trx_encoder = WinAggregator(**agg_encoder_params)
trx_encoder.to("cuda")

WinAggregator(
  (embeddings): ModuleDict(
    (trans_date): NoisyEmbedding(
      800, 16, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
    (small_group): NoisyEmbedding(
      250, 16, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
  )
  (custom_embeddings): ModuleDict(
    (amount_rur): LogScaler()
  )
)

In [65]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)

trx_encoder.eval()

seq_lens = []

for batch in tqdm(train_loader):
    embeds_batch = trx_encoder(batch.to("cuda"))
    seq_lens += [embeds_batch.seq_lens.detach().cpu().numpy()]

seq_lens = np.concatenate(seq_lens)

threshold = int(np.quantile(seq_lens, 0.75) * 0.7)

print("Max Length:", threshold)

211it [00:02, 90.54it/s]

Max Length: 98





---

# Sliding Window Aggregation (Mean Pooling) 

- **COLES:**

In [96]:
seed_everything(42)

**DataLoaders:**

In [97]:
data = PtlsDataModule(
    train_data=ColesDataset(
        MemoryMapDataset(
            data=data_train,
            i_filters=[SeqLenFilter(min_seq_len=30)],
        ),
        splitter=SampleSlices(
            split_count=5,
            cnt_min=30,
            cnt_max=98,
        ),
    ),
    train_num_workers=4,
    train_batch_size=128,
    valid_data=ColesDataset(
        MemoryMapDataset(
            data=data_test,
            i_filters=[SeqLenFilter(min_seq_len=30)],
        ),
        splitter=SampleSlices(
            split_count=5,
            cnt_min=30,
            cnt_max=98,
        ),
    ),
    valid_num_workers=4,
    valid_batch_size=128
)

**Модель:**

In [98]:
N_EPOCHS = 20

In [99]:
agg_encoder_params = dict(
    embeddings_noise=0.003,
    numeric_values={"amount_rur": "log"},
    embeddings={
        "trans_date": {"in": 800, "out": 16},
        "small_group": {"in": 250, "out": 16},
    },
    agg_samples=7, # 3, 5, 7
    use_window_attention=False
)

trx_encoder = WinAggregator(**agg_encoder_params)

seq_encoder = RnnSeqEncoder(
    trx_encoder=trx_encoder,
    hidden_size=512,
    type="gru"
)

coles = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=3e-3, weight_decay=5e-4),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.CosineAnnealingLR, T_max=N_EPOCHS, eta_min=1e-6)
)

**Обучение:**

In [100]:
logger = CometLogger(project_name="EvS_SSL", experiment_name="CoLES_WinAgg (7 trx)")

trainer = pl.Trainer(
    logger=logger,
    max_epochs=N_EPOCHS,
    accelerator="gpu",
    devices=1,
    enable_progress_bar=True
)

In [101]:
trainer.fit(coles, data)

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/askoro/evs-ssl/ce1a22d8538746b1ae163b5606faf6f9

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/kaggle/working' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : CoLES_WinAgg (7 trx)
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/askoro/evs-ssl/ce1a22d8538746b1ae163b5606faf6f9
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     loss [506]              : (205.81675720214844, 679.3453369140625)
[1;38;5;39mCOMET INFO:[0m     seq_len [84]            : (61.59062576293945, 65.484375)
[1;38;5;39mCOMET INFO:[0m     valid/recall_top_k [20] : (0.3394019603729248, 0.5559663772583008)
[1;38;5;39mCOMET INFO:[0m   Others:
[1;38;5;39mCOMET INFO:[0m     N

In [None]:
trainer.logged_metrics

In [56]:
torch.save(seq_encoder.state_dict(), "coles_enc_win_agg.pt")

**Измерим качество на тесте (catboost поверх эмбеддингов):**

In [38]:
encoder = coles.seq_encoder

device = "cuda:0"

encoder.to(device)

NameError: name 'coles' is not defined

In [104]:
from tqdm import tqdm

seed_everything(42)

In [105]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)
encoder.eval()
train_embeds = None

for i, batch in tqdm(enumerate(train_loader)):
    train_embeds_batch = encoder(batch.to(device))
    if i == 0:
        train_embeds = train_embeds_batch.detach().cpu().numpy()
    else:
        train_embeds = np.concatenate([train_embeds, train_embeds_batch.detach().cpu().numpy()], axis=0)
    
train_embeds

211it [00:08, 25.65it/s]


array([[-0.00626337,  0.01715741, -0.13479304, ..., -0.03142321,
        -0.01954646,  0.18334441],
       [-0.11690278,  0.00744526, -0.09708115, ..., -0.18422273,
        -0.00399925,  0.36703435],
       [ 0.02249851,  0.01768246, -0.09994256, ..., -0.10652082,
        -0.07727744, -0.12991075],
       ...,
       [ 0.05404179,  0.06340655, -0.17433447, ...,  0.04351491,
         0.00598915,  0.24797532],
       [ 0.04026763, -0.15382345, -0.16274127, ...,  0.07604814,
        -0.01944207,  0.5294823 ],
       [-0.05539756,  0.10131657, -0.02582263, ..., -0.14665572,
         0.00741105,  0.1673597 ]], dtype=float32)

In [106]:
test_loader = inference_data_loader(data_test, num_workers=0, batch_size=128)
encoder.eval()
test_embeds = None

for i, batch in tqdm(enumerate(test_loader)):
    test_embeds_batch = encoder(batch.to(device))
    if i == 0:
        test_embeds = test_embeds_batch.detach().cpu().numpy()
    else:
        test_embeds = np.concatenate([test_embeds, test_embeds_batch.detach().cpu().numpy()], axis=0)
    
test_embeds

24it [00:00, 34.90it/s]


array([[-0.06427456,  0.01435707, -0.09943505, ..., -0.09242189,
        -0.01029099, -0.10598388],
       [ 0.01568271,  0.02139088, -0.13089582, ..., -0.0902661 ,
        -0.01162825, -0.32072976],
       [ 0.03547226, -0.14433952, -0.30147183, ..., -0.14662269,
        -0.0206276 ,  0.3131373 ],
       ...,
       [-0.12479684,  0.01490287, -0.22529568, ..., -0.08326113,
         0.13951257,  0.49538934],
       [-0.03719057, -0.07858124, -0.16590972, ..., -0.04052776,
        -0.03857748,  0.3387399 ],
       [-0.10337154, -0.08415452, -0.17213556, ..., -0.19113952,
         0.00407451,  0.12825194]], dtype=float32)

In [107]:
clf = CatBoostClassifier(loss_function='MultiClass', task_type="GPU", devices='0', random_state=42)

clf.fit(train_embeds, target_train, plot_file="catboost_log.html")

Learning rate set to 0.12714
0:	learn: 1.3148220	total: 119ms	remaining: 1m 58s
1:	learn: 1.2610926	total: 135ms	remaining: 1m 7s
2:	learn: 1.2198866	total: 151ms	remaining: 50.2s
3:	learn: 1.1859025	total: 167ms	remaining: 41.5s
4:	learn: 1.1574518	total: 182ms	remaining: 36.3s
5:	learn: 1.1335992	total: 197ms	remaining: 32.7s
6:	learn: 1.1127711	total: 238ms	remaining: 33.8s
7:	learn: 1.0952816	total: 254ms	remaining: 31.5s
8:	learn: 1.0805418	total: 270ms	remaining: 29.7s
9:	learn: 1.0670747	total: 285ms	remaining: 28.2s
10:	learn: 1.0553768	total: 301ms	remaining: 27s
11:	learn: 1.0455880	total: 319ms	remaining: 26.3s
12:	learn: 1.0369622	total: 334ms	remaining: 25.3s
13:	learn: 1.0289857	total: 349ms	remaining: 24.6s
14:	learn: 1.0218785	total: 364ms	remaining: 23.9s
15:	learn: 1.0160330	total: 379ms	remaining: 23.3s
16:	learn: 1.0100311	total: 393ms	remaining: 22.7s
17:	learn: 1.0051181	total: 409ms	remaining: 22.3s
18:	learn: 1.0003602	total: 423ms	remaining: 21.8s
19:	learn: 0.

<catboost.core.CatBoostClassifier at 0x78f1df05a620>

In [108]:
test_pred = clf.predict(test_embeds)
test_proba = clf.predict_proba(test_embeds)

In [109]:
print("Accuracy:", accuracy_score(target_test, test_pred))
print("ROC-AUC:", roc_auc_score(target_test, test_proba, average="weighted", multi_class="ovr"))

Accuracy: 0.5663333333333334
ROC-AUC: 0.8164720411288953


In [111]:
arr = np.array([0.8207256798430592, 0.8163454938608756, 0.8164720411288953])

arr.mean(), arr.std()

(0.8178477382776101, 0.0020356676698920963)

- COLES embeds + Catboost:
  - `Accuracy: 0.6133333333333333`, `0.606`, `0.5933333333333334`, avg: `0.6042 +- 0.0083`
  -  `ROC-AUC: 0.8490542004456147`, `0.848260886697585`, `0.8472952867923927`, avg: `0.8482 +- 0.0007`

---

- COLES embeds + WinAgg (3 trx) + Catboost:
  - Accuracy: `0.5826666666666667`, `0.588`, `0.5836666666666667`, avg: `0.5848 +- 0.0023`
  - ROC-AUC: `0.8319036627380707`, `0.8345776108559412`, `0.8337327191569267`, avg: `0.8334 +- 0.0011`

---

- COLES embeds + WinAgg (5 trx) + Catboost:
  - Accuracy: `0.5596666666666666`, `0.5626666666666666`, `0.5786666666666667`, avg: `0.567 +- 0.0083`
  - ROC-AUC: `0.8217997231702415`, `0.8235306635500276`, `0.8291795832873488`, avg: `0.8248 +- 0.0032`

---

- COLES embeds + WinAgg (7 trx) + Catboost:
  - Accuracy: `0.558`, `0.5576666666666666`, `0.5663333333333334`, avg: `0.5607 +- 0.004`
  - ROC-AUC: `0.8207256798430592`, `0.8163454938608756`, `0.8164720411288953`, avg: `0.8178 +- 0.002`

---

**Вывод:** для CoLES качество с увеличением размера агрегирующего окна убывает. 



---

**Train sequences lengths check:**

In [71]:
agg_encoder_params = dict(
    embeddings_noise=0.003,
    numeric_values={"amount_rur": "log"},
    embeddings={
        "trans_date": {"in": 800, "out": 16},
        "small_group": {"in": 250, "out": 16},
    },
    agg_samples=6, # 3, 6
    use_window_attention=False
)

trx_encoder = WinAggregator(**agg_encoder_params)
trx_encoder.to("cuda")

WinAggregator(
  (embeddings): ModuleDict(
    (trans_date): NoisyEmbedding(
      800, 16, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
    (small_group): NoisyEmbedding(
      250, 16, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
  )
  (custom_embeddings): ModuleDict(
    (amount_rur): LogScaler()
  )
)

In [74]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)

trx_encoder.eval()

seq_lens = []

for batch in tqdm(train_loader):
    embeds_batch = trx_encoder(batch.to("cuda"))
    seq_lens += [embeds_batch.seq_lens.detach().cpu().numpy()]

seq_lens = np.concatenate(seq_lens)

threshold = int(np.quantile(seq_lens, 0.6))

print("Max Length:", threshold)

211it [00:01, 117.03it/s]

Max Length: 151





---

- **CPC modeling:**

In [16]:
seed_everything(42)

**DataLoaders:**

In [17]:
data = PtlsDataModule(
    train_data=CpcDataset(
        MemoryMapDataset(data=data_train),
        min_len=144,
        max_len=151
    ),
    train_num_workers=4,
    train_batch_size=64,
    valid_data=CpcDataset(
        MemoryMapDataset(data=data_test),
        min_len=144,
        max_len=151
    ),
    valid_num_workers=4,
    valid_batch_size=64
)

**Модель:**

In [18]:
N_EPOCHS = 20

In [19]:
agg_encoder_params = dict(
    embeddings_noise=0.003,
    numeric_values={"amount_rur": "log"},
    embeddings={
        "trans_date": {"in": 800, "out": 128},
        "small_group": {"in": 250, "out": 128},
    },
    agg_samples=6, # 3, 6
    use_window_attention=False
)

trx_encoder = WinAggregator(**agg_encoder_params)

seq_encoder = RnnSeqEncoder(
    trx_encoder=trx_encoder,
    hidden_size=512,
    type="gru"
)

cpc = CpcModule(
    seq_encoder=seq_encoder,
    n_forward_steps=1, # 2, 1
    n_negatives=40,
    optimizer_partial=partial(torch.optim.Adam, lr=2e-3),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=5, gamma=0.5)
)

**Обучение:**

In [20]:
logger = CometLogger(project_name="EvS_SSL", experiment_name="CPC_modeling_WinAgg (6 trx)")

trainer = pl.Trainer(
    logger=logger,
    max_epochs=N_EPOCHS,
    accelerator="gpu",
    devices=1,
    enable_progress_bar=True
)

In [21]:
trainer.fit(cpc, data)

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/askoro/evs-ssl/bb51470f90c54b5cad07d88c5f85859c

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/kaggle/working' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : CPC_modeling_WinAgg (6 trx)
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/askoro/evs-ssl/bb51470f90c54b5cad07d88c5f85859c
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     loss [1012]             : (0.09794710576534271, 4.219316482543945)
[1;38;5;39mCOMET INFO:[0m     seq_len [168]           : (146.671875, 148.046875)
[1;38;5;39mCOMET INFO:[0m     valid/cpc_accuracy [20] : (0.899142324924469, 0.9466244578361511)
[1;38;5;39mCOMET INFO:[0m   Others:
[1;38;5;39mCOMET INFO:[0m     

In [22]:
trainer.logged_metrics

{'loss': tensor(0.1314),
 'seq_len': tensor(147.3750),
 'valid/cpc_accuracy': tensor(0.9457)}

In [23]:
# torch.save(seq_encoder.state_dict(), "cpc_enc_win_agg_trx20.pt")

**Измерим качество на тесте (catboost поверх эмбеддингов):**

In [23]:
encoder = cpc.seq_encoder

device = "cuda:0"

encoder.to(device)

RnnSeqEncoder(
  (trx_encoder): WinAggregator(
    (embeddings): ModuleDict(
      (trans_date): NoisyEmbedding(
        800, 128, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (small_group): NoisyEmbedding(
        250, 128, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
    )
    (custom_embeddings): ModuleDict(
      (amount_rur): LogScaler()
    )
  )
  (seq_encoder): RnnEncoder(
    (rnn): GRU(257, 512, batch_first=True)
    (reducer): LastStepEncoder()
  )
)

In [24]:
encoder.seq_encoder.is_reduce_sequence = True

In [25]:
from tqdm import tqdm

seed_everything(42)

In [26]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)
encoder.eval()
train_embeds = None

for i, batch in tqdm(enumerate(train_loader)):
    train_embeds_batch = encoder(batch.to(device))
    if i == 0:
        train_embeds = train_embeds_batch.detach().cpu().numpy()
    else:
        train_embeds = np.concatenate([train_embeds, train_embeds_batch.detach().cpu().numpy()], axis=0)
    
train_embeds

211it [00:09, 23.33it/s]


array([[-0.0363386 , -0.03587728,  0.25668928, ...,  0.5224687 ,
         0.7716708 , -0.02049406],
       [ 0.04081652, -0.10602736,  0.40002048, ...,  0.3275246 ,
         0.52234346,  0.08281665],
       [ 0.05119577, -0.02502887,  0.36779672, ..., -0.04892583,
         0.62494373, -0.03555078],
       ...,
       [-0.10441852, -0.02810379,  0.19443573, ...,  0.49979734,
         0.49048716,  0.06382503],
       [-0.04859884,  0.08827397,  0.44719657, ...,  0.22801788,
         0.32663614, -0.07386585],
       [ 0.17592621, -0.17344205,  0.2283991 , ...,  0.5854966 ,
         0.32582963,  0.07555645]], dtype=float32)

In [27]:
test_loader = inference_data_loader(data_test, num_workers=0, batch_size=128)
encoder.eval()
test_embeds = None

for i, batch in tqdm(enumerate(test_loader)):
    test_embeds_batch = encoder(batch.to(device))
    if i == 0:
        test_embeds = test_embeds_batch.detach().cpu().numpy()
    else:
        test_embeds = np.concatenate([test_embeds, test_embeds_batch.detach().cpu().numpy()], axis=0)
    
test_embeds

24it [00:00, 30.00it/s]


array([[ 0.01035566, -0.1447108 ,  0.35650015, ...,  0.5668402 ,
         0.5214527 ,  0.05695374],
       [-0.04436681,  0.03672136,  0.37990516, ...,  0.14695515,
         0.46104044, -0.05235469],
       [ 0.16469611, -0.0747186 ,  0.5322652 , ...,  0.37157276,
         0.45988473,  0.03353842],
       ...,
       [-0.1362306 ,  0.0317709 ,  0.4292283 , ...,  0.41841328,
         0.7450947 , -0.073914  ],
       [ 0.03387939, -0.12427255,  0.38639694, ...,  0.2617555 ,
         0.5063844 , -0.06007706],
       [ 0.00807023, -0.16796482,  0.46044996, ...,  0.29667914,
         0.46810356,  0.04791973]], dtype=float32)

In [28]:
clf = CatBoostClassifier(loss_function='MultiClass', task_type="GPU", devices='0', random_state=42)

clf.fit(train_embeds, target_train, plot_file="catboost_log.html")

Learning rate set to 0.12714
0:	learn: 1.3154677	total: 11.7s	remaining: 3h 15m 19s
1:	learn: 1.2625855	total: 11.7s	remaining: 1h 37m 39s
2:	learn: 1.2222927	total: 11.8s	remaining: 1h 5m 5s
3:	learn: 1.1891576	total: 11.8s	remaining: 48m 49s
4:	learn: 1.1606844	total: 11.8s	remaining: 39m 3s
5:	learn: 1.1355854	total: 11.8s	remaining: 32m 32s
6:	learn: 1.1150414	total: 11.8s	remaining: 27m 53s
7:	learn: 1.0977049	total: 11.8s	remaining: 24m 24s
8:	learn: 1.0822246	total: 11.8s	remaining: 21m 41s
9:	learn: 1.0678529	total: 11.8s	remaining: 19m 31s
10:	learn: 1.0554569	total: 11.8s	remaining: 17m 44s
11:	learn: 1.0457593	total: 11.8s	remaining: 16m 15s
12:	learn: 1.0360415	total: 11.9s	remaining: 15m
13:	learn: 1.0274833	total: 11.9s	remaining: 13m 56s
14:	learn: 1.0192130	total: 11.9s	remaining: 13m
15:	learn: 1.0118016	total: 11.9s	remaining: 12m 11s
16:	learn: 1.0053092	total: 11.9s	remaining: 11m 28s
17:	learn: 0.9993252	total: 11.9s	remaining: 10m 49s
18:	learn: 0.9934441	total: 1

<catboost.core.CatBoostClassifier at 0x7ec9eef42f80>

In [29]:
test_pred = clf.predict(test_embeds)
test_proba = clf.predict_proba(test_embeds)

In [30]:
print("Accuracy:", accuracy_score(target_test, test_pred))
print("ROC-AUC:", roc_auc_score(target_test, test_proba, average="weighted", multi_class="ovr"))

Accuracy: 0.5853333333333334
ROC-AUC: 0.8304399835311407


In [32]:
arr = np.array([0.8280022286814495, 0.8254547692538704, 0.8304399835311407])

arr.mean(), arr.std()

(0.8279656604888203, 0.002035369462302276)

- CPC context embeds + Catboost:
   - `Accuracy: 0.5773333333333334`, `0.5686666666666667`, `0.5826666666666667`, avg: `0.5762 +- 0.0058`
   - ` ROC-AUC: 0.830123007110738`, `0.8271157616313021`, `0.8343491131233265`, avg: `0.8305 +- 0.003`

---

- CPC context embeds (2 forward steps) + WinAgg (3 trx) + Catboost:
  - `Accuracy: 0.5776666666666667`, `0.583`, `0.592`, avg: `0.5842 +- 0.0059`
  - `ROC-AUC: 0.831959843117587`, `0.8331583068898762`, `0.8354422648691314`, avg: `0.8335 +- 0.0014`

---

- CPC context embeds (1 forward step) + WinAgg (6 trx) + Catboost:
  - `Accuracy: 0.5786666666666667`, `0.57`, `0.5853333333333334`, avg: `0.578 +- 0.0063`
  - `ROC-AUC: 0.8280022286814495`, `0.8254547692538704`, `0.8304399835311407`, avg: `0.828 +- 0.002`

---

**Вывод:** агрегация окном из 3 транзакций демонстрирует качество, которое значительно лучше, чем у бейзлайна, как по accuracy, так и по ROC-AUC (как в среднем, так и по отдельным сидам). При дальнейшем увеличении размера окна незначительно падает ROC-AUC, при этом accuracy всё ещё выше, чем в случае бейзлайна (как в среднем, так и по отдельным сидам).    

---

**Конфигурация, лучшая по метрикам:** 

- CPC context embeds (2 forward steps) + WinAgg (3 trx) + Catboost:
  - `Accuracy: 0.5776666666666667`, `0.583`, `0.592`, avg: `0.5842 +- 0.0059`
  - `ROC-AUC: 0.831959843117587`, `0.8331583068898762`, `0.8354422648691314`, avg: `0.8335 +- 0.0014`

---

- **GPT:**

In [147]:
seed_everything(42)

**DataLoaders:**

In [148]:
data = PtlsDataModule(
    train_data=GptDataset(
        MemoryMapDataset(data=data_train),
        min_len=1000,
        max_len=1200
    ),
    train_num_workers=4,
    train_batch_size=16,
    valid_data=GptDataset(
        MemoryMapDataset(data=data_test),
        min_len=1000,
        max_len=1200
    ),
    valid_num_workers=4,
    valid_batch_size=16
)

**Модель:**

In [113]:
from torchmetrics import MeanMetric
from typing import Tuple, Dict, List, Union
from torch import nn
import torch.nn.functional as F 
from ptls.nn.seq_encoder.abs_seq_encoder import AbsSeqEncoder
from ptls.nn import PBL2Norm
from ptls.data_load.padded_batch import PaddedBatch


class MeanPooling(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, pb: PaddedBatch):
        payload = pb.payload # (B, T, H)
        mask = pb.seq_len_mask.bool()
        pb_mean = payload.sum(dim=1) / mask.float().sum(dim=1, keepdim=True)
        return pb_mean


class StatPooling(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, pb: PaddedBatch):
        payload = pb.payload # (B, T, H)
        mask = pb.seq_len_mask.bool()
        inf_mask = torch.zeros_like(mask, device=mask.device).float()
        inf_mask[~mask] = -torch.inf
        
        pb_mean = payload.sum(dim=1) / mask.float().sum(dim=1, keepdim=True)
        pb_max = torch.max(payload + inf_mask.unsqueeze(-1), dim=1)[0]
        pb_stat = torch.cat((pb_mean, pb_max), dim=1)
        return pb_stat


class GPTHead(torch.nn.Module):   
    def __init__(self, input_size, n_classes, hidden_size=64, drop_p=0.1):
        super().__init__()
        self.head = nn.Sequential(
            nn.Linear(input_size, hidden_size, bias=True),
            nn.GELU(),
            nn.Dropout(drop_p),
            nn.Linear(hidden_size, n_classes)
        )
    def forward(self, x):
        x = self.head(x)
        return x


class GptPretrainModule(pl.LightningModule):
    """GPT2 Language model

    Sequence transactions are encoded by `trx_encoder`.
    Then `seq_encoder` encodes the given sequence 
    (we actually use NN to modify sequence transactions representations,
    then (during inference) we calculate the mean of these encoded transactions to get the representation of the whole sequence).
    After this we use heads to predict the classes of features of the future transaction.

    Parameters
    ----------
    trx_encoder:
        Module for transform dict with feature sequences to sequence of transaction representations
    seq_encoder:
        Module for sequence processing. Generally this is transformer based encoder. Rnn is also possible
        Should work without sequence reduction
    head_hidden_size:
        Hidden size of heads for feature prediction
    seed_seq_len:
         Size of starting sequence without loss 
    total_steps:
        total_steps expected in OneCycle lr scheduler
    max_lr:
        max_lr of OneCycle lr scheduler
    weight_decay:
        weight_decay of Adam optimizer
    pct_start:
        % of total_steps when lr increase
    norm_predict:
        use l2 norm for transformer output or not
    """

    def __init__(self,
                 trx_encoder: torch.nn.Module,
                 seq_encoder: AbsSeqEncoder,
                 head_hidden_size: int = 64,
                 total_steps: int = 64000,
                 seed_seq_len: int = 16,
                 max_lr: float = 0.00005,
                 weight_decay: float = 0.0,
                 pct_start: float = 0.1,
                 norm_predict: bool = False
                 ):

        super().__init__()
        self.save_hyperparameters(ignore=['trx_encoder', 'seq_encoder'])

        self.trx_encoder = trx_encoder
        self._seq_encoder = seq_encoder
        self._seq_encoder.is_reduce_sequence = False

        self.head = nn.ModuleDict()
        for col_name, noisy_emb in self.trx_encoder.embeddings.items():
            self.head[col_name] = GPTHead(input_size=self._seq_encoder.embedding_size, hidden_size=head_hidden_size, n_classes=noisy_emb.num_embeddings)

        if self.hparams.norm_predict:
            self.fn_norm_predict = PBL2Norm()

        self.loss = nn.CrossEntropyLoss(ignore_index=0)

        self.train_gpt_loss = MeanMetric()
        self.valid_gpt_loss = MeanMetric()

    def forward(self, batch: PaddedBatch):
        z_trx = self.trx_encoder(batch) 
        out = self._seq_encoder(z_trx)
        if self.hparams.norm_predict:
            out = self.fn_norm_predict(out)
        return out

    def loss_gpt(self, logits, labels):
        loss = 0
        for col_name, head in self.head.items():
            y_pred = head(logits[:, self.hparams.seed_seq_len:-1, :])
            y_pred = y_pred.view(-1, y_pred.size(-1))

            y_true = labels[col_name][:, self.hparams.seed_seq_len+1:]
            y_true = torch.flatten(y_true.long())

            loss += self.loss(y_pred, y_true)
            
        return loss

    def training_step(self, batch, batch_idx):
        out = self.forward(batch)  # PB: B, T, H
        out = out.payload if isinstance(out, PaddedBatch) else out
        labels = batch.payload

        loss_gpt = self.loss_gpt(out, labels)
        self.train_gpt_loss(loss_gpt)
        self.log('loss', loss_gpt, sync_dist=True)
        return loss_gpt

    def validation_step(self, batch, batch_idx):
        out = self.forward(batch)  # PB: B, T, H
        out = out.payload if isinstance(out, PaddedBatch) else out
        labels = batch.payload

        loss_gpt = self.loss_gpt(out, labels)
        self.valid_gpt_loss(loss_gpt)

    def on_training_epoch_end(self):
        self.log('train loss (by epochs)', self.train_gpt_loss, prog_bar=True, logger=True, sync_dist=True, rank_zero_only=True)

    def on_validation_epoch_end(self):
        self.log('val loss (by epochs)', self.valid_gpt_loss, prog_bar=True, logger=True, sync_dist=True, rank_zero_only=True)

    def configure_optimizers(self):
        optim = torch.optim.NAdam(self.parameters(),
                                  lr=self.hparams.max_lr,
                                  weight_decay=self.hparams.weight_decay
                                 )
        
        scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer=optim,
            max_lr=self.hparams.max_lr,
            total_steps=self.hparams.total_steps,
            pct_start=self.hparams.pct_start,
            anneal_strategy='cos',
            cycle_momentum=False,
            div_factor=25.0,
            final_div_factor=10000.0,
            three_phase=False
        )
        
        scheduler = {'scheduler': scheduler, 'interval': 'step'}
        return [optim], [scheduler]
    
    @property
    def seq_encoder(self):
        return GPTInferenceModule(pretrained_model=self)


class GPTInferenceModule(torch.nn.Module):
    def __init__(self, pretrained_model):
        super().__init__()
        self.model = pretrained_model
        self.model.is_reduce_sequence = False
        self.mean_pooling = MeanPooling()
        self.stat_pooling = StatPooling()

    def forward(self, batch, eval_strategy="mean"):
        z_trx = self.model.trx_encoder(batch)
        out = self.model._seq_encoder(z_trx)
        out = out if isinstance(out, PaddedBatch) else PaddedBatch(out, batch.seq_lens)

        if eval_strategy == "mean":
            out = self.mean_pooling(out)
        elif eval_strategy == "stat":
            out = self.stat_pooling(out)

        if self.model.hparams.norm_predict:
            out = out / (out.pow(2).sum(dim=-1, keepdim=True) + 1e-9).pow(0.5)
        return out

In [114]:
class WinAgg_GPT_PretrainModule(GptPretrainModule):
    def __init__(self,
                 trx_encoder: torch.nn.Module,
                 seq_encoder: AbsSeqEncoder,
                 head_hidden_size: int = 64,
                 total_steps: int = 64000,
                 seed_seq_len: int = 16,
                 max_lr: float = 0.00005,
                 weight_decay: float = 0.0,
                 pct_start: float = 0.1,
                 norm_predict: bool = False
                 ):
        super().__init__(
            trx_encoder=trx_encoder,
            seq_encoder=seq_encoder,
            head_hidden_size=head_hidden_size,
            total_steps=total_steps,
            seed_seq_len=seed_seq_len,
            max_lr=max_lr,
            weight_decay=weight_decay,
            pct_start=pct_start,
            norm_predict=norm_predict
        )
        self.agg_samples = trx_encoder.agg_samples

    def loss_gpt(self, logits, labels):
        loss = 0
        
        for col_name, head in self.head.items():
            out = head(logits[:, (self.hparams.seed_seq_len // self.agg_samples):-1, :])
            
            y_true = labels[col_name][:, (self.hparams.seed_seq_len + self.agg_samples)::self.agg_samples]
            y_true = torch.flatten(y_true.long())
            
            pred = out.reshape(-1, out.size(-1))
                    
            loss += self.loss(pred, y_true)
                
        return loss

In [115]:
class WinAgg_GPT_MultiPredPretrainModule(pl.LightningModule):
    def __init__(self,
                 trx_encoder: torch.nn.Module,
                 seq_encoder: AbsSeqEncoder,
                 head_hidden_size: int = 64,
                 total_steps: int = 64000,
                 seed_seq_len: int = 16,
                 max_lr: float = 0.00005,
                 weight_decay: float = 0.0,
                 pct_start: float = 0.1,
                 norm_predict: bool = False
                 ):
        super().__init__()
        self.save_hyperparameters(ignore=['trx_encoder', 'seq_encoder'])

        self.trx_encoder = trx_encoder
        self._seq_encoder = seq_encoder
        self._seq_encoder.is_reduce_sequence = False

        self.agg_samples = trx_encoder.agg_samples

        self.head = nn.ModuleDict()
        for col_name, noisy_emb in self.trx_encoder.embeddings.items():
            for shift in range(self.agg_samples):
                self.head[f"{col_name}, {shift}"] = GPTHead(input_size=self._seq_encoder.embedding_size, hidden_size=head_hidden_size, n_classes=noisy_emb.num_embeddings)

        if self.hparams.norm_predict:
            self.fn_norm_predict = PBL2Norm()

        self.loss = nn.CrossEntropyLoss(ignore_index=0)

        self.train_gpt_loss = MeanMetric()
        self.valid_gpt_loss = MeanMetric()

    def forward(self, batch: PaddedBatch):
        z_trx = self.trx_encoder(batch) 
        out = self._seq_encoder(z_trx)
        if self.hparams.norm_predict:
            out = self.fn_norm_predict(out)
        return out

    def loss_gpt(self, logits, labels):
        loss = 0
        n_obj = 0

        NUM_FEATURES = len(self.head.keys()) // self.agg_samples
        
        for key, head in self.head.items():
            col_name, shift = key.split(', ')
            shift = int(shift)
            
            out = head(logits[:, (self.hparams.seed_seq_len // self.agg_samples):-1, :])
            
            y_true = labels[col_name][:, (self.hparams.seed_seq_len + self.agg_samples + shift)::self.agg_samples]
            y_true = torch.flatten(y_true.long())

            # delete last state of pred for sequences with len not divisible by `self.agg_samples`
            if y_true.shape[0] < out.shape[0] * out.shape[1]:
                pred = out[:, :-1, :]
                pred = pred.reshape(-1, pred.size(-1))
            else:
                pred = out.reshape(-1, out.size(-1))
            
            n_obj += pred.shape[0] 
            loss += self.loss(pred, y_true) * pred.shape[0]

        n_obj //= NUM_FEATURES
        
        return loss / n_obj

    def training_step(self, batch, batch_idx):
        out = self.forward(batch)  # PB: B, T, H
        out = out.payload if isinstance(out, PaddedBatch) else out
        labels = batch.payload
        
        loss_gpt = self.loss_gpt(out, labels)
        self.train_gpt_loss(loss_gpt)
        self.log(f'loss', loss_gpt, sync_dist=True)
        return loss_gpt

    def validation_step(self, batch, batch_idx):
        out = self.forward(batch)  # PB: B, T, H
        out = out.payload if isinstance(out, PaddedBatch) else out
        labels = batch.payload

        loss_gpt = self.loss_gpt(out, labels)
        self.valid_gpt_loss(loss_gpt)

    def on_training_epoch_end(self):
        self.log('train loss (by epochs)', self.train_gpt_loss, prog_bar=True, logger=True, sync_dist=True, rank_zero_only=True)

    def on_validation_epoch_end(self):
        self.log('val loss (by epochs)', self.valid_gpt_loss, prog_bar=True, logger=True, sync_dist=True, rank_zero_only=True)

    def configure_optimizers(self):
        optim = torch.optim.NAdam(self.parameters(),
                                  lr=self.hparams.max_lr,
                                  weight_decay=self.hparams.weight_decay
                                 )
        
        scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer=optim,
            max_lr=self.hparams.max_lr,
            total_steps=self.hparams.total_steps,
            pct_start=self.hparams.pct_start,
            anneal_strategy='cos',
            cycle_momentum=False,
            div_factor=25.0,
            final_div_factor=10000.0,
            three_phase=False
        )
        
        scheduler = {'scheduler': scheduler, 'interval': 'step'}
        return [optim], [scheduler]
    
    @property
    def seq_encoder(self):
        return GPTInferenceModule(pretrained_model=self)

In [116]:
class WinAgg_GPT_MultiLabelPretrainModule(GptPretrainModule):
    def __init__(self,
                 trx_encoder: torch.nn.Module,
                 seq_encoder: AbsSeqEncoder,
                 head_hidden_size: int = 64,
                 total_steps: int = 64000,
                 seed_seq_len: int = 16,
                 max_lr: float = 0.00005,
                 weight_decay: float = 0.0,
                 pct_start: float = 0.1,
                 norm_predict: bool = False
                 ):
        super().__init__(
            trx_encoder=trx_encoder,
            seq_encoder=seq_encoder,
            head_hidden_size=head_hidden_size,
            total_steps=total_steps,
            seed_seq_len=seed_seq_len,
            max_lr=max_lr,
            weight_decay=weight_decay,
            pct_start=pct_start,
            norm_predict=norm_predict
        )
        self.agg_samples = trx_encoder.agg_samples
        self.loss = nn.MultiLabelSoftMarginLoss()

    def loss_gpt(self, logits, labels):
        loss = 0
        
        for col_name, head in self.head.items():
            pred = head(logits[:, (self.hparams.seed_seq_len // self.agg_samples):-1, :])

            ohe_labels = torch.zeros((pred.shape[0] * pred.shape[1], pred.shape[2]), device=pred.device)
            
            for shift in range(self.agg_samples):
                y_true = labels[col_name][:, (self.hparams.seed_seq_len + self.agg_samples + shift)::self.agg_samples]
                y_true = torch.flatten(y_true.long())
                ohe_labels_part = F.one_hot(y_true, num_classes=pred.shape[2])
                
                if ohe_labels_part.shape[0] < pred.shape[0] * pred.shape[1]:
                    padding = torch.zeros((pred.shape[0], 1, pred.shape[2]), device=ohe_labels_part.device)
                    ohe_labels_part = torch.cat((ohe_labels_part.reshape(pred.shape[0], pred.shape[1] - 1, pred.shape[2]), padding), dim=1).reshape(pred.shape[0] * pred.shape[1], pred.shape[2])
                
                ohe_labels += ohe_labels_part

            ohe_labels[ohe_labels > 1] = 1
            
            pred = pred.reshape(-1, pred.size(-1))

            loss += self.loss(pred, ohe_labels)
                
        return loss

In [149]:
N_EPOCHS = 20

In [150]:
agg_encoder_params = dict(
    embeddings_noise=0.003,
    embeddings={
        "trans_date": {"in": 730, "out": 64},
        "small_group": {"in": 204, "out": 64},
        "amount_rur": {"in": BINS_NUM, "out": 64}
    },
    agg_samples=8, # 4, 8
    use_window_attention=False
)

trx_encoder = WinAggregator(**agg_encoder_params)

seq_encoder = GptEncoder(
    n_embd=trx_encoder.output_size,
    n_layer=6,
    n_head=6,
    n_inner=256,
    activation_function="gelu_new",
    resid_pdrop=0.1,
    embd_pdrop=0.1,
    attn_pdrop=0.1,
    n_positions=2048,
    use_positional_encoding=True,
    use_start_random_shift=True,
    is_reduce_sequence=False
)

gpt = WinAgg_GPT_MultiLabelPretrainModule(
    trx_encoder=trx_encoder,
    seq_encoder=seq_encoder,
    head_hidden_size=256,
    total_steps=(N_EPOCHS * 1688), # num_epochs * num_steps_per_epoch
    seed_seq_len=16,
    max_lr=1e-3,
    weight_decay=0.,
    pct_start=0.1,
    norm_predict=False
)

**Обучение:**

In [151]:
logger = CometLogger(project_name="EvS_SSL", experiment_name="GPT_modeling_WinAgg (multilabel, 8 trx)")

trainer = pl.Trainer(
    logger=logger,
    max_epochs=N_EPOCHS,
    accelerator="gpu",
    devices=1,
    enable_progress_bar=True
)

In [152]:
trainer.fit(gpt, data)

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/askoro/evs-ssl/9aec80de47fc4bf98fff186679b6f238

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/kaggle/working' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : GPT_modeling_WinAgg (multilabel, 8 trx)
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/askoro/evs-ssl/9aec80de47fc4bf98fff186679b6f238
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     loss [4051]               : (0.1894596666097641, 2.09991192817688)
[1;38;5;39mCOMET INFO:[0m     val loss (by epochs) [20] : (0.2216295301914215, 0.2549515664577484)
[1;38;5;39mCOMET INFO:[0m   Others:
[1;38;5;39mCOMET INFO:[0m     Name : GPT_modeling_WinAgg (multilabel, 8 trx)
[1;38;5;39mCOMET INF

In [153]:
trainer.logged_metrics

{'loss': tensor(0.2302), 'val loss (by epochs)': tensor(0.2221)}

In [154]:
encoder = gpt.seq_encoder

In [30]:
#torch.save(encoder.state_dict(), "gpt_WinAgg_trx10_multilabel.pt")

**Измерим качество на тесте (catboost поверх эмбеддингов):**

In [29]:
# import gdown

# gdown.download("https://drive.google.com/uc?export=download&id=1YBstN7hpEIREo7zORmPoEZ_0NyBgfjm6", "gpt_baseline_NAdam.pt")

Downloading...
From (original): https://drive.google.com/uc?export=download&id=1YBstN7hpEIREo7zORmPoEZ_0NyBgfjm6
From (redirected): https://drive.google.com/uc?export=download&id=1YBstN7hpEIREo7zORmPoEZ_0NyBgfjm6&confirm=t&uuid=be4debcd-1d0b-4619-a15e-892431777c63
To: /kaggle/working/gpt_baseline_NAdam.pt
100%|██████████| 34.7M/34.7M [00:00<00:00, 156MB/s] 


'gpt_baseline_NAdam.pt'

In [155]:
# state_dict = torch.load("./gpt_baseline_NAdam.pt")
# encoder.load_state_dict(state_dict)

device = "cuda:0"

encoder.to(device)

GPTInferenceModule(
  (model): WinAgg_GPT_MultiLabelPretrainModule(
    (trx_encoder): WinAggregator(
      (embeddings): ModuleDict(
        (trans_date): NoisyEmbedding(
          730, 64, padding_idx=0
          (dropout): Dropout(p=0, inplace=False)
        )
        (small_group): NoisyEmbedding(
          204, 64, padding_idx=0
          (dropout): Dropout(p=0, inplace=False)
        )
        (amount_rur): NoisyEmbedding(
          128, 64, padding_idx=0
          (dropout): Dropout(p=0, inplace=False)
        )
      )
      (custom_embeddings): ModuleDict()
    )
    (_seq_encoder): GptEncoder(
      (transf): GPT2Model(
        (wte): Embedding(4, 192)
        (wpe): Embedding(2048, 192)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-5): 6 x GPT2Block(
            (ln_1): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2SdpaAttention(
              (c_attn): Conv1D(nf=576, nx=192)
              (c_proj): Conv1D

In [156]:
from tqdm import tqdm

seed_everything(42)

In [157]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=8)
encoder.eval()
train_embeds = None

for i, batch in tqdm(enumerate(train_loader)):
    train_embeds_batch = encoder(batch.to(device), eval_strategy="mean")
    if i == 0:
        train_embeds = train_embeds_batch.detach().cpu().numpy()
    else:
        train_embeds = np.concatenate([train_embeds, train_embeds_batch.detach().cpu().numpy()], axis=0)
    
train_embeds

3375it [00:30, 112.12it/s]


array([[-0.40674964,  0.1959355 ,  0.24533263, ..., -0.795126  ,
        -0.17622778,  0.2689529 ],
       [-0.89528674,  0.37394863,  0.32791305, ..., -0.07389312,
        -0.26200154,  0.6980752 ],
       [-0.18811287, -0.445097  ,  0.02055499, ..., -0.84917617,
         0.6106786 , -0.24352248],
       ...,
       [-0.59717417, -0.1898793 ,  0.03392228, ..., -0.05148135,
         0.348568  ,  0.45855898],
       [-0.0434066 , -0.22783218,  0.03942945, ..., -0.2786168 ,
         0.36963922,  1.0482833 ],
       [-0.52986205,  0.02660907,  0.2364344 , ..., -1.4152788 ,
         0.24347876, -0.2514858 ]], dtype=float32)

In [158]:
test_loader = inference_data_loader(data_test, num_workers=0, batch_size=1)
encoder.eval()
test_embeds = None

for i, batch in tqdm(enumerate(test_loader)):
    test_embeds_batch = encoder(batch.to(device), eval_strategy="mean")
    if i == 0:
        test_embeds = test_embeds_batch.detach().cpu().numpy()
    else:
        test_embeds = np.concatenate([test_embeds, test_embeds_batch.detach().cpu().numpy()], axis=0)
    
test_embeds

3000it [00:18, 158.37it/s]


array([[ 2.42870390e-01,  2.80196726e-01,  7.52530023e-02, ...,
         6.04788885e-02, -1.54085651e-01,  2.19389185e-01],
       [-2.82843590e-01,  1.38164386e-01,  1.68172359e-01, ...,
        -1.10153705e-02, -1.21120073e-01,  1.17639434e+00],
       [-2.01603130e-01, -4.08510081e-02,  6.27717450e-02, ...,
        -1.93402935e-02, -7.00594127e-01,  1.17187428e+00],
       ...,
       [-5.43832667e-02, -3.97935510e-02, -4.60554063e-02, ...,
         2.93713808e-01,  3.51669580e-01, -9.97617841e-03],
       [-1.32906556e-01, -5.46759181e-02, -5.74836805e-02, ...,
         3.60775292e-01,  8.52004945e-01, -4.17188019e-01],
       [-6.42999075e-04,  3.17241639e-01, -6.43129572e-02, ...,
         8.53995085e-02, -8.26692402e-01, -1.64276123e-01]], dtype=float32)

In [159]:
clf = CatBoostClassifier(loss_function='MultiClass', task_type="GPU", devices='0', random_state=42)

clf.fit(train_embeds, target_train, plot_file="catboost_log.html")

Learning rate set to 0.12714
0:	learn: 1.3191730	total: 11.7ms	remaining: 11.6s
1:	learn: 1.2688307	total: 21.5ms	remaining: 10.7s
2:	learn: 1.2269002	total: 31.1ms	remaining: 10.3s
3:	learn: 1.1925417	total: 41ms	remaining: 10.2s
4:	learn: 1.1646685	total: 50.7ms	remaining: 10.1s
5:	learn: 1.1391448	total: 60.5ms	remaining: 10s
6:	learn: 1.1181455	total: 70.2ms	remaining: 9.96s
7:	learn: 1.0983912	total: 79ms	remaining: 9.8s
8:	learn: 1.0828426	total: 87.7ms	remaining: 9.66s
9:	learn: 1.0683554	total: 96.7ms	remaining: 9.57s
10:	learn: 1.0552652	total: 106ms	remaining: 9.5s
11:	learn: 1.0445733	total: 114ms	remaining: 9.43s
12:	learn: 1.0333869	total: 124ms	remaining: 9.38s
13:	learn: 1.0232512	total: 134ms	remaining: 9.44s
14:	learn: 1.0135802	total: 143ms	remaining: 9.37s
15:	learn: 1.0052124	total: 151ms	remaining: 9.3s
16:	learn: 0.9974878	total: 160ms	remaining: 9.26s
17:	learn: 0.9905699	total: 169ms	remaining: 9.2s
18:	learn: 0.9835332	total: 177ms	remaining: 9.16s
19:	learn: 0

<catboost.core.CatBoostClassifier at 0x7c7b0dbdd390>

In [160]:
test_pred = clf.predict(test_embeds)
test_proba = clf.predict_proba(test_embeds)

In [161]:
print("Accuracy:", accuracy_score(target_test, test_pred))
print("ROC-AUC:", roc_auc_score(target_test, test_proba, average="weighted", multi_class="ovr"))

Accuracy: 0.5886666666666667
ROC-AUC: 0.8391983795415185


In [163]:
# arr = np.array([0.8349655367570361, 0.8361761582227041, 0.8391983795415185])

# arr.mean(), arr.std()

(0.8367800248404196, 0.0017800244714665442)

- GPT embeds + Catboost:
    - `Accuracy: 0.6066666666666667`, `0.6246666666666667`, `0.6123333333333333`, avg: `0.6146 +- 0.0075`
    - `ROC-AUC: 0.8479316455892264`, `0.8544376474342738`, `0.8537725354907774`, avg: `0.852 +- 0.0029`

---

- GPT embeds + WinAgg (single trx pred, 4 trx window) + Catboost:
  - `Accuracy: 0.5936666666666667`, `0.595`, `0.59`, avg: `0.5929 +- 0.0021`
  - `ROC-AUC: 0.8382115223511158`, `0.8411807016631387`, `0.8383169211628088`, avg: `0.8392 +- 0.0014`

---

- GPT embeds + WinAgg (single trx pred, 8 trx window) + Catboost:
  - `Accuracy: 0.5916666666666667`, `0.58`, `0.5856666666666667`, avg: `0.5858 +- 0.0048`
  - `ROC-AUC: 0.8335367211207741`, `0.8336083023396845`, `0.833044459803327`, avg: `0.8334 +- 0.0003`

---

- GPT embeds + WinAgg (multi trx pred, 4 trx window) + Catboost:
  - `Accuracy: 0.5963333333333334`, `0.602`, `0.599`, avg: `0.5991 +- 0.0023`
  - `ROC-AUC: 0.8425110319514505`, `0.8431281221563298`, `0.8418593585453994`, avg: `0.8425 +- 0.0005`

---

- GPT embeds + WinAgg (multi trx pred, 8 trx window) + Catboost:
  - `Accuracy: 0.592`, `0.59`, `0.5856666666666667`, avg: `0.5892 +- 0.0026`
  - `ROC-AUC: 0.8396865366512366`, `0.840494457354911`, `0.8366508027056987`, avg: `0.8389 +- 0.0017`

---

- GPT embeds + WinAgg (multilabel, 4 trx window) + Catboost:
  - `Accuracy: 0.588`, `0.5983333333333334`, `0.5853333333333334`, avg: `0.5906 +- 0.0056`
  - `ROC-AUC: 0.8408759046544702`, `0.8422989284108013`, `0.8351464495113141`, avg: `0.8394 +- 0.0031`

---

- GPT embeds + WinAgg (multilabel, 8 trx window) + Catboost:
  - `Accuracy: 0.5773333333333334`, `0.5896666666666667`, `0.5886666666666667`, avg: `0.5852 +- 0.0056`
  - `ROC-AUC: 0.8349655367570361`, `0.8361761582227041`, `0.8391983795415185`, avg: `0.8368 +- 0.0018`

---

**Вывод:** Все рассмотренные конфигурации оказались значительно хуже бейзлайна по качеству (по метрикам). При этом из 3 подходов - предсказание первой из N следующих транзакций, которые будут сагрегированы; предсказание N из N следующих транзакций, которые будут сагрегированы; предсказание того, какие транзакции войдут в следующую сагрегированную - лучше остальных показал себя второй подход (предсказание N из N следующих транзакций) - он демонстрируют наименьшую просадку по метрикам по сравнению с бейзлайном. 

С увеличением агрегирующего окна качество модели ухудшается. Оптимальный размер окна здесь - 4 транзакции.

**Лучший результат - из всех рассмотренных конфигураций:**

- GPT embeds + WinAgg (multi trx pred, 4 trx window) + Catboost:
  - `Accuracy: 0.5963333333333334`, `0.602`, `0.599`, avg: `0.5991 +- 0.0023`
  - `ROC-AUC: 0.8425110319514505`, `0.8431281221563298`, `0.8418593585453994`, avg: `0.8425 +- 0.0005`

# Итоги.

| Method|Accuracy|ROC-AUC|
| --- |:---:|:---:|
| **Flattened Sequences**                   | 0.4921 ± 0.005        | 0.76 ± 0.0012   |
| **GRU (+ MLP)**                           | 0.6066 ± 0.0019       | 0.8479 ± 0.0013 |
| **CoLES**                                 | 0.6042 ± 0.0083       | 0.8482 ± 0.0007 |
| **COLES embeds + WinAgg (3 trx)**         | 0.5848 ± 0.0023       | 0.8334 ± 0.0011 |
| **CPC Modeling**                          | 0.5762 ± 0.0058       | 0.8305 ± 0.003  |
| **CPC Modeling + WinAgg (3 trx)**         | 0.5842 ± 0.0059       | 0.8335 ± 0.0014 |
| **GPT2**                                  | 0.6146 ± 0.0075       | 0.852 ± 0.0029  |
| **GPT2 + WinAgg (4 trx, next N trx loss)**| 0.5991 ± 0.0023       | 0.8425 ± 0.0005 |
