# Импортируем необходимые библиотеки

In [1]:
!pip install pytorch-lifestream
!pip install comet_ml

Collecting pytorch-lifestream
  Downloading pytorch-lifestream-0.6.0.tar.gz (163 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.4/163.4 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting hydra-core>=1.1.2 (from pytorch-lifestream)
  Downloading hydra_core-1.3.2-py3-none-any.whl.metadata (5.5 kB)
Downloading hydra_core-1.3.2-py3-none-any.whl (154 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.5/154.5 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pytorch-lifestream
  Building wheel for pytorch-lifestream (pyproject.toml) ... [?25l[?25hdone
  Created wheel for pytorch-lifestream: filename=pytorch_lifestream-0.6.0-py3-none-any.whl size=274670 sha256=8da828492c26526816a12a8ddef3bf4a500a5177ba7791a42a0bbb6c3b4f3601
  

In [2]:
# data preprocessing
import os
import numpy as np
import pandas as pd
import pickle

# misc
from tqdm import tqdm
from functools import partial

# logging
import comet_ml

# classical ML
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from catboost import CatBoostClassifier

# basic deep learning libs
import torch
import pytorch_lightning as pl
import torchmetrics

# ptls
from ptls.nn import TrxEncoder, RnnSeqEncoder, TransformerEncoder, GptEncoder, Head
from ptls.frames import PtlsDataModule
from ptls.frames.coles import CoLESModule
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames.cpc import CpcModule
from ptls.frames.cpc import CpcDataset
from ptls.frames.gpt import GptDataset
from ptls.frames.supervised import SeqToTargetDataset, SequenceToTarget
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.datasets import inference_data_loader
from ptls.frames.inference_module import InferenceModule
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.preprocessing import PandasDataPreprocessor
from ptls.data_load.utils import collate_feature_dict
from ptls.frames.inference_module import InferenceModule

# Supervised-кейс. Бустинг

**Препроцессим данные:**

**Скачаем данные:**

In [40]:
path_data = "https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/transactions_train.csv.gz?download=true"
data = pd.read_csv(path_data, compression="gzip")
data

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,33172,6,4,71.463
1,33172,6,35,45.017
2,33172,8,11,13.887
3,33172,9,11,15.983
4,33172,10,11,21.341
...,...,...,...,...
26450572,43300,727,25,7.602
26450573,43300,727,15,3.709
26450574,43300,727,1,6.448
26450575,43300,727,11,24.669


In [41]:
path_target = "https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/train_target.csv?download=true"
target = pd.read_csv(path_target)
target

Unnamed: 0,client_id,bins
0,24662,2
1,1046,0
2,34089,2
3,34848,1
4,47076,3
...,...,...
29995,14303,1
29996,22301,2
29997,25731,0
29998,16820,3


**Делим таргет на трейн и на тест:**

In [42]:
target_train, target_test = train_test_split(target, test_size=0.1, stratify=target["bins"], random_state=42)

**Проводим препроцессинг транзакций (деление на трейн и на тест, подготовка данных под нужный формат):**

In [43]:
trx_data_train = pd.merge(data, target_train["client_id"], on="client_id", how="inner")
trx_data_test = pd.merge(data, target_test["client_id"], on="client_id", how="inner")

**Дополнительно: убираем все транзакции с редкими MCC-кодами:**

In [7]:
# def filter_rows_by_MCC(data):
#     MCC_top100 = data["small_group"].value_counts()[:100].index.to_list()
#     data = data[data["small_group"].isin(MCC_top100)].reset_index(drop=True)
#     return data

In [8]:
# trx_data_train = filter_rows_by_MCC(trx_data_train)
# trx_data_test = filter_rows_by_MCC(trx_data_test)

**Дополнительно: предобработка числовых признаков: $x \longrightarrow \text{sign}(x) \cdot \log(1 + |x|)$**

In [44]:
def preprocess_num_features(data, features_list):
    for feature in features_list:
        data[feature] = np.sign(data[feature]) * np.log1p(np.abs(data[feature]))
    return data

In [45]:
trx_data_train = preprocess_num_features(data=trx_data_train, features_list=["amount_rur"])
trx_data_test = preprocess_num_features(data=trx_data_test, features_list=["amount_rur"])

In [46]:
preprocessor = PandasDataPreprocessor(
    col_id="client_id",
    col_event_time="trans_date",
    event_time_transformation="none",
    cols_category=["small_group"],
    cols_numerical=["amount_rur"],
    return_records=False,
)

In [47]:
trx_data_train = preprocessor.fit_transform(trx_data_train)
trx_data_test = preprocessor.transform(trx_data_test)

In [48]:
trx_data_train.drop(columns=["trans_date"], inplace=True)
trx_data_test.drop(columns=["trans_date"], inplace=True)

In [49]:
target_train.sort_values(by="client_id", inplace=True)
target_test.sort_values(by="client_id", inplace=True)

In [50]:
trx_data_train.drop(columns=["client_id"], inplace=True)
trx_data_test.drop(columns=["client_id"], inplace=True)
target_train.drop(columns=["client_id"], inplace=True)
target_test.drop(columns=["client_id"], inplace=True)

In [51]:
trx_data_train.reset_index(inplace=True, drop=True)
trx_data_test.reset_index(inplace=True, drop=True)
target_train.reset_index(inplace=True, drop=True)
target_test.reset_index(inplace=True, drop=True)

In [52]:
max_seq_length = 0 

for idx, row in trx_data_train.iterrows():
    max_seq_length = max(max_seq_length, row["event_time"].shape[0])

for idx, row in trx_data_test.iterrows():
    max_seq_length = max(max_seq_length, row["event_time"].shape[0])

print("Max Sequence Length:", max_seq_length)

Max Sequence Length: 1150


In [53]:
columns = ["event_time", "small_group", "amount_rur"]

#columns = ["small_group", "amount_rur"]

new_tables_columns = []

for i in range(1, max_seq_length + 1):
    for col in columns:
        new_tables_columns.append(col + "_" + str(i))

In [54]:
new_train_table_contents = []

for idx, row in tqdm(trx_data_train.iterrows()):
    new_row = []
    trans_date = row["event_time"]
    small_group = row["small_group"]
    amount = row["amount_rur"]
    for j in range(max_seq_length):
        if j < trans_date.shape[0]:
            new_row.append(trans_date[j].item())
            new_row.append(small_group[j].item())
            new_row.append(amount[j].item())
        else:
            new_row.append(-1)
            new_row.append(-1)
            new_row.append(0.)
    new_train_table_contents.append(new_row)

trx_data_train = pd.DataFrame(data=new_train_table_contents, columns=new_tables_columns)
new_train_table_contents = []

27000it [03:06, 144.51it/s]


In [55]:
new_test_table_contents = []

for idx, row in tqdm(trx_data_test.iterrows()):
    new_row = []
    trans_date = row["event_time"]
    small_group = row["small_group"]
    amount = row["amount_rur"]
    for j in range(max_seq_length):
        if j < trans_date.shape[0]:
            new_row.append(trans_date[j].item())
            new_row.append(int(small_group[j].item()))
            new_row.append(amount[j].item())
        else:
            new_row.append(-1)
            new_row.append(-1)
            new_row.append(0.)
    new_test_table_contents.append(new_row)

trx_data_test = pd.DataFrame(data=new_test_table_contents, columns=new_tables_columns)
new_test_table_contents = []

3000it [00:21, 140.80it/s]


In [56]:
cat_features = []

for idx, feature in enumerate(new_tables_columns):
    if idx % 3 == 0 or idx % 3 == 1:
        cat_features.append(feature)

# for idx, feature in enumerate(new_tables_columns):
#     if idx % 2 == 0:
#         cat_features.append(feature)

**Наконец, обучаем бустинг!**

In [65]:
clf = CatBoostClassifier(loss_function='MultiClass', cat_features=cat_features, task_type="GPU", devices='0', random_state=42)

clf.fit(trx_data_train, target_train, plot_file="catboost_log.html")

Learning rate set to 0.12714
0:	learn: 1.3657957	total: 98.5ms	remaining: 1m 38s
1:	learn: 1.3485917	total: 190ms	remaining: 1m 34s
2:	learn: 1.3328970	total: 275ms	remaining: 1m 31s
3:	learn: 1.3191272	total: 355ms	remaining: 1m 28s
4:	learn: 1.3063277	total: 437ms	remaining: 1m 26s
5:	learn: 1.2953591	total: 517ms	remaining: 1m 25s
6:	learn: 1.2851787	total: 598ms	remaining: 1m 24s
7:	learn: 1.2760647	total: 679ms	remaining: 1m 24s
8:	learn: 1.2679693	total: 759ms	remaining: 1m 23s
9:	learn: 1.2605797	total: 839ms	remaining: 1m 23s
10:	learn: 1.2538271	total: 919ms	remaining: 1m 22s
11:	learn: 1.2465829	total: 998ms	remaining: 1m 22s
12:	learn: 1.2397219	total: 1.08s	remaining: 1m 21s
13:	learn: 1.2326727	total: 1.16s	remaining: 1m 21s
14:	learn: 1.2261185	total: 1.24s	remaining: 1m 21s
15:	learn: 1.2193452	total: 1.32s	remaining: 1m 21s
16:	learn: 1.2128247	total: 1.4s	remaining: 1m 20s
17:	learn: 1.2065889	total: 1.48s	remaining: 1m 20s
18:	learn: 1.1999746	total: 1.56s	remaining: 

<catboost.core.CatBoostClassifier at 0x7ecbba42cdf0>

In [66]:
test_pred = clf.predict(trx_data_test)
test_proba = clf.predict_proba(trx_data_test)

**Посчитаем метрики:**

In [67]:
from sklearn.metrics import accuracy_score, roc_auc_score

In [68]:
print("Catboost Accuracy:", accuracy_score(target_test["bins"], test_pred))
print("Catboost ROC-AUC:", roc_auc_score(target_test["bins"], test_proba, average="weighted", multi_class="ovr"))

Catboost Accuracy: 0.496
Catboost ROC-AUC: 0.7583516846897369


In [70]:
# arr = np.array([0.760985138911461, 0.7606152841976829, 0.7583516846897369])

# arr.mean(), arr.std()

(0.7599840359329603, 0.00116408077132864)

- "Flattened" Sequences + CatBoost (With Time Features):
  - `Accuracy: 0.49533333333333335`, `0.485`, `0.496`, avg: `0.4921 +- 0.005`  
  - `ROC-AUC: 0.760985138911461`, `0.7606152841976829`, `0.7583516846897369`, avg: `0.76 +- 0.0012`

- "Flattened" Sequences + CatBoost (Without Time Features):
  - `Accuracy: 0.49033333333333334`, `0.47833333333333333`, `0.4756666666666667`, avg: `0.4814 +- 0.0064`  
  - `ROC-AUC: 0.755447441492964`, `0.7459454575327468`, `0.7456770879932965`, avg: `0.749 +- 0.0045`

**C учётом времени качество лучше => время надо учитывать в решении.**

---

# Supervised-кейс. GRU.

**Препроцессим данные:**

**Скачаем данные:**

In [3]:
path_data = "https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/transactions_train.csv.gz?download=true"
data = pd.read_csv(path_data, compression="gzip")
data

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,33172,6,4,71.463
1,33172,6,35,45.017
2,33172,8,11,13.887
3,33172,9,11,15.983
4,33172,10,11,21.341
...,...,...,...,...
26450572,43300,727,25,7.602
26450573,43300,727,15,3.709
26450574,43300,727,1,6.448
26450575,43300,727,11,24.669


In [4]:
path_target = "https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/train_target.csv?download=true"
target = pd.read_csv(path_target)
target

Unnamed: 0,client_id,bins
0,24662,2
1,1046,0
2,34089,2
3,34848,1
4,47076,3
...,...,...
29995,14303,1
29996,22301,2
29997,25731,0
29998,16820,3


**Делим таргет на трейн и на тест (поправить пайплайн с учётом этого факта):**

In [5]:
target_train, target_test = train_test_split(target, test_size=0.1, stratify=target["bins"], random_state=42)

In [6]:
trx_data_train = pd.merge(data, target_train["client_id"], on="client_id", how="inner")
trx_data_test = pd.merge(data, target_test["client_id"], on="client_id", how="inner")

**Препроцессинг:**

In [7]:
preprocessor = PandasDataPreprocessor(
    col_id="client_id",
    col_event_time="trans_date",
    event_time_transformation="none",
    cols_category=["small_group"],
    cols_numerical=["amount_rur"],
    return_records=False,
)

In [8]:
data_train = preprocessor.fit_transform(trx_data_train)
data_test = preprocessor.transform(trx_data_test)

In [9]:
data_train = pd.merge(data_train, target_train, on="client_id")
data_test = pd.merge(data_test, target_test, on="client_id")

In [10]:
data_train.rename(columns={"bins": "target"}, inplace=True)
data_test.rename(columns={"bins": "target"}, inplace=True)

In [11]:
data_train = data_train.to_dict(orient="records")
data_test = data_test.to_dict(orient="records")

In [12]:
data_train = MemoryMapDataset(data_train)
data_test = MemoryMapDataset(data_test)

**Создаём DataLoader:**

In [3]:
def seed_everything(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [55]:
seed_everything(42)

In [56]:
data = PtlsDataModule(
    train_data=SeqToTargetDataset(data_train, target_col_name="target", target_dtype=torch.long),
    valid_data=SeqToTargetDataset(data_test, target_col_name="target", target_dtype=torch.long),
    train_batch_size=128,
    valid_batch_size=128,
    train_num_workers=4
)

**Модель (GRU):**

In [57]:
N_EPOCHS = 20

In [58]:
seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(
        embeddings={
            "small_group": {"in": 250, "out": 16},
            "trans_date": {"in": 800, "out": 16}
        },
        numeric_values={
            "amount_rur": "log",
        },
        embeddings_noise=0.003,
    ),
    hidden_size=256,
    is_reduce_sequence=True
)

In [59]:
gru = SequenceToTarget(
    seq_encoder=seq_encoder,
    head=Head(input_size=seq_encoder.embedding_size, objective="classification", num_classes=4, hidden_layers_sizes=[512]),
    loss=torch.nn.NLLLoss(),
    metric_list=torchmetrics.Accuracy(task="multiclass", num_classes=4),
    optimizer_partial=partial(torch.optim.Adam, lr=3e-3, weight_decay=5e-4),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.CosineAnnealingLR, T_max=N_EPOCHS, eta_min=1e-6)
)

**Обучение:**

In [5]:
comet_ml.login()

In [6]:
from pytorch_lightning.loggers import CometLogger

In [60]:
logger = CometLogger(project_name="EvS_SSL", experiment_name="supervised_baseline_GRU")

trainer = pl.Trainer(
    logger=logger,
    max_epochs=N_EPOCHS,
    accelerator="gpu",
    devices=1,
    enable_progress_bar=True
)

In [61]:
trainer.fit(gru, data)

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/askoro/evs-ssl/c8e6762c921441bf8c4b24c386a46b23

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/kaggle/working' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : supervised_baseline_GRU
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/askoro/evs-ssl/c8e6762c921441bf8c4b24c386a46b23
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     loss [506]                    : (0.6556657552719116, 1.4615997076034546)
[1;38;5;39mCOMET INFO:[0m     seq_len [84]                  : (856.1015625, 904.359375)
[1;38;5;39mCOMET INFO:[0m     val_loss [20]                 : (0.8641967177391052, 1.0774288177490234)
[1;38;5;39mCOMET INFO:[0m     valid/MulticlassAccura

In [143]:
torch.save(gru.state_dict(), "supervised_gru_with_2layered_MLP.pt")

In [62]:
print(trainer.logged_metrics)

{'loss': tensor(0.7673), 'seq_len': tensor(870.6000), 'y': tensor(1.6333), 'val_loss': tensor(0.8642), 'valid/MulticlassAccuracy': tensor(0.6063)}


---

**Измерим качество на тесте:**

**Используем энкодер + MLP:**

In [63]:
test_loader = torch.utils.data.DataLoader(
    dataset=data_test,
    collate_fn=collate_feature_dict,
    shuffle=False,
    batch_size=128,
    num_workers=0,
)

In [64]:
model = InferenceModule(
    torch.nn.Sequential(
        gru,
        torch.nn.Softmax(dim=1),
    ),
    model_out_name="prob",
)

model.eval()

InferenceModule(
  (model): Sequential(
    (0): SequenceToTarget(
      (seq_encoder): RnnSeqEncoder(
        (trx_encoder): TrxEncoder(
          (embeddings): ModuleDict(
            (small_group): NoisyEmbedding(
              250, 16, padding_idx=0
              (dropout): Dropout(p=0, inplace=False)
            )
            (trans_date): NoisyEmbedding(
              800, 16, padding_idx=0
              (dropout): Dropout(p=0, inplace=False)
            )
          )
          (custom_embeddings): ModuleDict(
            (amount_rur): LogScaler()
          )
          (custom_embedding_batch_norm): RBatchNorm(
            (bn): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          )
        )
        (seq_encoder): RnnEncoder(
          (rnn): GRU(33, 256, batch_first=True)
          (reducer): LastStepEncoder()
        )
      )
      (head): Head(
        (model): Sequential(
          (0): Linear(in_features=256, out_features=512, bias=True)

In [65]:
pred = trainer.predict(model, test_loader)

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/askoro/evs-ssl/c8e6762c921441bf8c4b24c386a46b23

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Predicting: |          | 0/? [00:00<?, ?it/s]

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml ExistingExperiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : supervised_baseline_GRU
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/askoro/evs-ssl/c8e6762c921441bf8c4b24c386a46b23
[1;38;5;39mCOMET INFO:[0m   Others:
[1;38;5;39mCOMET INFO:[0m     Created from : pytorch-lightning
[1;38;5;39mCOMET INFO:[0m     Name         : supervised_baseline_GRU
[1;38;5;39mCOMET INFO:[0m   Parameters:
[1;38;5;39mCOMET INFO:[0m     test_batch_size   : None
[1;38;5;39mCOMET INFO:[0m     test_drop_last    : False
[1;38;5;39mCOMET INFO:[0m     test_num_workers  : None
[1;38;5;39mCOME

In [66]:
pred = pd.concat(pred, axis=0)

In [67]:
pred

Unnamed: 0,client_id,target,prob_0000,prob_0001,prob_0002,prob_0003
0,14,0,0.389151,0.125057,0.234248,0.251544
1,19,1,0.010747,0.899534,0.000503,0.089217
2,64,3,0.055237,0.546257,0.003298,0.395207
3,78,2,0.052738,0.001812,0.941807,0.003644
4,105,2,0.485259,0.053146,0.281959,0.179635
...,...,...,...,...,...,...
51,49955,0,0.554032,0.047716,0.081631,0.316621
52,49963,3,0.566801,0.025495,0.234018,0.173687
53,49988,1,0.015501,0.848153,0.001224,0.135122
54,49995,2,0.170188,0.004173,0.811294,0.014345


In [68]:
y_pred = pred[[f"prob_{i:04d}" for i in range(4)]].values.argmax(axis=1)
y_pred

array([0, 1, 1, ..., 1, 2, 1])

In [69]:
pred

Unnamed: 0,client_id,target,prob_0000,prob_0001,prob_0002,prob_0003
0,14,0,0.389151,0.125057,0.234248,0.251544
1,19,1,0.010747,0.899534,0.000503,0.089217
2,64,3,0.055237,0.546257,0.003298,0.395207
3,78,2,0.052738,0.001812,0.941807,0.003644
4,105,2,0.485259,0.053146,0.281959,0.179635
...,...,...,...,...,...,...
51,49955,0,0.554032,0.047716,0.081631,0.316621
52,49963,3,0.566801,0.025495,0.234018,0.173687
53,49988,1,0.015501,0.848153,0.001224,0.135122
54,49995,2,0.170188,0.004173,0.811294,0.014345


In [70]:
y_true = pred["target"].values
y_true

array([0, 1, 3, ..., 1, 2, 1])

In [71]:
y_proba = pred[[f"prob_{i:04d}" for i in range(4)]].values
y_proba

array([[3.89150858e-01, 1.25056699e-01, 2.34248325e-01, 2.51544178e-01],
       [1.07466495e-02, 8.99533927e-01, 5.02846786e-04, 8.92166048e-02],
       [5.52371964e-02, 5.46256959e-01, 3.29845631e-03, 3.95207435e-01],
       ...,
       [1.55009460e-02, 8.48152637e-01, 1.22448208e-03, 1.35121912e-01],
       [1.70188412e-01, 4.17276192e-03, 8.11293662e-01, 1.43451355e-02],
       [1.22525021e-01, 5.59343278e-01, 2.36416478e-02, 2.94490039e-01]],
      dtype=float32)

In [72]:
from sklearn.metrics import accuracy_score, roc_auc_score

In [73]:
print("Accuracy:", accuracy_score(y_true, y_pred))
print("ROC-AUC:", roc_auc_score(y_true, y_proba, average="weighted", multi_class="ovr"))

Accuracy: 0.6063333333333333
ROC-AUC: 0.8490023805359512


In [75]:
# arr = np.array([0.8460593128034952, 0.8485583055917324, 0.8490023805359512])

# arr.mean(), arr.std()

(0.8478733329770596, 0.0012954542399211907)

- GRU + 2layer MLP Head:
  - `Accuracy: 0.6043333333333333`, `0.609`, `0.6063333333333333`, avg: `0.6066 +- 0.0019`
  - `ROC-AUC:  0.8460593128034952`, `0.8485583055917324`, `0.8490023805359512`, avg: `0.8479 +- 0.0013`

# Self-Supervised Case. COLES, CPC, GPT.

**Данные:**

In [7]:
path_data = "https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/transactions_train.csv.gz?download=true"
data = pd.read_csv(path_data, compression="gzip")
data

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,33172,6,4,71.463
1,33172,6,35,45.017
2,33172,8,11,13.887
3,33172,9,11,15.983
4,33172,10,11,21.341
...,...,...,...,...
26450572,43300,727,25,7.602
26450573,43300,727,15,3.709
26450574,43300,727,1,6.448
26450575,43300,727,11,24.669


In [8]:
path_target = "https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/train_target.csv?download=true"
target = pd.read_csv(path_target)
target

Unnamed: 0,client_id,bins
0,24662,2
1,1046,0
2,34089,2
3,34848,1
4,47076,3
...,...,...
29995,14303,1
29996,22301,2
29997,25731,0
29998,16820,3


In [9]:
target_train, target_test = train_test_split(target, test_size=0.1, stratify=target["bins"], random_state=42)

In [10]:
trx_data_train = pd.merge(data, target_train["client_id"], on="client_id", how="inner")
trx_data_test = pd.merge(data, target_test["client_id"], on="client_id", how="inner")

---

**Квантизация непрерывных признаков (опциональный шаг, нужен только для GPT):**

In [11]:
def digitize(input_array: np.array, q_count: int = 1, bins: np.array = None):
    """Quantile-based discretization function.

    Parameters:
    -------
    input_array (np.array): Input array.
    q_count (int): Amount of quantiles. Used only if input parameter `bins` is None.
    bins (np.array):
        If None, then calculate bins as quantiles of input array,
        otherwise only apply bins to input_array. Default: None

    Returns
    -------
    out_array (np.array of ints): discretized input_array
    bins (np.array of floats):
        Returned only if input parameter `bins` is None.
    """

    if bins is None:
        return_bins = True
        bins = np.quantile(input_array, q=[i / q_count for i in range(1, q_count)], axis=0)
    else:
        return_bins = False

    out_array = np.digitize(input_array, bins)

    if return_bins:
        return out_array, bins
    else:
        return out_array

In [12]:
BINS_NUM = 128

In [13]:
numeric_features = ["amount_rur"]

for feat in numeric_features:
    trx_data_train[feat], bins = digitize(trx_data_train[feat], q_count=BINS_NUM)
    trx_data_test[feat] = digitize(trx_data_test[feat], bins=bins)

In [14]:
import gc

gc.collect()

147

---

In [15]:
preprocessor = PandasDataPreprocessor(
    col_id="client_id",
    col_event_time="trans_date",
    event_time_transformation="none",
    cols_category=["small_group"],
    cols_numerical=["amount_rur"],
    return_records=False,
)

In [16]:
data_train = preprocessor.fit_transform(trx_data_train)
data_test = preprocessor.transform(trx_data_test)

In [17]:
target_train.rename(columns={"bins": "target"}, inplace=True)
target_test.rename(columns={"bins": "target"}, inplace=True)
target_train.sort_values(by="client_id", inplace=True)
target_test.sort_values(by="client_id", inplace=True)
target_train = target_train["target"]
target_test = target_test["target"]
target_train.reset_index(drop=True, inplace=True)
target_test.reset_index(drop=True, inplace=True)

In [18]:
data_train = data_train.to_dict(orient="records")
data_test = data_test.to_dict(orient="records")

---

**Train sequences lengths check:**

In [12]:
trx_encoder_params = dict(
    embeddings_noise=0.003,
    numeric_values={"amount_rur": "log"},
    embeddings={
        "trans_date": {"in": 800, "out": 16},
        "small_group": {"in": 250, "out": 16},
    },
)

trx_encoder = TrxEncoder(**trx_encoder_params)
trx_encoder.to("cuda")

TrxEncoder(
  (embeddings): ModuleDict(
    (trans_date): NoisyEmbedding(
      800, 16, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
    (small_group): NoisyEmbedding(
      250, 16, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
  )
  (custom_embeddings): ModuleDict(
    (amount_rur): LogScaler()
  )
  (custom_embedding_batch_norm): RBatchNorm(
    (bn): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
)

In [13]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)

trx_encoder.eval()

seq_lens = []

for batch in tqdm(train_loader):
    embeds_batch = trx_encoder(batch.to("cuda"))
    seq_lens += [embeds_batch.seq_lens.detach().cpu().numpy()]

seq_lens = np.concatenate(seq_lens)

threshold = int(np.quantile(seq_lens, 0.75) * 0.7)

print("Max Length:", threshold)

211it [00:01, 110.61it/s]

Max Length: 683





---

- **COLES:**

In [15]:
seed_everything(42)

**DataLoaders:**

In [16]:
data = PtlsDataModule(
    train_data=ColesDataset(
        MemoryMapDataset(
            data=data_train,
            i_filters=[SeqLenFilter(min_seq_len=30)],
        ),
        splitter=SampleSlices(
            split_count=5,
            cnt_min=30,
            cnt_max=683,
        ),
    ),
    train_num_workers=4,
    train_batch_size=128,
    valid_data=ColesDataset(
        MemoryMapDataset(
            data=data_test,
            i_filters=[SeqLenFilter(min_seq_len=30)],
        ),
        splitter=SampleSlices(
            split_count=5,
            cnt_min=30,
            cnt_max=683,
        ),
    ),
    valid_num_workers=4,
    valid_batch_size=128
)

**Модель:**

In [17]:
N_EPOCHS = 20

In [18]:
trx_encoder_params = dict(
    embeddings_noise=0.003,
    numeric_values={"amount_rur": "log"},
    embeddings={
        "trans_date": {"in": 800, "out": 16},
        "small_group": {"in": 250, "out": 16},
    },
)

seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(**trx_encoder_params),
    hidden_size=512,
    type="gru",
)

coles = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=3e-3, weight_decay=5e-4),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.CosineAnnealingLR, T_max=N_EPOCHS, eta_min=1e-6)
)

**Обучение:**

In [19]:
logger = CometLogger(project_name="EvS_SSL", experiment_name="CoLES_Baseline")

trainer = pl.Trainer(
    logger=logger,
    max_epochs=N_EPOCHS,
    accelerator="gpu",
    devices=1,
    enable_progress_bar=True
)

In [20]:
trainer.fit(coles, data)

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/askoro/evs-ssl/1a354b21f2d9468e8f84118fc5d391c3

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/kaggle/working' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : CoLES_Baseline
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/askoro/evs-ssl/1a354b21f2d9468e8f84118fc5d391c3
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     loss [506]              : (77.82167053222656, 2053.733642578125)
[1;38;5;39mCOMET INFO:[0m     seq_len [84]            : (343.0, 371.1203308105469)
[1;38;5;39mCOMET INFO:[0m     valid/recall_top_k [20] : (0.24035994708538055, 0.9482259154319763)
[1;38;5;39mCOMET INFO:[0m   Others:
[1;38;5;39mCOMET INFO:[0m     Name : CoLE

In [21]:
trainer.logged_metrics

{'loss': tensor(89.2545),
 'seq_len': tensor(341.5350),
 'valid/recall_top_k': tensor(0.9482)}

In [108]:
torch.save(seq_encoder.state_dict(), "coles_enc_baseline.pt")

**Измерим качество на тесте (catboost поверх эмбеддингов):**

In [19]:
# !wget "https://drive.google.com/uc?export=download&id=1Mn8o9IPT4Zzg3946orbw1MVZwpkrBoNb" -O "coles_enc_baseline.pt"

  pid, fd = os.forkpty()


--2025-01-18 21:18:00--  https://drive.google.com/uc?export=download&id=1Mn8o9IPT4Zzg3946orbw1MVZwpkrBoNb
Resolving drive.google.com (drive.google.com)... 74.125.197.138, 74.125.197.102, 74.125.197.113, ...
Connecting to drive.google.com (drive.google.com)|74.125.197.138|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://drive.usercontent.google.com/download?id=1Mn8o9IPT4Zzg3946orbw1MVZwpkrBoNb&export=download [following]
--2025-01-18 21:18:01--  https://drive.usercontent.google.com/download?id=1Mn8o9IPT4Zzg3946orbw1MVZwpkrBoNb&export=download
Resolving drive.usercontent.google.com (drive.usercontent.google.com)... 142.251.188.132, 2607:f8b0:400e:c1b::84
Connecting to drive.usercontent.google.com (drive.usercontent.google.com)|142.251.188.132|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3435234 (3.3M) [application/octet-stream]
Saving to: 'coles_enc_baseline.pt'


2025-01-18 21:18:04 (208 MB/s) - 'coles_enc_baseline.p

In [22]:
encoder = coles.seq_encoder

# state_dict = torch.load("./coles_enc_baseline.pt")
# encoder.load_state_dict(state_dict)

device = "cuda:0"

encoder.to(device)

RnnSeqEncoder(
  (trx_encoder): TrxEncoder(
    (embeddings): ModuleDict(
      (trans_date): NoisyEmbedding(
        800, 16, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (small_group): NoisyEmbedding(
        250, 16, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
    )
    (custom_embeddings): ModuleDict(
      (amount_rur): LogScaler()
    )
    (custom_embedding_batch_norm): RBatchNorm(
      (bn): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (seq_encoder): RnnEncoder(
    (rnn): GRU(33, 512, batch_first=True)
    (reducer): LastStepEncoder()
  )
)

In [23]:
from tqdm import tqdm

seed_everything(42)

In [24]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)
encoder.eval()
train_embeds = None

for i, batch in tqdm(enumerate(train_loader)):
    train_embeds_batch = encoder(batch.to(device))
    if i == 0:
        train_embeds = train_embeds_batch.detach().cpu().numpy()
    else:
        train_embeds = np.concatenate([train_embeds, train_embeds_batch.detach().cpu().numpy()], axis=0)
    
train_embeds

211it [00:23,  8.96it/s]


array([[-0.17817634, -0.17045805, -0.83675903, ...,  0.2965921 ,
        -0.9525774 ,  0.2203211 ],
       [-0.35388613, -0.18615054, -0.8089495 , ..., -0.6524676 ,
        -0.9561562 ,  0.9104663 ],
       [ 0.10568832, -0.31760812,  0.49652702, ...,  0.21620698,
        -0.964913  ,  0.76185   ],
       ...,
       [-0.04757632, -0.25627714, -0.83458734, ...,  0.6351226 ,
        -0.9485299 ,  0.8719854 ],
       [ 0.02054986, -0.2708085 ,  0.0880736 , ...,  0.2886468 ,
        -0.9745003 ,  0.47248772],
       [-0.05113328, -0.17973788, -0.2582745 , ...,  0.3399484 ,
        -0.882377  ,  0.97562635]], dtype=float32)

In [25]:
test_loader = inference_data_loader(data_test, num_workers=0, batch_size=128)
encoder.eval()
test_embeds = None

for i, batch in tqdm(enumerate(test_loader)):
    test_embeds_batch = encoder(batch.to(device))
    if i == 0:
        test_embeds = test_embeds_batch.detach().cpu().numpy()
    else:
        test_embeds = np.concatenate([test_embeds, test_embeds_batch.detach().cpu().numpy()], axis=0)
    
test_embeds

24it [00:02,  9.70it/s]


array([[-0.12533934, -0.15637873, -0.6890398 , ...,  0.7300011 ,
        -0.9683216 ,  0.79160887],
       [-0.17130113, -0.1868576 , -0.37100157, ...,  0.2541848 ,
        -0.97115856, -0.49559367],
       [-0.32793483, -0.36313677, -0.3895914 , ...,  0.27171615,
        -0.95172656, -0.60894823],
       ...,
       [-0.25869262, -0.14232087, -0.86704046, ...,  0.23839246,
        -0.9836158 ,  0.7143528 ],
       [-0.1018889 , -0.20139463, -0.3132478 , ..., -0.29278672,
        -0.9688057 ,  0.89167887],
       [-0.41335717, -0.31795862, -0.3127626 , ...,  0.28277287,
        -0.95193994,  0.7273838 ]], dtype=float32)

In [26]:
clf = CatBoostClassifier(loss_function='MultiClass', task_type="GPU", devices='0', random_state=42)

clf.fit(train_embeds, target_train, plot_file="catboost_log.html")

Learning rate set to 0.12714
0:	learn: 1.3044036	total: 14.2s	remaining: 3h 55m 46s
1:	learn: 1.2439313	total: 14.2s	remaining: 1h 57m 51s
2:	learn: 1.1957510	total: 14.2s	remaining: 1h 18m 33s
3:	learn: 1.1577488	total: 14.2s	remaining: 58m 53s
4:	learn: 1.1252942	total: 14.2s	remaining: 47m 6s
5:	learn: 1.0988802	total: 14.2s	remaining: 39m 14s
6:	learn: 1.0765684	total: 14.2s	remaining: 33m 37s
7:	learn: 1.0565689	total: 14.2s	remaining: 29m 25s
8:	learn: 1.0390822	total: 14.2s	remaining: 26m 8s
9:	learn: 1.0235179	total: 14.3s	remaining: 23m 31s
10:	learn: 1.0106487	total: 14.3s	remaining: 21m 22s
11:	learn: 0.9996518	total: 14.3s	remaining: 19m 35s
12:	learn: 0.9893931	total: 14.3s	remaining: 18m 4s
13:	learn: 0.9800731	total: 14.3s	remaining: 16m 47s
14:	learn: 0.9718098	total: 14.3s	remaining: 15m 39s
15:	learn: 0.9639732	total: 14.3s	remaining: 14m 40s
16:	learn: 0.9579295	total: 14.3s	remaining: 13m 48s
17:	learn: 0.9518320	total: 14.3s	remaining: 13m 2s
18:	learn: 0.9459753	t

<catboost.core.CatBoostClassifier at 0x7bf375a201c0>

In [27]:
test_pred = clf.predict(test_embeds)
test_proba = clf.predict_proba(test_embeds)

In [28]:
print("Accuracy:", accuracy_score(target_test, test_pred))
print("ROC-AUC:", roc_auc_score(target_test, test_proba, average="weighted", multi_class="ovr"))

Accuracy: 0.5933333333333334
ROC-AUC: 0.8472952867923927


In [30]:
# arr = np.array([0.8490542004456147, 0.848260886697585, 0.8472952867923927])

# arr.mean(), arr.std()

(0.8482034579785309, 0.0007192208067076708)

- COLES embeds + Catboost:
  - `Accuracy: 0.6133333333333333`, `0.606`, `0.5933333333333334`, avg: `0.6042 +- 0.0083`
  -  `ROC-AUC: 0.8490542004456147`, `0.848260886697585`, `0.8472952867923927`, avg: `0.8482 +- 0.0007`

---

**Train sequences lengths check:**

In [108]:
trx_encoder_params = dict(
    embeddings_noise=0.003,
    numeric_values={"amount_rur": "log"},
    embeddings={
        "trans_date": {"in": 800, "out": 16},
        "small_group": {"in": 250, "out": 16}
    }
)

trx_encoder = TrxEncoder(**trx_encoder_params)
trx_encoder.to("cuda")

TrxEncoder(
  (embeddings): ModuleDict(
    (trans_date): NoisyEmbedding(
      800, 16, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
    (small_group): NoisyEmbedding(
      250, 16, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
  )
  (custom_embeddings): ModuleDict(
    (amount_rur): LogScaler()
  )
  (custom_embedding_batch_norm): RBatchNorm(
    (bn): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
)

In [110]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)

trx_encoder.eval()

seq_lens = []

for batch in tqdm(train_loader):
    embeds_batch = trx_encoder(batch.to("cuda"))
    seq_lens += [embeds_batch.seq_lens.detach().cpu().numpy()]

seq_lens = np.concatenate(seq_lens)

threshold = int(np.quantile(seq_lens, 0.6))

print("Max Length:", threshold)

211it [00:01, 120.11it/s]

Max Length: 904





---

- **CPC modeling:**

In [141]:
seed_everything(42)

**DataLoaders:**

In [142]:
data = PtlsDataModule(
    train_data=CpcDataset(
        MemoryMapDataset(data=data_train),
        min_len=863, # 1200
        max_len=904 # 1400
    ),
    train_num_workers=4,
    train_batch_size=64,
    valid_data=CpcDataset(
        MemoryMapDataset(data=data_test),
        min_len=863,
        max_len=904
    ),
    valid_num_workers=4,
    valid_batch_size=64
)

**Модель:**

In [143]:
N_EPOCHS = 20

In [144]:
trx_encoder_params = dict(
    embeddings_noise=0.003,
    numeric_values={"amount_rur": "log"},
    embeddings={
        "trans_date": {"in": 800, "out": 128},
        "small_group": {"in": 250, "out": 128}
    }
)

seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(**trx_encoder_params),
    hidden_size=512,
    type="gru"
)

cpc = CpcModule(
    seq_encoder=seq_encoder,
    n_forward_steps=6,
    n_negatives=40,
    optimizer_partial=partial(torch.optim.Adam, lr=2e-3),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=5, gamma=0.5)
)

**Обучение:**

In [145]:
logger = CometLogger(project_name="EvS_SSL", experiment_name="CPC_modeling_baseline (emb_dim=128 + Aug)")

trainer = pl.Trainer(
    logger=logger,
    max_epochs=N_EPOCHS,
    accelerator="gpu",
    devices=1,
    enable_progress_bar=True
)

In [146]:
trainer.fit(cpc, data)

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/askoro/evs-ssl/b2b25a15977647b1b95a6fb379a23aee

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/kaggle/working' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : CPC_modeling_baseline (emb_dim=128 + Aug)
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/askoro/evs-ssl/b2b25a15977647b1b95a6fb379a23aee
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     loss [1012]             : (0.3524034023284912, 5.786584377288818)
[1;38;5;39mCOMET INFO:[0m     seq_len [168]           : (809.984375, 853.265625)
[1;38;5;39mCOMET INFO:[0m     valid/cpc_accuracy [20] : (0.8953893184661865, 0.9230080246925354)
[1;38;5;39mCOMET INFO:[0m   Others:
[1;38;5;39mCOMET 

In [147]:
trainer.logged_metrics

{'loss': tensor(0.4645),
 'seq_len': tensor(827.2322),
 'valid/cpc_accuracy': tensor(0.9227)}

In [22]:
torch.save(seq_encoder.state_dict(), "cpc_enc_baseline.pt")

**Измерим качество на тесте (catboost поверх эмбеддингов):**

In [15]:
# !wget "https://drive.google.com/uc?export=download&id=1zQrxKt1VNZBlDVFsfJ6bydi6Hxdyvhnk" -O "cpc_enc_baseline.pt"

--2025-04-10 12:30:36--  https://drive.google.com/uc?export=download&id=1zQrxKt1VNZBlDVFsfJ6bydi6Hxdyvhnk
Resolving drive.google.com (drive.google.com)... 108.177.127.102, 108.177.127.138, 108.177.127.139, ...
Connecting to drive.google.com (drive.google.com)|108.177.127.102|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://drive.usercontent.google.com/download?id=1zQrxKt1VNZBlDVFsfJ6bydi6Hxdyvhnk&export=download [following]
--2025-04-10 12:30:36--  https://drive.usercontent.google.com/download?id=1zQrxKt1VNZBlDVFsfJ6bydi6Hxdyvhnk&export=download
Resolving drive.usercontent.google.com (drive.usercontent.google.com)... 142.250.145.132, 2a00:1450:4013:c14::84
Connecting to drive.usercontent.google.com (drive.usercontent.google.com)|142.250.145.132|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3435202 (3.3M) [application/octet-stream]
Saving to: ‘cpc_enc_baseline.pt’


2025-04-10 12:30:38 (87.5 MB/s) - ‘cpc_enc_baseline.

In [148]:
encoder = cpc.seq_encoder

# state_dict = torch.load("./cpc_enc_baseline.pt")
# encoder.load_state_dict(state_dict)

device = "cuda:0"

encoder.to(device)

RnnSeqEncoder(
  (trx_encoder): TrxEncoder(
    (embeddings): ModuleDict(
      (trans_date): NoisyEmbedding(
        800, 128, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (small_group): NoisyEmbedding(
        250, 128, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
    )
    (custom_embeddings): ModuleDict(
      (amount_rur): LogScaler()
    )
    (custom_embedding_batch_norm): RBatchNorm(
      (bn): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (seq_encoder): RnnEncoder(
    (rnn): GRU(257, 512, batch_first=True)
    (reducer): LastStepEncoder()
  )
)

In [149]:
encoder.seq_encoder.is_reduce_sequence = True

In [150]:
from tqdm import tqdm

seed_everything(42)

In [151]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)
encoder.eval()
train_embeds = None

for i, batch in tqdm(enumerate(train_loader)):
    train_embeds_batch = encoder(batch.to(device))
    if i == 0:
        train_embeds = train_embeds_batch.detach().cpu().numpy()
    else:
        train_embeds = np.concatenate([train_embeds, train_embeds_batch.detach().cpu().numpy()], axis=0)
    
train_embeds

211it [00:26,  7.88it/s]


array([[-0.09515778, -0.19623387, -0.0902167 , ...,  0.04302204,
         0.04714075,  0.05476538],
       [-0.01169488, -0.54726136, -0.01871036, ..., -0.08687544,
         0.23133066,  0.2229286 ],
       [-0.02025112,  0.12419871, -0.01704231, ...,  0.25598714,
         0.33596143,  0.03834539],
       ...,
       [-0.01866702, -0.39978692,  0.04923148, ...,  0.1922325 ,
         0.13169257,  0.0120459 ],
       [ 0.12581141,  0.2830607 ,  0.05576389, ...,  0.27144635,
         0.01966095, -0.26230395],
       [-0.12906437,  0.08694834, -0.00654481, ..., -0.02849863,
         0.13034701,  0.05235801]], dtype=float32)

In [152]:
test_loader = inference_data_loader(data_test, num_workers=0, batch_size=128)
encoder.eval()
test_embeds = None

for i, batch in tqdm(enumerate(test_loader)):
    test_embeds_batch = encoder(batch.to(device))
    if i == 0:
        test_embeds = test_embeds_batch.detach().cpu().numpy()
    else:
        test_embeds = np.concatenate([test_embeds, test_embeds_batch.detach().cpu().numpy()], axis=0)
    
test_embeds

24it [00:02,  8.54it/s]


array([[ 0.024001  ,  0.48319033,  0.01028331, ...,  0.19655219,
         0.0243271 ,  0.01357437],
       [ 0.29557106, -0.08488458, -0.06641911, ..., -0.04938169,
         0.47431237, -0.15810339],
       [-0.00370976,  0.18468584, -0.08404164, ...,  0.18840028,
        -0.03557061, -0.02538466],
       ...,
       [-0.08828094,  0.00857254, -0.09857324, ...,  0.08241299,
         0.0461966 ,  0.01168595],
       [-0.01203613,  0.04344815, -0.08613248, ...,  0.06646195,
        -0.0212103 , -0.04466537],
       [-0.02131239,  0.19283262, -0.12974751, ...,  0.18286301,
        -0.03398557, -0.00868156]], dtype=float32)

In [153]:
clf = CatBoostClassifier(loss_function='MultiClass', task_type="GPU", devices='0', random_state=42)

clf.fit(train_embeds, target_train, plot_file="catboost_log.html")

Learning rate set to 0.12714
0:	learn: 1.3136924	total: 14.3ms	remaining: 14.2s
1:	learn: 1.2570810	total: 24.2ms	remaining: 12.1s
2:	learn: 1.2146091	total: 34.6ms	remaining: 11.5s
3:	learn: 1.1801316	total: 44.8ms	remaining: 11.2s
4:	learn: 1.1514737	total: 55ms	remaining: 10.9s
5:	learn: 1.1274180	total: 65.1ms	remaining: 10.8s
6:	learn: 1.1085867	total: 74.8ms	remaining: 10.6s
7:	learn: 1.0911727	total: 84.8ms	remaining: 10.5s
8:	learn: 1.0758684	total: 94.4ms	remaining: 10.4s
9:	learn: 1.0620595	total: 104ms	remaining: 10.3s
10:	learn: 1.0502406	total: 114ms	remaining: 10.3s
11:	learn: 1.0406780	total: 124ms	remaining: 10.2s
12:	learn: 1.0313011	total: 134ms	remaining: 10.2s
13:	learn: 1.0228333	total: 144ms	remaining: 10.2s
14:	learn: 1.0157925	total: 153ms	remaining: 10.1s
15:	learn: 1.0096299	total: 163ms	remaining: 10s
16:	learn: 1.0030267	total: 173ms	remaining: 10s
17:	learn: 0.9971915	total: 183ms	remaining: 9.99s
18:	learn: 0.9912467	total: 194ms	remaining: 10s
19:	learn: 

<catboost.core.CatBoostClassifier at 0x79edf5578b20>

In [154]:
test_pred = clf.predict(test_embeds)
test_proba = clf.predict_proba(test_embeds)

In [155]:
print("Accuracy:", accuracy_score(target_test, test_pred))
print("ROC-AUC:", roc_auc_score(target_test, test_proba, average="weighted", multi_class="ovr"))

Accuracy: 0.5826666666666667
ROC-AUC: 0.8343491131233265


In [157]:
# arr = np.array([0.830123007110738, 0.8271157616313021, 0.8343491131233265])

# arr.mean(), arr.std()

(0.8305292939551222, 0.0029669451406828876)

- CPC context embeds + Catboost (dims of trx embeds: 16 + 16 + 1):
  - `Accuracy: 0.5383333333333333`, `0.539`, `0.5476666666666666`, avg: `0.5417 +- 0.0043`
  - `ROC-AUC: 0.8029432123764916`, `0.7979622542369874`, `0.8024217795805104`,avg: `0.8011 +- 0.0022`

---

- CPC context embeds + Catboost (dims of trx embeds: 128 + 128 + 1 (as in ptls-experiments)):
  - `Accuracy: 0.5713333333333334`, `0.5783333333333334`, `0.585`, avg: `0.5782 +- 0.0056`
  - ` ROC-AUC: 0.823529840055776`, `0.8296522520400083`, `0.8290217057808127`, avg: `0.8274 +- 0.0027`

**Вывод:** при использовании эмбеддингов транзакций более высокой размерности получаем гораздо более высокое качество => будем их использовать.

---

- CPC context embeds (w/ Aug) + Catboost (dims of trx embeds: 128 + 128 + 1):
   - `Accuracy: 0.5773333333333334`, `0.5686666666666667`, `0.5826666666666667`, avg: `0.5762 +- 0.0058`
   - ` ROC-AUC: 0.830123007110738`, `0.8271157616313021`, `0.8343491131233265`, avg: `0.8305 +- 0.003`

**Вывод:** при использовании аугментаций немного просел accuracy, при этом существенно вырос ROC-AUC => будем использовать аугментации.

---

- **GPT:**

In [42]:
seed_everything(17)

**DataLoaders:**

In [43]:
data = PtlsDataModule(
    train_data=GptDataset(
        MemoryMapDataset(data=data_train),
        min_len=1000,
        max_len=1200
    ),
    train_num_workers=4,
    train_batch_size=16,
    valid_data=GptDataset(
        MemoryMapDataset(data=data_test),
        min_len=1000,
        max_len=1200
    ),
    valid_num_workers=4,
    valid_batch_size=16
)

**Модель:**

In [44]:
from torchmetrics import MeanMetric
from typing import Tuple, Dict, List, Union
from torch import nn
from ptls.nn.seq_encoder.abs_seq_encoder import AbsSeqEncoder
from ptls.nn import PBL2Norm
from ptls.data_load.padded_batch import PaddedBatch


class MeanPooling(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, pb: PaddedBatch):
        payload = pb.payload # (B, T, H)
        mask = pb.seq_len_mask.bool()
        pb_mean = payload.sum(dim=1) / mask.float().sum(dim=1, keepdim=True)
        return pb_mean


class StatPooling(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, pb: PaddedBatch):
        payload = pb.payload # (B, T, H)
        mask = pb.seq_len_mask.bool()
        inf_mask = torch.zeros_like(mask, device=mask.device).float()
        inf_mask[~mask] = -torch.inf
        
        pb_mean = payload.sum(dim=1) / mask.float().sum(dim=1, keepdim=True)
        pb_max = torch.max(payload + inf_mask.unsqueeze(-1), dim=1)[0]
        pb_stat = torch.cat((pb_mean, pb_max), dim=1)
        return pb_stat


class GPTHead(torch.nn.Module):   
    def __init__(self, input_size, n_classes, hidden_size=64, drop_p=0.1):
        super().__init__()
        self.head = nn.Sequential(
            nn.Linear(input_size, hidden_size, bias=True),
            nn.GELU(),
            nn.Dropout(drop_p),
            nn.Linear(hidden_size, n_classes)
        )
    def forward(self, x):
        x = self.head(x)
        return x


class GptPretrainModule(pl.LightningModule):
    """GPT2 Language model

    Sequence transactions are encoded by `trx_encoder`.
    Then `seq_encoder` encodes the given sequence 
    (we actually use NN to modify sequence transactions representations,
    then (during inference) we calculate the mean of these encoded transactions to get the representation of the whole sequence).
    After this we use heads to predict the classes of features of the future transaction.

    Parameters
    ----------
    trx_encoder:
        Module for transform dict with feature sequences to sequence of transaction representations
    seq_encoder:
        Module for sequence processing. Generally this is transformer based encoder. Rnn is also possible
        Should work without sequence reduction
    head_hidden_size:
        Hidden size of heads for feature prediction
    seed_seq_len:
         Size of starting sequence without loss 
    total_steps:
        total_steps expected in OneCycle lr scheduler
    max_lr:
        max_lr of OneCycle lr scheduler
    weight_decay:
        weight_decay of Adam optimizer
    pct_start:
        % of total_steps when lr increase
    norm_predict:
        use l2 norm for transformer output or not
    """

    def __init__(self,
                 trx_encoder: torch.nn.Module,
                 seq_encoder: AbsSeqEncoder,
                 head_hidden_size: int = 64,
                 total_steps: int = 64000,
                 seed_seq_len: int = 16,
                 max_lr: float = 0.00005,
                 weight_decay: float = 0.0,
                 pct_start: float = 0.1,
                 norm_predict: bool = False
                 ):

        super().__init__()
        self.save_hyperparameters(ignore=['trx_encoder', 'seq_encoder'])

        self.trx_encoder = trx_encoder
        self._seq_encoder = seq_encoder
        self._seq_encoder.is_reduce_sequence = False

        self.head = nn.ModuleDict()
        for col_name, noisy_emb in self.trx_encoder.embeddings.items():
            self.head[col_name] = GPTHead(input_size=self._seq_encoder.embedding_size, hidden_size=head_hidden_size, n_classes=noisy_emb.num_embeddings)

        if self.hparams.norm_predict:
            self.fn_norm_predict = PBL2Norm()

        self.loss = nn.CrossEntropyLoss(ignore_index=0)

        self.train_gpt_loss = MeanMetric()
        self.valid_gpt_loss = MeanMetric()

    def forward(self, batch: PaddedBatch):
        z_trx = self.trx_encoder(batch) 
        out = self._seq_encoder(z_trx)
        if self.hparams.norm_predict:
            out = self.fn_norm_predict(out)
        return out

    def loss_gpt(self, logits, labels):
        loss = 0
        for col_name, head in self.head.items():
            y_pred = head(logits[:, self.hparams.seed_seq_len:-1, :])
            y_pred = y_pred.view(-1, y_pred.size(-1))

            y_true = labels[col_name][:, self.hparams.seed_seq_len+1:]
            y_true = torch.flatten(y_true.long())

            loss += self.loss(y_pred, y_true)
            
        return loss

    def training_step(self, batch, batch_idx):
        out = self.forward(batch)  # PB: B, T, H
        out = out.payload if isinstance(out, PaddedBatch) else out
        labels = batch.payload

        loss_gpt = self.loss_gpt(out, labels)
        self.train_gpt_loss(loss_gpt)
        self.log(f'loss', loss_gpt, sync_dist=True)
        return loss_gpt

    def validation_step(self, batch, batch_idx):
        out = self.forward(batch)  # PB: B, T, H
        out = out.payload if isinstance(out, PaddedBatch) else out
        labels = batch.payload

        loss_gpt = self.loss_gpt(out, labels)
        self.valid_gpt_loss(loss_gpt)

    def on_training_epoch_end(self):
        self.log('train loss (by epochs)', self.train_gpt_loss, prog_bar=True, logger=True, sync_dist=True, rank_zero_only=True)

    def on_validation_epoch_end(self):
        self.log('val loss (by epochs)', self.valid_gpt_loss, prog_bar=True, logger=True, sync_dist=True, rank_zero_only=True)

    def configure_optimizers(self):
        optim = torch.optim.NAdam(self.parameters(),
                                  lr=self.hparams.max_lr,
                                  weight_decay=self.hparams.weight_decay
                                 )
        
        scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer=optim,
            max_lr=self.hparams.max_lr,
            total_steps=self.hparams.total_steps,
            pct_start=self.hparams.pct_start,
            anneal_strategy='cos',
            cycle_momentum=False,
            div_factor=25.0,
            final_div_factor=10000.0,
            three_phase=False
        )
        
        scheduler = {'scheduler': scheduler, 'interval': 'step'}
        return [optim], [scheduler]
    
    @property
    def seq_encoder(self):
        return GPTInferenceModule(pretrained_model=self)


class GPTInferenceModule(torch.nn.Module):
    def __init__(self, pretrained_model):
        super().__init__()
        self.model = pretrained_model
        self.model.is_reduce_sequence = False
        self.mean_pooling = MeanPooling()
        self.stat_pooling = StatPooling()

    def forward(self, batch, eval_strategy="mean"):
        z_trx = self.model.trx_encoder(batch)
        out = self.model._seq_encoder(z_trx)
        out = out if isinstance(out, PaddedBatch) else PaddedBatch(out, batch.seq_lens)

        if eval_strategy == "mean":
            out = self.mean_pooling(out)
        elif eval_strategy == "stat":
            out = self.stat_pooling(out)

        if self.model.hparams.norm_predict:
            out = out / (out.pow(2).sum(dim=-1, keepdim=True) + 1e-9).pow(0.5)
        return out

In [45]:
N_EPOCHS = 20

In [46]:
trx_encoder = TrxEncoder(
    embeddings_noise=0.003,
    embeddings={
        "trans_date": {"in": 730, "out": 64},
        "small_group": {"in": 204, "out": 64},
        "amount_rur": {"in": BINS_NUM, "out": 64}
    }
)

seq_encoder = GptEncoder(
    n_embd=trx_encoder.output_size,
    n_layer=6,
    n_head=6,
    n_inner=256,
    activation_function="gelu_new",
    resid_pdrop=0.1,
    embd_pdrop=0.1,
    attn_pdrop=0.1,
    n_positions=2048,
    use_positional_encoding=True,
    use_start_random_shift=True,
    is_reduce_sequence=False
)

gpt = GptPretrainModule(
    trx_encoder=trx_encoder,
    seq_encoder=seq_encoder,
    head_hidden_size=256,
    total_steps=(N_EPOCHS * 1688), # num_epochs * num_steps_per_epoch
    seed_seq_len=16,
    max_lr=1e-3,
    weight_decay=0.,
    pct_start=0.1,
    norm_predict=False
)

**Обучение:**

In [47]:
logger = CometLogger(project_name="EvS_SSL", experiment_name="GPT_modeling_baseline")

trainer = pl.Trainer(
    logger=logger,
    max_epochs=N_EPOCHS,
    accelerator="gpu",
    devices=1,
    enable_progress_bar=True
)

In [48]:
trainer.fit(gpt, data)

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/askoro/evs-ssl/63d4139392e542f7bca610d9670909ea

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/kaggle/working' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : GPT_modeling_baseline
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/askoro/evs-ssl/63d4139392e542f7bca610d9670909ea
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     loss [4051]               : (7.2766900062561035, 16.851423263549805)
[1;38;5;39mCOMET INFO:[0m     val loss (by epochs) [20] : (7.723391056060791, 8.166022300720215)
[1;38;5;39mCOMET INFO:[0m   Others:
[1;38;5;39mCOMET INFO:[0m     Name : GPT_modeling_baseline
[1;38;5;39mCOMET INFO:[0m   Parameters:
[1;38;5;39mCOM

In [49]:
trainer.logged_metrics

{'loss': tensor(7.4110), 'val loss (by epochs)': tensor(7.7234)}

In [50]:
encoder = gpt.seq_encoder

In [49]:
torch.save(encoder.state_dict(), "gpt_baseline.pt")

**Измерим качество на тесте (catboost поверх эмбеддингов):**

In [86]:
# !pip install gdown

Collecting gdown
  Downloading gdown-5.2.0-py3-none-any.whl.metadata (5.8 kB)
Downloading gdown-5.2.0-py3-none-any.whl (18 kB)
[0mInstalling collected packages: gdown
Successfully installed gdown-5.2.0


In [102]:
# import gdown

# gdown.download("https://drive.google.com/uc?export=download&id=1YBstN7hpEIREo7zORmPoEZ_0NyBgfjm6", "gpt_baseline_NAdam.pt")

Downloading...
From (original): https://drive.google.com/uc?export=download&id=1YBstN7hpEIREo7zORmPoEZ_0NyBgfjm6
From (redirected): https://drive.google.com/uc?export=download&id=1YBstN7hpEIREo7zORmPoEZ_0NyBgfjm6&confirm=t&uuid=b0f44bc3-b84b-425c-968f-016e419987af
To: /kaggle/working/gpt_baseline_NAdam.pt
100%|██████████| 34.7M/34.7M [00:00<00:00, 83.5MB/s]


'gpt_baseline_NAdam.pt'

In [51]:
# state_dict = torch.load("./gpt_baseline_NAdam.pt")
# encoder.load_state_dict(state_dict)

device = "cuda:0"

encoder.to(device)

GPTInferenceModule(
  (model): GptPretrainModule(
    (trx_encoder): TrxEncoder(
      (embeddings): ModuleDict(
        (trans_date): NoisyEmbedding(
          730, 64, padding_idx=0
          (dropout): Dropout(p=0, inplace=False)
        )
        (small_group): NoisyEmbedding(
          204, 64, padding_idx=0
          (dropout): Dropout(p=0, inplace=False)
        )
        (amount_rur): NoisyEmbedding(
          128, 64, padding_idx=0
          (dropout): Dropout(p=0, inplace=False)
        )
      )
      (custom_embeddings): ModuleDict()
    )
    (_seq_encoder): GptEncoder(
      (transf): GPT2Model(
        (wte): Embedding(4, 192)
        (wpe): Embedding(2048, 192)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-5): 6 x GPT2Block(
            (ln_1): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2SdpaAttention(
              (c_attn): Conv1D(nf=576, nx=192)
              (c_proj): Conv1D(nf=192, nx=192)
    

In [58]:
from tqdm import tqdm

seed_everything(17)

In [59]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=8)
encoder.eval()
train_embeds = None

for i, batch in tqdm(enumerate(train_loader)):
    train_embeds_batch = encoder(batch.to(device), eval_strategy="stat")
    if i == 0:
        train_embeds = train_embeds_batch.detach().cpu().numpy()
    else:
        train_embeds = np.concatenate([train_embeds, train_embeds_batch.detach().cpu().numpy()], axis=0)
    
train_embeds

3375it [01:53, 29.70it/s]


array([[-0.29519898,  0.25641248, -0.4925049 , ...,  1.3123674 ,
         1.7928513 ,  1.0197328 ],
       [-0.21153502,  0.39484537, -0.35638228, ...,  1.4085158 ,
         2.2964575 ,  1.1016012 ],
       [ 0.21439554,  0.01045799,  0.08873235, ...,  1.3627061 ,
         0.9698037 ,  1.4465582 ],
       ...,
       [ 0.03547691,  0.08848654, -0.00928362, ...,  0.99908465,
         1.6528033 ,  1.1190413 ],
       [-0.00346432,  0.00274708, -0.07542911, ...,  1.4833031 ,
         1.2156107 ,  1.1207637 ],
       [-0.08755232,  0.51917505,  0.3621601 , ...,  1.3221061 ,
         1.5000415 ,  1.2646636 ]], dtype=float32)

In [60]:
test_loader = inference_data_loader(data_test, num_workers=0, batch_size=1)
encoder.eval()
test_embeds = None

for i, batch in tqdm(enumerate(test_loader)):
    test_embeds_batch = encoder(batch.to(device), eval_strategy="stat")
    if i == 0:
        test_embeds = test_embeds_batch.detach().cpu().numpy()
    else:
        test_embeds = np.concatenate([test_embeds, test_embeds_batch.detach().cpu().numpy()], axis=0)
    
test_embeds

3000it [00:17, 172.02it/s]


array([[-0.4518735 ,  0.12146452, -0.42963716, ...,  1.192432  ,
         1.8875252 ,  1.1536922 ],
       [-0.13985637,  0.28886008, -0.5857791 , ...,  0.912618  ,
         1.4577489 ,  0.97797096],
       [-0.19743466,  0.37916127, -0.33578315, ...,  1.2584095 ,
         1.801612  ,  1.1429143 ],
       ...,
       [-0.35047498, -0.01143411, -0.37614968, ...,  1.0261779 ,
         1.3968449 ,  0.79167855],
       [ 0.03161003, -0.17694414,  0.00877061, ...,  1.0914028 ,
         1.2848023 ,  1.4630404 ],
       [-0.26207706,  0.31493393, -0.5617984 , ...,  1.1519172 ,
         1.6255326 ,  1.2134286 ]], dtype=float32)

In [61]:
clf = CatBoostClassifier(loss_function='MultiClass', task_type="GPU", devices='0', random_state=17)

clf.fit(train_embeds, target_train, plot_file="catboost_log.html")

Learning rate set to 0.12714
0:	learn: 1.3169670	total: 11.3ms	remaining: 11.3s
1:	learn: 1.2643144	total: 20.5ms	remaining: 10.3s
2:	learn: 1.2224643	total: 29.6ms	remaining: 9.85s
3:	learn: 1.1884925	total: 38.7ms	remaining: 9.65s
4:	learn: 1.1597352	total: 47.6ms	remaining: 9.46s
5:	learn: 1.1345845	total: 56.6ms	remaining: 9.38s
6:	learn: 1.1125816	total: 65.2ms	remaining: 9.26s
7:	learn: 1.0931971	total: 74.2ms	remaining: 9.19s
8:	learn: 1.0758457	total: 83.2ms	remaining: 9.16s
9:	learn: 1.0610744	total: 91.9ms	remaining: 9.1s
10:	learn: 1.0468143	total: 101ms	remaining: 9.06s
11:	learn: 1.0341617	total: 110ms	remaining: 9.03s
12:	learn: 1.0231343	total: 118ms	remaining: 8.98s
13:	learn: 1.0127905	total: 127ms	remaining: 8.95s
14:	learn: 1.0034608	total: 136ms	remaining: 8.91s
15:	learn: 0.9946502	total: 144ms	remaining: 8.88s
16:	learn: 0.9869952	total: 153ms	remaining: 8.86s
17:	learn: 0.9796093	total: 162ms	remaining: 8.84s
18:	learn: 0.9723533	total: 171ms	remaining: 8.82s
19:

<catboost.core.CatBoostClassifier at 0x7a46773cd2d0>

In [62]:
test_pred = clf.predict(test_embeds)
test_proba = clf.predict_proba(test_embeds)

In [63]:
print("Accuracy:", accuracy_score(target_test, test_pred))
print("ROC-AUC:", roc_auc_score(target_test, test_proba, average="weighted", multi_class="ovr"))

Accuracy: 0.608
ROC-AUC: 0.8510174881181259


In [67]:
# arr = np.array([0.8489362388258066, 0.8510174881181259, 0.8513202652787633])

# arr.mean(), arr.std()

(0.8504246640742319, 0.0010597093288140884)

- GPT embeds + Catboost, MeanPooling at inference:
    - `Accuracy: 0.6066666666666667`, `0.6246666666666667`, `0.6123333333333333`, avg: `0.6146 +- 0.0075`
    - `ROC-AUC: 0.8479316455892264`, `0.8544376474342738`, `0.8537725354907774`, avg: `0.852 +- 0.0029`

---

- GPT embeds + Catboost, StatPooling (Mean + Max) at inference:
    - `Accuracy: 0.6086666666666667`, `0.608`, `0.6123333333333333`, avg: `0.61 +- 0.0019`
    - `ROC-AUC: 0.8489362388258066`, `0.8510174881181259`, `0.8513202652787633`, avg: `0.8504 +- 0.001`

**Вывод:** по метрикам конфигурация с MeanPooling лучше => будем использовать MeanPooling на этапе инференса.

# Итоги.

| Method                  |    Accuracy       | ROC-AUC           |
|-------------------------|-------------------|-------------------|
| **Flattened Sequences** | 0.4921 ± 0.005    | 0.76 ± 0.0012     |
| **GRU (+ MLP)**         | 0.6066 ± 0.0019   | 0.8479 ±  0.0013  |
| **CoLES**               | 0.6042 ± 0.0083   | 0.8482 ± 0.0007   |
| **CPC Modeling**        | 0.5762 ± 0.0058   | 0.8305 ± 0.003    |
| **GPT2**                | 0.6146 ± 0.0075   | 0.852 ± 0.0029    |