# Импортируем необходимые библиотеки

In [1]:
!pip install pytorch-lifestream
!pip install comet_ml

Collecting pytorch-lifestream
  Downloading pytorch-lifestream-0.6.0.tar.gz (163 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.4/163.4 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting hydra-core>=1.1.2 (from pytorch-lifestream)
  Downloading hydra_core-1.3.2-py3-none-any.whl.metadata (5.5 kB)
Downloading hydra_core-1.3.2-py3-none-any.whl (154 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.5/154.5 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pytorch-lifestream
  Building wheel for pytorch-lifestream (pyproject.toml) ... [?25l[?25hdone
  Created wheel for pytorch-lifestream: filename=pytorch_lifestream-0.6.0-py3-none-any.whl size=274639 sha256=741edd72cf700a291ccbcfb05baf22c15fd61f7a2b8494eadf981e9ceaa7ca56
 

In [2]:
# data preprocessing
import os
import numpy as np
import pandas as pd
import pickle

# misc
from tqdm import tqdm
from functools import partial

# logging
import comet_ml

# classical ML
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from catboost import CatBoostClassifier

# basic deep learning libs
import torch
import pytorch_lightning as pl
import torchmetrics

# ptls
from ptls.nn import TrxEncoder, RnnSeqEncoder, TransformerEncoder, GptEncoder, Head
from ptls.frames import PtlsDataModule
from ptls.frames.coles import CoLESModule
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames.cpc import CpcModule
from ptls.frames.cpc import CpcDataset
from ptls.frames.gpt import GptDataset
from ptls.frames.supervised import SeqToTargetDataset, SequenceToTarget
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.datasets import inference_data_loader
from ptls.frames.inference_module import InferenceModule
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.preprocessing import PandasDataPreprocessor
from ptls.data_load.utils import collate_feature_dict
from ptls.frames.inference_module import InferenceModule

# Supervised-кейс. Бустинг

**Препроцессим данные:**

**Скачаем данные:**

In [3]:
path_data = "https://huggingface.co/datasets/dllllb/rosbank-churn/resolve/main/train.csv.gz?download=true"
data = pd.read_csv(path_data, compression="gzip")
data

Unnamed: 0,PERIOD,cl_id,MCC,channel_type,currency,TRDATETIME,amount,trx_category,target_flag,target_sum
0,01/10/2017,0,5200,,810,21OCT17:00:00:00,5023.00,POS,0,0.0
1,01/10/2017,0,6011,,810,12OCT17:12:24:07,20000.00,DEPOSIT,0,0.0
2,01/12/2017,0,5921,,810,05DEC17:00:00:00,767.00,POS,0,0.0
3,01/10/2017,0,5411,,810,21OCT17:00:00:00,2031.00,POS,0,0.0
4,01/10/2017,0,6012,,810,24OCT17:13:14:24,36562.00,C2C_OUT,0,0.0
...,...,...,...,...,...,...,...,...,...,...
490508,01/04/2017,10176,6011,type1,810,24APR17:14:05:26,600.00,WD_ATM_ROS,1,405.0
490509,01/06/2017,10171,5411,type1,810,06JUN17:00:00:00,132.00,POS,0,0.0
490510,01/02/2017,10167,5541,type1,810,03FEB17:00:00:00,1000.00,POS,1,280428.2
490511,01/06/2017,10163,5941,type1,810,08JUN17:00:00:00,100.00,POS,0,0.0


In [4]:
target = data.groupby(by="cl_id").first().reset_index()[["cl_id", "target_flag"]]
target

Unnamed: 0,cl_id,target_flag
0,0,0
1,1,0
2,5,1
3,9,0
4,10,0
...,...,...
4995,10210,1
4996,10212,0
4997,10213,0
4998,10214,0


In [5]:
data.drop(columns=["PERIOD", "target_flag", "target_sum"], inplace=True)

**Делим таргет на трейн и на тест:**

In [6]:
target_train, target_test = train_test_split(target, test_size=0.1, stratify=target["target_flag"], random_state=42)

**Проводим препроцессинг транзакций (деление на трейн и на тест, подготовка данных под нужный формат):**

In [7]:
trx_data_train = pd.merge(data, target_train["cl_id"], on="cl_id", how="inner")
trx_data_test = pd.merge(data, target_test["cl_id"], on="cl_id", how="inner")

**Дополнительно: предобработка числовых признаков: $x \longrightarrow \text{sign}(x) \cdot \log(1 + |x|)$**

In [8]:
def preprocess_num_features(data, features_list):
    for feature in features_list:
        data[feature] = np.sign(data[feature]) * np.log1p(np.abs(data[feature]))
    return data

In [9]:
trx_data_train = preprocess_num_features(data=trx_data_train, features_list=["amount"])
trx_data_test = preprocess_num_features(data=trx_data_test, features_list=["amount"])

**Предобработка NaN-значений в `channel_type`:**

In [10]:
trx_data_train["channel_type"] = trx_data_train["channel_type"].fillna("none")
trx_data_test["channel_type"] = trx_data_test["channel_type"].fillna("none")

**Обработка даты:**

In [11]:
month2num = {"JAN": "/01/", "FEB": "/02/", "MAR": "/03/", "APR": "/04/", "MAY": "/05/", "JUN": "/06/",
             "JUL": "/07/", "AUG": "/08/", "SEP": "/09/", "OCT": "/10/", "NOV": "/11/", "DEC": "/12/"}

trx_data_train["TRDATETIME"] = trx_data_train["TRDATETIME"].map(lambda x: x[0:2] + month2num[x[2:5]] + x[5:7] + " " + x[8:])
trx_data_test["TRDATETIME"] = trx_data_test["TRDATETIME"].map(lambda x: x[0:2] + month2num[x[2:5]] + x[5:7] + " " + x[8:])

In [12]:
trx_data_train["TRDATETIME"] = pd.to_datetime(trx_data_train["TRDATETIME"],format='%d/%m/%y %H:%M:%S')
trx_data_test["TRDATETIME"] = pd.to_datetime(trx_data_test["TRDATETIME"],format='%d/%m/%y %H:%M:%S')

**Конвертация значений `channel_type` и `trx_category` из str в int:**

In [13]:
chtype2num = {"none": 0, "type1": 1, "type2": 2, "type3": 3, "type4": 4, "type5": 5}

trx_data_train["channel_type"] = trx_data_train["channel_type"].map(lambda x: chtype2num[x])
trx_data_test["channel_type"] = trx_data_test["channel_type"].map(lambda x: chtype2num[x])

In [14]:
trxcat2num = {"POS": 0, "DEPOSIT": 1, "WD_ATM_ROS": 2, "WD_ATM_PARTNER": 3, 
              "C2C_IN": 4, "WD_ATM_OTHER": 5, "C2C_OUT": 6, "BACK_TRX": 7,
              "CAT": 8, "CASH_ADV": 9}

trx_data_train["trx_category"] = trx_data_train["trx_category"].map(lambda x: trxcat2num[x])
trx_data_test["trx_category"] = trx_data_test["trx_category"].map(lambda x: trxcat2num[x])

In [15]:
preprocessor = PandasDataPreprocessor(
    col_id="cl_id",
    col_event_time="TRDATETIME",
    event_time_transformation="dt_to_timestamp",
    cols_category=["MCC", "channel_type", "currency", "trx_category"],
    cols_numerical=["amount"],
    return_records=False,
)

In [16]:
trx_data_train = preprocessor.fit_transform(trx_data_train)
trx_data_test = preprocessor.transform(trx_data_test)

**Конвертируем время из секунд в дни, вычитаем из каждого дня минимальный/самый ранний день:**

In [17]:
trx_data_train["event_time"] //= 86400
trx_data_test["event_time"] //= 86400

In [18]:
min_day_train = np.inf
max_day_train = -np.inf

for idx, row in trx_data_train.iterrows():
    min_day_train = min(min_day_train, row["event_time"].min().item())
    max_day_train = max(max_day_train, row["event_time"].max().item())

min_day_train, max_day_train

(17081, 17623)

In [19]:
trx_data_train["event_time"] -= min_day_train

trx_data_test["event_time"] -= min_day_train

for idx in range(len(trx_data_test)):
    seq_times = torch.clip(trx_data_test.iloc[idx]["event_time"], min=0., max=(max_day_train - min_day_train)).long().unsqueeze(dim=1)
    trx_data_test.loc[idx, "event_time"] = seq_times

In [20]:
target_train.sort_values(by="cl_id", inplace=True)
target_test.sort_values(by="cl_id", inplace=True)

In [21]:
trx_data_train.drop(columns=["cl_id"], inplace=True)
trx_data_test.drop(columns=["cl_id"], inplace=True)
target_train.drop(columns=["cl_id"], inplace=True)
target_test.drop(columns=["cl_id"], inplace=True)

In [22]:
trx_data_train.reset_index(inplace=True, drop=True)
trx_data_test.reset_index(inplace=True, drop=True)
target_train.reset_index(inplace=True, drop=True)
target_test.reset_index(inplace=True, drop=True)

In [23]:
max_seq_length = 0 

for idx, row in trx_data_train.iterrows():
    max_seq_length = max(max_seq_length, row["event_time"].shape[0])

for idx, row in trx_data_test.iterrows():
    max_seq_length = max(max_seq_length, row["event_time"].shape[0])

print("Max Sequence Length:", max_seq_length)

Max Sequence Length: 784


In [24]:
columns = ["event_time", "MCC", "channel_type", "currency", "trx_category", "amount"]

new_tables_columns = []

for i in range(1, max_seq_length + 1):
    for col in columns:
        new_tables_columns.append(col + "_" + str(i))

In [25]:
new_train_table_contents = []

for idx, row in tqdm(trx_data_train.iterrows()):
    new_row = []
    event_time = row["event_time"]
    mcc = row["MCC"]
    ch_type = row["channel_type"]
    currency = row["currency"]
    trx_cat = row["trx_category"]
    amount = row["amount"]
    for j in range(max_seq_length):
        if j < event_time.shape[0]:
            new_row.append(event_time[j].item())
            new_row.append(mcc[j].item())
            new_row.append(ch_type[j].item())
            new_row.append(currency[j].item())
            new_row.append(trx_cat[j].item())
            new_row.append(amount[j].item())
        else:
            for k in range(5):
                new_row.append(-1)
            new_row.append(0.)
    new_train_table_contents.append(new_row)

trx_data_train = pd.DataFrame(data=new_train_table_contents, columns=new_tables_columns)
new_train_table_contents = []

4500it [00:09, 456.79it/s]


In [26]:
new_test_table_contents = []

for idx, row in tqdm(trx_data_test.iterrows()):
    new_row = []
    event_time = row["event_time"]
    mcc = row["MCC"]
    ch_type = row["channel_type"]
    currency = row["currency"]
    trx_cat = row["trx_category"]
    amount = row["amount"]
    for j in range(max_seq_length):
        if j < event_time.shape[0]:
            new_row.append(event_time[j].item())
            new_row.append(int(mcc[j].item()))
            new_row.append(ch_type[j].item())
            new_row.append(currency[j].item())
            new_row.append(trx_cat[j].item())
            new_row.append(amount[j].item())
        else:
            for k in range(5):
                new_row.append(-1)
            new_row.append(0.)
    new_test_table_contents.append(new_row)

trx_data_test = pd.DataFrame(data=new_test_table_contents, columns=new_tables_columns)
new_test_table_contents = []

500it [00:01, 439.37it/s]


In [27]:
cat_features = []

for idx, feature in enumerate(new_tables_columns):
    if idx % 6 != 5:
        cat_features.append(feature)

**Наконец, обучаем бустинг!**

In [46]:
clf = CatBoostClassifier(loss_function='MultiClass', cat_features=cat_features, task_type="GPU", devices='0', random_seed=30)

clf.fit(trx_data_train, target_train, plot_file="catboost_log.html")

Learning rate set to 0.088214
0:	learn: 0.6753076	total: 49.3ms	remaining: 49.2s
1:	learn: 0.6608020	total: 93.5ms	remaining: 46.6s
2:	learn: 0.6482471	total: 139ms	remaining: 46.3s
3:	learn: 0.6368725	total: 192ms	remaining: 47.8s
4:	learn: 0.6263995	total: 237ms	remaining: 47.2s
5:	learn: 0.6175677	total: 281ms	remaining: 46.6s
6:	learn: 0.6094716	total: 327ms	remaining: 46.4s
7:	learn: 0.6032371	total: 372ms	remaining: 46.1s
8:	learn: 0.5971491	total: 417ms	remaining: 45.9s
9:	learn: 0.5910566	total: 462ms	remaining: 45.7s
10:	learn: 0.5859313	total: 507ms	remaining: 45.6s
11:	learn: 0.5812125	total: 548ms	remaining: 45.2s
12:	learn: 0.5767888	total: 593ms	remaining: 45s
13:	learn: 0.5730605	total: 637ms	remaining: 44.8s
14:	learn: 0.5696245	total: 682ms	remaining: 44.8s
15:	learn: 0.5663755	total: 726ms	remaining: 44.6s
16:	learn: 0.5630521	total: 770ms	remaining: 44.5s
17:	learn: 0.5606035	total: 812ms	remaining: 44.3s
18:	learn: 0.5581669	total: 854ms	remaining: 44.1s
19:	learn: 

<catboost.core.CatBoostClassifier at 0x7d21cf7bd3c0>

In [47]:
test_pred = clf.predict(trx_data_test)
test_proba = clf.predict_proba(trx_data_test)[:, 1]

**Посчитаем метрики:**

In [38]:
from sklearn.metrics import accuracy_score, roc_auc_score

In [48]:
print("CatBoost Accuracy:", accuracy_score(target_test["target_flag"], test_pred))
print("CatBoost ROC-AUC:", roc_auc_score(target_test["target_flag"], test_proba))

CatBoost Accuracy: 0.668
CatBoost ROC-AUC: 0.7547554677761409


- **"Flattened" Sequences + CatBoost (Time Features in days):**
    - `Accuracy: 0.6704 +- 0.0046`
    - `ROC-AUC: 0.7536 +- 0.003`

---

# Supervised-кейс. GRU.

**Препроцессим данные:**

**Скачаем данные:**

In [51]:
path_data = "https://huggingface.co/datasets/dllllb/rosbank-churn/resolve/main/train.csv.gz?download=true"
data = pd.read_csv(path_data, compression="gzip")
data

Unnamed: 0,PERIOD,cl_id,MCC,channel_type,currency,TRDATETIME,amount,trx_category,target_flag,target_sum
0,01/10/2017,0,5200,,810,21OCT17:00:00:00,5023.00,POS,0,0.0
1,01/10/2017,0,6011,,810,12OCT17:12:24:07,20000.00,DEPOSIT,0,0.0
2,01/12/2017,0,5921,,810,05DEC17:00:00:00,767.00,POS,0,0.0
3,01/10/2017,0,5411,,810,21OCT17:00:00:00,2031.00,POS,0,0.0
4,01/10/2017,0,6012,,810,24OCT17:13:14:24,36562.00,C2C_OUT,0,0.0
...,...,...,...,...,...,...,...,...,...,...
490508,01/04/2017,10176,6011,type1,810,24APR17:14:05:26,600.00,WD_ATM_ROS,1,405.0
490509,01/06/2017,10171,5411,type1,810,06JUN17:00:00:00,132.00,POS,0,0.0
490510,01/02/2017,10167,5541,type1,810,03FEB17:00:00:00,1000.00,POS,1,280428.2
490511,01/06/2017,10163,5941,type1,810,08JUN17:00:00:00,100.00,POS,0,0.0


In [52]:
target = data.groupby(by="cl_id").first().reset_index()[["cl_id", "target_flag"]]
target

Unnamed: 0,cl_id,target_flag
0,0,0
1,1,0
2,5,1
3,9,0
4,10,0
...,...,...
4995,10210,1
4996,10212,0
4997,10213,0
4998,10214,0


In [53]:
data.drop(columns=["PERIOD", "target_flag", "target_sum"], inplace=True)

**Делим таргет на трейн и на тест:**

In [54]:
target_train, target_test = train_test_split(target, test_size=0.1, stratify=target["target_flag"], random_state=42)

In [55]:
trx_data_train = pd.merge(data, target_train["cl_id"], on="cl_id", how="inner")
trx_data_test = pd.merge(data, target_test["cl_id"], on="cl_id", how="inner")

**Препроцессинг:**

In [56]:
trx_data_train["channel_type"] = trx_data_train["channel_type"].fillna("none")
trx_data_test["channel_type"] = trx_data_test["channel_type"].fillna("none")

In [57]:
month2num = {"JAN": "/01/", "FEB": "/02/", "MAR": "/03/", "APR": "/04/", "MAY": "/05/", "JUN": "/06/",
             "JUL": "/07/", "AUG": "/08/", "SEP": "/09/", "OCT": "/10/", "NOV": "/11/", "DEC": "/12/"}

trx_data_train["TRDATETIME"] = trx_data_train["TRDATETIME"].map(lambda x: x[0:2] + month2num[x[2:5]] + x[5:7] + " " + x[8:])
trx_data_test["TRDATETIME"] = trx_data_test["TRDATETIME"].map(lambda x: x[0:2] + month2num[x[2:5]] + x[5:7] + " " + x[8:])

trx_data_train["TRDATETIME"] = pd.to_datetime(trx_data_train["TRDATETIME"],format='%d/%m/%y %H:%M:%S')
trx_data_test["TRDATETIME"] = pd.to_datetime(trx_data_test["TRDATETIME"],format='%d/%m/%y %H:%M:%S')

In [58]:
chtype2num = {"none": 0, "type1": 1, "type2": 2, "type3": 3, "type4": 4, "type5": 5}

trx_data_train["channel_type"] = trx_data_train["channel_type"].map(lambda x: chtype2num[x])
trx_data_test["channel_type"] = trx_data_test["channel_type"].map(lambda x: chtype2num[x])

In [59]:
trxcat2num = {"POS": 0, "DEPOSIT": 1, "WD_ATM_ROS": 2, "WD_ATM_PARTNER": 3, 
              "C2C_IN": 4, "WD_ATM_OTHER": 5, "C2C_OUT": 6, "BACK_TRX": 7,
              "CAT": 8, "CASH_ADV": 9}

trx_data_train["trx_category"] = trx_data_train["trx_category"].map(lambda x: trxcat2num[x])
trx_data_test["trx_category"] = trx_data_test["trx_category"].map(lambda x: trxcat2num[x])

In [60]:
preprocessor = PandasDataPreprocessor(
    col_id="cl_id",
    col_event_time="TRDATETIME",
    event_time_transformation="dt_to_timestamp",
    cols_category=["MCC", "channel_type", "currency", "trx_category"],
    cols_numerical=["amount"],
    return_records=False,
)

In [61]:
data_train = preprocessor.fit_transform(trx_data_train)
data_test = preprocessor.transform(trx_data_test)

In [14]:
# data_train["event_time"] //= 86400
# data_test["event_time"] //= 86400

In [14]:
# min_day_train = np.inf
# max_day_train = -np.inf

# for idx, row in data_train.iterrows():
#     min_day_train = min(min_day_train, row["event_time"].min().item())
#     max_day_train = max(max_day_train, row["event_time"].max().item())

# min_day_train, max_day_train

(1475798400, 1522711235)

In [16]:
# data_train["event_time"] -= min_day_train

# data_test["event_time"] -= min_day_train

# for idx in range(len(data_test)):
#     seq_times = torch.clip(data_test.iloc[idx]["event_time"], min=0., max=(max_day_train - min_day_train)).long().unsqueeze(dim=1)
#     data_test.loc[idx, "event_time"] = seq_times

In [62]:
data_train = pd.merge(data_train, target_train, on="cl_id")
data_test = pd.merge(data_test, target_test, on="cl_id")

In [63]:
data_train.rename(columns={"target_flag": "target"}, inplace=True)
data_test.rename(columns={"target_flag": "target"}, inplace=True)

In [64]:
data_train = data_train.to_dict(orient="records")
data_test = data_test.to_dict(orient="records")

In [65]:
data_train = MemoryMapDataset(data_train)
data_test = MemoryMapDataset(data_test)

**Создаём DataLoader:**

In [3]:
def seed_everything(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [146]:
seed_everything(30)

In [147]:
data = PtlsDataModule(
    train_data=SeqToTargetDataset(data_train, target_col_name="target", target_dtype=torch.long),
    valid_data=SeqToTargetDataset(data_test, target_col_name="target", target_dtype=torch.long),
    train_batch_size=128,
    valid_batch_size=128,
    train_num_workers=4
)

**Модель (GRU):**

In [148]:
N_EPOCHS = 30

In [61]:
# timestamps = set()

# for i in data_train:
#     timestamps = timestamps.union(set(i["event_time"].tolist()))

# for i in data_test:
#     timestamps = timestamps.union(set(i["event_time"].tolist()))

# len(timestamps)

543

In [4]:
import torch
from ptls.data_load.padded_batch import PaddedBatch
from ptls.nn.trx_encoder.batch_norm import RBatchNorm, RBatchNormWithLens
from ptls.nn.trx_encoder.noisy_embedding import NoisyEmbedding
from ptls.nn.trx_encoder.trx_encoder_base import TrxEncoderBase
import torch.nn as nn


class Time2Vec(nn.Module):
    def __init__(self, k, interval=86400):
        super(Time2Vec, self).__init__()
        self.k = k
        self.w = nn.Parameter(torch.randn(k))
        self.b = nn.Parameter(torch.randn(k))
        self.w0 = nn.Parameter(torch.randn(1))
        self.b0 = nn.Parameter(torch.randn(1))
        self.interval = interval
        
    def forward(self, event_time, t0):
        t0_ = torch.zeros_like(event_time)
        time_diff=None
        if type(t0)!=int:
            first_column = t0[:, 0].unsqueeze(1)
            t0_ = first_column.expand(-1, t0.size(1))
        time_diff = (event_time - t0_)/self.interval
        v1 = self.w0 * time_diff.unsqueeze(-1) + self.b0
        v2 = torch.cos(self.w * time_diff.unsqueeze(-1) + self.b)
        
        return torch.cat([v1, v2], -1)

        
class TrxEncoderT2V(TrxEncoderBase):
    def __init__(self,
                 embeddings=None,
                 numeric_values=None,
                 custom_embeddings=None,
                 time_values=None,
                 embeddings_noise: float = 0,
                 norm_embeddings=None,
                 use_batch_norm=True,
                 use_batch_norm_with_lens=False,
                 clip_replace_value=None,
                 positions=None,
                 emb_dropout=0,
                 spatial_dropout=False,
                 orthogonal_init=False,
                 linear_projection_size=0,
                 out_of_index: str = 'clip',
                 k=2,
                 time_col='event_time'
                 ):
        if clip_replace_value is not None:
            warnings.warn('`clip_replace_value` attribute is deprecated. Always "clip to max" used. '
                          'Use `out_of_index="assert"` to avoid categorical values clip', DeprecationWarning)

        if positions is not None:
            warnings.warn('`positions` is deprecated. positions is not used', UserWarning)

        if embeddings is None:
            embeddings = {}
        if custom_embeddings is None:
            custom_embeddings = {}
        if time_values is None:
            time_values = {}

        noisy_embeddings = {}
        for emb_name, emb_props in embeddings.items():
            if emb_props.get('disabled', False):
                continue
            if emb_props['in'] == 0 or emb_props['out'] == 0:
                continue
            noisy_embeddings[emb_name] = NoisyEmbedding(
                num_embeddings=emb_props['in'],
                embedding_dim=emb_props['out'],
                padding_idx=0,
                max_norm=1 if norm_embeddings else None,
                noise_scale=embeddings_noise,
                dropout=emb_dropout,
                spatial_dropout=spatial_dropout,
            )

        super().__init__(
            embeddings=noisy_embeddings,
            numeric_values=numeric_values,
            custom_embeddings=custom_embeddings,
            out_of_index=out_of_index,
        )

        custom_embedding_size = self.custom_embedding_size
        if use_batch_norm and custom_embedding_size > 0:
            # :TODO: Should we use Batch norm with not-numerical custom embeddings?
            if use_batch_norm_with_lens:
                self.custom_embedding_batch_norm = RBatchNormWithLens(custom_embedding_size)
            else:
                self.custom_embedding_batch_norm = RBatchNorm(custom_embedding_size)
        else:
            self.custom_embedding_batch_norm = None
        
        self.k = k
        self.time2vec_days = Time2Vec(k=self.k)
        self.time_col = time_col
        
        if linear_projection_size > 0:
            self.linear_projection_head = torch.nn.Linear(super().output_size+k+1, linear_projection_size)
        else:
            self.linear_projection_head = None
            

        if orthogonal_init:
            for n, p in self.named_parameters():
                if n.startswith('embeddings.') and n.endswith('.weight'):
                    torch.nn.init.orthogonal_(p.data[1:])
                if n == 'linear_projection_head.weight':
                    torch.nn.init.orthogonal_(p.data)

    def forward(self, x: PaddedBatch):
        processed_embeddings = []
        processed_custom_embeddings = []

        for field_name in self.embeddings.keys():
            processed_embeddings.append(self.get_category_embeddings(x, field_name))
        
        for field_name in self.custom_embeddings.keys():
            processed_custom_embeddings.append(self.get_custom_embeddings(x, field_name))

        if len(processed_custom_embeddings):
            processed_custom_embeddings = torch.cat(processed_custom_embeddings, dim=2)
            if self.custom_embedding_batch_norm is not None:
                processed_custom_embeddings = PaddedBatch(processed_custom_embeddings, x.seq_lens)
                processed_custom_embeddings = self.custom_embedding_batch_norm(processed_custom_embeddings)
                processed_custom_embeddings = processed_custom_embeddings.payload
            processed_embeddings.append(processed_custom_embeddings)

        out = torch.cat(processed_embeddings, dim=2)

        time_encoded_days = self.time2vec_days(x.payload[self.time_col], x.payload[self.time_col])
        out = torch.cat((out, time_encoded_days), dim=2)

        if self.linear_projection_head is not None:
            out = self.linear_projection_head(out)
        return PaddedBatch(out, x.seq_lens)

    @property
    def output_size(self):
        """Returns hidden size of output representation
        """
        if self.linear_projection_head is not None:
            return self.linear_projection_head.out_features
        return super().output_size + self.k + 1

In [150]:
seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoderT2V(
        embeddings={
            "MCC": {"in": 342, "out": 8},
            "channel_type": {"in": 7, "out": 8},
            "currency": {"in": 60, "out": 8},
            "trx_category": {"in": 11, "out": 8}            
        },
        numeric_values={
            "amount": "log",
        },
        embeddings_noise=0.003,
        k=7,
        time_col="event_time"
    ),
    hidden_size=512,
    is_reduce_sequence=True
)

In [151]:
gru = SequenceToTarget(
    seq_encoder=seq_encoder,
    head=Head(input_size=seq_encoder.embedding_size, objective="classification", num_classes=2, hidden_layers_sizes=[1024]),
    loss=torch.nn.NLLLoss(),
    metric_list=torchmetrics.Accuracy(task="multiclass", num_classes=2),
    optimizer_partial=partial(torch.optim.Adam, lr=1e-4, weight_decay=0),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.CosineAnnealingLR, T_max=N_EPOCHS, eta_min=1e-6)
)

**Обучение:**

In [6]:
comet_ml.login()

In [7]:
from pytorch_lightning.loggers import CometLogger

In [152]:
logger = CometLogger(project_name="evs-ssl-rb", experiment_name="supervised_baseline_GRU")

trainer = pl.Trainer(
    logger=logger,
    max_epochs=N_EPOCHS,
    accelerator="gpu",
    devices=1,
    enable_progress_bar=True
)

In [153]:
trainer.fit(gru, data)

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/askoro/evs-ssl-rb/d9e53caa70824e6090a3fc48b67ebfd1

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/kaggle/working' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]


The number of training batches (36) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.



Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : supervised_baseline_GRU
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/askoro/evs-ssl-rb/d9e53caa70824e6090a3fc48b67ebfd1
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     loss [129]                    : (0.3875987231731415, 0.6915847063064575)
[1;38;5;39mCOMET INFO:[0m     seq_len [21]                  : (85.953125, 137.65000915527344)
[1;38;5;39mCOMET INFO:[0m     val_loss [30]                 : (0.5201566219329834, 0.6169891953468323)
[1;38;5;39mCOMET INFO:[0m     valid/Multicl

In [107]:
torch.save(gru.state_dict(), "supervised_gru_with_2layered_MLP_rosbank.pt")

In [154]:
print(trainer.logged_metrics)

{'loss': tensor(0.3125), 'seq_len': tensor(98.9500), 'y': tensor(0.7500), 'val_loss': tensor(0.5210), 'valid/MulticlassAccuracy': tensor(0.7540)}


**Измерим качество на тесте:**

**Используем энкодер + MLP:**

In [155]:
test_loader = torch.utils.data.DataLoader(
    dataset=data_test,
    collate_fn=collate_feature_dict,
    shuffle=False,
    batch_size=128,
    num_workers=0,
)

In [156]:
model = InferenceModule(
    torch.nn.Sequential(
        gru,
        torch.nn.Softmax(dim=1),
    ),
    model_out_name="prob",
)

model.eval()

InferenceModule(
  (model): Sequential(
    (0): SequenceToTarget(
      (seq_encoder): RnnSeqEncoder(
        (trx_encoder): TrxEncoderT2V(
          (embeddings): ModuleDict(
            (MCC): NoisyEmbedding(
              342, 8, padding_idx=0
              (dropout): Dropout(p=0, inplace=False)
            )
            (channel_type): NoisyEmbedding(
              7, 8, padding_idx=0
              (dropout): Dropout(p=0, inplace=False)
            )
            (currency): NoisyEmbedding(
              60, 8, padding_idx=0
              (dropout): Dropout(p=0, inplace=False)
            )
            (trx_category): NoisyEmbedding(
              11, 8, padding_idx=0
              (dropout): Dropout(p=0, inplace=False)
            )
          )
          (custom_embeddings): ModuleDict(
            (amount): LogScaler()
          )
          (custom_embedding_batch_norm): RBatchNorm(
            (bn): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
 

In [157]:
pred = trainer.predict(model, test_loader)

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/askoro/evs-ssl-rb/d9e53caa70824e6090a3fc48b67ebfd1


The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.



Predicting: |          | 0/? [00:00<?, ?it/s]

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml ExistingExperiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : supervised_baseline_GRU
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/askoro/evs-ssl-rb/d9e53caa70824e6090a3fc48b67ebfd1
[1;38;5;39mCOMET INFO:[0m   Others:
[1;38;5;39mCOMET INFO:[0m     Created from : pytorch-lightning
[1;38;5;39mCOMET INFO:[0m     Name         : supervised_baseline_GRU
[1;38;5;39mCOMET INFO:[0m   Parameters:
[1;38;5;39mCOMET INFO:[0m     test_batch_size   : None
[1;38;5;39mCOMET INFO:[0m     test_drop_last    : False
[1;38;5;39mCOMET INFO:[0m     test_num_workers  : None
[1;38;5;39mC

In [158]:
pred = pd.concat(pred, axis=0)

In [159]:
pred

Unnamed: 0,cl_id,target,prob_0000,prob_0001
0,1,0,0.123336,0.876664
1,38,0,0.623679,0.376321
2,48,1,0.185761,0.814239
3,70,1,0.164920,0.835080
4,79,0,0.263725,0.736275
...,...,...,...,...
111,10150,1,0.103653,0.896347
112,10151,1,0.277649,0.722351
113,10176,1,0.479607,0.520393
114,10185,1,0.558209,0.441791


In [160]:
y_pred = pred[[f"prob_{i:04d}" for i in range(2)]].values.argmax(axis=1)
y_pred

array([1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,

In [161]:
y_true = pred["target"].values
y_true

array([0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0,
       1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0,
       1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0,

In [162]:
y_proba = pred[[f"prob_{i:04d}" for i in range(2)]].values[:, 1]
y_proba

array([0.87666357, 0.37632108, 0.81423855, 0.8350798 , 0.7362752 ,
       0.85706204, 0.91454834, 0.33148304, 0.98075175, 0.92039543,
       0.969634  , 0.63904226, 0.93815386, 0.94132847, 0.5742793 ,
       0.9464432 , 0.93325955, 0.7261069 , 0.88078666, 0.92777526,
       0.7486656 , 0.8549313 , 0.7474287 , 0.5521448 , 0.8017496 ,
       0.8974453 , 0.96346474, 0.97140753, 0.39853555, 0.2850942 ,
       0.90724766, 0.9238023 , 0.94578105, 0.8618969 , 0.8980207 ,
       0.934898  , 0.9506875 , 0.9726218 , 0.920695  , 0.9556519 ,
       0.60906017, 0.95372146, 0.6444998 , 0.95642114, 0.801741  ,
       0.9642968 , 0.92708254, 0.4472002 , 0.9084691 , 0.94992024,
       0.9565234 , 0.92141235, 0.8839638 , 0.98324454, 0.40326855,
       0.38753822, 0.96534234, 0.9667409 , 0.6632826 , 0.7475132 ,
       0.91314983, 0.9459914 , 0.8223837 , 0.89530915, 0.9601716 ,
       0.49209255, 0.7449037 , 0.9603018 , 0.9721539 , 0.92916155,
       0.95537597, 0.34886312, 0.91930467, 0.9487953 , 0.89012

In [8]:
from sklearn.metrics import accuracy_score, roc_auc_score

In [164]:
print("Accuracy:", accuracy_score(y_true, y_pred))
print("ROC-AUC:", roc_auc_score(y_true, y_proba))

Accuracy: 0.754
ROC-AUC: 0.8186527658610027


- GRU + 2layer MLP Head + Time2Vec (k = 7):
  - `Accuracy: 0.746 +- 0.0076`
  - `ROC-AUC: 0.8148 +- 0.0037`

Без Time2Vec получаются крайне нестабильные/некорректные результаты...

# Self-Supervised Case. COLES, CPC, GPT.

**Данные:**

In [30]:
path_data = "https://huggingface.co/datasets/dllllb/rosbank-churn/resolve/main/train.csv.gz?download=true"
data = pd.read_csv(path_data, compression="gzip")
data

Unnamed: 0,PERIOD,cl_id,MCC,channel_type,currency,TRDATETIME,amount,trx_category,target_flag,target_sum
0,01/10/2017,0,5200,,810,21OCT17:00:00:00,5023.00,POS,0,0.0
1,01/10/2017,0,6011,,810,12OCT17:12:24:07,20000.00,DEPOSIT,0,0.0
2,01/12/2017,0,5921,,810,05DEC17:00:00:00,767.00,POS,0,0.0
3,01/10/2017,0,5411,,810,21OCT17:00:00:00,2031.00,POS,0,0.0
4,01/10/2017,0,6012,,810,24OCT17:13:14:24,36562.00,C2C_OUT,0,0.0
...,...,...,...,...,...,...,...,...,...,...
490508,01/04/2017,10176,6011,type1,810,24APR17:14:05:26,600.00,WD_ATM_ROS,1,405.0
490509,01/06/2017,10171,5411,type1,810,06JUN17:00:00:00,132.00,POS,0,0.0
490510,01/02/2017,10167,5541,type1,810,03FEB17:00:00:00,1000.00,POS,1,280428.2
490511,01/06/2017,10163,5941,type1,810,08JUN17:00:00:00,100.00,POS,0,0.0


In [31]:
target = data.groupby(by="cl_id").first().reset_index()[["cl_id", "target_flag"]]
target

Unnamed: 0,cl_id,target_flag
0,0,0
1,1,0
2,5,1
3,9,0
4,10,0
...,...,...
4995,10210,1
4996,10212,0
4997,10213,0
4998,10214,0


In [32]:
data.drop(columns=["PERIOD", "target_flag", "target_sum"], inplace=True)

In [33]:
target_train, target_test = train_test_split(target, test_size=0.1, stratify=target["target_flag"], random_state=42)

In [34]:
trx_data_train = pd.merge(data, target_train["cl_id"], on="cl_id", how="inner")
trx_data_test = pd.merge(data, target_test["cl_id"], on="cl_id", how="inner")

In [35]:
trx_data_train["channel_type"] = trx_data_train["channel_type"].fillna("none")
trx_data_test["channel_type"] = trx_data_test["channel_type"].fillna("none")

In [36]:
month2num = {"JAN": "/01/", "FEB": "/02/", "MAR": "/03/", "APR": "/04/", "MAY": "/05/", "JUN": "/06/",
             "JUL": "/07/", "AUG": "/08/", "SEP": "/09/", "OCT": "/10/", "NOV": "/11/", "DEC": "/12/"}

trx_data_train["TRDATETIME"] = trx_data_train["TRDATETIME"].map(lambda x: x[0:2] + month2num[x[2:5]] + x[5:7] + " " + x[8:])
trx_data_test["TRDATETIME"] = trx_data_test["TRDATETIME"].map(lambda x: x[0:2] + month2num[x[2:5]] + x[5:7] + " " + x[8:])

trx_data_train["TRDATETIME"] = pd.to_datetime(trx_data_train["TRDATETIME"],format='%d/%m/%y %H:%M:%S')
trx_data_test["TRDATETIME"] = pd.to_datetime(trx_data_test["TRDATETIME"],format='%d/%m/%y %H:%M:%S')

In [37]:
chtype2num = {"none": 0, "type1": 1, "type2": 2, "type3": 3, "type4": 4, "type5": 5}

trx_data_train["channel_type"] = trx_data_train["channel_type"].map(lambda x: chtype2num[x])
trx_data_test["channel_type"] = trx_data_test["channel_type"].map(lambda x: chtype2num[x])

In [38]:
trxcat2num = {"POS": 0, "DEPOSIT": 1, "WD_ATM_ROS": 2, "WD_ATM_PARTNER": 3, 
              "C2C_IN": 4, "WD_ATM_OTHER": 5, "C2C_OUT": 6, "BACK_TRX": 7,
              "CAT": 8, "CASH_ADV": 9}

trx_data_train["trx_category"] = trx_data_train["trx_category"].map(lambda x: trxcat2num[x])
trx_data_test["trx_category"] = trx_data_test["trx_category"].map(lambda x: trxcat2num[x])

---

**Квантизация непрерывных признаков (опциональный шаг, нужен только для GPT):**

In [39]:
def digitize(input_array: np.array, q_count: int = 1, bins: np.array = None):
    """Quantile-based discretization function.

    Parameters:
    -------
    input_array (np.array): Input array.
    q_count (int): Amount of quantiles. Used only if input parameter `bins` is None.
    bins (np.array):
        If None, then calculate bins as quantiles of input array,
        otherwise only apply bins to input_array. Default: None

    Returns
    -------
    out_array (np.array of ints): discretized input_array
    bins (np.array of floats):
        Returned only if input parameter `bins` is None.
    """

    if bins is None:
        return_bins = True
        bins = np.quantile(input_array, q=[i / q_count for i in range(1, q_count)], axis=0)
    else:
        return_bins = False

    out_array = np.digitize(input_array, bins)

    if return_bins:
        return out_array, bins
    else:
        return out_array

In [40]:
BINS_NUM = 128

In [41]:
numeric_features = ["amount"]

for feat in numeric_features:
    trx_data_train[feat], bins = digitize(trx_data_train[feat], q_count=BINS_NUM)
    trx_data_test[feat] = digitize(trx_data_test[feat], bins=bins)

In [42]:
import gc

gc.collect()

96

---

In [43]:
preprocessor = PandasDataPreprocessor(
    col_id="cl_id",
    col_event_time="TRDATETIME",
    event_time_transformation="dt_to_timestamp",
    cols_category=["MCC", "channel_type", "currency", "trx_category"],
    cols_numerical=["amount"],
    return_records=False,
)

In [44]:
data_train = preprocessor.fit_transform(trx_data_train)
data_test = preprocessor.transform(trx_data_test)

In [45]:
target_train.rename(columns={"target_flag": "target"}, inplace=True)
target_test.rename(columns={"target_flag": "target"}, inplace=True)
target_train.sort_values(by="cl_id", inplace=True)
target_test.sort_values(by="cl_id", inplace=True)
target_train = target_train["target"]
target_test = target_test["target"]
target_train.reset_index(drop=True, inplace=True)
target_test.reset_index(drop=True, inplace=True)

In [46]:
data_train = data_train.to_dict(orient="records")
data_test = data_test.to_dict(orient="records")

---

- **COLES:**

In [237]:
seed_everything(30)

**DataLoaders:**

In [238]:
data = PtlsDataModule(
    train_data=ColesDataset(
        MemoryMapDataset(
            data=data_train,
            i_filters=[SeqLenFilter(min_seq_len=10)],
        ),
        splitter=SampleSlices(
            split_count=5,
            cnt_min=15,
            cnt_max=150,
        ),
    ),
    train_num_workers=4,
    train_batch_size=128,
    valid_data=ColesDataset(
        MemoryMapDataset(
            data=data_test,
            i_filters=[SeqLenFilter(min_seq_len=10)],
        ),
        splitter=SampleSlices(
            split_count=5,
            cnt_min=15,
            cnt_max=150,
        ),
    ),
    valid_num_workers=4,
    valid_batch_size=128
)

**Модель:**

In [239]:
N_EPOCHS = 20

In [240]:
trx_encoder_params = dict(
    embeddings={
        "MCC": {"in": 342, "out": 8},
        "channel_type": {"in": 7, "out": 8},
        "currency": {"in": 60, "out": 8},
        "trx_category": {"in": 11, "out": 8}            
    },
    numeric_values={"amount": "log"},
    embeddings_noise=0.003,
    k=7,
    time_col="event_time"
)

seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoderT2V(**trx_encoder_params),
    hidden_size=512,
    type="gru"
)

coles = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=1e-3, weight_decay=0),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.CosineAnnealingLR, T_max=N_EPOCHS, eta_min=5e-6)
)

**Обучение:**

In [241]:
logger = CometLogger(project_name="evs-ssl-rb", experiment_name="CoLES_Baseline")

trainer = pl.Trainer(
    logger=logger,
    max_epochs=N_EPOCHS,
    accelerator="gpu",
    devices=1,
    enable_progress_bar=True
)

In [242]:
trainer.fit(coles, data)

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/askoro/evs-ssl-rb/a85b89a61fbb403eb98028ce79fcf512

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/kaggle/working' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]


The number of training batches (33) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.



Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : CoLES_Baseline
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/askoro/evs-ssl-rb/a85b89a61fbb403eb98028ce79fcf512
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     loss [79]               : (51.9121208190918, 641.5636596679688)
[1;38;5;39mCOMET INFO:[0m     seq_len [13]            : (48.70624923706055, 56.78750228881836)
[1;38;5;39mCOMET INFO:[0m     valid/recall_top_k [20] : (0.4932411313056946, 0.8614515066146851)
[1;38;5;39mCOMET INFO:[0m   Others:
[1;38;5;39mCOMET INFO:[0m   

In [243]:
trainer.logged_metrics

{'loss': tensor(54.4320),
 'seq_len': tensor(55.1967),
 'valid/recall_top_k': tensor(0.8472)}

In [28]:
torch.save(seq_encoder.state_dict(), "coles_enc_baseline_rosbank.pt")

**Измерим качество на тесте (catboost поверх эмбеддингов):**

In [None]:
# !wget "https://drive.google.com/uc?export=download&id=1Mn8o9IPT4Zzg3946orbw1MVZwpkrBoNb" -O "coles_enc_baseline.pt"

In [244]:
encoder = coles.seq_encoder

# state_dict = torch.load("./coles_enc_baseline.pt")
# encoder.load_state_dict(state_dict)

device = "cuda:0"

encoder.to(device)

RnnSeqEncoder(
  (trx_encoder): TrxEncoderT2V(
    (embeddings): ModuleDict(
      (MCC): NoisyEmbedding(
        342, 8, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (channel_type): NoisyEmbedding(
        7, 8, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (currency): NoisyEmbedding(
        60, 8, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (trx_category): NoisyEmbedding(
        11, 8, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
    )
    (custom_embeddings): ModuleDict(
      (amount): LogScaler()
    )
    (custom_embedding_batch_norm): RBatchNorm(
      (bn): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (time2vec_days): Time2Vec()
  )
  (seq_encoder): RnnEncoder(
    (rnn): GRU(41, 512, batch_first=True)
    (reducer): LastStepEncoder()
  )
)

In [245]:
from tqdm import tqdm

seed_everything(30)

In [246]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)
encoder.eval()
train_embeds = None

for i, batch in tqdm(enumerate(train_loader)):
    train_embeds_batch = encoder(batch.to(device))
    if i == 0:
        train_embeds = train_embeds_batch.detach().cpu().numpy()
    else:
        train_embeds = np.concatenate([train_embeds, train_embeds_batch.detach().cpu().numpy()], axis=0)
    
train_embeds

36it [00:01, 22.67it/s]


array([[-0.5872396 , -0.39084154,  0.11891389, ...,  0.6059513 ,
         0.12908368,  0.0357998 ],
       [-0.9508273 , -0.16250183,  0.7539672 , ...,  0.9203619 ,
         0.42006952, -0.107231  ],
       [-0.8788359 , -0.23457053,  0.42926893, ...,  0.8083197 ,
         0.21755676,  0.39896894],
       ...,
       [-0.92515856, -0.38494828,  0.4463349 , ...,  0.83184195,
         0.14884058, -0.36724275],
       [-0.9424827 , -0.03360297, -0.5485953 , ...,  0.85067356,
         0.22364013, -0.75807095],
       [-0.7171386 , -0.29056314, -0.2742846 , ...,  0.546718  ,
         0.09493509,  0.21211511]], dtype=float32)

In [247]:
test_loader = inference_data_loader(data_test, num_workers=0, batch_size=128)
encoder.eval()
test_embeds = None

for i, batch in tqdm(enumerate(test_loader)):
    test_embeds_batch = encoder(batch.to(device))
    if i == 0:
        test_embeds = test_embeds_batch.detach().cpu().numpy()
    else:
        test_embeds = np.concatenate([test_embeds, test_embeds_batch.detach().cpu().numpy()], axis=0)
    
test_embeds

4it [00:00, 21.91it/s]


array([[-0.939315  , -0.01200165,  0.67269194, ...,  0.9069034 ,
         0.3379755 , -0.31258112],
       [-0.7678652 , -0.4311096 ,  0.04336767, ...,  0.6401845 ,
         0.1619981 ,  0.07382047],
       [-0.9379685 , -0.3510451 ,  0.8869753 , ...,  0.8984202 ,
         0.43017837, -0.23246177],
       ...,
       [-0.7961454 , -0.02146922, -0.77230954, ...,  0.6120291 ,
         0.04130362, -0.08948928],
       [-0.9207898 , -0.1879494 , -0.5942831 , ...,  0.71228915,
         0.10027774, -0.03141725],
       [-0.9567799 , -0.5340204 ,  0.02336365, ...,  0.9226599 ,
         0.30793422,  0.21418901]], dtype=float32)

In [248]:
clf = CatBoostClassifier(loss_function='MultiClass', task_type="GPU", devices='0', random_state=30)

clf.fit(train_embeds, target_train, plot_file="catboost_log.html")

Learning rate set to 0.088214
0:	learn: 0.6653722	total: 10.9ms	remaining: 10.9s
1:	learn: 0.6412632	total: 17.7ms	remaining: 8.81s
2:	learn: 0.6214946	total: 24.3ms	remaining: 8.08s
3:	learn: 0.6030732	total: 30.9ms	remaining: 7.69s
4:	learn: 0.5872129	total: 37.3ms	remaining: 7.42s
5:	learn: 0.5731445	total: 43.8ms	remaining: 7.25s
6:	learn: 0.5606026	total: 50.2ms	remaining: 7.12s
7:	learn: 0.5495630	total: 56.8ms	remaining: 7.04s
8:	learn: 0.5398590	total: 64.3ms	remaining: 7.08s
9:	learn: 0.5309856	total: 71.7ms	remaining: 7.1s
10:	learn: 0.5233783	total: 79.8ms	remaining: 7.17s
11:	learn: 0.5161197	total: 86.4ms	remaining: 7.11s
12:	learn: 0.5098213	total: 93.6ms	remaining: 7.11s
13:	learn: 0.5044825	total: 100ms	remaining: 7.04s
14:	learn: 0.4994322	total: 107ms	remaining: 7.02s
15:	learn: 0.4944124	total: 113ms	remaining: 6.98s
16:	learn: 0.4900607	total: 120ms	remaining: 6.93s
17:	learn: 0.4855380	total: 127ms	remaining: 6.93s
18:	learn: 0.4820387	total: 134ms	remaining: 6.92s

<catboost.core.CatBoostClassifier at 0x7d21d4b87eb0>

In [249]:
test_pred = clf.predict(test_embeds)
test_proba = clf.predict_proba(test_embeds)[:, 1]

In [250]:
print("Accuracy:", accuracy_score(target_test, test_pred))
print("ROC-AUC:", roc_auc_score(target_test, test_proba))

Accuracy: 0.752
ROC-AUC: 0.8115296822133364


- COLES embeds + Catboost:
  - `Accuracy: 0.7328 +- 0.0194`
  -  `ROC-AUC: 0.8057 +- 0.0088`

---

- **CPC modeling:**

In [240]:
seed_everything(0)

**DataLoaders:**

In [241]:
data = PtlsDataModule(
    train_data=CpcDataset(
        MemoryMapDataset(data=data_train),
        min_len=85,             #min_len=10 / 85; 1000
        max_len=105             #max_len=105; 1200
    ),
    train_num_workers=4,
    train_batch_size=128,
    valid_data=CpcDataset(
        MemoryMapDataset(data=data_test),
        min_len=85,
        max_len=105
    ),
    valid_num_workers=4,
    valid_batch_size=128
)

**Модель:**

In [242]:
N_EPOCHS = 20

In [243]:
trx_encoder_params = dict(
    embeddings={
        "MCC": {"in": 342, "out": 32}, # 8 / 16
        "channel_type": {"in": 7, "out": 32},
        "currency": {"in": 60, "out": 32},
        "trx_category": {"in": 11, "out": 32}            
    },
    numeric_values={"amount": "log"},
    embeddings_noise=0.003,
    k=31,
    time_col="event_time"
)

seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoderT2V(**trx_encoder_params),
    hidden_size=512,
    type="gru"
)

cpc = CpcModule(
    seq_encoder=seq_encoder,
    n_forward_steps=6,
    n_negatives=40,
    optimizer_partial=partial(torch.optim.Adam, lr=5e-4),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=5, gamma=0.5)
)

**Обучение:**

In [244]:
logger = CometLogger(project_name="evs-ssl-rb", experiment_name="CPC_modeling_baseline")

trainer = pl.Trainer(
    logger=logger,
    max_epochs=N_EPOCHS,
    accelerator="gpu",
    devices=1,
    enable_progress_bar=True
)

In [245]:
trainer.fit(cpc, data)

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/askoro/evs-ssl-rb/d5e0cd29d807486b97f69cad137f4b53

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/kaggle/working' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]


The number of training batches (36) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.



Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : CPC_modeling_baseline
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/askoro/evs-ssl-rb/d5e0cd29d807486b97f69cad137f4b53
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     loss [86]               : (1.7255330085754395, 25.472782135009766)
[1;38;5;39mCOMET INFO:[0m     seq_len [14]            : (58.375, 70.4921875)
[1;38;5;39mCOMET INFO:[0m     valid/cpc_accuracy [20] : (0.24430523812770844, 0.5497528314590454)
[1;38;5;39mCOMET INFO:[0m   Others:
[1;38;5;39mCOMET INFO:[0m     Name 

In [246]:
trainer.logged_metrics

{'loss': tensor(1.5236),
 'seq_len': tensor(80.8000),
 'valid/cpc_accuracy': tensor(0.5498)}

In [82]:
torch.save(seq_encoder.state_dict(), "cpc_enc_baseline_rosbank.pt")

**Измерим качество на тесте (catboost поверх эмбеддингов):**

In [None]:
# !wget "https://drive.google.com/uc?export=download&id=11j6QgNsdOSTK-GRaAJLKObDW7ehS_aqK" -O "cpc_enc_baseline_higher_trx_dim.pt"

In [247]:
encoder = cpc.seq_encoder

# state_dict = torch.load("./cpc_enc_baseline_higher_trx_dim.pt")
# encoder.load_state_dict(state_dict)

device = "cuda:0"

encoder.to(device)

RnnSeqEncoder(
  (trx_encoder): TrxEncoderT2V(
    (embeddings): ModuleDict(
      (MCC): NoisyEmbedding(
        342, 32, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (channel_type): NoisyEmbedding(
        7, 32, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (currency): NoisyEmbedding(
        60, 32, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (trx_category): NoisyEmbedding(
        11, 32, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
    )
    (custom_embeddings): ModuleDict(
      (amount): LogScaler()
    )
    (custom_embedding_batch_norm): RBatchNorm(
      (bn): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (time2vec_days): Time2Vec()
  )
  (seq_encoder): RnnEncoder(
    (rnn): GRU(161, 512, batch_first=True)
    (reducer): LastStepEncoder()
  )
)

In [248]:
encoder.seq_encoder.is_reduce_sequence = True

In [249]:
from tqdm import tqdm

seed_everything(0)

In [250]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)
encoder.eval()
train_embeds = None

for i, batch in tqdm(enumerate(train_loader)):
    train_embeds_batch = encoder(batch.to(device))
    if i == 0:
        train_embeds = train_embeds_batch.detach().cpu().numpy()
    else:
        train_embeds = np.concatenate([train_embeds, train_embeds_batch.detach().cpu().numpy()], axis=0)
    
train_embeds

36it [00:01, 21.47it/s]


array([[-0.19209889,  0.15701051,  0.21384467, ..., -0.1645761 ,
         0.13416895,  0.13677727],
       [-0.62357974, -0.25639307,  0.7148075 , ..., -0.80968666,
         0.5653447 , -0.71061534],
       [-0.6059645 , -0.539159  ,  0.6836383 , ..., -0.7998759 ,
         0.6146016 , -0.7275492 ],
       ...,
       [-0.82181925, -0.49199453,  0.62970805, ..., -0.63831025,
         0.50188076, -0.64945555],
       [-0.83849734, -0.4220637 ,  0.47005558, ..., -0.8026331 ,
         0.46405196, -0.8612809 ],
       [-0.512371  , -0.13991897, -0.4856592 , ...,  0.12618224,
         0.05067025,  0.2307141 ]], dtype=float32)

In [251]:
test_loader = inference_data_loader(data_test, num_workers=0, batch_size=128)
encoder.eval()
test_embeds = None

for i, batch in tqdm(enumerate(test_loader)):
    test_embeds_batch = encoder(batch.to(device))
    if i == 0:
        test_embeds = test_embeds_batch.detach().cpu().numpy()
    else:
        test_embeds = np.concatenate([test_embeds, test_embeds_batch.detach().cpu().numpy()], axis=0)
    
test_embeds

4it [00:00, 20.96it/s]


array([[-0.6161818 , -0.2736779 ,  0.6748481 , ..., -0.8022516 ,
         0.6119286 , -0.67115366],
       [-0.643846  , -0.3731083 ,  0.5580652 , ..., -0.46282333,
         0.40052614, -0.5215277 ],
       [-0.6264979 , -0.4865318 ,  0.625218  , ..., -0.89806944,
         0.637347  , -0.7476031 ],
       ...,
       [-0.8480329 , -0.6125593 ,  0.6763762 , ..., -0.65062916,
         0.17983465, -0.5996037 ],
       [-0.7331846 , -0.15567298,  0.7646365 , ..., -0.6553723 ,
         0.57282156, -0.6534018 ],
       [-0.8339077 , -0.32614043,  0.72814524, ..., -0.7769479 ,
         0.35392228, -0.81995493]], dtype=float32)

In [252]:
clf = CatBoostClassifier(loss_function='MultiClass', task_type="GPU", devices='0', random_state=0)

clf.fit(train_embeds, target_train, plot_file="catboost_log.html")

Learning rate set to 0.088214
0:	learn: 0.6669049	total: 11.1ms	remaining: 11.1s
1:	learn: 0.6433847	total: 18ms	remaining: 8.99s
2:	learn: 0.6223505	total: 25.1ms	remaining: 8.33s
3:	learn: 0.6046860	total: 31.7ms	remaining: 7.91s
4:	learn: 0.5891472	total: 38.4ms	remaining: 7.65s
5:	learn: 0.5748068	total: 45.6ms	remaining: 7.55s
6:	learn: 0.5628350	total: 52.3ms	remaining: 7.42s
7:	learn: 0.5521847	total: 59ms	remaining: 7.31s
8:	learn: 0.5428893	total: 65.7ms	remaining: 7.23s
9:	learn: 0.5342197	total: 72.5ms	remaining: 7.17s
10:	learn: 0.5266791	total: 78.8ms	remaining: 7.08s
11:	learn: 0.5197991	total: 85.1ms	remaining: 7s
12:	learn: 0.5140753	total: 91.2ms	remaining: 6.93s
13:	learn: 0.5080617	total: 97.1ms	remaining: 6.84s
14:	learn: 0.5035418	total: 103ms	remaining: 6.78s
15:	learn: 0.4987949	total: 110ms	remaining: 6.73s
16:	learn: 0.4945686	total: 116ms	remaining: 6.69s
17:	learn: 0.4903260	total: 122ms	remaining: 6.66s
18:	learn: 0.4868383	total: 129ms	remaining: 6.64s
19:	

<catboost.core.CatBoostClassifier at 0x7b6a5f4af340>

In [253]:
test_pred = clf.predict(test_embeds)
test_proba = clf.predict_proba(test_embeds)[:, 1]

In [254]:
print("Accuracy:", accuracy_score(target_test, test_pred))
print("ROC-AUC:", roc_auc_score(target_test, test_proba))

Accuracy: 0.75
ROC-AUC: 0.8040342555568147


- CPC context embeds + Catboost (low dim of trx embeds: each embed is of dim 8):
  - `Accuracy: 0.7336 +- 0.0119`,
  - `ROC-AUC: 0.8078 +- 0.004`
  
\

- CPC context embeds + Catboost (higher dim of trx embeds: each embed is of dim 16):
  - `Accuracy: 0.7464 +- 0.0099`
  - `ROC-AUC: 0.805 +- 0.00598`

\

- CPC context embeds + Catboost (even higher dim of trx embeds: each embed is of dim 32):
  - `Accuracy: 0.7372 +- 0.0144`
  - `ROC-AUC: 0.8099 +- 0.0069`

- CPC context embeds w/ Aug + Catboost (dim of trx embeds: 8):
  - `Accuracy: 0.7432 +- 0.0078`
  - `ROC-AUC: 0.8134 +- 0.0049`

\

- CPC context embeds w/ Aug + Catboost (dim of trx embeds: 16):
  - `Accuracy: 0.7376 +- 0.0145`
  - `ROC-AUC: 0.8055 +- 0.0072`

\

- CPC context embeds w/ Aug + Catboost (dim of trx embeds: 32):
  - `Accuracy: 0.748 +- 0.0033`
  - `ROC-AUC: 0.81 +- 0.0048`

**При обучении с аугментациями получаем лучшее качество => будем их использовать. Лучшие результаты - у конфигураций CPC context embeds w/ Aug + Catboost (dim of trx embeds: 8) и CPC context embeds w/ Aug + Catboost (dim of trx embeds: 32), полученные результаты в целом сравнимы, но у второй accuracy значительно выше, чем у другой => берём её.**

---

- **GPT:**

In [668]:
seed_everything(30)

**DataLoaders:**

In [669]:
data = PtlsDataModule(
    train_data=GptDataset(
        MemoryMapDataset(data=data_train),
        min_len=1000, # 85
        max_len=1200 # 105
    ),
    train_num_workers=4,
    train_batch_size=64,
    valid_data=GptDataset(
        MemoryMapDataset(data=data_test),
        min_len=85,
        max_len=105
    ),
    valid_num_workers=4,
    valid_batch_size=64
)

**Модель:**

In [670]:
from torchmetrics import MeanMetric
from typing import Tuple, Dict, List, Union
from torch import nn
from ptls.nn.seq_encoder.abs_seq_encoder import AbsSeqEncoder
from ptls.nn import PBL2Norm
from ptls.data_load.padded_batch import PaddedBatch


class MeanPooling(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, pb: PaddedBatch):
        payload = pb.payload # (B, T, H)
        mask = pb.seq_len_mask.bool()
        pb_mean = payload.sum(dim=1) / mask.float().sum(dim=1, keepdim=True)
        return pb_mean


class StatPooling(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, pb: PaddedBatch):
        payload = pb.payload # (B, T, H)
        mask = pb.seq_len_mask.bool()
        inf_mask = torch.zeros_like(mask, device=mask.device).float()
        inf_mask[~mask] = -torch.inf
        
        pb_mean = payload.sum(dim=1) / mask.float().sum(dim=1, keepdim=True)
        pb_max = torch.max(payload + inf_mask.unsqueeze(-1), dim=1)[0]
        pb_stat = torch.cat((pb_mean, pb_max), dim=1)
        return pb_stat


class GPTHead(torch.nn.Module):   
    def __init__(self, input_size, n_classes, hidden_size=64, drop_p=0.1):
        super().__init__()
        self.head = nn.Sequential(
            nn.Linear(input_size, hidden_size, bias=True),
            nn.GELU(),
            nn.Dropout(drop_p),
            nn.Linear(hidden_size, n_classes)
        )
    def forward(self, x):
        x = self.head(x)
        return x


class GptPretrainModule(pl.LightningModule):
    """GPT2 Language model

    Sequence transactions are encoded by `trx_encoder`.
    Then `seq_encoder` encodes the given sequence 
    (we actually use NN to modify sequence transactions representations,
    then (during inference) we calculate the mean of these encoded transactions to get the representation of the whole sequence).
    After this we use heads to predict the classes of features of the future transaction.

    Parameters
    ----------
    trx_encoder:
        Module for transform dict with feature sequences to sequence of transaction representations
    seq_encoder:
        Module for sequence processing. Generally this is transformer based encoder. Rnn is also possible
        Should work without sequence reduction
    head_hidden_size:
        Hidden size of heads for feature prediction
    seed_seq_len:
         Size of starting sequence without loss 
    total_steps:
        total_steps expected in OneCycle lr scheduler
    max_lr:
        max_lr of OneCycle lr scheduler
    weight_decay:
        weight_decay of Adam optimizer
    pct_start:
        % of total_steps when lr increase
    norm_predict:
        use l2 norm for transformer output or not
    """

    def __init__(self,
                 trx_encoder: torch.nn.Module,
                 seq_encoder: AbsSeqEncoder,
                 head_hidden_size: int = 64,
                 total_steps: int = 64000,
                 seed_seq_len: int = 16,
                 max_lr: float = 0.00005,
                 weight_decay: float = 0.0,
                 pct_start: float = 0.1,
                 norm_predict: bool = False
                 ):

        super().__init__()
        self.save_hyperparameters(ignore=['trx_encoder', 'seq_encoder'])

        self.trx_encoder = trx_encoder
        self._seq_encoder = seq_encoder
        self._seq_encoder.is_reduce_sequence = False

        self.head = nn.ModuleDict()
        for col_name, noisy_emb in self.trx_encoder.embeddings.items():
            self.head[col_name] = GPTHead(input_size=self._seq_encoder.embedding_size, hidden_size=head_hidden_size, n_classes=noisy_emb.num_embeddings)

        if self.hparams.norm_predict:
            self.fn_norm_predict = PBL2Norm()

        self.loss = nn.CrossEntropyLoss(ignore_index=0)

        self.train_gpt_loss = MeanMetric()
        self.valid_gpt_loss = MeanMetric()

    def forward(self, batch: PaddedBatch):
        z_trx = self.trx_encoder(batch) 
        out = self._seq_encoder(z_trx)
        if self.hparams.norm_predict:
            out = self.fn_norm_predict(out)
        return out

    def loss_gpt(self, logits, labels):
        loss = 0
        for col_name, head in self.head.items():
            y_pred = head(logits[:, self.hparams.seed_seq_len:-1, :])
            y_pred = y_pred.view(-1, y_pred.size(-1))

            y_true = labels[col_name][:, self.hparams.seed_seq_len+1:]
            y_true = torch.flatten(y_true.long())
            
            loss += self.loss(y_pred, y_true)
            
        return loss

    def training_step(self, batch, batch_idx):
        out = self.forward(batch)  # PB: B, T, H
        out = out.payload if isinstance(out, PaddedBatch) else out
        labels = batch.payload
        
        loss_gpt = self.loss_gpt(out, labels)
        self.train_gpt_loss(loss_gpt)
        self.log(f'loss', loss_gpt, sync_dist=True)
        return loss_gpt

    def validation_step(self, batch, batch_idx):
        out = self.forward(batch)  # PB: B, T, H
        out = out.payload if isinstance(out, PaddedBatch) else out
        labels = batch.payload
        
        loss_gpt = self.loss_gpt(out, labels)
        self.valid_gpt_loss(loss_gpt)

    def on_training_epoch_end(self):
        self.log('train loss (by epochs)', self.train_gpt_loss, prog_bar=True, logger=True, sync_dist=True, rank_zero_only=True)

    def on_validation_epoch_end(self):
        self.log('val loss (by epochs)', self.valid_gpt_loss, prog_bar=True, logger=True, sync_dist=True, rank_zero_only=True)

    def configure_optimizers(self):
        optim = torch.optim.NAdam(self.parameters(),
                                  lr=self.hparams.max_lr,
                                  weight_decay=self.hparams.weight_decay
                                 )
        
        scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer=optim,
            max_lr=self.hparams.max_lr,
            total_steps=self.hparams.total_steps,
            pct_start=self.hparams.pct_start,
            anneal_strategy='cos',
            cycle_momentum=False,
            div_factor=25.0,
            final_div_factor=10000.0,
            three_phase=False
        )
        
        scheduler = {'scheduler': scheduler, 'interval': 'step'}
        return [optim], [scheduler]
    
    @property
    def seq_encoder(self):
        return GPTInferenceModule(pretrained_model=self)


class GPTInferenceModule(torch.nn.Module):
    def __init__(self, pretrained_model):
        super().__init__()
        self.model = pretrained_model
        self.model.is_reduce_sequence = False
        self.mean_pooling = MeanPooling()
        self.stat_pooling = StatPooling()

    def forward(self, batch, eval_strategy="mean"):
        z_trx = self.model.trx_encoder(batch)
        out = self.model._seq_encoder(z_trx)
        out = out if isinstance(out, PaddedBatch) else PaddedBatch(out, batch.seq_lens)

        if eval_strategy == "mean":
            out = self.mean_pooling(out)
        elif eval_strategy == "stat":
            out = self.stat_pooling(out)

        if self.model.hparams.norm_predict:
            out = out / (out.pow(2).sum(dim=-1, keepdim=True) + 1e-9).pow(0.5)
        return out

In [671]:
N_EPOCHS = 20

In [672]:
trx_encoder = TrxEncoderT2V(
    embeddings={
        "MCC": {"in": 342, "out": 16},
        "channel_type": {"in": 7, "out": 16},
        "currency": {"in": 60, "out": 16},
        "trx_category": {"in": 11, "out": 16},
        "amount": {"in": BINS_NUM, "out": 16}
    },
    embeddings_noise=0.003,
    k=15,
    time_col="event_time"
)


seq_encoder = GptEncoder(
    n_embd=trx_encoder.output_size,
    n_layer=6,
    n_head=6,
    n_inner=512,
    activation_function="gelu_new",
    resid_pdrop=0.1,
    embd_pdrop=0.1,
    attn_pdrop=0.1,
    n_positions=2048,
    use_positional_encoding=True, # try switching it off
    use_start_random_shift=True,
    is_reduce_sequence=False
)

gpt = GptPretrainModule(
    trx_encoder=trx_encoder,
    seq_encoder=seq_encoder,
    head_hidden_size=512,
    total_steps=(N_EPOCHS * 282), # num_epochs * num_steps_per_epoch
    seed_seq_len=16,
    max_lr=3e-3,
    weight_decay=3e-4, # try adding weight_decay > 0
    pct_start=0.1,
    norm_predict=False # never use it again
)

**Обучение:**

In [30]:
# !export HYDRA_FULL_ERROR=1

In [673]:
logger = CometLogger(project_name="evs-ssl-rb", experiment_name="GPT_modeling_baseline")

trainer = pl.Trainer(
    logger=logger,
    max_epochs=N_EPOCHS,
    accelerator="gpu",
    devices=1,
    enable_progress_bar=True
)

In [674]:
trainer.fit(gpt, data)

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/kaggle/working' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.
[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/askoro/evs-ssl-rb/535a22624e1343b8a256ee6dad3ca114



Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : GPT_modeling_baseline
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/askoro/evs-ssl-rb/535a22624e1343b8a256ee6dad3ca114
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     loss [170]                : (8.229889869689941, 19.123592376708984)
[1;38;5;39mCOMET INFO:[0m     val loss (by epochs) [20] : (8.519289016723633, 9.842005729675293)
[1;38;5;39mCOMET INFO:[0m   Others:
[1;38;5;39mCOMET INFO:[0m     Name : GPT_modeling_baseline
[1;38;5;39mCOMET INFO:[0m   Parameters:
[1;38;5;39mC

In [675]:
trainer.logged_metrics

{'loss': tensor(8.2774), 'val loss (by epochs)': tensor(8.6232)}

In [676]:
encoder = gpt.seq_encoder

In [135]:
torch.save(encoder.state_dict(), "gpt_baseline_rosbank.pt")

**Измерим качество на тесте (catboost поверх эмбеддингов):**

In [86]:
# !pip install gdown

Collecting gdown
  Downloading gdown-5.2.0-py3-none-any.whl.metadata (5.8 kB)
Downloading gdown-5.2.0-py3-none-any.whl (18 kB)
[0mInstalling collected packages: gdown
Successfully installed gdown-5.2.0


In [102]:
# import gdown

# gdown.download("https://drive.google.com/uc?export=download&id=1YBstN7hpEIREo7zORmPoEZ_0NyBgfjm6", "gpt_baseline_NAdam.pt")

Downloading...
From (original): https://drive.google.com/uc?export=download&id=1YBstN7hpEIREo7zORmPoEZ_0NyBgfjm6
From (redirected): https://drive.google.com/uc?export=download&id=1YBstN7hpEIREo7zORmPoEZ_0NyBgfjm6&confirm=t&uuid=b0f44bc3-b84b-425c-968f-016e419987af
To: /kaggle/working/gpt_baseline_NAdam.pt
100%|██████████| 34.7M/34.7M [00:00<00:00, 83.5MB/s]


'gpt_baseline_NAdam.pt'

In [677]:
# state_dict = torch.load("./gpt_baseline_NAdam.pt")
# encoder.load_state_dict(state_dict)

device = "cuda:0"

encoder.to(device)

GPTInferenceModule(
  (model): GptPretrainModule(
    (trx_encoder): TrxEncoderT2V(
      (embeddings): ModuleDict(
        (MCC): NoisyEmbedding(
          342, 16, padding_idx=0
          (dropout): Dropout(p=0, inplace=False)
        )
        (channel_type): NoisyEmbedding(
          7, 16, padding_idx=0
          (dropout): Dropout(p=0, inplace=False)
        )
        (currency): NoisyEmbedding(
          60, 16, padding_idx=0
          (dropout): Dropout(p=0, inplace=False)
        )
        (trx_category): NoisyEmbedding(
          11, 16, padding_idx=0
          (dropout): Dropout(p=0, inplace=False)
        )
        (amount): NoisyEmbedding(
          128, 16, padding_idx=0
          (dropout): Dropout(p=0, inplace=False)
        )
      )
      (custom_embeddings): ModuleDict()
      (time2vec_days): Time2Vec()
    )
    (_seq_encoder): GptEncoder(
      (transf): GPT2Model(
        (wte): Embedding(4, 96)
        (wpe): Embedding(2048, 96)
        (drop): Dropout(p=0.1, in

In [678]:
from tqdm import tqdm

seed_everything(30)

In [679]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=8)
encoder.eval()
train_embeds = None

for i, batch in tqdm(enumerate(train_loader)):
    train_embeds_batch = encoder(batch.to(device), eval_strategy="stat")
    if i == 0:
        train_embeds = train_embeds_batch.detach().cpu().numpy()
    else:
        train_embeds = np.concatenate([train_embeds, train_embeds_batch.detach().cpu().numpy()], axis=0)
    
train_embeds

563it [00:04, 127.07it/s]


array([[ -5.273528  ,  -7.8131323 , -11.62134   , ...,  -0.1666674 ,
          0.56957763,   0.8396692 ],
       [  0.16802467,  -0.45239204,  -0.1157253 , ...,   0.38325766,
          0.8526115 ,   1.1263626 ],
       [  0.33640438,  -1.1159902 ,  -1.2578104 , ...,  -0.05770402,
          0.7695208 ,   1.1871806 ],
       ...,
       [ -0.20027328,  -0.13722456,   0.21473451, ...,   0.3816202 ,
          0.8880486 ,   1.0197345 ],
       [ -0.5653915 ,   0.17153484,   0.05484006, ...,   0.59078014,
          0.7575279 ,   0.9714437 ],
       [ -0.8637452 ,   0.04938944,  -0.97433704, ...,   0.59010166,
          1.0388405 ,   0.6743946 ]], dtype=float32)

In [680]:
test_loader = inference_data_loader(data_test, num_workers=0, batch_size=8)
encoder.eval()
test_embeds = None

for i, batch in tqdm(enumerate(test_loader)):
    test_embeds_batch = encoder(batch.to(device), eval_strategy="stat")
    if i == 0:
        test_embeds = test_embeds_batch.detach().cpu().numpy()
    else:
        test_embeds = np.concatenate([test_embeds, test_embeds_batch.detach().cpu().numpy()], axis=0)
    
test_embeds

63it [00:00, 131.09it/s]


array([[ 6.07364357e-01,  8.43049347e-01, -1.25151563e+00, ...,
         6.29660606e-01,  8.92379642e-01,  1.12806275e-01],
       [-4.84384924e-01, -9.97332871e-01, -1.38819563e+00, ...,
         1.36822909e-01,  7.92777002e-01,  7.56889939e-01],
       [-4.16990789e-03, -2.42832098e-02, -1.35354072e-01, ...,
         4.03480351e-01,  8.87539685e-01,  1.27248597e+00],
       ...,
       [-7.76723862e-01, -1.28507102e+00, -3.31476212e+00, ...,
        -4.06769931e-01,  7.73302078e-01,  9.47347879e-01],
       [-6.27444267e-01, -4.39142466e-01, -6.12760365e-01, ...,
         3.71480435e-01,  8.01610589e-01,  9.59540188e-01],
       [ 1.22853718e-03,  4.05979455e-01, -2.99209714e-01, ...,
         2.04931721e-01,  1.06507194e+00,  1.19544768e+00]], dtype=float32)

In [None]:
clf = CatBoostClassifier(loss_function='MultiClass', task_type="GPU", devices='0', random_state=30)

clf.fit(train_embeds, target_train, plot_file="catboost_log.html")

In [682]:
test_pred = clf.predict(test_embeds)
test_proba = clf.predict_proba(test_embeds)[:, 1]

In [683]:
print("Accuracy:", accuracy_score(target_test, test_pred))
print("ROC-AUC:", roc_auc_score(target_test, test_proba))

Accuracy: 0.736
ROC-AUC: 0.8062197471305306


- GPT embeds + Catboost (trx dim: 8):
  - `Accuracy: 0.6948 +- 0.0201`
  - `ROC-AUC: 0.7651 +- 0.0117`

\

- GPT embeds (w/ stat pooling) + Catboost (trx dim: 8):
  - `Accuracy: 0.7004 +- 0.018`
  - `ROC-AUC: 0.7747 +- 0.0104`

\

Плохо, с trx эмбеддингами большей размерности результаты выходит лучше => отказываемся от текущего конфига 

---
- GPT embeds + Catboost (optimal trx dim: 16):
  - `Accuracy: 0.7072 +- 0.0084`
  - `ROC-AUC: 0.7743 +- 0.0119`

\

- GPT embeds (w/ stat pooling) + Catboost (optimal trx dim: 16):
  - `Accuracy: 0.722 +- 0.0139`
  - `ROC-AUC: 0.7887 +- 0.0048`

---

- GPT embeds + Catboost // higher feedforward dim (512):
  - `Accuracy: 0.7188 +- 0.0109` 
  - `ROC-AUC: 0.78 +- 0.0085`

\

- GPT embeds (w/ stat pooling) + Catboost + higher feedforward dim (512):
  - `Accuracy: 0.7276 +- 0.0174`
  - `ROC-AUC: 0.7954 +- 0.0158`

Это лучшие результаты. Также теперь будем использовать только stat pooling, так как он стабильно лучше, чем mean pooling. 

---

- GPT embeds (w/ stat pooling) + Catboost + higher feedforward dim (512), adjusted hyperparams:
  - `Accuracy: 0.7304 +- 0.0101`
  - `ROC-AUC: 0.7957 +- 0.0091`

---

- GPT embeds (w/ stat pooling, w/ Aug) + Catboost + higher feedforward dim (512), adjusted hyperparams:
  - `Accuracy: 0.724 +- 0.009`
  - `ROC-AUC: 0.7935 +- 0.0082`

---

**Итог:** Лучше всего оказался конфиг: GPT embeds (w/ stat pooling) + Catboost + higher feedforward dim (512), adjusted hyperparams

# Итоги.

| Method                  |    Accuracy           | ROC-AUC         |
|-------------------------|-----------------------|-----------------|
| **Flattened Sequences** | 0.67 ± 0.0046         | 0.7536 ± 0.003  |
| **GRU (+ MLP)**         | 0.746 ± 0.0076        | 0.8148 ± 0.0037 |
| **CoLES**               | 0.733 ± 0.019         | 0.8057 ± 0.0088 |
| **CPC Modeling**        | 0.748 ± 0.003         | 0.81 ± 0.0048   |
| **GPT2**                | 0.73 ± 0.01           | 0.7957 ± 0.0091 |