# Импортируем необходимые библиотеки

In [1]:
!pip install pytorch-lifestream
!pip install comet_ml

Collecting pytorch-lifestream
  Downloading pytorch-lifestream-0.6.0.tar.gz (163 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.4/163.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting hydra-core>=1.1.2 (from pytorch-lifestream)
  Downloading hydra_core-1.3.2-py3-none-any.whl.metadata (5.5 kB)
Downloading hydra_core-1.3.2-py3-none-any.whl (154 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.5/154.5 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pytorch-lifestream
  Building wheel for pytorch-lifestream (pyproject.toml) ... [?25l[?25hdone
  Created wheel for pytorch-lifestream: filename=pytorch_lifestream-0.6.0-py3-none-any.whl size=274670 sha256=fa5338776feca17fd11791998c1505f763f62b90ffdf77454136c297c8dc916d
  

In [2]:
# data preprocessing
import os
import numpy as np
import pandas as pd
import pickle

# misc
from tqdm import tqdm
from functools import partial

# logging
import comet_ml

# classical ML
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from catboost import CatBoostClassifier

# basic deep learning libs
import torch
import pytorch_lightning as pl
import torchmetrics

# ptls
from ptls.nn import TrxEncoder, RnnSeqEncoder, TransformerEncoder, GptEncoder, Head
from ptls.frames import PtlsDataModule
from ptls.frames.coles import CoLESModule
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames.cpc import CpcModule
from ptls.frames.cpc import CpcDataset
from ptls.frames.gpt import GptDataset
from ptls.frames.supervised import SeqToTargetDataset, SequenceToTarget
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.datasets import inference_data_loader
from ptls.frames.inference_module import InferenceModule
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.preprocessing import PandasDataPreprocessor
from ptls.data_load.utils import collate_feature_dict
from ptls.frames.inference_module import InferenceModule

# Supervised-кейс. Бустинг

**Препроцессим данные:**

**Скачаем данные:**

In [None]:
path_data = "https://huggingface.co/datasets/dllllb/rosbank-churn/resolve/main/train.csv.gz?download=true"
data = pd.read_csv(path_data, compression="gzip")
data

In [None]:
target = data.groupby(by="cl_id").first().reset_index()[["cl_id", "target_flag"]]
target

In [None]:
data.drop(columns=["PERIOD", "target_flag", "target_sum"], inplace=True)

**Делим таргет на трейн и на тест:**

In [None]:
target_train, target_test = train_test_split(target, test_size=0.1, stratify=target["target_flag"], random_state=42)

**Проводим препроцессинг транзакций (деление на трейн и на тест, подготовка данных под нужный формат):**

In [None]:
trx_data_train = pd.merge(data, target_train["cl_id"], on="cl_id", how="inner")
trx_data_test = pd.merge(data, target_test["cl_id"], on="cl_id", how="inner")

**Дополнительно: предобработка числовых признаков: $x \longrightarrow \text{sign}(x) \cdot \log(1 + |x|)$**

In [None]:
def preprocess_num_features(data, features_list):
    for feature in features_list:
        data[feature] = np.sign(data[feature]) * np.log1p(np.abs(data[feature]))
    return data

In [None]:
trx_data_train = preprocess_num_features(data=trx_data_train, features_list=["amount"])
trx_data_test = preprocess_num_features(data=trx_data_test, features_list=["amount"])

**Предобработка NaN-значений в `channel_type`:**

In [None]:
trx_data_train["channel_type"] = trx_data_train["channel_type"].fillna("none")
trx_data_test["channel_type"] = trx_data_test["channel_type"].fillna("none")

**Обработка даты:**

In [None]:
month2num = {"JAN": "/01/", "FEB": "/02/", "MAR": "/03/", "APR": "/04/", "MAY": "/05/", "JUN": "/06/",
             "JUL": "/07/", "AUG": "/08/", "SEP": "/09/", "OCT": "/10/", "NOV": "/11/", "DEC": "/12/"}

trx_data_train["TRDATETIME"] = trx_data_train["TRDATETIME"].map(lambda x: x[0:2] + month2num[x[2:5]] + x[5:7] + " " + x[8:])
trx_data_test["TRDATETIME"] = trx_data_test["TRDATETIME"].map(lambda x: x[0:2] + month2num[x[2:5]] + x[5:7] + " " + x[8:])

In [None]:
trx_data_train["TRDATETIME"] = pd.to_datetime(trx_data_train["TRDATETIME"],format='%d/%m/%y %H:%M:%S')
trx_data_test["TRDATETIME"] = pd.to_datetime(trx_data_test["TRDATETIME"],format='%d/%m/%y %H:%M:%S')

**Конвертация значений `channel_type` и `trx_category` из str в int:**

In [None]:
chtype2num = {"none": 0, "type1": 1, "type2": 2, "type3": 3, "type4": 4, "type5": 5}

trx_data_train["channel_type"] = trx_data_train["channel_type"].map(lambda x: chtype2num[x])
trx_data_test["channel_type"] = trx_data_test["channel_type"].map(lambda x: chtype2num[x])

In [None]:
trxcat2num = {"POS": 0, "DEPOSIT": 1, "WD_ATM_ROS": 2, "WD_ATM_PARTNER": 3, 
              "C2C_IN": 4, "WD_ATM_OTHER": 5, "C2C_OUT": 6, "BACK_TRX": 7,
              "CAT": 8, "CASH_ADV": 9}

trx_data_train["trx_category"] = trx_data_train["trx_category"].map(lambda x: trxcat2num[x])
trx_data_test["trx_category"] = trx_data_test["trx_category"].map(lambda x: trxcat2num[x])

In [None]:
preprocessor = PandasDataPreprocessor(
    col_id="cl_id",
    col_event_time="TRDATETIME",
    event_time_transformation="dt_to_timestamp",
    cols_category=["MCC", "channel_type", "currency", "trx_category"],
    cols_numerical=["amount"],
    return_records=False,
)

In [None]:
trx_data_train = preprocessor.fit_transform(trx_data_train)
trx_data_test = preprocessor.transform(trx_data_test)

**Конвертируем время из секунд в дни, вычитаем из каждого дня минимальный/самый ранний день:**

In [None]:
trx_data_train["event_time"] //= 86400
trx_data_test["event_time"] //= 86400

In [None]:
min_day_train = np.inf
max_day_train = -np.inf

for idx, row in trx_data_train.iterrows():
    min_day_train = min(min_day_train, row["event_time"].min().item())
    max_day_train = max(max_day_train, row["event_time"].max().item())

min_day_train, max_day_train

In [None]:
trx_data_train["event_time"] -= min_day_train

trx_data_test["event_time"] -= min_day_train

for idx in range(len(trx_data_test)):
    seq_times = torch.clip(trx_data_test.iloc[idx]["event_time"], min=0., max=(max_day_train - min_day_train)).long().unsqueeze(dim=1)
    trx_data_test.loc[idx, "event_time"] = seq_times

In [None]:
target_train.sort_values(by="cl_id", inplace=True)
target_test.sort_values(by="cl_id", inplace=True)

In [None]:
trx_data_train.drop(columns=["cl_id"], inplace=True)
trx_data_test.drop(columns=["cl_id"], inplace=True)
target_train.drop(columns=["cl_id"], inplace=True)
target_test.drop(columns=["cl_id"], inplace=True)

In [None]:
trx_data_train.reset_index(inplace=True, drop=True)
trx_data_test.reset_index(inplace=True, drop=True)
target_train.reset_index(inplace=True, drop=True)
target_test.reset_index(inplace=True, drop=True)

In [None]:
max_seq_length = 0 

for idx, row in trx_data_train.iterrows():
    max_seq_length = max(max_seq_length, row["event_time"].shape[0])

for idx, row in trx_data_test.iterrows():
    max_seq_length = max(max_seq_length, row["event_time"].shape[0])

print("Max Sequence Length:", max_seq_length)

In [None]:
columns = ["event_time", "MCC", "channel_type", "currency", "trx_category", "amount"]

new_tables_columns = []

for i in range(1, max_seq_length + 1):
    for col in columns:
        new_tables_columns.append(col + "_" + str(i))

In [None]:
new_train_table_contents = []

for idx, row in tqdm(trx_data_train.iterrows()):
    new_row = []
    event_time = row["event_time"]
    mcc = row["MCC"]
    ch_type = row["channel_type"]
    currency = row["currency"]
    trx_cat = row["trx_category"]
    amount = row["amount"]
    for j in range(max_seq_length):
        if j < event_time.shape[0]:
            new_row.append(event_time[j].item())
            new_row.append(mcc[j].item())
            new_row.append(ch_type[j].item())
            new_row.append(currency[j].item())
            new_row.append(trx_cat[j].item())
            new_row.append(amount[j].item())
        else:
            for k in range(5):
                new_row.append(-1)
            new_row.append(0.)
    new_train_table_contents.append(new_row)

trx_data_train = pd.DataFrame(data=new_train_table_contents, columns=new_tables_columns)
new_train_table_contents = []

In [None]:
new_test_table_contents = []

for idx, row in tqdm(trx_data_test.iterrows()):
    new_row = []
    event_time = row["event_time"]
    mcc = row["MCC"]
    ch_type = row["channel_type"]
    currency = row["currency"]
    trx_cat = row["trx_category"]
    amount = row["amount"]
    for j in range(max_seq_length):
        if j < event_time.shape[0]:
            new_row.append(event_time[j].item())
            new_row.append(int(mcc[j].item()))
            new_row.append(ch_type[j].item())
            new_row.append(currency[j].item())
            new_row.append(trx_cat[j].item())
            new_row.append(amount[j].item())
        else:
            for k in range(5):
                new_row.append(-1)
            new_row.append(0.)
    new_test_table_contents.append(new_row)

trx_data_test = pd.DataFrame(data=new_test_table_contents, columns=new_tables_columns)
new_test_table_contents = []

In [None]:
cat_features = []

for idx, feature in enumerate(new_tables_columns):
    if idx % 6 != 5:
        cat_features.append(feature)

**Наконец, обучаем бустинг!**

In [None]:
clf = CatBoostClassifier(loss_function='MultiClass', cat_features=cat_features, task_type="GPU", devices='0', random_seed=30)

clf.fit(trx_data_train, target_train, plot_file="catboost_log.html")

In [None]:
test_pred = clf.predict(trx_data_test)
test_proba = clf.predict_proba(trx_data_test)[:, 1]

**Посчитаем метрики:**

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score

In [None]:
print("CatBoost Accuracy:", accuracy_score(target_test["target_flag"], test_pred))
print("CatBoost ROC-AUC:", roc_auc_score(target_test["target_flag"], test_proba))

- **"Flattened" Sequences + CatBoost (Time Features in days):**
    - `Accuracy: 0.6704 +- 0.0046`
    - `ROC-AUC: 0.7536 +- 0.003`

---

# Supervised-кейс. GRU.

**Препроцессим данные:**

**Скачаем данные:**

In [None]:
path_data = "https://huggingface.co/datasets/dllllb/rosbank-churn/resolve/main/train.csv.gz?download=true"
data = pd.read_csv(path_data, compression="gzip")
data

In [None]:
target = data.groupby(by="cl_id").first().reset_index()[["cl_id", "target_flag"]]
target

In [None]:
data.drop(columns=["PERIOD", "target_flag", "target_sum"], inplace=True)

**Делим таргет на трейн и на тест:**

In [None]:
target_train, target_test = train_test_split(target, test_size=0.1, stratify=target["target_flag"], random_state=42)

In [None]:
trx_data_train = pd.merge(data, target_train["cl_id"], on="cl_id", how="inner")
trx_data_test = pd.merge(data, target_test["cl_id"], on="cl_id", how="inner")

**Препроцессинг:**

In [None]:
trx_data_train["channel_type"] = trx_data_train["channel_type"].fillna("none")
trx_data_test["channel_type"] = trx_data_test["channel_type"].fillna("none")

In [None]:
month2num = {"JAN": "/01/", "FEB": "/02/", "MAR": "/03/", "APR": "/04/", "MAY": "/05/", "JUN": "/06/",
             "JUL": "/07/", "AUG": "/08/", "SEP": "/09/", "OCT": "/10/", "NOV": "/11/", "DEC": "/12/"}

trx_data_train["TRDATETIME"] = trx_data_train["TRDATETIME"].map(lambda x: x[0:2] + month2num[x[2:5]] + x[5:7] + " " + x[8:])
trx_data_test["TRDATETIME"] = trx_data_test["TRDATETIME"].map(lambda x: x[0:2] + month2num[x[2:5]] + x[5:7] + " " + x[8:])

trx_data_train["TRDATETIME"] = pd.to_datetime(trx_data_train["TRDATETIME"],format='%d/%m/%y %H:%M:%S')
trx_data_test["TRDATETIME"] = pd.to_datetime(trx_data_test["TRDATETIME"],format='%d/%m/%y %H:%M:%S')

In [None]:
chtype2num = {"none": 0, "type1": 1, "type2": 2, "type3": 3, "type4": 4, "type5": 5}

trx_data_train["channel_type"] = trx_data_train["channel_type"].map(lambda x: chtype2num[x])
trx_data_test["channel_type"] = trx_data_test["channel_type"].map(lambda x: chtype2num[x])

In [None]:
trxcat2num = {"POS": 0, "DEPOSIT": 1, "WD_ATM_ROS": 2, "WD_ATM_PARTNER": 3, 
              "C2C_IN": 4, "WD_ATM_OTHER": 5, "C2C_OUT": 6, "BACK_TRX": 7,
              "CAT": 8, "CASH_ADV": 9}

trx_data_train["trx_category"] = trx_data_train["trx_category"].map(lambda x: trxcat2num[x])
trx_data_test["trx_category"] = trx_data_test["trx_category"].map(lambda x: trxcat2num[x])

In [None]:
preprocessor = PandasDataPreprocessor(
    col_id="cl_id",
    col_event_time="TRDATETIME",
    event_time_transformation="dt_to_timestamp",
    cols_category=["MCC", "channel_type", "currency", "trx_category"],
    cols_numerical=["amount"],
    return_records=False,
)

In [None]:
data_train = preprocessor.fit_transform(trx_data_train)
data_test = preprocessor.transform(trx_data_test)

In [None]:
# data_train["event_time"] //= 86400
# data_test["event_time"] //= 86400

In [None]:
# min_day_train = np.inf
# max_day_train = -np.inf

# for idx, row in data_train.iterrows():
#     min_day_train = min(min_day_train, row["event_time"].min().item())
#     max_day_train = max(max_day_train, row["event_time"].max().item())

# min_day_train, max_day_train

In [None]:
# data_train["event_time"] -= min_day_train

# data_test["event_time"] -= min_day_train

# for idx in range(len(data_test)):
#     seq_times = torch.clip(data_test.iloc[idx]["event_time"], min=0., max=(max_day_train - min_day_train)).long().unsqueeze(dim=1)
#     data_test.loc[idx, "event_time"] = seq_times

In [None]:
data_train = pd.merge(data_train, target_train, on="cl_id")
data_test = pd.merge(data_test, target_test, on="cl_id")

In [None]:
data_train.rename(columns={"target_flag": "target"}, inplace=True)
data_test.rename(columns={"target_flag": "target"}, inplace=True)

In [None]:
data_train = data_train.to_dict(orient="records")
data_test = data_test.to_dict(orient="records")

In [None]:
data_train = MemoryMapDataset(data_train)
data_test = MemoryMapDataset(data_test)

**Создаём DataLoader:**

In [3]:
def seed_everything(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [None]:
seed_everything(30)

In [None]:
data = PtlsDataModule(
    train_data=SeqToTargetDataset(data_train, target_col_name="target", target_dtype=torch.long),
    valid_data=SeqToTargetDataset(data_test, target_col_name="target", target_dtype=torch.long),
    train_batch_size=128,
    valid_batch_size=128,
    train_num_workers=4
)

**Модель (GRU):**

In [None]:
N_EPOCHS = 30

In [None]:
# timestamps = set()

# for i in data_train:
#     timestamps = timestamps.union(set(i["event_time"].tolist()))

# for i in data_test:
#     timestamps = timestamps.union(set(i["event_time"].tolist()))

# len(timestamps)

In [4]:
import torch
from ptls.data_load.padded_batch import PaddedBatch
from ptls.nn.trx_encoder.batch_norm import RBatchNorm, RBatchNormWithLens
from ptls.nn.trx_encoder.noisy_embedding import NoisyEmbedding
from ptls.nn.trx_encoder.trx_encoder_base import TrxEncoderBase
import torch.nn as nn


class Time2Vec(nn.Module):
    def __init__(self, k, interval=86400):
        super(Time2Vec, self).__init__()
        self.k = k
        self.w = nn.Parameter(torch.randn(k))
        self.b = nn.Parameter(torch.randn(k))
        self.w0 = nn.Parameter(torch.randn(1))
        self.b0 = nn.Parameter(torch.randn(1))
        self.interval = interval
        
    def forward(self, event_time, t0):
        t0_ = torch.zeros_like(event_time)
        time_diff=None
        if type(t0)!=int:
            first_column = t0[:, 0].unsqueeze(1)
            t0_ = first_column.expand(-1, t0.size(1))
        time_diff = (event_time - t0_)/self.interval
        v1 = self.w0 * time_diff.unsqueeze(-1) + self.b0
        v2 = torch.cos(self.w * time_diff.unsqueeze(-1) + self.b)
        
        return torch.cat([v1, v2], -1)

        
class TrxEncoderT2V(TrxEncoderBase):
    def __init__(self,
                 embeddings=None,
                 numeric_values=None,
                 custom_embeddings=None,
                 time_values=None,
                 embeddings_noise: float = 0,
                 norm_embeddings=None,
                 use_batch_norm=True,
                 use_batch_norm_with_lens=False,
                 clip_replace_value=None,
                 positions=None,
                 emb_dropout=0,
                 spatial_dropout=False,
                 orthogonal_init=False,
                 linear_projection_size=0,
                 out_of_index: str = 'clip',
                 k=2,
                 time_col='event_time'
                 ):
        if clip_replace_value is not None:
            warnings.warn('`clip_replace_value` attribute is deprecated. Always "clip to max" used. '
                          'Use `out_of_index="assert"` to avoid categorical values clip', DeprecationWarning)

        if positions is not None:
            warnings.warn('`positions` is deprecated. positions is not used', UserWarning)

        if embeddings is None:
            embeddings = {}
        if custom_embeddings is None:
            custom_embeddings = {}
        if time_values is None:
            time_values = {}

        noisy_embeddings = {}
        for emb_name, emb_props in embeddings.items():
            if emb_props.get('disabled', False):
                continue
            if emb_props['in'] == 0 or emb_props['out'] == 0:
                continue
            noisy_embeddings[emb_name] = NoisyEmbedding(
                num_embeddings=emb_props['in'],
                embedding_dim=emb_props['out'],
                padding_idx=0,
                max_norm=1 if norm_embeddings else None,
                noise_scale=embeddings_noise,
                dropout=emb_dropout,
                spatial_dropout=spatial_dropout,
            )

        super().__init__(
            embeddings=noisy_embeddings,
            numeric_values=numeric_values,
            custom_embeddings=custom_embeddings,
            out_of_index=out_of_index,
        )

        custom_embedding_size = self.custom_embedding_size
        if use_batch_norm and custom_embedding_size > 0:
            # :TODO: Should we use Batch norm with not-numerical custom embeddings?
            if use_batch_norm_with_lens:
                self.custom_embedding_batch_norm = RBatchNormWithLens(custom_embedding_size)
            else:
                self.custom_embedding_batch_norm = RBatchNorm(custom_embedding_size)
        else:
            self.custom_embedding_batch_norm = None
        
        self.k = k
        self.time2vec_days = Time2Vec(k=self.k)
        self.time_col = time_col
        
        if linear_projection_size > 0:
            self.linear_projection_head = torch.nn.Linear(super().output_size+k+1, linear_projection_size)
        else:
            self.linear_projection_head = None
            

        if orthogonal_init:
            for n, p in self.named_parameters():
                if n.startswith('embeddings.') and n.endswith('.weight'):
                    torch.nn.init.orthogonal_(p.data[1:])
                if n == 'linear_projection_head.weight':
                    torch.nn.init.orthogonal_(p.data)

    def forward(self, x: PaddedBatch):
        processed_embeddings = []
        processed_custom_embeddings = []

        for field_name in self.embeddings.keys():
            processed_embeddings.append(self.get_category_embeddings(x, field_name))
        
        for field_name in self.custom_embeddings.keys():
            processed_custom_embeddings.append(self.get_custom_embeddings(x, field_name))

        if len(processed_custom_embeddings):
            processed_custom_embeddings = torch.cat(processed_custom_embeddings, dim=2)
            if self.custom_embedding_batch_norm is not None:
                processed_custom_embeddings = PaddedBatch(processed_custom_embeddings, x.seq_lens)
                processed_custom_embeddings = self.custom_embedding_batch_norm(processed_custom_embeddings)
                processed_custom_embeddings = processed_custom_embeddings.payload
            processed_embeddings.append(processed_custom_embeddings)

        out = torch.cat(processed_embeddings, dim=2)

        time_encoded_days = self.time2vec_days(x.payload[self.time_col], x.payload[self.time_col])
        out = torch.cat((out, time_encoded_days), dim=2)

        if self.linear_projection_head is not None:
            out = self.linear_projection_head(out)
        return PaddedBatch(out, x.seq_lens)

    @property
    def output_size(self):
        """Returns hidden size of output representation
        """
        if self.linear_projection_head is not None:
            return self.linear_projection_head.out_features
        return super().output_size + self.k + 1

In [None]:
seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoderT2V(
        embeddings={
            "MCC": {"in": 342, "out": 8},
            "channel_type": {"in": 7, "out": 8},
            "currency": {"in": 60, "out": 8},
            "trx_category": {"in": 11, "out": 8}            
        },
        numeric_values={
            "amount": "log",
        },
        embeddings_noise=0.003,
        k=7,
        time_col="event_time"
    ),
    hidden_size=512,
    is_reduce_sequence=True
)

In [None]:
gru = SequenceToTarget(
    seq_encoder=seq_encoder,
    head=Head(input_size=seq_encoder.embedding_size, objective="classification", num_classes=2, hidden_layers_sizes=[1024]),
    loss=torch.nn.NLLLoss(),
    metric_list=torchmetrics.Accuracy(task="multiclass", num_classes=2),
    optimizer_partial=partial(torch.optim.Adam, lr=1e-4, weight_decay=0),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.CosineAnnealingLR, T_max=N_EPOCHS, eta_min=1e-6)
)

**Обучение:**

In [6]:
comet_ml.login()

In [7]:
from pytorch_lightning.loggers import CometLogger

In [None]:
logger = CometLogger(project_name="evs-ssl-rb", experiment_name="supervised_baseline_GRU")

trainer = pl.Trainer(
    logger=logger,
    max_epochs=N_EPOCHS,
    accelerator="gpu",
    devices=1,
    enable_progress_bar=True
)

In [None]:
trainer.fit(gru, data)

In [None]:
torch.save(gru.state_dict(), "supervised_gru_with_2layered_MLP_rosbank.pt")

In [None]:
print(trainer.logged_metrics)

**Измерим качество на тесте:**

**Используя обученную MLP поверх эмбеддингов:**

In [None]:
test_loader = torch.utils.data.DataLoader(
    dataset=data_test,
    collate_fn=collate_feature_dict,
    shuffle=False,
    batch_size=128,
    num_workers=0,
)

In [None]:
model = InferenceModule(
    torch.nn.Sequential(
        gru,
        torch.nn.Softmax(dim=1),
    ),
    model_out_name="prob",
)

model.eval()

In [None]:
pred = trainer.predict(model, test_loader)

In [None]:
pred = pd.concat(pred, axis=0)

In [None]:
pred

In [None]:
y_pred = pred[[f"prob_{i:04d}" for i in range(2)]].values.argmax(axis=1)
y_pred

In [None]:
y_true = pred["target"].values
y_true

In [None]:
y_proba = pred[[f"prob_{i:04d}" for i in range(2)]].values[:, 1]
y_proba

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score

In [None]:
print("Accuracy:", accuracy_score(y_true, y_pred))
print("ROC-AUC:", roc_auc_score(y_true, y_proba))

- GRU + 2layer MLP Head + Time2Vec (k = 7):
  - `Accuracy: 0.746 +- 0.0076`
  - `ROC-AUC: 0.8148 +- 0.0037`

Без Time2Vec получаются крайне нестабильные/некорректные результаты...

# Self-Supervised Case. COLES, CPC, GPT.

**Данные:**

In [8]:
path_data = "https://huggingface.co/datasets/dllllb/rosbank-churn/resolve/main/train.csv.gz?download=true"
data = pd.read_csv(path_data, compression="gzip")
data

Unnamed: 0,PERIOD,cl_id,MCC,channel_type,currency,TRDATETIME,amount,trx_category,target_flag,target_sum
0,01/10/2017,0,5200,,810,21OCT17:00:00:00,5023.00,POS,0,0.0
1,01/10/2017,0,6011,,810,12OCT17:12:24:07,20000.00,DEPOSIT,0,0.0
2,01/12/2017,0,5921,,810,05DEC17:00:00:00,767.00,POS,0,0.0
3,01/10/2017,0,5411,,810,21OCT17:00:00:00,2031.00,POS,0,0.0
4,01/10/2017,0,6012,,810,24OCT17:13:14:24,36562.00,C2C_OUT,0,0.0
...,...,...,...,...,...,...,...,...,...,...
490508,01/04/2017,10176,6011,type1,810,24APR17:14:05:26,600.00,WD_ATM_ROS,1,405.0
490509,01/06/2017,10171,5411,type1,810,06JUN17:00:00:00,132.00,POS,0,0.0
490510,01/02/2017,10167,5541,type1,810,03FEB17:00:00:00,1000.00,POS,1,280428.2
490511,01/06/2017,10163,5941,type1,810,08JUN17:00:00:00,100.00,POS,0,0.0


In [9]:
target = data.groupby(by="cl_id").first().reset_index()[["cl_id", "target_flag"]]
target

Unnamed: 0,cl_id,target_flag
0,0,0
1,1,0
2,5,1
3,9,0
4,10,0
...,...,...
4995,10210,1
4996,10212,0
4997,10213,0
4998,10214,0


In [10]:
data.drop(columns=["PERIOD", "target_flag", "target_sum"], inplace=True)

In [11]:
target_train, target_test = train_test_split(target, test_size=0.1, stratify=target["target_flag"], random_state=42)

In [12]:
trx_data_train = pd.merge(data, target_train["cl_id"], on="cl_id", how="inner")
trx_data_test = pd.merge(data, target_test["cl_id"], on="cl_id", how="inner")

In [13]:
trx_data_train["channel_type"] = trx_data_train["channel_type"].fillna("none")
trx_data_test["channel_type"] = trx_data_test["channel_type"].fillna("none")

In [14]:
month2num = {"JAN": "/01/", "FEB": "/02/", "MAR": "/03/", "APR": "/04/", "MAY": "/05/", "JUN": "/06/",
             "JUL": "/07/", "AUG": "/08/", "SEP": "/09/", "OCT": "/10/", "NOV": "/11/", "DEC": "/12/"}

trx_data_train["TRDATETIME"] = trx_data_train["TRDATETIME"].map(lambda x: x[0:2] + month2num[x[2:5]] + x[5:7] + " " + x[8:])
trx_data_test["TRDATETIME"] = trx_data_test["TRDATETIME"].map(lambda x: x[0:2] + month2num[x[2:5]] + x[5:7] + " " + x[8:])

trx_data_train["TRDATETIME"] = pd.to_datetime(trx_data_train["TRDATETIME"],format='%d/%m/%y %H:%M:%S')
trx_data_test["TRDATETIME"] = pd.to_datetime(trx_data_test["TRDATETIME"],format='%d/%m/%y %H:%M:%S')

In [15]:
chtype2num = {"none": 0, "type1": 1, "type2": 2, "type3": 3, "type4": 4, "type5": 5}

trx_data_train["channel_type"] = trx_data_train["channel_type"].map(lambda x: chtype2num[x])
trx_data_test["channel_type"] = trx_data_test["channel_type"].map(lambda x: chtype2num[x])

In [16]:
trxcat2num = {"POS": 0, "DEPOSIT": 1, "WD_ATM_ROS": 2, "WD_ATM_PARTNER": 3, 
              "C2C_IN": 4, "WD_ATM_OTHER": 5, "C2C_OUT": 6, "BACK_TRX": 7,
              "CAT": 8, "CASH_ADV": 9}

trx_data_train["trx_category"] = trx_data_train["trx_category"].map(lambda x: trxcat2num[x])
trx_data_test["trx_category"] = trx_data_test["trx_category"].map(lambda x: trxcat2num[x])

---

**Квантизация непрерывных признаков (опциональный шаг, нужен только для GPT):**

In [17]:
def digitize(input_array: np.array, q_count: int = 1, bins: np.array = None):
    """Quantile-based discretization function.

    Parameters:
    -------
    input_array (np.array): Input array.
    q_count (int): Amount of quantiles. Used only if input parameter `bins` is None.
    bins (np.array):
        If None, then calculate bins as quantiles of input array,
        otherwise only apply bins to input_array. Default: None

    Returns
    -------
    out_array (np.array of ints): discretized input_array
    bins (np.array of floats):
        Returned only if input parameter `bins` is None.
    """

    if bins is None:
        return_bins = True
        bins = np.quantile(input_array, q=[i / q_count for i in range(1, q_count)], axis=0)
    else:
        return_bins = False

    out_array = np.digitize(input_array, bins)

    if return_bins:
        return out_array, bins
    else:
        return out_array

In [18]:
BINS_NUM = 128

In [19]:
numeric_features = ["amount"]

for feat in numeric_features:
    trx_data_train[feat], bins = digitize(trx_data_train[feat], q_count=BINS_NUM)
    trx_data_test[feat] = digitize(trx_data_test[feat], bins=bins)

In [20]:
import gc

gc.collect()

60

---

In [17]:
preprocessor = PandasDataPreprocessor(
    col_id="cl_id",
    col_event_time="TRDATETIME",
    event_time_transformation="dt_to_timestamp",
    cols_category=["MCC", "channel_type", "currency", "trx_category"],
    cols_numerical=["amount"],
    return_records=False,
)

In [18]:
data_train = preprocessor.fit_transform(trx_data_train)
data_test = preprocessor.transform(trx_data_test)

In [19]:
target_train.rename(columns={"target_flag": "target"}, inplace=True)
target_test.rename(columns={"target_flag": "target"}, inplace=True)
target_train.sort_values(by="cl_id", inplace=True)
target_test.sort_values(by="cl_id", inplace=True)
target_train = target_train["target"]
target_test = target_test["target"]
target_train.reset_index(drop=True, inplace=True)
target_test.reset_index(drop=True, inplace=True)

In [20]:
data_train = data_train.to_dict(orient="records")
data_test = data_test.to_dict(orient="records")

---

**Определение бинов для time diff'ов (в часах) (опциональный шаг, нужен только для TD-GPT):**

In [25]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)
SECONDS_IN_HOUR = 3600
TIME_DIFF_BINS = 256

time_diffs = []

for batch in tqdm(train_loader):
    timestamps = batch.payload['event_time']
    timestamps_prev = torch.cat([timestamps[:, 0].unsqueeze(1), timestamps[:, :-1]], dim=1)
    batch.payload['time_diff'] = (timestamps - timestamps_prev) // SECONDS_IN_HOUR
    batch.payload['time_diff'][:, 0] = -1

    mask = torch.arange(batch.payload['time_diff'].shape[1], device=batch.device)[None, :] + torch.ones((batch.seq_lens.shape[0], batch.payload['time_diff'].shape[1]), device=batch.device)
    mask[mask > batch.seq_lens[:, None]] = 0.
    mask[mask > 0.] = 1.
    mask = mask.bool()

    batch.payload['time_diff'][~mask] = -1
    
    time_diffs += [batch.payload['time_diff'][batch.payload['time_diff'] != -1].numpy()]
    
time_diffs = np.concatenate(time_diffs)

time_diff_bins = np.quantile(time_diffs, q=[(i / TIME_DIFF_BINS) for i in range(1, TIME_DIFF_BINS)], axis=0)

36it [00:00, 96.37it/s]


In [26]:
time_diff_bins

array([  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   1.,   2.,   3.,
         4.,   5.,   6.,   7.,   7.,   8.,   9.,   

**Можно видеть, что некоторые из границ бинов дублируются, уберем такие дубликаты (внесём таким образом дисбаланс в распределение классов, но сделаем разбиение более корректным).**

In [27]:
time_diff_bins = list(set(time_diff_bins.tolist()))
time_diff_bins.sort()
time_diff_bins = torch.tensor(time_diff_bins, dtype=torch.int)
time_diff_bins

tensor([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
         14,  15,  16,  17,  18,  20,  22,  24,  26,  31,  35,  38,  44,  48,
         54,  62,  72,  82,  96, 114, 120, 144, 168, 216, 300, 458],
       dtype=torch.int32)

In [28]:
TIME_DIFF_BINS_NUM = len(time_diff_bins)

TIME_DIFF_BINS_NUM

40

**Тест:**

In [29]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)
SECONDS_IN_HOUR = 3600

for batch in tqdm(train_loader):
    timestamps = batch.payload['event_time']
    timestamps_prev = torch.cat([timestamps[:, 0].unsqueeze(1), timestamps[:, :-1]], dim=1)
    batch.payload['time_diff'] = (timestamps - timestamps_prev) // SECONDS_IN_HOUR
    batch.payload['time_diff'][:, 0] = -1

    mask = torch.arange(batch.payload['time_diff'].shape[1], device=batch.device)[None, :] + torch.ones((batch.seq_lens.shape[0], batch.payload['time_diff'].shape[1]), device=batch.device)
    mask[mask > batch.seq_lens[:, None]] = 0.
    mask[mask > 0.] = 1.
    mask = mask.bool()

    batch.payload['time_diff'][~mask] = -1

    print(torch.bucketize(batch.payload['time_diff'], time_diff_bins, right=True))

17it [00:00, 80.40it/s]

tensor([[ 0, 37,  1,  ...,  0,  0,  0],
        [ 0,  1, 22,  ...,  0,  0,  0],
        [ 0, 12, 37,  ...,  0,  0,  0],
        ...,
        [ 0, 22,  1,  ...,  0,  0,  0],
        [ 0,  1,  1,  ...,  0,  0,  0],
        [ 0,  1, 22,  ...,  0,  0,  0]])
tensor([[ 0,  1,  1,  ...,  0,  0,  0],
        [ 0,  1, 22,  ...,  0,  0,  0],
        [ 0,  1, 22,  ...,  0,  0,  0],
        ...,
        [ 0,  2,  1,  ...,  0,  0,  0],
        [ 0,  1,  1,  ...,  0,  0,  0],
        [ 0,  1, 39,  ...,  0,  0,  0]])
tensor([[ 0, 22,  1,  ...,  0,  0,  0],
        [ 0,  1, 22,  ...,  0,  0,  0],
        [ 0, 22,  1,  ...,  0,  0,  0],
        ...,
        [ 0, 11, 21,  ...,  0,  0,  0],
        [ 0,  1, 22,  ...,  0,  0,  0],
        [ 0,  1, 32,  ...,  0,  0,  0]])
tensor([[ 0, 22, 17,  ...,  0,  0,  0],
        [ 0, 40, 39,  ...,  0,  0,  0],
        [ 0, 28,  1,  ...,  0,  0,  0],
        ...,
        [ 0,  1, 31,  ...,  0,  0,  0],
        [ 0, 40, 39,  ...,  0,  0,  0],
        [ 0, 19,  6,  ...

36it [00:00, 86.68it/s]

tensor([[ 0,  1,  1,  ...,  0,  0,  0],
        [ 0,  1, 19,  ...,  0,  0,  0],
        [ 0,  3,  1,  ...,  0,  0,  0],
        ...,
        [ 0, 22, 22,  ...,  0,  0,  0],
        [ 0,  1, 13,  ...,  0,  0,  0],
        [ 0,  1,  1,  ...,  0,  0,  0]])
tensor([[ 0,  1, 31,  ...,  0,  0,  0],
        [ 0,  1, 22,  ...,  0,  0,  0],
        [ 0,  1, 23,  ...,  0,  0,  0],
        ...,
        [ 0,  1, 17,  ...,  0,  0,  0],
        [ 0, 37, 22,  ...,  0,  0,  0],
        [ 0, 12, 13,  ...,  0,  0,  0]])
tensor([[ 0,  1,  1,  ...,  0,  0,  0],
        [ 0,  1, 22,  ...,  0,  0,  0],
        [ 0, 22, 28,  ...,  0,  0,  0],
        ...,
        [ 0, 28,  1,  ...,  0,  0,  0],
        [ 0,  1,  1,  ...,  0,  0,  0],
        [ 0, 22,  1,  ...,  0,  0,  0]])
tensor([[ 0, 22,  1,  ...,  0,  0,  0],
        [ 0, 32,  1,  ...,  0,  0,  0],
        [ 0, 11, 22,  ...,  0,  0,  0],
        ...,
        [ 0, 40, 40,  ...,  0,  0,  0],
        [ 0, 28, 12,  ...,  0,  0,  0],
        [ 0,  1,  1,  ...




---

**Train sequences lengths check:**

In [None]:
agg_encoder_params = dict(
    embeddings={
        "MCC": {"in": 342, "out": 8},
        "channel_type": {"in": 7, "out": 8},
        "currency": {"in": 60, "out": 8},
        "trx_category": {"in": 11, "out": 8}            
    },
    numeric_values={"amount": "log"},
    embeddings_noise=0.003,
    k=7,
    time_col="event_time"
)

trx_encoder = TrxEncoderT2V(**agg_encoder_params)
trx_encoder.to("cuda")

In [None]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)

trx_encoder.eval()

seq_lens = []

for batch in tqdm(train_loader):
    embeds_batch = trx_encoder(batch.to("cuda"))
    seq_lens += [embeds_batch.seq_lens.detach().cpu().numpy()]

seq_lens = np.concatenate(seq_lens)

threshold = int(np.quantile(seq_lens, 0.75) * 0.7)

print("Max Length:", threshold)

---

- **COLES:**

In [None]:
seed_everything(42)

**DataLoaders:**

In [None]:
data = PtlsDataModule(
    train_data=ColesDataset(
        MemoryMapDataset(
            data=data_train,
            i_filters=[SeqLenFilter(min_seq_len=10)],
        ),
        splitter=SampleSlices(
            split_count=5,
            cnt_min=5,
            cnt_max=100,
        ),
    ),
    train_num_workers=4,
    train_batch_size=128,
    valid_data=ColesDataset(
        MemoryMapDataset(
            data=data_test,
            i_filters=[SeqLenFilter(min_seq_len=10)],
        ),
        splitter=SampleSlices(
            split_count=5,
            cnt_min=5,
            cnt_max=100,
        ),
    ),
    valid_num_workers=4,
    valid_batch_size=128
)

**Модель:**

In [None]:
N_EPOCHS = 20

In [None]:
trx_encoder_params = dict(
    embeddings={
        "MCC": {"in": 342, "out": 8},
        "channel_type": {"in": 7, "out": 8},
        "currency": {"in": 60, "out": 8},
        "trx_category": {"in": 11, "out": 8}            
    },
    numeric_values={"amount": "log"},
    embeddings_noise=0.003,
    k=7,
    time_col="event_time"
)

seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoderT2V(**trx_encoder_params),
    hidden_size=512,
    type="gru"
)

coles = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=1e-3, weight_decay=0),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.CosineAnnealingLR, T_max=N_EPOCHS, eta_min=5e-6)
)

**Обучение:**

In [None]:
logger = CometLogger(project_name="evs-ssl-rb", experiment_name="CoLES_Baseline")

trainer = pl.Trainer(
    logger=logger,
    max_epochs=N_EPOCHS,
    accelerator="gpu",
    devices=1,
    enable_progress_bar=True
)

In [None]:
trainer.fit(coles, data)

In [None]:
trainer.logged_metrics

In [None]:
torch.save(seq_encoder.state_dict(), "coles_enc_baseline_rosbank.pt")

**Измерим качество на тесте (catboost поверх эмбеддингов):**

In [None]:
# !wget "https://drive.google.com/uc?export=download&id=1Mn8o9IPT4Zzg3946orbw1MVZwpkrBoNb" -O "coles_enc_baseline.pt"

In [None]:
encoder = coles.seq_encoder

# state_dict = torch.load("./coles_enc_baseline.pt")
# encoder.load_state_dict(state_dict)

device = "cuda:0"

encoder.to(device)

In [None]:
from tqdm import tqdm

seed_everything(42)

In [None]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)
encoder.eval()
train_embeds = None

for i, batch in tqdm(enumerate(train_loader)):
    train_embeds_batch = encoder(batch.to(device))
    if i == 0:
        train_embeds = train_embeds_batch.detach().cpu().numpy()
    else:
        train_embeds = np.concatenate([train_embeds, train_embeds_batch.detach().cpu().numpy()], axis=0)
    
train_embeds

In [None]:
test_loader = inference_data_loader(data_test, num_workers=0, batch_size=128)
encoder.eval()
test_embeds = None

for i, batch in tqdm(enumerate(test_loader)):
    test_embeds_batch = encoder(batch.to(device))
    if i == 0:
        test_embeds = test_embeds_batch.detach().cpu().numpy()
    else:
        test_embeds = np.concatenate([test_embeds, test_embeds_batch.detach().cpu().numpy()], axis=0)
    
test_embeds

In [None]:
clf = CatBoostClassifier(loss_function='MultiClass', task_type="GPU", devices='0', random_state=42)

clf.fit(train_embeds, target_train, plot_file="catboost_log.html")

In [None]:
test_pred = clf.predict(test_embeds)
test_proba = clf.predict_proba(test_embeds)[:, 1]

In [None]:
print("Accuracy:", accuracy_score(target_test, test_pred))
print("ROC-AUC:", roc_auc_score(target_test, test_proba))

- COLES embeds + Catboost:
  - `Accuracy: 0.736`, `0.72`, `0.722`, avg: `0.726 +- 0.0071` 
  -  `ROC-AUC: 0.8099107995661394`, `0.8041475773421184`, `0.8088423370189894`, avg: `0.8076 +- 0.0025`

---

**Train sequences lengths check:**

In [21]:
agg_encoder_params = dict(
    embeddings={
        "MCC": {"in": 342, "out": 8},
        "channel_type": {"in": 7, "out": 8},
        "currency": {"in": 60, "out": 8},
        "trx_category": {"in": 11, "out": 8}            
    },
    numeric_values={"amount": "log"},
    embeddings_noise=0.003,
    k=7,
    time_col="event_time"
)

trx_encoder = TrxEncoderT2V(**agg_encoder_params)
trx_encoder.to("cuda")

TrxEncoderT2V(
  (embeddings): ModuleDict(
    (MCC): NoisyEmbedding(
      342, 8, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
    (channel_type): NoisyEmbedding(
      7, 8, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
    (currency): NoisyEmbedding(
      60, 8, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
    (trx_category): NoisyEmbedding(
      11, 8, padding_idx=0
      (dropout): Dropout(p=0, inplace=False)
    )
  )
  (custom_embeddings): ModuleDict(
    (amount): LogScaler()
  )
  (custom_embedding_batch_norm): RBatchNorm(
    (bn): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (time2vec_days): Time2Vec()
)

In [22]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)

trx_encoder.eval()

seq_lens = []

for batch in tqdm(train_loader):
    embeds_batch = trx_encoder(batch.to("cuda"))
    seq_lens += [embeds_batch.seq_lens.detach().cpu().numpy()]

seq_lens = np.concatenate(seq_lens)

threshold = int(np.quantile(seq_lens, 0.595))

print("Max Length:", threshold)

36it [00:00, 53.66it/s]

Max Length: 105





---

- **CPC modeling:**

In [102]:
seed_everything(42)

**DataLoaders:**

In [103]:
# min_len <--- 0.4 - 0.5 quantile
# max_len <--- 0.8 - 0.9 quantile

data = PtlsDataModule(
    train_data=CpcDataset(
        MemoryMapDataset(data=data_train),
        min_len=85,             
        max_len=105
    ),
    train_num_workers=4,
    train_batch_size=128,
    valid_data=CpcDataset(
        MemoryMapDataset(data=data_test),
        min_len=85,
        max_len=105
    ),
    valid_num_workers=4,
    valid_batch_size=128
)

**Модель:**

In [104]:
N_EPOCHS = 20

In [105]:
trx_encoder_params = dict(
    embeddings={
        "MCC": {"in": 342, "out": 8}, # 8 / 16 / 32
        "channel_type": {"in": 7, "out": 8},
        "currency": {"in": 60, "out": 8},
        "trx_category": {"in": 11, "out": 8}            
    },
    numeric_values={"amount": "log"},
    embeddings_noise=0.003,
    k=7, # / 31
    time_col="event_time"
)

seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoderT2V(**trx_encoder_params),
    hidden_size=512,
    type="gru"
)

cpc = CpcModule(
    seq_encoder=seq_encoder,
    n_forward_steps=6,
    n_negatives=40,
    optimizer_partial=partial(torch.optim.Adam, lr=5e-4),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=5, gamma=0.5)
)

**Обучение:**

In [106]:
logger = CometLogger(project_name="evs-ssl-rb", experiment_name="CPC_modeling_baseline")

trainer = pl.Trainer(
    logger=logger,
    max_epochs=N_EPOCHS,
    accelerator="gpu",
    devices=1,
    enable_progress_bar=True
)

In [107]:
trainer.fit(cpc, data)

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/askoro/evs-ssl-rb/f6f155bdffb34953b02ec523c27a23be

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/kaggle/working' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]


The number of training batches (36) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.



Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : CPC_modeling_baseline
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/askoro/evs-ssl-rb/f6f155bdffb34953b02ec523c27a23be
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     loss [86]               : (2.7709453105926514, 596.8519897460938)
[1;38;5;39mCOMET INFO:[0m     seq_len [14]            : (60.9140625, 70.6640625)
[1;38;5;39mCOMET INFO:[0m     valid/cpc_accuracy [20] : (0.03508007526397705, 0.23528136312961578)
[1;38;5;39mCOMET INFO:[0m   Others:
[1;38;5;39mCOMET INFO:[0m     N

In [108]:
trainer.logged_metrics

{'loss': tensor(2.7622),
 'seq_len': tensor(68.3000),
 'valid/cpc_accuracy': tensor(0.2353)}

In [None]:
torch.save(seq_encoder.state_dict(), "cpc_enc_baseline_rosbank.pt")

**Измерим качество на тесте (catboost поверх эмбеддингов):**

In [None]:
# !wget "https://drive.google.com/uc?export=download&id=11j6QgNsdOSTK-GRaAJLKObDW7ehS_aqK" -O "cpc_enc_baseline_higher_trx_dim.pt"

In [109]:
encoder = cpc.seq_encoder

# state_dict = torch.load("./cpc_enc_baseline_higher_trx_dim.pt")
# encoder.load_state_dict(state_dict)

device = "cuda:0"

encoder.to(device)

RnnSeqEncoder(
  (trx_encoder): TrxEncoderT2V(
    (embeddings): ModuleDict(
      (MCC): NoisyEmbedding(
        342, 16, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (channel_type): NoisyEmbedding(
        7, 16, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (currency): NoisyEmbedding(
        60, 16, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
      (trx_category): NoisyEmbedding(
        11, 16, padding_idx=0
        (dropout): Dropout(p=0, inplace=False)
      )
    )
    (custom_embeddings): ModuleDict(
      (amount): LogScaler()
    )
    (custom_embedding_batch_norm): RBatchNorm(
      (bn): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (time2vec_days): Time2Vec()
  )
  (seq_encoder): RnnEncoder(
    (rnn): GRU(81, 512, batch_first=True)
    (reducer): LastStepEncoder()
  )
)

In [110]:
encoder.seq_encoder.is_reduce_sequence = True

In [111]:
from tqdm import tqdm

seed_everything(42)

In [112]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=128)
encoder.eval()
train_embeds = None

for i, batch in tqdm(enumerate(train_loader)):
    train_embeds_batch = encoder(batch.to(device))
    if i == 0:
        train_embeds = train_embeds_batch.detach().cpu().numpy()
    else:
        train_embeds = np.concatenate([train_embeds, train_embeds_batch.detach().cpu().numpy()], axis=0)
    
train_embeds

36it [00:01, 22.69it/s]


array([[-0.3059753 , -0.42983228, -0.9721707 , ...,  0.42577967,
        -0.7692118 ,  0.30192366],
       [-0.80275613, -0.99693406, -0.9999986 , ...,  0.99992496,
        -0.99221736,  0.8753204 ],
       [-0.607194  , -0.978649  , -0.9999967 , ...,  0.99951106,
        -0.9924645 ,  0.71040785],
       ...,
       [-0.47359732, -0.99158263, -0.9999974 , ...,  0.9997335 ,
        -0.9909821 ,  0.8622924 ],
       [-0.6813939 , -0.99388015, -0.99999946, ...,  0.9996245 ,
        -0.994852  ,  0.861548  ],
       [-0.07193667, -0.6361093 , -0.84081423, ...,  0.27471218,
        -0.49657768,  0.5544129 ]], dtype=float32)

In [113]:
test_loader = inference_data_loader(data_test, num_workers=0, batch_size=128)
encoder.eval()
test_embeds = None

for i, batch in tqdm(enumerate(test_loader)):
    test_embeds_batch = encoder(batch.to(device))
    if i == 0:
        test_embeds = test_embeds_batch.detach().cpu().numpy()
    else:
        test_embeds = np.concatenate([test_embeds, test_embeds_batch.detach().cpu().numpy()], axis=0)
    
test_embeds

4it [00:00, 21.85it/s]


array([[-0.47416168, -0.9961268 , -0.9999986 , ...,  0.99985564,
        -0.9919283 ,  0.88082623],
       [-0.30373296, -0.69596326, -0.9999964 , ...,  0.974257  ,
        -0.9653468 ,  0.51078546],
       [-0.72930896, -0.99613523, -0.9999987 , ...,  0.99991274,
        -0.989351  ,  0.8638159 ],
       ...,
       [-0.33234674, -0.8184873 , -0.999999  , ...,  0.9724452 ,
        -0.975673  ,  0.43266684],
       [-0.41403773, -0.95594674, -0.99999624, ...,  0.99969405,
        -0.99061775,  0.62632334],
       [-0.86816657, -0.99801064, -0.99999946, ...,  0.9998905 ,
        -0.98666847,  0.9339304 ]], dtype=float32)

In [114]:
clf = CatBoostClassifier(loss_function='MultiClass', task_type="GPU", devices='0', random_state=42)

clf.fit(train_embeds, target_train, plot_file="catboost_log.html")

Learning rate set to 0.088214
0:	learn: 0.6653456	total: 8.72ms	remaining: 8.71s
1:	learn: 0.6411280	total: 14.7ms	remaining: 7.33s
2:	learn: 0.6201802	total: 20.3ms	remaining: 6.75s
3:	learn: 0.6027288	total: 26.3ms	remaining: 6.54s
4:	learn: 0.5872571	total: 32.2ms	remaining: 6.41s
5:	learn: 0.5743535	total: 38.2ms	remaining: 6.33s
6:	learn: 0.5625887	total: 44.4ms	remaining: 6.29s
7:	learn: 0.5520436	total: 50.5ms	remaining: 6.26s
8:	learn: 0.5429469	total: 56.4ms	remaining: 6.21s
9:	learn: 0.5346612	total: 62.5ms	remaining: 6.19s
10:	learn: 0.5269486	total: 68.6ms	remaining: 6.17s
11:	learn: 0.5207294	total: 74.7ms	remaining: 6.15s
12:	learn: 0.5150617	total: 80.7ms	remaining: 6.12s
13:	learn: 0.5097575	total: 86.8ms	remaining: 6.11s
14:	learn: 0.5052604	total: 93ms	remaining: 6.11s
15:	learn: 0.5002960	total: 99.2ms	remaining: 6.1s
16:	learn: 0.4959664	total: 105ms	remaining: 6.09s
17:	learn: 0.4923665	total: 112ms	remaining: 6.09s
18:	learn: 0.4883937	total: 118ms	remaining: 6.08

<catboost.core.CatBoostClassifier at 0x7928f1618460>

In [115]:
test_pred = clf.predict(test_embeds)
test_proba = clf.predict_proba(test_embeds)[:, 1]

In [116]:
print("Accuracy:", accuracy_score(target_test, test_pred))
print("ROC-AUC:", roc_auc_score(target_test, test_proba))

Accuracy: 0.76
ROC-AUC: 0.803386702497936


In [118]:
arr = np.array([0.8086318822748539, 0.799711838888799, 0.803386702497936])

arr.mean(), arr.std()

(0.8039101412205296, 0.0036603537692877276)

- CPC context embeds + Catboost (low dim of trx embeds: each embed is of dim 8):
  - `Accuracy: 0.7336 +- 0.0119`
  - `ROC-AUC: 0.8078 +- 0.004`
  
---

- CPC context embeds + Catboost (higher dim of trx embeds: each embed is of dim 16):
  - `Accuracy: 0.7464 +- 0.0099`
  - `ROC-AUC: 0.805 +- 0.00598`

---

- CPC context embeds + Catboost (even higher dim of trx embeds: each embed is of dim 32):
  - `Accuracy: 0.7372 +- 0.0144`
  - `ROC-AUC: 0.8099 +- 0.0069`

---

- CPC context embeds w/ Aug + Catboost (dim of trx embeds: 8):
  - `Accuracy: 0.754`, `0.742`, `0.744`, avg: `0.7467 +- 0.0052`
  - `ROC-AUC: 0.8175195480079649`, `0.8197697948875686`, `0.8122096129251591`, avg: `0.8165 +- 0.0032`

---

- CPC context embeds w/ Aug + Catboost (dim of trx embeds: 16):
  - `Accuracy: 0.734`, `0.722`, `0.76`, avg: `0.739 +- 0.0159`
  - `ROC-AUC: 0.8086318822748539`, `0.799711838888799`, `0.803386702497936`, avg: `0.8039 +- 0.0037`

---

- CPC context embeds w/ Aug + Catboost (dim of trx embeds: 32):
  - `Accuracy: 0.752`, `0.748`, `0.742`, avg: `0.7473 +- 0.0041`
  - `ROC-AUC: 0.8051836622363244`, `0.8137313626135242`, `0.810639296757378`, avg: `0.8099 +- 0.0035`


**При обучении с аугментациями получаем лучшее качество => будем их использовать. Лучшие результаты - у конфигураций CPC context embeds w/ Aug + Catboost (dim of trx embeds: 8) и CPC context embeds w/ Aug + Catboost (dim of trx embeds: 32), полученные результаты в целом сравнимы по accuracy, но у первой конфигурации средний ROC-AUC значительно выше, чем у другой.**

---

- CPC context embeds w/ Aug + Catboost (dim of trx embeds: 32) (0.5 - 0.9 quantiles):
  - `Accuracy: 0.73`, `0.752`, `0.754`, avg: `0.7453 +- 0.0109`
  - `ROC-AUC: 0.806478768354082`, `0.8063816353952502`, `0.8105907302779621`, avg: `0.8078 +- 0.002`

\

- CPC context embeds w/ Aug + Catboost (dim of trx embeds: 32) (0.4 - 0.8 quantiles):
  - `Accuracy: 0.738`, `0.75`, `0.746`, avg: `0.7447 +- 0.005`
  - `ROC-AUC: 0.8080490845218631`, `0.8037914231597353`, `0.8044875426980298`, avg: `0.8054 +- 0.0019`

**Всё ещё лучше аугментации с `min_len=85, max_len=105`, т.е. с 0.505-, 0.595-квантилями. Округлим результаты, для дальнейших экспериментов будем брать `min_len` = 0.5-квантиль, `max_len` = 0.6-квантиль.**

---

- **GPT:**

In [229]:
seed_everything(42)

**DataLoaders:**

In [230]:
data = PtlsDataModule(
    train_data=GptDataset(
        MemoryMapDataset(data=data_train),
        min_len=1000, # 85
        max_len=1200 # 105
    ),
    train_num_workers=4,
    train_batch_size=64,
    valid_data=GptDataset(
        MemoryMapDataset(data=data_test),
        min_len=1000,
        max_len=1200
    ),
    valid_num_workers=4,
    valid_batch_size=64
)

**Модель:**

In [231]:
from torchmetrics import MeanMetric
from typing import Tuple, Dict, List, Union
from torch import nn
from ptls.nn.seq_encoder.abs_seq_encoder import AbsSeqEncoder
from ptls.nn import PBL2Norm
from ptls.data_load.padded_batch import PaddedBatch


class MeanPooling(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, pb: PaddedBatch):
        payload = pb.payload # (B, T, H)
        mask = pb.seq_len_mask.bool()
        pb_mean = payload.sum(dim=1) / mask.float().sum(dim=1, keepdim=True)
        return pb_mean


class StatPooling(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, pb: PaddedBatch):
        payload = pb.payload # (B, T, H)
        mask = pb.seq_len_mask.bool()
        inf_mask = torch.zeros_like(mask, device=mask.device).float()
        inf_mask[~mask] = -torch.inf
        
        pb_mean = payload.sum(dim=1) / mask.float().sum(dim=1, keepdim=True)
        pb_max = torch.max(payload + inf_mask.unsqueeze(-1), dim=1)[0]
        pb_stat = torch.cat((pb_mean, pb_max), dim=1)
        return pb_stat


class GPTHead(torch.nn.Module):   
    def __init__(self, input_size, n_classes, hidden_size=64, drop_p=0.1):
        super().__init__()
        self.head = nn.Sequential(
            nn.Linear(input_size, hidden_size, bias=True),
            nn.GELU(),
            nn.Dropout(drop_p),
            nn.Linear(hidden_size, n_classes)
        )
    def forward(self, x):
        x = self.head(x)
        return x


class GptPretrainModule(pl.LightningModule):
    """GPT2 Language model

    Sequence transactions are encoded by `trx_encoder`.
    Then `seq_encoder` encodes the given sequence 
    (we actually use NN to modify sequence transactions representations,
    then (during inference) we calculate the mean of these encoded transactions to get the representation of the whole sequence).
    After this we use heads to predict the classes of features of the future transaction.

    Parameters
    ----------
    trx_encoder:
        Module for transform dict with feature sequences to sequence of transaction representations
    seq_encoder:
        Module for sequence processing. Generally this is transformer based encoder. Rnn is also possible
        Should work without sequence reduction
    head_hidden_size:
        Hidden size of heads for feature prediction
    seed_seq_len:
         Size of starting sequence without loss 
    total_steps:
        total_steps expected in OneCycle lr scheduler
    max_lr:
        max_lr of OneCycle lr scheduler
    weight_decay:
        weight_decay of Adam optimizer
    pct_start:
        % of total_steps when lr increase
    norm_predict:
        use l2 norm for transformer output or not
    """

    def __init__(self,
                 trx_encoder: torch.nn.Module,
                 seq_encoder: AbsSeqEncoder,
                 head_hidden_size: int = 64,
                 total_steps: int = 64000,
                 seed_seq_len: int = 16,
                 max_lr: float = 0.00005,
                 weight_decay: float = 0.0,
                 pct_start: float = 0.1,
                 norm_predict: bool = False
                 ):

        super().__init__()
        self.save_hyperparameters(ignore=['trx_encoder', 'seq_encoder'])

        self.trx_encoder = trx_encoder
        self._seq_encoder = seq_encoder
        self._seq_encoder.is_reduce_sequence = False

        self.head = nn.ModuleDict()
        for col_name, noisy_emb in self.trx_encoder.embeddings.items():
            self.head[col_name] = GPTHead(input_size=self._seq_encoder.embedding_size, hidden_size=head_hidden_size, n_classes=noisy_emb.num_embeddings)

        if self.hparams.norm_predict:
            self.fn_norm_predict = PBL2Norm()

        self.loss = nn.CrossEntropyLoss(ignore_index=0)

        self.train_gpt_loss = MeanMetric()
        self.valid_gpt_loss = MeanMetric()

    def forward(self, batch: PaddedBatch):
        z_trx = self.trx_encoder(batch) 
        out = self._seq_encoder(z_trx)
        if self.hparams.norm_predict:
            out = self.fn_norm_predict(out)
        return out

    def loss_gpt(self, logits, labels):
        loss = 0
        for col_name, head in self.head.items():
            y_pred = head(logits[:, self.hparams.seed_seq_len:-1, :])
            y_pred = y_pred.view(-1, y_pred.size(-1))

            y_true = labels[col_name][:, self.hparams.seed_seq_len+1:]
            y_true = torch.flatten(y_true.long())
            
            loss += self.loss(y_pred, y_true)
            
        return loss

    def training_step(self, batch, batch_idx):
        out = self.forward(batch)  # PB: B, T, H
        out = out.payload if isinstance(out, PaddedBatch) else out
        labels = batch.payload
        
        loss_gpt = self.loss_gpt(out, labels)
        self.train_gpt_loss(loss_gpt)
        self.log(f'loss', loss_gpt, sync_dist=True)
        return loss_gpt

    def validation_step(self, batch, batch_idx):
        out = self.forward(batch)  # PB: B, T, H
        out = out.payload if isinstance(out, PaddedBatch) else out
        labels = batch.payload
        
        loss_gpt = self.loss_gpt(out, labels)
        self.valid_gpt_loss(loss_gpt)

    def on_training_epoch_end(self):
        self.log('train loss (by epochs)', self.train_gpt_loss, prog_bar=True, logger=True, sync_dist=True, rank_zero_only=True)

    def on_validation_epoch_end(self):
        self.log('val loss (by epochs)', self.valid_gpt_loss, prog_bar=True, logger=True, sync_dist=True, rank_zero_only=True)

    def configure_optimizers(self):
        optim = torch.optim.NAdam(self.parameters(),
                                  lr=self.hparams.max_lr,
                                  weight_decay=self.hparams.weight_decay
                                 )
        
        scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer=optim,
            max_lr=self.hparams.max_lr,
            total_steps=self.hparams.total_steps,
            pct_start=self.hparams.pct_start,
            anneal_strategy='cos',
            cycle_momentum=False,
            div_factor=25.0,
            final_div_factor=10000.0,
            three_phase=False
        )
        
        scheduler = {'scheduler': scheduler, 'interval': 'step'}
        return [optim], [scheduler]
    
    @property
    def seq_encoder(self):
        return GPTInferenceModule(pretrained_model=self)


class GPTInferenceModule(torch.nn.Module):
    def __init__(self, pretrained_model):
        super().__init__()
        self.model = pretrained_model
        self.model.is_reduce_sequence = False
        self.mean_pooling = MeanPooling()
        self.stat_pooling = StatPooling()

    def forward(self, batch, eval_strategy="mean"):
        z_trx = self.model.trx_encoder(batch)
        out = self.model._seq_encoder(z_trx)
        out = out if isinstance(out, PaddedBatch) else PaddedBatch(out, batch.seq_lens)

        if eval_strategy == "mean":
            out = self.mean_pooling(out)
        elif eval_strategy == "stat":
            out = self.stat_pooling(out)

        if self.model.hparams.norm_predict:
            out = out / (out.pow(2).sum(dim=-1, keepdim=True) + 1e-9).pow(0.5)
        return out

**Модель с учетом времени (предсказываем, сколько времени (в часах) должно пройти от текущей до следующей транзакции):**

In [232]:
class GptPretrainWithTimeDiffsModule(pl.LightningModule):
    def __init__(self,
                 trx_encoder: torch.nn.Module,
                 seq_encoder: AbsSeqEncoder,
                 time_diffs_boundaries: torch.tensor,
                 time_diffs_bins_num: int,
                 head_hidden_size: int = 64,
                 total_steps: int = 64000,
                 seed_seq_len: int = 16,
                 max_lr: float = 0.00005,
                 weight_decay: float = 0.0,
                 pct_start: float = 0.1,
                 norm_predict: bool = False
                 ):

        super().__init__()
        self.save_hyperparameters(ignore=['trx_encoder', 'seq_encoder'])

        self.trx_encoder = trx_encoder
        self._seq_encoder = seq_encoder
        self._seq_encoder.is_reduce_sequence = False

        self.head = nn.ModuleDict()
        for col_name, noisy_emb in self.trx_encoder.embeddings.items():
            self.head[col_name] = GPTHead(input_size=self._seq_encoder.embedding_size, hidden_size=head_hidden_size, n_classes=noisy_emb.num_embeddings)

        self.head['time_diff'] = GPTHead(input_size=self._seq_encoder.embedding_size, hidden_size=head_hidden_size, n_classes=time_diffs_bins_num) 

        if self.hparams.norm_predict:
            self.fn_norm_predict = PBL2Norm()

        self.loss = nn.CrossEntropyLoss(ignore_index=0)

        self.train_gpt_loss = MeanMetric()
        self.valid_gpt_loss = MeanMetric()

        self.time_diffs_boundaries = time_diffs_boundaries
        self.SECONDS_IN_HOUR = 3600

    def forward(self, batch: PaddedBatch):
        z_trx = self.trx_encoder(batch) 
        out = self._seq_encoder(z_trx)
        if self.hparams.norm_predict:
            out = self.fn_norm_predict(out)
        return out

    def loss_gpt(self, logits, labels):
        loss = 0
        for col_name, head in self.head.items():
            y_pred = head(logits[:, self.hparams.seed_seq_len:-1, :])
            y_pred = y_pred.view(-1, y_pred.size(-1))

            y_true = labels[col_name][:, self.hparams.seed_seq_len+1:]
            y_true = torch.flatten(y_true.long())
            
            loss += self.loss(y_pred, y_true)
            
        return loss

    def training_step(self, batch, batch_idx):
        out = self.forward(batch)  # PB: B, T, H
        out = out.payload if isinstance(out, PaddedBatch) else out
        labels = batch.payload

        timestamps = labels['event_time']
        timestamps_prev = torch.cat([timestamps[:, 0].unsqueeze(1), timestamps[:, :-1]], dim=1)
        labels['time_diff'] = (timestamps - timestamps_prev) // self.SECONDS_IN_HOUR
        labels['time_diff'][:, 0] = -1

        mask = torch.arange(labels['time_diff'].shape[1], device=batch.device)[None, :] + torch.ones((batch.seq_lens.shape[0], labels['time_diff'].shape[1]), device=batch.device)
        mask[mask > batch.seq_lens[:, None]] = 0.
        mask[mask > 0.] = 1.
        mask = mask.bool()

        labels['time_diff'][~mask] = -1

        labels['time_diff'] = torch.bucketize(labels['time_diff'], self.time_diffs_boundaries.to(batch.device), right=True)
        
        loss_gpt = self.loss_gpt(out, labels)
        self.train_gpt_loss(loss_gpt)
        self.log(f'loss', loss_gpt, sync_dist=True)
        return loss_gpt

    def validation_step(self, batch, batch_idx):
        out = self.forward(batch)  # PB: B, T, H
        out = out.payload if isinstance(out, PaddedBatch) else out
        labels = batch.payload

        timestamps = labels['event_time']
        timestamps_prev = torch.cat([timestamps[:, 0].unsqueeze(1), timestamps[:, :-1]], dim=1)
        labels['time_diff'] = (timestamps - timestamps_prev) // self.SECONDS_IN_HOUR
        labels['time_diff'][:, 0] = -1

        mask = torch.arange(labels['time_diff'].shape[1], device=batch.device)[None, :] + torch.ones((batch.seq_lens.shape[0], labels['time_diff'].shape[1]), device=batch.device)
        mask[mask > batch.seq_lens[:, None]] = 0.
        mask[mask > 0.] = 1.
        mask = mask.bool()

        labels['time_diff'][~mask] = -1

        labels['time_diff'] = torch.bucketize(labels['time_diff'], self.time_diffs_boundaries.to(batch.device), right=True)
        
        loss_gpt = self.loss_gpt(out, labels)
        self.valid_gpt_loss(loss_gpt)

    def on_training_epoch_end(self):
        self.log('train loss (by epochs)', self.train_gpt_loss, prog_bar=True, logger=True, sync_dist=True, rank_zero_only=True)

    def on_validation_epoch_end(self):
        self.log('val loss (by epochs)', self.valid_gpt_loss, prog_bar=True, logger=True, sync_dist=True, rank_zero_only=True)

    def configure_optimizers(self):
        optim = torch.optim.NAdam(self.parameters(),
                                  lr=self.hparams.max_lr,
                                  weight_decay=self.hparams.weight_decay
                                 )
        
        scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer=optim,
            max_lr=self.hparams.max_lr,
            total_steps=self.hparams.total_steps,
            pct_start=self.hparams.pct_start,
            anneal_strategy='cos',
            cycle_momentum=False,
            div_factor=25.0,
            final_div_factor=10000.0,
            three_phase=False
        )
        
        scheduler = {'scheduler': scheduler, 'interval': 'step'}
        return [optim], [scheduler]
    
    @property
    def seq_encoder(self):
        return GPTInferenceModule(pretrained_model=self)

In [233]:
N_EPOCHS = 20

In [154]:
# # feedforward_dim = 256, embed_dim = 16

# trx_encoder = TrxEncoderT2V(
#     embeddings={
#         "MCC": {"in": 342, "out": 16},
#         "channel_type": {"in": 7, "out": 16},
#         "currency": {"in": 60, "out": 16},
#         "trx_category": {"in": 11, "out": 16},
#         "amount": {"in": BINS_NUM, "out": 16}
#     },
#     embeddings_noise=0.003,
#     k=15,
#     time_col="event_time"
# )

# seq_encoder = GptEncoder(
#     n_embd=trx_encoder.output_size,
#     n_layer=6,
#     n_head=6,
#     n_inner=256,
#     activation_function="gelu_new",
#     resid_pdrop=0.1,
#     embd_pdrop=0.1,
#     attn_pdrop=0.1,
#     n_positions=2048,
#     use_positional_encoding=True,
#     use_start_random_shift=True,
#     is_reduce_sequence=False
# )

# gpt = GptPretrainModule(
#     trx_encoder=trx_encoder,
#     seq_encoder=seq_encoder,
#     head_hidden_size=256,
#     total_steps=(N_EPOCHS * 71), # num_epochs * num_steps_per_epoch
#     seed_seq_len=16,
#     max_lr=3e-3,
#     weight_decay=3e-4, # try adding weight_decay > 0
#     pct_start=0.1,
#     norm_predict=False # never use it again
# )

In [234]:
trx_encoder = TrxEncoderT2V(
    embeddings={
        "MCC": {"in": 342, "out": 16},
        "channel_type": {"in": 7, "out": 16},
        "currency": {"in": 60, "out": 16},
        "trx_category": {"in": 11, "out": 16},
        "amount": {"in": BINS_NUM, "out": 16}
    },
    embeddings_noise=0.003,
    k=15,
    time_col="event_time"
)

seq_encoder = GptEncoder(
    n_embd=trx_encoder.output_size,
    n_layer=6,
    n_head=6,
    n_inner=256,
    activation_function="gelu_new",
    resid_pdrop=0.1,
    embd_pdrop=0.1,
    attn_pdrop=0.1,
    n_positions=2048,
    use_positional_encoding=True,
    use_start_random_shift=True,
    is_reduce_sequence=False
)

gpt = GptPretrainWithTimeDiffsModule(
    trx_encoder=trx_encoder,
    seq_encoder=seq_encoder,
    time_diffs_boundaries=time_diff_bins,
    time_diffs_bins_num=(TIME_DIFF_BINS_NUM + 1), # (boundaries num) + (1 before the first boundary (OOD)) 
    head_hidden_size=256,
    total_steps=(N_EPOCHS * 71), # num_epochs * num_steps_per_epoch
    seed_seq_len=16,
    max_lr=3e-3, # 2e-3 if hidden_dim = 512
    weight_decay=3e-4, # try adding weight_decay > 0
    pct_start=0.1,
    norm_predict=False # never use it again
)

**Обучение:**

In [None]:
!export HYDRA_FULL_ERROR=1

In [235]:
logger = CometLogger(project_name="evs-ssl-rb", experiment_name="TD-GPT_modeling_baseline (seed_seq_len=0, emb_dim=16)")

trainer = pl.Trainer(
    logger=logger,
    max_epochs=N_EPOCHS,
    accelerator="gpu",
    devices=1,
    enable_progress_bar=True
)

In [236]:
trainer.fit(gpt, data)

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/askoro/evs-ssl-rb/8721ed5467a643209be0e247d56e72b6

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/kaggle/working' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : TD-GPT_modeling_baseline (seed_seq_len=0, emb_dim=16)
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/askoro/evs-ssl-rb/8721ed5467a643209be0e247d56e72b6
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     loss [170]                : (9.27989673614502, 23.25154685974121)
[1;38;5;39mCOMET INFO:[0m     val loss (by epochs) [20] : (9.735424995422363, 11.22779655456543)
[1;38;5;39mCOMET INFO:[0m   Others:
[1;38;5;39mCOMET INFO:[0m     Name : TD-GPT_modeling_baseline (seed_seq_len=0, emb_d

In [237]:
trainer.logged_metrics

{'loss': tensor(9.0383), 'val loss (by epochs)': tensor(9.7354)}

In [238]:
encoder = gpt.seq_encoder

In [None]:
torch.save(encoder.state_dict(), "gpt_baseline_rosbank.pt")

**Измерим качество на тесте (catboost поверх эмбеддингов):**

In [None]:
# !pip install gdown

In [None]:
# import gdown

# gdown.download("https://drive.google.com/uc?export=download&id=1YBstN7hpEIREo7zORmPoEZ_0NyBgfjm6", "gpt_baseline_NAdam.pt")

In [239]:
# state_dict = torch.load("./gpt_baseline_NAdam.pt")
# encoder.load_state_dict(state_dict)

device = "cuda:0"

encoder.to(device)

GPTInferenceModule(
  (model): GptPretrainWithTimeDiffsModule(
    (trx_encoder): TrxEncoderT2V(
      (embeddings): ModuleDict(
        (MCC): NoisyEmbedding(
          342, 16, padding_idx=0
          (dropout): Dropout(p=0, inplace=False)
        )
        (channel_type): NoisyEmbedding(
          7, 16, padding_idx=0
          (dropout): Dropout(p=0, inplace=False)
        )
        (currency): NoisyEmbedding(
          60, 16, padding_idx=0
          (dropout): Dropout(p=0, inplace=False)
        )
        (trx_category): NoisyEmbedding(
          11, 16, padding_idx=0
          (dropout): Dropout(p=0, inplace=False)
        )
        (amount): NoisyEmbedding(
          128, 16, padding_idx=0
          (dropout): Dropout(p=0, inplace=False)
        )
      )
      (custom_embeddings): ModuleDict()
      (time2vec_days): Time2Vec()
    )
    (_seq_encoder): GptEncoder(
      (transf): GPT2Model(
        (wte): Embedding(4, 96)
        (wpe): Embedding(2048, 96)
        (drop): Drop

In [252]:
from tqdm import tqdm

seed_everything(42)

In [253]:
train_loader = inference_data_loader(data_train, num_workers=0, batch_size=8)
encoder.eval()
train_embeds = None

for i, batch in tqdm(enumerate(train_loader)):
    train_embeds_batch = encoder(batch.to(device), eval_strategy="mean")
    if i == 0:
        train_embeds = train_embeds_batch.detach().cpu().numpy()
    else:
        train_embeds = np.concatenate([train_embeds, train_embeds_batch.detach().cpu().numpy()], axis=0)
    
train_embeds

563it [00:04, 132.16it/s]


array([[-1.02710361e+01, -7.00244141e+00, -1.45193205e+01, ...,
        -1.33324003e+01, -1.70678174e+00, -2.41260386e+00],
       [ 7.37567171e-02, -4.44750398e-01, -2.20298156e-01, ...,
        -9.96228307e-02, -4.73985821e-01, -4.22955096e-01],
       [ 1.37285784e-01,  8.53458703e-01, -3.10979128e+00, ...,
        -1.21280205e+00, -1.65956810e-01, -6.76736748e-03],
       ...,
       [ 4.65878963e-01,  6.70017481e-01,  9.96225625e-02, ...,
        -1.30904484e+00,  1.17500126e+00,  4.23811257e-01],
       [-2.29893178e-01,  4.15246904e-01, -1.75800300e+00, ...,
        -1.51660287e+00,  1.27084017e+00,  4.06846225e-01],
       [ 2.10688505e-02, -2.33276561e-01, -1.03255546e+00, ...,
        -1.49405956e+00,  1.48531020e+00,  8.59185040e-01]], dtype=float32)

In [254]:
test_loader = inference_data_loader(data_test, num_workers=0, batch_size=8)
encoder.eval()
test_embeds = None

for i, batch in tqdm(enumerate(test_loader)):
    test_embeds_batch = encoder(batch.to(device), eval_strategy="mean")
    if i == 0:
        test_embeds = test_embeds_batch.detach().cpu().numpy()
    else:
        test_embeds = np.concatenate([test_embeds, test_embeds_batch.detach().cpu().numpy()], axis=0)
    
test_embeds

63it [00:00, 129.64it/s]


array([[-1.1641598 ,  0.05195681,  0.8139026 , ..., -0.37870657,
         0.5328597 ,  0.13062394],
       [-1.2825881 , -1.2992858 , -2.2387605 , ..., -2.8301609 ,
         0.4101027 ,  0.07886361],
       [ 0.78959584,  2.0519142 , -0.47888586, ..., -0.41910532,
         0.33295023, -0.38117337],
       ...,
       [-2.0887024 , -0.7639354 , -3.9021547 , ..., -2.5443547 ,
        -2.1898692 , -0.28712144],
       [-1.98548   , -0.29546428, -2.0315497 , ..., -1.6045305 ,
        -0.05820235, -0.18352011],
       [-0.50516826, -0.21619101, -0.19474208, ...,  0.3155762 ,
         0.54149985,  0.13873324]], dtype=float32)

In [255]:
clf = CatBoostClassifier(loss_function='MultiClass', task_type="GPU", devices='0', random_state=42)

clf.fit(train_embeds, target_train, plot_file="catboost_log.html")

Learning rate set to 0.088214
0:	learn: 0.6710713	total: 5.9ms	remaining: 5.89s
1:	learn: 0.6528073	total: 10.8ms	remaining: 5.41s
2:	learn: 0.6364890	total: 15.6ms	remaining: 5.2s
3:	learn: 0.6219092	total: 20.5ms	remaining: 5.1s
4:	learn: 0.6091903	total: 25.3ms	remaining: 5.04s
5:	learn: 0.5976128	total: 29.8ms	remaining: 4.94s
6:	learn: 0.5879908	total: 34ms	remaining: 4.83s
7:	learn: 0.5795636	total: 38.3ms	remaining: 4.75s
8:	learn: 0.5715303	total: 42.5ms	remaining: 4.68s
9:	learn: 0.5641124	total: 47.4ms	remaining: 4.69s
10:	learn: 0.5581281	total: 52.2ms	remaining: 4.69s
11:	learn: 0.5522790	total: 57ms	remaining: 4.69s
12:	learn: 0.5467593	total: 62ms	remaining: 4.7s
13:	learn: 0.5428176	total: 66.9ms	remaining: 4.71s
14:	learn: 0.5387026	total: 71.7ms	remaining: 4.71s
15:	learn: 0.5348314	total: 76.4ms	remaining: 4.7s
16:	learn: 0.5313894	total: 81.2ms	remaining: 4.7s
17:	learn: 0.5282133	total: 86.1ms	remaining: 4.7s
18:	learn: 0.5247704	total: 91.1ms	remaining: 4.7s
19:	le

<catboost.core.CatBoostClassifier at 0x7e65d69ee080>

In [256]:
test_pred = clf.predict(test_embeds)
test_proba = clf.predict_proba(test_embeds)[:, 1]

In [257]:
print("Accuracy:", accuracy_score(target_test, test_pred))
print("ROC-AUC:", roc_auc_score(target_test, test_proba))

Accuracy: 0.694
ROC-AUC: 0.7785368538634634


In [261]:
arr = np.array([0.7706528953716146, 0.78031762477538, 0.7785368538634634])

arr.mean(), arr.std()

(0.776502458003486, 0.004199668518531881)

- GPT embeds + Catboost (trx dim: 8):
  - `Accuracy: 0.726`, `0.706`, `0.698`, avg: `0.71 +- 0.012`
  - `ROC-AUC: 0.7732592964336016`, `0.773583072963041`, `0.7668485211507017`, avg: `0.7712 +- 0.0031`

\

- GPT embeds (w/ stat pooling) + Catboost (trx dim: 8):
  - `Accuracy: 0.722`, `0.722`, `0.702`, avg: `0.7153 +- 0.0094`
  - `ROC-AUC: 0.7781806996810801`, `0.7859675252140972`, `0.7828268928785351`, avg: `0.7823 +- 0.0032`

\

Плохо, с trx эмбеддингами большей размерности результаты выходит лучше => отказываемся от текущего конфига 

---
- GPT embeds + Catboost (trx dim: 16):
  - `Accuracy: 0.704`, `0.704`, `0.71`, avg: `0.706 +- 0.0028`
  - `ROC-AUC: 0.7573942464910719`, `0.7741496818895598`, `0.7739554159718962`, avg: `0.7685 +- 0.0079`

- seed_seq_len = 0:
  - `Accuracy: 0.69`, `0.724`, `0.742`, avg: `0.7187 +- 0.022`
  - `ROC-AUC: 0.7671399200271972`, `0.7834582571109421`, `0.7875378413818784`, avg: `0.7793 +- 0.0088` 

\

- GPT embeds (w/ stat pooling) + Catboost (trx dim: 16):
  - `Accuracy: 0.732`, `0.72`, `0.73`, avg: `0.7273 +- 0.0052`
  - `ROC-AUC: 0.7915202926939826`, `0.8000518042447102`, `0.7904032636674168`, avg: `0.794 +- 0.0043`

- seed_seq_len = 0:
  - `Accuracy: 0.716`, `0.716`, `0.72`, avg: `0.7173 +- 0.0019`
  - `ROC-AUC: 0.7824221722167359`, `0.7874892749024625`, `0.7888005698466917`, avg: `0.7862 +- 0.0028` 

---

- GPT embeds + Catboost // higher feedforward dim (512):
  - `Accuracy: 0.71`, `0.702`, `0.73`, avg: `0.714 +- 0.0118`
  - `ROC-AUC: 0.7789577633517347`, `0.7738259053601204`, `0.7971863819591718`, avg: `0.7833 +- 0.01`

- seed_seq_len = 0:
  - `Accuracy: 0.712`, `0.712`, `0.726`, avg: `0.7167 +- 0.0066`
  - `ROC-AUC: 0.7781159443751922`, `0.7817260526784413`, `0.7935277071765067`, avg: `0.7845 +- 0.0066` 

\

- GPT embeds (w/ stat pooling) + Catboost // higher feedforward dim (512):
  - `Accuracy: 0.734`, `0.712`, `0.716`, avg: `0.7207 +- 0.0096`
  - `ROC-AUC: 0.786080846999401`, `0.7767560829515469`, `0.7907594178498001`, avg: `0.7845 +- 0.0058`

- seed_seq_len = 0:
  - `Accuracy: 0.706`, `0.722`, `0.714`, avg: `0.714 +- 0.0065`
  - `ROC-AUC: 0.7836363342021337`, `0.7880720726554531`, `0.7949847015589839`, avg: `0.7889 +- 0.0046` 

---

- TD-GPT embeds + Catboost (trx dim: 8):
  - `Accuracy: 0.706`, `0.716`, `0.704`, avg: `0.7087 +- 0.0052`
  - `ROC-AUC: 0.7691797121626653`, `0.7872788201583266`, `0.7781483220281362`, avg: `0.7782 +- 0.0074`

\

- TD-GPT embeds (w/ stat pooling) + Catboost (trx dim: 8):
  - `Accuracy: 0.722`, `0.718`, `0.716`, avg: `0.7187 +- 0.0025`
  - `ROC-AUC: 0.775655242751453`, `0.793446763044147`, `0.7903708860144727`, avg: `0.7865 +- 0.0078`

---

- TD-GPT embeds + Catboost (trx dim: 16):
  - `Accuracy: 0.718`, `0.732`, `0.736`, avg: `0.7287 +- 0.0077`
  - `ROC-AUC: 0.7763027958103317`, `0.7876025966877661`, `0.7854818604199382`, avg: `0.7831 +- 0.0049`

- seed_seq_len = 0:
  - `Accuracy: 0.71`, `0.724`, `0.694`, avg: `0.7093 +- 0.0123`
  - `ROC-AUC: 0.7706528953716146`, `0.78031762477538`, `0.7785368538634634`, avg: `0.7765 +- 0.0042` 

\

- TD-GPT embeds (w/ stat pooling) + Catboost (trx dim: 16):
  - `Accuracy: 0.736`, `0.724`, `0.73`, avg: `0.73 +- 0.0049`
  - `ROC-AUC: 0.8034352689773517`, `0.787618785514238`, `0.7936410289618107`, avg: `0.7949 +- 0.0065`

- seed_seq_len = 0:
  - `Accuracy: 0.744`, `0.698`, `0.734`, avg: `0.7253 +- 0.0198`
  - `ROC-AUC: 0.8012012109242201`, `0.7834096906315262`, `0.7925401887617166`, avg: `0.7924 +- 0.0073` 

---

- TD-GPT embeds + Catboost // higher feedforward dim (512):
  - `Accuracy: 0.712`, `0.728`, `0.7`, avg: `0.7133 +- 0.0115`
  - `ROC-AUC: 0.7847533632286996`, `0.7800424147253564`, `0.775153389130822`, avg: `0.78 +- 0.0039`

\

- TD-GPT embeds (w/ stat pooling) // higher feedforward dim (512):
  - `Accuracy: 0.742`, `0.706`, `0.722`, avg: `0.7233 +- 0.0147`
  - `ROC-AUC: 0.8094898900778682`, `0.7875054637289343`, `0.7939971831441939`, avg: `0.797 +- 0.0092`

---


**Итог:** TD-GPT (GPT с учётом временных разностей) оказался лучше, чем обычный GPT, не учитывающий время; seed_seq_len=0 делает результаты крайне нестабильными для TD-GPT, для обычного GPT - улучшает результаты при mean pooling'е и ухудшает - при mean + max pooling'е.

Наилучший результат достигается для конфига TD-GPT embeds (w/ stat pooling) + Catboost (trx dim: 16):

  - `Accuracy: 0.73 +- 0.0049`
  - `ROC-AUC: 0.7949 +- 0.0065`

# Итоги.

| Method                        |    Accuracy           | ROC-AUC         |
|-------------------------------|-----------------------|-----------------|
| **Flattened Sequences**       | 0.67 ± 0.0046         | 0.7536 ± 0.003  |
| **GRU (+ MLP)**               | 0.746 ± 0.0076        | 0.8148 ± 0.0037 |
| **CoLES**                     | 0.726 ± 0.0071        | 0.8076 ± 0.0025 |
| **CPC Modeling (emb_dim=8)**  | 0.747 ± 0.0052        | 0.8165 ± 0.0032 |
| **CPC Modeling (emb_dim=32)** | 0.747 ± 0.0041        | 0.8099 ± 0.0035 |
| **TD-GPT**                    | 0.73 ± 0.0049         | 0.7949 ± 0.0065 |