# Colab setup

In [1]:
# if 'google.colab' in str(get_ipython()):
#     !pip install pytorch-lifestream

In [2]:
# !pip install lightning

## Data load

In [3]:
# import os

# if not os.path.exists('data/rosbank/train.csv'):
#     !mkdir -p data/rosbank
#     !curl -OL https://storage.yandexcloud.net/di-datasets/rosbank-ml-contest-boosters.pro.zip
#     !unzip -j -o rosbank-ml-contest-boosters.pro.zip 'data/*.csv' -d data/rosbank
#     !mv age-prediction-nti-sbebank-2019.zip data/rosbank/

## Setup

In [1]:
%load_ext autoreload
%autoreload 2

import torch
import torch.nn as nn
import pytorch_lightning as pl

from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import train_test_split

from models import GptPretrainContrastiveModule, NextItemPredictionModule

from ptls.nn import TrxEncoder, RnnEncoder
from ptls.frames.gpt import GptPretrainModule, GptDataset
from ptls.preprocessing import PandasDataPreprocessor
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames import PtlsDataModule

from dataset import MlmNoSliceDataset

import os
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


## Data preproccessing

In [2]:
source_data = pd.read_csv('transactions_sber.csv')
source_data = source_data.rename({'trans_date': 'TRDATETIME'}, axis=1)

In [3]:
source_data

Unnamed: 0,client_id,TRDATETIME,small_group,amount_rur
0,33172,6,4,71.463
1,33172,6,35,45.017
2,33172,8,11,13.887
3,33172,9,11,15.983
4,33172,10,11,21.341
...,...,...,...,...
26450572,43300,727,25,7.602
26450573,43300,727,15,3.709
26450574,43300,727,1,6.448
26450575,43300,727,11,24.669


In [4]:
# data_path = 'data/rosbank'


# source_data = pd.read_csv('transactions_sber.csv')
# # source_data['TRDATETIME'] = pd.to_datetime(source_data['TRDATETIME'], format='%d%b%y:%H:%M:%S')
source_data = source_data.sort_values(by=['TRDATETIME'])
source_data = source_data.rename(columns={'cl_id':'client_id', 'MCC':'small_group', 'amount':'amount_rur'})
source_data.head()

Unnamed: 0,client_id,TRDATETIME,small_group,amount_rur
5201569,44379,0,52,62.535
2788175,43594,0,125,10.524
2788174,43594,0,36,86.255
18975203,5882,0,12,5.132
18975202,5882,0,18,11.678


In [5]:
mcc_to_id = {mcc: i+1 for i, mcc in enumerate(source_data['small_group'].unique())}

source_data['amount_rur_bin'] = 1 + KBinsDiscretizer(10, encode='ordinal', subsample=None).fit_transform(source_data[['amount_rur']]).astype('int')
source_data['small_group'] = source_data['small_group'].map(mcc_to_id)

In [6]:
preprocessor = PandasDataPreprocessor(
    col_id='client_id',
    col_event_time='TRDATETIME',
    event_time_transformation='dt_to_timestamp',
    cols_category=['small_group'],
    cols_numerical=['amount_rur_bin'],
    return_records=True,
)

In [7]:
%%time

dataset = preprocessor.fit_transform(source_data[['client_id', 'TRDATETIME', 'small_group', 'amount_rur_bin']])

CPU times: user 30.1 s, sys: 3.74 s, total: 33.8 s
Wall time: 33.8 s


In [8]:
dataset = sorted(dataset, key=lambda x: x['client_id'])

In [9]:
train_valid, test = train_test_split(dataset, test_size=0.2, random_state=42)

train, valid = train_test_split(train_valid, test_size=0.2, random_state=42)
len(train), len(valid), len(test)

(19200, 4800, 6000)

In [10]:
train_dl = PtlsDataModule(
    train_data=GptDataset(
        MemoryMapDataset(
            data=train,
            # i_filters=[
            #     SeqLenFilter(min_seq_len=25),
            # ],
        ),
        min_len=25, 
        max_len=200
    ),
    valid_data=MlmNoSliceDataset(
        MemoryMapDataset(
            data=valid,
            # i_filters=[
            #     SeqLenFilter(min_seq_len=25),
            # ],
        ),
    ),
    test_data=MlmNoSliceDataset(
        MemoryMapDataset(
            data=test,
            # i_filters=[
            #     SeqLenFilter(min_seq_len=25),
            # ],
        ),
    ),
    train_batch_size=128,
)

## Embedding training (representation)

Model training in our framework organised via pytorch-lightning (pl) framework.
The key parts of neural networks training in pl are: 

    * model (`pytorch_lightning.LightningModule`)
    * data loader (`torch.utils.data.DataLoader`)
    * trainer (`pytorch_lightning.Trainer`)
    
For futher details check https://pytorchlightning.ai/

In [11]:
print("Number of unique MCC codes:", source_data['small_group'].max())

Number of unique MCC codes: 202


### Model definition

In [12]:
trx_encoder_params = dict(
    embeddings_noise=0.0,
    embeddings={
        'small_group': {'in': 350, 'out': 16},
        'amount_rur_bin':{'in': 60, 'out': 16}
    },
)

seq_encoder = RnnEncoder(
        input_size=32,
        hidden_size=32,
        type='gru',
)

model = GptPretrainModule(
    trx_encoder=TrxEncoder(**trx_encoder_params),
    seq_encoder=seq_encoder,
    max_lr=0.1
)

### Pre-training

In [13]:
trainer = pl.Trainer(
    max_epochs=100,
    gpus=1 if torch.cuda.is_available() else 0,
    callbacks=[pl.callbacks.EarlyStopping('gpt/valid_gpt_loss')],
    enable_progress_bar=True,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [14]:
%%time
print(f'logger.version = {trainer.logger.version}')
trainer.fit(model, train_dl)
print(trainer.logged_metrics)

logger.version = 52


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name           | Type             | Params
----------------------------------------------------
0 | trx_encoder    | TrxEncoder       | 6.6 K 
1 | _seq_encoder   | RnnEncoder       | 6.4 K 
2 | head           | ModuleDict       | 30.9 K
3 | loss           | CrossEntropyLoss | 0     
4 | train_gpt_loss | MeanMetric       | 0     
5 | valid_gpt_loss | MeanMetric       | 0     
----------------------------------------------------
43.8 K    Trainable params
0         Non-trainable params
43.8 K    Total params
0.175     Total estimated model params size (MB)


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

  rank_zero_warn(


                                                                           

  rank_zero_warn(


Epoch 20: 100%|██████████| 188/188 [00:04<00:00, 45.15it/s, loss=4.72, v_num=52, gpt/valid_gpt_loss=4.670]
{'gpt/loss': tensor(4.7345), 'gpt/valid_gpt_loss': tensor(4.6724), 'gpt/train_gpt_loss': tensor(4.7042)}
CPU times: user 2min 5s, sys: 8.82 s, total: 2min 14s
Wall time: 1min 31s


### Fine-tuning

In [15]:
model_downstream = NextItemPredictionModule(
    trx_encoder=TrxEncoder(**trx_encoder_params), # model.trx_encoder,
    seq_encoder=seq_encoder, # model._seq_encoder,
    target_col='small_group',
    max_lr=0.1
)

In [16]:
trainer = pl.Trainer(
    max_epochs=100,
    gpus=1 if torch.cuda.is_available() else 0,
    callbacks=[pl.callbacks.EarlyStopping('gpt/valid_f1_weighted', patience=5, mode='max')],
    enable_progress_bar=True,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [17]:
print(f'logger.version = {trainer.logger.version}')
trainer.fit(model_downstream, train_dl)
print(trainer.logged_metrics)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name           | Type             | Params
----------------------------------------------------
0 | trx_encoder    | TrxEncoder       | 6.6 K 
1 | _seq_encoder   | RnnEncoder       | 6.4 K 
2 | head           | Head             | 24.9 K
3 | loss           | CrossEntropyLoss | 0     
4 | train_gpt_loss | MeanMetric       | 0     
5 | valid_gpt_loss | MeanMetric       | 0     
----------------------------------------------------
37.8 K    Trainable params
0         Non-trainable params
37.8 K    Total params
0.151     Total estimated model params size (MB)


logger.version = 53
                                                                           

  rank_zero_warn(


Epoch 11: 100%|██████████| 188/188 [00:04<00:00, 39.92it/s, loss=2.5, v_num=53, gpt/valid_gpt_loss=2.490, gpt/valid_f1_weighted=0.234] 
{'gpt/loss': tensor(2.4726), 'gpt/valid_gpt_loss': tensor(2.4926), 'gpt/valid_f1_weighted': tensor(0.2341, dtype=torch.float64), 'gpt/train_gpt_loss': tensor(2.5084)}


In [18]:
trainer.test(model_downstream, train_dl)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Testing DataLoader 0: 100%|██████████| 47/47 [00:02<00:00, 23.22it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
  gpt/test_f1_weighted      0.23819629691443897
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'gpt/test_f1_weighted': 0.23819629691443897}]

## Embedding training (contrastive)

### Model definition

In [23]:
trx_encoder_params = dict(
    embeddings_noise=0.0,
    embeddings={
        'small_group': {'in': 350, 'out': 16},
        'amount_rur_bin':{'in': 11, 'out': 16}
    },
)

seq_encoder = RnnEncoder(
        input_size=32,
        hidden_size=32,
        type='gru',
)

model = GptPretrainContrastiveModule(
    trx_encoder=TrxEncoder(**trx_encoder_params),
    seq_encoder=seq_encoder,
    max_lr=0.01,
    total_steps=10000
)


### Trainer

In [24]:
trainer = pl.Trainer(
    max_epochs=100,
    gpus=1 if torch.cuda.is_available() else 0,
    callbacks=[pl.callbacks.EarlyStopping('mlm/valid_mlm_loss')],
    enable_progress_bar=True,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [25]:
%%time
print(f'logger.version = {trainer.logger.version}')
trainer.fit(model, train_dl)
print(trainer.logged_metrics)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name            | Type             | Params
-----------------------------------------------------
0 | trx_encoder     | TrxEncoder       | 5.8 K 
1 | _seq_encoder    | RnnEncoder       | 6.4 K 
2 | fn_norm_predict | PBShell          | 0     
3 | loss_fn         | QuerySoftmaxLoss | 0     
4 | train_mlm_loss  | MeanMetric       | 0     
5 | valid_mlm_loss  | MeanMetric       | 0     
-----------------------------------------------------
12.2 K    Trainable params
0         Non-trainable params
12.2 K    Total params
0.049     Total estimated model params size (MB)


logger.version = 55
Epoch 26: 100%|██████████| 188/188 [00:08<00:00, 23.29it/s, loss=0.474, v_num=55, mlm/valid_mlm_loss=0.475]
{'mlm/loss': tensor(0.4776), 'mlm/valid_mlm_loss': tensor(0.4752), 'mlm/train_mlm_loss': tensor(0.4753)}
CPU times: user 4min 45s, sys: 11.8 s, total: 4min 56s
Wall time: 3min 36s


In [45]:
model_downstream = NextItemPredictionModule(
    trx_encoder=model.trx_encoder, #TrxEncoder(**trx_encoder_params),
    seq_encoder=model._seq_encoder, # seq_encoder,
    target_col='small_group',
    max_lr=0.1,
    total_steps=10000
)

In [46]:
trainer = pl.Trainer(
    max_epochs=100,
    gpus=1 if torch.cuda.is_available() else 0,
    callbacks=[pl.callbacks.EarlyStopping('gpt/valid_f1_weighted', mode='max', patience=5)],
    enable_progress_bar=True,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [47]:
print(f'logger.version = {trainer.logger.version}')
trainer.fit(model_downstream, train_dl)
print(trainer.logged_metrics)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name           | Type             | Params
----------------------------------------------------
0 | trx_encoder    | TrxEncoder       | 5.8 K 
1 | _seq_encoder   | RnnEncoder       | 6.4 K 
2 | head           | Head             | 24.9 K
3 | loss           | CrossEntropyLoss | 0     
4 | train_gpt_loss | MeanMetric       | 0     
5 | valid_gpt_loss | MeanMetric       | 0     
----------------------------------------------------
37.0 K    Trainable params
0         Non-trainable params
37.0 K    Total params
0.148     Total estimated model params size (MB)


logger.version = 60
                                                                           

  rank_zero_warn(
  rank_zero_warn(


Epoch 6: 100%|██████████| 188/188 [00:04<00:00, 39.86it/s, loss=2.85, v_num=60, gpt/valid_gpt_loss=2.870, gpt/valid_f1_weighted=0.142]
{'gpt/loss': tensor(2.8486), 'gpt/valid_gpt_loss': tensor(2.8665), 'gpt/valid_f1_weighted': tensor(0.1418, dtype=torch.float64), 'gpt/train_gpt_loss': tensor(2.8642)}


In [48]:
trainer.test(model_downstream, train_dl)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Testing DataLoader 0: 100%|██████████| 47/47 [00:01<00:00, 27.23it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
  gpt/test_f1_weighted      0.14532649462306496
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'gpt/test_f1_weighted': 0.14532649462306496}]

In [49]:
from ptls.frames.bert import MLMPretrainModule
from ptls.nn import TrxEncoder, RnnEncoder
# from ptls.frames.gpt import GptPretrainModule


# trx_encoder_params = dict(
#     embeddings_noise=0.0,
#     # numeric_values={'amount_rur': 'identity'},
#     embeddings={
#         # 'trans_date': {'in': 800, 'out': 16},
#         'small_group': {'in': 350, 'out': 32},
#         # 'amount_rur':{'in': 50, 'out': 16}
#     },
# )

trx_encoder_params = dict(
    embeddings_noise=0.0,
    embeddings={
        'small_group': {'in': 350, 'out': 16},
        'amount_rur_bin':{'in': 60, 'out': 16}
    },
)


seq_encoder = RnnEncoder(
        input_size=32,
        hidden_size=32,
        type='gru',
)

# model = GptPretrainModule(
#     trx_encoder=TrxEncoder(**trx_encoder_params),
#     seq_encoder=seq_encoder,
#  #   loss_type='contrast'
# )

# model = GptPretrainContrastiveModule(
#     trx_encoder=TrxEncoder(**trx_encoder_params),
#     seq_encoder=seq_encoder,
#     max_lr=0.01,
#     total_steps=10000
# )

model = MLMPretrainModule(
    trx_encoder=TrxEncoder(**trx_encoder_params),
    seq_encoder=seq_encoder,
    total_steps=30000,
    max_lr=1,
    neg_count=5,
    replace_proba=0.15
)

### Data loader

In [50]:
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.gpt import GptDataset
from ptls.frames.bert import MlmDataset
from ptls.frames import PtlsDataModule


# train_dl = PtlsDataModule(
#     train_data=GptDataset(
#         MemoryMapDataset(
#             data=train,
#             # i_filters=[
#             #     SeqLenFilter(min_seq_len=25),
#             # ],
#         ),
#         min_len=25, 
#         max_len=300
#     ),
#     valid_data=GptDataset(
#         MemoryMapDataset(
#             data=test,
#             # i_filters=[
#             #     SeqLenFilter(min_seq_len=25),
#             # ],
#         ),
#         min_len=25, 
#         max_len=300
#     ),
#     train_batch_size=128,
# )

train_dl = PtlsDataModule(
    train_data=MlmDataset(
        MemoryMapDataset(
            data=train,
            i_filters=[
                SeqLenFilter(min_seq_len=25),
            ],
        ),
        min_len=25, 
        max_len=300
    ),
    valid_data=MlmDataset(
        MemoryMapDataset(
            data=test,
            i_filters=[
                SeqLenFilter(min_seq_len=25),
            ],
        ),
        min_len=25, 
        max_len=300
    ),
    train_batch_size=64,
)

### Trainer

In [51]:
import torch
import pytorch_lightning as pl

import logging

trainer = pl.Trainer(
    max_epochs=100,
    gpus=1 if torch.cuda.is_available() else 0,
    enable_progress_bar=True,
    accelerator='gpu'
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


### Training 

In [52]:
%%time
print(f'logger.version = {trainer.logger.version}')
trainer.fit(model, train_dl)
print(trainer.logged_metrics)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name            | Type             | Params
-----------------------------------------------------
0 | trx_encoder     | TrxEncoder       | 6.6 K 
1 | _seq_encoder    | RnnEncoder       | 6.4 K 
2 | fn_norm_predict | PBShell          | 0     
3 | loss_fn         | QuerySoftmaxLoss | 0     
4 | train_mlm_loss  | MeanMetric       | 0     
5 | valid_mlm_loss  | MeanMetric       | 0     
-----------------------------------------------------
13.0 K    Trainable params
0         Non-trainable params
13.0 K    Total params
0.052     Total estimated model params size (MB)


logger.version = 61
                                                                           

  rank_zero_warn(
  rank_zero_warn(


Epoch 3:  47%|████▋     | 185/394 [00:04<00:04, 42.83it/s, loss=nan, v_num=61, mlm/valid_mlm_loss=1.640] 



Epoch 58:  52%|█████▏    | 204/394 [00:04<00:04, 43.82it/s, loss=nan, v_num=61, mlm/valid_mlm_loss=nan.0]{'mlm/loss': tensor(nan, device='cuda:0'), 'mlm/valid_mlm_loss': tensor(nan, device='cuda:0'), 'mlm/train_mlm_loss': tensor(nan, device='cuda:0')}
CPU times: user 7min 48s, sys: 56.2 s, total: 8min 44s
Wall time: 8min 16s


  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


Epoch 58:  52%|█████▏    | 204/394 [00:18<00:17, 10.91it/s, loss=nan, v_num=61, mlm/valid_mlm_loss=nan.0]

### Save sequence encoder for other experiments

In [None]:
torch.save(seq_encoder.state_dict(), "coles-emb.pt")