# Colab setup

In [1]:
if 'google.colab' in str(get_ipython()):
    !pip install pytorch-lifestream
    !wget https://raw.githubusercontent.com/Matteus1904/GPT-like_approach_for_event_sequences/master/models.py
    !wget https://raw.githubusercontent.com/Matteus1904/GPT-like_approach_for_event_sequences/master/dataset.py

## Data load

In [3]:
import os

if not os.path.exists('data/rosbank/train.csv'):
    !mkdir -p data/rosbank
    !curl -OL https://storage.yandexcloud.net/di-datasets/rosbank-ml-contest-boosters.pro.zip
    !unzip -j -o rosbank-ml-contest-boosters.pro.zip '*.csv' -d data/rosbank
    !mv rosbank-ml-contest-boosters.pro.zip data/rosbank/

## Setup

In [1]:
%load_ext autoreload
%autoreload 2

import torch
import torch.nn as nn
import pytorch_lightning as pl

from sklearn.metrics import f1_score
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import train_test_split

from ptls.nn import TrxEncoder, RnnEncoder
from ptls.frames.gpt import GptPretrainModule, GptDataset
from ptls.preprocessing import PandasDataPreprocessor
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames import PtlsDataModule

from dataset import MlmNoSliceDataset
from models import GptPretrainContrastiveModule, NextItemPredictionModule

import os
import pandas as pd

## Data preproccessing

In [2]:
data_path = 'data/rosbank'

source_data = pd.read_csv(os.path.join(data_path, 'train.csv'))
source_data['TRDATETIME'] = pd.to_datetime(source_data['TRDATETIME'], format='%d%b%y:%H:%M:%S')
source_data = source_data.sort_values(by=['TRDATETIME'])
source_data = source_data.rename(columns={'cl_id':'client_id', 'MCC':'small_group', 'amount':'amount_rur'})
source_data.head()

Unnamed: 0,PERIOD,client_id,small_group,channel_type,currency,TRDATETIME,amount_rur,trx_category,target_flag,target_sum
26790,01/10/2016,485,4121,type2,810,2016-10-07 00:00:00,242.0,POS,1,85.0
72077,01/10/2016,1290,5411,type2,810,2016-10-07 00:00:00,2465.0,POS,1,321242.09
26988,01/10/2016,485,6011,type2,810,2016-10-07 00:00:00,3600.0,WD_ATM_PARTNER,1,85.0
72068,01/10/2016,1290,6011,type2,810,2016-10-07 18:57:17,10000.0,WD_ATM_ROS,1,321242.09
189585,01/10/2016,3351,8999,type3,810,2016-10-08 00:00:00,10000.0,POS,0,0.0


In [3]:
mcc_to_id = {mcc: i+1 for i, mcc in enumerate(source_data['small_group'].unique())}

source_data['amount_rur_bin'] = 1 + KBinsDiscretizer(10, encode='ordinal', subsample=None).fit_transform(source_data[['amount_rur']]).astype('int')
source_data['small_group'] = source_data['small_group'].map(mcc_to_id)

In [4]:
preprocessor = PandasDataPreprocessor(
    col_id='client_id',
    col_event_time='TRDATETIME',
    event_time_transformation='dt_to_timestamp',
    cols_category=['small_group'],
    cols_numerical=['amount_rur_bin'],
    return_records=True,
)

In [5]:
%%time

dataset = preprocessor.fit_transform(source_data[['client_id', 'TRDATETIME', 'small_group', 'amount_rur_bin']])

CPU times: total: 1.81 s
Wall time: 1.93 s


In [6]:
dataset = sorted(dataset, key=lambda x: x['client_id'])

In [7]:
train, valid_test = train_test_split(dataset, test_size=0.2, random_state=42)

valid, test = train_test_split(valid_test, test_size=0.5, random_state=42)
len(train), len(valid), len(test)

(4000, 500, 500)

In [8]:
# popular predictor

train_clients = [d["client_id"] for d in train]
test_clients = [d["client_id"] for d in test]

train_data = source_data[source_data["client_id"].isin(train_clients)]
test_data = source_data[source_data["client_id"].isin(test_clients)]

most_pop_mcc = train_data["small_group"].value_counts().index[0]

y_true = test_data["small_group"]
y_pred = [most_pop_mcc]*test_data.shape[0]

f1_score(y_true, y_pred, average="weighted")

0.09109194206711545

In [9]:
# naive predictor

y_true = test_data.groupby("client_id").apply(lambda x: x["small_group"]).reset_index()["small_group"].rename("y_true")
y_pred = test_data.groupby("client_id").apply(lambda x: x["small_group"].shift()).reset_index()["small_group"].rename("y_pred")

y_concat = pd.concat([y_true, y_pred], axis=1).dropna()

f1_score(y_concat["y_true"], y_concat["y_pred"], average="weighted")

0.2374830911817028

In [10]:
train_dl = PtlsDataModule(
    train_data=GptDataset(
        MemoryMapDataset(
            data=train,
            # i_filters=[
            #     SeqLenFilter(min_seq_len=25),
            # ],
        ),
        min_len=25, 
        max_len=200
    ),
    valid_data=MlmNoSliceDataset(
        MemoryMapDataset(
            data=valid,
            # i_filters=[
            #     SeqLenFilter(min_seq_len=25),
            # ],
        ),
    ),
    test_data=MlmNoSliceDataset(
        MemoryMapDataset(
            data=test,
            # i_filters=[
            #     SeqLenFilter(min_seq_len=25),
            # ],
        ),
    ),
    train_batch_size=128,
)

## Embedding training (representation)

Model training in our framework organised via pytorch-lightning (pl) framework.
The key parts of neural networks training in pl are: 

    * model (`pytorch_lightning.LightningModule`)
    * data loader (`torch.utils.data.DataLoader`)
    * trainer (`pytorch_lightning.Trainer`)
    
For futher details check https://pytorchlightning.ai/

In [11]:
print("Number of unique MCC codes:", source_data['small_group'].max())

Number of unique MCC codes: 344


### Model definition

In [15]:
trx_encoder_params = dict(
    embeddings_noise=0.0,
    embeddings={
        'small_group': {'in': 345, 'out': 16},
        'amount_rur_bin':{'in': 11, 'out': 16}
    },
    linear_projection_size=32
)

seq_encoder = RnnEncoder(
        input_size=32,
        hidden_size=32,
        type='gru',
)

model = GptPretrainModule(
    trx_encoder=TrxEncoder(**trx_encoder_params),
    seq_encoder=seq_encoder,
    max_lr=0.1
)

### Pre-training

In [16]:
trainer = pl.Trainer(
    max_epochs=100,
    gpus=1 if torch.cuda.is_available() else 0,
    callbacks=[pl.callbacks.EarlyStopping('gpt/valid_gpt_loss')],
    enable_progress_bar=True
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [17]:
%%time
print(f'logger.version = {trainer.logger.version}')
trainer.fit(model, train_dl)
print(trainer.logged_metrics)

logger.version = 91


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name           | Type             | Params
----------------------------------------------------
0 | trx_encoder    | TrxEncoder       | 6.8 K 
1 | _seq_encoder   | RnnEncoder       | 6.4 K 
2 | head           | ModuleDict       | 27.4 K
3 | loss           | CrossEntropyLoss | 0     
4 | train_gpt_loss | MeanMetric       | 0     
5 | valid_gpt_loss | MeanMetric       | 0     
----------------------------------------------------
40.5 K    Trainable params
0         Non-trainable params
40.5 K    Total params
0.162     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

{'gpt/loss': tensor(4.9095), 'gpt/valid_gpt_loss': tensor(5.0274), 'gpt/train_gpt_loss': tensor(4.9657)}
CPU times: total: 37.3 s
Wall time: 40.7 s


### Fine-tuning

In [25]:
model.trx_encoder.requires_grad_(False)
model._seq_encoder.requires_grad_(True)

for x in model._seq_encoder.rnn.parameters():
    print(x)
    break

Parameter containing:
tensor([[ 0.6017, -0.2060, -0.6732,  ...,  0.0954,  0.1001,  0.0099],
        [ 0.1440, -0.1219,  0.6979,  ...,  0.3641,  0.1116, -0.0355],
        [-0.0530,  0.1641, -0.4594,  ..., -0.0890, -0.3170,  0.4077],
        ...,
        [-0.0061,  0.2025, -0.0250,  ..., -0.2322,  0.0044,  0.0723],
        [ 0.3667, -0.2494,  0.5317,  ..., -0.2269, -0.1410,  0.2347],
        [ 0.1141,  0.0317,  0.2057,  ...,  0.1222, -0.1854,  0.0400]],
       requires_grad=True)


In [26]:
model_downstream = NextItemPredictionModule(
    trx_encoder=TrxEncoder(**trx_encoder_params), # model.trx_encoder,
    seq_encoder=seq_encoder, # seq_encoder, 
    target_col='small_group',
    max_lr=0.1,
)

In [27]:
checkpoint = pl.callbacks.Che

trainer = pl.Trainer(
    max_epochs=100,
    gpus=1 if torch.cuda.is_available() else 0,
    callbacks=[pl.callbacks.EarlyStopping('gpt/valid_gpt_loss', patience=5, mode='min')],
    enable_progress_bar=True,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [28]:
print(f'logger.version = {trainer.logger.version}')
trainer.fit(model_downstream, train_dl)
print(trainer.logged_metrics)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name           | Type             | Params
----------------------------------------------------
0 | trx_encoder    | TrxEncoder       | 6.8 K 
1 | _seq_encoder   | RnnEncoder       | 6.4 K 
2 | head           | Head             | 24.5 K
3 | loss           | CrossEntropyLoss | 0     
4 | train_gpt_loss | MeanMetric       | 0     
5 | valid_gpt_loss | MeanMetric       | 0     
----------------------------------------------------
37.7 K    Trainable params
0         Non-trainable params
37.7 K    Total params
0.151     Total estimated model params size (MB)


logger.version = 93


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

{'gpt/loss': tensor(2.8881), 'gpt/valid_gpt_loss': tensor(2.8790), 'gpt/valid_f1_weighted': tensor(0.2312, dtype=torch.float64), 'gpt/train_gpt_loss': tensor(2.8061)}


In [29]:
trainer.test(model_downstream, train_dl)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
  gpt/test_f1_weighted      0.2366044538093604
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'gpt/test_f1_weighted': 0.2366044538093604}]

In [61]:
for x in model_downstream.trx_encoder.embeddings.small_group.parameters():
    print(x)

Parameter containing:
tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.7690,  1.2800,  1.8744,  ...,  0.3460,  0.4291,  0.2786],
        [-0.7777, -0.1185, -0.9769,  ...,  0.6025,  0.6557, -1.5881],
        ...,
        [-0.5787, -1.4496,  1.3006,  ..., -0.1985, -0.0092, -0.1623],
        [-0.9733, -1.5787,  1.1142,  ...,  0.9475, -0.9607,  0.8432],
        [ 0.8133,  0.6903,  0.8634,  ...,  0.0118, -1.4103, -0.7586]],
       device='cuda:0', requires_grad=True)


## Embedding training (contrastive)

### Model definition

In [181]:
trx_encoder_params = dict(
    embeddings_noise=0.0,
    embeddings={
        'small_group': {'in': 112, 'out': 4},
        'amount_rur_bin':{'in': 11, 'out': 4}
    },
    linear_projection_size=32
)

seq_encoder = RnnEncoder(
        input_size=32,
        hidden_size=32,
        type='gru',
)

model = GptPretrainContrastiveModule(
    trx_encoder=TrxEncoder(**trx_encoder_params),
    seq_encoder=seq_encoder,
    max_lr=0.1,
    total_steps=10000,
    neg_count=10,
    loss_temperature=5
)


### Trainer

In [182]:
trainer = pl.Trainer(
    max_epochs=100,
    gpus=1 if torch.cuda.is_available() else 0,
    callbacks=[pl.callbacks.EarlyStopping('mlm/valid_mlm_loss')],
    enable_progress_bar=True,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [183]:
%%time
print(f'logger.version = {trainer.logger.version}')
trainer.fit(model, train_dl)
print(trainer.logged_metrics)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name            | Type             | Params
-----------------------------------------------------
0 | trx_encoder     | TrxEncoder       | 780   
1 | _seq_encoder    | RnnEncoder       | 6.4 K 
2 | fn_norm_predict | PBShell          | 0     
3 | loss_fn         | QuerySoftmaxLoss | 0     
4 | train_mlm_loss  | MeanMetric       | 0     
5 | valid_mlm_loss  | MeanMetric       | 0     
-----------------------------------------------------
7.2 K     Trainable params
0         Non-trainable params
7.2 K     Total params
0.029     Total estimated model params size (MB)


logger.version = 34


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

{'mlm/loss': tensor(1.6991), 'mlm/valid_mlm_loss': tensor(1.6794), 'mlm/train_mlm_loss': tensor(1.7239)}
CPU times: total: 54.4 s
Wall time: 55.5 s


In [189]:
model.trx_encoder.requires_grad_(False)
model._seq_encoder.requires_grad_(True)

for x in model_downstream._seq_encoder.rnn.parameters():
    print(x)
    break

Parameter containing:
tensor([[-0.8619, -0.2848, -0.1983,  ..., -0.3491, -2.4682,  0.3766],
        [ 0.0749, -0.2794, -0.0122,  ...,  0.3312,  1.2060, -0.1800],
        [ 0.7626,  0.2617, -0.0402,  ...,  0.7371,  0.5646,  0.1845],
        ...,
        [ 0.9389,  0.6027,  0.0989,  ...,  1.2411,  1.2231, -0.4713],
        [ 0.0185, -0.3688, -0.7067,  ...,  0.4431,  0.0083,  0.8737],
        [-1.0392, -0.2482,  0.4103,  ..., -1.7565, -0.6526,  0.8464]],
       requires_grad=True)


In [185]:
model_downstream = NextItemPredictionModule(
    trx_encoder=TrxEncoder(**trx_encoder_params), # model.trx_encoder,
    seq_encoder=seq_encoder, # model._seq_encoder,
    target_col='small_group',
    max_lr=0.1
)

In [186]:
trainer = pl.Trainer(
    max_epochs=100,
    gpus=1 if torch.cuda.is_available() else 0,
    callbacks=[pl.callbacks.EarlyStopping('gpt/valid_gpt_loss', mode='min', patience=10)],
    enable_progress_bar=True
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
