# Colab setup

In [None]:
if 'google.colab' in str(get_ipython()):
    !pip install pytorch-lifestream
    !wget https://raw.githubusercontent.com/Matteus1904/GPT-like_approach_for_event_sequences/master/models.py
    !wget https://raw.githubusercontent.com/Matteus1904/GPT-like_approach_for_event_sequences/master/dataset.py

## Data load

In [1]:
import os

# Download and unpack the Rosbank dataset only when train.csv is not already
# present, so re-running the cell does not fetch the archive again.
if not os.path.exists('data/rosbank/train.csv'):
    # Create the target directory (no error if it already exists).
    !mkdir -p data/rosbank
    # Fetch the archive into the current working directory, following redirects.
    !curl -OL https://storage.yandexcloud.net/di-datasets/rosbank-ml-contest-boosters.pro.zip
    # Extract only the CSV files, dropping archive sub-paths (-j) and overwriting
    # existing files without prompting (-o).
    !unzip -j -o rosbank-ml-contest-boosters.pro.zip '*.csv' -d data/rosbank
    # Keep the downloaded archive next to the extracted data.
    !mv rosbank-ml-contest-boosters.pro.zip data/rosbank/

## Setup

In [4]:
%load_ext autoreload
%autoreload 2

import torch
import torch.nn as nn
import pytorch_lightning as pl

from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import train_test_split

from models import GptPretrainContrastiveModule, NextItemPredictionModule

from ptls.nn import TrxEncoder, RnnEncoder
from ptls.frames.gpt import GptPretrainModule, GptDataset
from ptls.preprocessing import PandasDataPreprocessor
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames import PtlsDataModule

from dataset import MlmNoSliceDataset

import os
import pandas as pd

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Data preprocessing

In [5]:
data_path = 'data/rosbank'
train_csv = os.path.join(data_path, 'train.csv')

# Read the raw transactions, parse the timestamp column, order the rows
# chronologically and rename the columns used by the rest of the notebook.
source_data = (
    pd.read_csv(train_csv)
    .assign(
        TRDATETIME=lambda frame: pd.to_datetime(
            frame['TRDATETIME'], format='%d%b%y:%H:%M:%S'
        )
    )
    .sort_values(by=['TRDATETIME'])
    .rename(columns={'cl_id': 'client_id', 'MCC': 'small_group', 'amount': 'amount_rur'})
)
source_data.head()

Unnamed: 0,PERIOD,client_id,small_group,channel_type,currency,TRDATETIME,amount_rur,trx_category,target_flag,target_sum
26790,01/10/2016,485,4121,type2,810,2016-10-07 00:00:00,242.0,POS,1,85.0
72077,01/10/2016,1290,5411,type2,810,2016-10-07 00:00:00,2465.0,POS,1,321242.09
26988,01/10/2016,485,6011,type2,810,2016-10-07 00:00:00,3600.0,WD_ATM_PARTNER,1,85.0
72068,01/10/2016,1290,6011,type2,810,2016-10-07 18:57:17,10000.0,WD_ATM_ROS,1,321242.09
189585,01/10/2016,3351,8999,type3,810,2016-10-08 00:00:00,10000.0,POS,0,0.0


In [6]:
# Give every MCC code a dense 1-based id, numbered in order of first appearance
# (presumably leaving 0 free for padding -- confirm against TrxEncoder).
unique_mccs = source_data['small_group'].unique()
mcc_to_id = dict(zip(unique_mccs, range(1, len(unique_mccs) + 1)))

# Discretise the transaction amount into 10 ordinal bins, shifted to start at 1.
amount_binner = KBinsDiscretizer(10, encode='ordinal', subsample=None)
amount_bins = amount_binner.fit_transform(source_data[['amount_rur']]).astype('int')
source_data['amount_rur_bin'] = 1 + amount_bins

# Replace the raw MCC codes with their dense ids.
source_data['small_group'] = source_data['small_group'].map(mcc_to_id)

In [7]:
# Settings for turning the flat transaction table into per-client records:
# which column identifies the client, which one carries the event time, and
# which features are treated as categorical / numerical.
preprocessor_settings = dict(
    col_id='client_id',
    col_event_time='TRDATETIME',
    event_time_transformation='dt_to_timestamp',
    cols_category=['small_group'],
    cols_numerical=['amount_rur_bin'],
    return_records=True,
)
preprocessor = PandasDataPreprocessor(**preprocessor_settings)

In [8]:
%%time

dataset = preprocessor.fit_transform(source_data[['client_id', 'TRDATETIME', 'small_group', 'amount_rur_bin']])

CPU times: total: 2.08 s
Wall time: 2.12 s


In [9]:
# Order the per-client records by client_id before splitting.
dataset = sorted(dataset, key=lambda record: record['client_id'])

In [10]:
# 80% of the clients go to train; the remaining 20% are halved into
# validation and test. Both splits use a fixed seed.
train, valid_test = train_test_split(dataset, test_size=0.2, random_state=42)
valid, test = train_test_split(valid_test, test_size=0.5, random_state=42)

tuple(len(part) for part in (train, valid, test))

(4000, 500, 500)

### Naive pop

In [14]:
from sklearn.metrics import f1_score, accuracy_score

# Baseline: for every test transaction predict the MCC id that occurs most
# often among the training clients' transactions.
train_clients = [record["client_id"] for record in train]
test_clients = [record["client_id"] for record in test]

is_train_row = source_data["client_id"].isin(train_clients)
is_test_row = source_data["client_id"].isin(test_clients)
train_data = source_data.loc[is_train_row]
test_data = source_data.loc[is_test_row]

# value_counts() sorts by descending frequency, so the first label is the mode.
mcc_counts = train_data["small_group"].value_counts()
most_pop_mcc = mcc_counts.index[0]

y_true = test_data["small_group"]
y_pred = [most_pop_mcc] * len(test_data)

f1_score(y_true, y_pred, average="weighted")

0.09109194206711545

### Naive prev

In [15]:
# Baseline: predict, for every test transaction, the MCC id of the same client's
# previous transaction (rows keep the order they have in test_data).
y_true = test_data["small_group"].rename("y_true")

# groupby(...).shift() moves values down by one row *within* each client, so a
# client's first transaction gets NaN instead of another client's last value.
# This replaces the slower groupby.apply(lambda ...) + reset_index construction.
y_pred = test_data.groupby("client_id")["small_group"].shift().rename("y_pred")

# Both series share test_data's index, so concat pairs each transaction with its
# prediction; dropna removes each client's first transaction (no previous item).
y_concat = pd.concat([y_true, y_pred], axis=1).dropna()

f1_score(y_concat["y_true"], y_concat["y_pred"], average="weighted")

0.2374830911817028

In [11]:
# Training set: GptDataset configured with min_len=25 and max_len=200.
train_ds = GptDataset(
    MemoryMapDataset(data=train),
    min_len=25,
    max_len=200,
)

# Validation and test sets are wrapped in MlmNoSliceDataset.
valid_ds = MlmNoSliceDataset(MemoryMapDataset(data=valid))
test_ds = MlmNoSliceDataset(MemoryMapDataset(data=test))

train_dl = PtlsDataModule(
    train_data=train_ds,
    valid_data=valid_ds,
    test_data=test_ds,
    train_batch_size=128,
)

# No pretraining

In [None]:
# Train the next-item model from scratch 50 times and append each run's
# weighted test F1 to results/stats_basic_rosbank.csv.
for _ in range(50):
    trx_encoder_params = dict(
        embeddings_noise=0.0,
        embeddings={
            'small_group': {'in': 350, 'out': 16},
            'amount_rur_bin':{'in': 11, 'out': 16}
        },
        linear_projection_size = 32
    )

    seq_encoder = RnnEncoder(
            input_size=32,
            hidden_size=32,
            type='gru',
    )

    model_downstream = NextItemPredictionModule(
        trx_encoder=TrxEncoder(**trx_encoder_params),
        seq_encoder=seq_encoder,
        target_col='small_group',
        max_lr=0.01,
        total_steps=20000
    )

    trainer = pl.Trainer(
        max_epochs=100,
        gpus=1 if torch.cuda.is_available() else 0,
        callbacks=[pl.callbacks.EarlyStopping('gpt/valid_gpt_loss', mode='min', patience=5)],
        enable_progress_bar=False,
    )

    trainer.fit(model_downstream, train_dl)

    # trainer.test returns one metrics dict per test dataloader; take the first.
    data = {
        "Scores": [trainer.test(model_downstream, train_dl)[0]['gpt/test_f1_weighted']]
        }
    df = pd.DataFrame(data)

    # Make sure the output directory exists before reading or writing.
    os.makedirs('results', exist_ok=True)

    if os.path.isfile('results/stats_basic_rosbank.csv'):
        # File already holds earlier runs: append this run's score to them.
        stats = pd.read_csv('results/stats_basic_rosbank.csv')

        stats = pd.concat([stats, df], ignore_index=True)

        stats.to_csv('results/stats_basic_rosbank.csv', index = False)

    else:
        # First run: create the file with this run's score.
        df.to_csv('results/stats_basic_rosbank.csv', index = False)

## Embedding training (representation)

Model training in our framework is organised via the pytorch-lightning (pl) framework.
The key parts of neural network training in pl are: 

    * model (`pytorch_lightning.LightningModule`)
    * data loader (`torch.utils.data.DataLoader`)
    * trainer (`pytorch_lightning.Trainer`)
    
For further details check https://pytorchlightning.ai/

In [9]:
# MCC ids were assigned as 1..N over the unique codes, so the largest id is N.
n_unique_mcc = source_data['small_group'].max()
print("Number of unique MCC codes:", n_unique_mcc)

Number of unique MCC codes: 344


# Experiments

In [None]:
# 50 runs of: GPT-style pretraining -> next-item training on top of the frozen
# transaction encoder -> append the weighted test F1 to
# results/stats_repr_rosbank.csv.
for _ in range(50):

    trx_encoder_params = dict(
        embeddings_noise=0.0,
        embeddings={
            'small_group': {'in': 350, 'out': 16},
            'amount_rur_bin':{'in': 11, 'out': 16}
        },
        linear_projection_size = 32
    )

    seq_encoder = RnnEncoder(
            input_size=32,
            hidden_size=32,
            type='gru',
    )

    # Stage 1: pretraining.
    model = GptPretrainModule(
        trx_encoder=TrxEncoder(**trx_encoder_params),
        seq_encoder=seq_encoder,
        max_lr=0.1,
        total_steps=20000
    )

    trainer = pl.Trainer(
        max_epochs=100,
        gpus=1 if torch.cuda.is_available() else 0,
        callbacks=[pl.callbacks.EarlyStopping('gpt/valid_gpt_loss')],
        enable_progress_bar=False,
    )

    trainer.fit(model, train_dl)

    # Freeze only the transaction encoder; the same seq_encoder object is reused
    # below and stays trainable.
    model.trx_encoder.requires_grad_(False)

    # Stage 2: next-item prediction on top of the pretrained encoders.
    model_downstream = NextItemPredictionModule(
        trx_encoder=model.trx_encoder, 
        seq_encoder=seq_encoder,
        target_col='small_group',
        max_lr=0.01,
        total_steps=20000
    )


    trainer = pl.Trainer(
        max_epochs=100,
        gpus=1 if torch.cuda.is_available() else 0,
        callbacks=[pl.callbacks.EarlyStopping('gpt/valid_gpt_loss', patience=5, mode='min')],
        enable_progress_bar=False,
    )

    trainer.fit(model_downstream, train_dl)

    # Evaluate THIS run. Without this step `df` would be whatever an earlier cell
    # left behind (or undefined on a fresh kernel).
    data = {
        "Scores": [trainer.test(model_downstream, train_dl)[0]['gpt/test_f1_weighted']]
        }
    df = pd.DataFrame(data)

    # Make sure the output directory exists before reading or writing.
    os.makedirs('results', exist_ok=True)

    if os.path.isfile('results/stats_repr_rosbank.csv'):
        # File already holds earlier runs: append this run's score to them.
        stats = pd.read_csv('results/stats_repr_rosbank.csv')

        stats = pd.concat([stats, df], ignore_index=True)

        stats.to_csv('results/stats_repr_rosbank.csv', index = False)

    else:
        # First run: create the file with this run's score.
        df.to_csv('results/stats_repr_rosbank.csv', index = False)

# Contrastive experiments

In [None]:
# 50 runs of: contrastive pretraining -> next-item training on top of the frozen
# transaction encoder -> append the weighted test F1 to
# results/stats_contr_rosbank.csv.
for _ in range(50):
    trx_encoder_params = dict(
        embeddings_noise=0.0,
        embeddings={
            'small_group': {'in': 350, 'out': 16},
            'amount_rur_bin':{'in': 11, 'out': 16}
        },
        linear_projection_size = 32
    )

    seq_encoder = RnnEncoder(
            input_size=32,
            hidden_size=32,
            type='gru',
    )

    # Stage 1: contrastive pretraining.
    model = GptPretrainContrastiveModule(
        trx_encoder=TrxEncoder(**trx_encoder_params),
        seq_encoder=seq_encoder,
        max_lr=0.1,
        total_steps=20000,
        neg_count=10,
        loss_temperature=10
    )

    # NOTE(review): this stage monitors 'mlm/valid_mlm_loss' while every other
    # trainer in the notebook monitors 'gpt/valid_gpt_loss' -- confirm that
    # GptPretrainContrastiveModule logs this key.
    trainer = pl.Trainer(
        max_epochs=100,
        gpus=1 if torch.cuda.is_available() else 0,
        callbacks=[pl.callbacks.EarlyStopping('mlm/valid_mlm_loss')],
        enable_progress_bar=False,
    )

    trainer.fit(model, train_dl)

    # Freeze only the transaction encoder; the same seq_encoder object is reused
    # below and stays trainable.
    model.trx_encoder.requires_grad_(False)

    # Stage 2: next-item prediction on top of the pretrained encoders.
    model_downstream = NextItemPredictionModule(
        trx_encoder=model.trx_encoder, 
        seq_encoder=seq_encoder,
        target_col='small_group',
        max_lr=0.01,
        total_steps=20000
    )

    trainer = pl.Trainer(
        max_epochs=100,
        gpus=1 if torch.cuda.is_available() else 0,
        callbacks=[pl.callbacks.EarlyStopping('gpt/valid_gpt_loss', mode='min', patience=5)],
        enable_progress_bar=False,
    )

    trainer.fit(model_downstream, train_dl)

    # Evaluate THIS run. Without this step `df` would be whatever an earlier cell
    # left behind (or undefined on a fresh kernel).
    data = {
        "Scores": [trainer.test(model_downstream, train_dl)[0]['gpt/test_f1_weighted']]
        }
    df = pd.DataFrame(data)

    # Make sure the output directory exists before reading or writing.
    os.makedirs('results', exist_ok=True)

    if os.path.isfile('results/stats_contr_rosbank.csv'):
        # File already holds earlier runs: append this run's score to them.
        stats = pd.read_csv('results/stats_contr_rosbank.csv')

        stats = pd.concat([stats, df], ignore_index=True)

        stats.to_csv('results/stats_contr_rosbank.csv', index = False)

    else:
        # First run: create the file with this run's score.
        df.to_csv('results/stats_contr_rosbank.csv', index = False)