## Data load

In [1]:
# import os

# if not os.path.exists('data/rosbank/train.csv'):
#     !mkdir -p data/rosbank
#     !curl -OL https://storage.yandexcloud.net/di-datasets/rosbank-ml-contest-boosters.pro.zip
#     !unzip -j -o rosbank-ml-contest-boosters.pro.zip 'data/*.csv' -d data/rosbank
#     !mv age-prediction-nti-sbebank-2019.zip data/rosbank/

## Setup

In [1]:
%load_ext autoreload
%autoreload 2

import torch
import torch.nn as nn
import pytorch_lightning as pl

from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import train_test_split

from models import GptPretrainContrastiveModule, NextItemPredictionModule

from ptls.nn import TrxEncoder, RnnEncoder
from ptls.frames.gpt import GptPretrainModule, GptDataset
from ptls.preprocessing import PandasDataPreprocessor
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames import PtlsDataModule

from dataset import MlmNoSliceDataset

import os
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


## Data preproccessing

In [2]:
source_data = pd.read_csv('transactions_sber.csv')
source_data = source_data.rename({'trans_date': 'TRDATETIME'}, axis=1)

In [3]:
# data_path = 'data/rosbank'


# source_data = pd.read_csv('transactions_sber.csv')
# # source_data['TRDATETIME'] = pd.to_datetime(source_data['TRDATETIME'], format='%d%b%y:%H:%M:%S')
source_data = source_data.sort_values(by=['TRDATETIME'])
source_data = source_data.rename(columns={'cl_id':'client_id', 'MCC':'small_group', 'amount':'amount_rur'})
source_data.head()

Unnamed: 0,client_id,TRDATETIME,small_group,amount_rur
5201569,44379,0,52,62.535
2788175,43594,0,125,10.524
2788174,43594,0,36,86.255
18975203,5882,0,12,5.132
18975202,5882,0,18,11.678


In [4]:
mcc_to_id = {mcc: i+1 for i, mcc in enumerate(source_data['small_group'].unique())}

source_data['amount_rur_bin'] = 1 + KBinsDiscretizer(10, encode='ordinal', subsample=None).fit_transform(source_data[['amount_rur']]).astype('int')
source_data['small_group'] = source_data['small_group'].map(mcc_to_id)

# Naive prev

In [6]:
y_true = source_data.groupby("client_id").apply(lambda x: x["small_group"]).reset_index()["small_group"].rename("y_true")
y_pred = source_data.groupby("client_id").apply(lambda x: x["small_group"].shift()).reset_index()["small_group"].rename("y_pred")
y_concat = pd.concat([y_true, y_pred], axis=1).dropna()
from sklearn.metrics import f1_score, accuracy_score
f1_score(y_concat["y_true"], y_concat["y_pred"], average="weighted")

0.14406434364916318

In [5]:
preprocessor = PandasDataPreprocessor(
    col_id='client_id',
    col_event_time='TRDATETIME',
    event_time_transformation='dt_to_timestamp',
    cols_category=['small_group'],
    cols_numerical=['amount_rur_bin'],
    return_records=True,
)

In [6]:
%%time

dataset = preprocessor.fit_transform(source_data[['client_id', 'TRDATETIME', 'small_group', 'amount_rur_bin']])

CPU times: user 33.1 s, sys: 7.06 s, total: 40.1 s
Wall time: 40.1 s


In [7]:
dataset = sorted(dataset, key=lambda x: x['client_id'])

In [13]:
train, valid_test = train_test_split(dataset, test_size=0.2, random_state=42)

valid, test = train_test_split(valid_test, test_size=0.5, random_state=42)
len(train), len(valid), len(test)

(24000, 3000, 3000)

# Naive pop

In [31]:
from sklearn.metrics import f1_score, accuracy_score
train_clients = [d["client_id"] for d in train]
test_clients = [d["client_id"] for d in test]

train_data = source_data[source_data["client_id"].isin(train_clients)]
test_data = source_data[source_data["client_id"].isin(test_clients)]

most_pop_mcc = train_data["small_group"].value_counts().index[0]

y_true = test_data["small_group"]
y_pred = [most_pop_mcc]*test_data.shape[0]

f1_score(y_true, y_pred, average="weighted")

0.14399694748874114

In [9]:
train_dl = PtlsDataModule(
    train_data=GptDataset(
        MemoryMapDataset(
            data=train,
            # i_filters=[
            #     SeqLenFilter(min_seq_len=25),
            # ],
        ),
        min_len=25, 
        max_len=200
    ),
    valid_data=MlmNoSliceDataset(
        MemoryMapDataset(
            data=valid,
            # i_filters=[
            #     SeqLenFilter(min_seq_len=25),
            # ],
        ),
    ),
    test_data=MlmNoSliceDataset(
        MemoryMapDataset(
            data=test,
            # i_filters=[
            #     SeqLenFilter(min_seq_len=25),
            # ],
        ),
    ),
    train_batch_size=128,
)

# No pretraining

In [None]:
for _ in range(50):
    trx_encoder_params = dict(
        embeddings_noise=0.0,
        embeddings={
            'small_group': {'in': 203, 'out': 16},
            'amount_rur_bin':{'in': 11, 'out': 16}
        },
        linear_projection_size = 32
    )

    seq_encoder = RnnEncoder(
            input_size=32,
            hidden_size=32,
            type='gru',
    )

    model_downstream = NextItemPredictionModule(
        trx_encoder=TrxEncoder(**trx_encoder_params),
        seq_encoder=seq_encoder,
        target_col='small_group',
        max_lr=0.01,
        total_steps=10000
    )

    trainer = pl.Trainer(
        max_epochs=50,
        gpus=1 if torch.cuda.is_available() else 0,
        callbacks=[pl.callbacks.EarlyStopping('gpt/valid_gpt_loss', mode='min', patience=5)],
        enable_progress_bar=False,
    )

    trainer.fit(model_downstream, train_dl)

    data = {
    "Scores": [trainer.test(model_downstream, train_dl)[0]['gpt/test_f1_weighted']]
    }

    df = pd.DataFrame(data)

    stats = pd.read_csv('stats_basic_sber.csv')

    stats = pd.concat([stats, df], ignore_index=True)

    stats.to_csv('stats_basic_sber.csv', index = False)

## Embedding training (representation)

Model training in our framework organised via pytorch-lightning (pl) framework.
The key parts of neural networks training in pl are: 

    * model (`pytorch_lightning.LightningModule`)
    * data loader (`torch.utils.data.DataLoader`)
    * trainer (`pytorch_lightning.Trainer`)
    
For futher details check https://pytorchlightning.ai/

In [10]:
print("Number of unique MCC codes:", source_data['small_group'].max())

Number of unique MCC codes: 202


In [None]:
for _ in range(50):

    trx_encoder_params = dict(
        embeddings_noise=0.0,
        embeddings={
            'small_group': {'in': 203, 'out': 16},
            'amount_rur_bin':{'in': 11, 'out': 16}
        },
        linear_projection_size = 32
    )

    seq_encoder = RnnEncoder(
            input_size=32,
            hidden_size=32,
            type='gru',
    )

    model = GptPretrainModule(
        trx_encoder=TrxEncoder(**trx_encoder_params),
        seq_encoder=seq_encoder,
        max_lr=0.1,
        total_steps=10000
    )

    trainer = pl.Trainer(
        max_epochs=50,
        gpus=1 if torch.cuda.is_available() else 0,
        callbacks=[pl.callbacks.EarlyStopping('gpt/valid_gpt_loss', patience = 5)],
        enable_progress_bar=False,
    )

    trainer.fit(model, train_dl)

    model.trx_encoder.requires_grad_(False)

    model_downstream = NextItemPredictionModule(
        trx_encoder=model.trx_encoder, # model.trx_encoder,
        seq_encoder=seq_encoder, # model._seq_encoder,
        target_col='small_group',
        max_lr=0.01,
        total_steps=10000
    )


    trainer = pl.Trainer(
        max_epochs=50,
        gpus=1 if torch.cuda.is_available() else 0,
        callbacks=[pl.callbacks.EarlyStopping('gpt/valid_gpt_loss', patience=5, mode='min')],
        enable_progress_bar=False,
    )

    trainer.fit(model_downstream, train_dl)

    data = {
    "Scores": [trainer.test(model_downstream, train_dl)[0]['gpt/test_f1_weighted']]
    }

    df = pd.DataFrame(data)

    stats = pd.read_csv('stats_repr_sber.csv')

    stats = pd.concat([stats, df], ignore_index=True)

    stats.to_csv('stats_repr_sber.csv', index = False)

# Contrastive experiments

In [None]:
for _ in range(50):
    trx_encoder_params = dict(
        embeddings_noise=0.0,
        embeddings={
            'small_group': {'in': 203, 'out': 16},
            'amount_rur_bin':{'in': 11, 'out': 16}
        },
        linear_projection_size = 32
    )

    seq_encoder = RnnEncoder(
            input_size=32,
            hidden_size=32,
            type='gru',
    )

    model = GptPretrainContrastiveModule(
        trx_encoder=TrxEncoder(**trx_encoder_params),
        seq_encoder=seq_encoder,
        max_lr=0.1,
        total_steps=10000,
        neg_count=10,
        loss_temperature=10
    )

    trainer = pl.Trainer(
        max_epochs=50,
        gpus=1 if torch.cuda.is_available() else 0,
        callbacks=[pl.callbacks.EarlyStopping('mlm/valid_mlm_loss')],
        enable_progress_bar=False,
    )

    trainer.fit(model, train_dl)

    model.trx_encoder.requires_grad_(False)

    model_downstream = NextItemPredictionModule(
        trx_encoder=model.trx_encoder, #TrxEncoder(**trx_encoder_params),
        seq_encoder=seq_encoder, # seq_encoder,
        target_col='small_group',
        max_lr=0.01,
        total_steps=10000
    )

    trainer = pl.Trainer(
        max_epochs=50,
        gpus=1 if torch.cuda.is_available() else 0,
        callbacks=[pl.callbacks.EarlyStopping('gpt/valid_gpt_loss', mode='min', patience=5)],
        enable_progress_bar=False,
    )

    trainer.fit(model_downstream, train_dl)

    data = {
    "Scores": [trainer.test(model_downstream, train_dl)[0]['gpt/test_f1_weighted']]
    }

    df = pd.DataFrame(data)

    stats = pd.read_csv('stats_contr_sber.csv')

    stats = pd.concat([stats, df], ignore_index=True)

    stats.to_csv('stats_contr_sber.csv', index = False)