In [1]:
import os
import sys
import pickle
from functools import partial

import pandas as pd

import torch

import pytorch_lightning as pl

from sklearn.model_selection import train_test_split

from ptls.preprocessing import PandasDataPreprocessor
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.frames.coles import CoLESModule
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule

In [2]:
# Resolve the project root only once. `os.path.abspath('')` depends on the
# current working directory, so recomputing it after the chdir below would make
# every re-run of this cell climb one directory higher.
if 'dir1' not in globals():
    dir2 = os.path.abspath('')    # working directory at the first run of this cell
    dir1 = os.path.dirname(dir2)  # its parent directory

# Make modules located in the parent directory importable (added at most once).
if dir1 not in sys.path:
    sys.path.append(dir1)

# Switch to the parent directory so the relative data paths used below resolve.
# Using the stored absolute path (rather than '..') keeps this idempotent.
os.chdir(dir1)

In [3]:
# Location of the preprocessed transactions table, relative to the working
# directory selected in the setup cell.
PREPROC_DATASET_PATH = 'data/new_data/preprocessed/preproc_dataset.parquet'

# Load the whole table into memory.
orig_df = pd.read_parquet(PREPROC_DATASET_PATH)

In [4]:
# Remove the `sample_label` and `target` columns before building sequences.
# The result is rebound instead of using `inplace=True`, and `errors='ignore'`
# is passed, so re-running this cell after the columns are already gone is a
# no-op rather than a KeyError.
orig_df = orig_df.drop(columns=['sample_label', 'target'], errors='ignore')

In [5]:
# Display the frame to inspect the columns that remain after the drop above.
orig_df

Unnamed: 0,user_id,mcc_code,transaction_amt,transaction_dttm,is_income
0,1,1,5.889078,2020-08-03 08:05:23,0
1,1,2,4.922270,2020-08-05 01:27:40,0
2,1,2,4.933393,2020-08-05 03:28:11,0
3,1,3,5.734882,2020-08-06 00:36:29,0
4,1,1,4.893904,2020-08-09 00:30:13,0
...,...,...,...,...,...
19160987,22533,1,3.706910,2021-07-31 05:33:03,0
19160988,22533,1,5.625801,2021-07-31 08:57:02,0
19160989,22533,1,4.927959,2021-07-31 08:59:33,0
19160990,22533,11,4.454891,2021-08-01 23:04:41,0


In [6]:
# Count how often each `mcc_code` value occurs. The number of distinct codes
# reported here is what the `mcc_code` embedding table defined later has to cover.
orig_df['mcc_code'].value_counts()

mcc_code
1      5654775
2      1737826
4      1577361
9       837687
11      687291
        ...   
333          1
320          1
319          1
300          1
377          1
Name: count, Length: 377, dtype: int64

In [7]:
# Columns handed to the preprocessor as categorical / numerical features.
categorical_columns = ['mcc_code', 'is_income']
numerical_columns = ['transaction_amt']

# Preprocessor keyed by `user_id` and `transaction_dttm` (first and second
# positional arguments); `return_records=True` is requested for the output.
preprocessor = PandasDataPreprocessor(
    'user_id',
    'transaction_dttm',
    cols_category=categorical_columns,
    cols_numerical=numerical_columns,
    return_records=True,
)

In [8]:
# Fit the preprocessor on the transactions table and transform it in one step.
# The result is consumed below as a collection of per-user dict records
# (each holding `user_id`, `event_time`, ... — see the `dataset[0]` cell).
dataset = preprocessor.fit_transform(orig_df)

In [9]:
def _user_id_of(record):
    """Return the `user_id` of a record; used as the sort key below."""
    return record['user_id']


# Order the records by user id so the record order is deterministic.
dataset = sorted(dataset, key=_user_id_of)

In [10]:
# Inspect the first record (lowest `user_id` after sorting) to check its structure.
dataset[0]

{'user_id': 1,
 'event_time': tensor([1596441923, 1596590860, 1596598091, 1596674189, 1596933013, 1596935908,
         1597016604, 1597030018, 1598297326, 1598492791, 1598579687, 1598588546,
         1598682722, 1598763249, 1598827950, 1598906774, 1598919551, 1599007728,
         1599011081, 1599090187, 1599187559, 1599337080, 1599343800, 1599690236,
         1599691008, 1599807011, 1599807250, 1599807291, 1599812334, 1599877249,
         1599880784, 1599961992, 1599974942, 1600029302, 1600216611, 1600295886,
         1600395559, 1600397605, 1600591385, 1600634580, 1600648099, 1600669921,
         1600726127, 1600738072, 1600807681, 1600823951, 1600981153, 1601002731,
         1601006665, 1601006964, 1601010742, 1601109529, 1601263755, 1601340163,
         1601344889, 1601353268, 1601423401, 1601433062, 1601526411, 1601607303,
         1602015600, 1602037995, 1602102240, 1602199480, 1602201519, 1602209422,
         1602273000, 1602295888, 1602297346, 1602300458, 1602300714, 1602304057,

In [11]:
# Hold out 20% of the records for testing; the fixed seed keeps the split
# reproducible between runs.
split_options = {'test_size': 0.2, 'random_state': 42}
train, test = train_test_split(dataset, **split_options)

# Show the sizes of both partitions.
(len(train), len(test))

(18026, 4507)

In [12]:
# Size of the `mcc_code` embedding table, taken from the fitted preprocessor
# instead of a hard-coded 377. 377 is only the number of distinct raw codes; an
# embedding with 377 rows can address ids 0..376, which leaves no room for any
# ids the preprocessor reserves on top of the observed codes.
# NOTE(review): assumes the preprocessor reserves extra ids (e.g. padding /
# unseen values) — confirm against the ptls documentation.
mcc_dictionary_size = preprocessor.get_category_dictionary_sizes()['mcc_code']

# Transaction-level encoder settings: the numeric amount is passed through
# as-is ('identity') and `mcc_code` is embedded into 32 dimensions.
# NOTE(review): `is_income` is preprocessed as a categorical column but is not
# listed in `embeddings` here — confirm that leaving it out is intentional.
trx_encoder_params = dict(
    embeddings_noise=0.005,
    numeric_values={'transaction_amt': 'identity'},
    embeddings={
        'mcc_code': {'in': mcc_dictionary_size, 'out': 32},
    },
)

# Sequence encoder: a GRU with a 256-dimensional hidden state on top of the
# transaction encoder.
seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(**trx_encoder_params),
    hidden_size=256,
    type='gru',
)

# CoLES module: Adam with lr=0.001 and a StepLR schedule (step_size=30, gamma=0.9).
model = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=30, gamma=0.9),
)

In [13]:
# Training records, filtered with a minimum sequence length of 40.
train_records = MemoryMapDataset(
    data=train,
    i_filters=[SeqLenFilter(min_seq_len=40)],
)

# Splitter configuration for CoLES: 5 slices per record, each between 40 and
# 80 events long.
slice_sampler = SampleSlices(split_count=5, cnt_min=40, cnt_max=80)

# Data module that feeds the CoLES dataset to the trainer.
train_dl = PtlsDataModule(
    train_data=ColesDataset(train_records, splitter=slice_sampler),
    train_num_workers=16,
    train_batch_size=256,
)

In [14]:
# Prepare the data module for the 'fit' stage and grab its training dataloader
# so that a batch can be inspected before training starts.
train_dl.setup('fit')
dl = train_dl.train_dataloader()



In [17]:
# Sanity check: pull one batch from the training dataloader and show its first element.
next(iter(dl))[0]

<ptls.data_load.padded_batch.PaddedBatch at 0x7f27b200a100>

In [14]:
# Train on a single GPU when CUDA is available; otherwise fall back to the CPU
# so the notebook still runs on machines without a GPU instead of failing when
# the trainer is constructed.
accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'

# Trainer limited to 15 epochs on one device, with the progress bar enabled.
trainer = pl.Trainer(
    max_epochs=15,
    accelerator=accelerator,
    devices=1,
    enable_progress_bar=True,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [15]:
# Train `model` on the batches provided by the `train_dl` data module.
trainer.fit(model, train_dl)

  rank_zero_warn(
Missing logger folder: /app/lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]

  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0     
1 | _seq_encoder       | RnnSeqEncoder   | 235 K 
2 | _validation_metric | BatchRecallTopK | 0     
3 | _head              | Head            | 0     
-------------------------------------------------------
235 K     Trainable params
0         Non-trainable params
235 K     Total params
0.943     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
