In [1]:
import os
import sys

# Remember the notebook's starting directory the first time this cell runs.
# Without this guard, re-running the cell would re-read the (already changed)
# working directory and climb one more level on every execution.
if '_NOTEBOOK_START_DIR' not in globals():
    _NOTEBOOK_START_DIR = os.path.abspath('')

dir2 = _NOTEBOOK_START_DIR
dir1 = os.path.dirname(dir2)

# Make the parent directory importable (the notebook imports `src.*` from it).
if dir1 not in sys.path:
    sys.path.append(dir1)

# Work from the parent directory so that relative paths such as 'data/...'
# and 'logs/...' used by later cells resolve from there.
os.chdir(dir1)

In [2]:
import pickle
from functools import partial

from hydra import compose, initialize

import pandas as pd

import torch

import pytorch_lightning as pl

from sklearn.model_selection import train_test_split

from ptls.preprocessing import PandasDataPreprocessor
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.frames.coles import CoLESModule
from ptls.data_load.datasets import MemoryMapDataset, inference_data_loader
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule

from src.networks.coles import MyCoLES

%load_ext autoreload
%autoreload 2

In [3]:
# Compose the Hydra config named 'config' from the '../config' directory.
with initialize(version_base=None, config_path='../config'):
    cfg = compose(config_name='config')

# Keep handles to the two sub-configs that are passed to the checkpoint loader later on.
cfg_preprop, cfg_model = cfg['dataset'], cfg['embed_model']

In [4]:
# Load the preprocessed dataset; the path is relative to the current working directory.
preproc_dataset_path = 'data/new_data/preprocessed/preproc_dataset.parquet'
orig_df = pd.read_parquet(preproc_dataset_path)

In [5]:
# Remove the 'sample_label' and 'target' columns before the frame is handed
# to the preprocessor.
# Rebinding (instead of `inplace=True`) together with `errors='ignore'` keeps
# this cell re-runnable: a second execution is a no-op rather than a KeyError.
orig_df = orig_df.drop(columns=['sample_label', 'target'], errors='ignore')

In [6]:
# Column roles handed to the ptls preprocessor: two categorical features,
# one numerical feature, and record-style output.
preprocessor_options = dict(
    cols_category=['mcc_code', 'is_income'],
    cols_numerical=['transaction_amt'],
    return_records=True,
)

# The two positional arguments are presumably the id column and the event-time
# column -- confirm against PandasDataPreprocessor's signature.
preprocessor = PandasDataPreprocessor(
    'user_id',
    'transaction_dttm',
    **preprocessor_options,
)

In [7]:
# Fit the preprocessor on the whole frame and transform it in the same call.
# The result is iterated as records that can be indexed by 'user_id'
# (that is how the following cells consume it).
dataset = preprocessor.fit_transform(orig_df)

In [8]:
# Order the records by 'user_id' -- presumably so the seeded split that follows
# always sees the same sequence of records.
dataset = list(dataset)
dataset.sort(key=lambda record: record['user_id'])

In [9]:
# Hold out 20% of the records; the fixed random_state makes the split repeatable.
train, test = train_test_split(
    dataset,
    random_state=42,
    test_size=0.2,
)

# Display the size of both partitions as the cell output.
(len(train), len(test))

(18026, 4507)

In [None]:
# Seed the Python, NumPy and PyTorch RNGs before any weights are created so
# that weight initialisation and the random sampling done during training are
# repeatable across runs (the train/test split already uses a fixed seed of 42).
pl.seed_everything(42, workers=True)

# Transaction-level encoder settings.
# 'transaction_amt' is fed with the 'identity' scaler and 'mcc_code' gets an
# embedding table ('in': 377, 'out': 32 -- presumably vocabulary size and
# embedding width; confirm against TrxEncoder's documentation).
# NOTE(review): 'is_income' is preprocessed as a categorical column but has no
# embedding entry here -- confirm that leaving it out is intentional.
trx_encoder_params = dict(
    embeddings_noise=0.005,
    numeric_values={'transaction_amt': 'identity'},
    embeddings={
        'mcc_code': {'in': 377, 'out': 32},
    },
)

# Sequence encoder: a GRU with a 256-wide hidden state on top of the
# transaction encoder.
seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(**trx_encoder_params),
    hidden_size=256,
    type='gru',
)

# CoLES training module: Adam (lr=0.001) with a StepLR schedule that multiplies
# the learning rate by 0.9 every 30 scheduler steps.
model = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=30, gamma=0.9),
)

In [None]:
# In-memory dataset over the training records, filtered with min_seq_len=40.
train_memory_ds = MemoryMapDataset(
    data=train,
    i_filters=[SeqLenFilter(min_seq_len=40)],
)

# Splitter configured for 5 slices per sequence with cnt_min=40 and cnt_max=80.
slice_sampler = SampleSlices(split_count=5, cnt_min=40, cnt_max=80)

# CoLES dataset that combines the filtered records with the slice sampler.
coles_train_ds = ColesDataset(train_memory_ds, splitter=slice_sampler)

# Data module that serves the CoLES dataset in batches of 256 using 16 workers.
train_dl = PtlsDataModule(
    train_data=coles_train_ds,
    train_batch_size=256,
    train_num_workers=16,
)

In [None]:
# Prepare the data module for the 'fit' stage, then take its training
# DataLoader so that a batch can be inspected before training starts.
train_dl.setup('fit')
dl = train_dl.train_dataloader()



In [None]:
# Pull a single batch from the training loader and display its first element
# as a quick sanity check.
first_batch = next(iter(dl))
first_batch[0]

<ptls.data_load.padded_batch.PaddedBatch at 0x7f27b200a100>

In [None]:
# Trainer settings for the fitting run: one GPU, up to 15 epochs, progress bar on.
fit_trainer_kwargs = dict(
    accelerator='gpu',
    devices=1,
    max_epochs=15,
    enable_progress_bar=True,
)
trainer = pl.Trainer(**fit_trainer_kwargs)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
# Train the CoLES module on the data module built earlier.
# NOTE(review): the saved output shows this run was stopped by a
# KeyboardInterrupt, and the cells that follow load weights from a checkpoint
# file rather than using `model` -- confirm whether this cell is still needed.
trainer.fit(model, train_dl)

  rank_zero_warn(
Missing logger folder: /app/lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]

  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0     
1 | _seq_encoder       | RnnSeqEncoder   | 235 K 
2 | _validation_metric | BatchRecallTopK | 0     
3 | _head              | Head            | 0     
-------------------------------------------------------
235 K     Trainable params
0         Non-trainable params
235 K     Total params
0.943     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [10]:
# Restore a trained MyCoLES model from disk, passing in the dataset and
# embedding-model sub-configs composed by Hydra.
checkpoint_path = 'logs/checkpoints/coles/coles_hidden_size_32_0.ckpt'
best_model = MyCoLES.load_from_checkpoint(
    checkpoint_path,
    coles_conf=cfg_model,
    data_conf=cfg_preprop,
)

In [15]:
# Build an inference loader over the training records (batch size 256, 2 workers).
# NOTE(review): this rebinds `train_dl`, which an earlier cell uses for the
# PtlsDataModule; after this cell the name refers to the loader returned by
# inference_data_loader, so re-running the training cells out of order would
# pick up the wrong object. The saved execution counts also jump from 10 to 15,
# so this notebook was not run strictly top to bottom.
train_dl = inference_data_loader(train, num_workers=2, batch_size=256)

In [16]:
# Trainer used for inference; devices=[2] selects the GPU with index 2.
# NOTE(review): max_epochs is presumably irrelevant for predict-only use -- confirm.
predict_trainer_kwargs = dict(
    accelerator='gpu',
    devices=[2],
    max_epochs=15,
    enable_progress_bar=True,
)
trainer = pl.Trainer(**predict_trainer_kwargs)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [17]:
# Run batched prediction with the restored model, then stack the per-batch
# outputs vertically into a single tensor.
batch_embeds = trainer.predict(best_model, train_dl)
train_embeds = torch.vstack(batch_embeds)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]
  rank_zero_warn(


Predicting: 0it [00:00, ?it/s]