In [1]:
import warnings
warnings.filterwarnings("ignore")

import os
import sys

dir2 = os.path.abspath('')
dir1 = os.path.dirname(dir2)
if not dir1 in sys.path:
    sys.path.append(dir1)

os.chdir('..')

%load_ext autoreload
%autoreload

In [2]:
from pathlib import Path

import pandas as pd

from hydra import initialize, compose
from hydra.utils import instantiate


from ptls.preprocessing import PandasDataPreprocessor
from ptls.frames import PtlsDataModule
from ptls.frames.coles import CoLESModule

from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

from sklearn.model_selection import train_test_split

from src.coles import CustomColesDataset

In [3]:
# Compose the churn experiment config, selecting the CoLES-on-CoLES model group.
with initialize(config_path="../config", version_base=None):
    cfg = compose(
        config_name="config_churn",
        overrides=["model=coles_on_coles_churn"],
    )

# Split the composed config into its dataset and model sections.
cfg_preprop, cfg_model = cfg["dataset"], cfg["model"]

In [4]:
# Display the model section of the composed config for inspection.
cfg_model

{'trainer_coles': {'checkpoint_callback': {'_target_': 'pytorch_lightning.callbacks.ModelCheckpoint', 'dirpath': 'logs/checkpoints/coles', 'filename': 'coles_model'}, 'enable_early_stopping': False, 'early_stopping': {'_target_': 'pytorch_lightning.callbacks.EarlyStopping', 'min_delta': 0.01, 'patience': 5, 'verbose': True}, 'logger': {'_target_': 'pytorch_lightning.loggers.TensorBoardLogger', 'save_dir': 'logs/tensorboard'}, 'trainer': {'_target_': 'pytorch_lightning.Trainer', 'accelerator': 'gpu', 'devices': [0], 'max_epochs': 60, 'log_every_n_steps': 10}}, 'name': 'coles_churn', 'dataset': {'_target_': 'src.coles.CustomColesDataset', 'min_len': 15, 'col_time': 'event_time', 'splitter': {'_target_': 'ptls.frames.coles.split_strategy.NoSplit'}}, 'datamodule': {'_target_': 'ptls.frames.PtlsDataModule', 'train_batch_size': 128, 'valid_batch_size': 128, 'train_num_workers': 8, 'valid_num_workers': 8}, 'model': {'_target_': 'src.coles.CoLESonCoLES', 'frozen_encoder': {'_target_': 'src.nn.

In [5]:
# Override the hidden size of the trainable encoder in the composed config.
# The node is mutated in place, so `cfg_model` sees the new value.
learning_encoder_cfg = cfg_model["model"]["learning_encoder"]
learning_encoder_cfg["hidden_size"] = 64

In [6]:
# Build the training-file path from the dataset section of the config
# (`cfg_preprop` is the same node as cfg["dataset"]) and load it.
train_path = Path(cfg_preprop["dir_path"]) / cfg_preprop["train_file_name"]
df = pd.read_parquet(train_path)
df.head(10)

Unnamed: 0,user_id,mcc_code,timestamp,amount,global_target,holiday_target,weekend_target,churn_target
0,0,19,2017-10-21 00:00:00,5023.0,0,0,1,0
1,0,2,2017-10-12 12:24:07,20000.0,0,0,0,0
2,0,10,2017-12-05 00:00:00,767.0,0,0,0,0
3,0,1,2017-10-21 00:00:00,2031.0,0,0,1,0
4,0,9,2017-10-24 13:14:24,36562.0,0,0,0,0
5,1,3,2017-10-16 00:00:00,380.0,0,0,0,0
6,1,3,2017-10-10 00:00:00,378.0,0,0,0,0
7,1,3,2017-10-16 00:00:00,199.0,0,0,0,0
8,1,3,2017-10-11 00:00:00,400.0,0,0,0,0
9,1,1,2017-07-26 00:00:00,598.0,0,0,0,0


In [7]:
# Column roles for the transaction frame: one id column, one event-time column
# converted via "dt_to_timestamp", one categorical and one numerical feature.
preprocessor_params = {
    "col_id": "user_id",
    "col_event_time": "timestamp",
    "event_time_transformation": "dt_to_timestamp",
    "cols_category": ["mcc_code"],
    "cols_numerical": ["amount"],
    "return_records": True,
}
preprocessor = PandasDataPreprocessor(**preprocessor_params)

In [8]:
# Fit the preprocessor on the raw frame and transform it in the same call.
# NOTE(review): the preprocessor was built with return_records=True, so the
# result is presumably a list of per-user records — confirm.
dataset = preprocessor.fit_transform(df)

In [9]:
# Fixed seed so the 80/20 train/validation split is identical on every
# Restart & Run All; without it each run trains and validates on different users.
SPLIT_SEED = 42
train, val = train_test_split(dataset, test_size=.2, random_state=SPLIT_SEED)

In [10]:
# Both splits are wrapped with the same dataset config; only the data differs.
dataset_cfg = cfg_model["dataset"]
train_data: CustomColesDataset = instantiate(dataset_cfg, data=train)
val_data: CustomColesDataset = instantiate(dataset_cfg, data=val)

In [11]:
# Build the datamodule from its config section, attaching the two datasets.
datamodule_cfg = cfg_model["datamodule"]
datamodule: PtlsDataModule = instantiate(
    datamodule_cfg, train_data=train_data, valid_data=val_data
)

In [12]:
# Instantiate the model described by the "model" node of the model config.
model_cfg = cfg_model["model"]
model: CoLESModule = instantiate(model_cfg)

In [13]:
# Checkpoint callback tracking the model's validation metric (higher is better).
checkpoint_cfg = cfg_model["trainer_coles"]["checkpoint_callback"]
model_checkpoint: ModelCheckpoint = instantiate(
    checkpoint_cfg, monitor=model.metric_name, mode="max"
)

In [14]:
# Early-stopping callback watching the same validation metric as the checkpoint.
early_stopping_cfg = cfg_model["trainer_coles"]["early_stopping"]
early_stopping: EarlyStopping = instantiate(
    early_stopping_cfg, monitor=model.metric_name, mode="max"
)

In [15]:
# TensorBoard logger built from the trainer section of the model config.
logger_cfg = cfg_model["trainer_coles"]["logger"]
logger: TensorBoardLogger = instantiate(logger_cfg)

In [16]:
# Assemble the callback list, honouring the `enable_early_stopping` flag from
# the config. Previously EarlyStopping was always attached, even though the
# composed config sets the flag to False. If the key is absent, early stopping
# stays enabled, which matches the previous behaviour.
trainer_cfg = cfg_model["trainer_coles"]
callbacks = [model_checkpoint]
if trainer_cfg.get("enable_early_stopping", True):
    callbacks.append(early_stopping)

trainer: Trainer = instantiate(
    trainer_cfg["trainer"],
    callbacks=callbacks,
    logger=logger
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [17]:
# Run training; the datamodule supplies both the train and validation loaders.
trainer.fit(model=model, datamodule=datamodule)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type                    | Params
---------------------------------------------------------------
0 | _loss              | ContrastiveLoss         | 0     
1 | _seq_encoder       | PretrainedRnnSeqEncoder | 4.3 M 
2 | _validation_metric | BatchRecallTopK         | 0     
3 | _head              | Head                    | 0     
4 | learning_encoder   | RnnEncoder              | 209 K 
---------------------------------------------------------------
209 K     Trainable params
4.3 M     Non-trainable params
4.5 M     Total params
18.094    Total estimated model params size (MB)


Epoch 0:   3%|▎         | 1/35 [00:46<26:08, 46.13s/it, loss=471, v_num=55, seq_len=117.0]

In [19]:
import torch

# Persist the model weights; the file name encodes the learning encoder's hidden size.
hidden_size = cfg_model['model']['learning_encoder']['hidden_size']
weights_path = f"coles_on_coles_{hidden_size}.pth"
torch.save(model.state_dict(), weights_path)