In [1]:
import warnings
warnings.filterwarnings("ignore")

import os
import sys

dir2 = os.path.abspath('')
dir1 = os.path.dirname(dir2)
if not dir1 in sys.path:
    sys.path.append(dir1)

os.chdir('..')

%load_ext autoreload
%autoreload

In [10]:
from pathlib import Path

import pandas as pd

import torch

from hydra import initialize, compose
from hydra.utils import instantiate

from ptls.frames import PtlsDataModule
from ptls.frames.coles import ColesDataset

from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter

from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.loggers import TensorBoardLogger, CometLogger

from sklearn.model_selection import train_test_split

from src.local_validation import LocalValidationModel

from src.utils.logging_utils import get_logger
from src.preprocessing import preprocess


## Read data

In [11]:
# Dataset identifier; it selects which config file is composed below.
DATASET = "churn"

# Compose the Hydra config named "config_<DATASET>" from the "../config" directory.
with initialize(version_base=None, config_path="../config"):
    cfg = compose(config_name="config_" + DATASET)

# Pull out the top-level sections of the composed config under short names.
cfg_preprop, cfg_dataset, cfg_model, cfg_validation = (
    cfg[section]
    for section in ("preprocessing", "dataset", "model", "validation")
)

In [12]:
# Read the raw table from the parquet file referenced by the preprocessing config
# and display it as the cell output.
source_path = Path(cfg_preprop["source"])
df = pd.read_parquet(source_path)
df

Unnamed: 0,user_id,mcc_code,timestamp,amount,global_target,holiday_target,weekend_target,churn_target
0,0,19,2017-10-21 00:00:00,5023.0,0,0,1,0
1,0,2,2017-10-12 12:24:07,20000.0,0,0,0,0
2,0,10,2017-12-05 00:00:00,767.0,0,0,0,0
3,0,1,2017-10-21 00:00:00,2031.0,0,0,1,0
4,0,9,2017-10-24 13:14:24,36562.0,0,0,0,0
...,...,...,...,...,...,...,...,...
490508,10215,37,2016-12-17 00:00:00,2110.9,0,0,1,0
490509,10215,1,2016-12-16 00:00:00,31.0,0,0,0,0
490510,10215,1,2016-12-06 00:00:00,182.0,0,0,0,0
490511,10215,2,2016-12-06 13:39:49,5000.0,0,0,0,0


## Preprocess and split data

In [13]:
# Create a logger named after the current module via the project's logging helper.
logger = get_logger(name=__name__)
# Run the project's preprocessing on the preprocessing config section and unpack its
# three results as the train / validation / test parts.
# NOTE(review): the recorded output shows a "[Memory] ... cache loaded" message, so the
# result is presumably served from an on-disk cache — confirm in src.preprocessing.
train, val, test = preprocess(cfg_preprop)

[Memory]0.0s, 0.0min    : Loading _preprocess...
__________________________________________preprocess cache loaded - 6.5s, 0.1min


## Init backbone model and load weights

In [14]:
# Build the backbone sequence encoder described in the validation config.
sequence_encoder = instantiate(cfg_validation["sequence_encoder"])

# Load the pretrained weights into the encoder.
# `map_location="cpu"` allows a checkpoint saved from any device to be read on any
# machine (including CPU-only ones); `load_state_dict` copies the values into the
# module's existing parameters, so the encoder's own device placement is unchanged.
# NOTE: `torch.load` unpickles the file — only use checkpoints from a trusted source.
sequence_encoder.load_state_dict(
    torch.load(cfg_validation["path_to_state_dict"], map_location="cpu")
)

<All keys matched successfully>

# Validation

## Use datasets with no splits for the new validation procedure

In [15]:
# Sequence-length setting of the validation model, reused for every split's filter.
# NOTE(review): it is passed positionally to SeqLenFilter — presumably the minimum
# sequence length; confirm against the ptls signature.
val_seq_len = cfg_validation["model"]["seq_len"]

# Wrap each split in an in-memory dataset with its own length filter.
data_train, data_val, data_test = (
    MemoryMapDataset(split, [SeqLenFilter(val_seq_len)])
    for split in (train, val, test)
)

# Instantiate the dataset class configured for validation on top of each split.
dataset_cfg = cfg_validation["dataset"]
train_dataset: ColesDataset = instantiate(dataset_cfg, data=data_train)
val_dataset: ColesDataset = instantiate(dataset_cfg, data=data_val)
test_dataset: ColesDataset = instantiate(dataset_cfg, data=data_test)

# Bundle the three datasets into a single datamodule for the trainer.
datamodule: PtlsDataModule = instantiate(
    cfg_validation["datamodule"],
    train_data=train_dataset,
    valid_data=val_dataset,
    test_data=test_dataset,
)

# New validation model

In [16]:
# Fix the global random seed before the validation model is created.
seed_everything(42)

# 'val_mode' options:
#     * 'donwstream' - using local targets (e.g. 'churn_target' or 'default_target')
#     * 'return_time' - predicting return time (COTIC-style) - NOT READY YET
#     * 'event_type' - predicting next event type (COTIC-style)
# NOTE(review): 'donwstream' is spelled exactly as in the original note; check the value
# LocalValidationModel actually accepts before correcting the spelling.

# Instantiate the validation model from the config, passing the loaded encoder as its backbone.
valid_model: LocalValidationModel = instantiate(cfg_validation["model"], backbone=sequence_encoder)

Global seed set to 42


In [17]:
# Take a single batch from the training dataloader to smoke-test the forward pass.
first_batch = next(iter(datamodule.train_dataloader()))
batch, labels = first_batch

print("inputs event time:", batch.payload["event_time"].shape)

# Run the validation model on the batch and fetch the labels it derives from the same batch.
preds, mask = valid_model(batch)
target = valid_model._get_validation_labels(batch)

# Report the shape of every tensor involved in the loss computation.
for caption, tensor in (("preds:", preds), ("mask:", mask), ("target:", target)):
    print(caption, tensor.shape)

inputs event time: torch.Size([4, 117])
preds: torch.Size([4, 86, 345])
mask: torch.Size([4, 86])
target: torch.Size([4, 86])


In [18]:
# Optional Comet logging (disabled). To enable it, uncomment the block below and export
# the key as the COMET_API_KEY environment variable.
# SECURITY: never hard-code the API key in the notebook. A key was previously embedded
# here in plain text; treat it as compromised and revoke/rotate it in Comet.
#comet_logger = CometLogger(
#    api_key=os.environ["COMET_API_KEY"],
#    project_name="macro-micro-coles",
#    workspace="stalex2902",
#    experiment_name="New validation CoLES Churn event_type, 100 types",
#    display_summary_level=0
#)

# Build the Lightning trainer from the validation config.
val_trainer: Trainer = instantiate(cfg_validation["trainer"])

# Fit the validation model on the datamodule, then evaluate it on the test data.
val_trainer.fit(valid_model, datamodule)
val_trainer.test(valid_model, datamodule)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]

  | Name      | Type             | Params
-----------------------------------------------
0 | backbone  | RnnSeqEncoder    | 4.3 M 
1 | pred_head | Sequential       | 44.2 K
2 | loss      | CrossEntropyLoss | 0     
-----------------------------------------------
44.2 K    Trainable params
4.3 M     Non-trainable params
4.4 M     Total params
17.434    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]


Testing: 0it [00:00, ?it/s]