In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sklearn.model_selection
from ptls.preprocessing import PandasDataPreprocessor
import pandas as pd

In [3]:
df = pd.read_csv('data/train_data_reduced.csv')
df_test = pd.read_csv('data/train_labels_reduced.csv')

cols_category=[
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68",
]

objs = [k for k,v in df.dtypes.items() if v==object and k not in ['S_2']]
df = df.set_index('customer_ID').join(df_test.set_index('customer_ID')[['target']])
df = df.reset_index()
df = df.drop('Unnamed: 0', axis=1)
#df = df.drop('Unnamed: 0')
df[objs] = df[objs].astype('category').apply(lambda s: s.cat.codes)

cols_numerical = [x for x in df.columns if x not in cols_category]

preprocessor = PandasDataPreprocessor(
    col_id='customer_ID',
    col_event_time='S_2',
    event_time_transformation='none',
    cols_category=cols_category,
    cols_numerical=[x for x in cols_numerical if x != 'target'],
    return_records=True,
)
# Transform S_2 to number of days relative to first entry (probably bad - test data!?
dt=pd.to_datetime(df.S_2, format='%Y-%m-%d')
days_relative = (dt - dt.min()).dt.days
df['S_2'] = days_relative
# Drop target variable but save into series
df_target = df[['target']]
df = df.drop('target', axis=1)
# Preprocess data
prepped = preprocessor.fit_transform(df)

In [4]:
train, val = sklearn.model_selection.train_test_split(prepped)

In [9]:
from functools import partial
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.frames.coles import CoLESModule
import torch

trx_encoder_params = dict(
    embeddings_noise=0.003,
    numeric_values={c: 'identity' for c in cols_numerical if c != 'customer_ID' and c != 'target'},
    embeddings={col: {'in': d, 'out': 4} for col, d in preprocessor.get_category_dictionary_sizes().items()},
)

seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(**trx_encoder_params),
    hidden_size=256,
    type='gru',
)

model = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.0001),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=30, gamma=0.9),
)

In [10]:
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule
from ptls.data_load.datasets import MemoryMapDataset

train_dl = PtlsDataModule(
    train_data=ColesDataset(
        MemoryMapDataset(
            data=train,
            i_filters=[
                SeqLenFilter(min_seq_len=3),
            ],
        ),
        splitter=SampleSlices(
            split_count=5,
            cnt_min=1,
            cnt_max=13,
        ),
    ),
    valid_data=ColesDataset(
        MemoryMapDataset(
            data=val,
            i_filters=[
                SeqLenFilter(min_seq_len=3),
            ],
        ),
        splitter=SampleSlices(
            split_count=5,
            cnt_min=1,
            cnt_max=13,
        ),
    ),
    train_num_workers=16,
    train_batch_size=256,
)



In [11]:
import torch
import pytorch_lightning as pl

import logging

trainer = pl.Trainer(
    max_epochs=15,
    gpus=1 if torch.cuda.is_available() else 0,
    enable_progress_bar=True,
    log_every_n_steps=5,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [12]:
%%time
print(f'logger.version = {trainer.logger.version}')
trainer.fit(model, train_dl)
print(trainer.logged_metrics)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0     
1 | _seq_encoder       | RnnSeqEncoder   | 369 K 
2 | _validation_metric | BatchRecallTopK | 0     
3 | _head              | Head            | 0     
-------------------------------------------------------
369 K     Trainable params
0         Non-trainable params
369 K     Total params
1.478     Total estimated model params size (MB)


logger.version = 5


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

{'loss': tensor(nan), 'seq_len': tensor(6.5600), 'recall_top_k': tensor(0.0041)}
CPU times: user 49.4 s, sys: 30.5 s, total: 1min 19s
Wall time: 3min 36s
