In [1]:
!pip install pytorch_lightning
!pip install pytorch-lifestream

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch_lightning
  Downloading pytorch_lightning-2.0.0-py3-none-any.whl (715 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m715.6/715.6 KB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting lightning-utilities>=0.7.0
  Downloading lightning_utilities-0.8.0-py3-none-any.whl (20 kB)
Collecting torchmetrics>=0.7.0
  Downloading torchmetrics-0.11.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.2/519.2 KB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Collecting aiohttp!=4.0.0a0,!=4.0.0a1
  Downloading aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m35.6 MB/s[0m eta [36m0:00:00[0m
Collecting multidict<7.0,>=4.5
  Downloading multidict-6.0.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.w

In [2]:
import pandas as pd
import os
import torch
import pytorch_lightning as pl

In [3]:
import os

if not os.path.exists('data/transactions.csv'):
    ! mkdir -p data
    ! curl -OL https://storage.yandexcloud.net/datasouls-ods/materials/0433a4ca/transactions.zip
#    ! curl -OL https://storage.yandexcloud.net/datasouls-ods/materials/0554f0cf/clickstream.zip
    ! curl -OL https://storage.yandexcloud.net/datasouls-ods/materials/e756bf99/train.csv
 #   ! unzip clickstream.zip -d data
    ! unzip transactions.zip 
    ! cp train.csv data

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  250M  100  250M    0     0  51.2M      0  0:00:04  0:00:04 --:--:-- 60.0M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  307k  100  307k    0     0   786k      0 --:--:-- --:--:-- --:--:--  786k
Archive:  transactions.zip
  inflating: transactions.csv        
  inflating: __MACOSX/._transactions.csv  


In [4]:
transactions = pd.read_csv('transactions.csv')

In [64]:
transactions.user_id.nunique()

22533

In [5]:
transactions.transaction_dttm = pd.to_datetime(transactions.transaction_dttm)

In [6]:
from ptls.preprocessing import PandasDataPreprocessor

In [7]:
preprocessor = PandasDataPreprocessor(
    col_id='user_id',
    col_event_time='transaction_dttm',
    event_time_transformation='dt_to_timestamp',
    cols_category=['mcc_code', 'currency_rk'],
    cols_numerical=['transaction_amt'],
    return_records=True,
)

In [8]:
%%time 
dataset = preprocessor.fit_transform(transactions)

CPU times: user 1min 16s, sys: 12.5 s, total: 1min 28s
Wall time: 1min 43s


In [9]:
import pickle

with open('preprocessor.p', 'wb') as f:
    pickle.dump(preprocessor, f)

In [10]:
dataset = sorted(dataset, key = lambda x: x['user_id'])

In [11]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(dataset, test_size=0.2, random_state=42)

len(train), len(test)

(18026, 4507)

Embedding training

In [None]:
from functools import partial
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.frames.coles import CoLESModule

trx_encoder_params = dict(
    embeddings_noise=0.003,
    numeric_values={'transaction_amt': 'identity'},
    embeddings={
        'event_time': {'in': 800, 'out': 16},
        'mcc_code': {'in': 450, 'out': 16},
        'currency_rk':{'in': 4, 'out': 2}
    },
)

seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(**trx_encoder_params),
    hidden_size=800,
    type='gru',
)

model = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=30, gamma=0.9),
)

In [None]:
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices, SplitRandom, SampleRandom
from ptls.frames import PtlsDataModule

train_dl = PtlsDataModule(
    train_data=ColesDataset(
        MemoryMapDataset(
            data=train,
            i_filters=[
                SeqLenFilter(min_seq_len=25),
            ],
        ),
        splitter = SampleRandom(split_count=5,
                               cnt_min = 25,
                               cnt_max = 200)
    ),
    train_num_workers=2,
    train_batch_size=256,
)

In [None]:


import logging

trainer = pl.Trainer(
    max_epochs=15,
    gpus=1 if torch.cuda.is_available() else 0,
    enable_progress_bar=False,
)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True, used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [None]:
%%time
print(f'logger.version = {trainer.logger.version}')
trainer.fit(model, train_dl)
print(trainer.logged_metrics)

  rank_zero_warn("You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.")


logger.version = 0


INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0     
1 | _seq_encoder       | RnnSeqEncoder   | 2.0 M 
2 | _validation_metric | BatchRecallTopK | 0     
3 | _head              | Head            | 0     
-------------------------------------------------------
2.0 M     Trainable params
0         Non-trainable params
2.0 M     Total params
8.118     Total estimated model params size (MB)


{'loss': tensor(105.7170), 'seq_len': tensor(110.8925)}
CPU times: user 21min 7s, sys: 2min 31s, total: 23min 38s
Wall time: 24min 55s


In [None]:
torch.save(seq_encoder.state_dict(), "coles_emb.pt")

In [None]:
from ptls.data_load.datasets import inference_data_loader

train_dl = inference_data_loader(train, num_workers=0, batch_size=256)
train_embeds = torch.vstack(trainer.predict(model, train_dl, ))

test_dl = inference_data_loader(test, num_workers=0, batch_size=256)
test_embeds = torch.vstack(trainer.predict(model, test_dl))

train_embeds.shape, test_embeds.shape

INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


(torch.Size([18026, 800]), torch.Size([4507, 800]))

Готово!

In [None]:
df_target = pd.read_csv( 'data/train.csv')
df_target.rename(columns={'bank':'user_id'},inplace=True)
df_target = df_target.set_index('user_id')

train_df = pd.DataFrame(data=train_embeds, columns=[f'embed_{i}' for i in range(train_embeds.shape[1])])
train_df['user_id'] = [x['user_id'] for x in train]
train_df = train_df.merge(df_target, how='left', on='user_id')

test_df = pd.DataFrame(data=test_embeds, columns=[f'embed_{i}' for i in range(test_embeds.shape[1])])
test_df['user_id'] = [x['user_id'] for x in test]
test_df = test_df.merge(df_target, how='left', on='user_id')

print(train_df.shape, test_df.shape)

(18026, 802) (4507, 802)


In [None]:
train_df.dropna(inplace=True)
test_df.dropna(inplace=True)
print(train_df.shape, test_df.shape)

(6838, 802) (1671, 802)


In [None]:
train_df.to_csv('train_df.csv')

In [None]:
test_df.to_csv('test_df.csv')

# Learning COLES

In [35]:
import pandas as pd
import numpy as np

In [37]:
from sklearn.metrics import roc_auc_score, precision_score, accuracy_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV


In [38]:
embed_columns = [x for x in train_df.columns if x.startswith('embed')]
x_train, y_train = train_df[embed_columns], train_df['higher_education']
x_test, y_test = test_df[embed_columns], test_df['higher_education']



In [14]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1.1-cp39-none-manylinux1_x86_64.whl (76.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.6/76.6 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.1.1


In [40]:
from catboost import CatBoostClassifier, metrics

# Catboost

In [41]:
x_train, y_train = train_df[embed_columns], train_df['higher_education']
x_test, y_test = test_df[embed_columns], test_df['higher_education']

In [49]:
#grid search, but did not give significant increase in roc auc
clf = CatBoostClassifier()
params = {'iterations': [800],
          'depth': [4, 5, 6],
          'loss_function': ['Logloss', 'CrossEntropy'],
          'l2_leaf_reg': np.logspace(-20, -19, 3),
          'leaf_estimation_iterations': [10],
#           'eval_metric': ['Accuracy'],
#           'use_best_model': ['True'],
          'logging_level':['Silent'],
          'random_seed': [42],
          'learning_rate': [0.005] #0.005
         }
clf_grid = GridSearchCV(estimator=clf, param_grid=params, scoring='roc_auc', verbose=3,  cv=2, n_jobs = -1)

In [50]:
clf_grid.fit(x_train, y_train)

Fitting 2 folds for each of 18 candidates, totalling 36 fits


In [56]:
CatBoostModel = clf_grid.best_estimator_
y_pred = CatBoostModel.predict(x_test)
y_proba = CatBoostModel.predict_proba(x_test)
print(f'''accuracy: {CatBoostModel.score(x_test, y_test)} 
      f1: {f1_score(y_pred, y_test)}, 
      precision: {precision_score(y_pred, y_test)}
     roc auc : {roc_auc_score(y_test, y_proba[:,1])}''')

accuracy: 0.7666068222621185 
      f1: 0.8550185873605949, 
      precision: 0.9274193548387096
     roc auc : 0.7702922685427739


In [None]:
CatBoostModel = CatBoostClassifier(
iterations= 500,
learning_rate = 0.05,
use_best_model = True,
eval_metric ='AUC', 
loss_function='Logloss',
random_seed = 42,
logging_level = 'Silent',
depth = 5)

In [None]:
CatBoostModel.fit(
    x_train, y_train,
    eval_set=(x_test, y_test),
    plot=True
#     logging_level='Verbose',  # you can uncomment this for text output
)

In [None]:
y_pred = CatBoostModel.predict(x_test)
y_proba = CatBoostModel.predict_proba(x_test)

In [None]:
#gru
print(f'''accuracy: {CatBoostModel.score(x_test, y_test)} 
      f1: {f1_score(y_pred, y_test)}, 
      precision: {precision_score(y_pred, y_test)}
     roc auc : {roc_auc_score(y_test, y_proba[:,1])}''')

accuracy: 0.7600239377618193 
      f1: 0.84953095684803, 
      precision: 0.9129032258064517
     roc auc : 0.7709116084125439


# Encoder type

In [16]:
%%time
from functools import partial
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.frames.coles import CoLESModule
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices, SplitRandom, SampleRandom
from ptls.frames import PtlsDataModule
import logging
from ptls.data_load.datasets import inference_data_loader

from sklearn.metrics import roc_auc_score, precision_score, accuracy_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, metrics

for encoder in ['gru', 'lstm']:
  trx_encoder_params = dict(
      embeddings_noise=0.003,
      numeric_values={'transaction_amt': 'identity'},
      embeddings={
          'event_time': {'in': 800, 'out': 16},
          'mcc_code': {'in': 450, 'out': 16},
          'currency_rk':{'in': 4, 'out': 2}
      },
  )

  seq_encoder = RnnSeqEncoder(
      trx_encoder=TrxEncoder(**trx_encoder_params),
      hidden_size=256,
      type = encoder,
  )

  model = CoLESModule(
      seq_encoder=seq_encoder,
      optimizer_partial=partial(torch.optim.Adam, lr=0.001),
      lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=30, gamma=0.9),
  )


  train_dl = PtlsDataModule(
      train_data=ColesDataset(
          MemoryMapDataset(
              data=train,
              i_filters=[
                  SeqLenFilter(min_seq_len=25),
              ],
          ),
          splitter = SampleRandom(split_count=5,
                                cnt_min = 25,
                                cnt_max = 200)
      ),
      train_num_workers=2,
      train_batch_size=256,
  )




  trainer = pl.Trainer(
      max_epochs=15,
      gpus=1 if torch.cuda.is_available() else 0,
      enable_progress_bar=False,
  )

  print(f'logger.version = {trainer.logger.version}')
  trainer.fit(model, train_dl)
  print(trainer.logged_metrics)


  train_dl = inference_data_loader(train, num_workers=0, batch_size=256)
  train_embeds = torch.vstack(trainer.predict(model, train_dl, ))

  test_dl = inference_data_loader(test, num_workers=0, batch_size=256)
  test_embeds = torch.vstack(trainer.predict(model, test_dl))
  df_target = pd.read_csv( 'data/train.csv')
  df_target.rename(columns={'bank':'user_id'},inplace=True)
  df_target = df_target.set_index('user_id')

  train_df = pd.DataFrame(data=train_embeds, columns=[f'embed_{i}' for i in range(train_embeds.shape[1])])
  train_df['user_id'] = [x['user_id'] for x in train]
  train_df = train_df.merge(df_target, how='left', on='user_id')

  test_df = pd.DataFrame(data=test_embeds, columns=[f'embed_{i}' for i in range(test_embeds.shape[1])])
  test_df['user_id'] = [x['user_id'] for x in test]
  test_df = test_df.merge(df_target, how='left', on='user_id')

  train_df.dropna(inplace=True)
  test_df.dropna(inplace=True)
  print(train_df.shape, test_df.shape)

  embed_columns = [x for x in train_df.columns if x.startswith('embed')]
  x_train, y_train = train_df[embed_columns], train_df['higher_education']
  x_test, y_test = test_df[embed_columns], test_df['higher_education']
  CatBoostModel = CatBoostClassifier(
  iterations= 500,
  learning_rate = 0.05,
  use_best_model = True,
  eval_metric ='AUC', 
  loss_function='Logloss',
  random_seed = 42,
  logging_level = 'Silent',
  depth = 5)
  CatBoostModel.fit(
    x_train, y_train,
    eval_set=(x_test, y_test),
    plot=True
#     logging_level='Verbose',  # you can uncomment this for text output
)
  y_pred = CatBoostModel.predict(x_test)
  y_proba = CatBoostModel.predict_proba(x_test)
  print(f'''{encoder}: accuracy: {CatBoostModel.score(x_test, y_test)} 
      f1: {f1_score(y_pred, y_test)}, 
      precision: {precision_score(y_pred, y_test)}
     roc auc : {roc_auc_score(y_test, y_proba[:,1])}''')

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True, used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0     
1 | _seq_encoder       | RnnSeqEncoder   | 245 K 
2 | _validation_metric | BatchRecallTopK | 0     
3 | _head              | Head            | 0     
-------------------------------------------------------
245 K     Trainable params
0         Non-trainable params
245 K     Total params
0.981     Total estimated model params size (MB)


logger.version = 1


INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


{'loss': tensor(100.6403), 'seq_len': tensor(114.1245)}


INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


(6838, 258) (1671, 258)


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True, used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
  rank_zero_warn("You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.")
INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0     
1 | _seq_encoder       | RnnSeqEncoder   | 320 K 
2 | _validation_metric | BatchRecallTopK | 0     
3 | _head              | Head            | 0     
-------------------------------------------------------
320 K     Trainable params
0         Non-trainable params
320 K     Total params
1.281    

gru: accuracy: 0.7630161579892281 
      f1: 0.8535502958579883, 
      precision: 0.9306451612903226
     roc auc : 0.7679384028141606
logger.version = 2


INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


{'loss': tensor(100.9346), 'seq_len': tensor(109.7774)}


INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


(6838, 258) (1671, 258)


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

lstm: accuracy: 0.7642130460801915 
      f1: 0.8509833585476552, 
      precision: 0.907258064516129
     roc auc : 0.7697028665519048
CPU times: user 8min 35s, sys: 37.7 s, total: 9min 13s
Wall time: 11min 57s


In [None]:
transactions = {'gru': 0.7605643290172892, 'lstm' : 0.7527879649726817}
transactions = {'gru':0.7679384028141606, 'lstm':0.7697028665519048}

# Sampling methods

In [17]:
%%time
from functools import partial
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.frames.coles import CoLESModule
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices, SplitRandom, SampleRandom
from ptls.frames import PtlsDataModule
import logging
from ptls.data_load.datasets import inference_data_loader
import torch
from sklearn.metrics import roc_auc_score, precision_score, accuracy_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, metrics

for sampling in [SampleSlices, SplitRandom, SampleRandom]:
  trx_encoder_params = dict(
      embeddings_noise=0.003,
      numeric_values={'transaction_amt': 'identity'},
      embeddings={
          'event_time': {'in': 800, 'out': 16},
          'mcc_code': {'in': 450, 'out': 16},
          'currency_rk':{'in': 4, 'out': 2}
      },
  )

  seq_encoder = RnnSeqEncoder(
      trx_encoder=TrxEncoder(**trx_encoder_params),
      hidden_size=256,
      type = 'gru',
  )

  model = CoLESModule(
      seq_encoder=seq_encoder,
      optimizer_partial=partial(torch.optim.Adam, lr=0.001),
      lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=30, gamma=0.9),
  )


  train_dl = PtlsDataModule(
      train_data=ColesDataset(
          MemoryMapDataset(
              data=train,
              i_filters=[
                  SeqLenFilter(min_seq_len=25),
              ],
          ),
          splitter = sampling(split_count=5,
                                cnt_min = 25,
                                cnt_max = 200)
      ),
      train_num_workers=2,
      train_batch_size=256,
  )




  trainer = pl.Trainer(
      max_epochs=15,
      gpus=1 if torch.cuda.is_available() else 0,
      enable_progress_bar=False,
  )

  print(f'logger.version = {trainer.logger.version}')
  trainer.fit(model, train_dl)
  print(trainer.logged_metrics)


  train_dl = inference_data_loader(train, num_workers=0, batch_size=256)
  train_embeds = torch.vstack(trainer.predict(model, train_dl, ))

  test_dl = inference_data_loader(test, num_workers=0, batch_size=256)
  test_embeds = torch.vstack(trainer.predict(model, test_dl))
  df_target = pd.read_csv( 'data/train.csv')
  df_target.rename(columns={'bank':'user_id'},inplace=True)
  df_target = df_target.set_index('user_id')

  train_df = pd.DataFrame(data=train_embeds, columns=[f'embed_{i}' for i in range(train_embeds.shape[1])])
  train_df['user_id'] = [x['user_id'] for x in train]
  train_df = train_df.merge(df_target, how='left', on='user_id')

  test_df = pd.DataFrame(data=test_embeds, columns=[f'embed_{i}' for i in range(test_embeds.shape[1])])
  test_df['user_id'] = [x['user_id'] for x in test]
  test_df = test_df.merge(df_target, how='left', on='user_id')
  train_df.dropna(inplace=True)
  test_df.dropna(inplace=True)
  print(train_df.shape, test_df.shape)

  embed_columns = [x for x in train_df.columns if x.startswith('embed')]
  x_train, y_train = train_df[embed_columns], train_df['higher_education']
  x_test, y_test = test_df[embed_columns], test_df['higher_education']
  CatBoostModel = CatBoostClassifier(
  iterations= 500,
  learning_rate = 0.05,
  use_best_model = True,
  eval_metric ='AUC', 
  loss_function='Logloss',
  random_seed = 42,
  logging_level = 'Silent',
  depth = 5)
  CatBoostModel.fit(
    x_train, y_train,
    eval_set=(x_test, y_test),
    plot=True
#     logging_level='Verbose',  # you can uncomment this for text output
)
  y_pred = CatBoostModel.predict(x_test)
  y_proba = CatBoostModel.predict_proba(x_test)
  print(f'''{sampling.__class__.__name__}: accuracy: {CatBoostModel.score(x_test, y_test)} 
      f1: {f1_score(y_pred, y_test)}, 
      precision: {precision_score(y_pred, y_test)}
     roc auc : {roc_auc_score(y_test, y_proba[:,1])}''')

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True, used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
  rank_zero_warn("You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.")
INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0     
1 | _seq_encoder       | RnnSeqEncoder   | 245 K 
2 | _validation_metric | BatchRecallTopK | 0     
3 | _head              | Head            | 0     
-------------------------------------------------------
245 K     Trainable params
0         Non-trainable params
245 K     Total params
0.981    

logger.version = 3


INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


{'loss': tensor(142.2618), 'seq_len': tensor(111.9170)}


INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


(6838, 258) (1671, 258)


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True, used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
  rank_zero_warn("You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.")
INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0     
1 | _seq_encoder       | RnnSeqEncoder   | 245 K 
2 | _validation_metric | BatchRecallTopK | 0     
3 | _head              | Head            | 0     
-------------------------------------------------------
245 K     Trainable params
0         Non-trainable params
245 K     Total params
0.981    

type: accuracy: 0.7749850388988629 
      f1: 0.8576835730507192, 
      precision: 0.9137096774193548
     roc auc : 0.7703427887134197
logger.version = 4


INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


{'loss': tensor(71.0469), 'seq_len': tensor(154.4509)}


INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


(6838, 258) (1671, 258)


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True, used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
  rank_zero_warn("You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.")
INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0     
1 | _seq_encoder       | RnnSeqEncoder   | 245 K 
2 | _validation_metric | BatchRecallTopK | 0     
3 | _head              | Head            | 0     
-------------------------------------------------------
245 K     Trainable params
0         Non-trainable params
245 K     Total params
0.981    

type: accuracy: 0.7666068222621185 
      f1: 0.8545861297539149, 
      precision: 0.9241935483870968
     roc auc : 0.769618666267495
logger.version = 5


INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


{'loss': tensor(99.8926), 'seq_len': tensor(115.5415)}


INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


(6838, 258) (1671, 258)


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

type: accuracy: 0.7552363853979653 
      f1: 0.844663881503988, 
      precision: 0.896774193548387
     roc auc : 0.7603940573310382
CPU times: user 13min 26s, sys: 50.1 s, total: 14min 16s
Wall time: 17min 35s


In [None]:
#SplitRandom
print(f'''accuracy: {CatBoostModel.score(x_test, y_test)} 
      f1: {f1_score(y_pred, y_test)}, 
      precision: {precision_score(y_pred, y_test)}
     roc auc : {roc_auc_score(y_test, y_proba[:,1])}''')

accuracy: 0.7576301615798923 
      f1: 0.8483713964807189, 
      precision: 0.9137096774193548
     roc auc : 0.7493132999027019


In [None]:
#SampleRandom
print(f'''accuracy: {CatBoostModel.score(x_test, y_test)} 
      f1: {f1_score(y_pred, y_test)}, 
      precision: {precision_score(y_pred, y_test)}
     roc auc : {roc_auc_score(y_test, y_proba[:,1])}''')

accuracy: 0.765409934171155 
      f1: 0.8527422990232908, 
      precision: 0.9153225806451613
     roc auc : 0.7678373624728688


In [None]:
transactions = {'SampleSlices':0.7605643290172892, 'SplitRandom':0.7493132999027019, 'SampleRandom':0.7678373624728688}
transactions = {'SampleSlices':0.7605643290172892, 'SplitRandom':0.7493132999027019, 'SampleRandom':0.7678373624728688}