In [None]:
!pip install pytorch_lightning
!pip install pytorch-lifestream
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch_lightning
  Downloading pytorch_lightning-2.0.0-py3-none-any.whl (715 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m715.6/715.6 KB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
Collecting torchmetrics>=0.7.0
  Downloading torchmetrics-0.11.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.2/519.2 KB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.7.0
  Downloading lightning_utilities-0.8.0-py3-none-any.whl (20 kB)
Collecting aiohttp!=4.0.0a0,!=4.0.0a1
  Downloading aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
Collecting frozenlist>=1.1.1
  Downloading frozenlist-1.3.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_

In [None]:
from ptls.data_load.datasets import ParquetDataset, ParquetFiles

iterable_train = ParquetDataset(ParquetFiles('train.parquet'))


In [None]:
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter, FeatureFilter

map_processed_train = MemoryMapDataset(
    data=iterable_train,
    i_filters=[
        SeqLenFilter(min_seq_len=25),
    ],
)

# Encoders

In [None]:
%%time
import torch
from functools import partial
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.frames.coles import CoLESModule
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices, SplitRandom, SampleRandom
from ptls.frames import PtlsDataModule
import torch
import pytorch_lightning as pl
from ptls.data_load.datasets import inference_data_loader
import logging
import pandas as pd
from catboost import CatBoostClassifier, metrics
from sklearn.metrics import roc_auc_score, precision_score, accuracy_score, recall_score, f1_score, roc_auc_score


for encoder in ['gru', 'lstm']:
  trx_encoder_params = dict(
    embeddings_noise=0.003,
    numeric_values={},
    embeddings={
        'event_time': {'in': 800, 'out': 16},
        'cat_id': {'in': 410, 'out': 16},
      },
  )

  seq_encoder = RnnSeqEncoder(
      trx_encoder=TrxEncoder(**trx_encoder_params),
      hidden_size=256,
      type=encoder,
  )

  model = CoLESModule(
      seq_encoder=seq_encoder,
      optimizer_partial=partial(torch.optim.Adam, lr=0.001),
      lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=30, gamma=0.9),
  )
  train_dl = PtlsDataModule(
    train_data=ColesDataset(
        map_processed_train,
        splitter=SampleSlices(
            split_count=5,
            cnt_min=25,
            cnt_max=200,
        ),
    ),
    train_num_workers=2,
    train_batch_size=256,
  )
  trainer = pl.Trainer(
    max_epochs=15,
    gpus=1 if torch.cuda.is_available() else 0,
    enable_progress_bar=False,
  )


  print(f'logger.version = {trainer.logger.version}')
  trainer.fit(model, train_dl)
  print(trainer.logged_metrics)
  iterable_test = ParquetDataset(ParquetFiles('test.parquet'))

  train_dl = inference_data_loader(list(iter(iterable_train)), num_workers=0, batch_size=256)

  train_embeds = torch.vstack(trainer.predict(model, train_dl, ))
  train_embeds = torch.vstack(trainer.predict(model, train_dl, ))

  test_dl = inference_data_loader(list(iter(iterable_test)), num_workers=0, batch_size=256)
  test_embeds = torch.vstack(trainer.predict(model, test_dl))
  df_target = pd.read_csv('target_dataset_matched.csv')
  df_target.rename(columns={'rtk':'user_id'},inplace=True)
  df_target = df_target.set_index('user_id')
  df_target.rename(columns={"higher_education": "target"}, inplace=True)

  train_df = pd.DataFrame(data=train_embeds, columns=[f'embed_{i}' for i in range(train_embeds.shape[1])])
  train_df['user_id'] = [x['user_id'] for x in iter(iterable_train)]
  train_df = train_df.merge(df_target, how='left', on='user_id')

  test_df = pd.DataFrame(data=test_embeds, columns=[f'embed_{i}' for i in range(test_embeds.shape[1])])
  test_df['user_id'] = [x['user_id'] for x in iter(iterable_test)]
  test_df = test_df.merge(df_target, how='left', on='user_id')
  train_df.dropna(inplace=True)
  test_df.dropna(inplace=True)
  embed_columns = [x for x in train_df.columns if x.startswith('embed')]
  x_train, y_train = train_df[embed_columns], train_df['target']
  x_test, y_test = test_df[embed_columns], test_df['target']
  CatBoostModel = CatBoostClassifier(
  iterations= 500,
  learning_rate = 0.05,
  use_best_model = True,
  eval_metric ='AUC', 
  loss_function='Logloss',
  random_seed = 42,
  logging_level = 'Silent',
  depth = 5)
  CatBoostModel.fit(
    x_train, y_train,
    eval_set=(x_test, y_test),
    plot=True
#     logging_level='Verbose',  # you can uncomment this for text output
  )
  y_pred = CatBoostModel.predict(x_test)
  y_proba = CatBoostModel.predict_proba(x_test)
  print(f'''{encoder} accuracy: {CatBoostModel.score(x_test, y_test)} 
      f1: {f1_score(y_pred, y_test)}, 
      precision: {precision_score(y_pred, y_test)}
     roc auc : {roc_auc_score(y_test, y_proba[:,1])}''')

In [None]:
#gru
print(f'''accuracy: {CatBoostModel.score(x_test, y_test)} 
      f1: {f1_score(y_pred, y_test)}, 
      precision: {precision_score(y_pred, y_test)}
     roc auc : {roc_auc_score(y_test, y_proba[:,1])}''')

accuracy: 0.7145049884881044 
      f1: 0.8334825425246195, 
      precision: 1.0
     roc auc : 0.5978396451959391


In [None]:
#lstm
print(f'''accuracy: {CatBoostModel.score(x_test, y_test)} 
      f1: {f1_score(y_pred, y_test)}, 
      precision: {precision_score(y_pred, y_test)}
     roc auc : {roc_auc_score(y_test, y_proba[:,1])}''')

accuracy: 0.7145049884881044 
      f1: 0.8334825425246195, 
      precision: 1.0
     roc auc : 0.5765825854959981


# Samplings

In [None]:
%%time
import torch
from functools import partial
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.frames.coles import CoLESModule
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices, SplitRandom, SampleRandom
from ptls.frames import PtlsDataModule
import torch
import pytorch_lightning as pl
from ptls.data_load.datasets import inference_data_loader
import logging
import pandas as pd
from catboost import CatBoostClassifier, metrics
from sklearn.metrics import roc_auc_score, precision_score, accuracy_score, recall_score, f1_score, roc_auc_score


for sampling in [SampleSlices, SplitRandom, SampleRandom]:
  trx_encoder_params = dict(
    embeddings_noise=0.003,
    numeric_values={},
    embeddings={
        'event_time': {'in': 800, 'out': 16},
        'cat_id': {'in': 410, 'out': 16},
      },
  )

  seq_encoder = RnnSeqEncoder(
      trx_encoder=TrxEncoder(**trx_encoder_params),
      hidden_size=256,
      type='gru',
  )

  model = CoLESModule(
      seq_encoder=seq_encoder,
      optimizer_partial=partial(torch.optim.Adam, lr=0.001),
      lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=30, gamma=0.9),
  )
  train_dl = PtlsDataModule(
    train_data=ColesDataset(
        map_processed_train,
        splitter=sampling(
            split_count=5,
            cnt_min=25,
            cnt_max=200,
        ),
    ),
    train_num_workers=2,
    train_batch_size=256,
  )
  trainer = pl.Trainer(
    max_epochs=15,
    gpus=1 if torch.cuda.is_available() else 0,
    enable_progress_bar=False,
  )


  print(f'logger.version = {trainer.logger.version}')
  trainer.fit(model, train_dl)
  print(trainer.logged_metrics)
  iterable_test = ParquetDataset(ParquetFiles('test.parquet'))

  train_dl = inference_data_loader(list(iter(iterable_train)), num_workers=0, batch_size=256)

  train_embeds = torch.vstack(trainer.predict(model, train_dl, ))
  train_embeds = torch.vstack(trainer.predict(model, train_dl, ))

  test_dl = inference_data_loader(list(iter(iterable_test)), num_workers=0, batch_size=256)
  test_embeds = torch.vstack(trainer.predict(model, test_dl))
  df_target = pd.read_csv('target_dataset_matched.csv')
  df_target.rename(columns={'rtk':'user_id'},inplace=True)
  df_target = df_target.set_index('user_id')
  df_target.rename(columns={"higher_education": "target"}, inplace=True)

  train_df = pd.DataFrame(data=train_embeds, columns=[f'embed_{i}' for i in range(train_embeds.shape[1])])
  train_df['user_id'] = [x['user_id'] for x in iter(iterable_train)]
  train_df = train_df.merge(df_target, how='left', on='user_id')

  test_df = pd.DataFrame(data=test_embeds, columns=[f'embed_{i}' for i in range(test_embeds.shape[1])])
  test_df['user_id'] = [x['user_id'] for x in iter(iterable_test)]
  test_df = test_df.merge(df_target, how='left', on='user_id')
  train_df.dropna(inplace=True)
  test_df.dropna(inplace=True)
  embed_columns = [x for x in train_df.columns if x.startswith('embed')]
  x_train, y_train = train_df[embed_columns], train_df['target']
  x_test, y_test = test_df[embed_columns], test_df['target']
  CatBoostModel = CatBoostClassifier(
  iterations= 500,
  learning_rate = 0.05,
  use_best_model = True,
  eval_metric ='AUC', 
  loss_function='Logloss',
  random_seed = 42,
  logging_level = 'Silent',
  depth = 5)
  CatBoostModel.fit(
    x_train, y_train,
    eval_set=(x_test, y_test),
    plot=True
#     logging_level='Verbose',  # you can uncomment this for text output
  )
  y_pred = CatBoostModel.predict(x_test)
  y_proba = CatBoostModel.predict_proba(x_test)
  print(f'''{sampling.__class__.__name__} accuracy: {CatBoostModel.score(x_test, y_test)} 
      f1: {f1_score(y_pred, y_test)}, 
      precision: {precision_score(y_pred, y_test)}
     roc auc : {roc_auc_score(y_test, y_proba[:,1])}''')

In [None]:
#SplitRandom
print(f'''accuracy: {CatBoostModel.score(x_test, y_test)} 
      f1: {f1_score(y_pred, y_test)}, 
      precision: {precision_score(y_pred, y_test)}
     roc auc : {roc_auc_score(y_test, y_proba[:,1])}''')

accuracy: 0.7145049884881044 
      f1: 0.8331838565022421, 
      precision: 0.9978517722878625
     roc auc : 0.6003199242345496


In [None]:
#SampleRandom
print(f'''accuracy: {CatBoostModel.score(x_test, y_test)} 
      f1: {f1_score(y_pred, y_test)}, 
      precision: {precision_score(y_pred, y_test)}
     roc auc : {roc_auc_score(y_test, y_proba[:,1])}''')

accuracy: 0.7145049884881044 
      f1: 0.8334825425246195, 
      precision: 1.0
     roc auc : 0.5902140142984189


In [2]:
import pandas as pd

In [1]:
transactions = {'SampleSlices':0.7605643290172892, 'SplitRandom':0.7493132999027019, 'SampleRandom':0.7678373624728688}
clickstream = {'SampleSlices':0.5978396451959391, 'SplitRandom':0.6003199242345496, 'SampleRandom':0.5902140142984189}

In [2]:
tr = pd.DataFrame(clickstream, index = ['clickstream'])
tr1 = pd.DataFrame(transactions, index = ['transactions'])
roc_auc = pd.concat([tr1, tr]).transpose()

NameError: ignored

In [None]:
roc_auc

In [None]:
clickstream = {'gru':0.5978396451959391, 'lstm': 0.5765825854959981}
transactions = {'gru': 0.7605643290172892, 'lstm' : 0.7527879649726817}

In [None]:
tr = pd.DataFrame(clickstream, index = ['clickstream'])
tr1 = pd.DataFrame(transactions, index = ['transactions'])
roc_auc = pd.concat([tr1, tr]).transpose()

In [None]:
roc_auc

Unnamed: 0,transactions,clickstream
SampleSlices,0.760564,0.59784
SplitRandom,0.749313,0.60032
SampleRandom,0.767837,0.590214
