# <center> DataFusion 2024 (Задача "Прогнозирование оттока") </center>
## <center> Ноутбук для генерации эмбеддингов `PyTorchLifestream` на данных транзакций </center>

In [None]:
# Install the notebook's third-party dependencies (quiet mode): pytorch-lifestream and duckdb.
!pip install pytorch-lifestream duckdb -q

In [67]:
import pandas as pd
import numpy as np

# Input files are expected in the current working directory.
transactions_path = 'transactions.csv'
clients_path = 'clients.csv'

# Per-transaction records; the timestamp column is parsed into datetimes on load.
df_trx = pd.read_csv(transactions_path, parse_dates=['transaction_dttm'])
# Per-client attributes, one table keyed by user_id.
clients = pd.read_csv(clients_path)

# Last expression of the cell: display the clients table.
clients

Unnamed: 0,user_id,report,employee_count_nm,bankemplstatus,customer_age
0,3,2,ОТ 101 ДО 500,0,3
1,9,1,БОЛЕЕ 1001,0,3
2,13,6,ОТ 501 ДО 1000,0,2
3,37,5,БОЛЕЕ 1001,0,2
4,41,1,ОТ 101 ДО 500,0,2
...,...,...,...,...,...
95995,562043,12,,0,2
95996,562205,12,,0,1
95997,562312,12,,0,0
95998,562721,12,,0,2


In [68]:
# Attach the client attributes to every transaction row. A left join keeps
# all transactions, including those whose user_id is absent from `clients`.
df_trx = pd.merge(df_trx, clients, how="left", on="user_id")

# Display the enriched transaction table.
df_trx

Unnamed: 0,user_id,mcc_code,currency_rk,transaction_amt,transaction_dttm,report,employee_count_nm,bankemplstatus,customer_age
0,3,3,1,-183.883957,2022-01-28 12:05:33,2,ОТ 101 ДО 500,0,3
1,3,3,1,-3206.437012,2022-01-28 12:52:30,2,ОТ 101 ДО 500,0,3
2,3,16,1,-153866.890625,2022-02-16 14:45:56,2,ОТ 101 ДО 500,0,3
3,3,56,1,-15144.601562,2022-03-09 19:58:29,2,ОТ 101 ДО 500,0,3
4,3,0,1,5297.908691,2022-03-12 18:11:31,2,ОТ 101 ДО 500,0,3
...,...,...,...,...,...,...,...,...,...
13075018,562740,155,1,-2484.366211,2023-03-20 11:52:09,12,,0,0
13075019,562740,9,1,-187.658463,2023-03-20 12:10:22,12,,0,0
13075020,562740,1,1,-891.933350,2023-03-20 15:53:37,12,,0,0
13075021,562740,13,1,-464.467316,2023-03-20 15:54:49,12,,0,0


In [69]:
from ptls.preprocessing import PandasDataPreprocessor

# Column roles handed to the preprocessor. The client-level attributes merged
# in earlier are treated as categorical features alongside the per-transaction
# codes; the amount is the only numerical feature.
categorical_columns = [
    'mcc_code',
    'currency_rk',
    'employee_count_nm',
    'bankemplstatus',
    'customer_age',
    'report',
]
numerical_columns = ['transaction_amt']

# Configure the preprocessor: sequences are keyed by user_id and the event
# time column is transformed with 'dt_to_timestamp'.
preprocessor = PandasDataPreprocessor(
    col_id='user_id',
    col_event_time='transaction_dttm',
    event_time_transformation='dt_to_timestamp',
    cols_category=categorical_columns,
    cols_numerical=numerical_columns,
)

# Fit the preprocessor on the merged frame and transform it in one pass.
data = preprocessor.fit_transform(df_trx)

In [70]:
# Show the dictionary size the fitted preprocessor reports for each
# categorical column; these are the values used as the embedding 'in' sizes.
preprocessor.get_category_dictionary_sizes()

{'mcc_code': 334,
 'currency_rk': 6,
 'employee_count_nm': 12,
 'bankemplstatus': 4,
 'customer_age': 6,
 'report': 14}

In [71]:
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.frames.coles import CoLESModule
import torch
import torchmetrics
from functools import partial

# Transaction encoder.
# The embedding vocabulary sizes ('in') are taken from the fitted preprocessor
# instead of being hard-coded, so they stay in sync with the data if the
# input files or the categorical column list change.
category_sizes = preprocessor.get_category_dictionary_sizes()

# Output width of the embedding for each categorical feature. The key order
# is kept as in the original configuration.
embedding_dims = {
    'mcc_code': 16,
    'currency_rk': 1,
    'employee_count_nm': 1,
    'bankemplstatus': 1,
    'customer_age': 1,
    'report': 1,
}

trx_encoder = TrxEncoder(
    embeddings_noise=0.003,
    embeddings={
        name: {'in': category_sizes[name], 'out': out_dim}
        for name, out_dim in embedding_dims.items()
    },
    # NOTE(review): 'identity' passes the raw amount through; ptls appears to
    # offer other scalers (e.g. 'log') -- consider one if training is unstable.
    numeric_values={'transaction_amt': 'identity'},
)

# Sequence encoder: a GRU over the encoded transactions.
seq_encoder = RnnSeqEncoder(
    trx_encoder=trx_encoder,
    hidden_size=128,  # Dimension of the generated embeddings
    type='gru',
)

# CoLES module: Adam (lr=0.001) with a StepLR schedule (step_size=3, gamma=0.9).
coles_module = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    lr_scheduler_partial=partial(
        torch.optim.lr_scheduler.StepLR, step_size=3, gamma=0.9,
    ),
)



In [72]:
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule

# Source dataset over the preprocessed records, with a sequence-length filter
# configured with min_seq_len=25.
filtered_sequences = MemoryMapDataset(
    data=data,
    i_filters=[SeqLenFilter(min_seq_len=25)],
)

# Slice sampler for CoLES: split_count=5 with slice lengths bounded by
# cnt_min=25 and cnt_max=200.
slice_sampler = SampleSlices(split_count=5, cnt_min=25, cnt_max=200)

# CoLES training dataset built from the filtered sequences and the sampler.
coles_train_data = ColesDataset(filtered_sequences, splitter=slice_sampler)

# Data module that serves the training set in batches of 256 with 16 workers.
train_dl = PtlsDataModule(
    train_data=coles_train_data,
    train_num_workers=16,
    train_batch_size=256,
)


In [77]:
import pytorch_lightning as pl

# Define the PyTorch Lightning trainer.
# The device is selected through `accelerator`/`devices` rather than the
# deprecated `gpus=` argument (removed in newer Lightning releases). The
# choice is unchanged: one GPU when CUDA is available, otherwise the CPU.
use_gpu = torch.cuda.is_available()
trainer = pl.Trainer(
    max_epochs=9,
    accelerator='gpu' if use_gpu else 'cpu',
    devices=1,
)

# Start the training process.
trainer.fit(coles_module, train_dl)

GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(

  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0     
1 | _seq_encoder       | RnnSeqEncoder   | 63.9 K
2 | _validation_metric | AUC             | 0     
3 | _head              | Head            | 0     
-------------------------------------------------------
63.9 K    Trainable params
0         Non-trainable params
63.9 K    Total params
0.256     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

In [78]:
from ptls.data_load.datasets import inference_data_loader

# Switch the module to evaluation mode before producing embeddings.
coles_module.eval()

# NOTE: this rebinds `train_dl` from the training data module to an inference
# loader over all preprocessed records (no sequence-length filter applied here).
train_dl = inference_data_loader(data, num_workers=16, batch_size=256)

# `predict` returns one tensor per batch; stack them row-wise into one matrix.
batch_embeddings = trainer.predict(coles_module, train_dl)
train_embeds = torch.vstack(batch_embeddings)

# Display the shape of the stacked embedding matrix.
train_embeds.shape

Predicting: 316it [00:00, ?it/s]

torch.Size([96000, 128])

In [79]:
import pandas as pd

# Build the embeddings table with one row per record in `data` and one
# `embed_<i>` column per embedding dimension.
# The tensor is converted to a NumPy array explicitly so the result does not
# depend on how the installed pandas version coerces torch tensors.
embed_matrix = train_embeds.detach().cpu().numpy()
embed_columns = [f'embed_{i}' for i in range(embed_matrix.shape[1])]
train_df = pd.DataFrame(data=embed_matrix, columns=embed_columns)

# Rows are matched to records by position, so this assumes the inference
# loader yields records in the same order as `data` -- TODO confirm.
train_df['user_id'] = [record['user_id'] for record in data]

In [80]:
# Display the embeddings table (embed_* columns plus user_id).
train_df

Unnamed: 0,embed_0,embed_1,embed_2,embed_3,embed_4,embed_5,embed_6,embed_7,embed_8,embed_9,...,embed_119,embed_120,embed_121,embed_122,embed_123,embed_124,embed_125,embed_126,embed_127,user_id
0,-0.165534,0.351978,-0.350839,0.106836,0.293595,0.044469,0.414537,-0.166538,-0.351249,-0.310289,...,0.375856,0.090394,-0.129844,0.012284,0.578882,-0.004445,-0.658017,0.061427,0.675077,3
1,-0.063022,0.361478,-0.826556,-0.154087,0.046106,-0.072741,-0.035444,-0.654632,-0.685090,-0.128401,...,-0.448710,-0.232319,-0.431142,-0.145322,0.315182,-0.321269,-0.196726,-0.116659,0.896083,9
2,-0.246840,0.107814,0.341888,0.161511,0.088281,0.089849,0.682158,-0.471609,0.118671,-0.584602,...,0.036044,0.120221,-0.015794,-0.071148,0.504213,-0.205995,-0.490473,-0.324970,0.438195,13
3,-0.145627,0.025706,-0.542809,-0.197882,0.015115,-0.047794,0.429975,-0.059663,0.034127,0.237009,...,-0.432880,0.141057,-0.217942,0.160477,0.148012,0.019053,-0.345193,-0.098770,-0.708134,37
4,-0.083140,0.015028,-0.032059,0.202491,-0.145457,0.029561,-0.077098,-0.415457,-0.523322,-0.059340,...,-0.005664,0.235670,0.187565,-0.088271,-0.452673,-0.202955,-0.094149,-0.449056,0.548946,41
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95995,0.478074,0.803868,-0.467045,0.131932,0.433364,0.537462,0.540662,-0.700927,-0.933288,0.189213,...,-0.806375,-0.174554,-0.392843,-0.824385,-0.646314,-0.652553,-0.492571,-0.632376,-0.653894,562043
95996,0.118339,0.190163,-0.245375,0.118228,-0.038708,-0.152138,-0.056908,-0.575353,-0.462584,-0.251028,...,-0.271022,0.390187,-0.135735,-0.057492,-0.649834,-0.057388,0.319549,-0.345252,-0.807679,562205
95997,-0.113980,0.175626,-0.478361,-0.131604,-0.549385,0.264485,0.002883,-0.219144,0.358269,0.355836,...,-0.302204,0.144307,-0.205677,0.072926,0.051741,-0.072239,-0.575386,-0.004674,-0.442141,562312
95998,-0.226445,0.447655,-0.763607,-0.132659,0.000008,0.064495,0.594851,-0.507955,0.230716,-0.430516,...,-0.386195,0.078051,-0.317041,0.402451,0.524454,0.377779,-0.652732,0.119024,-0.200524,562721


In [81]:
# Write the embeddings table to 'ptls4.csv' without the DataFrame index column.
train_df.to_csv('ptls4.csv', index=False)