## Библиотеки

In [1]:
from catboost import CatBoostClassifier, Pool
import numpy as np
import pandas as pd
import warnings

In [2]:
# Silence every warning for the rest of the session so that cell outputs stay clean.
# NOTE(review): this also hides deprecation and dtype warnings from pandas/catboost.
warnings.filterwarnings('ignore')

## Загрузка данных

Для обучения возьмём данные для задачи "Атака" и их атакованную версию

In [3]:
# Original transactions and their attacked copy, used to build the training set.
# Both files are read with `transaction_dttm` parsed as datetime, so the two frames
# are sorted by -- and later compared in make_df on -- the same dtype instead of
# mixing raw strings with timestamps.
# The stable sort keeps the file's row order among rows with equal sort keys.
init_train = (
    pd.read_csv('./data/sample_submission.csv', parse_dates=['transaction_dttm'])
    .sort_values(['user_id', 'transaction_dttm'], kind='stable')
)
changed_train = (
    pd.read_csv('./data/submit.csv', parse_dates=['transaction_dttm'])
    .sort_values(['user_id', 'transaction_dttm'], kind='stable')
)
init_train

Unnamed: 0,user_id,mcc_code,currency_rk,transaction_amt,transaction_dttm
0,66,5812,48,-1842.949000,2021-05-17 11:30:42
1,66,4112,48,-283.283720,2021-05-17 12:59:02
2,66,5814,48,-73.820390,2021-05-17 14:04:09
3,66,4111,48,10.971557,2021-05-17 15:03:10
4,66,7991,48,-214.796420,2021-05-17 15:33:43
...,...,...,...,...,...
1259995,868873,5533,48,-458.109830,2021-06-27 09:25:53
1259996,868873,5411,48,-257.352540,2021-06-29 07:00:31
1259997,868873,5921,48,-446.213780,2021-06-30 02:26:42
1259998,868873,5411,48,-261.241580,2021-06-30 11:34:35


Валидировать будем на данных для дообучения и их атакованной версии

In [4]:
# Fine-tuning transactions and their attacked copy, used as the validation set.
# Both files are read with `transaction_dttm` parsed as datetime, so the two frames
# are sorted by -- and later compared in make_df on -- the same dtype instead of
# mixing raw strings with timestamps.
# The stable sort keeps the file's row order among rows with equal sort keys.
init_val = (
    pd.read_csv('./data/transactions_finetune.csv', parse_dates=['transaction_dttm'])
    .sort_values(['user_id', 'transaction_dttm'], kind='stable')
)
changed_val = (
    pd.read_csv('./data/transactions_finetune_attacked.csv', parse_dates=['transaction_dttm'])
    .sort_values(['user_id', 'transaction_dttm'], kind='stable')
)

## Создание датасета для обучения модели

Таргетом будет: "поменялась ли транзакция?"

In [5]:
def make_df(init, changed):
    """Build the feature frame and the binary target for the change detector.

    All features are computed from ``changed`` (the possibly attacked
    transactions); ``init`` is used only to derive the target.

    Parameters
    ----------
    init : pandas.DataFrame
        Original transactions. Compared element-wise with ``changed`` via
        ``!=``, so it is expected to carry the same index and columns.
    changed : pandas.DataFrame
        Transactions after the attack. The columns ``user_id``, ``mcc_code``,
        ``currency_rk``, ``transaction_amt`` and ``transaction_dttm`` are read;
        ``transaction_dttm`` is accessed through ``.dt`` and therefore has to
        be datetime-like.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.Series)
        The feature frame, and a boolean Series that is True for every row in
        which at least one column differs between ``init`` and ``changed``.
    """
    when = changed['transaction_dttm'].dt
    by_user_code = changed.groupby(['user_id', 'mcc_code'])
    amt_by_user_code = by_user_code['transaction_amt']

    columns = {
        # the transaction itself
        'transaction_amt': changed['transaction_amt'],
        'mcc_code': changed['mcc_code'],
        'currency_rk': changed['currency_rk'],
        # calendar parts of the timestamp
        'year': when.year,
        'month': when.month,
        'day': when.day,
        'hour': when.hour,
        # aggregates shared by all rows of one (user, MCC code) pair
        'amount_of_code': by_user_code['transaction_dttm'].transform('count'),
        'mean_amt_of_code': amt_by_user_code.transform('mean'),
        'median_amt_of_code': amt_by_user_code.transform('median'),
    }

    # Insert the columns one by one into an empty frame, in the order listed above.
    features = pd.DataFrame()
    for column_name, values in columns.items():
        features[column_name] = values

    differs = init != changed
    return features, differs.any(axis=1)

In [6]:
# Build the feature matrix and the "was this transaction changed?" target
# for the training pair and for the validation pair of frames.
X_train, y_train = make_df(init_train, changed_train)
X_val, y_val = make_df(init_val, changed_val)

# Display the training features as a visual sanity check.
X_train

Unnamed: 0,transaction_amt,mcc_code,currency_rk,year,month,day,hour,amount_of_code,mean_amt_of_code,median_amt_of_code
0,-1842.949000,5812,48,2021,5,17,11,6,-1143.749900,-1204.644750
1,-283.283720,4112,48,2021,5,17,12,6,-694.887507,-788.965030
2,-73.820390,5814,48,2021,5,17,14,54,-379.916185,-270.524305
3,10.971557,4111,48,2021,5,17,15,22,-35.465566,-16.682811
4,-3614.685300,6012,48,2021,5,17,15,5,-2939.488860,-2900.271700
...,...,...,...,...,...,...,...,...,...,...
1259995,-458.109830,5533,48,2021,6,27,9,95,-521.173062,-428.515440
1259996,-3140.378400,6012,48,2021,6,29,7,17,-3426.191951,-2647.306000
1259997,-446.213780,5921,48,2021,6,30,2,14,-547.353409,-455.396000
1259998,-183.486250,6012,48,2021,6,30,11,17,-3426.191951,-2647.306000


In [7]:
# Categorical features: columns handed to the CatBoost pools as `cat_features`.
cat_features = ['mcc_code', 'currency_rk']

In [8]:
# Training pool: feature frame, boolean target and the categorical column list.
# group_id tags every row with its user_id; the source frames were sorted by
# user_id on load, so rows of one user are adjacent.
train_pool = Pool(
    data=X_train,
    label=y_train,
    cat_features=cat_features,
    group_id=init_train.user_id
)

# Validation pool, built the same way from the fine-tuning transactions.
val_pool = Pool(
    data=X_val,
    label=y_val,
    cat_features=cat_features,
    group_id=init_val.user_id
)

## Обучение модели

In [9]:
# Keyword arguments forwarded to CatBoostClassifier.
params = {
    # train on CPU
    'task_type': 'CPU',
    # primary validation metric
    'eval_metric': 'AUC',
    # extra metrics recorded during training; read back from get_evals_result() below
    'custom_metric': ['F1', 'RecallAt:top=10'],
    # number of boosting iterations
    'iterations': 250
}

In [10]:
# Fixed seed so repeated runs train the same model.
model_cb = CatBoostClassifier(**params, random_seed=56)
# Train on the training pool and evaluate on the validation pool.
# plot=True renders the interactive metric chart; verbose=False suppresses the text log.
# use_best_model=True presumably keeps the iteration with the best eval_metric on
# eval_set -- confirm against the CatBoost documentation.
model_cb.fit(train_pool, eval_set=val_pool, plot=True, verbose=False, use_best_model=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x1c5186e4e20>

In [11]:
# важность признаков
# Feature importances of the trained model, returned as a table (prettified=True).
model_cb.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,amount_of_code,30.264742
1,mcc_code,28.65127
2,transaction_amt,21.71842
3,median_amt_of_code,5.176546
4,year,4.833624
5,mean_amt_of_code,4.652465
6,month,2.567537
7,day,1.549759
8,hour,0.479879
9,currency_rk,0.105757


In [12]:
# Best value reached by each tracked metric on the validation set,
# printed in the order AUC, F1, RecallAt:top=10.
validation_history = model_cb.get_evals_result()['validation']
print(*(
    np.max(validation_history[metric_name])
    for metric_name in ('AUC', 'F1', 'RecallAt:top=10')
))

0.9260665120410632 0.4904779221773499 0.5497140390996286


In [13]:
# Take the first column of the predicted class probabilities for the validation pool.
# NOTE(review): column 0 is presumably the probability of the negative class
# (transaction NOT changed) -- confirm against model_cb.classes_ before
# using these values as sampling weights.
weights = model_cb.predict_proba(val_pool)[:,0]
weights

array([0.99814175, 0.99940945, 0.99313051, ..., 0.89231627, 0.99856633,
       0.7260762 ])

Теперь полученные веса можно использовать в

transactions.sample(..., weights=weights)