## Подготовка

Установим нужные библиотечки

In [None]:
import sys
!{sys.executable} -m pip install catboost scikit-multilearn

Импортируем их

In [None]:
import random
import numpy as np
import pandas as pd
import catboost as cb
import sklearn.utils as sku
from skmultilearn.model_selection import iterative_train_test_split
import os
import json
import pickle
from sklearn.metrics import fbeta_score, classification_report

Зафиксируем random seed

In [None]:
# Single seed used for Python's `random`, NumPy's global RNG and, further down,
# as `random_state` / `random_seed` for the shuffle and the CatBoost models.
SEED = 0xCAFEC0DE

random.seed(SEED)
np.random.seed(SEED)

Укажем пути до файлов

In [None]:
# Locations of the input CSV files (relative paths).
PAYMENTS_TRAIN_PATH = 'data/payments_train.csv'
TARGET_TRAIN_PATH = 'data/target_train.csv'
PAYMENTS_TEST_PATH = 'data/payments_test.csv'
# NOTE(review): CLIENT_ID_TEST_PATH is not referenced anywhere in the visible part of the notebook.
CLIENT_ID_TEST_PATH = 'data/client_id_test.csv'

## Загружаем датасет

In [None]:
# Column dtypes passed to pd.read_csv for the payments files; the twelve
# flag_0 .. flag_11 columns are all boolean.
payments_dtypes = {
    'client_id': str,
    'contractor_id': str,
    'is_outgoing': bool,
    'amount': 'uint64',
    'dt_day': 'uint16',
    'dt_hour': 'uint8',
    'channel': pd.CategoricalDtype(),
    **{f'flag_{flag_no}': bool for flag_no in range(12)},
}
payments_dtypes

{'amount': 'uint64',
 'channel': CategoricalDtype(categories=None, ordered=False),
 'client_id': str,
 'contractor_id': str,
 'dt_day': 'uint16',
 'dt_hour': 'uint8',
 'flag_0': bool,
 'flag_1': bool,
 'flag_10': bool,
 'flag_11': bool,
 'flag_2': bool,
 'flag_3': bool,
 'flag_4': bool,
 'flag_5': bool,
 'flag_6': bool,
 'flag_7': bool,
 'flag_8': bool,
 'flag_9': bool,
 'is_outgoing': bool}

In [None]:
# Load the training transactions, forcing the dtypes declared in payments_dtypes.
payments = pd.read_csv(PAYMENTS_TRAIN_PATH, dtype=payments_dtypes)
payments

Unnamed: 0,client_id,contractor_id,is_outgoing,amount,dt_day,dt_hour,channel,flag_0,flag_1,flag_2,flag_3,flag_4,flag_5,flag_6,flag_7,flag_8,flag_9,flag_10,flag_11
0,569703,,True,8674442,56,12,app,False,False,False,False,False,False,False,False,False,False,False,False
1,696595,3920,True,5714350,311,19,web,False,False,False,False,False,False,False,False,False,False,False,False
2,368467,,True,3720501,175,13,,False,False,False,True,False,False,False,False,False,False,False,False
3,421133,,True,311542,68,14,,False,False,False,True,False,False,False,False,False,False,False,False
4,365044,24686,True,705918747,171,15,app,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25937985,532940,,True,904019,122,1,pos,True,False,False,False,False,False,False,False,False,False,False,False
25937986,923040,,False,27261705,94,12,,False,False,False,False,False,False,False,False,False,False,False,True
25937987,807703,58960,False,18409310,119,13,,False,False,False,False,False,False,False,False,False,False,False,True
25937988,890561,,True,7132,213,8,,False,False,False,True,False,False,False,False,False,False,False,False


In [None]:
# Approximate month index: floor of the day number divided by the mean month
# length (365 / 12 = 30.41666...), stored as int16.
payments['month'] = (payments['dt_day'] // 30.41666666666667).astype('int16')

In [None]:
# Column dtypes for the target file: a string client id plus 35 integer
# label columns type_0 .. type_34.
target_dtypes = {
    'client_id': str,
    **{f'type_{type_no}': int for type_no in range(35)},
}
target_dtypes

{'client_id': str,
 'type_0': int,
 'type_1': int,
 'type_10': int,
 'type_11': int,
 'type_12': int,
 'type_13': int,
 'type_14': int,
 'type_15': int,
 'type_16': int,
 'type_17': int,
 'type_18': int,
 'type_19': int,
 'type_2': int,
 'type_20': int,
 'type_21': int,
 'type_22': int,
 'type_23': int,
 'type_24': int,
 'type_25': int,
 'type_26': int,
 'type_27': int,
 'type_28': int,
 'type_29': int,
 'type_3': int,
 'type_30': int,
 'type_31': int,
 'type_32': int,
 'type_33': int,
 'type_34': int,
 'type_4': int,
 'type_5': int,
 'type_6': int,
 'type_7': int,
 'type_8': int,
 'type_9': int}

In [None]:
# Load the multi-label targets and index the table by client_id.
target = pd.read_csv(TARGET_TRAIN_PATH, dtype=target_dtypes).set_index('client_id')
target

Unnamed: 0_level_0,type_0,type_1,type_2,type_3,type_4,type_5,type_6,type_7,type_8,type_9,...,type_25,type_26,type_27,type_28,type_29,type_30,type_31,type_32,type_33,type_34
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
775943,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
992314,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
255821,1,0,0,0,0,0,0,0,0,1,...,1,0,0,0,0,1,0,0,1,0
188791,0,0,1,0,0,0,0,0,0,1,...,1,0,0,0,0,0,1,0,0,0
46092,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20306,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
406742,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,1,0,0,0,0
242611,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
469847,0,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,1,1,0,1,0


## Feature Engineering

Сгенерируем следующие фичи:
* Статистика сумм транзакций по клиентам
* Количество транзакций по часам и месяцам
* Количество транзакций по 4 каналам
* Количество уникальных контракторов
* Количество транзакций с флагами с 0 по 11 по клиентам

Некоторые из них нормализуем (category_count / total_count)

Обернём весь feature engineering в функцию, чтобы потом переиспользовать её для генерации фичей для тестовой выборки

In [None]:
def _share_features(pay, column, prefix, expected):
    """Per-client share of transactions falling into each value of `column`.

    Rows where `column` is missing are ignored, so clients whose rows are all
    missing do not appear in the result.

    Args:
        pay: transactions table with a 'client_id' column and `column`.
        column: name of the column whose values define the buckets.
        prefix: string prepended to every bucket value to form the feature name.
        expected: bucket values that must always get a column, in output order.
            Values observed in the data but not listed here are appended after
            them instead of being dropped.

    Returns:
        (shares, totals): `shares` is a DataFrame indexed by client_id with one
        column per bucket (each row sums to 1); `totals` is a Series with the
        number of counted transactions per client.
    """
    counts = pay.groupby(['client_id', column], observed=True).size().unstack(fill_value=0)
    # Plain Python labels: avoids reindexing a CategoricalIndex / small-int Index
    # with labels it has never seen.
    counts.columns = counts.columns.tolist()
    extra = [value for value in counts.columns if value not in expected]
    # Fixed column set, so train and test tables get identical features even
    # when some bucket never occurs in one of them.
    counts = counts.reindex(columns=list(expected) + extra, fill_value=0)
    totals = counts.sum(axis=1)
    shares = counts.div(totals, axis=0)
    shares.columns = [f'{prefix}{value}' for value in shares.columns]
    return shares, totals


def generate_features(pay):
    """Build one row of features per client from the raw transactions table.

    Features, in column order:
      * mean / median / std / min / max of `amount`;
      * count_transactions, plus `adds` (share of rows with is_outgoing == True)
        and `withdraws` (1 - adds);
      * h_0 .. h_23: share of transactions per hour of day;
      * month_0 .. month_11: share of transactions per month;
      * app / atm / pos / web: share of transactions per channel among the rows
        that have a channel, and channel_count (number of such rows);
      * cnt_ctr_id: number of distinct contractors;
      * flag_0_count .. flag_11_count: number of transactions with each flag set.

    Hour, month and channel columns are always present, even if a value never
    occurs in `pay`. Missing values (e.g. std for a single transaction, channel
    shares for clients without any channel) are filled with 0.

    Args:
        pay: DataFrame with columns client_id, contractor_id, is_outgoing,
            amount, dt_hour, month, channel and flag_0 .. flag_11.

    Returns:
        DataFrame indexed by client_id.
    """
    by_client = pay.groupby('client_id')

    # Amount statistics and the outgoing / incoming split.
    fts = by_client['amount'].agg(['mean', 'median', 'std', 'min', 'max'])
    fts['count_transactions'] = by_client['is_outgoing'].count()
    # NOTE(review): `adds` is the share of rows with is_outgoing == True, which by
    # its name looks like the share of outgoing payments; the column names are
    # kept as they were so downstream code keeps working.
    fts['adds'] = by_client['is_outgoing'].mean()
    fts['withdraws'] = 1 - fts['adds']

    hour_shares, _ = _share_features(pay, 'dt_hour', 'h_', range(24))
    month_shares, _ = _share_features(pay, 'month', 'month_', range(12))
    channel_shares, channel_total = _share_features(
        pay, 'channel', '', ['app', 'atm', 'pos', 'web'])
    channel_shares['channel_count'] = channel_total

    # Number of distinct contractors (missing ids are not counted).
    contractors = by_client['contractor_id'].nunique().to_frame('cnt_ctr_id')

    # Number of transactions with each flag set.
    flag_cols = [f'flag_{i}' for i in range(12)]
    flag_counts = by_client[flag_cols].sum()
    flag_counts.columns = [f'{name}_count' for name in flag_cols]

    for part in (hour_shares, month_shares, channel_shares, contractors, flag_counts):
        fts = fts.join(part)

    return fts.fillna(0)

In [None]:
# Build the per-client feature table for the training transactions.
features = generate_features(payments)

In [None]:
# Display the resulting feature table.
features

Unnamed: 0_level_0,mean,median,std,min,max,count_transactions,adds,withdraws,h_0,h_1,...,flag_2_count,flag_3_count,flag_4_count,flag_5_count,flag_6_count,flag_7_count,flag_8_count,flag_9_count,flag_10_count,flag_11_count
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100045,8.362072e+06,1706470.0,1.733687e+07,22,124737631,477,0.788260,0.211740,0.050314,0.079665,...,60,47,5,0,0,0,5,0,0,0
100055,2.201262e+07,4010524.0,6.029931e+07,2238,766121312,651,0.791091,0.208909,0.036866,0.030722,...,42,151,68,0,0,0,71,0,0,0
100068,3.782283e+07,3232108.0,8.493012e+07,34043,681967564,519,0.880539,0.119461,0.107900,0.129094,...,22,8,18,0,0,0,19,0,0,0
100076,1.155591e+07,1522173.0,3.589745e+07,29,255864840,93,0.903226,0.096774,0.172043,0.075269,...,11,8,10,0,0,0,10,0,0,0
100089,3.065695e+07,21997923.0,4.485693e+07,1519,645257028,1584,0.416035,0.583965,0.065025,0.044823,...,52,112,5,0,0,0,6,0,0,923
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99975,5.391718e+07,1333079.0,1.590713e+08,4660,859557956,50,0.900000,0.100000,0.040000,0.060000,...,1,19,5,0,0,0,5,0,0,0
999759,2.389214e+08,1465890.0,1.122544e+09,4742,6612336710,273,0.831502,0.168498,0.087912,0.146520,...,0,13,3,0,0,0,4,0,0,0
999784,3.565689e+08,18552982.0,7.886591e+08,28594,3594312408,85,0.811765,0.188235,0.035294,0.035294,...,0,24,12,0,0,0,12,8,0,0
99979,1.119933e+07,3683703.5,1.871622e+07,24390,103498839,74,0.810811,0.189189,0.094595,0.081081,...,4,9,8,0,0,0,9,0,0,0


### Демонстрация фичей

In [None]:
# Separate copy of the transactions for the demonstration cells below.
# NOTE(review): the cells below appear to only read `pay`, so copying the whole
# table may be unnecessary — confirm before removing.
pay = payments.copy()

Количество транзакций по часам

In [None]:
# Transactions per client and hour of day, turned into shares of the client's
# total; the total itself is kept in the last column (h_summ).
x = pd.pivot_table(pay, index='client_id', columns='dt_hour', values='amount', aggfunc='count').fillna(0)
x['summs'] = x.sum(axis=1)
hour_cols = x.columns[:-1]
x[hour_cols] = x[hour_cols].div(x['summs'], axis=0)
x.columns = [f'h_{hour}' for hour in hour_cols] + ['h_summ']
x

Unnamed: 0_level_0,h_0,h_1,h_2,h_3,h_4,h_5,h_6,h_7,h_8,h_9,...,h_15,h_16,h_17,h_18,h_19,h_20,h_21,h_22,h_23,h_summ
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100045,0.050314,0.079665,0.094340,0.092243,0.075472,0.023061,0.016771,0.031447,0.037736,0.054507,...,0.050314,0.054507,0.052411,0.029350,0.020964,0.014675,0.002096,0.006289,0.006289,477.0
100055,0.036866,0.030722,0.027650,0.046083,0.036866,0.015361,0.021505,0.027650,0.047619,0.064516,...,0.081413,0.090630,0.010753,0.018433,0.010753,0.004608,0.000000,0.003072,0.007680,651.0
100068,0.107900,0.129094,0.125241,0.140655,0.111753,0.053950,0.023121,0.023121,0.026975,0.019268,...,0.034682,0.034682,0.011561,0.005780,0.001927,0.009634,0.001927,0.003854,0.005780,519.0
100076,0.172043,0.075269,0.129032,0.118280,0.064516,0.021505,0.000000,0.021505,0.010753,0.021505,...,0.032258,0.053763,0.043011,0.021505,0.021505,0.010753,0.000000,0.010753,0.010753,93.0
100089,0.065025,0.044823,0.032828,0.029672,0.018939,0.003157,0.003788,0.016414,0.037247,0.048611,...,0.039141,0.039773,0.020202,0.017677,0.051136,0.066919,0.085227,0.092172,0.086490,1584.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99975,0.040000,0.060000,0.040000,0.040000,0.000000,0.000000,0.000000,0.000000,0.020000,0.000000,...,0.100000,0.040000,0.020000,0.020000,0.020000,0.040000,0.060000,0.000000,0.000000,50.0
999759,0.087912,0.146520,0.142857,0.150183,0.091575,0.021978,0.014652,0.021978,0.010989,0.003663,...,0.021978,0.036630,0.010989,0.000000,0.014652,0.007326,0.000000,0.018315,0.036630,273.0
999784,0.035294,0.035294,0.035294,0.011765,0.070588,0.058824,0.094118,0.070588,0.117647,0.129412,...,0.058824,0.011765,0.011765,0.011765,0.023529,0.011765,0.000000,0.011765,0.035294,85.0
99979,0.094595,0.081081,0.121622,0.081081,0.148649,0.000000,0.000000,0.013514,0.000000,0.013514,...,0.027027,0.040541,0.067568,0.027027,0.027027,0.013514,0.054054,0.054054,0.040541,74.0


Количество транзакций по месяцам

In [None]:
# Transactions per client and month, turned into shares of the client's total;
# the total itself is kept in the last column (month_summ).
x = pd.pivot_table(pay, index='client_id', columns='month', values='amount', aggfunc='count').fillna(0)
x['summs'] = x.sum(axis=1)
month_cols = x.columns[:-1]
x[month_cols] = x[month_cols].div(x['summs'], axis=0)
x.columns = [f'month_{month}' for month in month_cols] + ['month_summ']
x

Unnamed: 0_level_0,month_0,month_1,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_summ
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
100045,0.073375,0.106918,0.106918,0.031447,0.014675,0.008386,0.027254,0.075472,0.111111,0.129979,0.180294,0.134172,477.0
100055,0.089094,0.099846,0.104455,0.069124,0.049155,0.081413,0.059908,0.102919,0.052227,0.069124,0.124424,0.098310,651.0
100068,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.077071,0.183044,0.177264,0.262042,0.152216,0.148362,519.0
100076,0.247312,0.247312,0.204301,0.139785,0.161290,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,93.0
100089,0.069444,0.063763,0.080808,0.054293,0.062500,0.054924,0.053030,0.099116,0.118687,0.119949,0.111742,0.111742,1584.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99975,0.000000,0.480000,0.120000,0.060000,0.340000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,50.0
999759,0.069597,0.087912,0.058608,0.084249,0.058608,0.040293,0.080586,0.084249,0.076923,0.098901,0.139194,0.120879,273.0
999784,0.058824,0.023529,0.200000,0.152941,0.094118,0.105882,0.082353,0.105882,0.082353,0.070588,0.023529,0.000000,85.0
99979,0.040541,0.040541,0.094595,0.108108,0.135135,0.067568,0.013514,0.000000,0.135135,0.229730,0.027027,0.108108,74.0


Количество транзакций по 4 каналам

In [None]:
# Transactions per client and channel (rows without a channel are not counted),
# turned into shares of the per-client total.
x = pd.pivot_table(pay, index='client_id', columns='channel', values = 'amount', aggfunc = 'count').fillna(0)
x['summs'] = x.sum(axis=1)
for i in x.columns[:-1]:
    x[i] /= x['summs']
# NOTE(review): the columns are renamed by position, which assumes the pivot
# produced exactly four channel columns in the order app, atm, pos, web.
x.columns = ['app',	'atm',	'pos',	'web'] + ['channel_count']
x

Unnamed: 0_level_0,app,atm,pos,web,channel_count
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
100045,0.005181,0.279793,0.626943,0.088083,193
100055,0.000000,0.235119,0.294643,0.470238,336
100068,0.094148,0.055980,0.814249,0.035623,393
100076,0.122449,0.204082,0.530612,0.142857,49
100089,0.581340,0.148325,0.220096,0.050239,418
...,...,...,...,...,...
99975,0.333333,0.666667,0.000000,0.000000,3
999759,0.000000,0.000000,0.918367,0.081633,147
999784,0.046512,0.000000,0.000000,0.953488,43
99979,0.021277,0.085106,0.638298,0.255319,47


Количество уникальных контракторов

In [None]:
# Number of distinct contractors per client, as a one-column table.
x = pay.groupby('client_id')['contractor_id'].nunique().to_frame('cnt_ctr_id')
x

Unnamed: 0_level_0,cnt_ctr_id
client_id,Unnamed: 1_level_1
100045,14
100055,42
100068,21
100076,9
100089,22
...,...
99975,4
999759,9
999784,14
99979,7


## Разбиение датасета на train / val

Реализуем функцию для разбиения датасета на обучающую и валидационную выборки в пропорции 85%/15% соответственно, примерно сохраняя распределение таргетов в обеих выборках. Из соображений быстродействия и детерминированности будем кешировать разбиение в JSON-файл.

Для стратификации используется функция [iterative_train_test_split](http://scikit.ml/_modules/skmultilearn/model_selection/iterative_stratification.html#iterative_train_test_split) из библиотечки scikit-multilearn.

In [None]:
def stratified_split_cached(X, y, split_idx_file):
    """Split X / y into train and validation parts, caching the split on disk.

    If `split_idx_file` exists, the train / validation index labels are read
    from it (JSON with keys 'train' and 'val'). Otherwise `y` is shuffled with
    the global SEED, split 85% / 15% with iterative_train_test_split, and the
    resulting index labels are written to `split_idx_file`.

    Args:
        X: feature table indexed by the same labels as `y`.
        y: multi-label target table.
        split_idx_file: path of the JSON cache file.

    Returns:
        Tuple (X_train, y_train, X_val, y_val).
    """
    if not os.path.isfile(split_idx_file):
        shuffled = sku.shuffle(y, random_state=SEED)
        # The index labels play the role of the "features" being split, so the
        # splitter hands back the labels belonging to each part.
        ids = np.expand_dims(shuffled.index, 1)
        labels = np.array(shuffled)
        train_idx, _, val_idx, _ = iterative_train_test_split(ids, labels, test_size=0.15)
        train_idx = train_idx.squeeze(1)
        val_idx = val_idx.squeeze(1)
        with open(split_idx_file, 'w') as f:
            json.dump({'train': list(train_idx), 'val': list(val_idx)}, f)
    else:
        with open(split_idx_file, 'r') as f:
            cached = json.load(f)
        train_idx = cached['train']
        val_idx = cached['val']

    return X.loc[train_idx], y.loc[train_idx], X.loc[val_idx], y.loc[val_idx]

In [None]:
# Split features / targets into train and validation parts (cached in split_cache.json).
X_train, y_train, X_val, y_val = stratified_split_cached(features, target, 'split_cache.json')

In [None]:
# Sanity check: features and targets must have matching sizes in each part.
len(X_train), len(y_train), len(X_val), len(y_val)

(60378, 60378, 6627, 6627)

In [None]:
def make_pool(X, y=None):
    """Wrap features (and, optionally, labels) into a CatBoost Pool."""
    pool = cb.Pool(data=X, label=y)
    return pool

## Моделирование

Будем использовать 35 CatBoostClassifier'ов, по одному на каждый род деятельности.

In [None]:
# Train one binary CatBoost classifier per target column type_0 .. type_34.
models = []
for i in range(35):
    print('Fitting model', i)

    label = f'type_{i}'
    pool_train = make_pool(X_train, y_train[label])
    pool_val = make_pool(X_val, y_val[label])

    model = cb.CatBoostClassifier(
        iterations=2800,
        loss_function='Logloss',
        random_seed=SEED,
        eval_metric='F:beta=0.5',
        bootstrap_type='Bayesian',
        boost_from_average=True,
        depth=8,
        l2_leaf_reg=5,
        task_type='GPU',
    )
    # The validation pool is passed as eval_set, so the eval metric is tracked on it.
    model.fit(pool_train, eval_set=pool_val, plot=False, verbose=100)

    models.append(model)

Grid search для нахождения оптимальных параметров модели

In [None]:
# Grid search over tree depth and L2 regularisation, run for the first target
# (type_0) only.
# NOTE(review): unlike the training loop, no task_type is passed here.
model = cb.CatBoostClassifier(iterations=2000, loss_function='Logloss', random_seed=SEED, eval_metric='F:beta=0.5', bootstrap_type='Bayesian', boost_from_average=True)

grid = {
        'depth': [6, 7, 8, 9],
        'l2_leaf_reg': [3, 5, 7, 9, 12]
        }
        
grid_search_result = model.grid_search(grid, 
                                   X=X_train,
                                   y=y_train['type_0'])
print(grid_search_result)

Изменение порогов (threshold) обученных моделей

In [None]:
# Map each model's best validation F0.5 score to the model's position in `models`.
# NOTE(review): the dict is keyed by the score itself, so models that share the
# same score overwrite each other and only the last of them is kept.
scores = {}
for model in models:
    scores[(model.get_best_score()['validation']['F:beta=0.5'])] = models.index(model)
# Distinct scores, best first.
list_of_scores = list(scores.keys())
list_of_scores.sort(reverse=True)
list_of_scores

[0.6098336335699904,
 0.5772773797338792,
 0.494148244473342,
 0.42787825319805917,
 0.4095760023074705,
 0.3934987168520103,
 0.3373015873015873,
 0.28935185185185186,
 0.25296017222820233,
 0.17635843660629172,
 0.1723076923076923,
 0.15683814303638646,
 0.13440860215053763,
 0.12734584450402142,
 0.1273344651952462,
 0.08771929824561404,
 0.08291873963515754,
 0.08002560819462227,
 0.07468879668049792,
 0.07154213036565978,
 0.040540540540540536,
 0.03910614525139665,
 0.029655990510083035,
 0.025826446280991736,
 0.02525252525252525,
 0.023841961852861037,
 0.007936507936507936,
 0.005128205128205128,
 0.0]

In [None]:
# Choose the decision threshold of every model from its own best validation
# F0.5 score. The score is read per model (not through a dict keyed by score),
# so models that happen to share the same score all get their threshold.
for model in models:
    best_f = model.get_best_score()['validation']['F:beta=0.5']
    if best_f > 0.5:
        # Strong models: lower threshold.
        threshold = 0.2
    elif 0.1 < best_f < 0.5:
        threshold = 0.3
    elif best_f < 0.001:
        # Models with a (near) zero score: predict positive only when almost certain.
        threshold = 0.95
    else:
        # Everything else (including scores of exactly 0.5 and 0.001 .. 0.1)
        # keeps the default threshold.
        threshold = 0.5
    model.set_probability_threshold(threshold)

In [None]:
# Collect the decision threshold currently set on each model, in model order.
probabilities = [fitted.get_probability_threshold() for fitted in models]
probabilities

[0.2,
 0.5,
 0.3,
 0.5,
 0.5,
 0.3,
 0.5,
 0.5,
 0.5,
 0.3,
 0.5,
 0.3,
 0.3,
 0.3,
 0.5,
 0.3,
 0.3,
 0.3,
 0.3,
 0.5,
 0.3,
 0.5,
 0.3,
 0.5,
 0.5,
 0.5,
 0.3,
 0.5,
 0.5,
 0.5,
 0.2,
 0.5,
 0.5,
 0.5,
 0.95]

In [None]:
# Inspect the full parameter set of the first trained model.
model = models[0]
model.get_all_params()

{'add_ridge_penalty_to_loss_function': False,
 'auto_class_weights': 'None',
 'bagging_temperature': 1,
 'bayesian_matrix_reg': 0.10000000149011612,
 'best_model_min_trees': 1,
 'boost_from_average': True,
 'boosting_type': 'Plain',
 'bootstrap_type': 'Bayesian',
 'border_count': 128,
 'class_names': [0, 1],
 'classes_count': 0,
 'data_partition': 'DocParallel',
 'depth': 8,
 'devices': '-1',
 'eval_metric': 'F:beta=0.5',
 'feature_border_type': 'GreedyLogSum',
 'fold_size_loss_normalization': False,
 'force_unit_auto_pair_weights': False,
 'gpu_cat_features_storage': 'GpuRam',
 'gpu_ram_part': 0.95,
 'grow_policy': 'SymmetricTree',
 'iterations': 2800,
 'l2_leaf_reg': 5,
 'leaf_estimation_backtracking': 'AnyImprovement',
 'leaf_estimation_iterations': 10,
 'leaf_estimation_method': 'Newton',
 'learning_rate': 0.029999999329447743,
 'loss_function': 'Logloss',
 'max_leaves': 256,
 'meta_l2_exponent': 1,
 'meta_l2_frequency': 0,
 'min_data_in_leaf': 1,
 'min_fold_size': 100,
 'model_siz

## Проверяем качество модели на валидации

Функция формирования датафрейма с предсказаниями

In [None]:
def predict(X, model_zoo):
    """Predict every target with its own model and collect the results.

    Args:
        X: feature table; its index becomes the index of the result.
        model_zoo: sequence of fitted models, one per target, in target order.

    Returns:
        Integer DataFrame with one column per model, named type_0, type_1, ...
        in the order of `model_zoo`.
    """
    # The pool depends only on X, so build it once instead of once per model.
    pool = make_pool(X)
    preds = [model.predict(pool) for model in model_zoo]
    columns = [f'type_{i}' for i in range(len(model_zoo))]
    preds = pd.DataFrame(np.array(preds).transpose(1, 0), index=X.index, columns=columns).astype(int)
    return preds

Предскажем значения для валидационного набора и посчитаем micro F0.5-score на валидации

In [None]:
# Predict on the validation part and report the micro-averaged F0.5 score.
# NOTE(review): the validation set was also used as eval_set during fitting and
# for choosing the thresholds, so this score is presumably optimistic — confirm.
preds = predict(X_val, models)
print(fbeta_score(y_val, preds, beta=0.5, average='micro', zero_division=0))
preds 

0.4006173667964032


Unnamed: 0_level_0,type_0,type_1,type_2,type_3,type_4,type_5,type_6,type_7,type_8,type_9,...,type_25,type_26,type_27,type_28,type_29,type_30,type_31,type_32,type_33,type_34
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
31053,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
726736,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
592788,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
84791,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
492139,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
933238,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
601688,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
791349,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
378167,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


## Загружаем тестовые данные

In [None]:
# Load the test transactions and derive the same approximate month index as
# for the training data (day number divided by the mean month length).
payments_test = pd.read_csv(PAYMENTS_TEST_PATH, dtype=payments_dtypes)
payments_test['month'] = (payments_test['dt_day'] // 30.41666666666667).astype('int16')
payments_test

Unnamed: 0,client_id,contractor_id,is_outgoing,amount,dt_day,dt_hour,channel,flag_0,flag_1,flag_2,flag_3,flag_4,flag_5,flag_6,flag_7,flag_8,flag_9,flag_10,flag_11,month
0,303546,,True,378449,50,2,,False,False,False,True,False,False,False,False,False,False,False,False,1
1,59719,,True,281527,267,2,pos,True,False,False,False,False,False,False,False,False,False,False,False,8
2,25428,90165,False,2105146744,294,9,,False,False,False,False,False,False,False,False,False,False,False,False,9
3,766314,,True,2156082,207,14,pos,True,False,False,False,False,False,False,False,False,False,False,False,6
4,465049,,True,51694546,211,3,atm,True,False,True,False,False,False,False,False,False,False,False,False,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8553957,794745,773056,False,858062290,198,16,,False,False,False,False,False,False,False,False,False,False,False,False,6
8553958,362168,904134,False,8540123,130,13,,False,False,False,False,False,False,False,False,False,False,False,False,4
8553959,500006,471487,False,65346645,196,12,,False,False,False,False,False,False,False,False,False,False,False,False,6
8553960,653741,,True,3296030,43,0,,True,False,False,False,False,False,False,False,False,False,False,False,1


## Генерируем фичи для тестовых данных

In [None]:
# Build the per-client feature table for the test transactions.
features_test = generate_features(payments_test)
features_test

Unnamed: 0_level_0,mean,median,std,min,max,count_transactions,adds,withdraws,h_0,h_1,...,flag_2_count,flag_3_count,flag_4_count,flag_5_count,flag_6_count,flag_7_count,flag_8_count,flag_9_count,flag_10_count,flag_11_count
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100050,3.536920e+07,4800238.5,7.702827e+07,798,474601237,160,0.731250,0.268750,0.043750,0.056250,...,0,35,13,0,0,0,13,0,0,0
100128,2.074924e+08,42574180.0,5.213882e+08,2401,4263233615,97,0.804124,0.195876,0.010309,0.020619,...,0,19,11,0,0,0,11,24,0,0
100159,1.516675e+08,79896060.0,2.081792e+08,17615,1348514255,746,0.600536,0.399464,0.001340,0.008043,...,0,11,52,0,0,0,53,0,0,0
10018,5.424148e+07,26540880.5,1.042072e+08,43,808270306,350,0.280000,0.720000,0.014286,0.002857,...,0,17,13,0,0,0,14,0,0,0
100237,2.005865e+08,50737877.0,3.939886e+08,31511,2521806684,106,0.839623,0.160377,0.037736,0.066038,...,12,28,2,0,0,0,2,7,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999572,4.462335e+07,21529563.0,8.773594e+07,2137,1586323663,770,0.566234,0.433766,0.009091,0.005195,...,0,22,76,0,0,0,80,0,3,0
99966,5.238860e+07,950112.0,2.225828e+08,578,2924402093,858,0.941725,0.058275,0.136364,0.144522,...,14,17,26,0,0,0,26,0,5,5
999662,3.545021e+07,5060146.5,9.601183e+07,19606,947093072,400,0.827500,0.172500,0.100000,0.100000,...,20,8,20,0,0,0,21,0,0,0
999674,1.040799e+08,14518077.0,2.235370e+08,31020,2255212467,283,0.802120,0.197880,0.070671,0.088339,...,14,22,15,0,2,0,16,0,0,0


## Предскажем значения для тестовых данных и сгенерируем сабмит

In [None]:
# Predict all targets for the test clients.
preds_test = predict(features_test, models)
preds_test

Unnamed: 0_level_0,type_0,type_1,type_2,type_3,type_4,type_5,type_6,type_7,type_8,type_9,...,type_25,type_26,type_27,type_28,type_29,type_30,type_31,type_32,type_33,type_34
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100050,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
100128,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
100159,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
10018,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
100237,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999572,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
99966,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
999662,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
999674,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [None]:
# Write the predictions to the submission file; the index is written as the first column.
# NOTE(review): rows follow the order of features_test, not necessarily the order
# in the client id file — confirm against the submission format.
preds_test.to_csv('submission_tochka.csv')