### Imports and requirements

In [1]:
%load_ext autoreload
%autoreload 2

import os
import pandas as pd
import sys
import pickle
import math
import numpy as np

from sklearn.model_selection import train_test_split
import tqdm

from scipy.stats import gmean, hmean
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.linear_model import LogisticRegression
from catboost import Pool, CatBoostClassifier

# добавим родительскую директорию, в ней лежат все необходимые полезные функции для обработки данных
sys.path.append("../")

import warnings
warnings.filterwarnings("ignore")

In [2]:
import matplotlib.pyplot as plt
plt.style.use("seaborn-pastel")

### Загрузка данных

In [3]:
TRAIN_TARGET_PATH = "G:\\Alfa_Bank_competition\\data_original\\train_target.csv"
train_target = pd.read_csv(TRAIN_TARGET_PATH)
train_target

Unnamed: 0,id,flag
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
2999995,2999995,0
2999996,2999996,0
2999997,2999997,0
2999998,2999998,0


Загрузим посчитанные ранее предсказания на тренировочных данных (OOB)

In [5]:
train_preds_total = pd.read_csv('train_preds_total.csv', index_col=0, dtype=np.float32)
oof_catboost_base = pd.read_csv('oof_catboost_base_train.csv', dtype=np.float32)
oof_catboost_ranker = pd.read_csv('oof_catboost_ranker_train.csv', dtype=np.float32)
oof_lgbm = pd.read_csv('oof_lgbm_train.csv', dtype=np.float32)


In [6]:
train_preds_total = train_preds_total.merge(oof_catboost_base, on='id').rename(columns={'score': 'score_catboost_base'})
train_preds_total = train_preds_total.merge(oof_catboost_ranker, on='id').rename(columns={'score': 'score_catboost_ranker'})
train_preds_total = train_preds_total.merge(oof_lgbm, on='id').rename(columns={'score': 'score_lgbm'})
train_preds_total = train_preds_total.merge(train_target, on='id')

In [7]:
train_preds_total

Unnamed: 0,id,score_RNN_LAST_HIDDEN,score_RNN_BIDIRECTIONAL_SIMPLE,score_TRANSFORMER,score_RNN_ATTENTION,score_CONV,score_LAST_HIDDEN_SHUFFLE,score_catboost_base,score_catboost_ranker,score_lgbm,flag
0,299424.0,0.222116,0.352430,0.283658,0.021826,0.254688,1.603554e-09,0.200043,-0.606840,0.151729,0
1,430606.0,0.167748,0.407255,0.395922,0.073195,0.330099,4.938549e-02,0.676817,0.582111,0.563486,0
2,164999.0,0.067053,0.256659,0.242773,0.015040,0.249302,8.569962e-03,0.229891,-1.491362,0.389147,0
3,316775.0,0.837270,0.523706,0.601104,0.421531,0.400328,3.643731e-03,0.872699,1.613747,0.803610,1
4,164305.0,0.120267,0.329662,0.340208,0.020626,0.258009,2.108688e-03,0.206625,-1.191789,0.126573,0
...,...,...,...,...,...,...,...,...,...,...,...
2999995,2933374.0,0.277024,0.251996,0.188164,0.248241,0.195647,1.987233e-01,0.242755,-0.991078,0.220097,0
2999996,2933378.0,0.401817,0.461779,0.346996,0.359805,0.337462,4.621871e-01,0.534371,0.404493,0.481970,0
2999997,2639547.0,0.558675,0.547448,0.362801,0.527204,0.372062,5.274813e-01,0.667823,0.442747,0.637490,0
2999998,2639502.0,0.502100,0.487539,0.526418,0.492289,0.438868,5.741965e-01,0.766747,1.001667,0.759137,0


In [9]:
test_ensemble = pd.read_csv('test_ensemble_mean.csv', dtype=np.float32)
submission_catboost_base = pd.read_csv('catboost_base_test.csv', dtype=np.float32)
submission_catboost_ranker = pd.read_csv('catboost_ranker_test.csv', dtype=np.float32)
submission_lgbm = pd.read_csv('lgbm_test.csv', dtype=np.float32)


In [10]:
test_ensemble = test_ensemble.merge(submission_catboost_base[['id', 'score_mean']], on='id').rename(columns={'score_mean': 'score_catboost_base'})
test_ensemble = test_ensemble.merge(submission_catboost_ranker[['id', 'score_mean']], on='id').rename(columns={'score_mean': 'score_catboost_ranker'})
test_ensemble = test_ensemble.merge(submission_lgbm[['id', 'score_mean']], on='id').rename(columns={'score_mean': 'score_lgbm'})

In [11]:
test_ensemble

Unnamed: 0,id,score_RNN_LAST_HIDDEN,score_RNN_BIDIRECTIONAL_SIMPLE,score_TRANSFORMER,score_RNN_ATTENTION,score_CONV,score_LAST_HIDDEN_SHUFFLE,score_catboost_base,score_catboost_ranker,score_lgbm
0,3047012.0,0.000438,0.191064,0.197535,0.000176,0.158888,6.428444e-08,0.050245,-4.590703,0.072161
1,3000786.0,0.000480,0.206505,0.213709,0.000205,0.166679,9.907824e-08,0.077669,-4.116349,0.122851
2,3019111.0,0.015074,0.270507,0.323749,0.009136,0.266598,1.850942e-04,0.119329,-2.872110,0.188742
3,3345810.0,0.033989,0.180308,0.352357,0.015890,0.086264,8.458280e-05,0.282501,-1.489228,0.270906
4,3434512.0,0.029337,0.137022,0.332252,0.018969,0.044439,5.007834e-04,0.279360,-1.269662,0.198602
...,...,...,...,...,...,...,...,...,...,...
499995,3464951.0,0.430107,0.437442,0.396564,0.411256,0.329756,4.919842e-01,0.484658,-0.253584,0.517829
499996,3028533.0,0.378658,0.360421,0.356849,0.386809,0.294696,4.476581e-01,0.366196,-0.422444,0.364558
499997,3464956.0,0.422197,0.431431,0.347834,0.412099,0.310514,4.864063e-01,0.467195,-0.069230,0.446837
499998,3366101.0,0.343227,0.333384,0.347950,0.344571,0.286252,3.972217e-01,0.376374,-0.840744,0.341062


### Логистическая регрессия

Обучим логистическую регрессию на мета-признаках, добавив к признакам попарные произведения прогнозов всех моделей.

In [12]:
feature_cols_base = list(train_preds_total.columns.values)

feature_cols_base.remove("flag")
feature_cols_base.remove("id")
len(feature_cols_base)

9

In [13]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(2)

Poly_features = poly.fit_transform(train_preds_total[feature_cols_base])

Poly_features_train = pd.DataFrame(Poly_features, columns=poly.get_feature_names(feature_cols_base))
Poly_features_train['id'] = train_preds_total['id']
Poly_features_train['flag'] = train_preds_total['flag']
Poly_features_train


Unnamed: 0,1,score_RNN_LAST_HIDDEN,score_RNN_BIDIRECTIONAL_SIMPLE,score_TRANSFORMER,score_RNN_ATTENTION,score_CONV,score_LAST_HIDDEN_SHUFFLE,score_catboost_base,score_catboost_ranker,score_lgbm,...,score_LAST_HIDDEN_SHUFFLE score_catboost_ranker,score_LAST_HIDDEN_SHUFFLE score_lgbm,score_catboost_base^2,score_catboost_base score_catboost_ranker,score_catboost_base score_lgbm,score_catboost_ranker^2,score_catboost_ranker score_lgbm,score_lgbm^2,id,flag
0,1.0,0.222116,0.352430,0.283658,0.021826,0.254688,1.603554e-09,0.200043,-0.606840,0.151729,...,-9.731013e-10,2.433062e-10,0.040017,-0.121394,0.030352,0.368255,-0.092075,0.023022,299424.0,0
1,1.0,0.167748,0.407255,0.395922,0.073195,0.330099,4.938549e-02,0.676817,0.582111,0.563486,...,2.874785e-02,2.782805e-02,0.458081,0.393983,0.381377,0.338853,0.328012,0.317517,430606.0,0
2,1.0,0.067053,0.256659,0.242773,0.015040,0.249302,8.569962e-03,0.229891,-1.491362,0.389147,...,-1.278092e-02,3.334977e-03,0.052850,-0.342851,0.089461,2.224160,-0.580359,0.151436,164999.0,0
3,1.0,0.837270,0.523706,0.601104,0.421531,0.400328,3.643731e-03,0.872699,1.613747,0.803610,...,5.880059e-03,2.928137e-03,0.761603,1.408315,0.701309,2.604179,1.296823,0.645788,316775.0,1
4,1.0,0.120267,0.329662,0.340208,0.020626,0.258009,2.108688e-03,0.206625,-1.191789,0.126573,...,-2.513111e-03,2.669031e-04,0.042694,-0.246254,0.026153,1.420360,-0.150848,0.016021,164305.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999995,1.0,0.277024,0.251996,0.188164,0.248241,0.195647,1.987233e-01,0.242755,-0.991078,0.220097,...,-1.969502e-01,4.373848e-02,0.058930,-0.240589,0.053430,0.982235,-0.218134,0.048443,2933374.0,0
2999996,1.0,0.401817,0.461779,0.346996,0.359805,0.337462,4.621871e-01,0.534371,0.404493,0.481970,...,1.869514e-01,2.227605e-01,0.285553,0.216149,0.257551,0.163614,0.194954,0.232295,2933378.0,0
2999997,1.0,0.558675,0.547448,0.362801,0.527204,0.372062,5.274813e-01,0.667823,0.442747,0.637490,...,2.335408e-01,3.362642e-01,0.445987,0.295677,0.425731,0.196025,0.282247,0.406394,2639547.0,0
2999998,1.0,0.502100,0.487539,0.526418,0.492289,0.438868,5.741965e-01,0.766747,1.001667,0.759137,...,5.751537e-01,4.358940e-01,0.587901,0.768025,0.582066,1.003337,0.760403,0.576289,2639502.0,0


In [14]:
feature_cols = list(Poly_features_train.columns.values)

feature_cols.remove("flag")
feature_cols.remove("id")
len(feature_cols)

55

In [16]:
targets = Poly_features_train["flag"].values
cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)

oof = np.zeros(len(Poly_features_train))
train_preds = np.zeros(len(Poly_features_train))

models_logreg = []
scores = []

for fold_, (train_idx, val_idx) in enumerate(cv.split(Poly_features_train, targets), 1):
    print(f'train fold {fold_}')
    logreg_model = LogisticRegression(class_weight='balanced', 
                                      random_state=42, 
                                      #C=0.1,
                                      n_jobs=-1)
    train, val = Poly_features_train.iloc[train_idx], Poly_features_train.iloc[val_idx]
    
    logreg_model.fit(train[feature_cols], train.flag.values)
       
    
    pred_val = logreg_model.predict_proba(val[feature_cols])[:, 1]

    cur_score = roc_auc_score(val.flag.values, pred_val)
    scores.append(cur_score)
    print(cur_score)
    print(logreg_model.coef_)

    oof[val_idx] = logreg_model.predict_proba(val[feature_cols])[:, 1]
    train_preds[train_idx] += logreg_model.predict_proba(train[feature_cols])[:, 1] / (cv.n_splits-1)
    models_logreg.append(logreg_model)

train fold 1
0.785630599188456
[[-0.98394189  0.89687528  0.8469618   0.58975062  0.46115497 -0.59947396
  -0.24566547  0.23342305  1.23409308 -0.41892369  0.92673691  0.5726009
   0.04178788  0.51871107  0.09772004  0.36591519  0.25712399 -1.02310282
   0.09293145  0.83979222  0.26100496  0.46317181  0.21838134  0.27543196
   0.45288143  0.1310559   0.43052047  0.53215708 -0.0372258   0.09426344
  -0.05732365  0.20881428 -0.4472231   0.248514    0.64150755 -0.12664761
   0.31747529  0.21333709 -0.48657512  0.08877054 -0.14455109 -0.23728532
  -0.17260166  0.30177378 -0.14279104  0.69543373 -0.13308523 -0.36506486
  -0.1237291   0.25770706 -0.38634378 -0.06516808  0.12793434 -0.01555164
  -0.02622003]]
train fold 2
0.7860329700063138
[[-1.00674742  0.94338442  0.96679395  0.69941821  0.52091377 -0.55791995
  -0.2947191   0.23189972  1.24212225 -0.46684894  0.97580691  0.51853171
  -0.13537994  0.46968809  0.06073275  0.31978799  0.24315671 -0.97588327
   0.05893084  0.85623721  0.17161

Тестовая выборка:

In [20]:
Poly_features_test = poly.transform(test_ensemble[feature_cols_base])
Poly_features_test = pd.DataFrame(Poly_features_test, columns=poly.get_feature_names(feature_cols_base))
Poly_features_test['id'] = test_ensemble['id']
Poly_features_test

Unnamed: 0,1,score_RNN_LAST_HIDDEN,score_RNN_BIDIRECTIONAL_SIMPLE,score_TRANSFORMER,score_RNN_ATTENTION,score_CONV,score_LAST_HIDDEN_SHUFFLE,score_catboost_base,score_catboost_ranker,score_lgbm,...,score_LAST_HIDDEN_SHUFFLE score_catboost_base,score_LAST_HIDDEN_SHUFFLE score_catboost_ranker,score_LAST_HIDDEN_SHUFFLE score_lgbm,score_catboost_base^2,score_catboost_base score_catboost_ranker,score_catboost_base score_lgbm,score_catboost_ranker^2,score_catboost_ranker score_lgbm,score_lgbm^2,id
0,1.0,0.000438,0.191064,0.197535,0.000176,0.158888,6.428444e-08,0.050245,-4.590703,0.072161,...,3.229977e-09,-2.951108e-07,4.638846e-09,0.002525,-0.230660,0.003626,21.074554,-0.331271,0.005207,3047012.0
1,1.0,0.000480,0.206505,0.213709,0.000205,0.166679,9.907824e-08,0.077669,-4.116349,0.122851,...,7.695294e-09,-4.078406e-07,1.217183e-08,0.006032,-0.319712,0.009542,16.944326,-0.505696,0.015092,3000786.0
2,1.0,0.015074,0.270507,0.323749,0.009136,0.266598,1.850942e-04,0.119329,-2.872110,0.188742,...,2.208711e-05,-5.316108e-04,3.493504e-05,0.014239,-0.342726,0.022522,8.249015,-0.542088,0.035624,3019111.0
3,1.0,0.033989,0.180308,0.352357,0.015890,0.086264,8.458280e-05,0.282501,-1.489228,0.270906,...,2.389472e-05,-1.259631e-04,2.291400e-05,0.079807,-0.420708,0.076531,2.217799,-0.403441,0.073390,3345810.0
4,1.0,0.029337,0.137022,0.332252,0.018969,0.044439,5.007834e-04,0.279360,-1.269662,0.198602,...,1.398990e-04,-6.358257e-04,9.945683e-05,0.078042,-0.354693,0.055482,1.612042,-0.252158,0.039443,3434512.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499995,1.0,0.430107,0.437442,0.396564,0.411256,0.329756,4.919842e-01,0.484658,-0.253584,0.517829,...,2.384441e-01,-1.247594e-01,2.547638e-01,0.234893,-0.122902,0.250970,0.064305,-0.131313,0.268147,3464951.0
499996,1.0,0.378658,0.360421,0.356849,0.386809,0.294696,4.476581e-01,0.366196,-0.422444,0.364558,...,1.639305e-01,-1.891104e-01,1.631972e-01,0.134099,-0.154697,0.133499,0.178459,-0.154005,0.132902,3028533.0
499997,1.0,0.422197,0.431431,0.347834,0.412099,0.310514,4.864063e-01,0.467195,-0.069230,0.446837,...,2.272465e-01,-3.367412e-02,2.173442e-01,0.218271,-0.032344,0.208760,0.004793,-0.030935,0.199663,3464956.0
499998,1.0,0.343227,0.333384,0.347950,0.344571,0.286252,3.972217e-01,0.376374,-0.840744,0.341062,...,1.495040e-01,-3.339616e-01,1.354774e-01,0.141658,-0.316434,0.128367,0.706850,-0.286746,0.116324,3366101.0


In [22]:
score = np.zeros(len(Poly_features_test))

preds = []

for model in tqdm.tqdm_notebook(models_logreg):
    score += model.predict_proba(Poly_features_test[feature_cols])[:, 1] / len(models_logreg)
    preds.append(model.predict_proba(Poly_features_test[feature_cols])[:, 1])
    
test_logreg = pd.DataFrame({
    "id" : Poly_features_test["id"].values,
    "score": score, 
}) 

test_logreg['id'] = test_logreg['id'].astype(int)
test_logreg

  0%|          | 0/10 [00:00<?, ?it/s]

Unnamed: 0,id,score
0,3047012,0.014094
1,3000786,0.015420
2,3019111,0.028571
3,3345810,0.075005
4,3434512,0.080471
...,...,...
499995,3464951,0.567122
499996,3028533,0.433187
499997,3464956,0.553144
499998,3366101,0.351431


In [23]:
test_logreg.to_csv("test_logreg_poly_final.csv", index=None)

### Catboost Classifier

Для обучения Catboost Classifier на мета-признаках помимо непоредственно прогнозов моделей первого уровня будем использовать:
 - среднее арифметическое, среднее герметрическое и среднее гармоническое данных прогнозов,
 - минимальное, максимальное значение предсказания для каждого объекта, разность между минимальным и максимальным, среднее квардатичное отклонение,
 - ранговый скор для каждого прогноза и среднее, минимальное, максимальное значение для рангов, а также разность между минимальным и максимальным значением и std значений рангов.
 - кроме того, как и в логистической регрессии добавим полиномиальные признаки степени 4.

Для подготовки данных признаков реализована функция `prepare_data`.

In [27]:
def prepare_data(data: pd.DataFrame, model_names: list) -> pd.DataFrame:
    # data - изменяемый DataFrame
    # model_names - наименование моделей
    # Возвращает DataFrame с добавленными признаками
    for col_name in model_names:
        data[f'rank_{col_name}'] = data[f'score_{col_name}'].rank()
        
    data['score_mean'] = np.mean([data[f'score_{col_name}'] for col_name in model_names], axis=0)
    data['score_gmean'] = gmean([data[f'score_{col_name}'] for col_name in model_names], axis=0)
    data['score_hmean'] = hmean([data[f'score_{col_name}'] for col_name in model_names], axis=0)
    data['score_min'] = np.min([data[f'score_{col_name}'] for col_name in model_names], axis=0)
    data['score_max'] = np.max([data[f'score_{col_name}'] for col_name in model_names], axis=0)
    data['score_diff'] =  data['score_max'] - data['score_min']
    data['score_std'] = np.std([data[f'score_{col_name}'] for col_name in model_names], axis=0)
    
    data['rank_mean'] = np.mean([data[f'rank_{col_name}'] for col_name in model_names], axis=0)
    data['rank_min'] = np.min([data[f'rank_{col_name}'] for col_name in model_names], axis=0)
    data['rank_max'] = np.max([data[f'rank_{col_name}'] for col_name in model_names], axis=0)
    data['rank_diff'] =  data['rank_max'] - data['rank_min']
    data['rank_std'] = np.std([data[f'rank_{col_name}'] for col_name in model_names], axis=0)

In [28]:
feature_cols_base = list(train_preds_total.columns.values)

feature_cols_base.remove("flag")
feature_cols_base.remove("id")
len(feature_cols_base)

9

In [29]:
feature_cols_base

['score_RNN_LAST_HIDDEN',
 'score_RNN_BIDIRECTIONAL_SIMPLE',
 'score_TRANSFORMER',
 'score_RNN_ATTENTION',
 'score_CONV',
 'score_LAST_HIDDEN_SHUFFLE',
 'score_catboost_base',
 'score_catboost_ranker',
 'score_lgbm']

In [30]:
def sigmoid(x):
  return 1 / (1 + math.exp(-x))

In [31]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(4)

train_preds_total['score_catboost_ranker'] = train_preds_total['score_catboost_ranker'].apply(lambda x: sigmoid(x))

Poly_features = poly.fit_transform(train_preds_total[feature_cols_base])

Poly_features_train = pd.DataFrame(Poly_features, columns=poly.get_feature_names(feature_cols_base))
Poly_features_train['id'] = train_preds_total['id']
Poly_features_train['flag'] = train_preds_total['flag']

Poly_features_train

Unnamed: 0,1,score_RNN_LAST_HIDDEN,score_RNN_BIDIRECTIONAL_SIMPLE,score_TRANSFORMER,score_RNN_ATTENTION,score_CONV,score_LAST_HIDDEN_SHUFFLE,score_catboost_base,score_catboost_ranker,score_lgbm,...,score_catboost_base score_catboost_ranker^2 score_lgbm,score_catboost_base score_catboost_ranker score_lgbm^2,score_catboost_base score_lgbm^3,score_catboost_ranker^4,score_catboost_ranker^3 score_lgbm,score_catboost_ranker^2 score_lgbm^2,score_catboost_ranker score_lgbm^3,score_lgbm^4,id,flag
0,1.0,0.222116,0.352430,0.283658,0.021826,0.254688,1.603554e-09,0.200043,0.352780,0.151729,...,0.003777,0.001625,0.000699,0.015489,0.006662,0.002865,0.001232,0.000530,299424.0,0
1,1.0,0.167748,0.407255,0.395922,0.073195,0.330099,4.938549e-02,0.676817,0.641553,0.563486,...,0.156971,0.137870,0.121094,0.169407,0.148793,0.130687,0.114784,0.100817,430606.0,0
2,1.0,0.067053,0.256659,0.242773,0.015040,0.249302,8.569962e-03,0.229891,0.183717,0.389147,...,0.003020,0.006396,0.013548,0.001139,0.002413,0.005111,0.010827,0.022933,164999.0,0
3,1.0,0.837270,0.523706,0.601104,0.421531,0.400328,3.643731e-03,0.872699,0.833931,0.803610,...,0.487719,0.469986,0.452897,0.483638,0.466053,0.449108,0.432778,0.417043,316775.0,1
4,1.0,0.120267,0.329662,0.340208,0.020626,0.258009,2.108688e-03,0.206625,0.232939,0.126573,...,0.001419,0.000771,0.000419,0.002944,0.001600,0.000869,0.000472,0.000257,164305.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999995,1.0,0.277024,0.251996,0.188164,0.248241,0.195647,1.987233e-01,0.242755,0.270699,0.220097,...,0.003915,0.003183,0.002588,0.005370,0.004366,0.003550,0.002886,0.002347,2933374.0,0
2999996,1.0,0.401817,0.461779,0.346996,0.359805,0.337462,4.621871e-01,0.534371,0.599767,0.481970,...,0.092646,0.074450,0.059828,0.129398,0.103984,0.083561,0.067150,0.053961,2933378.0,0
2999997,1.0,0.558675,0.547448,0.362801,0.527204,0.372062,5.274813e-01,0.667823,0.608913,0.637490,...,0.157851,0.165259,0.173014,0.137475,0.143926,0.150681,0.157753,0.165156,2639547.0,0
2999998,1.0,0.502100,0.487539,0.526418,0.492289,0.438868,5.741965e-01,0.766747,0.731386,0.759137,...,0.311362,0.323176,0.335439,0.286146,0.297003,0.308272,0.319969,0.332110,2639502.0,0


In [32]:
model_names=['RNN_LAST_HIDDEN',
             'RNN_BIDIRECTIONAL_SIMPLE',
             'TRANSFORMER','RNN_ATTENTION',
             'CONV',
             'LAST_HIDDEN_SHUFFLE',
             'catboost_base',
             'catboost_ranker','lgbm']

In [33]:
prepare_data(Poly_features_train, model_names)
Poly_features_train

Unnamed: 0,1,score_RNN_LAST_HIDDEN,score_RNN_BIDIRECTIONAL_SIMPLE,score_TRANSFORMER,score_RNN_ATTENTION,score_CONV,score_LAST_HIDDEN_SHUFFLE,score_catboost_base,score_catboost_ranker,score_lgbm,...,score_hmean,score_min,score_max,score_diff,score_std,rank_mean,rank_min,rank_max,rank_diff,rank_std
0,1.0,0.222116,0.352430,0.283658,0.021826,0.254688,1.603554e-09,0.200043,0.352780,0.151729,...,1.443199e-08,1.603554e-09,0.352780,0.352780,0.120628,8.073293e+05,45.0,1620747.5,1620702.5,6.043184e+05
1,1.0,0.167748,0.407255,0.395922,0.073195,0.330099,4.938549e-02,0.676817,0.641553,0.563486,...,1.707979e-01,4.938549e-02,0.676817,0.627431,0.221282,1.570823e+06,890.0,2659898.0,2659008.0,1.107497e+06
2,1.0,0.067053,0.256659,0.242773,0.015040,0.249302,8.569962e-03,0.229891,0.183717,0.389147,...,4.045316e-02,8.569962e-03,0.389147,0.380577,0.120263,6.744181e+05,124.0,1648646.0,1648522.0,5.441319e+05
3,1.0,0.837270,0.523706,0.601104,0.421531,0.400328,3.643731e-03,0.872699,0.833931,0.803610,...,3.128573e-02,3.643731e-03,0.872699,0.869055,0.270568,2.522704e+06,398.0,2999992.0,2999594.0,9.164009e+05
4,1.0,0.120267,0.329662,0.340208,0.020626,0.258009,2.108688e-03,0.206625,0.232939,0.126573,...,1.613170e-02,2.108688e-03,0.340208,0.338099,0.115997,7.324184e+05,148.0,1738061.5,1737913.5,6.237985e+05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999995,1.0,0.277024,0.251996,0.188164,0.248241,0.195647,1.987233e-01,0.242755,0.270699,0.220097,...,2.283031e-01,1.881642e-01,0.277024,0.088859,0.031257,6.673166e+05,79872.0,1174600.5,1094728.5,3.102092e+05
2999996,1.0,0.401817,0.461779,0.346996,0.359805,0.337462,4.621871e-01,0.534371,0.599767,0.481970,...,4.274356e-01,3.374622e-01,0.599767,0.262304,0.084417,2.123819e+06,1685039.5,2570627.5,885588.0,2.797809e+05
2999997,1.0,0.558675,0.547448,0.362801,0.527204,0.372062,5.274813e-01,0.667823,0.608913,0.637490,...,5.124821e-01,3.628013e-01,0.667823,0.305022,0.100355,2.615472e+06,1915371.5,2956729.0,1041357.5,2.976525e+05
2999998,1.0,0.502100,0.487539,0.526418,0.492289,0.438868,5.741965e-01,0.766747,0.731386,0.759137,...,5.631115e-01,4.388683e-01,0.766747,0.327879,0.122308,2.823634e+06,2715630.5,2875257.0,159626.5,4.472085e+04


In [34]:
test_ensemble['score_catboost_ranker'] = test_ensemble['score_catboost_ranker'].apply(lambda x: sigmoid(x))

Poly_features = poly.transform(test_ensemble[feature_cols_base])

Poly_features_test = pd.DataFrame(Poly_features, columns=poly.get_feature_names(feature_cols_base))
Poly_features_test['id'] = test_ensemble['id'].astype(int)

prepare_data(Poly_features_test, model_names)

Poly_features_test

Unnamed: 0,1,score_RNN_LAST_HIDDEN,score_RNN_BIDIRECTIONAL_SIMPLE,score_TRANSFORMER,score_RNN_ATTENTION,score_CONV,score_LAST_HIDDEN_SHUFFLE,score_catboost_base,score_catboost_ranker,score_lgbm,...,score_hmean,score_min,score_max,score_diff,score_std,rank_mean,rank_min,rank_max,rank_diff,rank_std
0,1.0,0.000438,0.191064,0.197535,0.000176,0.158888,6.428444e-08,0.050245,0.010044,0.072161,...,5.782587e-07,6.428444e-08,0.197535,0.197535,0.079682,19768.000000,1.0,89646.0,89645.0,27527.646495
1,1.0,0.000480,0.206505,0.213709,0.000205,0.166679,9.907824e-08,0.077669,0.016042,0.122851,...,8.910811e-07,9.907824e-08,0.213709,0.213709,0.085395,33135.000000,2.0,117041.0,117039.0,37660.690995
2,1.0,0.015074,0.270507,0.323749,0.009136,0.266598,1.850942e-04,0.119329,0.053550,0.188742,...,1.601043e-03,1.850942e-04,0.323749,0.323564,0.119889,101740.611111,3.0,293660.0,293657.0,106720.211329
3,1.0,0.033989,0.180308,0.352357,0.015890,0.086264,8.458280e-05,0.282501,0.184038,0.270906,...,7.532812e-04,8.458280e-05,0.352357,0.352272,0.121485,107908.777778,3.0,333571.0,333568.0,122524.744564
4,1.0,0.029337,0.137022,0.332252,0.018969,0.044439,5.007834e-04,0.279360,0.219315,0.198602,...,4.226024e-03,5.007834e-04,0.332252,0.331751,0.116239,98959.000000,2.0,306030.0,306028.0,116490.905044
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499995,1.0,0.430107,0.437442,0.396564,0.411256,0.329756,4.919842e-01,0.484658,0.436942,0.517829,...,4.303651e-01,3.297560e-01,0.517829,0.188073,0.053410,393604.555556,381076.0,419147.0,38071.0,11530.371596
499996,1.0,0.378658,0.360421,0.356849,0.386809,0.294696,4.476581e-01,0.366196,0.395932,0.364558,...,3.684042e-01,2.946963e-01,0.447658,0.152962,0.037988,325021.000000,294038.0,363266.0,69228.0,21486.029280
499997,1.0,0.422197,0.431431,0.347834,0.412099,0.310514,4.864063e-01,0.467195,0.482699,0.446837,...,4.144716e-01,3.105142e-01,0.486406,0.175892,0.056371,377264.833333,327588.0,414744.0,87156.0,28188.869596
499998,1.0,0.343227,0.333384,0.347950,0.344571,0.286252,3.972217e-01,0.376374,0.301378,0.341062,...,3.382550e-01,2.862523e-01,0.397222,0.110969,0.031803,282294.111111,247367.0,327776.0,80409.0,24475.671542


In [35]:
feature_cols = list(Poly_features_train.columns.values)
feature_cols.remove("flag")
len(feature_cols)

737

In [36]:
feature_names = feature_cols

target = 'flag'  
targets = Poly_features_train["flag"].values

In [37]:
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold

cv = StratifiedKFold(n_splits=10, random_state=5356, shuffle=True)

oof = np.zeros(len(Poly_features_train))
train_preds = np.zeros(len(Poly_features_train))

models_catboost_final = []

for fold_, (train_idx, val_idx) in enumerate(cv.split(Poly_features_train, targets), 1):
    print(f"Training with fold {fold_} started")
    
    
    model_catboost = CatBoostClassifier(
                    bootstrap_type='Bayesian',
                    verbose=500,
                    loss_function='Logloss',
                    eval_metric='AUC',
                    early_stopping_rounds=500,
                    task_type="GPU",
                    iterations=50000,
                    learning_rate=0.01, 
                    auto_class_weights = 'Balanced',
                    depth=2, 
                    l2_leaf_reg= 1,
                    random_state=42,
    )  
    
    train, val = Poly_features_train.iloc[train_idx], Poly_features_train.iloc[val_idx]
    
    model_catboost.fit(
                  train[feature_names], train[target], 
                  eval_set=(val[feature_names], val[target]),
                  plot=False
    )
    oof[val_idx] = model_catboost.predict_proba(val[feature_cols])[:, 1]
    train_preds[train_idx] += model_catboost.predict_proba(train[feature_cols])[:, 1] / (cv.n_splits-1)
    models_catboost_final.append(model_catboost)
    print(f"Training with fold {fold_} completed")

Training with fold 1 started


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7407542	best: 0.7407542 (0)	total: 59.6ms	remaining: 49m 37s
500:	test: 0.7852858	best: 0.7852862 (499)	total: 16s	remaining: 26m 18s
1000:	test: 0.7856353	best: 0.7856353 (1000)	total: 31s	remaining: 25m 18s
1500:	test: 0.7857673	best: 0.7857674 (1496)	total: 46.3s	remaining: 24m 56s
2000:	test: 0.7858364	best: 0.7858364 (2000)	total: 1m 1s	remaining: 24m 42s
2500:	test: 0.7858863	best: 0.7858864 (2490)	total: 1m 17s	remaining: 24m 28s
3000:	test: 0.7859062	best: 0.7859064 (2999)	total: 1m 32s	remaining: 24m 13s
3500:	test: 0.7859310	best: 0.7859314 (3497)	total: 1m 48s	remaining: 24m
4000:	test: 0.7859541	best: 0.7859551 (3988)	total: 2m 4s	remaining: 23m 49s
4500:	test: 0.7859629	best: 0.7859636 (4496)	total: 2m 19s	remaining: 23m 35s
5000:	test: 0.7859538	best: 0.7859639 (4519)	total: 2m 35s	remaining: 23m 18s
bestTest = 0.7859639227
bestIteration = 4519
Shrink model to first 4520 iterations.
Training with fold 1 completed
Training with fold 2 started


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7462979	best: 0.7462979 (0)	total: 34.3ms	remaining: 28m 37s
500:	test: 0.7891055	best: 0.7891055 (500)	total: 15.8s	remaining: 25m 57s
1000:	test: 0.7894019	best: 0.7894019 (1000)	total: 31.1s	remaining: 25m 23s
1500:	test: 0.7894678	best: 0.7894682 (1497)	total: 47s	remaining: 25m 18s
2000:	test: 0.7894863	best: 0.7894873 (1979)	total: 1m 3s	remaining: 25m 12s
2500:	test: 0.7894934	best: 0.7894978 (2362)	total: 1m 19s	remaining: 25m 16s
3000:	test: 0.7894981	best: 0.7894986 (2699)	total: 1m 35s	remaining: 24m 54s
3500:	test: 0.7894900	best: 0.7895010 (3173)	total: 1m 50s	remaining: 24m 33s
bestTest = 0.7895009816
bestIteration = 3173
Shrink model to first 3174 iterations.
Training with fold 2 completed
Training with fold 3 started


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7432939	best: 0.7432939 (0)	total: 34.9ms	remaining: 29m 6s
500:	test: 0.7844144	best: 0.7844144 (500)	total: 16s	remaining: 26m 17s
1000:	test: 0.7847504	best: 0.7847504 (1000)	total: 31.1s	remaining: 25m 24s
1500:	test: 0.7848832	best: 0.7848838 (1498)	total: 47s	remaining: 25m 17s
2000:	test: 0.7849380	best: 0.7849390 (1992)	total: 1m 2s	remaining: 24m 58s
2500:	test: 0.7849658	best: 0.7849668 (2420)	total: 1m 18s	remaining: 24m 42s
3000:	test: 0.7849844	best: 0.7849847 (2980)	total: 1m 33s	remaining: 24m 23s
3500:	test: 0.7849998	best: 0.7849999 (3499)	total: 1m 49s	remaining: 24m 8s
4000:	test: 0.7849968	best: 0.7850034 (3556)	total: 2m 4s	remaining: 23m 53s
bestTest = 0.7850034237
bestIteration = 3556
Shrink model to first 3557 iterations.
Training with fold 3 completed
Training with fold 4 started


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7452216	best: 0.7452216 (0)	total: 34.7ms	remaining: 28m 54s
500:	test: 0.7857386	best: 0.7857386 (500)	total: 15.8s	remaining: 26m 5s
1000:	test: 0.7859437	best: 0.7859437 (1000)	total: 30.9s	remaining: 25m 11s
1500:	test: 0.7860113	best: 0.7860113 (1455)	total: 46.2s	remaining: 24m 54s
2000:	test: 0.7860464	best: 0.7860466 (1993)	total: 1m 1s	remaining: 24m 45s
2500:	test: 0.7860687	best: 0.7860687 (2500)	total: 1m 17s	remaining: 24m 33s
3000:	test: 0.7860951	best: 0.7860951 (3000)	total: 1m 33s	remaining: 24m 19s
3500:	test: 0.7861049	best: 0.7861055 (3496)	total: 1m 48s	remaining: 24m 5s
4000:	test: 0.7861184	best: 0.7861199 (3956)	total: 2m 4s	remaining: 23m 50s
4500:	test: 0.7861291	best: 0.7861300 (4491)	total: 2m 19s	remaining: 23m 34s
5000:	test: 0.7861352	best: 0.7861392 (4727)	total: 2m 35s	remaining: 23m 20s
5500:	test: 0.7861374	best: 0.7861449 (5347)	total: 2m 51s	remaining: 23m 5s
bestTest = 0.786144942
bestIteration = 5347
Shrink model to first 5348 iteration

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7491357	best: 0.7491357 (0)	total: 36.5ms	remaining: 30m 26s
500:	test: 0.7901974	best: 0.7901974 (500)	total: 16s	remaining: 26m 22s
1000:	test: 0.7903869	best: 0.7903870 (998)	total: 30.9s	remaining: 25m 15s
1500:	test: 0.7904219	best: 0.7904230 (1493)	total: 46.4s	remaining: 24m 59s
2000:	test: 0.7904337	best: 0.7904374 (1920)	total: 1m 2s	remaining: 24m 48s
2500:	test: 0.7904457	best: 0.7904461 (2449)	total: 1m 17s	remaining: 24m 33s
3000:	test: 0.7904433	best: 0.7904479 (2656)	total: 1m 33s	remaining: 24m 19s
3500:	test: 0.7904489	best: 0.7904534 (3360)	total: 1m 48s	remaining: 24m 5s
bestTest = 0.7904533744
bestIteration = 3360
Shrink model to first 3361 iterations.
Training with fold 5 completed
Training with fold 6 started


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7469078	best: 0.7469078 (0)	total: 34.9ms	remaining: 29m 3s
500:	test: 0.7887229	best: 0.7887229 (500)	total: 15.8s	remaining: 26m 2s
1000:	test: 0.7889782	best: 0.7889782 (999)	total: 30.8s	remaining: 25m 5s
1500:	test: 0.7890720	best: 0.7890726 (1476)	total: 46.1s	remaining: 24m 50s
2000:	test: 0.7891112	best: 0.7891113 (1999)	total: 1m 1s	remaining: 24m 39s
2500:	test: 0.7891458	best: 0.7891460 (2489)	total: 1m 17s	remaining: 24m 28s
3000:	test: 0.7891664	best: 0.7891664 (3000)	total: 1m 32s	remaining: 24m 13s
3500:	test: 0.7891753	best: 0.7891760 (3435)	total: 1m 48s	remaining: 24m
4000:	test: 0.7891847	best: 0.7891908 (3898)	total: 2m 4s	remaining: 23m 45s
bestTest = 0.7891908288
bestIteration = 3898
Shrink model to first 3899 iterations.
Training with fold 6 completed
Training with fold 7 started


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7449428	best: 0.7449428 (0)	total: 35.8ms	remaining: 29m 52s
500:	test: 0.7884979	best: 0.7884979 (500)	total: 15.8s	remaining: 26m 2s
1000:	test: 0.7887264	best: 0.7887264 (1000)	total: 30.9s	remaining: 25m 14s
1500:	test: 0.7887908	best: 0.7887916 (1493)	total: 46.4s	remaining: 24m 58s
2000:	test: 0.7888117	best: 0.7888120 (1999)	total: 1m 1s	remaining: 24m 43s
2500:	test: 0.7888282	best: 0.7888286 (2490)	total: 1m 17s	remaining: 24m 29s
3000:	test: 0.7888381	best: 0.7888393 (2983)	total: 1m 32s	remaining: 24m 13s
3500:	test: 0.7888520	best: 0.7888547 (3463)	total: 1m 48s	remaining: 23m 59s
4000:	test: 0.7888545	best: 0.7888570 (3659)	total: 2m 3s	remaining: 23m 43s
4500:	test: 0.7888710	best: 0.7888710 (4500)	total: 2m 19s	remaining: 23m 28s
5000:	test: 0.7888607	best: 0.7888710 (4500)	total: 2m 34s	remaining: 23m 14s
bestTest = 0.7888709903
bestIteration = 4500
Shrink model to first 4501 iterations.
Training with fold 7 completed
Training with fold 8 started


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7476644	best: 0.7476644 (0)	total: 34.8ms	remaining: 28m 58s
500:	test: 0.7883364	best: 0.7883364 (500)	total: 15.9s	remaining: 26m 7s
1000:	test: 0.7886087	best: 0.7886090 (998)	total: 30.9s	remaining: 25m 11s
1500:	test: 0.7887104	best: 0.7887104 (1500)	total: 46.3s	remaining: 24m 55s
2000:	test: 0.7887592	best: 0.7887604 (1995)	total: 1m 1s	remaining: 24m 42s
2500:	test: 0.7887763	best: 0.7887779 (2472)	total: 1m 17s	remaining: 24m 26s
3000:	test: 0.7887923	best: 0.7887926 (2885)	total: 1m 32s	remaining: 24m 12s
3500:	test: 0.7887906	best: 0.7887950 (3068)	total: 1m 48s	remaining: 23m 58s
bestTest = 0.7887949944
bestIteration = 3068
Shrink model to first 3069 iterations.
Training with fold 8 completed
Training with fold 9 started


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7454972	best: 0.7454972 (0)	total: 34.6ms	remaining: 28m 49s
500:	test: 0.7863895	best: 0.7863895 (500)	total: 15.8s	remaining: 25m 59s
1000:	test: 0.7866260	best: 0.7866260 (996)	total: 30.8s	remaining: 25m 7s
1500:	test: 0.7866595	best: 0.7866595 (1500)	total: 46.2s	remaining: 24m 53s
2000:	test: 0.7866715	best: 0.7866718 (1984)	total: 1m 1s	remaining: 24m 40s
2500:	test: 0.7866704	best: 0.7866750 (2456)	total: 1m 17s	remaining: 24m 26s
3000:	test: 0.7866803	best: 0.7866844 (2859)	total: 1m 32s	remaining: 24m 11s
3500:	test: 0.7866858	best: 0.7866883 (3460)	total: 1m 48s	remaining: 23m 56s
4000:	test: 0.7866794	best: 0.7866893 (3595)	total: 2m 3s	remaining: 23m 42s
bestTest = 0.7866893411
bestIteration = 3595
Shrink model to first 3596 iterations.
Training with fold 9 completed
Training with fold 10 started


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7456331	best: 0.7456331 (0)	total: 34.8ms	remaining: 29m 1s
500:	test: 0.7880587	best: 0.7880587 (500)	total: 15.9s	remaining: 26m 13s
1000:	test: 0.7883778	best: 0.7883778 (1000)	total: 31.1s	remaining: 25m 21s
1500:	test: 0.7884523	best: 0.7884537 (1457)	total: 46.5s	remaining: 25m 2s
2000:	test: 0.7884510	best: 0.7884600 (1723)	total: 1m 2s	remaining: 24m 54s
bestTest = 0.7884600163
bestIteration = 1723
Shrink model to first 1724 iterations.
Training with fold 10 completed


Делаем прогноз для тестовых данных:

In [39]:
score = np.zeros(len(Poly_features_test))

preds = []

for model in tqdm.tqdm_notebook(models_catboost_final):
    score += model.predict_proba(Poly_features_test[feature_cols])[:, 1] / len(models_catboost_final)
    preds.append(model.predict_proba(Poly_features_test[feature_cols])[:, 1])
    
test_catboost = pd.DataFrame({
    "id" : Poly_features_test["id"].values,
    "score": score, 
}) 

test_catboost['id'] = test_catboost['id'].astype(int)
test_catboost

  0%|          | 0/10 [00:00<?, ?it/s]

Unnamed: 0,id,score
0,3047012,0.051585
1,3000786,0.054032
2,3019111,0.069286
3,3345810,0.083965
4,3434512,0.083670
...,...,...
499995,3464951,0.514241
499996,3028533,0.388904
499997,3464956,0.493353
499998,3366101,0.309827


In [40]:
test_catboost.to_csv("test_catboost.csv", index=None)

Теперь усредним прогнозы логистической регрессии и catboost с весами 0.85 и 0.15 (значения весов получены экспериментально).

In [42]:
test_final = test_catboost.copy().rename(columns={'score': 'score_catboost'})
test_final = test_final.merge(test_logreg, on='id').rename(columns={'score': 'score_logreg'})
test_final['score'] = 0.85 * test_final['score_catboost'] + 0.15 * test_final['score_logreg']
test_final

Unnamed: 0,id,score_catboost,score_logreg,score
0,3047012,0.051585,0.014094,0.045961
1,3000786,0.054032,0.015420,0.048240
2,3019111,0.069286,0.028571,0.063179
3,3345810,0.083965,0.075005,0.082621
4,3434512,0.083670,0.080471,0.083190
...,...,...,...,...
499995,3464951,0.514241,0.567122,0.522173
499996,3028533,0.388904,0.433187,0.395546
499997,3464956,0.493353,0.553144,0.502321
499998,3366101,0.309827,0.351431,0.316068


In [43]:
test_final[['id', 'score']].to_csv("test_final.csv", index=None)

Данное решение показало ROC_AUC_score  0,779138 на public leaderboard (9 место). На private leaderboard с ROC_AUC = 0,777522 решение поднялось на 7 место.