# Imports and requirements

In [3]:
%load_ext autoreload
%autoreload 2

import sys

import numpy as np
import pandas as pd
import tqdm
from catboost import CatBoostClassifier, CatBoostRanker, Pool
from scipy.stats import gmean
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split

# Add the parent directory to the import path; it holds the shared helper
# functions used for data processing.
sys.path.append("../")

import warnings
# No category or module filter is given, so every warning is silenced from here on.
warnings.filterwarnings("ignore")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Загрузка данных

Загрузим ранее подготовленные признаки для обучения бустингов (подготовка данных — в файле "Prepare_data_for_boosting.ipynb")

In [4]:
# Load the pre-computed boosting features for the training set.
# All columns are parsed as float32 and the first CSV column becomes the index.
boosting_features_train = pd.read_csv(
    'G:\\Alfa_Bank_competition\\boosting_data_weight_mean_count.csv',
    dtype=np.float32,
    index_col=0,
)
# The identifier and the target were read as floats; cast them to int one column at a time.
for int_col in ('id', 'flag'):
    boosting_features_train[int_col] = boosting_features_train[int_col].astype(int)
boosting_features_train

Unnamed: 0,id,pre_since_opened_0_x,pre_since_opened_1_x,pre_since_opened_2_x,pre_since_opened_3_x,pre_since_opened_4_x,pre_since_opened_5_x,pre_since_opened_6_x,pre_since_opened_7_x,pre_since_opened_8_x,...,pre_loans90_3_y,pre_loans3060_4_y,pre_loans3060_6_y,pre_loans6090_0_y,pre_loans5_10_y,pre_loans530_5_y,pre_loans530_8_y,pre_loans530_9_y,pre_loans530_17_y,flag
0.0,0,0.000000,0.184490,0.156444,0.129769,0.059281,0.185599,0.000000,0.213813,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1.0,1,0.000000,0.000000,0.127095,0.000000,0.000000,0.000000,0.000000,0.142166,0.014265,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2.0,2,0.120561,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3.0,3,0.000000,0.338520,0.121126,0.000000,0.133769,0.083891,0.079841,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4.0,4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999995.0,2999995,0.047641,0.320093,0.000000,0.000000,0.230013,0.000000,0.065111,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2999996.0,2999996,0.000000,0.168785,0.000000,0.150892,0.057177,0.507837,0.000000,0.000000,0.032411,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2999997.0,2999997,0.000000,0.000000,0.129769,0.104579,0.070604,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2999998.0,2999998,0.000000,0.000000,0.000000,0.000000,0.040360,0.000000,0.000000,0.187896,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [5]:
# Load the pre-computed boosting features for the test set.
# All columns are parsed as float32 and the first CSV column becomes the index.
boosting_features_test = pd.read_csv(
    'G:\\Alfa_Bank_competition\\boosting_data_weight_mean_count_test.csv',
    dtype=np.float32,
    index_col=0,
)
# The identifier was read as a float; cast it to int.
boosting_features_test['id'] = boosting_features_test['id'].astype(int)
boosting_features_test

Unnamed: 0,id,pre_since_opened_0_x,pre_since_opened_1_x,pre_since_opened_2_x,pre_since_opened_3_x,pre_since_opened_4_x,pre_since_opened_5_x,pre_since_opened_6_x,pre_since_opened_7_x,pre_since_opened_8_x,...,pre_loans3060_3_y,pre_loans90_3_y,pre_loans3060_4_y,pre_loans3060_6_y,pre_loans6090_0_y,pre_loans5_10_y,pre_loans530_5_y,pre_loans530_8_y,pre_loans530_9_y,pre_loans530_17_y
0.0,3000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1.0,3000001,0.081020,0.000000,0.000000,0.000000,0.000000,0.156444,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2.0,3000002,0.000000,0.305102,0.000000,0.000000,0.000000,0.172949,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3.0,3000003,0.000000,0.000000,0.103054,0.125809,0.000000,0.039400,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4.0,3000004,0.032995,0.108049,0.194837,0.000000,0.141297,0.080379,0.081938,0.000000,0.005540,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499995.0,3499995,0.000000,0.187896,0.000000,0.000000,0.000000,0.106510,0.000000,0.000000,0.040360,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
499996.0,3499996,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.065535,0.305102,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
499997.0,3499997,0.000000,0.000000,0.000000,0.000000,0.000000,0.133021,0.000000,0.000000,0.178458,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
499998.0,3499998,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### **Catboost Classifier**

Обучим catboost classifier на всех сгенерированных числовых признаках. Обучение будем проводить на 10 фолдах, прогнозы полученных моделей усредним.
Учитывая сильный дисбаланс классов, будем использовать StratifiedKFold.

In [6]:
# Model inputs: every column of the training frame except the target.
# NOTE(review): "id" is not removed, so it is passed to the models as a feature —
# confirm that this is intended.
feature_cols = boosting_features_train.columns.tolist()
feature_cols.remove("flag")
len(feature_cols)

840

In [7]:
# Label column name and the label vector handed to the CV splitter.
target = 'flag'
targets = boosting_features_train[target].values

# Alias of the input column list (same list object, not a copy).
feature_names = feature_cols

In [8]:
# 10-fold stratified cross-validation: one CatBoost classifier per fold.
# Produces:
#   oof                  - out-of-fold probability of class 1 for every training row
#   train_preds          - in-fold probability of class 1, averaged over the folds
#                          in which the row was part of the training split
#   models_catboost_base - the fitted fold models, in fold order
cv = StratifiedKFold(n_splits=10, random_state=596, shuffle=True)

oof = np.zeros(len(boosting_features_train))
train_preds = np.zeros(len(boosting_features_train))

models_catboost_base = []

for fold_, (train_idx, val_idx) in enumerate(cv.split(boosting_features_train, targets), 1):
    print(f"Training with fold {fold_} started")

    model_catboost = CatBoostClassifier(
        verbose=3000,
        loss_function='Logloss',
        eval_metric='AUC',
        early_stopping_rounds=500,
        task_type="GPU",
        iterations=50000,
        learning_rate=0.01,
        auto_class_weights='Balanced',
        depth=5,
        l2_leaf_reg=1,
        random_state=42,
    )

    train, val = boosting_features_train.iloc[train_idx], boosting_features_train.iloc[val_idx]

    # Slice the inputs once per fold and reuse them for both fit and predict.
    # Using a single column list for both steps guarantees the model is scored
    # on exactly the columns it was trained on, even if `feature_cols` is
    # rebound by a later cell while `feature_names` still points at this list.
    X_train, X_val = train[feature_names], val[feature_names]

    model_catboost.fit(
        X_train, train[target],
        eval_set=(X_val, val[target]),
        plot=False,
    )
    oof[val_idx] = model_catboost.predict_proba(X_val)[:, 1]
    # Every row is in the training split of n_splits - 1 folds, hence the divisor.
    train_preds[train_idx] += model_catboost.predict_proba(X_train)[:, 1] / (cv.n_splits - 1)
    models_catboost_base.append(model_catboost)

    # Release the per-fold slices before the next fold allocates its own.
    del X_train, X_val
    print(f"Training with fold {fold_} completed")

Training with fold 1 started


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6810089	best: 0.6810089 (0)	total: 62.3ms	remaining: 51m 52s
3000:	test: 0.7706866	best: 0.7706866 (3000)	total: 1m 51s	remaining: 29m 11s
6000:	test: 0.7752271	best: 0.7752271 (6000)	total: 3m 39s	remaining: 26m 50s
9000:	test: 0.7771388	best: 0.7771470 (8934)	total: 5m 26s	remaining: 24m 45s
12000:	test: 0.7780997	best: 0.7781027 (11988)	total: 7m 11s	remaining: 22m 47s
15000:	test: 0.7787269	best: 0.7787341 (14939)	total: 8m 57s	remaining: 20m 53s
18000:	test: 0.7791522	best: 0.7791603 (17960)	total: 10m 43s	remaining: 19m 3s
bestTest = 0.7791739106
bestIteration = 18224
Shrink model to first 18225 iterations.
Training with fold 1 completed
Training with fold 2 started


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6786606	best: 0.6786606 (0)	total: 47.4ms	remaining: 39m 29s
3000:	test: 0.7659617	best: 0.7659617 (3000)	total: 1m 49s	remaining: 28m 37s
6000:	test: 0.7702966	best: 0.7702966 (6000)	total: 3m 36s	remaining: 26m 30s
9000:	test: 0.7721184	best: 0.7721188 (8999)	total: 5m 22s	remaining: 24m 31s
12000:	test: 0.7732371	best: 0.7732371 (11999)	total: 7m 8s	remaining: 22m 36s
15000:	test: 0.7738802	best: 0.7738920 (14884)	total: 8m 53s	remaining: 20m 44s
18000:	test: 0.7743382	best: 0.7743382 (17999)	total: 10m 38s	remaining: 18m 55s
21000:	test: 0.7745678	best: 0.7745678 (21000)	total: 12m 23s	remaining: 17m 6s
bestTest = 0.774597466
bestIteration = 21735
Shrink model to first 21736 iterations.
Training with fold 2 completed
Training with fold 3 started


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6761844	best: 0.6761844 (0)	total: 47.7ms	remaining: 39m 45s
3000:	test: 0.7655996	best: 0.7655996 (3000)	total: 1m 50s	remaining: 28m 44s
6000:	test: 0.7698357	best: 0.7698357 (6000)	total: 3m 37s	remaining: 26m 34s
9000:	test: 0.7716465	best: 0.7716465 (9000)	total: 5m 23s	remaining: 24m 32s
12000:	test: 0.7726166	best: 0.7726166 (12000)	total: 7m 8s	remaining: 22m 37s
15000:	test: 0.7732562	best: 0.7732609 (14997)	total: 8m 54s	remaining: 20m 46s
18000:	test: 0.7736686	best: 0.7736713 (17970)	total: 10m 39s	remaining: 18m 56s
21000:	test: 0.7738927	best: 0.7739012 (20971)	total: 12m 24s	remaining: 17m 7s
bestTest = 0.7739335895
bestIteration = 21823
Shrink model to first 21824 iterations.
Training with fold 3 completed
Training with fold 4 started


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6762877	best: 0.6762877 (0)	total: 47.4ms	remaining: 39m 32s
3000:	test: 0.7662227	best: 0.7662227 (3000)	total: 1m 50s	remaining: 28m 44s
6000:	test: 0.7704720	best: 0.7704722 (5999)	total: 3m 37s	remaining: 26m 33s
9000:	test: 0.7724203	best: 0.7724203 (9000)	total: 5m 23s	remaining: 24m 32s
12000:	test: 0.7734082	best: 0.7734091 (11998)	total: 7m 8s	remaining: 22m 36s
15000:	test: 0.7742113	best: 0.7742137 (14997)	total: 8m 53s	remaining: 20m 44s
18000:	test: 0.7747421	best: 0.7747468 (17934)	total: 10m 38s	remaining: 18m 54s
21000:	test: 0.7750865	best: 0.7751049 (20942)	total: 12m 25s	remaining: 17m 9s
24000:	test: 0.7753102	best: 0.7753145 (23918)	total: 14m 10s	remaining: 15m 21s
27000:	test: 0.7754424	best: 0.7754487 (26958)	total: 15m 55s	remaining: 13m 34s
bestTest = 0.7754753828
bestIteration = 27677
Shrink model to first 27678 iterations.
Training with fold 4 completed
Training with fold 5 started


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6805815	best: 0.6805815 (0)	total: 47.7ms	remaining: 39m 46s
3000:	test: 0.7724898	best: 0.7724910 (2999)	total: 1m 50s	remaining: 28m 47s
6000:	test: 0.7764486	best: 0.7764486 (6000)	total: 3m 38s	remaining: 26m 39s
9000:	test: 0.7781023	best: 0.7781034 (8998)	total: 5m 24s	remaining: 24m 39s
12000:	test: 0.7790263	best: 0.7790263 (11999)	total: 7m 10s	remaining: 22m 44s
15000:	test: 0.7796010	best: 0.7796032 (14996)	total: 8m 56s	remaining: 20m 51s
18000:	test: 0.7800078	best: 0.7800173 (17938)	total: 10m 42s	remaining: 19m 1s
bestTest = 0.7802325487
bestIteration = 19929
Shrink model to first 19930 iterations.
Training with fold 5 completed
Training with fold 6 started


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6833912	best: 0.6833912 (0)	total: 52ms	remaining: 43m 18s
3000:	test: 0.7690352	best: 0.7690352 (3000)	total: 1m 51s	remaining: 29m 5s
6000:	test: 0.7728502	best: 0.7728507 (5994)	total: 3m 41s	remaining: 27m 1s
9000:	test: 0.7742893	best: 0.7742898 (8999)	total: 5m 27s	remaining: 24m 53s
12000:	test: 0.7751853	best: 0.7751862 (11947)	total: 7m 14s	remaining: 22m 54s
15000:	test: 0.7757747	best: 0.7757756 (14995)	total: 8m 59s	remaining: 20m 59s
bestTest = 0.7760581374
bestIteration = 17361
Shrink model to first 17362 iterations.
Training with fold 6 completed
Training with fold 7 started


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6757502	best: 0.6757502 (0)	total: 47.1ms	remaining: 39m 16s
3000:	test: 0.7621348	best: 0.7621348 (3000)	total: 1m 51s	remaining: 29m 6s
6000:	test: 0.7662733	best: 0.7662736 (5999)	total: 3m 39s	remaining: 26m 48s
9000:	test: 0.7681481	best: 0.7681481 (9000)	total: 5m 25s	remaining: 24m 44s
12000:	test: 0.7692669	best: 0.7692673 (11999)	total: 7m 12s	remaining: 22m 50s
15000:	test: 0.7699097	best: 0.7699111 (14982)	total: 8m 58s	remaining: 20m 55s
18000:	test: 0.7702288	best: 0.7702358 (17953)	total: 10m 43s	remaining: 19m 4s
21000:	test: 0.7704375	best: 0.7704375 (21000)	total: 12m 30s	remaining: 17m 16s
bestTest = 0.7705372572
bestIteration = 21906
Shrink model to first 21907 iterations.
Training with fold 7 completed
Training with fold 8 started


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6807206	best: 0.6807206 (0)	total: 47.4ms	remaining: 39m 28s
3000:	test: 0.7701185	best: 0.7701185 (3000)	total: 1m 50s	remaining: 28m 49s
6000:	test: 0.7743630	best: 0.7743630 (6000)	total: 3m 38s	remaining: 26m 40s
9000:	test: 0.7761697	best: 0.7761697 (9000)	total: 5m 26s	remaining: 24m 45s
12000:	test: 0.7772046	best: 0.7772048 (11999)	total: 7m 12s	remaining: 22m 48s
15000:	test: 0.7778890	best: 0.7778890 (15000)	total: 8m 57s	remaining: 20m 53s
18000:	test: 0.7783977	best: 0.7783977 (17990)	total: 10m 42s	remaining: 19m 1s
21000:	test: 0.7786956	best: 0.7787005 (20982)	total: 12m 27s	remaining: 17m 12s
24000:	test: 0.7789191	best: 0.7789324 (23922)	total: 14m 12s	remaining: 15m 23s
bestTest = 0.7789549828
bestIteration = 24519
Shrink model to first 24520 iterations.
Training with fold 8 completed
Training with fold 9 started


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6759023	best: 0.6759023 (0)	total: 45.8ms	remaining: 38m 12s
3000:	test: 0.7688432	best: 0.7688432 (3000)	total: 1m 50s	remaining: 28m 51s
6000:	test: 0.7730705	best: 0.7730705 (6000)	total: 3m 38s	remaining: 26m 39s
9000:	test: 0.7748244	best: 0.7748258 (8998)	total: 5m 25s	remaining: 24m 42s
12000:	test: 0.7757331	best: 0.7757331 (12000)	total: 7m 11s	remaining: 22m 45s
15000:	test: 0.7763211	best: 0.7763211 (14999)	total: 8m 56s	remaining: 20m 51s
18000:	test: 0.7766635	best: 0.7766635 (17996)	total: 10m 42s	remaining: 19m 2s
21000:	test: 0.7769203	best: 0.7769228 (20997)	total: 12m 28s	remaining: 17m 13s
bestTest = 0.776930511
bestIteration = 21053
Shrink model to first 21054 iterations.
Training with fold 9 completed
Training with fold 10 started


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6769240	best: 0.6769240 (0)	total: 47.7ms	remaining: 39m 46s
3000:	test: 0.7637327	best: 0.7637327 (3000)	total: 1m 50s	remaining: 28m 50s
6000:	test: 0.7677415	best: 0.7677417 (5998)	total: 3m 38s	remaining: 26m 40s
9000:	test: 0.7694526	best: 0.7694531 (8987)	total: 5m 24s	remaining: 24m 39s
12000:	test: 0.7703872	best: 0.7703917 (11988)	total: 7m 10s	remaining: 22m 42s
15000:	test: 0.7709422	best: 0.7709460 (14985)	total: 8m 55s	remaining: 20m 48s
18000:	test: 0.7713677	best: 0.7713702 (17989)	total: 10m 40s	remaining: 18m 57s
21000:	test: 0.7716042	best: 0.7716213 (20965)	total: 12m 24s	remaining: 17m 8s
bestTest = 0.7716213465
bestIteration = 20965
Shrink model to first 20966 iterations.
Training with fold 10 completed


In [9]:
# Pair every training id with its out-of-fold classifier score.
oof_catboost_base = pd.DataFrame(
    dict(id=boosting_features_train["id"].values, score=oof)
)

oof_catboost_base

Unnamed: 0,id,score
0,0,0.070415
1,1,0.514810
2,2,0.759802
3,3,0.231998
4,4,0.428725
...,...,...
2999995,2999995,0.468746
2999996,2999996,0.152985
2999997,2999997,0.345615
2999998,2999998,0.774637


In [10]:
# Test-set score = mean positive-class probability over the fold classifiers.
# The feature slice is the same for every model, so it is built once instead
# of once per loop iteration.
test_features = boosting_features_test[feature_cols]
n_models = len(models_catboost_base)

score = np.zeros(len(boosting_features_test))

for model in tqdm.tqdm_notebook(models_catboost_base):
    score += model.predict_proba(test_features)[:, 1] / n_models

# The slice is only needed for inference; free it.
del test_features

submission_catboost_base = pd.DataFrame({
    "id": boosting_features_test["id"].values,
    "score_mean": score,
})

submission_catboost_base

  0%|          | 0/10 [00:00<?, ?it/s]

Unnamed: 0,id,score_mean
0,3000000,0.439151
1,3000001,0.459096
2,3000002,0.298743
3,3000003,0.409526
4,3000004,0.120899
...,...,...
499995,3499995,0.487164
499996,3499996,0.255594
499997,3499997,0.525870
499998,3499998,0.320612


In [11]:
# Persist the out-of-fold classifier scores; the DataFrame index is not written.
oof_catboost_base.to_csv("oof_catboost_base_train.csv", index=False)

In [12]:
# Persist the averaged test scores of the classifier; the DataFrame index is not written.
submission_catboost_base.to_csv("catboost_base_test.csv", index=False)

Оценим ROC_AUC на трейне (по out-of-fold прогнозам):

In [14]:
# Attach the true label to each out-of-fold score by id, then compute ROC AUC.
oof_catboost_base_target = pd.merge(
    oof_catboost_base,
    boosting_features_train[['id', 'flag']],
    on='id',
)
roc_auc_score(
    oof_catboost_base_target['flag'],
    oof_catboost_base_target['score'],
)

0.7757215071201912

### Catboost Ranker

Учитывая, что метрика ROC_AUC интерпретируется как доля правильно упорядоченных пар, попробуем вместо модели классификации применить модель ранжирования. Для этого нам потребуется поделить объекты на группы, внутри которых будет осуществляться сравнение.
Разделим все данные на 4 группы в порядке возрастания id заявки: Group_id = id // 875000 (заявки в тестовом наборе будут отнесены к той же группе, что и последние заявки в тренировочном наборе)

In [15]:
# Ranking group = index of the block of 875000 consecutive ids the row falls into.
# The floor division and the integer cast are both vectorised; the explicit
# int64 keeps the dtype a per-row `apply(int)` would produce, without
# running a Python-level call for every row.
boosting_features_train['Group_id'] = (boosting_features_train['id'] // 875000).astype('int64')

In [16]:
# Same grouping rule as for the training frame: block index of 875000 consecutive ids.
# The floor division and the integer cast are both vectorised; the explicit
# int64 keeps the dtype a per-row `apply(int)` would produce, without
# running a Python-level call for every row.
boosting_features_test['Group_id'] = (boosting_features_test['id'] // 875000).astype('int64')

In [17]:
# Display the training frame again; Group_id now appears as its last column.
boosting_features_train

Unnamed: 0,id,pre_since_opened_0_x,pre_since_opened_1_x,pre_since_opened_2_x,pre_since_opened_3_x,pre_since_opened_4_x,pre_since_opened_5_x,pre_since_opened_6_x,pre_since_opened_7_x,pre_since_opened_8_x,...,pre_loans3060_4_y,pre_loans3060_6_y,pre_loans6090_0_y,pre_loans5_10_y,pre_loans530_5_y,pre_loans530_8_y,pre_loans530_9_y,pre_loans530_17_y,flag,Group_id
0.0,0,0.000000,0.184490,0.156444,0.129769,0.059281,0.185599,0.000000,0.213813,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
1.0,1,0.000000,0.000000,0.127095,0.000000,0.000000,0.000000,0.000000,0.142166,0.014265,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
2.0,2,0.120561,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
3.0,3,0.000000,0.338520,0.121126,0.000000,0.133769,0.083891,0.079841,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
4.0,4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999995.0,2999995,0.047641,0.320093,0.000000,0.000000,0.230013,0.000000,0.065111,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,3
2999996.0,2999996,0.000000,0.168785,0.000000,0.150892,0.057177,0.507837,0.000000,0.000000,0.032411,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,3
2999997.0,2999997,0.000000,0.000000,0.129769,0.104579,0.070604,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,3
2999998.0,2999998,0.000000,0.000000,0.000000,0.000000,0.040360,0.000000,0.000000,0.187896,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,3


In [18]:
# Rebuild the input list after Group_id was added: every column except the target.
# NOTE(review): both "id" and "Group_id" stay in the list and are therefore used
# as model features — confirm that this is intended.
feature_cols = boosting_features_train.columns.tolist()
feature_cols.remove("flag")
len(feature_cols)

841

In [19]:
# Label column name and the label vector handed to the CV splitter.
target = 'flag'
targets = boosting_features_train[target].values

# Alias of the rebuilt input column list (same list object, not a copy).
feature_names = feature_cols

In [20]:
# 10-fold stratified cross-validation: one pairwise CatBoost ranker per fold.
# Produces:
#   oof                    - out-of-fold ranker score for every training row
#   models_catboost_ranker - the fitted fold models, in fold order
cv = StratifiedKFold(n_splits=10, random_state=754, shuffle=True)

oof = np.zeros(len(boosting_features_train))

models_catboost_ranker = []

for fold_, (train_idx, val_idx) in enumerate(cv.split(boosting_features_train, targets), 1):
    print(f"Training with fold {fold_} started")

    model_catboost = CatBoostRanker(
        verbose=3000,
        loss_function='PairLogit:max_pairs=1000000',
        eval_metric='AUC',
        early_stopping_rounds=500,
        task_type="GPU",
        iterations=50000,
        learning_rate=0.01,
        depth=7,
        l2_leaf_reg=1,
        random_state=42,
    )

    train, val = boosting_features_train.iloc[train_idx], boosting_features_train.iloc[val_idx]

    # Build the validation inputs once and reuse them for the pool and for
    # prediction. Using one column list for both guarantees the model is scored
    # on exactly the columns it was trained on, even if `feature_cols` is
    # rebound elsewhere while `feature_names` still points at this list.
    X_val = val[feature_names]

    # Pools carry the group ids, so pairs are only formed inside a group.
    train_pool = Pool(
        data=train[feature_names],
        label=train[target].values,
        group_id=train["Group_id"].values,
    )
    val_pool = Pool(
        data=X_val,
        label=val[target].values,
        group_id=val["Group_id"].values,
    )

    model_catboost.fit(
        train_pool,
        eval_set=val_pool,
        plot=False,
    )
    oof[val_idx] = model_catboost.predict(X_val)

    models_catboost_ranker.append(model_catboost)

    # Release the per-fold slice before the next fold allocates its own.
    del X_val
    print(f"Training with fold {fold_} completed")

Training with fold 1 started


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.7015447	best: 0.7015447 (0)	total: 104ms	remaining: 1h 26m 21s
3000:	test: 0.7715697	best: 0.7715697 (3000)	total: 3m 52s	remaining: 1h 42s
6000:	test: 0.7745579	best: 0.7745610 (5992)	total: 7m 38s	remaining: 56m 4s
9000:	test: 0.7757638	best: 0.7757638 (9000)	total: 11m 24s	remaining: 51m 59s
bestTest = 0.7762760064
bestIteration = 11216
Shrink model to first 11217 iterations.
Training with fold 1 completed
Training with fold 2 started


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.6981399	best: 0.6981399 (0)	total: 97.1ms	remaining: 1h 20m 53s
3000:	test: 0.7724949	best: 0.7724949 (3000)	total: 3m 50s	remaining: 1h 12s
6000:	test: 0.7750619	best: 0.7750619 (6000)	total: 7m 36s	remaining: 55m 45s
9000:	test: 0.7756866	best: 0.7757432 (8763)	total: 11m 21s	remaining: 51m 42s
bestTest = 0.7758534361
bestIteration = 10236
Shrink model to first 10237 iterations.
Training with fold 2 completed
Training with fold 3 started


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.6975951	best: 0.6975951 (0)	total: 89.5ms	remaining: 1h 14m 37s
3000:	test: 0.7683350	best: 0.7683415 (2990)	total: 3m 50s	remaining: 1h 14s
6000:	test: 0.7714938	best: 0.7714961 (5994)	total: 7m 36s	remaining: 55m 44s
9000:	test: 0.7724234	best: 0.7724351 (8989)	total: 11m 21s	remaining: 51m 44s
12000:	test: 0.7727508	best: 0.7728300 (11630)	total: 15m 7s	remaining: 47m 52s
bestTest = 0.7728300269
bestIteration = 11630
Shrink model to first 11631 iterations.
Training with fold 3 completed
Training with fold 4 started


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.6994123	best: 0.6994123 (0)	total: 91ms	remaining: 1h 15m 50s
3000:	test: 0.7749293	best: 0.7749293 (3000)	total: 3m 52s	remaining: 1h 45s
6000:	test: 0.7782595	best: 0.7782595 (6000)	total: 7m 40s	remaining: 56m 16s
9000:	test: 0.7793744	best: 0.7793744 (9000)	total: 11m 27s	remaining: 52m 12s
12000:	test: 0.7798461	best: 0.7798461 (12000)	total: 15m 15s	remaining: 48m 18s
bestTest = 0.7799081787
bestIteration = 12189
Shrink model to first 12190 iterations.
Training with fold 4 completed
Training with fold 5 started


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.6982542	best: 0.6982542 (0)	total: 97.9ms	remaining: 1h 21m 35s
3000:	test: 0.7668403	best: 0.7668403 (3000)	total: 3m 52s	remaining: 1h 43s
6000:	test: 0.7699701	best: 0.7699701 (6000)	total: 7m 39s	remaining: 56m 8s
9000:	test: 0.7712286	best: 0.7712292 (8998)	total: 11m 26s	remaining: 52m 4s
12000:	test: 0.7718011	best: 0.7718080 (11991)	total: 15m 12s	remaining: 48m 9s
bestTest = 0.7719076235
bestIteration = 12307
Shrink model to first 12308 iterations.
Training with fold 5 completed
Training with fold 6 started


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.6995560	best: 0.6995560 (0)	total: 97ms	remaining: 1h 20m 50s
3000:	test: 0.7690539	best: 0.7690539 (3000)	total: 3m 51s	remaining: 1h 23s
6000:	test: 0.7723064	best: 0.7723064 (6000)	total: 7m 37s	remaining: 55m 56s
9000:	test: 0.7734149	best: 0.7734304 (8939)	total: 11m 24s	remaining: 51m 55s
12000:	test: 0.7739914	best: 0.7739940 (11999)	total: 15m 9s	remaining: 48m
bestTest = 0.7741402667
bestIteration = 13623
Shrink model to first 13624 iterations.
Training with fold 6 completed
Training with fold 7 started


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.6937320	best: 0.6937320 (0)	total: 89.2ms	remaining: 1h 14m 17s
3000:	test: 0.7659358	best: 0.7659358 (3000)	total: 3m 55s	remaining: 1h 1m 28s
6000:	test: 0.7691247	best: 0.7691247 (6000)	total: 7m 44s	remaining: 56m 47s
9000:	test: 0.7705277	best: 0.7705308 (8925)	total: 11m 32s	remaining: 52m 34s
12000:	test: 0.7710350	best: 0.7710475 (11997)	total: 15m 20s	remaining: 48m 33s
bestTest = 0.7711033174
bestIteration = 12389
Shrink model to first 12390 iterations.
Training with fold 7 completed
Training with fold 8 started


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.6970061	best: 0.6970061 (0)	total: 92.5ms	remaining: 1h 17m 3s
3000:	test: 0.7679106	best: 0.7679106 (3000)	total: 3m 52s	remaining: 1h 44s
6000:	test: 0.7714381	best: 0.7714381 (6000)	total: 7m 39s	remaining: 56m 11s
9000:	test: 0.7731013	best: 0.7731048 (8945)	total: 11m 26s	remaining: 52m 5s
bestTest = 0.7735800383
bestIteration = 11450
Shrink model to first 11451 iterations.
Training with fold 8 completed
Training with fold 9 started


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.7018877	best: 0.7018877 (0)	total: 90.4ms	remaining: 1h 15m 18s
3000:	test: 0.7692718	best: 0.7692718 (3000)	total: 3m 56s	remaining: 1h 1m 39s
6000:	test: 0.7720436	best: 0.7720486 (5990)	total: 7m 44s	remaining: 56m 42s
9000:	test: 0.7733269	best: 0.7733354 (8987)	total: 11m 32s	remaining: 52m 32s
12000:	test: 0.7737363	best: 0.7737443 (11843)	total: 15m 19s	remaining: 48m 30s
bestTest = 0.7738373235
bestIteration = 12764
Shrink model to first 12765 iterations.
Training with fold 9 completed
Training with fold 10 started


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.6998400	best: 0.6998400 (0)	total: 99.5ms	remaining: 1h 22m 54s
3000:	test: 0.7707384	best: 0.7707384 (3000)	total: 3m 54s	remaining: 1h 1m 11s
6000:	test: 0.7740199	best: 0.7740204 (5994)	total: 7m 48s	remaining: 57m 14s
9000:	test: 0.7753062	best: 0.7753119 (8997)	total: 11m 38s	remaining: 53m
12000:	test: 0.7756587	best: 0.7756658 (11993)	total: 15m 26s	remaining: 48m 54s
15000:	test: 0.7760047	best: 0.7760159 (14967)	total: 19m 14s	remaining: 44m 53s
bestTest = 0.7760159445
bestIteration = 14967
Shrink model to first 14968 iterations.
Training with fold 10 completed


In [21]:
# Pair every training id with its out-of-fold ranker score.
oof_catboost_ranker = pd.DataFrame(
    dict(id=boosting_features_train["id"].values, score=oof)
)

oof_catboost_ranker

Unnamed: 0,id,score
0,0,-2.354863
1,1,0.094452
2,2,0.942868
3,3,-1.067343
4,4,-0.305224
...,...,...
2999995,2999995,-0.248722
2999996,2999996,-1.402858
2999997,2999997,-0.661003
2999998,2999998,1.049500


In [22]:
# Test-set score = mean ranker output over the fold models.
# Each model is evaluated once: the same prediction feeds both the running
# mean and the per-model list. The feature slice is identical for every model,
# so it is built once instead of inside the loop.
test_features = boosting_features_test[feature_cols]
n_models = len(models_catboost_ranker)

score = np.zeros(len(boosting_features_test))

preds = []  # per-model test predictions, in fold order

for model in tqdm.tqdm_notebook(models_catboost_ranker):
    fold_pred = model.predict(test_features)
    score += fold_pred / n_models
    preds.append(fold_pred)

# The slice is only needed for inference; free it.
del test_features

submission_catboost_ranker = pd.DataFrame({
    "id": boosting_features_test["id"].values,
    "score_mean": score,
})

submission_catboost_ranker

  0%|          | 0/10 [00:00<?, ?it/s]

Unnamed: 0,id,score_mean
0,3000000,-0.382256
1,3000001,-0.613684
2,3000002,-0.912539
3,3000003,-0.472359
4,3000004,-2.029135
...,...,...
499995,3499995,-0.256360
499996,3499996,-1.409817
499997,3499997,-0.054107
499998,3499998,-0.728791


In [23]:
# Persist the out-of-fold ranker scores; the DataFrame index is not written.
oof_catboost_ranker.to_csv("oof_catboost_ranker_train.csv", index=False)

In [24]:
# Persist the averaged test scores of the ranker; the DataFrame index is not written.
submission_catboost_ranker.to_csv("catboost_ranker_test.csv", index=False)

In [27]:
# Attach the true label to each out-of-fold ranker score by id, then compute ROC AUC.
oof_catboost_ranker_target = pd.merge(
    oof_catboost_ranker,
    boosting_features_train[['id', 'flag']],
    on='id',
)
roc_auc_score(
    oof_catboost_ranker_target['flag'],
    oof_catboost_ranker_target['score'],
)

0.7745149424486496

### **LightGBM**:

Теперь обучим классификатор LightGBM (baseline от организаторов)

In [49]:
import lightgbm as lgb

In [46]:
# NOTE(review): `meta_with_boosting_train` is not created by any cell visible in
# this notebook — confirm where it comes from before a Restart & Run All.
feature_cols = meta_with_boosting_train.columns.tolist()

feature_cols.remove("flag")
# Keep only the first 840 of the remaining columns.
del feature_cols[840:]
len(feature_cols)

840

In [51]:
# Label column name and the label vector handed to the CV splitter.
# NOTE(review): `meta_with_boosting_train` is not created by any cell visible in
# this notebook — confirm where it comes from.
target = 'flag'
targets = meta_with_boosting_train[target].values

In [54]:
# 10-fold stratified cross-validation: one LightGBM classifier per fold.
# Produces:
#   oof         - out-of-fold probability of class 1 for every training row
#   train_preds - in-fold probability of class 1, averaged over the folds in
#                 which the row was part of the training split
#   models_lgbm - the fitted fold models, in fold order
# NOTE(review): `meta_with_boosting_train` is not created by any cell visible in
# this notebook — confirm where it comes from before a Restart & Run All.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=777)

n_rows = len(meta_with_boosting_train)
oof = np.zeros(n_rows)
train_preds = np.zeros(n_rows)

models_lgbm = []

tree_params = dict(
    objective="binary",
    metric="auc",
    learning_rate=0.05,
    max_depth=5,
    reg_lambda=1,
    num_leaves=64,
    n_jobs=-1,
    n_estimators=2000,
    random_state=42,
    class_weight="balanced",
    colsample_bytree=0.8,
)

for fold_, (train_idx, val_idx) in enumerate(cv.split(meta_with_boosting_train, targets), start=1):
    print(f"Training with fold {fold_} started")

    train = meta_with_boosting_train.iloc[train_idx]
    val = meta_with_boosting_train.iloc[val_idx]

    lgb_model = lgb.LGBMClassifier(**tree_params)

    # NOTE(review): recent LightGBM releases expect early stopping and logging
    # to be passed as callbacks rather than these fit() keywords — confirm
    # against the installed version.
    lgb_model.fit(
        train[feature_cols],
        train.flag.values,
        eval_set=[(val[feature_cols], val.flag.values)],
        early_stopping_rounds=50,
        verbose=50,
    )
    oof[val_idx] = lgb_model.predict_proba(val[feature_cols])[:, 1]
    # Every row is in the training split of n_splits - 1 folds, hence the divisor.
    train_preds[train_idx] += lgb_model.predict_proba(train[feature_cols])[:, 1] / (cv.n_splits - 1)
    models_lgbm.append(lgb_model)
    print(f"Training with fold {fold_} completed")

Training with fold 1 started
Training until validation scores don't improve for 50 rounds
[50]	valid_0's auc: 0.745776
[100]	valid_0's auc: 0.756355
[150]	valid_0's auc: 0.761408
[200]	valid_0's auc: 0.764264
[250]	valid_0's auc: 0.766087
[300]	valid_0's auc: 0.767554
[350]	valid_0's auc: 0.768567
[400]	valid_0's auc: 0.769375
[450]	valid_0's auc: 0.769986
[500]	valid_0's auc: 0.770606
[550]	valid_0's auc: 0.771013
[600]	valid_0's auc: 0.771311
[650]	valid_0's auc: 0.771614
[700]	valid_0's auc: 0.771854
[750]	valid_0's auc: 0.772089
[800]	valid_0's auc: 0.772383
[850]	valid_0's auc: 0.772532
[900]	valid_0's auc: 0.7726
[950]	valid_0's auc: 0.772718
[1000]	valid_0's auc: 0.772855
[1050]	valid_0's auc: 0.773047
[1100]	valid_0's auc: 0.77313
[1150]	valid_0's auc: 0.773104
[1200]	valid_0's auc: 0.773198
[1250]	valid_0's auc: 0.773285
[1300]	valid_0's auc: 0.773332
Early stopping, best iteration is:
[1293]	valid_0's auc: 0.773354
Training with fold 1 completed
Training with fold 2 started
T

[950]	valid_0's auc: 0.77097
[1000]	valid_0's auc: 0.771144
[1050]	valid_0's auc: 0.771114
Early stopping, best iteration is:
[1028]	valid_0's auc: 0.771162
Training with fold 9 completed
Training with fold 10 started
Training until validation scores don't improve for 50 rounds
[50]	valid_0's auc: 0.74528
[100]	valid_0's auc: 0.755411
[150]	valid_0's auc: 0.76039
[200]	valid_0's auc: 0.763254
[250]	valid_0's auc: 0.765029
[300]	valid_0's auc: 0.766389
[350]	valid_0's auc: 0.767487
[400]	valid_0's auc: 0.76853
[450]	valid_0's auc: 0.769351
[500]	valid_0's auc: 0.769917
[550]	valid_0's auc: 0.770344
[600]	valid_0's auc: 0.77071
[650]	valid_0's auc: 0.771008
[700]	valid_0's auc: 0.771225
[750]	valid_0's auc: 0.771423
[800]	valid_0's auc: 0.771494
[850]	valid_0's auc: 0.771634
[900]	valid_0's auc: 0.771704
[950]	valid_0's auc: 0.771847
[1000]	valid_0's auc: 0.771926
[1050]	valid_0's auc: 0.772083
[1100]	valid_0's auc: 0.772134
[1150]	valid_0's auc: 0.772195
Early stopping, best iteration i

In [55]:
# Pair every training id with its out-of-fold LightGBM score.
oof_lgbm = pd.DataFrame(
    dict(id=meta_with_boosting_train["id"].values, score=oof)
)

oof_lgbm

Unnamed: 0,id,score
0,0.0,0.102570
1,1.0,0.499298
2,2.0,0.724953
3,3.0,0.277353
4,4.0,0.465288
...,...,...
2999995,2999995.0,0.550031
2999996,2999996.0,0.146658
2999997,2999997.0,0.383864
2999998,2999998.0,0.792638


In [80]:
# Test-set predictions of the LightGBM fold models.
# Output columns: id, score_mean (arithmetic mean), model_1..model_N (one per
# fold model) and score_gmean (geometric mean of the per-model columns).
# Each model is evaluated once and the feature slice is built once; the
# per-model columns are generated from the model list instead of being
# hard-coded for exactly ten models.
# NOTE(review): `meta_with_boosting_test` is not created by any cell visible in
# this notebook — confirm where it comes from before a Restart & Run All.
test_features = meta_with_boosting_test[feature_cols]
n_models = len(models_lgbm)

score = np.zeros(len(meta_with_boosting_test))

preds = []  # per-model positive-class probabilities, in fold order

for model in tqdm.tqdm_notebook(models_lgbm):
    fold_pred = model.predict_proba(test_features)[:, 1]
    score += fold_pred / n_models
    preds.append(fold_pred)

# The slice is only needed for inference; free it.
del test_features

model_cols = [f"model_{i}" for i in range(1, n_models + 1)]

submission_lgbm = pd.DataFrame({
    "id": meta_with_boosting_test["id"].values,
    "score_mean": score,
    **dict(zip(model_cols, preds)),
})
# Geometric mean across models, computed row-wise (axis 0 of the stacked columns).
submission_lgbm['score_gmean'] = gmean([submission_lgbm[col] for col in model_cols])
submission_lgbm

  0%|          | 0/10 [00:00<?, ?it/s]

Unnamed: 0,id,score_mean,model_1,model_2,model_3,model_4,model_5,model_6,model_7,model_8,model_9,model_10,score_gmean
0,3000000,0.461170,0.520470,0.463620,0.454860,0.459729,0.475377,0.447907,0.422493,0.439244,0.501072,0.426932,0.460251
1,3000001,0.521450,0.510326,0.576220,0.398382,0.601790,0.515808,0.495763,0.500498,0.502062,0.622158,0.491498,0.517789
2,3000002,0.356626,0.324261,0.346225,0.331190,0.350787,0.342694,0.348188,0.380959,0.401180,0.372001,0.368770,0.355927
3,3000003,0.405711,0.379881,0.432934,0.428106,0.441112,0.396106,0.396394,0.397975,0.418002,0.363565,0.403035,0.405048
4,3000004,0.128247,0.101692,0.130747,0.132447,0.134502,0.111597,0.153484,0.134613,0.130582,0.117850,0.134955,0.127488
...,...,...,...,...,...,...,...,...,...,...,...,...,...
499995,3499995,0.423252,0.378573,0.412401,0.425967,0.438962,0.453632,0.456748,0.409777,0.401783,0.416007,0.438665,0.422613
499996,3499996,0.353055,0.407812,0.354397,0.279484,0.313549,0.347681,0.351419,0.428831,0.350757,0.349664,0.346951,0.350819
499997,3499997,0.617200,0.612353,0.655621,0.484239,0.634376,0.637479,0.594690,0.672297,0.573916,0.658028,0.649001,0.614712
499998,3499998,0.332947,0.338355,0.368419,0.342134,0.368325,0.305611,0.334918,0.324716,0.349863,0.297251,0.299875,0.332026


In [72]:
# Persist the out-of-fold LightGBM scores; the DataFrame index is not written.
oof_lgbm.to_csv("oof_lgbm_train.csv", index=False)

In [81]:
# Persist the LightGBM test predictions; the DataFrame index is not written.
submission_lgbm.to_csv("lgbm_test.csv", index=False)

Далее предсказания моделей на основе градиентного бустинга объединим в ансамбль с предсказаниями нейросетевых моделей.