In [26]:
import os
import re
import joblib
import scipy
import multiprocessing
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import xgboost
import lightgbm
from catboost import CatBoostClassifier
from sklearn import ensemble
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectPercentile
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

%matplotlib inline

In [2]:
features = joblib.load("features.gz")
labels = joblib.load("labels.gz")
x_train, x_valid, y_train, y_valid = train_test_split(features, labels, test_size=0.3, random_state=417)

In [14]:
# xgboost
xgb_model = xgboost.XGBClassifier(n_jobs=multiprocessing.cpu_count(), random_state=2020)

# Hyperparameters
n_estimators = [int(x) for x in np.linspace(100, 1000, 100)]
max_depth = [int(x) for x in np.linspace(5, 15, 10)]
gamma = [float(x) for x in np.linspace(0, 1, 10)]
learning_rate = [0.1, 0.01, 0.001]
param_grid = dict(
    n_estimators=n_estimators, 
    max_depth=max_depth, 
    gamma=gamma, 
    learning_rate=learning_rate)

# Random search
rand_search = RandomizedSearchCV(
    xgb_model, 
    param_grid, 
    scoring="roc_auc_ovr", 
    cv=ShuffleSplit(n_splits=10, test_size=0.3, random_state=417), 
    n_jobs=multiprocessing.cpu_count(), 
    verbose=1)
rand_result = rand_search.fit(x_train, y_train)
print(rand_result.best_params_)
xgb_model = xgboost.XGBClassifier(**rand_result.best_params_)
xgb_model.fit(x_train, y_train)

# Evaluate step
y_pred_xgb = xgb_model.predict(x_valid)
y_prob_xgb = xgb_model.predict_proba(x_valid)
accuracy = metrics.accuracy_score(y_valid, y_pred_xgb)
roc_auc_score = metrics.roc_auc_score(y_valid, y_prob_xgb[:, 1])
print("Accuracy: {:.4f}".format(accuracy))
print("ROC AUC: {:.4f}".format(roc_auc_score))
print("Confusion matrix: \n{}".format(metrics.confusion_matrix(y_valid, y_pred_xgb)))

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.2s


{'n_estimators': 190, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 0.1111111111111111}
Accuracy: 0.9118
ROC AUC: 0.8000
Confusion matrix: 
[[30  0]
 [ 3  1]]


[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    2.4s finished


In [15]:
# LGBM
lgb_model = lightgbm.LGBMClassifier(n_jobs=multiprocessing.cpu_count(), random_state=417)

# Hyperparameters
boosting_type = ["gbdt", "dart", "goss", "rf"]
n_estimators = [int(x) for x in np.linspace(100, 1000, 100)]
max_depth = [int(x) for x in np.linspace(5, 15, 10)]
num_leaves = [int(x) for x in np.linspace(1, 50, 10)]
learning_rate = [float(x) for x in np.linspace(0.001, 0.1, 10)]
feature_fraction = [float(x) for x in np.linspace(0, 1, 10)]
min_data_in_leaf = [int(x) for x in np.linspace(1, 50, 10)]
param_grid = dict(
    boosting_type=boosting_type, 
    n_estimators=n_estimators, 
    max_depth=max_depth, 
    num_leaves=num_leaves, 
    learning_rate=learning_rate, 
    feature_fraction=feature_fraction, 
    min_data_in_leaf=min_data_in_leaf)

# Random search
rand_search = RandomizedSearchCV(
    lgb_model, 
    param_grid, 
    scoring="roc_auc_ovr", 
    cv=ShuffleSplit(n_splits=10, test_size=0.3, random_state=417), 
    n_jobs=multiprocessing.cpu_count(), 
    verbose=1)
rand_result = rand_search.fit(x_train, y_train)
print(rand_result.best_params_)
lgb_model = lightgbm.LGBMClassifier(**rand_result.best_params_)
lgb_model.fit(x_train, y_train)

# Evaluate step
y_pred_lgb = lgb_model.predict(x_valid)
y_prob_lgb = lgb_model.predict_proba(x_valid)
accuracy = metrics.accuracy_score(y_valid, y_pred_lgb)
roc_auc_score = metrics.roc_auc_score(y_valid, y_prob_lgb[:, 1])
print("Accuracy: {:.4f}".format(accuracy))
print("ROC AUC: {:.4f}".format(roc_auc_score))
print("Confusion matrix: \n{}".format(metrics.confusion_matrix(y_valid, y_pred_lgb)))

[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.


Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.2s


{'num_leaves': 39, 'n_estimators': 236, 'min_data_in_leaf': 17, 'max_depth': 5, 'learning_rate': 0.045000000000000005, 'feature_fraction': 0.4444444444444444, 'boosting_type': 'dart'}
Accuracy: 0.8824
ROC AUC: 0.8750
Confusion matrix: 
[[30  0]
 [ 4  0]]


[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.8s finished


In [19]:
# CatBoost
cat_model = CatBoostClassifier(random_state=417)

# Hyperparameters
depth = [int(x) for x in np.linspace(3, 36, 12)]
learning_rate = [float(x) for x in np.linspace(0.001, 0.1, 10)]
param_grid = dict(
    depth=depth, 
    learning_rate=learning_rate)

# Random search
rand_search = RandomizedSearchCV(
    cat_model, 
    param_grid, 
    scoring="roc_auc_ovr", 
    cv=ShuffleSplit(n_splits=10, test_size=0.3, random_state=417), 
    n_jobs=multiprocessing.cpu_count(), 
    verbose=1)
rand_result = rand_search.fit(x_train, y_train)
print(rand_result.best_params_)
cat_model = CatBoostClassifier(**rand_result.best_params_)
cat_model.fit(x_train, y_train)

# Evaluate step
y_prob_cat = cat_model.predict_proba(x_valid)
roc_auc_score = metrics.roc_auc_score(y_valid, y_prob_cat[:, 1])
print("ROC AUC: {:.4f}".format(roc_auc_score))

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:   26.3s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:  1.9min finished


0:	learn: 0.6263873	total: 1.7ms	remaining: 1.7s
1:	learn: 0.5554880	total: 2.7ms	remaining: 1.35s
2:	learn: 0.5114323	total: 3.71ms	remaining: 1.23s
3:	learn: 0.4709837	total: 5.11ms	remaining: 1.27s
4:	learn: 0.4225326	total: 6.01ms	remaining: 1.2s
5:	learn: 0.3913344	total: 6.93ms	remaining: 1.15s
6:	learn: 0.3596995	total: 8.08ms	remaining: 1.15s
7:	learn: 0.3244920	total: 8.81ms	remaining: 1.09s
8:	learn: 0.2944660	total: 9.7ms	remaining: 1.07s
9:	learn: 0.2694018	total: 10.7ms	remaining: 1.06s
10:	learn: 0.2496477	total: 12ms	remaining: 1.08s
11:	learn: 0.2359909	total: 13.1ms	remaining: 1.08s
12:	learn: 0.2268046	total: 14.3ms	remaining: 1.09s
13:	learn: 0.2090198	total: 15.2ms	remaining: 1.07s
14:	learn: 0.1975737	total: 16.2ms	remaining: 1.06s
15:	learn: 0.1869647	total: 17.1ms	remaining: 1.05s
16:	learn: 0.1776866	total: 18.2ms	remaining: 1.05s
17:	learn: 0.1704838	total: 19.1ms	remaining: 1.04s
18:	learn: 0.1599894	total: 20.1ms	remaining: 1.04s
19:	learn: 0.1558415	total: 2

161:	learn: 0.0069307	total: 163ms	remaining: 842ms
162:	learn: 0.0068754	total: 164ms	remaining: 841ms
163:	learn: 0.0068152	total: 165ms	remaining: 839ms
164:	learn: 0.0067672	total: 166ms	remaining: 838ms
165:	learn: 0.0067090	total: 167ms	remaining: 837ms
166:	learn: 0.0066570	total: 168ms	remaining: 836ms
167:	learn: 0.0066113	total: 169ms	remaining: 836ms
168:	learn: 0.0065555	total: 170ms	remaining: 834ms
169:	learn: 0.0065059	total: 171ms	remaining: 833ms
170:	learn: 0.0064519	total: 172ms	remaining: 832ms
171:	learn: 0.0064089	total: 173ms	remaining: 832ms
172:	learn: 0.0063564	total: 174ms	remaining: 831ms
173:	learn: 0.0063097	total: 175ms	remaining: 830ms
174:	learn: 0.0062686	total: 176ms	remaining: 828ms
175:	learn: 0.0062182	total: 177ms	remaining: 827ms
176:	learn: 0.0061736	total: 178ms	remaining: 827ms
177:	learn: 0.0061344	total: 179ms	remaining: 826ms
178:	learn: 0.0060860	total: 180ms	remaining: 826ms
179:	learn: 0.0060433	total: 181ms	remaining: 824ms
180:	learn: 

322:	learn: 0.0029779	total: 328ms	remaining: 687ms
323:	learn: 0.0029688	total: 329ms	remaining: 686ms
324:	learn: 0.0029589	total: 330ms	remaining: 685ms
325:	learn: 0.0029462	total: 331ms	remaining: 683ms
326:	learn: 0.0029364	total: 332ms	remaining: 682ms
327:	learn: 0.0029276	total: 333ms	remaining: 681ms
328:	learn: 0.0029150	total: 334ms	remaining: 680ms
329:	learn: 0.0029055	total: 335ms	remaining: 679ms
330:	learn: 0.0028968	total: 336ms	remaining: 678ms
331:	learn: 0.0028874	total: 337ms	remaining: 677ms
332:	learn: 0.0028752	total: 338ms	remaining: 676ms
333:	learn: 0.0028668	total: 339ms	remaining: 676ms
334:	learn: 0.0028575	total: 340ms	remaining: 674ms
335:	learn: 0.0028456	total: 341ms	remaining: 674ms
336:	learn: 0.0028373	total: 342ms	remaining: 674ms
337:	learn: 0.0028283	total: 343ms	remaining: 673ms
338:	learn: 0.0028165	total: 344ms	remaining: 671ms
339:	learn: 0.0028084	total: 345ms	remaining: 670ms
340:	learn: 0.0027996	total: 346ms	remaining: 669ms
341:	learn: 

484:	learn: 0.0019190	total: 492ms	remaining: 522ms
485:	learn: 0.0019155	total: 493ms	remaining: 521ms
486:	learn: 0.0019099	total: 494ms	remaining: 520ms
487:	learn: 0.0019066	total: 495ms	remaining: 519ms
488:	learn: 0.0019031	total: 495ms	remaining: 518ms
489:	learn: 0.0018999	total: 496ms	remaining: 517ms
490:	learn: 0.0018961	total: 497ms	remaining: 515ms
491:	learn: 0.0018927	total: 498ms	remaining: 514ms
492:	learn: 0.0018872	total: 499ms	remaining: 513ms
493:	learn: 0.0018837	total: 500ms	remaining: 512ms
494:	learn: 0.0018802	total: 501ms	remaining: 511ms
495:	learn: 0.0018771	total: 504ms	remaining: 512ms
496:	learn: 0.0018717	total: 505ms	remaining: 511ms
497:	learn: 0.0018683	total: 506ms	remaining: 510ms
498:	learn: 0.0018652	total: 507ms	remaining: 509ms
499:	learn: 0.0018616	total: 508ms	remaining: 508ms
500:	learn: 0.0018580	total: 509ms	remaining: 507ms
501:	learn: 0.0018547	total: 510ms	remaining: 505ms
502:	learn: 0.0018516	total: 510ms	remaining: 504ms
503:	learn: 

652:	learn: 0.0014444	total: 658ms	remaining: 350ms
653:	learn: 0.0014426	total: 659ms	remaining: 348ms
654:	learn: 0.0014405	total: 659ms	remaining: 347ms
655:	learn: 0.0014387	total: 660ms	remaining: 346ms
656:	learn: 0.0014354	total: 661ms	remaining: 345ms
657:	learn: 0.0014334	total: 662ms	remaining: 344ms
658:	learn: 0.0014316	total: 663ms	remaining: 343ms
659:	learn: 0.0014292	total: 664ms	remaining: 342ms
660:	learn: 0.0014271	total: 665ms	remaining: 341ms
661:	learn: 0.0014251	total: 666ms	remaining: 340ms
662:	learn: 0.0014233	total: 667ms	remaining: 339ms
663:	learn: 0.0014213	total: 668ms	remaining: 338ms
664:	learn: 0.0014195	total: 668ms	remaining: 337ms
665:	learn: 0.0014164	total: 669ms	remaining: 336ms
666:	learn: 0.0014143	total: 670ms	remaining: 335ms
667:	learn: 0.0014126	total: 671ms	remaining: 333ms
668:	learn: 0.0014103	total: 672ms	remaining: 332ms
669:	learn: 0.0014083	total: 673ms	remaining: 331ms
670:	learn: 0.0014063	total: 674ms	remaining: 330ms
671:	learn: 

830:	learn: 0.0011399	total: 824ms	remaining: 168ms
831:	learn: 0.0011388	total: 825ms	remaining: 167ms
832:	learn: 0.0011367	total: 826ms	remaining: 166ms
833:	learn: 0.0011353	total: 827ms	remaining: 165ms
834:	learn: 0.0011340	total: 828ms	remaining: 164ms
835:	learn: 0.0011327	total: 829ms	remaining: 163ms
836:	learn: 0.0011314	total: 830ms	remaining: 162ms
837:	learn: 0.0011303	total: 831ms	remaining: 161ms
838:	learn: 0.0011291	total: 832ms	remaining: 160ms
839:	learn: 0.0011280	total: 833ms	remaining: 159ms
840:	learn: 0.0011269	total: 834ms	remaining: 158ms
841:	learn: 0.0011249	total: 835ms	remaining: 157ms
842:	learn: 0.0011236	total: 835ms	remaining: 156ms
843:	learn: 0.0011224	total: 836ms	remaining: 155ms
844:	learn: 0.0011213	total: 837ms	remaining: 154ms
845:	learn: 0.0011202	total: 838ms	remaining: 153ms
846:	learn: 0.0011182	total: 839ms	remaining: 152ms
847:	learn: 0.0011168	total: 841ms	remaining: 151ms
848:	learn: 0.0011155	total: 842ms	remaining: 150ms
849:	learn: 

998:	learn: 0.0009507	total: 989ms	remaining: 990us
999:	learn: 0.0009498	total: 990ms	remaining: 0us
{'learning_rate': 0.07800000000000001, 'depth': 6}
0:	learn: 0.6413561	total: 1.89ms	remaining: 1.89s
1:	learn: 0.5860372	total: 2.75ms	remaining: 1.37s
2:	learn: 0.5267140	total: 3.63ms	remaining: 1.21s
3:	learn: 0.4768024	total: 4.52ms	remaining: 1.13s
4:	learn: 0.4344620	total: 5.39ms	remaining: 1.07s
5:	learn: 0.3937539	total: 6.32ms	remaining: 1.05s
6:	learn: 0.3715205	total: 7.2ms	remaining: 1.02s
7:	learn: 0.3441795	total: 8.03ms	remaining: 995ms
8:	learn: 0.3197404	total: 8.88ms	remaining: 978ms
9:	learn: 0.2984591	total: 9.72ms	remaining: 962ms
10:	learn: 0.2712507	total: 10.6ms	remaining: 951ms
11:	learn: 0.2574280	total: 11.4ms	remaining: 940ms
12:	learn: 0.2386543	total: 12.3ms	remaining: 932ms
13:	learn: 0.2272565	total: 12.9ms	remaining: 911ms
14:	learn: 0.2119048	total: 13.8ms	remaining: 907ms
15:	learn: 0.1980488	total: 14.7ms	remaining: 904ms
16:	learn: 0.1800641	total

159:	learn: 0.0069525	total: 140ms	remaining: 733ms
160:	learn: 0.0068930	total: 141ms	remaining: 733ms
161:	learn: 0.0068411	total: 141ms	remaining: 732ms
162:	learn: 0.0067821	total: 142ms	remaining: 731ms
163:	learn: 0.0067258	total: 143ms	remaining: 730ms
164:	learn: 0.0066687	total: 144ms	remaining: 729ms
165:	learn: 0.0066201	total: 145ms	remaining: 728ms
166:	learn: 0.0065648	total: 146ms	remaining: 728ms
167:	learn: 0.0065120	total: 147ms	remaining: 727ms
168:	learn: 0.0064584	total: 148ms	remaining: 726ms
169:	learn: 0.0064128	total: 148ms	remaining: 725ms
170:	learn: 0.0063609	total: 150ms	remaining: 725ms
171:	learn: 0.0063113	total: 151ms	remaining: 725ms
172:	learn: 0.0062679	total: 151ms	remaining: 724ms
173:	learn: 0.0062181	total: 153ms	remaining: 724ms
174:	learn: 0.0061709	total: 153ms	remaining: 723ms
175:	learn: 0.0061226	total: 154ms	remaining: 722ms
176:	learn: 0.0060817	total: 155ms	remaining: 722ms
177:	learn: 0.0060348	total: 156ms	remaining: 721ms
178:	learn: 

340:	learn: 0.0027438	total: 305ms	remaining: 589ms
341:	learn: 0.0027350	total: 306ms	remaining: 588ms
342:	learn: 0.0027271	total: 306ms	remaining: 587ms
343:	learn: 0.0027162	total: 307ms	remaining: 586ms
344:	learn: 0.0027076	total: 308ms	remaining: 585ms
345:	learn: 0.0026999	total: 309ms	remaining: 584ms
346:	learn: 0.0026892	total: 310ms	remaining: 584ms
347:	learn: 0.0026807	total: 311ms	remaining: 583ms
348:	learn: 0.0026732	total: 312ms	remaining: 582ms
349:	learn: 0.0026627	total: 313ms	remaining: 581ms
350:	learn: 0.0026544	total: 314ms	remaining: 580ms
351:	learn: 0.0026470	total: 314ms	remaining: 579ms
352:	learn: 0.0026367	total: 315ms	remaining: 578ms
353:	learn: 0.0026282	total: 316ms	remaining: 577ms
354:	learn: 0.0026201	total: 317ms	remaining: 576ms
355:	learn: 0.0026129	total: 318ms	remaining: 575ms
356:	learn: 0.0026049	total: 319ms	remaining: 574ms
357:	learn: 0.0025949	total: 320ms	remaining: 573ms
358:	learn: 0.0025879	total: 321ms	remaining: 572ms
359:	learn: 

526:	learn: 0.0016987	total: 470ms	remaining: 422ms
527:	learn: 0.0016957	total: 471ms	remaining: 421ms
528:	learn: 0.0016927	total: 472ms	remaining: 420ms
529:	learn: 0.0016884	total: 472ms	remaining: 419ms
530:	learn: 0.0016854	total: 473ms	remaining: 418ms
531:	learn: 0.0016824	total: 474ms	remaining: 417ms
532:	learn: 0.0016790	total: 475ms	remaining: 416ms
533:	learn: 0.0016760	total: 476ms	remaining: 415ms
534:	learn: 0.0016731	total: 477ms	remaining: 414ms
535:	learn: 0.0016689	total: 478ms	remaining: 414ms
536:	learn: 0.0016660	total: 479ms	remaining: 413ms
537:	learn: 0.0016626	total: 479ms	remaining: 412ms
538:	learn: 0.0016597	total: 480ms	remaining: 411ms
539:	learn: 0.0016569	total: 481ms	remaining: 410ms
540:	learn: 0.0016527	total: 482ms	remaining: 409ms
541:	learn: 0.0016499	total: 483ms	remaining: 408ms
542:	learn: 0.0016470	total: 484ms	remaining: 407ms
543:	learn: 0.0016429	total: 484ms	remaining: 406ms
544:	learn: 0.0016401	total: 485ms	remaining: 405ms
545:	learn: 

705:	learn: 0.0012517	total: 634ms	remaining: 264ms
706:	learn: 0.0012501	total: 635ms	remaining: 263ms
707:	learn: 0.0012485	total: 636ms	remaining: 262ms
708:	learn: 0.0012461	total: 637ms	remaining: 261ms
709:	learn: 0.0012445	total: 639ms	remaining: 261ms
710:	learn: 0.0012429	total: 640ms	remaining: 260ms
711:	learn: 0.0012406	total: 640ms	remaining: 259ms
712:	learn: 0.0012390	total: 641ms	remaining: 258ms
713:	learn: 0.0012374	total: 642ms	remaining: 257ms
714:	learn: 0.0012355	total: 643ms	remaining: 256ms
715:	learn: 0.0012339	total: 644ms	remaining: 256ms
716:	learn: 0.0012324	total: 645ms	remaining: 255ms
717:	learn: 0.0012300	total: 646ms	remaining: 254ms
718:	learn: 0.0012285	total: 647ms	remaining: 253ms
719:	learn: 0.0012269	total: 648ms	remaining: 252ms
720:	learn: 0.0012253	total: 649ms	remaining: 251ms
721:	learn: 0.0012230	total: 650ms	remaining: 250ms
722:	learn: 0.0012215	total: 651ms	remaining: 249ms
723:	learn: 0.0012200	total: 652ms	remaining: 248ms
724:	learn: 

884:	learn: 0.0009924	total: 799ms	remaining: 104ms
885:	learn: 0.0009909	total: 802ms	remaining: 103ms
886:	learn: 0.0009899	total: 803ms	remaining: 102ms
887:	learn: 0.0009889	total: 804ms	remaining: 101ms
888:	learn: 0.0009874	total: 805ms	remaining: 101ms
889:	learn: 0.0009864	total: 806ms	remaining: 99.6ms
890:	learn: 0.0009854	total: 807ms	remaining: 98.7ms
891:	learn: 0.0009844	total: 808ms	remaining: 97.8ms
892:	learn: 0.0009829	total: 809ms	remaining: 96.9ms
893:	learn: 0.0009819	total: 810ms	remaining: 96ms
894:	learn: 0.0009807	total: 811ms	remaining: 95.2ms
895:	learn: 0.0009797	total: 812ms	remaining: 94.3ms
896:	learn: 0.0009788	total: 813ms	remaining: 93.4ms
897:	learn: 0.0009778	total: 814ms	remaining: 92.5ms
898:	learn: 0.0009763	total: 815ms	remaining: 91.5ms
899:	learn: 0.0009753	total: 816ms	remaining: 90.6ms
900:	learn: 0.0009743	total: 817ms	remaining: 89.7ms
901:	learn: 0.0009729	total: 818ms	remaining: 88.8ms
902:	learn: 0.0009719	total: 819ms	remaining: 87.9ms


In [27]:
# AdaBoost
ada_model = ensemble.AdaBoostClassifier(random_state=417)

# Hyperparameters
n_estimators = [int(x) for x in np.linspace(50, 500, 20)]
learning_rate = [float(x) for x in np.linspace(0.001, 0.1, 10)]
param_grid = dict(
    n_estimators=n_estimators, 
    learning_rate=learning_rate)

# Random search
rand_search = RandomizedSearchCV(
    ada_model, 
    param_grid, 
    scoring="roc_auc_ovr", 
    cv=ShuffleSplit(n_splits=10, test_size=0.3, random_state=417), 
    n_jobs=multiprocessing.cpu_count(), 
    verbose=1)
rand_result = rand_search.fit(x_train, y_train)
print(rand_result.best_params_)
ada_model = CatBoostClassifier(**rand_result.best_params_)
ada_model.fit(x_train, y_train)

# Evaluate step
y_prob_ada = ada_model.predict_proba(x_valid)
roc_auc_score = metrics.roc_auc_score(y_valid, y_prob_ada[:, 1])
print("ROC AUC: {:.4f}".format(roc_auc_score))

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    2.1s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    4.1s finished


{'n_estimators': 144, 'learning_rate': 0.012}
0:	learn: 0.6848490	total: 1.6ms	remaining: 230ms
1:	learn: 0.6748219	total: 2.63ms	remaining: 187ms
2:	learn: 0.6628532	total: 3.58ms	remaining: 169ms
3:	learn: 0.6513232	total: 4.6ms	remaining: 161ms
4:	learn: 0.6399004	total: 5.61ms	remaining: 156ms
5:	learn: 0.6280575	total: 6.65ms	remaining: 153ms
6:	learn: 0.6191552	total: 7.65ms	remaining: 150ms
7:	learn: 0.6071972	total: 8.6ms	remaining: 146ms
8:	learn: 0.5983784	total: 9.6ms	remaining: 144ms
9:	learn: 0.5885573	total: 10.5ms	remaining: 141ms
10:	learn: 0.5765317	total: 11.6ms	remaining: 140ms
11:	learn: 0.5678873	total: 12.3ms	remaining: 136ms
12:	learn: 0.5606358	total: 13.4ms	remaining: 135ms
13:	learn: 0.5525262	total: 14.5ms	remaining: 134ms
14:	learn: 0.5454872	total: 15.6ms	remaining: 134ms
15:	learn: 0.5358112	total: 17ms	remaining: 136ms
16:	learn: 0.5270571	total: 18.1ms	remaining: 135ms
17:	learn: 0.5191414	total: 19.1ms	remaining: 134ms
18:	learn: 0.5130141	total: 20.3ms

In [28]:
# ExtraTrees
etc_model = ensemble.ExtraTreesClassifier(random_state=417)

# Hyperparameters
n_estimators = [int(x) for x in np.linspace(100, 1000, 20)]
max_depth = [int(x) for x in np.linspace(1, 10, 10)]
min_samples_split = [float(x) for x in np.linspace(0.1, 1.0, 10)]
min_samples_leaf = [float(x) for x in np.linspace(0.1, 1.0, 10)]
min_weight_fraction_leaf = [float(x) for x in np.linspace(0.1, 1.0, 10)]
bootstrap = [True, False]
param_grid = dict(
    n_estimators=n_estimators, 
    max_depth=max_depth, 
    min_samples_split=min_samples_split, 
    min_samples_leaf=min_samples_leaf, 
    min_weight_fraction_leaf=min_weight_fraction_leaf, 
    bootstrap=bootstrap)

# Random search
rand_search = RandomizedSearchCV(
    etc_model, 
    param_grid, 
    scoring="roc_auc_ovr", 
    cv=ShuffleSplit(n_splits=10, test_size=0.3, random_state=417), 
    n_jobs=multiprocessing.cpu_count(), 
    verbose=1)
rand_result = rand_search.fit(x_train, y_train)
print(rand_result.best_params_)
etc_model = ensemble.ExtraTreesClassifier(**rand_result.best_params_)
etc_model.fit(x_train, y_train)

# Evaluate step
y_prob_ada = etc_model.predict_proba(x_valid)
roc_auc_score = metrics.roc_auc_score(y_valid, y_prob_ada[:, 1])
print("ROC AUC: {:.4f}".format(roc_auc_score))

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    1.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    2.6s finished


{'n_estimators': 573, 'min_weight_fraction_leaf': 0.2, 'min_samples_split': 0.6, 'min_samples_leaf': 0.2, 'max_depth': 3, 'bootstrap': False}
ROC AUC: 0.7333


In [32]:
# RandomForest
rfc_model = ensemble.RandomForestClassifier(random_state=417)

# Hyperparameters
n_estimators = [int(x) for x in np.linspace(50, 500, 20)]
max_depth = [int(x) for x in np.linspace(1, 15, 15)]
bootstrap = [True, False]
param_grid = dict(
    n_estimators=n_estimators, 
    max_depth=max_depth, 
    bootstrap=bootstrap)

# Random search
rand_search = RandomizedSearchCV(
    rfc_model, 
    param_grid, 
    scoring="roc_auc_ovr", 
    cv=ShuffleSplit(n_splits=10, test_size=0.3, random_state=417), 
    n_jobs=multiprocessing.cpu_count(), 
    verbose=1)
rand_result = rand_search.fit(x_train, y_train)
print(rand_result.best_params_)
rfc_model = ensemble.RandomForestClassifier(**rand_result.best_params_)
rfc_model.fit(x_train, y_train)

# Evaluate step
y_prob_ada = rfc_model.predict_proba(x_valid)
roc_auc_score = metrics.roc_auc_score(y_valid, y_prob_ada[:, 1])
print("ROC AUC: {:.4f}".format(roc_auc_score))

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.5s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    2.5s finished


{'n_estimators': 239, 'max_depth': 6, 'bootstrap': True}
ROC AUC: 0.7583


In [20]:
def transform_data(df, train_data=True, scale=None):
    
    # Get features and label
    if train_data: 
        # Remove "LOCKHART EUGENE E" and "THE TRAVEL AGENCY IN THE PARK" samples
        df = df.drop(index=107, axis=0)
        df = df.drop(index=65, axis=0)
        labels = df["poi"]
        features = df.drop(["poi"], axis=1)
    else: 
        labels = None
        features = df.copy()
    
    # Adding the proportions
    features.loc[:, "long_term_incentive_p"] = features.loc[:, "long_term_incentive"]/features.loc[:, "total_payments"]
    features.loc[:, "restricted_stock_deferred_p"] = features.loc[:, "restricted_stock_deferred"]/features.loc[:, "total_stock_value"]
    features.loc[:, "from_this_person_to_poi_p"] = features.loc[:, "from_this_person_to_poi"]/features.loc[:, "from_messages"]
    
    # Removing the original values.
    features.drop("long_term_incentive", axis=1)
    features.drop("restricted_stock_deferred", axis=1)
    features.drop("from_this_person_to_poi", axis=1)
    
    # Select numerical feature (float or integer)
    num_features = []
    for dtype, feature in zip(features.dtypes, features.columns):
        if dtype == 'float64' or dtype == 'int64':
            num_features.append(feature)
            
    # Fill nan
    imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
    imp_median.fit(features[num_features])
    features = imp_median.transform(features[num_features])
    
    # Scale the data
    if scale is not None:
        if scale.lower() == "mmencoder":
            MMEncoder = MinMaxScaler()
            features = MMEncoder.fit_transform(features)
        elif scale.lower() == "ssencoder":
            SSEncoder = StandardScaler()
            features = SSEncoder.fit_transform(features)
        else:
            print("Only MinMaxScaler() and StandardScaler() are available.")
            
    return features, labels

In [33]:
# Read in test data
test = pd.read_csv("test_features.csv")
x_test, _ = transform_data(test, train_data=False, scale=None)

# Pass in model
test_prob_lgm = lgb_model.predict_proba(x_test)[:, 1]
test_prob_xgb = xgb_model.predict_proba(x_test)[:, 1]
test_prob_cat = cat_model.predict_proba(x_test)[:, 1]
test_prob_ada = ada_model.predict_proba(x_test)[:, 1]
test_prob_etc = etc_model.predict_proba(x_test)[:, 1]
test_prob_rfc = rfc_model.predict_proba(x_test)[:, 1]

blending_prob = (test_prob_lgm + test_prob_xgb + test_prob_cat + test_prob_ada + test_prob_etc + test_prob_rfc) / 6
blending_prob

array([0.24573132, 0.47396729, 0.42198456, 0.27149549, 0.33225716,
       0.27760093, 0.58161235, 0.18032961, 0.19492022, 0.17152753,
       0.1470812 , 0.07799676, 0.21105173, 0.14886049, 0.40212473,
       0.0888731 , 0.16970471, 0.17428717, 0.17135917, 0.15151738,
       0.60757388, 0.44428219, 0.10650918, 0.20200498, 0.14257767,
       0.18388576, 0.10499283, 0.13926394, 0.16133436, 0.39432935,
       0.09609113, 0.18552039, 0.4110926 ])

In [36]:
from mlxtend.classifier import StackingClassifier

meta_estimator = ensemble.GradientBoostingClassifier(
    tol=100, subsample=0.70, n_estimators=50, max_features='sqrt', max_depth=4, learning_rate=0.3)
stacking_model = StackingClassifier(
    classifiers=[lgb_model, xgb_model, cat_model, ada_model, etc_model, rfc_model], 
    meta_classifier=meta_estimator, 
    use_probas=True, 
    average_probas=False)
stacking_model.fit(x_train, y_train)
stacking_prob = stacking_model.predict_proba(x_test)[:, 1]

submit = pd.read_csv("sample_submission.csv")
submit['poi'] = stacking_prob
submit.to_csv('Submission_20200625_stacking.csv', index=False)

0:	learn: 0.6413561	total: 3ms	remaining: 2.99s
1:	learn: 0.5860372	total: 4.06ms	remaining: 2.03s
2:	learn: 0.5267140	total: 5.08ms	remaining: 1.69s
3:	learn: 0.4768024	total: 6.17ms	remaining: 1.54s
4:	learn: 0.4344620	total: 7.58ms	remaining: 1.51s
5:	learn: 0.3937539	total: 8.57ms	remaining: 1.42s
6:	learn: 0.3715205	total: 9.57ms	remaining: 1.36s
7:	learn: 0.3441795	total: 10.6ms	remaining: 1.31s
8:	learn: 0.3197404	total: 11.6ms	remaining: 1.27s
9:	learn: 0.2984591	total: 12.7ms	remaining: 1.26s
10:	learn: 0.2712507	total: 13.8ms	remaining: 1.24s
11:	learn: 0.2574280	total: 14.8ms	remaining: 1.22s
12:	learn: 0.2386543	total: 15.8ms	remaining: 1.2s
13:	learn: 0.2272565	total: 16.6ms	remaining: 1.17s
14:	learn: 0.2119048	total: 17.7ms	remaining: 1.16s
15:	learn: 0.1980488	total: 18.9ms	remaining: 1.16s
16:	learn: 0.1800641	total: 19.9ms	remaining: 1.15s
17:	learn: 0.1665077	total: 21ms	remaining: 1.15s
18:	learn: 0.1601451	total: 22ms	remaining: 1.14s
19:	learn: 0.1562083	total: 23

316:	learn: 0.0029847	total: 325ms	remaining: 701ms
317:	learn: 0.0029742	total: 326ms	remaining: 699ms
318:	learn: 0.0029649	total: 327ms	remaining: 698ms
319:	learn: 0.0029521	total: 328ms	remaining: 697ms
320:	learn: 0.0029419	total: 329ms	remaining: 696ms
321:	learn: 0.0029328	total: 330ms	remaining: 695ms
322:	learn: 0.0029203	total: 331ms	remaining: 694ms
323:	learn: 0.0029113	total: 332ms	remaining: 692ms
324:	learn: 0.0028990	total: 333ms	remaining: 691ms
325:	learn: 0.0028886	total: 334ms	remaining: 690ms
326:	learn: 0.0028788	total: 335ms	remaining: 689ms
327:	learn: 0.0028700	total: 336ms	remaining: 688ms
328:	learn: 0.0028603	total: 337ms	remaining: 687ms
329:	learn: 0.0028484	total: 338ms	remaining: 686ms
330:	learn: 0.0028399	total: 339ms	remaining: 685ms
331:	learn: 0.0028300	total: 340ms	remaining: 683ms
332:	learn: 0.0028205	total: 341ms	remaining: 682ms
333:	learn: 0.0028122	total: 342ms	remaining: 681ms
334:	learn: 0.0028006	total: 343ms	remaining: 680ms
335:	learn: 

492:	learn: 0.0018242	total: 491ms	remaining: 505ms
493:	learn: 0.0018192	total: 492ms	remaining: 504ms
494:	learn: 0.0018158	total: 493ms	remaining: 503ms
495:	learn: 0.0018123	total: 494ms	remaining: 502ms
496:	learn: 0.0018074	total: 495ms	remaining: 501ms
497:	learn: 0.0018040	total: 496ms	remaining: 500ms
498:	learn: 0.0018006	total: 496ms	remaining: 498ms
499:	learn: 0.0017972	total: 497ms	remaining: 497ms
500:	learn: 0.0017924	total: 498ms	remaining: 496ms
501:	learn: 0.0017884	total: 499ms	remaining: 495ms
502:	learn: 0.0017851	total: 500ms	remaining: 494ms
503:	learn: 0.0017818	total: 501ms	remaining: 493ms
504:	learn: 0.0017785	total: 502ms	remaining: 492ms
505:	learn: 0.0017737	total: 503ms	remaining: 491ms
506:	learn: 0.0017704	total: 504ms	remaining: 490ms
507:	learn: 0.0017672	total: 505ms	remaining: 489ms
508:	learn: 0.0017624	total: 506ms	remaining: 488ms
509:	learn: 0.0017592	total: 507ms	remaining: 487ms
510:	learn: 0.0017560	total: 508ms	remaining: 486ms
511:	learn: 

669:	learn: 0.0013218	total: 657ms	remaining: 324ms
670:	learn: 0.0013196	total: 658ms	remaining: 323ms
671:	learn: 0.0013178	total: 659ms	remaining: 322ms
672:	learn: 0.0013161	total: 660ms	remaining: 321ms
673:	learn: 0.0013134	total: 661ms	remaining: 320ms
674:	learn: 0.0013116	total: 662ms	remaining: 319ms
675:	learn: 0.0013098	total: 663ms	remaining: 318ms
676:	learn: 0.0013072	total: 664ms	remaining: 317ms
677:	learn: 0.0013054	total: 664ms	remaining: 316ms
678:	learn: 0.0013037	total: 665ms	remaining: 315ms
679:	learn: 0.0013019	total: 666ms	remaining: 314ms
680:	learn: 0.0012993	total: 667ms	remaining: 313ms
681:	learn: 0.0012976	total: 668ms	remaining: 312ms
682:	learn: 0.0012958	total: 669ms	remaining: 311ms
683:	learn: 0.0012933	total: 670ms	remaining: 310ms
684:	learn: 0.0012916	total: 671ms	remaining: 309ms
685:	learn: 0.0012898	total: 672ms	remaining: 308ms
686:	learn: 0.0012873	total: 673ms	remaining: 307ms
687:	learn: 0.0012853	total: 674ms	remaining: 306ms
688:	learn: 

846:	learn: 0.0010377	total: 821ms	remaining: 148ms
847:	learn: 0.0010365	total: 822ms	remaining: 147ms
848:	learn: 0.0010354	total: 823ms	remaining: 146ms
849:	learn: 0.0010338	total: 824ms	remaining: 145ms
850:	learn: 0.0010327	total: 825ms	remaining: 144ms
851:	learn: 0.0010316	total: 826ms	remaining: 144ms
852:	learn: 0.0010303	total: 827ms	remaining: 143ms
853:	learn: 0.0010292	total: 828ms	remaining: 142ms
854:	learn: 0.0010281	total: 829ms	remaining: 141ms
855:	learn: 0.0010265	total: 830ms	remaining: 140ms
856:	learn: 0.0010254	total: 831ms	remaining: 139ms
857:	learn: 0.0010243	total: 832ms	remaining: 138ms
858:	learn: 0.0010232	total: 833ms	remaining: 137ms
859:	learn: 0.0010216	total: 834ms	remaining: 136ms
860:	learn: 0.0010205	total: 835ms	remaining: 135ms
861:	learn: 0.0010195	total: 836ms	remaining: 134ms
862:	learn: 0.0010179	total: 837ms	remaining: 133ms
863:	learn: 0.0010168	total: 838ms	remaining: 132ms
864:	learn: 0.0010157	total: 839ms	remaining: 131ms
865:	learn: 

In [34]:
submit = pd.read_csv("sample_submission.csv")
submit['poi'] = blending_prob
submit.to_csv('Submission_20200625_blending_6.csv', index=False)

In [37]:
# https://www.kaggle.com/c/2020-ml100marathon-midterm/submissions
! kaggle competitions submit -c 2020-ml100marathon-midterm -f Submission_20200625_stacking.csv -m "Stacking 6 models"

Successfully submitted to 2020_ML100Marathon Midterm



  0%|          | 0.00/1.23k [00:00<?, ?B/s]
100%|██████████| 1.23k/1.23k [00:04<00:00, 289B/s]
