In [26]:
import os
import re
import joblib
import scipy
import multiprocessing
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import xgboost
import lightgbm
from catboost import CatBoostClassifier
from sklearn import ensemble
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectPercentile
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

%matplotlib inline

In [2]:
features = joblib.load("features.gz")
labels = joblib.load("labels.gz")
x_train, x_valid, y_train, y_valid = train_test_split(features, labels, test_size=0.3, random_state=417)

In [39]:
# xgboost
xgb_model = xgboost.XGBClassifier(n_jobs=multiprocessing.cpu_count(), random_state=2020)

# Hyperparameters
n_estimators = [int(x) for x in np.linspace(100, 1000, 100)]
max_depth = [int(x) for x in np.linspace(5, 15, 10)]
gamma = [float(x) for x in np.linspace(0, 1, 10)]
learning_rate = [0.1, 0.01, 0.001]
param_grid = dict(
    n_estimators=n_estimators, 
    max_depth=max_depth, 
    gamma=gamma, 
    learning_rate=learning_rate)

# Random search
rand_search = RandomizedSearchCV(
    xgb_model, 
    param_grid, 
    scoring="roc_auc_ovr", 
    cv=ShuffleSplit(n_splits=10, test_size=0.3, random_state=417), 
    n_jobs=multiprocessing.cpu_count(), 
    verbose=1)
rand_result = rand_search.fit(x_train, y_train)
print(rand_result.best_params_)
xgb_model = xgboost.XGBClassifier(**rand_result.best_params_)
xgb_model.fit(features, labels)

# Evaluate step
y_pred_xgb = xgb_model.predict(x_valid)
y_prob_xgb = xgb_model.predict_proba(x_valid)
accuracy = metrics.accuracy_score(y_valid, y_pred_xgb)
roc_auc_score = metrics.roc_auc_score(y_valid, y_prob_xgb[:, 1])
print("Accuracy: {:.4f}".format(accuracy))
print("ROC AUC: {:.4f}".format(roc_auc_score))
print("Confusion matrix: \n{}".format(metrics.confusion_matrix(y_valid, y_pred_xgb)))

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    2.0s


{'n_estimators': 245, 'max_depth': 8, 'learning_rate': 0.01, 'gamma': 0.3333333333333333}
Accuracy: 0.9412
ROC AUC: 1.0000
Confusion matrix: 
[[30  0]
 [ 2  2]]


[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    5.2s finished


In [40]:
# LGBM
lgb_model = lightgbm.LGBMClassifier(n_jobs=multiprocessing.cpu_count(), random_state=417)

# Hyperparameters
boosting_type = ["gbdt", "dart", "goss", "rf"]
n_estimators = [int(x) for x in np.linspace(100, 1000, 100)]
max_depth = [int(x) for x in np.linspace(5, 15, 10)]
num_leaves = [int(x) for x in np.linspace(1, 50, 10)]
learning_rate = [float(x) for x in np.linspace(0.001, 0.1, 10)]
feature_fraction = [float(x) for x in np.linspace(0, 1, 10)]
min_data_in_leaf = [int(x) for x in np.linspace(1, 50, 10)]
param_grid = dict(
    boosting_type=boosting_type, 
    n_estimators=n_estimators, 
    max_depth=max_depth, 
    num_leaves=num_leaves, 
    learning_rate=learning_rate, 
    feature_fraction=feature_fraction, 
    min_data_in_leaf=min_data_in_leaf)

# Random search
rand_search = RandomizedSearchCV(
    lgb_model, 
    param_grid, 
    scoring="roc_auc_ovr", 
    cv=ShuffleSplit(n_splits=10, test_size=0.3, random_state=417), 
    n_jobs=multiprocessing.cpu_count(), 
    verbose=1)
rand_result = rand_search.fit(x_train, y_train)
print(rand_result.best_params_)
lgb_model = lightgbm.LGBMClassifier(**rand_result.best_params_)
lgb_model.fit(features, labels)

# Evaluate step
y_pred_lgb = lgb_model.predict(x_valid)
y_prob_lgb = lgb_model.predict_proba(x_valid)
accuracy = metrics.accuracy_score(y_valid, y_pred_lgb)
roc_auc_score = metrics.roc_auc_score(y_valid, y_prob_lgb[:, 1])
print("Accuracy: {:.4f}".format(accuracy))
print("ROC AUC: {:.4f}".format(roc_auc_score))
print("Confusion matrix: \n{}".format(metrics.confusion_matrix(y_valid, y_pred_lgb)))

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.2s


{'num_leaves': 33, 'n_estimators': 145, 'min_data_in_leaf': 17, 'max_depth': 13, 'learning_rate': 0.05600000000000001, 'feature_fraction': 0.2222222222222222, 'boosting_type': 'goss'}
Accuracy: 0.8824
ROC AUC: 0.9750
Confusion matrix: 
[[30  0]
 [ 4  0]]


[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    3.3s finished


In [41]:
# CatBoost
cat_model = CatBoostClassifier(random_state=417)

# Hyperparameters
depth = [int(x) for x in np.linspace(3, 36, 12)]
learning_rate = [float(x) for x in np.linspace(0.001, 0.1, 10)]
param_grid = dict(
    depth=depth, 
    learning_rate=learning_rate)

# Random search
rand_search = RandomizedSearchCV(
    cat_model, 
    param_grid, 
    scoring="roc_auc_ovr", 
    cv=ShuffleSplit(n_splits=10, test_size=0.3, random_state=417), 
    n_jobs=multiprocessing.cpu_count(), 
    verbose=1)
rand_result = rand_search.fit(x_train, y_train)
print(rand_result.best_params_)
cat_model = CatBoostClassifier(**rand_result.best_params_)
cat_model.fit(features, labels)

# Evaluate step
y_prob_cat = cat_model.predict_proba(x_valid)
roc_auc_score = metrics.roc_auc_score(y_valid, y_prob_cat[:, 1])
print("ROC AUC: {:.4f}".format(roc_auc_score))

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:  1.1min
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:  4.3min finished


0:	learn: 0.6089301	total: 1.94ms	remaining: 1.93s
1:	learn: 0.5236944	total: 2.91ms	remaining: 1.45s
2:	learn: 0.4741909	total: 3.88ms	remaining: 1.29s
3:	learn: 0.4133482	total: 4.84ms	remaining: 1.2s
4:	learn: 0.3584739	total: 5.81ms	remaining: 1.16s
5:	learn: 0.3302343	total: 6.79ms	remaining: 1.12s
6:	learn: 0.2992518	total: 7.71ms	remaining: 1.09s
7:	learn: 0.2639500	total: 8.81ms	remaining: 1.09s
8:	learn: 0.2476488	total: 9.81ms	remaining: 1.08s
9:	learn: 0.2269945	total: 10.7ms	remaining: 1.06s
10:	learn: 0.2072916	total: 11.7ms	remaining: 1.05s
11:	learn: 0.1998007	total: 12.7ms	remaining: 1.04s
12:	learn: 0.1868219	total: 13.7ms	remaining: 1.04s
13:	learn: 0.1731189	total: 14.6ms	remaining: 1.03s
14:	learn: 0.1593475	total: 15.6ms	remaining: 1.02s
15:	learn: 0.1501147	total: 16.5ms	remaining: 1.01s
16:	learn: 0.1437343	total: 17.5ms	remaining: 1.01s
17:	learn: 0.1356181	total: 18.6ms	remaining: 1.01s
18:	learn: 0.1244705	total: 19.5ms	remaining: 1s
19:	learn: 0.1147591	total

161:	learn: 0.0048956	total: 160ms	remaining: 825ms
162:	learn: 0.0048579	total: 160ms	remaining: 824ms
163:	learn: 0.0048258	total: 162ms	remaining: 823ms
164:	learn: 0.0047868	total: 162ms	remaining: 822ms
165:	learn: 0.0047523	total: 163ms	remaining: 821ms
166:	learn: 0.0047186	total: 164ms	remaining: 820ms
167:	learn: 0.0046788	total: 165ms	remaining: 818ms
168:	learn: 0.0046489	total: 166ms	remaining: 817ms
169:	learn: 0.0046103	total: 167ms	remaining: 816ms
170:	learn: 0.0045813	total: 168ms	remaining: 815ms
171:	learn: 0.0045484	total: 169ms	remaining: 814ms
172:	learn: 0.0045174	total: 170ms	remaining: 813ms
173:	learn: 0.0044897	total: 171ms	remaining: 813ms
174:	learn: 0.0044535	total: 172ms	remaining: 812ms
175:	learn: 0.0044238	total: 173ms	remaining: 811ms
176:	learn: 0.0043887	total: 174ms	remaining: 810ms
177:	learn: 0.0043625	total: 175ms	remaining: 809ms
178:	learn: 0.0043340	total: 176ms	remaining: 808ms
179:	learn: 0.0043002	total: 177ms	remaining: 807ms
180:	learn: 

328:	learn: 0.0021533	total: 327ms	remaining: 667ms
329:	learn: 0.0021444	total: 328ms	remaining: 666ms
330:	learn: 0.0021382	total: 329ms	remaining: 665ms
331:	learn: 0.0021316	total: 330ms	remaining: 664ms
332:	learn: 0.0021255	total: 331ms	remaining: 663ms
333:	learn: 0.0021168	total: 332ms	remaining: 662ms
334:	learn: 0.0021098	total: 333ms	remaining: 661ms
335:	learn: 0.0021038	total: 334ms	remaining: 660ms
336:	learn: 0.0020974	total: 335ms	remaining: 659ms
337:	learn: 0.0020910	total: 336ms	remaining: 658ms
338:	learn: 0.0020825	total: 339ms	remaining: 662ms
339:	learn: 0.0020767	total: 340ms	remaining: 661ms
340:	learn: 0.0020705	total: 341ms	remaining: 660ms
341:	learn: 0.0020622	total: 342ms	remaining: 659ms
342:	learn: 0.0020565	total: 343ms	remaining: 658ms
343:	learn: 0.0020509	total: 344ms	remaining: 657ms
344:	learn: 0.0020427	total: 345ms	remaining: 656ms
345:	learn: 0.0020362	total: 346ms	remaining: 654ms
346:	learn: 0.0020301	total: 347ms	remaining: 653ms
347:	learn: 

491:	learn: 0.0013953	total: 495ms	remaining: 511ms
492:	learn: 0.0013928	total: 496ms	remaining: 510ms
493:	learn: 0.0013886	total: 497ms	remaining: 509ms
494:	learn: 0.0013861	total: 498ms	remaining: 508ms
495:	learn: 0.0013835	total: 499ms	remaining: 507ms
496:	learn: 0.0013794	total: 500ms	remaining: 506ms
497:	learn: 0.0013769	total: 501ms	remaining: 505ms
498:	learn: 0.0013744	total: 502ms	remaining: 504ms
499:	learn: 0.0013719	total: 503ms	remaining: 503ms
500:	learn: 0.0013679	total: 504ms	remaining: 502ms
501:	learn: 0.0013649	total: 505ms	remaining: 501ms
502:	learn: 0.0013625	total: 506ms	remaining: 500ms
503:	learn: 0.0013600	total: 507ms	remaining: 499ms
504:	learn: 0.0013576	total: 508ms	remaining: 498ms
505:	learn: 0.0013537	total: 509ms	remaining: 496ms
506:	learn: 0.0013512	total: 509ms	remaining: 495ms
507:	learn: 0.0013488	total: 510ms	remaining: 494ms
508:	learn: 0.0013460	total: 511ms	remaining: 493ms
509:	learn: 0.0013436	total: 512ms	remaining: 492ms
510:	learn: 

662:	learn: 0.0010006	total: 662ms	remaining: 336ms
663:	learn: 0.0009980	total: 663ms	remaining: 335ms
664:	learn: 0.0009967	total: 664ms	remaining: 334ms
665:	learn: 0.0009954	total: 664ms	remaining: 333ms
666:	learn: 0.0009928	total: 665ms	remaining: 332ms
667:	learn: 0.0009915	total: 666ms	remaining: 331ms
668:	learn: 0.0009902	total: 667ms	remaining: 330ms
669:	learn: 0.0009889	total: 668ms	remaining: 329ms
670:	learn: 0.0009864	total: 669ms	remaining: 328ms
671:	learn: 0.0009851	total: 670ms	remaining: 327ms
672:	learn: 0.0009838	total: 671ms	remaining: 326ms
673:	learn: 0.0009813	total: 672ms	remaining: 325ms
674:	learn: 0.0009800	total: 673ms	remaining: 324ms
675:	learn: 0.0009787	total: 674ms	remaining: 323ms
676:	learn: 0.0009775	total: 675ms	remaining: 322ms
677:	learn: 0.0009750	total: 676ms	remaining: 321ms
678:	learn: 0.0009737	total: 677ms	remaining: 320ms
679:	learn: 0.0009725	total: 678ms	remaining: 319ms
680:	learn: 0.0009700	total: 679ms	remaining: 318ms
681:	learn: 

827:	learn: 0.0007837	total: 828ms	remaining: 172ms
828:	learn: 0.0007829	total: 829ms	remaining: 171ms
829:	learn: 0.0007821	total: 830ms	remaining: 170ms
830:	learn: 0.0007805	total: 831ms	remaining: 169ms
831:	learn: 0.0007797	total: 832ms	remaining: 168ms
832:	learn: 0.0007789	total: 833ms	remaining: 167ms
833:	learn: 0.0007781	total: 834ms	remaining: 166ms
834:	learn: 0.0007773	total: 835ms	remaining: 165ms
835:	learn: 0.0007757	total: 836ms	remaining: 164ms
836:	learn: 0.0007749	total: 837ms	remaining: 163ms
837:	learn: 0.0007741	total: 838ms	remaining: 162ms
838:	learn: 0.0007725	total: 839ms	remaining: 161ms
839:	learn: 0.0007718	total: 840ms	remaining: 160ms
840:	learn: 0.0007710	total: 841ms	remaining: 159ms
841:	learn: 0.0007702	total: 842ms	remaining: 158ms
842:	learn: 0.0007686	total: 843ms	remaining: 157ms
843:	learn: 0.0007679	total: 844ms	remaining: 156ms
844:	learn: 0.0007671	total: 845ms	remaining: 155ms
845:	learn: 0.0007663	total: 846ms	remaining: 154ms
846:	learn: 

998:	learn: 0.0006592	total: 993ms	remaining: 994us
999:	learn: 0.0006587	total: 994ms	remaining: 0us
{'learning_rate': 0.1, 'depth': 6}
0:	learn: 0.5776576	total: 3.6ms	remaining: 3.6s
1:	learn: 0.5168438	total: 4.63ms	remaining: 2.31s
2:	learn: 0.4489954	total: 5.59ms	remaining: 1.86s
3:	learn: 0.4079521	total: 6.65ms	remaining: 1.66s
4:	learn: 0.3615129	total: 7.6ms	remaining: 1.51s
5:	learn: 0.3367726	total: 8.53ms	remaining: 1.41s
6:	learn: 0.3050082	total: 9.48ms	remaining: 1.34s
7:	learn: 0.2856642	total: 10.6ms	remaining: 1.31s
8:	learn: 0.2657344	total: 11.6ms	remaining: 1.27s
9:	learn: 0.2410923	total: 12.6ms	remaining: 1.24s
10:	learn: 0.2218036	total: 13.6ms	remaining: 1.22s
11:	learn: 0.2020149	total: 14.6ms	remaining: 1.2s
12:	learn: 0.1868583	total: 15.7ms	remaining: 1.19s
13:	learn: 0.1711604	total: 16.7ms	remaining: 1.18s
14:	learn: 0.1620861	total: 17.7ms	remaining: 1.16s
15:	learn: 0.1543460	total: 18.7ms	remaining: 1.15s
16:	learn: 0.1460481	total: 19.7ms	remaining:

307:	learn: 0.0025185	total: 311ms	remaining: 699ms
308:	learn: 0.0025125	total: 312ms	remaining: 698ms
309:	learn: 0.0025063	total: 313ms	remaining: 697ms
310:	learn: 0.0024993	total: 314ms	remaining: 696ms
311:	learn: 0.0024930	total: 315ms	remaining: 695ms
312:	learn: 0.0024816	total: 316ms	remaining: 694ms
313:	learn: 0.0024758	total: 317ms	remaining: 693ms
314:	learn: 0.0024723	total: 318ms	remaining: 692ms
315:	learn: 0.0024654	total: 319ms	remaining: 691ms
316:	learn: 0.0024594	total: 320ms	remaining: 690ms
317:	learn: 0.0024483	total: 321ms	remaining: 688ms
318:	learn: 0.0024376	total: 322ms	remaining: 687ms
319:	learn: 0.0024319	total: 323ms	remaining: 686ms
320:	learn: 0.0024264	total: 324ms	remaining: 685ms
321:	learn: 0.0024199	total: 325ms	remaining: 684ms
322:	learn: 0.0024140	total: 326ms	remaining: 683ms
323:	learn: 0.0024032	total: 327ms	remaining: 682ms
324:	learn: 0.0024000	total: 328ms	remaining: 681ms
325:	learn: 0.0023866	total: 329ms	remaining: 680ms
326:	learn: 

476:	learn: 0.0015491	total: 478ms	remaining: 524ms
477:	learn: 0.0015466	total: 479ms	remaining: 523ms
478:	learn: 0.0015439	total: 480ms	remaining: 522ms
479:	learn: 0.0015388	total: 481ms	remaining: 521ms
480:	learn: 0.0015364	total: 482ms	remaining: 520ms
481:	learn: 0.0015339	total: 483ms	remaining: 519ms
482:	learn: 0.0015313	total: 484ms	remaining: 518ms
483:	learn: 0.0015289	total: 485ms	remaining: 517ms
484:	learn: 0.0015249	total: 486ms	remaining: 516ms
485:	learn: 0.0015223	total: 487ms	remaining: 515ms
486:	learn: 0.0015180	total: 488ms	remaining: 514ms
487:	learn: 0.0015148	total: 489ms	remaining: 513ms
488:	learn: 0.0015103	total: 490ms	remaining: 512ms
489:	learn: 0.0015070	total: 491ms	remaining: 511ms
490:	learn: 0.0015025	total: 492ms	remaining: 510ms
491:	learn: 0.0014981	total: 493ms	remaining: 509ms
492:	learn: 0.0014938	total: 494ms	remaining: 508ms
493:	learn: 0.0014897	total: 495ms	remaining: 507ms
494:	learn: 0.0014856	total: 496ms	remaining: 506ms
495:	learn: 

643:	learn: 0.0011398	total: 645ms	remaining: 356ms
644:	learn: 0.0011386	total: 646ms	remaining: 355ms
645:	learn: 0.0011366	total: 647ms	remaining: 354ms
646:	learn: 0.0011338	total: 648ms	remaining: 353ms
647:	learn: 0.0011317	total: 648ms	remaining: 352ms
648:	learn: 0.0011304	total: 649ms	remaining: 351ms
649:	learn: 0.0011279	total: 650ms	remaining: 350ms
650:	learn: 0.0011260	total: 651ms	remaining: 349ms
651:	learn: 0.0011248	total: 652ms	remaining: 348ms
652:	learn: 0.0011230	total: 653ms	remaining: 347ms
653:	learn: 0.0011211	total: 654ms	remaining: 346ms
654:	learn: 0.0011180	total: 655ms	remaining: 345ms
655:	learn: 0.0011162	total: 656ms	remaining: 344ms
656:	learn: 0.0011143	total: 657ms	remaining: 343ms
657:	learn: 0.0011131	total: 658ms	remaining: 342ms
658:	learn: 0.0011113	total: 659ms	remaining: 341ms
659:	learn: 0.0011086	total: 660ms	remaining: 340ms
660:	learn: 0.0011062	total: 661ms	remaining: 339ms
661:	learn: 0.0011050	total: 662ms	remaining: 338ms
662:	learn: 

817:	learn: 0.0008919	total: 812ms	remaining: 181ms
818:	learn: 0.0008901	total: 813ms	remaining: 180ms
819:	learn: 0.0008892	total: 814ms	remaining: 179ms
820:	learn: 0.0008877	total: 815ms	remaining: 178ms
821:	learn: 0.0008869	total: 816ms	remaining: 177ms
822:	learn: 0.0008853	total: 817ms	remaining: 176ms
823:	learn: 0.0008845	total: 818ms	remaining: 175ms
824:	learn: 0.0008837	total: 819ms	remaining: 174ms
825:	learn: 0.0008830	total: 820ms	remaining: 173ms
826:	learn: 0.0008814	total: 821ms	remaining: 172ms
827:	learn: 0.0008803	total: 822ms	remaining: 171ms
828:	learn: 0.0008788	total: 823ms	remaining: 170ms
829:	learn: 0.0008781	total: 824ms	remaining: 169ms
830:	learn: 0.0008769	total: 825ms	remaining: 168ms
831:	learn: 0.0008762	total: 826ms	remaining: 167ms
832:	learn: 0.0008750	total: 827ms	remaining: 166ms
833:	learn: 0.0008743	total: 827ms	remaining: 165ms
834:	learn: 0.0008735	total: 828ms	remaining: 164ms
835:	learn: 0.0008721	total: 829ms	remaining: 163ms
836:	learn: 

985:	learn: 0.0007401	total: 980ms	remaining: 13.9ms
986:	learn: 0.0007396	total: 981ms	remaining: 12.9ms
987:	learn: 0.0007394	total: 982ms	remaining: 11.9ms
988:	learn: 0.0007388	total: 983ms	remaining: 10.9ms
989:	learn: 0.0007388	total: 984ms	remaining: 9.94ms
990:	learn: 0.0007375	total: 985ms	remaining: 8.95ms
991:	learn: 0.0007365	total: 986ms	remaining: 7.95ms
992:	learn: 0.0007354	total: 987ms	remaining: 6.96ms
993:	learn: 0.0007348	total: 988ms	remaining: 5.96ms
994:	learn: 0.0007339	total: 989ms	remaining: 4.97ms
995:	learn: 0.0007338	total: 990ms	remaining: 3.98ms
996:	learn: 0.0007336	total: 991ms	remaining: 2.98ms
997:	learn: 0.0007332	total: 992ms	remaining: 1.99ms
998:	learn: 0.0007328	total: 993ms	remaining: 993us
999:	learn: 0.0007325	total: 994ms	remaining: 0us
ROC AUC: 1.0000


In [42]:
# AdaBoost
ada_model = ensemble.AdaBoostClassifier(random_state=417)

# Hyperparameters
n_estimators = [int(x) for x in np.linspace(50, 500, 20)]
learning_rate = [float(x) for x in np.linspace(0.001, 0.1, 10)]
param_grid = dict(
    n_estimators=n_estimators, 
    learning_rate=learning_rate)

# Random search
rand_search = RandomizedSearchCV(
    ada_model, 
    param_grid, 
    scoring="roc_auc_ovr", 
    cv=ShuffleSplit(n_splits=10, test_size=0.3, random_state=417), 
    n_jobs=multiprocessing.cpu_count(), 
    verbose=1)
rand_result = rand_search.fit(x_train, y_train)
print(rand_result.best_params_)
ada_model = CatBoostClassifier(**rand_result.best_params_)
ada_model.fit(features, labels)

# Evaluate step
y_prob_ada = ada_model.predict_proba(x_valid)
roc_auc_score = metrics.roc_auc_score(y_valid, y_prob_ada[:, 1])
print("ROC AUC: {:.4f}".format(roc_auc_score))

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    1.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    2.8s finished


{'n_estimators': 168, 'learning_rate': 0.012}
0:	learn: 0.6779491	total: 1.16ms	remaining: 194ms
1:	learn: 0.6677336	total: 2.37ms	remaining: 197ms
2:	learn: 0.6544449	total: 3.31ms	remaining: 182ms
3:	learn: 0.6440362	total: 4.25ms	remaining: 174ms
4:	learn: 0.6320225	total: 5.51ms	remaining: 180ms
5:	learn: 0.6231484	total: 6.5ms	remaining: 175ms
6:	learn: 0.6122098	total: 7.45ms	remaining: 171ms
7:	learn: 0.6006702	total: 8.41ms	remaining: 168ms
8:	learn: 0.5898845	total: 9.41ms	remaining: 166ms
9:	learn: 0.5784893	total: 10.4ms	remaining: 164ms
10:	learn: 0.5701930	total: 11.3ms	remaining: 161ms
11:	learn: 0.5586532	total: 12.3ms	remaining: 159ms
12:	learn: 0.5481132	total: 13.2ms	remaining: 157ms
13:	learn: 0.5391397	total: 14.1ms	remaining: 155ms
14:	learn: 0.5324676	total: 15.1ms	remaining: 154ms
15:	learn: 0.5255908	total: 16.1ms	remaining: 153ms
16:	learn: 0.5184228	total: 17.1ms	remaining: 152ms
17:	learn: 0.5119651	total: 18.2ms	remaining: 151ms
18:	learn: 0.5031160	total: 1

167:	learn: 0.1274924	total: 163ms	remaining: 0us
ROC AUC: 1.0000


In [43]:
# ExtraTrees
etc_model = ensemble.ExtraTreesClassifier(random_state=417)

# Hyperparameters
n_estimators = [int(x) for x in np.linspace(100, 1000, 20)]
max_depth = [int(x) for x in np.linspace(1, 10, 10)]
min_samples_split = [float(x) for x in np.linspace(0.1, 1.0, 10)]
min_samples_leaf = [float(x) for x in np.linspace(0.1, 1.0, 10)]
min_weight_fraction_leaf = [float(x) for x in np.linspace(0.1, 1.0, 10)]
bootstrap = [True, False]
param_grid = dict(
    n_estimators=n_estimators, 
    max_depth=max_depth, 
    min_samples_split=min_samples_split, 
    min_samples_leaf=min_samples_leaf, 
    min_weight_fraction_leaf=min_weight_fraction_leaf, 
    bootstrap=bootstrap)

# Random search
rand_search = RandomizedSearchCV(
    etc_model, 
    param_grid, 
    scoring="roc_auc_ovr", 
    cv=ShuffleSplit(n_splits=10, test_size=0.3, random_state=417), 
    n_jobs=multiprocessing.cpu_count(), 
    verbose=1)
rand_result = rand_search.fit(x_train, y_train)
print(rand_result.best_params_)
etc_model = ensemble.ExtraTreesClassifier(**rand_result.best_params_)
etc_model.fit(features, labels)

# Evaluate step
y_prob_ada = etc_model.predict_proba(x_valid)
roc_auc_score = metrics.roc_auc_score(y_valid, y_prob_ada[:, 1])
print("ROC AUC: {:.4f}".format(roc_auc_score))

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.7s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    3.2s finished


{'n_estimators': 668, 'min_weight_fraction_leaf': 0.2, 'min_samples_split': 0.2, 'min_samples_leaf': 0.1, 'max_depth': 7, 'bootstrap': True}
ROC AUC: 0.8333


In [44]:
# RandomForest
rfc_model = ensemble.RandomForestClassifier(random_state=417)

# Hyperparameters
n_estimators = [int(x) for x in np.linspace(50, 500, 20)]
max_depth = [int(x) for x in np.linspace(1, 15, 15)]
bootstrap = [True, False]
param_grid = dict(
    n_estimators=n_estimators, 
    max_depth=max_depth, 
    bootstrap=bootstrap)

# Random search
rand_search = RandomizedSearchCV(
    rfc_model, 
    param_grid, 
    scoring="roc_auc_ovr", 
    cv=ShuffleSplit(n_splits=10, test_size=0.3, random_state=417), 
    n_jobs=multiprocessing.cpu_count(), 
    verbose=1)
rand_result = rand_search.fit(x_train, y_train)
print(rand_result.best_params_)
rfc_model = ensemble.RandomForestClassifier(**rand_result.best_params_)
rfc_model.fit(features, labels)

# Evaluate step
y_prob_ada = rfc_model.predict_proba(x_valid)
roc_auc_score = metrics.roc_auc_score(y_valid, y_prob_ada[:, 1])
print("ROC AUC: {:.4f}".format(roc_auc_score))

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.7s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    2.6s finished


{'n_estimators': 239, 'max_depth': 13, 'bootstrap': True}
ROC AUC: 1.0000


In [45]:
def transform_data(df, train_data=True, scale=None):
    
    # Get features and label
    if train_data: 
        # Remove "LOCKHART EUGENE E" and "THE TRAVEL AGENCY IN THE PARK" samples
        df = df.drop(index=107, axis=0)
        df = df.drop(index=65, axis=0)
        labels = df["poi"]
        features = df.drop(["poi"], axis=1)
    else: 
        labels = None
        features = df.copy()
    
    # Adding the proportions
    features.loc[:, "long_term_incentive_p"] = features.loc[:, "long_term_incentive"]/features.loc[:, "total_payments"]
    features.loc[:, "restricted_stock_deferred_p"] = features.loc[:, "restricted_stock_deferred"]/features.loc[:, "total_stock_value"]
    features.loc[:, "from_this_person_to_poi_p"] = features.loc[:, "from_this_person_to_poi"]/features.loc[:, "from_messages"]
    
    # Removing the original values.
    features.drop("long_term_incentive", axis=1)
    features.drop("restricted_stock_deferred", axis=1)
    features.drop("from_this_person_to_poi", axis=1)
    
    # Select numerical feature (float or integer)
    num_features = []
    for dtype, feature in zip(features.dtypes, features.columns):
        if dtype == 'float64' or dtype == 'int64':
            num_features.append(feature)
            
    # Fill nan
    imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
    imp_median.fit(features[num_features])
    features = imp_median.transform(features[num_features])
    
    # Scale the data
    if scale is not None:
        if scale.lower() == "mmencoder":
            MMEncoder = MinMaxScaler()
            features = MMEncoder.fit_transform(features)
        elif scale.lower() == "ssencoder":
            SSEncoder = StandardScaler()
            features = SSEncoder.fit_transform(features)
        else:
            print("Only MinMaxScaler() and StandardScaler() are available.")
            
    return features, labels

In [46]:
# Read in test data
test = pd.read_csv("test_features.csv")
x_test, _ = transform_data(test, train_data=False, scale=None)

# Pass in model
test_prob_lgm = lgb_model.predict_proba(x_test)[:, 1]
test_prob_xgb = xgb_model.predict_proba(x_test)[:, 1]
test_prob_cat = cat_model.predict_proba(x_test)[:, 1]
test_prob_ada = ada_model.predict_proba(x_test)[:, 1]
test_prob_etc = etc_model.predict_proba(x_test)[:, 1]
test_prob_rfc = rfc_model.predict_proba(x_test)[:, 1]

blending_prob = (test_prob_lgm + test_prob_xgb + test_prob_cat + test_prob_ada + test_prob_etc + test_prob_rfc) / 6
blending_prob

array([0.24748089, 0.32547292, 0.43355797, 0.22305747, 0.30807928,
       0.25253251, 0.56677127, 0.16001798, 0.18780673, 0.16977755,
       0.11615459, 0.07356007, 0.16418463, 0.15175585, 0.35736102,
       0.09730931, 0.12847571, 0.23814895, 0.2044116 , 0.15082059,
       0.59502763, 0.47303844, 0.08265327, 0.17890337, 0.13605938,
       0.12877944, 0.10172052, 0.15244786, 0.17443688, 0.35783302,
       0.07193514, 0.15056842, 0.48485681])

In [48]:
submit = pd.read_csv("sample_submission.csv")
submit['poi'] = blending_prob
submit.to_csv('Full_blending_6.csv', index=False)

In [47]:
from mlxtend.classifier import StackingClassifier

meta_estimator = ensemble.GradientBoostingClassifier(
    tol=100, subsample=0.70, n_estimators=50, max_features='sqrt', max_depth=4, learning_rate=0.3)
stacking_model = StackingClassifier(
    classifiers=[lgb_model, xgb_model, cat_model, ada_model, etc_model, rfc_model], 
    meta_classifier=meta_estimator, 
    use_probas=True, 
    average_probas=False)
stacking_model.fit(features, labels)
stacking_prob = stacking_model.predict_proba(x_test)[:, 1]

submit = pd.read_csv("sample_submission.csv")
submit['poi'] = stacking_prob
submit.to_csv('Submission_20200625_stacking.csv', index=False)

0:	learn: 0.5776576	total: 2.77ms	remaining: 2.77s
1:	learn: 0.5168438	total: 3.81ms	remaining: 1.9s
2:	learn: 0.4489954	total: 4.88ms	remaining: 1.62s
3:	learn: 0.4079521	total: 5.95ms	remaining: 1.48s
4:	learn: 0.3615129	total: 6.97ms	remaining: 1.39s
5:	learn: 0.3367726	total: 8.02ms	remaining: 1.33s
6:	learn: 0.3050082	total: 9.13ms	remaining: 1.29s
7:	learn: 0.2856642	total: 10.2ms	remaining: 1.26s
8:	learn: 0.2657344	total: 11.3ms	remaining: 1.25s
9:	learn: 0.2410923	total: 12.4ms	remaining: 1.23s
10:	learn: 0.2218036	total: 13.5ms	remaining: 1.22s
11:	learn: 0.2020149	total: 14.7ms	remaining: 1.21s
12:	learn: 0.1868583	total: 15.9ms	remaining: 1.21s
13:	learn: 0.1711604	total: 17ms	remaining: 1.19s
14:	learn: 0.1620861	total: 18.1ms	remaining: 1.19s
15:	learn: 0.1543460	total: 19.2ms	remaining: 1.18s
16:	learn: 0.1460481	total: 20.3ms	remaining: 1.17s
17:	learn: 0.1420780	total: 21.3ms	remaining: 1.16s
18:	learn: 0.1317598	total: 22.4ms	remaining: 1.15s
19:	learn: 0.1243824	tota

329:	learn: 0.0023582	total: 325ms	remaining: 661ms
330:	learn: 0.0023480	total: 326ms	remaining: 660ms
331:	learn: 0.0023448	total: 328ms	remaining: 659ms
332:	learn: 0.0023396	total: 329ms	remaining: 658ms
333:	learn: 0.0023296	total: 330ms	remaining: 657ms
334:	learn: 0.0023265	total: 331ms	remaining: 656ms
335:	learn: 0.0023214	total: 332ms	remaining: 655ms
336:	learn: 0.0023118	total: 332ms	remaining: 654ms
337:	learn: 0.0023079	total: 333ms	remaining: 653ms
338:	learn: 0.0023020	total: 334ms	remaining: 652ms
339:	learn: 0.0022904	total: 335ms	remaining: 651ms
340:	learn: 0.0022874	total: 336ms	remaining: 650ms
341:	learn: 0.0022777	total: 337ms	remaining: 649ms
342:	learn: 0.0022748	total: 338ms	remaining: 647ms
343:	learn: 0.0022699	total: 339ms	remaining: 646ms
344:	learn: 0.0022604	total: 340ms	remaining: 645ms
345:	learn: 0.0022510	total: 341ms	remaining: 644ms
346:	learn: 0.0022464	total: 342ms	remaining: 643ms
347:	learn: 0.0022413	total: 343ms	remaining: 642ms
348:	learn: 

501:	learn: 0.0014636	total: 493ms	remaining: 489ms
502:	learn: 0.0014614	total: 494ms	remaining: 488ms
503:	learn: 0.0014594	total: 495ms	remaining: 487ms
504:	learn: 0.0014557	total: 496ms	remaining: 486ms
505:	learn: 0.0014536	total: 497ms	remaining: 485ms
506:	learn: 0.0014516	total: 498ms	remaining: 484ms
507:	learn: 0.0014494	total: 499ms	remaining: 483ms
508:	learn: 0.0014464	total: 500ms	remaining: 482ms
509:	learn: 0.0014443	total: 500ms	remaining: 481ms
510:	learn: 0.0014398	total: 501ms	remaining: 480ms
511:	learn: 0.0014377	total: 502ms	remaining: 479ms
512:	learn: 0.0014333	total: 503ms	remaining: 478ms
513:	learn: 0.0014299	total: 504ms	remaining: 477ms
514:	learn: 0.0014279	total: 505ms	remaining: 476ms
515:	learn: 0.0014259	total: 506ms	remaining: 475ms
516:	learn: 0.0014222	total: 507ms	remaining: 474ms
517:	learn: 0.0014201	total: 511ms	remaining: 476ms
518:	learn: 0.0014159	total: 512ms	remaining: 475ms
519:	learn: 0.0014140	total: 513ms	remaining: 474ms
520:	learn: 

672:	learn: 0.0010858	total: 660ms	remaining: 321ms
673:	learn: 0.0010847	total: 661ms	remaining: 320ms
674:	learn: 0.0010830	total: 662ms	remaining: 319ms
675:	learn: 0.0010819	total: 663ms	remaining: 318ms
676:	learn: 0.0010797	total: 664ms	remaining: 317ms
677:	learn: 0.0010769	total: 665ms	remaining: 316ms
678:	learn: 0.0010740	total: 666ms	remaining: 315ms
679:	learn: 0.0010728	total: 666ms	remaining: 314ms
680:	learn: 0.0010718	total: 667ms	remaining: 313ms
681:	learn: 0.0010706	total: 668ms	remaining: 312ms
682:	learn: 0.0010685	total: 669ms	remaining: 311ms
683:	learn: 0.0010658	total: 670ms	remaining: 310ms
684:	learn: 0.0010646	total: 671ms	remaining: 309ms
685:	learn: 0.0010635	total: 672ms	remaining: 308ms
686:	learn: 0.0010613	total: 673ms	remaining: 307ms
687:	learn: 0.0010603	total: 674ms	remaining: 306ms
688:	learn: 0.0010586	total: 675ms	remaining: 305ms
689:	learn: 0.0010576	total: 676ms	remaining: 304ms
690:	learn: 0.0010565	total: 677ms	remaining: 303ms
691:	learn: 

850:	learn: 0.0008562	total: 829ms	remaining: 145ms
851:	learn: 0.0008544	total: 830ms	remaining: 144ms
852:	learn: 0.0008536	total: 831ms	remaining: 143ms
853:	learn: 0.0008531	total: 832ms	remaining: 142ms
854:	learn: 0.0008523	total: 833ms	remaining: 141ms
855:	learn: 0.0008515	total: 834ms	remaining: 140ms
856:	learn: 0.0008500	total: 835ms	remaining: 139ms
857:	learn: 0.0008481	total: 836ms	remaining: 138ms
858:	learn: 0.0008462	total: 837ms	remaining: 137ms
859:	learn: 0.0008455	total: 837ms	remaining: 136ms
860:	learn: 0.0008448	total: 838ms	remaining: 135ms
861:	learn: 0.0008441	total: 839ms	remaining: 134ms
862:	learn: 0.0008426	total: 840ms	remaining: 133ms
863:	learn: 0.0008416	total: 841ms	remaining: 132ms
864:	learn: 0.0008401	total: 842ms	remaining: 131ms
865:	learn: 0.0008385	total: 843ms	remaining: 130ms
866:	learn: 0.0008378	total: 844ms	remaining: 129ms
867:	learn: 0.0008371	total: 845ms	remaining: 128ms
868:	learn: 0.0008369	total: 845ms	remaining: 127ms
869:	learn: 

157:	learn: 0.1367223	total: 164ms	remaining: 10.4ms
158:	learn: 0.1357664	total: 165ms	remaining: 9.33ms
159:	learn: 0.1349494	total: 166ms	remaining: 8.29ms
160:	learn: 0.1339339	total: 167ms	remaining: 7.25ms
161:	learn: 0.1330912	total: 168ms	remaining: 6.21ms
162:	learn: 0.1317013	total: 169ms	remaining: 5.17ms
163:	learn: 0.1307707	total: 170ms	remaining: 4.13ms
164:	learn: 0.1301281	total: 170ms	remaining: 3.1ms
165:	learn: 0.1291371	total: 171ms	remaining: 2.06ms
166:	learn: 0.1283892	total: 172ms	remaining: 1.03ms
167:	learn: 0.1274924	total: 173ms	remaining: 0us


In [50]:
# https://www.kaggle.com/c/2020-ml100marathon-midterm/submissions
! kaggle competitions submit -c 2020-ml100marathon-midterm -f Submission_20200625_stacking.csv -m "Full train stacking"

403 - Your team has used its submission allowance (5 of 5). This resets at midnight UTC (11 hours from now).



  0%|          | 0.00/1.24k [00:00<?, ?B/s]
100%|██████████| 1.24k/1.24k [00:05<00:00, 227B/s]
