In [1]:
import pandas as pd, numpy as np, time
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from sklearn import metrics
from collections import Counter
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
data = pd.read_csv("flights_short.csv")
print(data.shape)
data.head()

(150000, 11)


Unnamed: 0,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,DESTINATION_AIRPORT,ORIGIN_AIRPORT,AIR_TIME,DEPARTURE_TIME,DISTANCE,ARRIVAL_DELAY
0,1,28,3,14,102,717,608,102.0,713.0,634,0
1,8,11,2,3,152,748,690,134.0,111.0,1028,1
2,2,4,3,4,1184,597,740,111.0,1734.0,931,0
3,3,27,5,14,170,770,609,173.0,1807.0,1436,0
4,8,1,6,14,4321,772,544,63.0,2151.0,481,1


In [3]:
data = data[["MONTH","DAY","DAY_OF_WEEK","AIRLINE","FLIGHT_NUMBER","DESTINATION_AIRPORT",
                 "ORIGIN_AIRPORT","AIR_TIME", "DEPARTURE_TIME","DISTANCE","ARRIVAL_DELAY"]]

In [4]:
data.dropna(inplace=True)

In [5]:
data.shape

(150000, 11)

In [6]:
cols = ["AIRLINE", "FLIGHT_NUMBER", "DESTINATION_AIRPORT", "ORIGIN_AIRPORT"]
for item in cols:
    data[item] = data[item].astype("category").cat.codes + 1

In [7]:
train, test, y_train, y_test = train_test_split(data.drop(["ARRIVAL_DELAY"], axis=1), 
                                                data["ARRIVAL_DELAY"],
                                                random_state=42, test_size=0.25)

In [8]:
print(train.shape)
print(y_train.shape)
print(test.shape)
print(y_test.shape)

(112500, 10)
(112500,)
(37500, 10)
(37500,)


In [9]:
y_train

92988     1
43658     0
110698    0
77130     0
109216    0
64982     0
147719    0
65298     0
34637     0
39703     0
115085    0
16671     0
15649     0
13076     0
71993     0
31355     1
49082     0
37277     0
149639    0
103190    0
6515      0
68655     0
117433    0
103396    0
27721     0
57433     0
36600     0
45919     0
100315    0
76035     0
         ..
129981    0
65725     0
123855    0
2747      0
130523    0
149503    0
122537    1
84478     0
130608    0
85305     0
103355    0
5311      0
64925     0
59735     0
769       0
64820     0
67221     0
41090     0
16023     0
126324    0
112727    0
87498     0
137337    0
54886     0
110268    1
119879    0
103694    0
131932    0
146867    0
121958    0
Name: ARRIVAL_DELAY, Length: 112500, dtype: int64

In [10]:
def auc(model, train, test): 
    return (metrics.roc_auc_score(y_train, model.predict_proba(train)[:,1]),
            metrics.roc_auc_score(y_test, model.predict_proba(test)[:,1]))

In [11]:
train=pd.read_csv('X_train.csv')
test=pd.read_csv('X_test.csv')
y_train=np.array(pd.read_csv('y_train.csv'))
y_test=np.array(pd.read_csv('y_test.csv'))

In [12]:
y_train = y_train.ravel()

---

In [13]:
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import roc_auc_score

### LightGBM

Сначала подберем параметры с помощью LightGBM

In [14]:
model2 = lgb.LGBMClassifier()
model2.fit(train, y_train)

auc(model2, train, test)

(0.7556001645011479, 0.7227598890343905)

In [15]:
ones_ratio = y_train[y_train == 1].shape[0] * 1.0 / y_train[y_train == 0].shape[0] # посчитаем соотношение между классами

param_grid = {
    # параметры ансамбля
    'n_estimators': [10, 30, 50, 100, 200, 400, 600, 1000],
    'learning_rate': [0.1],
    
    # параметры дерева
    'max_depth': [5],
    'min_child_weight': [2],
    'gamma': [0.1],
    'subsample': [0.8],
    'colsample_bytree': [0.8],
    'scale_pos_weight': [ones_ratio],
    
    # параметры регуляризации
    'reg_alpha': [0.0],
    'reg_lambda': [1.0]
}

cv = KFold(n_splits=4, shuffle=True)

model2 = lgb.LGBMClassifier()
gs = GridSearchCV(model2, param_grid, scoring='roc_auc', cv=cv, verbose=5)

gs.fit(train, y_train)
best_params = gs.best_estimator_.get_params()
print('Best score (AUC): ', gs.best_score_)
print('Best params: ')
best_params

Fitting 4 folds for each of 8 candidates, totalling 32 fits
[CV] colsample_bytree=0.8, gamma=0.1, learning_rate=0.1, max_depth=5, min_child_weight=2, n_estimators=10, reg_alpha=0.0, reg_lambda=1.0, scale_pos_weight=0.27826383365526647, subsample=0.8 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  colsample_bytree=0.8, gamma=0.1, learning_rate=0.1, max_depth=5, min_child_weight=2, n_estimators=10, reg_alpha=0.0, reg_lambda=1.0, scale_pos_weight=0.27826383365526647, subsample=0.8, score=0.675, total=   1.0s
[CV] colsample_bytree=0.8, gamma=0.1, learning_rate=0.1, max_depth=5, min_child_weight=2, n_estimators=10, reg_alpha=0.0, reg_lambda=1.0, scale_pos_weight=0.27826383365526647, subsample=0.8 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s remaining:    0.0s


[CV]  colsample_bytree=0.8, gamma=0.1, learning_rate=0.1, max_depth=5, min_child_weight=2, n_estimators=10, reg_alpha=0.0, reg_lambda=1.0, scale_pos_weight=0.27826383365526647, subsample=0.8, score=0.672, total=   0.9s
[CV] colsample_bytree=0.8, gamma=0.1, learning_rate=0.1, max_depth=5, min_child_weight=2, n_estimators=10, reg_alpha=0.0, reg_lambda=1.0, scale_pos_weight=0.27826383365526647, subsample=0.8 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.9s remaining:    0.0s


[CV]  colsample_bytree=0.8, gamma=0.1, learning_rate=0.1, max_depth=5, min_child_weight=2, n_estimators=10, reg_alpha=0.0, reg_lambda=1.0, scale_pos_weight=0.27826383365526647, subsample=0.8, score=0.666, total=   0.8s
[CV] colsample_bytree=0.8, gamma=0.1, learning_rate=0.1, max_depth=5, min_child_weight=2, n_estimators=10, reg_alpha=0.0, reg_lambda=1.0, scale_pos_weight=0.27826383365526647, subsample=0.8 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    2.6s remaining:    0.0s


[CV]  colsample_bytree=0.8, gamma=0.1, learning_rate=0.1, max_depth=5, min_child_weight=2, n_estimators=10, reg_alpha=0.0, reg_lambda=1.0, scale_pos_weight=0.27826383365526647, subsample=0.8, score=0.675, total=   0.8s
[CV] colsample_bytree=0.8, gamma=0.1, learning_rate=0.1, max_depth=5, min_child_weight=2, n_estimators=30, reg_alpha=0.0, reg_lambda=1.0, scale_pos_weight=0.27826383365526647, subsample=0.8 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    3.4s remaining:    0.0s


[CV]  colsample_bytree=0.8, gamma=0.1, learning_rate=0.1, max_depth=5, min_child_weight=2, n_estimators=30, reg_alpha=0.0, reg_lambda=1.0, scale_pos_weight=0.27826383365526647, subsample=0.8, score=0.689, total=   1.4s
[CV] colsample_bytree=0.8, gamma=0.1, learning_rate=0.1, max_depth=5, min_child_weight=2, n_estimators=30, reg_alpha=0.0, reg_lambda=1.0, scale_pos_weight=0.27826383365526647, subsample=0.8 
[CV]  colsample_bytree=0.8, gamma=0.1, learning_rate=0.1, max_depth=5, min_child_weight=2, n_estimators=30, reg_alpha=0.0, reg_lambda=1.0, scale_pos_weight=0.27826383365526647, subsample=0.8, score=0.689, total=   1.8s
[CV] colsample_bytree=0.8, gamma=0.1, learning_rate=0.1, max_depth=5, min_child_weight=2, n_estimators=30, reg_alpha=0.0, reg_lambda=1.0, scale_pos_weight=0.27826383365526647, subsample=0.8 
[CV]  colsample_bytree=0.8, gamma=0.1, learning_rate=0.1, max_depth=5, min_child_weight=2, n_estimators=30, reg_alpha=0.0, reg_lambda=1.0, scale_pos_weight=0.27826383365526647, sub

[CV]  colsample_bytree=0.8, gamma=0.1, learning_rate=0.1, max_depth=5, min_child_weight=2, n_estimators=600, reg_alpha=0.0, reg_lambda=1.0, scale_pos_weight=0.27826383365526647, subsample=0.8, score=0.733, total=  13.7s
[CV] colsample_bytree=0.8, gamma=0.1, learning_rate=0.1, max_depth=5, min_child_weight=2, n_estimators=600, reg_alpha=0.0, reg_lambda=1.0, scale_pos_weight=0.27826383365526647, subsample=0.8 
[CV]  colsample_bytree=0.8, gamma=0.1, learning_rate=0.1, max_depth=5, min_child_weight=2, n_estimators=600, reg_alpha=0.0, reg_lambda=1.0, scale_pos_weight=0.27826383365526647, subsample=0.8, score=0.737, total=  18.2s
[CV] colsample_bytree=0.8, gamma=0.1, learning_rate=0.1, max_depth=5, min_child_weight=2, n_estimators=600, reg_alpha=0.0, reg_lambda=1.0, scale_pos_weight=0.27826383365526647, subsample=0.8 
[CV]  colsample_bytree=0.8, gamma=0.1, learning_rate=0.1, max_depth=5, min_child_weight=2, n_estimators=600, reg_alpha=0.0, reg_lambda=1.0, scale_pos_weight=0.27826383365526647

[Parallel(n_jobs=1)]: Done  32 out of  32 | elapsed:  4.7min finished


Best score (AUC):  0.733740370000131
Best params: 


{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 0.8,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': 5,
 'min_child_samples': 20,
 'min_child_weight': 2,
 'min_split_gain': 0.0,
 'n_estimators': 1000,
 'n_jobs': -1,
 'num_leaves': 31,
 'objective': None,
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 1.0,
 'silent': True,
 'subsample': 0.8,
 'subsample_for_bin': 200000,
 'subsample_freq': 0,
 'gamma': 0.1,
 'scale_pos_weight': 0.27826383365526647}

In [16]:
param_grid = {
    'max_depth': range(3, 10, 2),
    'min_child_weight': range(1, 6, 2)
}

model2 = lgb.LGBMClassifier(**best_params)
gs = GridSearchCV(model2, param_grid, scoring='roc_auc', cv=cv, verbose=5)

gs.fit(train, y_train)
best_params = gs.best_estimator_.get_params()
print('Best score (AUC): ', gs.best_score_)
print('Best params: ')
best_params

Fitting 4 folds for each of 12 candidates, totalling 48 fits
[CV] max_depth=3, min_child_weight=1 .................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ..... max_depth=3, min_child_weight=1, score=0.731, total=  15.2s
[CV] max_depth=3, min_child_weight=1 .................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   15.2s remaining:    0.0s


[CV] ..... max_depth=3, min_child_weight=1, score=0.727, total=  14.9s
[CV] max_depth=3, min_child_weight=1 .................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   30.1s remaining:    0.0s


[CV] ..... max_depth=3, min_child_weight=1, score=0.730, total=  15.5s
[CV] max_depth=3, min_child_weight=1 .................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   45.6s remaining:    0.0s


[CV] ..... max_depth=3, min_child_weight=1, score=0.729, total=  15.4s
[CV] max_depth=3, min_child_weight=3 .................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  1.0min remaining:    0.0s


[CV] ..... max_depth=3, min_child_weight=3, score=0.730, total=  15.2s
[CV] max_depth=3, min_child_weight=3 .................................
[CV] ..... max_depth=3, min_child_weight=3, score=0.726, total=  15.5s
[CV] max_depth=3, min_child_weight=3 .................................
[CV] ..... max_depth=3, min_child_weight=3, score=0.731, total=  15.1s
[CV] max_depth=3, min_child_weight=3 .................................
[CV] ..... max_depth=3, min_child_weight=3, score=0.730, total=  15.4s
[CV] max_depth=3, min_child_weight=5 .................................
[CV] ..... max_depth=3, min_child_weight=5, score=0.729, total=  15.4s
[CV] max_depth=3, min_child_weight=5 .................................
[CV] ..... max_depth=3, min_child_weight=5, score=0.726, total=  15.5s
[CV] max_depth=3, min_child_weight=5 .................................
[CV] ..... max_depth=3, min_child_weight=5, score=0.731, total=  15.3s
[CV] max_depth=3, min_child_weight=5 .................................
[CV] .

[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed: 13.1min finished


Best score (AUC):  0.7361107451183581
Best params: 


{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 0.8,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': 9,
 'min_child_samples': 20,
 'min_child_weight': 3,
 'min_split_gain': 0.0,
 'n_estimators': 1000,
 'n_jobs': -1,
 'num_leaves': 31,
 'objective': None,
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 1.0,
 'silent': True,
 'subsample': 0.8,
 'subsample_for_bin': 200000,
 'subsample_freq': 0,
 'gamma': 0.1,
 'scale_pos_weight': 0.27826383365526647}

In [17]:
param_grid = {
    'gamma': [0.1*i for i in range(6)]
}

model2 = lgb.LGBMClassifier(**best_params)
gs = GridSearchCV(model2, param_grid, scoring='roc_auc', cv=cv, verbose=5)

gs.fit(train, y_train)
best_params = gs.best_estimator_.get_params()
print('Best score (AUC): ', gs.best_score_)
print('Best params: ')
best_params

Fitting 4 folds for each of 6 candidates, totalling 24 fits
[CV] gamma=0.0 .......................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ........................... gamma=0.0, score=0.736, total=  14.8s
[CV] gamma=0.0 .......................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   14.8s remaining:    0.0s


[CV] ........................... gamma=0.0, score=0.734, total=  15.1s
[CV] gamma=0.0 .......................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   29.9s remaining:    0.0s


[CV] ........................... gamma=0.0, score=0.736, total=  15.6s
[CV] gamma=0.0 .......................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   45.5s remaining:    0.0s


[CV] ........................... gamma=0.0, score=0.735, total=  14.6s
[CV] gamma=0.1 .......................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  1.0min remaining:    0.0s


[CV] ........................... gamma=0.1, score=0.736, total=  14.8s
[CV] gamma=0.1 .......................................................
[CV] ........................... gamma=0.1, score=0.734, total=  14.6s
[CV] gamma=0.1 .......................................................
[CV] ........................... gamma=0.1, score=0.736, total=  14.3s
[CV] gamma=0.1 .......................................................
[CV] ........................... gamma=0.1, score=0.735, total=  14.8s
[CV] gamma=0.2 .......................................................
[CV] ........................... gamma=0.2, score=0.736, total=  14.8s
[CV] gamma=0.2 .......................................................
[CV] ........................... gamma=0.2, score=0.734, total=  14.7s
[CV] gamma=0.2 .......................................................
[CV] ........................... gamma=0.2, score=0.736, total=  15.4s
[CV] gamma=0.2 .......................................................
[CV] .

[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed:  5.9min finished


Best score (AUC):  0.7352955493189595
Best params: 


{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 0.8,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': 9,
 'min_child_samples': 20,
 'min_child_weight': 3,
 'min_split_gain': 0.0,
 'n_estimators': 1000,
 'n_jobs': -1,
 'num_leaves': 31,
 'objective': None,
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 1.0,
 'silent': True,
 'subsample': 0.8,
 'subsample_for_bin': 200000,
 'subsample_freq': 0,
 'gamma': 0.0,
 'scale_pos_weight': 0.27826383365526647}

In [18]:
param_grid = {
    'subsample': [0.5 + 0.1*i for i in range(6)],
    'colsample_bytree': [0.5 + 0.1*i for i in range(6)]
}

model2 = lgb.LGBMClassifier(**best_params)
gs = GridSearchCV(model2, param_grid, scoring='roc_auc', cv=cv, verbose=5)

gs.fit(train, y_train)
best_params = gs.best_estimator_.get_params()
print('Best score (AUC): ', gs.best_score_)
print('Best params: ')
best_params

Fitting 4 folds for each of 36 candidates, totalling 144 fits
[CV] colsample_bytree=0.5, subsample=0.5 .............................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] . colsample_bytree=0.5, subsample=0.5, score=0.736, total=  13.9s
[CV] colsample_bytree=0.5, subsample=0.5 .............................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   13.9s remaining:    0.0s


[CV] . colsample_bytree=0.5, subsample=0.5, score=0.737, total=  13.9s
[CV] colsample_bytree=0.5, subsample=0.5 .............................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   27.8s remaining:    0.0s


[CV] . colsample_bytree=0.5, subsample=0.5, score=0.737, total=  13.7s
[CV] colsample_bytree=0.5, subsample=0.5 .............................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   41.4s remaining:    0.0s


[CV] . colsample_bytree=0.5, subsample=0.5, score=0.730, total=  13.5s
[CV] colsample_bytree=0.5, subsample=0.6 .............................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   55.0s remaining:    0.0s


[CV] . colsample_bytree=0.5, subsample=0.6, score=0.736, total=  13.6s
[CV] colsample_bytree=0.5, subsample=0.6 .............................
[CV] . colsample_bytree=0.5, subsample=0.6, score=0.737, total=  13.3s
[CV] colsample_bytree=0.5, subsample=0.6 .............................
[CV] . colsample_bytree=0.5, subsample=0.6, score=0.737, total=  13.6s
[CV] colsample_bytree=0.5, subsample=0.6 .............................
[CV] . colsample_bytree=0.5, subsample=0.6, score=0.730, total=  13.6s
[CV] colsample_bytree=0.5, subsample=0.7 .............................
[CV] . colsample_bytree=0.5, subsample=0.7, score=0.736, total=  13.7s
[CV] colsample_bytree=0.5, subsample=0.7 .............................
[CV] . colsample_bytree=0.5, subsample=0.7, score=0.737, total=  15.9s
[CV] colsample_bytree=0.5, subsample=0.7 .............................
[CV] . colsample_bytree=0.5, subsample=0.7, score=0.737, total=  21.1s
[CV] colsample_bytree=0.5, subsample=0.7 .............................
[CV] .

[CV] . colsample_bytree=0.7, subsample=0.8, score=0.739, total=  17.8s
[CV] colsample_bytree=0.7, subsample=0.8 .............................
[CV] . colsample_bytree=0.7, subsample=0.8, score=0.731, total=  14.1s
[CV] colsample_bytree=0.7, subsample=0.9 .............................
[CV] . colsample_bytree=0.7, subsample=0.9, score=0.736, total=  14.6s
[CV] colsample_bytree=0.7, subsample=0.9 .............................
[CV] . colsample_bytree=0.7, subsample=0.9, score=0.739, total=  14.0s
[CV] colsample_bytree=0.7, subsample=0.9 .............................
[CV] . colsample_bytree=0.7, subsample=0.9, score=0.739, total=  17.2s
[CV] colsample_bytree=0.7, subsample=0.9 .............................
[CV] . colsample_bytree=0.7, subsample=0.9, score=0.731, total=  15.2s
[CV] colsample_bytree=0.7, subsample=1.0 .............................
[CV] . colsample_bytree=0.7, subsample=1.0, score=0.736, total=  19.4s
[CV] colsample_bytree=0.7, subsample=1.0 .............................
[CV] .

[CV] . colsample_bytree=1.0, subsample=0.5, score=0.736, total=  30.3s
[CV] colsample_bytree=1.0, subsample=0.5 .............................
[CV] . colsample_bytree=1.0, subsample=0.5, score=0.738, total=  49.1s
[CV] colsample_bytree=1.0, subsample=0.5 .............................
[CV] . colsample_bytree=1.0, subsample=0.5, score=0.737, total=  23.3s
[CV] colsample_bytree=1.0, subsample=0.5 .............................
[CV] . colsample_bytree=1.0, subsample=0.5, score=0.731, total=  35.1s
[CV] colsample_bytree=1.0, subsample=0.6 .............................
[CV] . colsample_bytree=1.0, subsample=0.6, score=0.736, total=  28.7s
[CV] colsample_bytree=1.0, subsample=0.6 .............................
[CV] . colsample_bytree=1.0, subsample=0.6, score=0.738, total=  24.8s
[CV] colsample_bytree=1.0, subsample=0.6 .............................
[CV] . colsample_bytree=1.0, subsample=0.6, score=0.737, total=  43.0s
[CV] colsample_bytree=1.0, subsample=0.6 .............................
[CV] .

[Parallel(n_jobs=1)]: Done 144 out of 144 | elapsed: 57.0min finished


Best score (AUC):  0.7363844189502521
Best params: 


{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 0.8,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': 9,
 'min_child_samples': 20,
 'min_child_weight': 3,
 'min_split_gain': 0.0,
 'n_estimators': 1000,
 'n_jobs': -1,
 'num_leaves': 31,
 'objective': None,
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 1.0,
 'silent': True,
 'subsample': 0.5,
 'subsample_for_bin': 200000,
 'subsample_freq': 0,
 'gamma': 0.0,
 'scale_pos_weight': 0.27826383365526647}

In [19]:
param_grid = {
    'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100],
    'reg_lambda': [1e-5, 1e-2, 0.1, 1, 100]
}

model2 = lgb.LGBMClassifier(**best_params)
gs = GridSearchCV(model2, param_grid, scoring='roc_auc', cv=cv, verbose=5)

gs.fit(train, y_train)
best_params = gs.best_estimator_.get_params()
print('Best score (AUC): ', gs.best_score_)
print('Best params: ')
best_params

Fitting 4 folds for each of 25 candidates, totalling 100 fits
[CV] reg_alpha=1e-05, reg_lambda=1e-05 ...............................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ... reg_alpha=1e-05, reg_lambda=1e-05, score=0.741, total=  21.7s
[CV] reg_alpha=1e-05, reg_lambda=1e-05 ...............................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   21.7s remaining:    0.0s


[CV] ... reg_alpha=1e-05, reg_lambda=1e-05, score=0.737, total=  17.9s
[CV] reg_alpha=1e-05, reg_lambda=1e-05 ...............................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   39.6s remaining:    0.0s


[CV] ... reg_alpha=1e-05, reg_lambda=1e-05, score=0.736, total=  19.3s
[CV] reg_alpha=1e-05, reg_lambda=1e-05 ...............................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   58.9s remaining:    0.0s


[CV] ... reg_alpha=1e-05, reg_lambda=1e-05, score=0.729, total=  18.8s
[CV] reg_alpha=1e-05, reg_lambda=0.01 ................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  1.3min remaining:    0.0s


[CV] .... reg_alpha=1e-05, reg_lambda=0.01, score=0.741, total=  17.9s
[CV] reg_alpha=1e-05, reg_lambda=0.01 ................................
[CV] .... reg_alpha=1e-05, reg_lambda=0.01, score=0.736, total=  18.1s
[CV] reg_alpha=1e-05, reg_lambda=0.01 ................................
[CV] .... reg_alpha=1e-05, reg_lambda=0.01, score=0.736, total=  18.8s
[CV] reg_alpha=1e-05, reg_lambda=0.01 ................................
[CV] .... reg_alpha=1e-05, reg_lambda=0.01, score=0.731, total=  18.8s
[CV] reg_alpha=1e-05, reg_lambda=0.1 .................................
[CV] ..... reg_alpha=1e-05, reg_lambda=0.1, score=0.742, total=  28.9s
[CV] reg_alpha=1e-05, reg_lambda=0.1 .................................
[CV] ..... reg_alpha=1e-05, reg_lambda=0.1, score=0.736, total=  30.9s
[CV] reg_alpha=1e-05, reg_lambda=0.1 .................................
[CV] ..... reg_alpha=1e-05, reg_lambda=0.1, score=0.737, total=  30.4s
[CV] reg_alpha=1e-05, reg_lambda=0.1 .................................
[CV] .

[CV] ....... reg_alpha=1, reg_lambda=1e-05, score=0.738, total=  40.8s
[CV] reg_alpha=1, reg_lambda=1e-05 ...................................
[CV] ....... reg_alpha=1, reg_lambda=1e-05, score=0.734, total= 1.1min
[CV] reg_alpha=1, reg_lambda=0.01 ....................................
[CV] ........ reg_alpha=1, reg_lambda=0.01, score=0.745, total=  55.4s
[CV] reg_alpha=1, reg_lambda=0.01 ....................................
[CV] ........ reg_alpha=1, reg_lambda=0.01, score=0.738, total=  51.9s
[CV] reg_alpha=1, reg_lambda=0.01 ....................................
[CV] ........ reg_alpha=1, reg_lambda=0.01, score=0.739, total=  54.0s
[CV] reg_alpha=1, reg_lambda=0.01 ....................................
[CV] ........ reg_alpha=1, reg_lambda=0.01, score=0.734, total=  42.6s
[CV] reg_alpha=1, reg_lambda=0.1 .....................................
[CV] ......... reg_alpha=1, reg_lambda=0.1, score=0.744, total=  44.0s
[CV] reg_alpha=1, reg_lambda=0.1 .....................................
[CV] .

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed: 62.1min finished


Best score (AUC):  0.7391588927221746
Best params: 


{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 0.8,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': 9,
 'min_child_samples': 20,
 'min_child_weight': 3,
 'min_split_gain': 0.0,
 'n_estimators': 1000,
 'n_jobs': -1,
 'num_leaves': 31,
 'objective': None,
 'random_state': None,
 'reg_alpha': 1,
 'reg_lambda': 1,
 'silent': True,
 'subsample': 0.5,
 'subsample_for_bin': 200000,
 'subsample_freq': 0,
 'gamma': 0.0,
 'scale_pos_weight': 0.27826383365526647}

In [38]:
model2 = lgb.LGBMClassifier(**best_params)
model2.fit(train, y_train)

auc(model2, train, test)

(0.8595836215482584, 0.741268743951971)

In [26]:
for feature_name, feature_importance in zip(train.columns, model2.feature_importances_):
    print('Feature: "%s"\tFeature importance: %.4f' % (feature_name, feature_importance))

Feature: "MONTH"	Feature importance: 2598.0000
Feature: "DAY"	Feature importance: 3047.0000
Feature: "DAY_OF_WEEK"	Feature importance: 1516.0000
Feature: "AIRLINE"	Feature importance: 1586.0000
Feature: "FLIGHT_NUMBER"	Feature importance: 3704.0000
Feature: "DESTINATION_AIRPORT"	Feature importance: 2874.0000
Feature: "ORIGIN_AIRPORT"	Feature importance: 3002.0000
Feature: "AIR_TIME"	Feature importance: 3537.0000
Feature: "DEPARTURE_TIME"	Feature importance: 4894.0000
Feature: "DISTANCE"	Feature importance: 3242.0000


In [39]:
from sklearn.metrics import accuracy_score

y_pred = model2.predict(test)
"Accuracy (test): %.3f" % accuracy_score(y_pred, y_test)

'Accuracy (test): 0.795'

### XGBoost

In [42]:
model = xgb.XGBClassifier()
model.fit(train, np.array(y_train))

auc(model, train, test)

(0.7037043597582944, 0.6957299469499221)

In [54]:
xgb_params = {'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bytree': 0.8,
 'gamma': 0.0,
 'learning_rate': 0.5,
 'max_delta_step': 0,
 'max_depth': 9,
 'min_child_weight': 3,
 'missing': None,
 'n_estimators': 31,
 'n_jobs': 1,
 'nthread': 1,
 'objective': 'binary:logistic',
 'random_state': 0,
 'reg_alpha': 1,
 'reg_lambda': 1,
 'scale_pos_weight': 0.27826383365526647,
 'seed': 0,
 'silent': True,
 'subsample': 0.5}

In [55]:
model = xgb.XGBClassifier(**xgb_params)
model.fit(train, np.array(y_train))

auc(model, train, test)

(0.7929877017576002, 0.7085435472649705)

### Catboost

In [19]:
clf = cb.CatBoostClassifier(iterations=30, silent=True)
clf.fit(train,y_train)

auc(clf, train, test)

(0.7170484823706651, 0.7025652854461808)