In [1]:
import pandas as pd, numpy as np, time
from sklearn.model_selection import train_test_split

# 读取数据
data = pd.read_csv("https://cdn.coggle.club/kaggle-flight-delays/flights_10k.csv.zip")

# 提取有用的列
data = data[["MONTH","DAY","DAY_OF_WEEK","AIRLINE","FLIGHT_NUMBER","DESTINATION_AIRPORT",
                 "ORIGIN_AIRPORT","AIR_TIME", "DEPARTURE_TIME","DISTANCE","ARRIVAL_DELAY"]]
data.dropna(inplace=True)

# 筛选出部分数据
data["ARRIVAL_DELAY"] = (data["ARRIVAL_DELAY"]>10)*1

# 进行编码
cols = ["AIRLINE","FLIGHT_NUMBER","DESTINATION_AIRPORT","ORIGIN_AIRPORT"]
for item in cols:
    data[item] = data[item].astype("category").cat.codes +1

# 划分训练集和测试集
train, test, y_train, y_test = train_test_split(data.drop(["ARRIVAL_DELAY"], axis=1), data["ARRIVAL_DELAY"], random_state=10, test_size=0.25)

In [2]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9592 entries, 0 to 9998
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   MONTH                9592 non-null   int64  
 1   DAY                  9592 non-null   int64  
 2   DAY_OF_WEEK          9592 non-null   int64  
 3   AIRLINE              9592 non-null   int8   
 4   FLIGHT_NUMBER        9592 non-null   int16  
 5   DESTINATION_AIRPORT  9592 non-null   int16  
 6   ORIGIN_AIRPORT       9592 non-null   int16  
 7   AIR_TIME             9592 non-null   float64
 8   DEPARTURE_TIME       9592 non-null   float64
 9   DISTANCE             9592 non-null   int64  
 10  ARRIVAL_DELAY        9592 non-null   int32  
dtypes: float64(2), int16(3), int32(1), int64(4), int8(1)
memory usage: 627.6 KB


In [4]:
#导入库
import lightgbm as lgb
import pandas as pd
import numpy as np
import graphviz
from sklearn import tree
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split as TTS
from sklearn.metrics import accuracy_score

In [None]:

### 构建LightGBM分类器
#使用sklearn API 
def test_depth(max_depth):    
    gbm = lgb.LGBMClassifier(max_depth=max_depth)
    gbm.fit(train, y_train,
            eval_set=[(test, y_test)],
            eval_metric='binary_logloss',
            callbacks=[lgb.early_stopping(5)])
    #eval_metric默认值：LGBMRegressor 为“l2”，LGBMClassifier 为“logloss”，LGBMRanker 为“ndcg”。
    #使用binary_logloss或者logloss准确率都是一样的。默认logloss
    y_pred = gbm.predict(test)
    # 计算准确率
    accuracy = accuracy_score(y_test,y_pred)
    auc_score=roc_auc_score(y_test,gbm.predict_proba(test)[:,1])#predict_proba输出正负样本概率值，取第二列为正样本概率值
    print("max_depth=",max_depth,"accuarcy: %.2f%%" % (accuracy*100.0),"auc_score: %.2f%%" % (auc_score*100.0)+"\n"+"*"*100)

test_depth(3)
test_depth(5)
test_depth(6)
test_depth(9)

In [9]:
#使用 lgb 原生API
lgb_train = lgb.Dataset(train, y_train)
lgb_test = lgb.Dataset(test, y_test)
def test_depth_lgb(max_depth):
    params_lgb = {
            "boosting_type" : 'gbdt',
            "num_leaves" : 31,
            "max_depth" : max_depth,
            "learning_rate" : 0.1,
            "objective" : "binary",
            "min_child_samples" : 20,
            "verbose" : -1
    }
    #构建模型
    gbm = lgb.train(params_lgb, train_set=lgb_train, num_boost_round=10,valid_sets=[lgb_train, lgb_test],callbacks=[lgb.early_stopping(stopping_rounds=5)])
    y_pred = gbm.predict(test)
    y_pred = [1 if x > 0.5 else 0 for x in y_pred]
    accuracy = accuracy_score(y_test, y_pred)
    auc_score = roc_auc_score(y_test, y_pred)
    print("max_depth=",max_depth,"accuarcy: %.2f%%" % (accuracy*100.0),"auc_score: %.2f%%" % (auc_score*100.0)+"\n"+"*"*100)

test_depth_lgb(3)
test_depth_lgb(5)
test_depth_lgb(6)
test_depth_lgb(9)

Training until validation scores don't improve for 5 rounds
Did not meet early stopping. Best iteration is:
[10]	training's binary_logloss: 0.495478	valid_1's binary_logloss: 0.476429
max_depth= 3 accuarcy: 80.69% auc_score: 54.03%
****************************************************************************************************
Training until validation scores don't improve for 5 rounds
Did not meet early stopping. Best iteration is:
[10]	training's binary_logloss: 0.47337	valid_1's binary_logloss: 0.462207
max_depth= 5 accuarcy: 80.73% auc_score: 54.50%
****************************************************************************************************
Training until validation scores don't improve for 5 rounds
Did not meet early stopping. Best iteration is:
[10]	training's binary_logloss: 0.462849	valid_1's binary_logloss: 0.457939
max_depth= 6 accuarcy: 80.90% auc_score: 55.26%
****************************************************************************************************
Tr

### 网格搜索

In [13]:
from sklearn.model_selection import GridSearchCV

#定义自己的网格搜索函数
def GridSearch(clf, params, X, y):
    gscv = GridSearchCV(clf, params, scoring='neg_mean_squared_error', n_jobs=1, cv=5)
    gscv.fit(X, y)
    print("The best result: ", gscv.cv_results_)
    print("The best params: ", gscv.best_params_)

#实验
params_sklearn = {
        "boosting_type" : 'gbdt',
        'max_depth':3,
        "learning_rate" : 0.1,
        "objective" : "binary",
        "min_child_samples" : 20,
        "metric" : ['binary_logloss','auc'],
        "verbose" : -1,
        'reg_lambda': 0.1,
        'reg_alpha': 0.2,
        "random_state": 420,
}
gbm = lgb.LGBMClassifier(**params_sklearn)
#网格搜索的参数
params_grid = {
    'num_leaves':range(30,50),
    'max_depth': range(4,8),
}
gscv = GridSearch(gbm, params_grid, train, y_train)
#The best params:  {'max_depth': 6, 'num_leaves': 40}
#Timing 16.8s

The best result:  {'mean_fit_time': array([0.02921987, 0.0277298 , 0.02722893, 0.02710776, 0.02658749,
       0.02806115, 0.02838387, 0.02623038, 0.02725739, 0.02728405,
       0.026652  , 0.02676206, 0.02636676, 0.02627826, 0.02624707,
       0.02602482, 0.02680478, 0.02689538, 0.02693291, 0.02669716,
       0.03494782, 0.03508873, 0.0343873 , 0.03484421, 0.03587337,
       0.03405466, 0.03292918, 0.03933659, 0.03386059, 0.03363762,
       0.03445148, 0.03436213, 0.03377647, 0.03065367, 0.03300133,
       0.0356452 , 0.02889781, 0.03554893, 0.03634109, 0.03112736,
       0.04379506, 0.04133816, 0.04111052, 0.03686166, 0.04131641,
       0.04124103, 0.04367919, 0.04118233, 0.04235725, 0.04475617,
       0.03604364, 0.04261937, 0.04299288, 0.04272861, 0.04518905,
       0.04256382, 0.04156528, 0.04471931, 0.04389238, 0.04615903,
       0.04828277, 0.04909472, 0.04869537, 0.04470472, 0.05399003,
       0.04270034, 0.05070567, 0.05090499, 0.05512028, 0.05120287,
       0.05271912, 0.04898

In [14]:
#随机搜索，和网格搜索代码一致，除了更改GridSearchCV为RandomizedSearchCV

from sklearn.model_selection import RandomizedSearchCV

#定义自己的网格搜索函数
def RandomizedSearch(clf, params, X, y):
    rscv = RandomizedSearchCV(clf, params, scoring='neg_mean_squared_error', n_jobs=1, cv=5)
    rscv.fit(X, y)
    print("The best result: ", rscv.cv_results_)
    print("The best params: ", rscv.best_params_)

#实验
params_sklearn = {
        "boosting_type" : 'gbdt',
        'max_depth':3,
        "learning_rate" : 0.1,
        "objective" : "binary",
        "min_child_samples" : 20,
        "metric" : ['binary_logloss','auc'],
        "verbose" : -1,
        'reg_lambda': 0.1,
        'reg_alpha': 0.2,
        "random_state": 420,
}
gbm = lgb.LGBMClassifier(**params_sklearn)
#网格搜索的参数
params_randm = {
    'num_leaves':range(30,50),
    'max_depth': range(4,8),
}
rscv = RandomizedSearch(gbm, params_randm, train, y_train)

#The best params:  {'num_leaves': 49, 'max_depth': 7}
#Timing:2.8s

The best result:  {'mean_fit_time': array([0.0357038 , 0.05781279, 0.03814998, 0.05852489, 0.04751415,
       0.05181551, 0.06209617, 0.04875693, 0.05725231, 0.05196323]), 'std_fit_time': array([0.00164253, 0.00457298, 0.00845761, 0.00251767, 0.00137485,
       0.00345873, 0.00750214, 0.00311624, 0.00627642, 0.00105492]), 'mean_score_time': array([0.00259109, 0.00250063, 0.00187292, 0.00321846, 0.00280075,
       0.00268426, 0.00280223, 0.0023984 , 0.00279145, 0.00199332]), 'std_score_time': array([4.88382996e-04, 1.26794674e-03, 9.93728622e-04, 6.85266816e-04,
       4.12243315e-04, 3.94227167e-04, 5.00386166e-04, 4.83194144e-04,
       3.99020611e-04, 3.23406696e-07]), 'param_num_leaves': masked_array(data=[35, 30, 45, 49, 32, 31, 40, 33, 46, 42],
             mask=[False, False, False, False, False, False, False, False,
                   False, False],
       fill_value='?',
            dtype=object), 'param_max_depth': masked_array(data=[5, 7, 5, 7, 6, 7, 7, 7, 7, 7],
            

In [42]:
#设定贝叶斯优化的黑盒函数LGB_bayesian
def LGB_bayesian(
    num_leaves,  # int
    min_data_in_leaf,  # int
    learning_rate,
    min_sum_hessian_in_leaf,    # int  
    feature_fraction,
    lambda_l1,
    lambda_l2,
    min_gain_to_split,
    max_depth):
    
    # LightGBM expects next three parameters need to be integer. So we make them integer
    num_leaves = int(num_leaves)
    min_data_in_leaf = int(min_data_in_leaf)
    max_depth = int(max_depth)

    assert type(num_leaves) == int
    assert type(min_data_in_leaf) == int
    assert type(max_depth) == int

    param = {
        'num_leaves': num_leaves,
        'max_bin': 63,
        'min_data_in_leaf': min_data_in_leaf,
        'learning_rate': learning_rate,
        'min_sum_hessian_in_leaf': min_sum_hessian_in_leaf,
        'bagging_fraction': 1.0,
        'bagging_freq': 5,
        'feature_fraction': feature_fraction,
        'lambda_l1': lambda_l1,
        'lambda_l2': lambda_l2,
        'min_gain_to_split': min_gain_to_split,
        'max_depth': max_depth,
        'save_binary': True, 
        'seed': 1337,
        'feature_fraction_seed': 1337,
        'bagging_seed': 1337,
        'drop_seed': 1337,
        'data_random_seed': 1337,
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'verbose': 1,
        'metric': 'auc',
        'is_unbalance': True,
        'boost_from_average': False,
        "verbosity":-1

    }    
    
    
    lgb_train = lgb.Dataset(train,
                           label=y_train)
    lgb_valid = lgb.Dataset(test,label=y_test,reference=lgb_train)   

    num_round = 500
    gbm= lgb.train(param, lgb_train, num_round, valid_sets = [lgb_valid],callbacks=[lgb.early_stopping(stopping_rounds=5)])   
    predictions = gbm.predict(test,num_iteration=gbm.best_iteration)
    score = roc_auc_score(y_test, predictions)
    
    return score

In [43]:
bounds_LGB = {
    'num_leaves': (5, 20), 
    'min_data_in_leaf': (5, 20),  
    'learning_rate': (0.01, 0.3),
    'min_sum_hessian_in_leaf': (0.00001, 0.01),    
    'feature_fraction': (0.05, 0.5),
    'lambda_l1': (0, 5.0), 
    'lambda_l2': (0, 5.0), 
    'min_gain_to_split': (0, 1.0),
    'max_depth':(3,15),
}

#将它们全部放在BayesianOptimization对象中
from bayes_opt import BayesianOptimization
LGB_BO = BayesianOptimization(LGB_bayesian, bounds_LGB, random_state=13)
print(LGB_BO.space.keys)#显示要优化的参数

['feature_fraction', 'lambda_l1', 'lambda_l2', 'learning_rate', 'max_depth', 'min_data_in_leaf', 'min_gain_to_split', 'min_sum_hessian_in_leaf', 'num_leaves']


In [44]:
import warnings
import gc
pd.set_option('display.max_columns', 200)

init_points = 5
n_iter = 5
print('-' * 130)

with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    LGB_BO.maximize(init_points=init_points, n_iter=n_iter, acq='ucb', xi=0.0, alpha=1e-6)

----------------------------------------------------------------------------------------------------------------------------------
|   iter    |  target   | featur... | lambda_l1 | lambda_l2 | learni... | max_depth | min_da... | min_ga... | min_su... | num_le... |
-------------------------------------------------------------------------------------------------------------------------------------
Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[22]	valid_0's auc: 0.770384
| [0m 1       [0m | [0m 0.7704  [0m | [0m 0.4     [0m | [0m 1.188   [0m | [0m 4.121   [0m | [0m 0.2901  [0m | [0m 14.67   [0m | [0m 11.8    [0m | [0m 0.609   [0m | [0m 0.007758[0m | [0m 14.62   [0m |
Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[5]	valid_0's auc: 0.737399
| [0m 2       [0m | [0m 0.7374  [0m | [0m 0.3749  [0m | [0m 0.1752  [0m | [0m 1.492   [0m | [0m 0.02697 [0m | [0m 13.28