# 这个文件做了什么

LightGBM手动调参，按照调参手册

In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.metrics import auc, roc_curve, roc_auc_score
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
from bayes_opt import BayesianOptimization
from sklearn.cross_validation import cross_val_score
import gc
from xgboost.sklearn import XGBClassifier
import xgboost as xgb
import lightgbm as lgb

In [2]:

def data_prepare(num=None):
    train = pd.read_csv('../data/handled/train.csv', nrows=num, index_col=0)
    test = pd.read_csv('../data/handled/test.csv', nrows=num, index_col=0)
    y_train = pd.read_csv('../data/handled/y_train.csv', nrows=num, header=-1, index_col=0)
    return train, test, y_train.values.ravel()

def imput_by_mean(train, test):
    im = Imputer()
    train = im.fit_transform(train)
    test = im.transform(test)
    return train, test


In [15]:
def lgb_score(params, x, y, num_boost_round=100, nfolds=5, metrics='auc', seed=2018):
    data = lgb.Dataset(x, label=y)
    cv_res = lgb.cv(params, data, num_boost_round=num_boost_round, nfold=nfolds, 
                    metrics=metrics, seed=seed, early_stopping_rounds=50)
    return cv_res['auc-mean'][-1], len(cv_res['auc-mean'])

In [6]:
df_train, df_test, y_train = data_prepare()
train, test = imput_by_mean(df_train.values, df_test.values)

In [19]:
params_lgb = {
    'nthread': 4,
    #is_unbalance=True,
    #'n_estimators' : 10000,
    'learning_rate' : 0.1171,
    #'num_leaves' : 32,
    'colsample_bytree' : 0.9604,
    'subsample' : 0.9609,
    'max_depth' : 7,
    'reg_alpha' : 9.6523,
    'reg_lambda' : 1,
    'min_split_gain' : 0.179,
    'min_child_weight' : 13,
    'metric': 'auc',
    'verbose': -1,
    #scale_pos_weight=11
}

## step 1. 确定最优learning_rate 和 迭代器数目
参考 [https://blog.csdn.net/han_xiaoyang/article/details/52665396](https://blog.csdn.net/han_xiaoyang/article/details/52665396)

In [21]:
# learning_rate = 0.1, n_estermatro = 100
res = lgb_score(params_lgb, train, y_train)
res

(0.7759551429811502, 100)

In [23]:
params_lgb['learning_rate'] = 0.2
lgb_score(params_lgb, train, y_train)

  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


(0.7745931129272222, 95)

In [24]:
params_lgb['learning_rate'] = 0.3
lgb_score(params_lgb, train, y_train)

  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


(0.7719364634745626, 68)

In [26]:
params_lgb['learning_rate'] = 0.09
lgb_score(params_lgb, train, y_train)

  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


(0.7745532486404375, 100)

In [27]:
params_lgb['learning_rate'] = 0.11
lgb_score(params_lgb, train, y_train)

  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


(0.7756976065151073, 100)

**最终确定learning_rate=0.1, n_estimator=100**

## step 2. max_depth 和 min_weight 参数调优

In [29]:
init_params = {
    'learning_rate' : 0.1,
    'n_estimators' : 100,
    'colsample_bytree' : 0.9604,
    'subsample' : 0.9609,
    'reg_alpha' : 9.6523,
    'reg_lambda' : 1,
    'min_split_gain' : 0.179,
    'metric': 'auc',
}

In [38]:
param_grid = {
    'max_depth' : range(5, 15, 2),
    'min_child_weight' : range(1, 15, 2),
}

In [39]:
searcher = GridSearchCV(estimator=lgb.LGBMClassifier(**init_params), param_grid=param_grid, cv=3, scoring='roc_auc')

In [40]:
searcher.fit(train, y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None,
        colsample_bytree=0.9604, learning_rate=0.1, max_depth=-1,
        metric='auc', min_child_samples=20, min_child_weight=0.001,
        min_split_gain=0.179, n_estimator=100, n_estimators=100, n_jobs=-1,
        num_leaves=31, objective=None, random_state=None, reg_alpha=9.6523,
        reg_lambda=1, silent=True, subsample=0.9609,
        subsample_for_bin=200000, subsample_freq=0),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': range(5, 15, 2), 'min_child_weight': range(1, 15, 2)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [42]:
print(searcher.grid_scores_)
print("========================")
print(searcher.best_params_)
print("========================")
print(searcher.best_score_)


[mean: 0.77125, std: 0.00306, params: {'max_depth': 5, 'min_child_weight': 1}, mean: 0.77150, std: 0.00314, params: {'max_depth': 5, 'min_child_weight': 3}, mean: 0.77092, std: 0.00426, params: {'max_depth': 5, 'min_child_weight': 5}, mean: 0.77073, std: 0.00359, params: {'max_depth': 5, 'min_child_weight': 7}, mean: 0.77145, std: 0.00298, params: {'max_depth': 5, 'min_child_weight': 9}, mean: 0.77174, std: 0.00290, params: {'max_depth': 5, 'min_child_weight': 11}, mean: 0.77270, std: 0.00173, params: {'max_depth': 5, 'min_child_weight': 13}, mean: 0.77345, std: 0.00291, params: {'max_depth': 7, 'min_child_weight': 1}, mean: 0.77382, std: 0.00340, params: {'max_depth': 7, 'min_child_weight': 3}, mean: 0.77330, std: 0.00335, params: {'max_depth': 7, 'min_child_weight': 5}, mean: 0.77394, std: 0.00303, params: {'max_depth': 7, 'min_child_weight': 7}, mean: 0.77348, std: 0.00308, params: {'max_depth': 7, 'min_child_weight': 9}, mean: 0.77367, std: 0.00309, params: {'max_depth': 7, 'min_ch



### 细调

In [43]:
param_grid = {
    'max_depth' : [12, 13, 14],
    'min_child_weight' : [8, 9, 10],
}

In [46]:
searcher = GridSearchCV(estimator=lgb.LGBMClassifier(**init_params), param_grid=param_grid, cv=3, scoring='roc_auc')
searcher.fit(train, y_train)

print(searcher.grid_scores_)
print("========================")
print(searcher.best_params_)
print("========================")
print(searcher.best_score_)


[mean: 0.77437, std: 0.00327, params: {'max_depth': 12, 'min_child_weight': 8}, mean: 0.77430, std: 0.00377, params: {'max_depth': 12, 'min_child_weight': 9}, mean: 0.77477, std: 0.00342, params: {'max_depth': 12, 'min_child_weight': 10}, mean: 0.77439, std: 0.00318, params: {'max_depth': 13, 'min_child_weight': 8}, mean: 0.77479, std: 0.00340, params: {'max_depth': 13, 'min_child_weight': 9}, mean: 0.77482, std: 0.00289, params: {'max_depth': 13, 'min_child_weight': 10}, mean: 0.77458, std: 0.00337, params: {'max_depth': 14, 'min_child_weight': 8}, mean: 0.77494, std: 0.00325, params: {'max_depth': 14, 'min_child_weight': 9}, mean: 0.77472, std: 0.00327, params: {'max_depth': 14, 'min_child_weight': 10}]
{'max_depth': 14, 'min_child_weight': 9}
0.7749436130861374




## step 2.5 插曲：忘记了参数 num_leaves，在这里补上

In [47]:
init_params = {
    'learning_rate' : 0.1,
    'n_estimators' : 100,
    'max_depth' : 14,
    'min_child_weight' : 9,
    'num_leaves': 31,
    'colsample_bytree' : 0.9604,
    'subsample' : 0.9609,
    'reg_alpha' : 9.6523,
    'reg_lambda' : 1,
    'min_split_gain': 0.179,
    'metric': 'auc', 
}
lgb_score(init_params, train, y_train)



(0.775656387390286, 100)

In [50]:
init_params = {
    'learning_rate' : 0.1,
    'n_estimators' : 100,
    'max_depth' : 14,
    'min_child_weight' : 9,
    'num_leaves': 100,
    'colsample_bytree' : 0.9604,
    'subsample' : 0.9609,
    'reg_alpha' : 9.6523,
    'reg_lambda' : 1,
    'min_split_gain': 0.179,
    'metric': 'auc', 
}
lgb_score(init_params, train, y_train)



(0.7765023963754675, 100)

In [53]:
init_params = {
    'learning_rate' : 0.1,
    'n_estimators' : 100,
    'max_depth' : 14,
    'min_child_weight' : 9,
    'num_leaves': 95,
    'colsample_bytree' : 0.9604,
    'subsample' : 0.9609,
    'reg_alpha' : 9.6523,
    'reg_lambda' : 1,
    'min_split_gain': 0.179,
    'metric': 'auc', 
}
lgb_score(init_params, train, y_train)



(0.7763945871634774, 99)

In [63]:
init_params = {
    'learning_rate' : 0.1,
    'n_estimators' : 100,
    'max_depth' : 14,
    'min_child_weight' : 9,
    'colsample_bytree' : 0.9604,
    'subsample' : 0.9609,
    'reg_alpha' : 9.6523,
    'reg_lambda' : 1,
    'min_split_gain': 0.179,
    'metric': 'auc', 
}

param_grid = {
    'num_leaves': range(20, 100, 10),
}


In [64]:
searcher = GridSearchCV(estimator=lgb.LGBMClassifier(**init_params), param_grid=param_grid, cv=3, scoring='roc_auc')
searcher.fit(train, y_train)

print(searcher.grid_scores_)
print("========================")
print(searcher.best_params_)
print("========================")
print(searcher.best_score_)


[mean: 0.77362, std: 0.00327, params: {'num_leaves': 20}, mean: 0.77434, std: 0.00334, params: {'num_leaves': 30}, mean: 0.77390, std: 0.00334, params: {'num_leaves': 40}, mean: 0.77517, std: 0.00362, params: {'num_leaves': 50}, mean: 0.77435, std: 0.00414, params: {'num_leaves': 60}, mean: 0.77438, std: 0.00379, params: {'num_leaves': 70}, mean: 0.77265, std: 0.00388, params: {'num_leaves': 80}, mean: 0.77367, std: 0.00431, params: {'num_leaves': 90}]
{'num_leaves': 50}
0.7751657038584755




In [65]:
param_grid = {
    'num_leaves': range(45, 55, 1),
}
searcher = GridSearchCV(estimator=lgb.LGBMClassifier(**init_params), param_grid=param_grid, cv=3, scoring='roc_auc')
searcher.fit(train, y_train)

print(searcher.grid_scores_)
print("========================")
print(searcher.best_params_)
print("========================")
print(searcher.best_score_)



[mean: 0.77364, std: 0.00398, params: {'num_leaves': 45}, mean: 0.77433, std: 0.00362, params: {'num_leaves': 46}, mean: 0.77396, std: 0.00368, params: {'num_leaves': 47}, mean: 0.77408, std: 0.00439, params: {'num_leaves': 48}, mean: 0.77414, std: 0.00375, params: {'num_leaves': 49}, mean: 0.77517, std: 0.00362, params: {'num_leaves': 50}, mean: 0.77445, std: 0.00425, params: {'num_leaves': 51}, mean: 0.77387, std: 0.00396, params: {'num_leaves': 52}, mean: 0.77394, std: 0.00426, params: {'num_leaves': 53}, mean: 0.77439, std: 0.00369, params: {'num_leaves': 54}]
{'num_leaves': 50}
0.7751657038584755




## step 3. 调整 min_split_gain

In [66]:
init_params = {
    'nthread': 8,
    'learning_rate' : 0.1,
    'n_estimators' : 100,
    'max_depth' : 14,
    'min_child_weight' : 9,
    'num_leaves': 50,
    'colsample_bytree' : 0.9604,
    'subsample' : 0.9609,
    'reg_alpha' : 9.6523,
    'reg_lambda' : 1,
    'metric': 'auc', 
}

In [67]:
param_grid = {
    'min_split_gain' : np.arange(0.15, 0.2, 0.1),
}

In [68]:
searcher = GridSearchCV(estimator=lgb.LGBMClassifier(**init_params), param_grid=param_grid, cv=3, scoring='roc_auc')
searcher.fit(train, y_train)

print(searcher.grid_scores_)
print("========================")
print(searcher.best_params_)
print("========================")
print(searcher.best_score_)


[mean: 0.77484, std: 0.00405, params: {'min_split_gain': 0.1}, mean: 0.77509, std: 0.00377, params: {'min_split_gain': 0.2}, mean: 0.77444, std: 0.00425, params: {'min_split_gain': 0.30000000000000004}, mean: 0.77476, std: 0.00345, params: {'min_split_gain': 0.4}]
{'min_split_gain': 0.2}
0.7750868191685372




## step 4. 调整subsample 和 colsample_bytree 参数

In [77]:
init_params = {
    'nthread': 8,
    'learning_rate' : 0.1,
    'n_estimators' : 100,
    'max_depth' : 14,
    'min_child_weight' : 9,
    'num_leaves': 50,
    'min_split_gain': 0.2,
    'reg_alpha' : 9.6523,
    'reg_lambda' : 1,
    'colsample_bytree': 1.0,
    'metric': 'auc', 
}


**测试发现colsample_bytree 对结果无影响**

In [78]:
param_grid = {
 'subsample': np.arange(0.9, 1.0, 0.01),
}

In [82]:
# searcher = GridSearchCV(estimator=lgb.LGBMClassifier(**init_params), param_grid=param_grid, cv=3, scoring='roc_auc', n_jobs=8)
# searcher.fit(train, y_train)

# for i in searcher.grid_scores_:
#     print(i)
# print("========================")
# print(searcher.best_params_)
# print("========================")
# print(searcher.best_score_)


()