In [None]:
import pandas as pd
import lightgbm as lgb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

In [1]:
cance_data = load_breast_cancer()
train_data = cance_data.data
train_label = cance_data.target
print(len(train_data), len(train_label))

NameError: name 'load_breast_cancer' is not defined

In [4]:
X_train, X_test, y_train, y_test = train_test_split(train_data, train_label, random_state=0, test_size=0.2)
print(len(X_train), len(X_test), len(y_train), len(y_test))

455 114 455 114


In [10]:
# 初始化参数：给予重要参数一个初始值(意义不太大)，只是为了方便确定其他参数(便于调参)
params = {
    "boosting_type": "gbdt", 
    "objective": "binary", 
    "metric": "auc", 
    "nthread": 4, 
    "learning_rate": 0.1, 
    "num_leaves": 30, 
    "max_depth": 5, 
    "subsample": 0.8, 
    "colsample_bytree": 0.8
}

In [12]:
# step1: 确定学习率和迭代次数
data_train = lgb.Dataset(X_train, y_train)
cv_results = lgb.cv(params, data_train, num_boost_round=1000, nfold=5, 
                    stratified=False, shuffle=True, metrics="auc", 
                    early_stopping_rounds=50, seed=0)
print("best n_estimates: ", len(cv_results["auc-mean"]))
print("best cv score: ", pd.Series(cv_results["auc-mean"]).max())

best n_estimates:  188
best cv score:  0.9913471629808542


In [14]:
# 根据上述结果，选取 n_estimates = 188
# step2: 确定 max_depth 和 num_leaves
from sklearn.model_selection import GridSearchCV

params_test1 = {
    "max_depth": range(3, 8, 1), 
    "num_leaves": range(5, 100, 5)
}
model = lgb.LGBMClassifier(boosting_type="gbdt", 
                           objective="binary", 
                           metrics="auc", 
                           learning_rate=0.1, 
                           n_estimators=188, 
                           max_depth=6, 
                           bagging_fraction=0.8, 
                           feature_fraction=0.8)
gsearch1 = GridSearchCV(estimator=model, param_grid=params_test1, 
                        scoring="roc_auc", cv=5, n_jobs=-1)
gsearch1.fit(X_train, y_train)
print(gsearch1.best_params_, gsearch1.best_score_)

{'num_leaves': 10, 'max_depth': 4} 0.9943573667711598


In [15]:
# 根据上述结果可知，选取 max_depth = 4, num_leaves = 10
# step3: 确定 min_data_in_leaf 和 max_bin 参数
params_test2 = {
    "max_bin": range(5, 256, 10), 
    "min_data_in_leaf": range(1, 102, 10)
}
model2 = lgb.LGBMClassifier(boosting_type="gbdt", 
                            objective="binary", 
                            metrics="auc", 
                            learning_rate=0.1, 
                            n_estimators=188, 
                            max_depth=4, 
                            num_leaves=10, 
                            bagging_fraction=0.8, 
                            feature_fraction=0.8)
gsearch2 = GridSearchCV(estimator=model2, param_grid=params_test2, 
                        scoring="roc_auc", cv=5, n_jobs=-1)
gsearch2.fit(X_train, y_train)
print(gsearch2.best_params_, gsearch2.best_score_)

{'min_data_in_leaf': 51, 'max_bin': 15} 0.9952978056426331


In [17]:
# 根据上述结果，选取 min_data_in_leaf = 51, max_bin = 15
# step4: 确定 feature_fraction, bagging_fraction, bagging_freq
params_test3 = {
    "feature_fraction": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], 
    "bagging_fraction": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    "bagging_freq": range(0, 100, 10)
}
model3 = lgb.LGBMClassifier(boosting_type="gbdt", 
                            objective="binary", 
                            metrics="auc", 
                            learning_rate=0.1, 
                            n_estimators=188, 
                            max_depth=4, 
                            num_leaves=10, 
                            max_bin=15, 
                            min_data_in_leaf=51)
gsearch3 = GridSearchCV(estimator=model3, param_grid=params_test3, 
                        scoring="roc_auc", cv=5, n_jobs=-1)
gsearch3.fit(X_train, y_train)
print(gsearch3.best_params_, gsearch3.best_score_)

{'bagging_freq': 0, 'feature_fraction': 0.8, 'bagging_fraction': 0.1} 0.9952978056426331


In [18]:
# 根据上述结果，选取 bagging_freq = 0, feature_fraction = 0.8, bagging_fraction = 0.1
# step5: 确定 lambda_l1 和 lambda_l2
params_test4 = {
    "lambda_l1": [1e-5, 1e-3, 1e-1, 0.0, 0.3, 0.5, 0.7, 0.9, 1.0], 
    "lambda_l2": [1e-5, 1e-3, 1e-1, 0.0, 0.3, 0.5, 0.7, 0.9, 1.0]
}
model4 = lgb.LGBMClassifier(boosting_type="gbdt", 
                            objective="binary", 
                            metrics="auc", 
                            learning_rate=0.1, 
                            n_estimators=188, 
                            max_depth=4, 
                            num_leaves=10, 
                            max_bin=15, 
                            min_data_in_leaf=51, 
                            bagging_fraction=0.6, 
                            bagging_freq=0, 
                            feature_fraction=0.8)
gsearch4 = GridSearchCV(estimator=model4, param_grid=params_test4, 
                        scoring="roc_auc", cv=5, n_jobs=-1)
gsearch4.fit(X_train, y_train)
print(gsearch4.best_params_, gsearch4.best_score_)

{'lambda_l1': 1e-05, 'lambda_l2': 1e-05} 0.9952978056426331


In [19]:
# 根据上述结果， 选取 lambda_l1 = 1e-5, lambda_l2 = 1e-5
# step6: 确定 min_split_gain
params_test5 = {
    "min_split_gain": [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
}
model5 = lgb.LGBMClassifier(boosting_type="gbdt", 
                            objective="binary", 
                            metrics="auc", 
                            learning_rate=0.1, 
                            n_estimators=188, 
                            max_depth=4, 
                            num_leaves=10, 
                            max_bin=15, 
                            min_data_in_leaf=51, 
                            bagging_fraction=0.6, 
                            bagging_freq=0, 
                            feature_fraction=0.8, 
                            lambda_l1=1e-5, 
                            lambda_l2=1e-5)
gsearch5 = GridSearchCV(estimator=model5, param_grid=params_test5, 
                        scoring="roc_auc", cv=5, n_jobs=-1)
gsearch5.fit(X_train, y_train)
print(gsearch5.best_params_, gsearch5.best_score_)

{'min_split_gain': 0.0} 0.9952978056426331


In [21]:
# 根据上述结果，选取 min_split_gain = 0.0
# step7: 降低学习率，增加迭代次数，验证模型
from sklearn import metrics
final_model = lgb.LGBMClassifier(boosting_type="gbdt", 
                                 objective="binary", 
                                 metrics="auc", 
                                 learning_rate=0.01, 
                                 n_estimators=1000, 
                                 max_depth=4, 
                                 num_leaves=10, 
                                 max_bin=15, 
                                 min_data_in_leaf=51, 
                                 bagging_fraction=0.6, 
                                 bagging_freq=0, 
                                 feature_fraction=0.8, 
                                 lambda_l1=1e-5, 
                                 lambda_l2=1e-5, 
                                 min_split_gain=0.0)
final_model.fit(X_train, y_train)
y_pred = final_model.predict(X_test)
print("acc: ", metrics.accuracy_score(y_test, y_pred))
print("auc: ", metrics.roc_auc_score(y_test, y_pred))

acc:  0.9736842105263158
auc:  0.9744363289933311


In [22]:
# 使用默认参数
default_model = lgb.LGBMClassifier()
default_model.fit(X_train, y_train)
y_pred = default_model.predict(X_test)
print("default acc: ", metrics.accuracy_score(y_test, y_pred))
print("default auc: ", metrics.roc_auc_score(y_test, y_pred))

default acc:  0.9649122807017544
default auc:  0.9637980311209908
