In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import gridspec
import seaborn as sns
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, accuracy_score, recall_score, precision_score, auc, confusion_matrix, f1_score, roc_auc_score
from xgboost.sklearn import XGBClassifier
from collections import Counter
import xgboost as xgb
from imblearn.over_sampling import SMOTE

In [2]:
from base import reduce_mem_usage, read_csv, datapath, drop_features, tmppath, Box_Cox, train_drop_features, getTrainTest, minmax_target, combine_feature, auto_feature_make

In [3]:
train = reduce_mem_usage(read_csv(tmppath + '10FoldSubTrain.csv'))
test = reduce_mem_usage(read_csv(tmppath + '10FoldSubTest.csv'))

Memory usage after optimization is: 6.57 MB
Decreased by 47.5%
Memory usage after optimization is: 1.90 MB
Decreased by 47.5%


In [4]:
X_train = train.drop(['emd_lable2'], axis=1)
Y_train = train['emd_lable2'].astype(int)
test = test.drop(['emd_lable2'], axis=1)

In [5]:

discrete_list = ['seg_cabin','seg_flight','pref_line_y2_2','pref_line_y1_2','pref_line_y3_4','pref_line_y3_3','pref_line_y2_1','pref_city_y3_2','pref_city_y2_3','pref_line_y3_5','pref_line_y2_4','pref_city_y2_2','pref_line_y3_2','pref_line_y2_5','pref_line_y2_3','pref_orig_city_y3','pref_city_y3_3','pref_line_y3_1','pref_orig_y2_2','seg_route_to','pref_orig_y3_3','pref_orig_city_y2','pref_line_y1_3','pref_line_m3_3','pref_aircraft_y3_3','pref_aircraft_y2_5','seg_dep_time_hour']

In [6]:
feature_list = X_train.columns.tolist()
continue_list = list(set(feature_list) - set(discrete_list))

In [7]:
X_train, X_test = minmax_target(X_train, test, Y_train, continue_list, discrete_list) # 离散值编码与连续特征归一化

  elif pd.api.types.is_categorical(cols):


In [8]:
x_train, x_test, y_train, y_test =getTrainTest(X_train, Y_train)# 线下验证，80%训练集，20%验证集

In [9]:
dtrain = xgb.DMatrix(x_train[continue_list].values, label=y_train.values)
dtest = xgb.DMatrix(x_test[continue_list].values, label=y_test.values)

### 初始化自定义参数


# 存在问题1：cv函数确定迭代次数的方法，dtrain只使用了连续特征的数据
# 存在问题2： GridSearch使用cv==5,对不平衡数据可能存在偏差，应该使用Straifield生成的折叠器

In [12]:
xgb1 = XGBClassifier(max_depth=5,
                     learning_rate=0.1,
                     n_estimators=10000,
                     objective='binary:logistic',
                     booster='gbtree',
                     n_jobs=4,
                     gamma=0,
                     min_child_weight=1,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     seed=27)

测试在初始化自定义参数下的得分结果

In [13]:
xgb_bst1 = xgb1.fit(x_train, y_train)
y_pred = xgb_bst1.predict(x_test)
y_pred2 = xgb_bst1.predict(x_train)
print("XGBoost结果")
print("balanced_accuracy_score=", balanced_accuracy_score(y_pred=y_pred, y_true=y_test),balanced_accuracy_score(y_pred=y_pred2, y_true=y_train))
print("f1=", f1_score(y_pred=y_pred, y_true=y_test), f1_score(y_pred=y_pred2, y_true=y_train))
print("precision_score=", precision_score(y_pred=y_pred, y_true=y_test),precision_score(y_pred=y_pred2, y_true=y_train))
print("recall_score=", recall_score(y_pred=y_pred, y_true=y_test), recall_score(y_pred=y_pred2, y_true=y_train))
print("accuracy=", accuracy_score(y_pred=y_pred, y_true=y_test), accuracy_score(y_pred=y_pred2, y_true=y_train))
print("auc=", roc_auc_score(y_true=y_test, y_score=y_pred), roc_auc_score(y_true=y_train, y_score=y_pred2))
print("#####混淆矩阵#########")
print(confusion_matrix(y_true=y_test, y_pred=y_pred), confusion_matrix(y_true=y_train, y_pred=y_pred2))

XGBoost结果
balanced_accuracy_score= 0.6539459263375629 0.9818870104744122
f1= 0.41025641025641024 0.9714285714285714
precision_score= 0.5549132947976878 0.9776824034334763
recall_score= 0.3254237288135593 0.9652542372881356
accuracy= 0.9411137187966716 0.9964257135236063
auc= 0.6539459263375629 0.9818870104744122
#####混淆矩阵#########
[[4315   77]
 [ 199   96]] [[17539    26]
 [   41  1139]]


在当前学习率下利用cv函数获得最佳的迭代次数

In [14]:
cv_result = xgb.cv(xgb1.get_xgb_params(),
dtrain,
num_boost_round=xgb1.get_params()['n_estimators'],
nfold=5,
metrics='auc',
early_stopping_rounds=100,
callbacks=[xgb.callback.early_stop(100),
xgb.callback.print_evaluation(period=1,show_stdv=True)])

st-auc:0.76771+0.02130
[463]	train-auc:0.97140+0.00097	test-auc:0.76791+0.02127
[464]	train-auc:0.97152+0.00090	test-auc:0.76785+0.02118
[465]	train-auc:0.97157+0.00094	test-auc:0.76793+0.02110
[466]	train-auc:0.97162+0.00093	test-auc:0.76786+0.02123
[467]	train-auc:0.97169+0.00095	test-auc:0.76793+0.02109
[468]	train-auc:0.97176+0.00099	test-auc:0.76790+0.02114
[469]	train-auc:0.97187+0.00101	test-auc:0.76779+0.02122
[470]	train-auc:0.97192+0.00103	test-auc:0.76772+0.02108
[471]	train-auc:0.97197+0.00107	test-auc:0.76770+0.02101
[472]	train-auc:0.97198+0.00106	test-auc:0.76770+0.02100
[473]	train-auc:0.97204+0.00105	test-auc:0.76783+0.02084
[474]	train-auc:0.97211+0.00103	test-auc:0.76792+0.02093
[475]	train-auc:0.97217+0.00104	test-auc:0.76785+0.02110
[476]	train-auc:0.97222+0.00104	test-auc:0.76783+0.02106
[477]	train-auc:0.97231+0.00105	test-auc:0.76801+0.02101
[478]	train-auc:0.97240+0.00110	test-auc:0.76784+0.02084
[479]	train-auc:0.97249+0.00109	test-auc:0.76789+0.02081
[480]	tr

得到结果为712

In [15]:
xgb1 = XGBClassifier(max_depth=5,
                     learning_rate=0.1,
                     n_estimators=712,
                     objective='binary:logistic',
                     booster='gbtree',
                     n_jobs=4,
                     gamma=0,
                     min_child_weight=1,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     seed=27)

下面调整max_depth和min_child_weight参数

首先先大范围粗调

In [16]:
param_grid = {'max_depth':range(1,9,2),
'min_child_weight':range(1,5,2)}
grid_search = GridSearchCV(xgb1,param_grid,scoring='f1',iid=False,cv=5)
grid_search.fit(x_train,y_train)
print('best_params:', grid_search.best_params_)
print('best_score:', grid_search.best_score_)

best_params: {'max_depth': 7, 'min_child_weight': 1}
best_score: 0.21884645201651254


下面再次对这两个参数进行细微调整

In [17]:
param_grid = {'max_depth':[6,7,8],
'min_child_weight':[0,1,2]}
grid_search = GridSearchCV(xgb1,param_grid,scoring='f1',iid=False,cv=5)
grid_search.fit(x_train,y_train)
print('best_params:', grid_search.best_params_)
print('best_score:', grid_search.best_score_)

best_params: {'max_depth': 8, 'min_child_weight': 0}
best_score: 0.24610877865758782


细化调整后得分有所提高，故采用max_depth=8，min_child_weight=0

In [18]:
xgb1 = XGBClassifier(max_depth=8,
                     learning_rate=0.1,
                     n_estimators=712,
                     objective='binary:logistic',
                     booster='gbtree',
                     n_jobs=4,
                     gamma=0,
                     min_child_weight=0,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     seed=27)

下面继续调整gamma参数

In [19]:
param_grid = {'gamma':[i/10.0 for i in range(0,11)]}
grid_search = GridSearchCV(xgb1,param_grid,scoring='f1',iid=False,cv=5)
grid_search.fit(x_train,y_train)
print('best_params:', grid_search.best_params_)
print('best_score:', grid_search.best_score_)

best_params: {'gamma': 0.1}
best_score: 0.24963431326754892


得分提高，得到最佳gamma值为0.1

In [10]:
xgb1 = XGBClassifier(max_depth=8,
                     learning_rate=0.1,
                     n_estimators=712,
                     objective='binary:logistic',
                     booster='gbtree',
                     n_jobs=4,
                     gamma=0.1,
                     min_child_weight=0,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     seed=27)

下面调整subsample和colsample_bytree参数

In [21]:
param_grid = {'subsample':[i/10.0 for i in range(6,11)],
'colsample_bytree':[i/10.0 for i in range(6,11)]}
grid_search = GridSearchCV(xgb1,param_grid,scoring='f1',iid=False,cv=5)
grid_search.fit(x_train,y_train)
print('best_params:', grid_search.best_params_)
print('best_score:', grid_search.best_score_)

Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
best_params: {'colsample_bytree': 1.0, 'subsample': 0.6}
best_score: 0.2581196763146104


再进行细化的参数测试

In [11]:
param_grid = {'subsample':[i/10.0 for i in range(0,6)],
'colsample_bytree':[i/10.0 for i in range(8,11)]}
grid_search = GridSearchCV(xgb1,param_grid,scoring='f1',iid=False,cv=5)
grid_search.fit(x_train,y_train)
print('best_params:', grid_search.best_params_)
print('best_score:', grid_search.best_score_)

Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
best_params: {'colsample_bytree': 1.0, 'subsample': 0.4}
best_score: 0.2516351066294395


得分降低，故选择上一次参数，colsample_bytree=1.0, subsample=0.6

In [12]:
xgb1 = XGBClassifier(max_depth=8,
                     learning_rate=0.1,
                     n_estimators=712,
                     objective='binary:logistic',
                     booster='gbtree',
                     n_jobs=4,
                     gamma=0.1,
                     min_child_weight=0,
                     subsample=0.6,
                     colsample_bytree=1.0,
                     seed=27)

最后尝试调整正则化参数

In [13]:
param_grid = {'reg_alpha':[i/10.0 for i in range(0,11)]}
grid_search = GridSearchCV(xgb1,param_grid,scoring='f1',iid=False,cv=5)
grid_search.fit(x_train,y_train)
print('best_params:', grid_search.best_params_)
print('best_score:', grid_search.best_score_)

best_params: {'reg_alpha': 0.0}
best_score: 0.2581196763146104


得分结果未改变，故不调整默认即可。

继续尝试调整reg_lambda参数观察结果是否有提高

In [14]:
param_grid = {'reg_lambda':[i/10.0 for i in range(0,11)]}
grid_search = GridSearchCV(xgb1,param_grid,scoring='f1',iid=False,cv=5)
grid_search.fit(x_train,y_train)
print('best_params:', grid_search.best_params_)
print('best_score:', grid_search.best_score_)

best_params: {'reg_lambda': 0.2}
best_score: 0.2605130441438939


得分提高故采用此参数reg_lambda=0.2

In [11]:
xgb1 = XGBClassifier(max_depth=8,
                     learning_rate=0.1,
                     n_estimators=712,
                     objective='binary:logistic',
                     booster='gbtree',
                     n_jobs=4,
                     gamma=0.1,
                     min_child_weight=0,
                     subsample=0.6,
                     colsample_bytree=1.0,
                     reg_lambda=0.2,
                     seed=27)

In [12]:
#测试得分为
xgb_bst1 = xgb1.fit(x_train, y_train)
y_pred = xgb_bst1.predict(x_test)
y_pred2 = xgb_bst1.predict(x_train)
print("XGBoost结果")
print("balanced_accuracy_score=", balanced_accuracy_score(y_pred=y_pred, y_true=y_test),balanced_accuracy_score(y_pred=y_pred2, y_true=y_train))
print("f1=", f1_score(y_pred=y_pred, y_true=y_test), f1_score(y_pred=y_pred2, y_true=y_train))
print("precision_score=", precision_score(y_pred=y_pred, y_true=y_test),precision_score(y_pred=y_pred2, y_true=y_train))
print("recall_score=", recall_score(y_pred=y_pred, y_true=y_test), recall_score(y_pred=y_pred2, y_true=y_train))
print("accuracy=", accuracy_score(y_pred=y_pred, y_true=y_test), accuracy_score(y_pred=y_pred2, y_true=y_train))
print("auc=", roc_auc_score(y_true=y_test, y_score=y_pred), roc_auc_score(y_true=y_train, y_score=y_pred2))
print("#####混淆矩阵#########")
print(confusion_matrix(y_true=y_test, y_pred=y_pred), confusion_matrix(y_true=y_train, y_pred=y_pred2))

XGBoost结果
balanced_accuracy_score= 0.6349857985242815 0.9786110427612693
f1= 0.3878504672897196 0.9695670810115731
precision_score= 0.6240601503759399 0.9809193408499567
recall_score= 0.28135593220338984 0.9584745762711865
accuracy= 0.9441007040751014 0.9962123232862097
auc= 0.6349857985242815 0.9786110427612693
#####混淆矩阵#########
[[4342   50]
 [ 212   83]] [[17543    22]
 [   49  1131]]


降低学习率，然后测试最佳迭代次数

In [49]:
xgb1 = XGBClassifier(max_depth=8,
                     learning_rate=0.05,
                     n_estimators=10000,
                     objective='binary:logistic',
                     booster='gbtree',
                     n_jobs=4,
                     gamma=0.1,
                     min_child_weight=0,
                     subsample=0.6,
                     colsample_bytree=1.0,
                     reg_lambda=0.2,
                     seed=27)

In [50]:
cv_result = xgb.cv(xgb1.get_xgb_params(),
dtrain,
num_boost_round=xgb1.get_params()['n_estimators'],
nfold=5,
metrics='auc',
early_stopping_rounds=400,
callbacks=[xgb.callback.early_stop(400),
xgb.callback.print_evaluation(period=1,show_stdv=True)])

st-auc:0.78244+0.01057
[416]	train-auc:0.99700+0.00028	test-auc:0.78240+0.01056
[417]	train-auc:0.99701+0.00028	test-auc:0.78253+0.01064
[418]	train-auc:0.99702+0.00028	test-auc:0.78240+0.01076
[419]	train-auc:0.99704+0.00028	test-auc:0.78238+0.01066
[420]	train-auc:0.99705+0.00027	test-auc:0.78231+0.01049
[421]	train-auc:0.99708+0.00025	test-auc:0.78226+0.01038
[422]	train-auc:0.99710+0.00025	test-auc:0.78231+0.01027
[423]	train-auc:0.99712+0.00024	test-auc:0.78242+0.01045
[424]	train-auc:0.99714+0.00022	test-auc:0.78248+0.01046
[425]	train-auc:0.99716+0.00022	test-auc:0.78240+0.01023
[426]	train-auc:0.99718+0.00021	test-auc:0.78249+0.01031
[427]	train-auc:0.99720+0.00022	test-auc:0.78241+0.01019
[428]	train-auc:0.99723+0.00022	test-auc:0.78232+0.01006
[429]	train-auc:0.99724+0.00022	test-auc:0.78223+0.01027
[430]	train-auc:0.99724+0.00020	test-auc:0.78213+0.01023
[431]	train-auc:0.99727+0.00021	test-auc:0.78186+0.01022
[432]	train-auc:0.99728+0.00021	test-auc:0.78179+0.01017
[433]	tr

In [15]:
xgb1 = XGBClassifier(max_depth=8,
                     learning_rate=0.05,
                     n_estimators=365,
                     objective='binary:logistic',
                     booster='gbtree',
                     n_jobs=4,
                     gamma=0.1,
                     min_child_weight=0,
                     subsample=0.6,
                     colsample_bytree=1.0,
                     reg_lambda=0.2,
                     seed=27)

In [16]:
#测试得分为
xgb_bst1 = xgb1.fit(x_train, y_train)
y_pred = xgb_bst1.predict(x_test)
y_pred2 = xgb_bst1.predict(x_train)
print("XGBoost结果")
print("balanced_accuracy_score=", balanced_accuracy_score(y_pred=y_pred, y_true=y_test),balanced_accuracy_score(y_pred=y_pred2, y_true=y_train))
print("f1=", f1_score(y_pred=y_pred, y_true=y_test), f1_score(y_pred=y_pred2, y_true=y_train))
print("precision_score=", precision_score(y_pred=y_pred, y_true=y_test),precision_score(y_pred=y_pred2, y_true=y_train))
print("recall_score=", recall_score(y_pred=y_pred, y_true=y_test), recall_score(y_pred=y_pred2, y_true=y_train))
print("accuracy=", accuracy_score(y_pred=y_pred, y_true=y_test), accuracy_score(y_pred=y_pred2, y_true=y_train))
print("auc=", roc_auc_score(y_true=y_test, y_score=y_pred), roc_auc_score(y_true=y_train, y_score=y_pred2))
print("#####混淆矩阵#########")
print(confusion_matrix(y_true=y_test, y_pred=y_pred), confusion_matrix(y_true=y_train, y_pred=y_pred2))

XGBoost结果
balanced_accuracy_score= 0.5794205952270692 0.8057329434980002
f1= 0.26666666666666666 0.7564169722367731
precision_score= 0.7384615384615385 0.99039780521262
recall_score= 0.16271186440677965 0.611864406779661
accuracy= 0.9436739918924685 0.9751933849026407
auc= 0.5794205952270692 0.8057329434980002
#####混淆矩阵#########
[[4375   17]
 [ 247   48]] [[17558     7]
 [  458   722]]


继续调低学习率

In [46]:
xgb1 = XGBClassifier(max_depth=8,
                     learning_rate=0.025,
                     n_estimators=10000,
                     objective='binary:logistic',
                     booster='gbtree',
                     n_jobs=4,
                     gamma=0.1,
                     min_child_weight=0,
                     subsample=0.6,
                     colsample_bytree=1.0,
                     reg_lambda=0.2,
                     seed=27)

In [47]:
cv_result = xgb.cv(xgb1.get_xgb_params(),
dtrain,
num_boost_round=xgb1.get_params()['n_estimators'],
nfold=5,
metrics='auc',
early_stopping_rounds=400)


In [48]:
print(cv_result.shape[0])

548


In [32]:
xgb1 = XGBClassifier(max_depth=8,
                     learning_rate=0.025,
                     n_estimators=548,
                     objective='binary:logistic',
                     booster='gbtree',
                     n_jobs=4,
                     gamma=0.1,
                     min_child_weight=0,
                     subsample=0.6,
                     colsample_bytree=1.0,
                     reg_lambda=0.2,
                     seed=27)

In [33]:
#测试得分为
xgb_bst1 = xgb1.fit(x_train, y_train)
y_pred = xgb_bst1.predict(x_test)
y_pred2 = xgb_bst1.predict(x_train)
print("XGBoost结果")
print("balanced_accuracy_score=", balanced_accuracy_score(y_pred=y_pred, y_true=y_test),balanced_accuracy_score(y_pred=y_pred2, y_true=y_train))
print("f1=", f1_score(y_pred=y_pred, y_true=y_test), f1_score(y_pred=y_pred2, y_true=y_train))
print("precision_score=", precision_score(y_pred=y_pred, y_true=y_test),precision_score(y_pred=y_pred2, y_true=y_train))
print("recall_score=", recall_score(y_pred=y_pred, y_true=y_test), recall_score(y_pred=y_pred2, y_true=y_train))
print("accuracy=", accuracy_score(y_pred=y_pred, y_true=y_test), accuracy_score(y_pred=y_pred2, y_true=y_train))
print("auc=", roc_auc_score(y_true=y_test, y_score=y_pred), roc_auc_score(y_true=y_train, y_score=y_pred2))
print("#####混淆矩阵#########")
print(confusion_matrix(y_true=y_test, y_pred=y_pred), confusion_matrix(y_true=y_train, y_pred=y_pred2))

XGBoost结果
balanced_accuracy_score= 0.5577282269766293 0.7371458070990559
f1= 0.20348837209302328 0.6418338108882522
precision_score= 0.7142857142857143 0.9911504424778761
recall_score= 0.11864406779661017 0.4745762711864407
accuracy= 0.9415404309793045 0.9666577754067751
auc= 0.5577282269766293 0.7371458070990559
#####混淆矩阵#########
[[4378   14]
 [ 260   35]] [[17560     5]
 [  620   560]]


继续降低学习率测试

In [44]:
xgb1 = XGBClassifier(max_depth=8,
                     learning_rate=0.015,
                     n_estimators=10000,
                     objective='binary:logistic',
                     booster='gbtree',
                     n_jobs=4,
                     gamma=0.1,
                     min_child_weight=0,
                     subsample=0.6,
                     colsample_bytree=1.0,
                     reg_lambda=0.2,
                     seed=27)

In [45]:
cv_result = xgb.cv(xgb1.get_xgb_params(),
dtrain,
num_boost_round=xgb1.get_params()['n_estimators'],
nfold=5,
metrics='auc',
early_stopping_rounds=400)
print(cv_result.shape[0])

810


In [36]:
xgb1 = XGBClassifier(max_depth=8,
                     learning_rate=0.015,
                     n_estimators=810,
                     objective='binary:logistic',
                     booster='gbtree',
                     n_jobs=4,
                     gamma=0.1,
                     min_child_weight=0,
                     subsample=0.6,
                     colsample_bytree=1.0,
                     reg_lambda=0.2,
                     seed=27)

In [37]:
#测试得分为
xgb_bst1 = xgb1.fit(x_train, y_train)
y_pred = xgb_bst1.predict(x_test)
y_pred2 = xgb_bst1.predict(x_train)
print("XGBoost结果")
print("balanced_accuracy_score=", balanced_accuracy_score(y_pred=y_pred, y_true=y_test),balanced_accuracy_score(y_pred=y_pred2, y_true=y_train))
print("f1=", f1_score(y_pred=y_pred, y_true=y_test), f1_score(y_pred=y_pred2, y_true=y_train))
print("precision_score=", precision_score(y_pred=y_pred, y_true=y_test),precision_score(y_pred=y_pred2, y_true=y_train))
print("recall_score=", recall_score(y_pred=y_pred, y_true=y_test), recall_score(y_pred=y_pred2, y_true=y_train))
print("accuracy=", accuracy_score(y_pred=y_pred, y_true=y_test), accuracy_score(y_pred=y_pred2, y_true=y_train))
print("auc=", roc_auc_score(y_true=y_test, y_score=y_pred), roc_auc_score(y_true=y_train, y_score=y_pred2))
print("#####混淆矩阵#########")
print(confusion_matrix(y_true=y_test, y_pred=y_pred), confusion_matrix(y_true=y_train, y_pred=y_pred2))

XGBoost结果
balanced_accuracy_score= 0.5525296378623692 0.708303782078189
f1= 0.18713450292397663 0.5864123957091776
precision_score= 0.6808510638297872 0.9879518072289156
recall_score= 0.10847457627118644 0.41694915254237286
accuracy= 0.9406870066140388 0.9629767938116831
auc= 0.5525296378623692 0.7083037820781889
#####混淆矩阵#########
[[4377   15]
 [ 263   32]] [[17559     6]
 [  688   492]]


继续降低学习率

In [42]:
xgb1 = XGBClassifier(max_depth=8,
                     learning_rate=0.01,
                     n_estimators=10000,
                     objective='binary:logistic',
                     booster='gbtree',
                     n_jobs=4,
                     gamma=0.1,
                     min_child_weight=0,
                     subsample=0.6,
                     colsample_bytree=1.0,
                     reg_lambda=0.2,
                     seed=27)

In [43]:
cv_result = xgb.cv(xgb1.get_xgb_params(),
dtrain,
num_boost_round=xgb1.get_params()['n_estimators'],
nfold=5,
metrics='auc',
early_stopping_rounds=400)
print(cv_result.shape[0])

872


In [40]:
xgb1 = XGBClassifier(max_depth=8,
                     learning_rate=0.01,
                     n_estimators=872,
                     objective='binary:logistic',
                     booster='gbtree',
                     n_jobs=4,
                     gamma=0.1,
                     min_child_weight=0,
                     subsample=0.6,
                     colsample_bytree=1.0,
                     reg_lambda=0.2,
                     seed=27)

In [41]:
#测试得分为
xgb_bst1 = xgb1.fit(x_train, y_train)
y_pred = xgb_bst1.predict(x_test)
y_pred2 = xgb_bst1.predict(x_train)
print("XGBoost结果")
print("balanced_accuracy_score=", balanced_accuracy_score(y_pred=y_pred, y_true=y_test),balanced_accuracy_score(y_pred=y_pred2, y_true=y_train))
print("f1=", f1_score(y_pred=y_pred, y_true=y_test), f1_score(y_pred=y_pred2, y_true=y_train))
print("precision_score=", precision_score(y_pred=y_pred, y_true=y_test),precision_score(y_pred=y_pred2, y_true=y_train))
print("recall_score=", recall_score(y_pred=y_pred, y_true=y_test), recall_score(y_pred=y_pred2, y_true=y_train))
print("accuracy=", accuracy_score(y_pred=y_pred, y_true=y_test), accuracy_score(y_pred=y_pred2, y_true=y_train))
print("auc=", roc_auc_score(y_true=y_test, y_score=y_pred), roc_auc_score(y_true=y_train, y_score=y_pred2))
print("#####混淆矩阵#########")
print(confusion_matrix(y_true=y_test, y_pred=y_pred), confusion_matrix(y_true=y_train, y_pred=y_pred2))

XGBoost结果
balanced_accuracy_score= 0.5293700410607884 0.6379932647261745
f1= 0.11145510835913312 0.43150231634679015
precision_score= 0.6428571428571429 0.9848942598187311
recall_score= 0.061016949152542375 0.27627118644067794
accuracy= 0.9387668017921912 0.9541744465190718
auc= 0.5293700410607884 0.6379932647261745
#####混淆矩阵#########
[[4382   10]
 [ 277   18]] [[17560     5]
 [  854   326]]


手动调整学习率和迭代次数

In [75]:
xgb1 = XGBClassifier(max_depth=8,
                     learning_rate=0.05,
                     n_estimators=7250,
                     objective='binary:logistic',
                     booster='gbtree',
                     n_jobs=4,
                     gamma=0.1,
                     min_child_weight=0,
                     subsample=0.6,
                     colsample_bytree=1.0,
                     reg_lambda=0.2,
                     seed=27)

In [74]:
#测试得分为
xgb_bst1 = xgb1.fit(x_train, y_train)
y_pred = xgb_bst1.predict(x_test)
y_pred2 = xgb_bst1.predict(x_train)
print("XGBoost结果")
print("balanced_accuracy_score=", balanced_accuracy_score(y_pred=y_pred, y_true=y_test),balanced_accuracy_score(y_pred=y_pred2, y_true=y_train))
print("f1=", f1_score(y_pred=y_pred, y_true=y_test), f1_score(y_pred=y_pred2, y_true=y_train))
print("precision_score=", precision_score(y_pred=y_pred, y_true=y_test),precision_score(y_pred=y_pred2, y_true=y_train))
print("recall_score=", recall_score(y_pred=y_pred, y_true=y_test), recall_score(y_pred=y_pred2, y_true=y_train))
print("accuracy=", accuracy_score(y_pred=y_pred, y_true=y_test), accuracy_score(y_pred=y_pred2, y_true=y_train))
print("auc=", roc_auc_score(y_true=y_test, y_score=y_pred), roc_auc_score(y_true=y_train, y_score=y_pred2))
print("#####混淆矩阵#########")
print(confusion_matrix(y_true=y_test, y_pred=y_pred), confusion_matrix(y_true=y_train, y_pred=y_pred2))

XGBoost结果
balanced_accuracy_score= 0.6615352258343367 0.9814917473596858
f1= 0.4409799554565702 0.9714041826717882
precision_score= 0.6428571428571429 0.9785038693035254
recall_score= 0.33559322033898303 0.964406779661017
accuracy= 0.9464476210795818 0.9964257135236063
auc= 0.6615352258343368 0.9814917473596858
#####混淆矩阵#########
[[4337   55]
 [ 196   99]] [[17540    25]
 [   42  1138]]


In [84]:
xgb1 = XGBClassifier(max_depth=8,
                     learning_rate=0.1,
                     n_estimators=1350,
                     objective='binary:logistic',
                     booster='gbtree',
                     n_jobs=4,
                     gamma=0.1,
                     min_child_weight=0,
                     subsample=0.6,
                     colsample_bytree=1.0,
                     reg_lambda=0.2,
                     seed=27)

In [85]:
#测试得分为
xgb_bst1 = xgb1.fit(x_train, y_train)
y_pred = xgb_bst1.predict(x_test)
y_pred2 = xgb_bst1.predict(x_train)
print("XGBoost结果")
print("balanced_accuracy_score=", balanced_accuracy_score(y_pred=y_pred, y_true=y_test),balanced_accuracy_score(y_pred=y_pred2, y_true=y_train))
print("f1=", f1_score(y_pred=y_pred, y_true=y_test), f1_score(y_pred=y_pred2, y_true=y_train))
print("precision_score=", precision_score(y_pred=y_pred, y_true=y_test),precision_score(y_pred=y_pred2, y_true=y_train))
print("recall_score=", recall_score(y_pred=y_pred, y_true=y_test), recall_score(y_pred=y_pred2, y_true=y_train))
print("accuracy=", accuracy_score(y_pred=y_pred, y_true=y_test), accuracy_score(y_pred=y_pred2, y_true=y_train))
print("auc=", roc_auc_score(y_true=y_test, y_score=y_pred), roc_auc_score(y_true=y_train, y_score=y_pred2))
print("#####混淆矩阵#########")
print(confusion_matrix(y_true=y_test, y_pred=y_pred), confusion_matrix(y_true=y_train, y_pred=y_pred2))

XGBoost结果
balanced_accuracy_score= 0.6463948318977494 0.9838633260480443
f1= 0.4100227790432802 0.9715498938428875
precision_score= 0.625 0.9736170212765958
recall_score= 0.3050847457627119 0.9694915254237289
accuracy= 0.9447407723490505 0.9964257135236063
auc= 0.6463948318977494 0.9838633260480443
#####混淆矩阵#########
[[4338   54]
 [ 205   90]] [[17534    31]
 [   36  1144]]


In [88]:
xgb1 = XGBClassifier(max_depth=8,
                     learning_rate=0.01,
                     n_estimators=14000,
                     objective='binary:logistic',
                     booster='gbtree',
                     n_jobs=4,
                     gamma=0.1,
                     min_child_weight=0,
                     subsample=0.6,
                     colsample_bytree=1.0,
                     reg_lambda=0.2,
                     seed=27)

In [89]:
#测试得分为
xgb_bst1 = xgb1.fit(x_train, y_train)
y_pred = xgb_bst1.predict(x_test)
y_pred2 = xgb_bst1.predict(x_train)
print("XGBoost结果")
print("balanced_accuracy_score=", balanced_accuracy_score(y_pred=y_pred, y_true=y_test),balanced_accuracy_score(y_pred=y_pred2, y_true=y_train))
print("f1=", f1_score(y_pred=y_pred, y_true=y_test), f1_score(y_pred=y_pred2, y_true=y_train))
print("precision_score=", precision_score(y_pred=y_pred, y_true=y_test),precision_score(y_pred=y_pred2, y_true=y_train))
print("recall_score=", recall_score(y_pred=y_pred, y_true=y_test), recall_score(y_pred=y_pred2, y_true=y_train))
print("accuracy=", accuracy_score(y_pred=y_pred, y_true=y_test), accuracy_score(y_pred=y_pred2, y_true=y_train))
print("auc=", roc_auc_score(y_true=y_test, y_score=y_pred), roc_auc_score(y_true=y_train, y_score=y_pred2))
print("#####混淆矩阵#########")
print(confusion_matrix(y_true=y_test, y_pred=y_pred), confusion_matrix(y_true=y_train, y_pred=y_pred2))

XGBoost结果
balanced_accuracy_score= 0.6438019048501127 0.9818870104744122
f1= 0.4093023255813954 0.9714285714285714
precision_score= 0.6518518518518519 0.9776824034334763
recall_score= 0.2983050847457627 0.9652542372881356
accuracy= 0.9458075528056326 0.9964257135236063
auc= 0.6438019048501127 0.9818870104744122
#####混淆矩阵#########
[[4345   47]
 [ 207   88]] [[17539    26]
 [   41  1139]]
