# 这个文件做了什么

- 不同数据集的xgboost测试
    - 不处理缺失值
    - 处理缺失值，不做特征选择
    - 处理缺失值， 在model_2基础上做特征选择，剔除importance为0的变量
    - 处理缺失值， 在model_2基础上做特征选择，剔除importance排名50以后的变量（目前效果最好）
- 根据上面最优的结果，用bayes_opt和xgboost进行调参

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import auc, roc_curve, roc_auc_score
from sklearn.ensemble import GradientBoostingClassifier
from bayes_opt import BayesianOptimization
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LassoCV
import gc
from xgboost.sklearn import XGBClassifier
import xgboost as xgb
import lightgbm as lgb


def data_prepare(num=None):
    train = pd.read_csv('../data/handled/train.csv', nrows=num, index_col=0)
    test = pd.read_csv('../data/handled/test.csv', nrows=num, index_col=0)
    y_train = pd.read_csv('../data/handled/y_train.csv', nrows=num, header=-1, index_col=0)
    return train, test, y_train


def output_result(test_id, test_prob, sid=''):
    result = pd.DataFrame(np.column_stack((test_id, test_prob)))
    result.columns = ['SK_ID_CURR', 'TARGET']
    result['SK_ID_CURR'] = result['SK_ID_CURR'].astype('int')
    result.to_csv('./submission/submission_' + str(sid) + '.csv', header=True, index=False)


def show_importance(model, num=20, height=0.8):
    xgb.plot_importance(model, max_num_features=num, height=height)
    plt.show()


def xgb_k_folder_cv(params, xgtrain, fold=5, seed=918):
    cv = xgb.cv(params, xgtrain, metrics='auc', early_stopping_rounds=50,
                nfold=fold, seed=seed)
    return cv   # ['test-auc-mean'].values[-1]


def xgb_evaluate(params,
                 xgtrain,
                 #   以下需要再次调用匿名函数封装
                 eta,
                 min_child_weight,
                 cosample_bytree,
                 max_depth,
                 subsample,
                 gamma,
                 alpha):
    params['eta'] = max(eta, 0)
    params['min_child_weight'] = int(min_child_weight)
    params['cosample_bytree'] = max(min(cosample_bytree, 1), 0)
    params['max_depth'] = int(max_depth)
    params['subsample'] = max(min(subsample, 1), 0)
    params['min_child_weight'] = int(min_child_weight)
    params['gamma'] = max(gamma, 0)
    params['alpha'] = max(alpha, 0)

    cv = xgb.cv(params, xgtrain, metrics='auc', early_stopping_rounds=50,
                nfold=5, seed=918)
    return cv['test-auc-mean'].values[-1]


def xgb_no_feature_select(train: pd.DataFrame, test: pd.DataFrame, y_train, cv=False):
    params = {
        'silent': 1,
        'nthread': 4,
        'eval_metric': 'auc',
        'verbose_eval': True,
        'seed': 918,
        'alpha': 9.6523,
        'cosample_bytree': 0.9604,
        'eta': 0.1171,
        'gamma': 0.179,
        'max_depth': 7,
        'min_child_weight': 13,
        'subsample': 0.9609
    }
    xgtrain = xgb.DMatrix(train, label=y_train)
    if cv:
        cv_res = xgb_k_folder_cv(params, xgtrain)
        print(cv_res)
    model = XGBClassifier(**params)
    model.fit(train, y_train)
    y_predict = model.predict_proba(test)
    return model, y_predict


def get_select_ids(importance, top_num=None):
    if top_num:
        threshold = np.sort(importance)[-top_num-1]
    else:
        threshold = 0
    select_id = [True if i > threshold else False for i in importance]
    return select_id

def xgb_feature_select(train, test, y_train, importance, top_num=None, cv=False):
    select_id = get_select_ids(importance, top_num)
    train = train.loc[:, select_id]
    test = test.loc[:, select_id]
    params = {
        'silent': 1,
        'nthread': 4,
        'eval_metric': 'auc',
        'verbose_eval': True,
        'seed': 918,
        'alpha': 9.6523,
        'cosample_bytree': 0.9604,
        'eta': 0.1171,
        'gamma': 0.179,
        'max_depth': 7,
        'min_child_weight': 13,
        'subsample': 0.9609
    }
    xgtrain = xgb.DMatrix(train, label=y_train)
    xgtest = xgb.DMatrix(test)
    if cv:
        cv_res = xgb_k_folder_cv(params, xgtrain)
        print(cv_res)
    model = XGBClassifier(**params)
    model.fit(train, y_train)
    y_predict = model.predict_proba(test)
    return model, y_predict


def xgb_bayes_opt(train, y_train):
    # num_rounds = 3000
    random_state = 918
    num_iter = 25
    init_points = 5
    params = {
        'silent': 1,
        'nthread': 4,
        'eval_metric': 'auc',
        'verbose_eval': True,
        'seed': random_state,
    }
    xgtrian = xgb.DMatrix(train, label=y_train)
    _xgb_evaluate = lambda eta, min_child_weight, cosample_bytree, max_depth, subsample, gamma, alpha: xgb_evaluate(params, 
                                                                                                                    xgtrian, eta, min_child_weight, cosample_bytree, max_depth, subsample, gamma, alpha)
    xgbBO = BayesianOptimization(_xgb_evaluate, {
        'eta': (0.1, 0.5),
        'min_child_weight': (1, 20),
        'cosample_bytree': (0.1, 1),
        'max_depth': (5, 15),
        'subsample': (0.5, 1),
        'gamma': (0, 10),
        'alpha': (0, 10)
    })
    xgbBO.maximize(init_points=init_points, n_iter=num_iter)
    return xgbBO


def xgb_unbalance_handle(train, test):
    pass


def models_stack(trian, test):
    pass


def other_feature_select_method():
    pass


def xgb_other_params_adj():
    pass

In [3]:
df_train, df_test, y_train  = data_prepare()

In [4]:
im = Imputer()
train = im.fit_transform(df_train.values)
test = im.transform(df_test.values)

In [12]:
# 不处理缺失值
model_1, predict_1 = xgb_no_feature_select(df_train, df_test, y_train, cv=True)

   train-auc-mean  train-auc-std  test-auc-mean  test-auc-std
0        0.707869       0.001570       0.701674      0.005074
1        0.717715       0.002910       0.709581      0.005716
2        0.724155       0.001126       0.714875      0.006228
3        0.729238       0.001824       0.718391      0.005958
4        0.735447       0.001496       0.722992      0.006305
5        0.740085       0.001239       0.726137      0.006038
6        0.743385       0.000927       0.728394      0.006291
7        0.747936       0.001394       0.732213      0.005573
8        0.751383       0.001561       0.734831      0.005485
9        0.754769       0.001466       0.736970      0.005613


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [5]:
# 处理缺失值，不做特征选择
model_2, predict_2 = xgb_no_feature_select(train, test, y_train, cv=False)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [28]:
# 处理缺失值， 在model_2基础上做特征选择，剔除importance为0的变量
model_3, predict_3 = xgb_feature_select(pd.DataFrame(train), pd.DataFrame(test), y_train, model_2.feature_importances_ cv=True)

   train-auc-mean  train-auc-std  test-auc-mean  test-auc-std
0        0.705940       0.001702       0.700791      0.004070
1        0.715977       0.001601       0.708964      0.005218
2        0.721559       0.001921       0.713096      0.004604
3        0.729107       0.003050       0.719017      0.004984
4        0.734947       0.001923       0.723125      0.005808
5        0.738053       0.001160       0.725641      0.005626
6        0.741941       0.001742       0.728415      0.005118
7        0.746078       0.001233       0.731368      0.005458
8        0.749308       0.000856       0.733661      0.005486
9        0.752145       0.000717       0.735624      0.005890


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [6]:
# 处理缺失值， 在model_2基础上做特征选择，剔除importance排名50以后的变量（目前效果最好）
model_4, predict_4 = xgb_feature_select(pd.DataFrame(train), pd.DataFrame(test), y_train, model_2.feature_importances_, top_num=50, cv=True)

   train-auc-mean  train-auc-std  test-auc-mean  test-auc-std
0        0.706042       0.001785       0.701831      0.004747
1        0.714816       0.001182       0.709383      0.006181
2        0.721308       0.002198       0.714526      0.005455
3        0.727664       0.003177       0.719543      0.004026
4        0.732737       0.001711       0.723735      0.005447
5        0.737351       0.001959       0.727239      0.005994
6        0.741039       0.001453       0.730273      0.005834
7        0.744264       0.001350       0.732804      0.005809
8        0.747553       0.001487       0.735507      0.005466
9        0.750888       0.001121       0.737955      0.006092


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [None]:
# 处理缺失值，用lasso进行特征选择
# clf = LassoCV()
# sfm = SelectFromModel(LogisticRegression(penalty='l1', C=0.2))
# sfm.fit(train, y_train)


# train2 = sfm.transform(train)
# test2 = sfm.transform(test)

# train2.shape

# model_5, predict_5 = xgb_no_feature_select(train2, test2, y_train, cv=True)

  y = column_or_1d(y, warn=True)


# 选择前50个特征进行优化

In [6]:
select_id = get_select_ids( model_2.feature_importances_, top_num=50)


In [8]:
xgb_bayes_opt(train[:, select_id], y_train)

[31mInitialization[0m
[94m--------------------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |     alpha |   cosample_bytree |       eta |     gamma |   max_depth |   min_child_weight |   subsample | 
    1 | 00m47s | [35m   0.70602[0m | [32m   3.0266[0m | [32m           0.9476[0m | [32m   0.4695[0m | [32m   0.1433[0m | [32m    14.3798[0m | [32m            5.9790[0m | [32m     0.7988[0m | 
    2 | 00m33s | [35m   0.74914[0m | [32m   8.4436[0m | [32m           0.9429[0m | [32m   0.2094[0m | [32m   2.5442[0m | [32m    11.9146[0m | [32m            1.4303[0m | [32m     0.9731[0m | 
    3 | 00m39s | [35m   0.75593[0m | [32m   2.5162[0m | [32m           0.3097[0m | [32m   0.4260[0m | [32m   2.5674[0m | [32m    12.2332[0m | [32m           14.4849[0m | [32m     0.9831[0m | 
    4 | 00m29s | [35m   0.75621[0m | [32m   9.3030[0m | [32m       

  " state: %s" % convergence_dict)


   15 | 00m31s |    0.72737 |    8.4032 |            0.2459 |    0.1131 |    7.5971 |      8.1175 |            19.9348 |      0.9538 | 
   16 | 00m46s |    0.75168 |    9.6840 |            0.1720 |    0.3426 |    0.6509 |     14.9173 |            17.2413 |      0.5072 | 
   17 | 00m50s |    0.70919 |    1.4669 |            0.1000 |    0.5000 |    0.0000 |     13.5030 |            20.0000 |      1.0000 | 
   18 | 00m23s |    0.74843 |    0.1304 |            0.9735 |    0.4493 |    4.4563 |      5.4239 |            11.1289 |      0.5334 | 
   19 | 00m46s |    0.74831 |    9.9399 |            0.1212 |    0.4802 |    4.3441 |     13.0097 |             9.4828 |      0.5837 | 
   20 | 00m34s |    0.74415 |    4.1692 |            0.1000 |    0.5000 |    5.5960 |      9.0141 |             1.0000 |      0.5000 | 
   21 | 00m22s |    0.75152 |    8.0354 |            0.9687 |    0.4850 |    1.6274 |      5.0049 |             1.0000 |      0.5118 | 


  " state: %s" % convergence_dict)


   22 | 00m35s | [35m   0.75708[0m | [32m   7.4560[0m | [32m           1.0000[0m | [32m   0.4145[0m | [32m   1.3210[0m | [32m     9.6588[0m | [32m           14.9833[0m | [32m     0.9887[0m | 


  " state: %s" % convergence_dict)


   23 | 00m33s |    0.73800 |    0.0442 |            0.9040 |    0.1350 |    5.6013 |      9.8169 |            16.3595 |      0.8644 | 
   24 | 00m25s |    0.75547 |    4.5700 |            0.1800 |    0.4824 |    2.1971 |      6.1241 |            16.4240 |      0.9958 | 
   25 | 00m31s |    0.75537 |    9.8227 |            0.1630 |    0.4410 |    0.6713 |      8.9389 |             1.2392 |      0.7224 | 
   26 | 00m49s |    0.75231 |    5.7913 |            0.1032 |    0.4838 |    5.0950 |     13.1176 |            15.2090 |      0.9951 | 
   27 | 00m25s |    0.75423 |    0.0340 |            0.2073 |    0.4384 |    0.1598 |      6.6958 |            12.0528 |      0.9744 | 
   28 | 00m24s |    0.75484 |    9.3045 |            0.6895 |    0.4498 |    0.0445 |      5.5405 |            19.9142 |      0.7695 | 
   29 | 00m23s |    0.74738 |    0.0000 |            0.1000 |    0.5000 |   10.0000 |      5.0000 |            13.3978 |      0.8983 | 
   30 | 00m36s |    0.74588 |    4.1113 |       

<bayes_opt.bayesian_optimization.BayesianOptimization at 0x7fae64235748>

In [12]:
new_params = {
        'silent': 1,
        'nthread': 4,
        'eval_metric': 'auc',
        'verbose_eval': True,
        'seed': 918,
        'alpha': 7.4560,
        'cosample_bytree': 1,
        'eta': 0.4145,
        'gamma': 1.3210,
        'max_depth': 10,
        'min_child_weight': 14.9833,
        'subsample': 0.9887 
}

xgtrain = xgb.DMatrix(train[:, select_id], label=y_train)
cv_res = xgb_k_folder_cv(new_params, xgtrain)
print(cv_res)

   train-auc-mean  train-auc-std  test-auc-mean  test-auc-std
0        0.710163       0.001028       0.704493      0.005373
1        0.731842       0.001143       0.722752      0.005548
2        0.745533       0.001212       0.733423      0.005197
3        0.755561       0.001319       0.740539      0.004901
4        0.762862       0.000590       0.745473      0.005074
5        0.768425       0.001029       0.749519      0.004855
6        0.772858       0.001076       0.752282      0.004501
7        0.775894       0.000991       0.754436      0.004672
8        0.778441       0.000856       0.756022      0.004437
9        0.780419       0.000932       0.757220      0.004167


In [7]:
idxs = np.arange(df_train.shape[0])
np.random.seed(918)
np.random.shuffle(idxs)

idx_list = []
num = df_train.shape[0] // 11
for i in range(11):
    if i != 0:
        idx_list.append(idxs[i * num:(i+1) * num])
    else:
        idx_list.append(idxs[10 * num: ])

length = 0
for i in idx_list:
    length += len(i)
length

307511

## 使用 LGB并且分11份进行融合

对照表：https://blog.csdn.net/weiyongle1996/article/details/78446244/

In [12]:
params_lgb = {
    'nthread': 4,
    #is_unbalance=True,
    'n_estimators' : 10000,
    'learning_rate' : 0.4145,
    #'num_leaves' : 32,
    'colsample_bytree' : 1.0,
    'subsample' : 0.9887,
    'max_depth' : 10,
    'reg_alpha' : 7.4560,
    'reg_lambda' : 1,
    'min_split_gain' : 1.3210,
    'min_child_weight' : 14.9833,
    'metric': 'auc',
    'silent': -1,
    'verbose': -1,
    #scale_pos_weight=11
}
cls_list = [lgb.LGBMClassifier(**params_lgb) for i in range(11)]
for i, idx in enumerate(idx_list):
    new_train = train[:, select_id][idx, :]
    new_y_train = y_train.values[idx]
    cls_list[i].fit(new_train, new_y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [13]:
predict_list = []
for cls in cls_list:
    predict_list.append(cls.predict_proba(test[:, select_id])[:, 1])

In [14]:
res = np.mean(np.array(predict_list), axis=0)

output_result(df_test.index, res, sid='11models_mean_lgb_feature_selected')  # 0.761

In [17]:
select_id_2 = get_select_ids( model_2.feature_importances_)


In [32]:
xgb_bayes_opt(train, y_train)

[31mInitialization[0m
[94m--------------------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |     alpha |   cosample_bytree |       eta |     gamma |   max_depth |   min_child_weight |   subsample | 
    1 | 04m25s | [35m   0.75149[0m | [32m   4.3189[0m | [32m           0.8706[0m | [32m   0.4793[0m | [32m   3.4073[0m | [32m    11.8852[0m | [32m            1.2523[0m | [32m     0.5476[0m | 
    2 | 04m38s |    0.73406 |    2.2241 |            0.3628 |    0.1189 |    5.3189 |     12.5851 |            13.7512 |      0.7440 | 
    3 | 02m29s |    0.74166 |    9.9938 |            0.6321 |    0.4288 |    9.7143 |      5.3945 |             8.4153 |      0.6080 | 
    4 | 04m16s | [35m   0.75376[0m | [32m   9.3430[0m | [32m           0.8629[0m | [32m   0.4114[0m | [32m   3.6532[0m | [32m    11.1689[0m | [32m            3.7081[0m | [32m     0.8137[0m | 
    5 | 

  " state: %s" % convergence_dict)


   11 | 02m25s |    0.73802 |    9.2431 |            0.1012 |    0.1635 |    2.3191 |      5.0247 |            19.7658 |      0.9331 | 
   12 | 02m21s |    0.74995 |    0.0000 |            0.1000 |    0.5000 |    7.4377 |      5.0000 |             2.6587 |      1.0000 | 
   13 | 05m27s |    0.72226 |    7.5087 |            0.2006 |    0.4711 |    0.1676 |     14.6824 |             2.3424 |      0.8735 | 
   14 | 05m09s |    0.74788 |    9.8905 |            0.8040 |    0.4246 |    9.9410 |     14.3655 |            19.6972 |      0.9432 | 
   15 | 02m56s |    0.74894 |    6.7998 |            0.8695 |    0.4531 |    9.9907 |      8.2168 |             1.6439 |      0.9919 | 
   16 | 02m08s | [35m   0.75509[0m | [32m   4.3554[0m | [32m           0.9234[0m | [32m   0.4817[0m | [32m   2.8024[0m | [32m     5.1707[0m | [32m           10.9578[0m | [32m     0.9606[0m | 
   17 | 02m04s |    0.73115 |    0.0000 |            1.0000 |    0.1000 |    0.1518 |      5.0000 |             

  " state: %s" % convergence_dict)


   28 | 03m10s | [35m   0.75665[0m | [32m   5.1538[0m | [32m           0.9957[0m | [32m   0.4950[0m | [32m   1.8564[0m | [32m     7.5292[0m | [32m           19.8110[0m | [32m     0.9914[0m | 
   29 | 04m31s |    0.75145 |    6.5462 |            0.9962 |    0.4741 |    6.3148 |     11.8320 |             1.1249 |      0.9381 | 
   30 | 03m48s |    0.73734 |    1.2879 |            0.9490 |    0.4964 |    0.2787 |      8.1213 |             9.5916 |      0.5919 | 


<bayes_opt.bayesian_optimization.BayesianOptimization at 0x7f5f21bd1828>

# 朴素贝叶斯测试

In [27]:

from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
cross_val_score(gnb, X=train, y=y_train.values.ravel(), cv=10)

array([0.89421826, 0.89272242, 0.89239724, 0.89935614, 0.89652706,
       0.89840981, 0.89512195, 0.89661789, 0.89843902, 0.89899187])

In [28]:
gnb.fit(train, y_train.values.ravel())

GaussianNB(priors=None)

In [30]:
y_predict = gnb.predict_proba(test)[:, 1]

In [31]:
output_result(df_test.index, y_predict, sid='naive_bayes') 