# 这个文件做了什么

- XGBoost 11模型处理数据不平衡（为啥是11，因为正负样本比是1:11）
- LightGBM 11模型，获得最优结果0.772

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import auc, roc_curve, roc_auc_score
from sklearn.ensemble import GradientBoostingClassifier
from bayes_opt import BayesianOptimization
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LassoCV
import gc
from xgboost.sklearn import XGBClassifier
import xgboost as xgb
import lightgbm as lgb


def data_prepare(num=None):
    train = pd.read_csv('../data/handled/train.csv', nrows=num, index_col=0)
    test = pd.read_csv('../data/handled/test.csv', nrows=num, index_col=0)
    y_train = pd.read_csv('../data/handled/y_train.csv', nrows=num, header=-1, index_col=0)
    return train, test, y_train


def output_result(test_id, test_prob, sid=''):
    result = pd.DataFrame(np.column_stack((test_id, test_prob)))
    result.columns = ['SK_ID_CURR', 'TARGET']
    result['SK_ID_CURR'] = result['SK_ID_CURR'].astype('int')
    result.to_csv('./submission/submission_' + str(sid) + '.csv', header=True, index=False)


def show_importance(model, num=20, height=0.8):
    xgb.plot_importance(model, max_num_features=num, height=height)
    plt.show()


def xgb_k_folder_cv(params, xgtrain, fold=5, seed=918):
    cv = xgb.cv(params, xgtrain, metrics='auc', early_stopping_rounds=50,
                nfold=fold, seed=seed)
    return cv   # ['test-auc-mean'].values[-1]


def xgb_evaluate(params,
                 xgtrain,
                 #   以下需要再次调用匿名函数封装
                 eta,
                 min_child_weight,
                 cosample_bytree,
                 max_depth,
                 subsample,
                 gamma,
                 alpha):
    params['eta'] = max(eta, 0)
    params['min_child_weight'] = int(min_child_weight)
    params['cosample_bytree'] = max(min(cosample_bytree, 1), 0)
    params['max_depth'] = int(max_depth)
    params['subsample'] = max(min(subsample, 1), 0)
    params['min_child_weight'] = int(min_child_weight)
    params['gamma'] = max(gamma, 0)
    params['alpha'] = max(alpha, 0)

    cv = xgb.cv(params, xgtrain, metrics='auc', early_stopping_rounds=50,
                nfold=5, seed=918)
    return cv['test-auc-mean'].values[-1]


def xgb_no_feature_select(train: pd.DataFrame, test: pd.DataFrame, y_train, cv=False):
    params = {
        'silent': 1,
        'nthread': 4,
        'eval_metric': 'auc',
        'verbose_eval': True,
        'seed': 918,
        'alpha': 9.6523,
        'cosample_bytree': 0.9604,
        'eta': 0.1171,
        'gamma': 0.179,
        'max_depth': 7,
        'min_child_weight': 13,
        'subsample': 0.9609
    }
    xgtrain = xgb.DMatrix(train, label=y_train)
    if cv:
        cv_res = xgb_k_folder_cv(params, xgtrain)
        print(cv_res)
    model = XGBClassifier(**params)
    model.fit(train, y_train)
    y_predict = model.predict_proba(test)
    return model, y_predict


def xgb_feature_select(train, test, y_train, importance, top_num=None, cv=False):
    if top_num:
        threshold = np.sort(importance)[-top_num-1]
    else:
        threshold = 0
    select_id = [True if i > threshold else False for i in importance]
    train = train.loc[:, select_id]
    test = test.loc[:, select_id]
    params = {
        'silent': 1,
        'nthread': 4,
        'eval_metric': 'auc',
        'verbose_eval': True,
        'seed': 918,
        'alpha': 9.6523,
        'cosample_bytree': 0.9604,
        'eta': 0.1171,
        'gamma': 0.179,
        'max_depth': 7,
        'min_child_weight': 13,
        'subsample': 0.9609
    }
    xgtrain = xgb.DMatrix(train, label=y_train)
    xgtest = xgb.DMatrix(test)
    if cv:
        cv_res = xgb_k_folder_cv(params, xgtrain)
        print(cv_res)
    model = XGBClassifier(**params)
    model.fit(train, y_train)
    y_predict = model.predict_proba(test)
    return model, y_predict


def xgb_bayes_opt(train, y_train):
    # num_rounds = 3000
    random_state = 918
    num_iter = 25
    init_points = 5
    params = {
        'silent': 1,
        'nthread': 4,
        'eval_metric': 'auc',
        'verbose_eval': True,
        'seed': random_state,
    }
    xgtrian = xgb.DMatrix(train, label=y_train)
    _xgb_evaluate = lambda a, b, c, d, e, f, g: xgb_evaluate(params, xgtrian, a, b, c, d, e, f, g)
    xgbBO = BayesianOptimization(_xgb_evaluate, {
        'eta': (0.1, 0.5),
        'min_child_weight': (1, 20),
        'cosample_bytree': (0.1, 1),
        'max_depth': (5, 15),
        'subsample': (0.5, 1),
        'gamma': (0, 10),
        'alpha': (0, 10)
    })
    xgbBO.maximize(init_points=init_points, n_iter=num_iter)
    return xgbBO


def xgb_unbalance_handle(train, test):
    pass


def models_stack(trian, test):
    pass




In [2]:
df_train, df_test, y_train  = data_prepare()

In [3]:
im = Imputer()
train = im.fit_transform(df_train.values)
test = im.transform(df_test.values)

In [4]:
idxs = np.arange(df_train.shape[0])
np.random.seed(918)
np.random.shuffle(idxs)

idx_list = []
num = df_train.shape[0] // 11
for i in range(11):
    if i != 0:
        idx_list.append(idxs[i * num:(i+1) * num])
    else:
        idx_list.append(idxs[10 * num: ])

length = 0
for i in idx_list:
    length += len(i)
length

307511

# XGboost

In [32]:
params = {
        'silent': 1,
        'nthread': 4,
        'eval_metric': 'auc',
        'verbose_eval': True,
        'seed': 918,
        'alpha': 9.6523,
        'cosample_bytree': 0.9604,
        'eta': 0.1171,
        'gamma': 0.179,
        'max_depth': 7,
        'min_child_weight': 13,
        'subsample': 0.9609
    }

In [33]:
cls_list = [XGBClassifier(**params) for i in range(11)]

In [37]:
for i, idx in enumerate(idx_list):
    new_train = train[idx, :]
    new_y_train = y_train.values[idx]
    cls_list[i].fit(new_train, new_y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [38]:
predict_list = []
for cls in cls_list:
    predict_list.append(cls.predict_proba(test)[:, 1])


In [46]:
res = np.mean(np.array(predict_list), axis=0)

In [48]:
output_result(df_test.index, res, sid='_11models_mean')  # 0.769

# LightGBM

In [21]:
## 0.772的参数
# params_lgb = {
#     'nthread': 4,
#     #is_unbalance=True,
#     'n_estimators' : 10000,
#     'learning_rate' : 0.1171,
#     #'num_leaves' : 32,
#     'colsample_bytree' : 0.9604,
#     'subsample' : 0.9609,
#     'max_depth' : 7,
#     'reg_alpha' : 9.6523,
#     'reg_lambda' : 1,
#     'min_split_gain' : 0.179,
#     'min_child_weight' : 13,
#     'metric': 'auc',
#     'silent': -1,
#     'verbose': -1,
#     #scale_pos_weight=11
# }

# 测试参数
params_lgb = {
    'nthread': 4,
    'learning_rate' : 0.1,
    'n_estimators' : 10000,
    'max_depth' : 14,
    'min_child_weight' : 9,
    'num_leaves': 50,
    'colsample_bytree' : 0.9604,
    'subsample' : 0.9609,
    'reg_alpha' : 1,
    'reg_lambda' : 1,
    'min_split_gain': 0.179,
    'metric': 'auc', 
}

In [14]:
# clf = lgb.LGBMClassifier(**params_lgb)
# clf.fit(train, y_train.values.ravel())

In [15]:
# with open('./importance.txt', 'w') as f:
#     f.write(",".join([str(i) for i in clf.feature_importances_]))

In [20]:
with open('./importance.txt', 'r') as f:
    imp = f.readline()
imp = np.array([float(i) for i in imp.split(",")])
Counter(imp!=0)

Counter({True: 494, False: 235})

In [20]:
xgtrain = lgb.Dataset(train, label=y_train.values.ravel())
lgb.cv(params_lgb, xgtrain, 10, nfold=5, metrics='auc', early_stopping_rounds=10,
       # This is what I added
        stratified=False)


#model = lgb.LGBMClassifier(**params)
#model.fit(train, y_train)
#y_predict = model.predict_proba(test)
#return model, y_predict



{'auc-mean': [0.7126279264511636,
  0.7197547605101197,
  0.7247751276220311,
  0.7279719631060804,
  0.7306819781017511,
  0.7328141498283063,
  0.7349241747183818,
  0.7373736612831365,
  0.7391132953490921,
  0.7410615901503937,
  0.7425229933110855,
  0.7440041700798464,
  0.7455445239450417,
  0.7467603001257452,
  0.7481734541327923,
  0.7492153812907109,
  0.7502546190981512,
  0.7515502058590728,
  0.7528323601262417,
  0.7537306188279674,
  0.7549260717300712,
  0.7560719958336332,
  0.7568125000569209,
  0.7578306432866746,
  0.7587043063756489,
  0.7595713020567706,
  0.7604552977590439,
  0.7610956963967888,
  0.7617481502384162,
  0.7622708471671141,
  0.7628948955516244,
  0.7635365755882105,
  0.7639369505255551,
  0.7644087974557694,
  0.7647509586025716,
  0.7650497692493698,
  0.7655410058266116,
  0.7657865157869506,
  0.76614172639534,
  0.7665247165544494,
  0.7668657935940788,
  0.7671633095376243,
  0.7674062875940078,
  0.7676167898025055,
  0.7678919445824154,


In [22]:
cls_list = [lgb.LGBMClassifier(**params_lgb) for i in range(11)]

In [23]:
for i, idx in enumerate(idx_list):
    new_train = train[idx, :]
    new_y_train = y_train.values.ravel()[idx]
    cls_list[i].fit(new_train, new_y_train)

KeyboardInterrupt: 

In [None]:
predict_list = []
for cls in cls_list:
    predict_list.append(cls.predict_proba(test)[:, 1])


In [None]:
res = np.mean(np.array(predict_list), axis=0)

output_result(df_test.index, res, sid='11models_mean_lgb_gridsearch_lhy')  # 0.772 11models_mean_lgb

In [29]:
params_lgb = {
    'nthread': 4,
    #is_unbalance=True,
    'n_estimators' : 100,
    'learning_rate' : 0.1171,
    'num_leaves' : 100,
    'colsample_bytree' : 0.9604,
    'subsample' : 0.9609,
    'max_depth' : 7,
    'reg_alpha' : 9.6523,
    'reg_lambda' : 1,
    'min_split_gain' : 0.179,
    'min_child_weight' : 13,
    'metric': 'auc',
    #'silent': -1,
    'verbose': -1,
    #scale_pos_weight=11
}


In [30]:
cls_list = [lgb.LGBMClassifier(**params_lgb) for i in range(11)]

In [31]:
for i, idx in enumerate(idx_list):
    new_train = train[idx, :]
    new_y_train = y_train.values.ravel()[idx]
    cls_list[i].fit(new_train, new_y_train)

In [32]:
predict_list = []
for cls in cls_list:
    predict_list.append(cls.predict_proba(test)[:, 1])


In [33]:
res = np.mean(np.array(predict_list), axis=0)

output_result(df_test.index, res, sid='11models_mean_lgb_change_leaf_num_100')  # 0.772

# 测试 RobusxtScaler

In [41]:
from sklearn.preprocessing import RobustScaler

In [45]:
y_train = y_train.values.ravel()

In [49]:
xgb_no_feature_select(train, test, y_train, cv=True)

   train-auc-mean  train-auc-std  test-auc-mean  test-auc-std
0        0.705973       0.001680       0.700744      0.004082
1        0.714776       0.002003       0.708041      0.005586
2        0.721093       0.001433       0.712055      0.005532
3        0.729924       0.001927       0.719409      0.005051
4        0.733852       0.002252       0.722567      0.004702
5        0.738533       0.001697       0.726365      0.004990
6        0.742360       0.001394       0.729371      0.005238
7        0.745623       0.001581       0.731764      0.005240
8        0.749460       0.001838       0.734793      0.005099
9        0.752700       0.001473       0.737192      0.005509


(XGBClassifier(alpha=9.6523, base_score=0.5, booster='gbtree',
        colsample_bylevel=1, colsample_bytree=1, cosample_bytree=0.9604,
        eta=0.1171, eval_metric='auc', gamma=0.179, learning_rate=0.1,
        max_delta_step=0, max_depth=7, min_child_weight=13, missing=None,
        n_estimators=100, n_jobs=1, nthread=4, objective='binary:logistic',
        random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
        seed=918, silent=1, subsample=0.9609, verbose_eval=True),
 array([[0.9400627 , 0.05993729],
        [0.8798291 , 0.12017091],
        [0.9652743 , 0.03472573],
        ...,
        [0.9862132 , 0.01378676],
        [0.9527332 , 0.04726679],
        [0.8011434 , 0.19885659]], dtype=float32))

In [46]:
rs = RobustScaler()

In [48]:
train2 = rs.fit_transform(train, y_train)
test2 = rs.transform(test)

In [50]:
xgb_no_feature_select(train2, test2, y_train, cv=True)

   train-auc-mean  train-auc-std  test-auc-mean  test-auc-std
0        0.705973       0.001680       0.700744      0.004082
1        0.714776       0.002003       0.708041      0.005586
2        0.721093       0.001433       0.712055      0.005532
3        0.729924       0.001927       0.719408      0.005051
4        0.733852       0.002252       0.722566      0.004702
5        0.738533       0.001697       0.726364      0.004990
6        0.742360       0.001394       0.729370      0.005237
7        0.745623       0.001581       0.731763      0.005240
8        0.749460       0.001838       0.734792      0.005098
9        0.752701       0.001473       0.737192      0.005509


(XGBClassifier(alpha=9.6523, base_score=0.5, booster='gbtree',
        colsample_bylevel=1, colsample_bytree=1, cosample_bytree=0.9604,
        eta=0.1171, eval_metric='auc', gamma=0.179, learning_rate=0.1,
        max_delta_step=0, max_depth=7, min_child_weight=13, missing=None,
        n_estimators=100, n_jobs=1, nthread=4, objective='binary:logistic',
        random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
        seed=918, silent=1, subsample=0.9609, verbose_eval=True),
 array([[0.9540579 , 0.04594211],
        [0.88835865, 0.11164133],
        [0.96245784, 0.03754215],
        ...,
        [0.9888067 , 0.01119326],
        [0.96069896, 0.03930106],
        [0.79613644, 0.20386358]], dtype=float32))