# 这个文件做了什么

- XGBoost 11模型处理数据不平衡（为啥是11，因为正负样本比是1:11）
- LightGBM 11模型，获得最优结果0.772

In [42]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import auc, roc_curve, roc_auc_score
from sklearn.ensemble import GradientBoostingClassifier
from bayes_opt import BayesianOptimization
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LassoCV
import gc
from xgboost.sklearn import XGBClassifier
import xgboost as xgb
import lightgbm as lgb


def data_prepare(num=None):
    train = pd.read_csv('../data/handled/train.csv', nrows=num, index_col=0)
    test = pd.read_csv('../data/handled/test.csv', nrows=num, index_col=0)
    y_train = pd.read_csv('../data/handled/y_train.csv', nrows=num, header=-1, index_col=0)
    return train, test, y_train


def output_result(test_id, test_prob, sid=''):
    result = pd.DataFrame(np.column_stack((test_id, test_prob)))
    result.columns = ['SK_ID_CURR', 'TARGET']
    result['SK_ID_CURR'] = result['SK_ID_CURR'].astype('int')
    result.to_csv('./submission/submission_' + str(sid) + '.csv', header=True, index=False)


def show_importance(model, num=20, height=0.8):
    xgb.plot_importance(model, max_num_features=num, height=height)
    plt.show()


def xgb_k_folder_cv(params, xgtrain, fold=5, seed=918):
    cv = xgb.cv(params, xgtrain, metrics='auc', early_stopping_rounds=50,
                nfold=fold, seed=seed)
    return cv   # ['test-auc-mean'].values[-1]


def xgb_evaluate(params,
                 xgtrain,
                 #   以下需要再次调用匿名函数封装
                 eta,
                 min_child_weight,
                 cosample_bytree,
                 max_depth,
                 subsample,
                 gamma,
                 alpha):
    params['eta'] = max(eta, 0)
    params['min_child_weight'] = int(min_child_weight)
    params['cosample_bytree'] = max(min(cosample_bytree, 1), 0)
    params['max_depth'] = int(max_depth)
    params['subsample'] = max(min(subsample, 1), 0)
    params['min_child_weight'] = int(min_child_weight)
    params['gamma'] = max(gamma, 0)
    params['alpha'] = max(alpha, 0)

    cv = xgb.cv(params, xgtrain, metrics='auc', early_stopping_rounds=50,
                nfold=5, seed=918)
    return cv['test-auc-mean'].values[-1]


def xgb_no_feature_select(train: pd.DataFrame, test: pd.DataFrame, y_train, cv=False):
    params = {
        'silent': 1,
        'nthread': 4,
        'eval_metric': 'auc',
        'verbose_eval': True,
        'seed': 918,
        'alpha': 9.6523,
        'cosample_bytree': 0.9604,
        'eta': 0.1171,
        'gamma': 0.179,
        'max_depth': 7,
        'min_child_weight': 13,
        'subsample': 0.9609
    }
    xgtrain = xgb.DMatrix(train, label=y_train)
    if cv:
        cv_res = xgb_k_folder_cv(params, xgtrain)
        print(cv_res)
    model = XGBClassifier(**params)
    model.fit(train, y_train)
    y_predict = model.predict_proba(test)
    return model, y_predict


def xgb_feature_select(train, test, y_train, importance, top_num=None, cv=False):
    if top_num:
        threshold = np.sort(importance)[-top_num-1]
    else:
        threshold = 0
    select_id = [True if i > threshold else False for i in importance]
    train = train.loc[:, select_id]
    test = test.loc[:, select_id]
    params = {
        'silent': 1,
        'nthread': 4,
        'eval_metric': 'auc',
        'verbose_eval': True,
        'seed': 918,
        'alpha': 9.6523,
        'cosample_bytree': 0.9604,
        'eta': 0.1171,
        'gamma': 0.179,
        'max_depth': 7,
        'min_child_weight': 13,
        'subsample': 0.9609
    }
    xgtrain = xgb.DMatrix(train, label=y_train)
    xgtest = xgb.DMatrix(test)
    if cv:
        cv_res = xgb_k_folder_cv(params, xgtrain)
        print(cv_res)
    model = XGBClassifier(**params)
    model.fit(train, y_train)
    y_predict = model.predict_proba(test)
    return model, y_predict


def xgb_bayes_opt(train, y_train):
    # num_rounds = 3000
    random_state = 918
    num_iter = 25
    init_points = 5
    params = {
        'silent': 1,
        'nthread': 4,
        'eval_metric': 'auc',
        'verbose_eval': True,
        'seed': random_state,
    }
    xgtrian = xgb.DMatrix(train, label=y_train)
    _xgb_evaluate = lambda a, b, c, d, e, f, g: xgb_evaluate(params, xgtrian, a, b, c, d, e, f, g)
    xgbBO = BayesianOptimization(_xgb_evaluate, {
        'eta': (0.1, 0.5),
        'min_child_weight': (1, 20),
        'cosample_bytree': (0.1, 1),
        'max_depth': (5, 15),
        'subsample': (0.5, 1),
        'gamma': (0, 10),
        'alpha': (0, 10)
    })
    xgbBO.maximize(init_points=init_points, n_iter=num_iter)
    return xgbBO


def xgb_unbalance_handle(train, test):
    pass


def models_stack(trian, test):
    pass


In [43]:
df_train, df_test, y_train  = data_prepare()

In [44]:
im = Imputer()
train = im.fit_transform(df_train.values)
test = im.transform(df_test.values)

In [33]:
idxs = np.arange(df_train.shape[0])
np.random.seed(918)
np.random.shuffle(idxs)

idx_list = []
num = df_train.shape[0] // 11
for i in range(11):
    if i != 0:
        idx_list.append(idxs[i * num:(i+1) * num])
    else:
        idx_list.append(idxs[10 * num: ])

length = 0
for i in idx_list:
    length += len(i)
length

# XGboost

In [32]:
params = {
        'silent': 1,
        'nthread': 4,
        'eval_metric': 'auc',
        'verbose_eval': True,
        'seed': 918,
        'alpha': 9.6523,
        'cosample_bytree': 0.9604,
        'eta': 0.1171,
        'gamma': 0.179,
        'max_depth': 7,
        'min_child_weight': 13,
        'subsample': 0.9609
    }

In [33]:
cls_list = [XGBClassifier(**params) for i in range(11)]

In [37]:
for i, idx in enumerate(idx_list):
    new_train = train[idx, :]
    new_y_train = y_train.values[idx]
    cls_list[i].fit(new_train, new_y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [38]:
predict_list = []
for cls in cls_list:
    predict_list.append(cls.predict_proba(test)[:, 1])


In [46]:
res = np.mean(np.array(predict_list), axis=0)

In [48]:
output_result(df_test.index, res, sid='_11models_mean')  # 0.769

# LightGBM

In [30]:
params_lgb = {
    'nthread': 4,
    #is_unbalance=True,
    'n_estimators' : 10000,
    'learning_rate' : 0.1171,
    #'num_leaves' : 32,
    'colsample_bytree' : 0.9604,
    'subsample' : 0.9609,
    'max_depth' : 7,
    'reg_alpha' : 9.6523,
    'reg_lambda' : 1,
    'min_split_gain' : 0.179,
    'min_child_weight' : 13,
    'metric': 'auc',
    'silent': -1,
    'verbose': -1,
    #scale_pos_weight=11
}

In [2]:
clf = lgb.LGBMClassifier(**params_lgb)

In [29]:
xgtrain = lgb.Dataset(train, label=y_train.values.ravel())
lgb.cv(params_lgb, xgtrain, 10, nfold=5, metrics='auc', early_stopping_rounds=10,
       # This is what I added
        stratified=False)


#model = lgb.LGBMClassifier(**params)
#model.fit(train, y_train)
#y_predict = model.predict_proba(test)
#return model, y_predict

  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


{'auc-mean': [0.709094446889912,
  0.7166094169418263,
  0.7200212031463803,
  0.723998477597493,
  0.7272192375196305,
  0.7292442359058453,
  0.7321313463225254,
  0.7343587797677004,
  0.736540781308255,
  0.7386130375435341,
  0.7406999834370118,
  0.7422023446570056,
  0.744160993789141,
  0.7457347829907458,
  0.7469706972597552,
  0.7489137982776963,
  0.7502076508807051,
  0.751499659917497,
  0.752744465126623,
  0.7539589416303054,
  0.7551041135587486,
  0.7560605269835544,
  0.7569793442113315,
  0.757837913523469,
  0.758697630905913,
  0.7593878533176487,
  0.7601775242526669,
  0.7607420993914591,
  0.7615622586615319,
  0.7621683750204936,
  0.7627208560159296,
  0.7632610123146928,
  0.7637425534514876,
  0.764109089804606,
  0.7646056938879124,
  0.7649797515261294,
  0.7655595282570864,
  0.7658867011224862,
  0.766130505408092,
  0.7664208763825641,
  0.7668262267757617,
  0.7671115924866918,
  0.767338052290021,
  0.7676093329841817,
  0.7679317914012883,
  0.76821

In [31]:
cls_list = [lgb.LGBMClassifier(**params_lgb) for i in range(11)]

In [36]:
for i, idx in enumerate(idx_list):
    new_train = train[idx, :]
    new_y_train = y_train.values[idx]
    cls_list[i].fit(new_train, new_y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [37]:
predict_list = []
for cls in cls_list:
    predict_list.append(cls.predict_proba(test)[:, 1])


In [38]:
res = np.mean(np.array(predict_list), axis=0)

output_result(df_test.index, res, sid='11models_mean_lgb')  # 0.772

# 测试 RobusxtScaler

In [41]:
from sklearn.preprocessing import RobustScaler

In [45]:
y_train = y_train.values.ravel()

In [49]:
xgb_no_feature_select(train, test, y_train, cv=True)

   train-auc-mean  train-auc-std  test-auc-mean  test-auc-std
0        0.705973       0.001680       0.700744      0.004082
1        0.714776       0.002003       0.708041      0.005586
2        0.721093       0.001433       0.712055      0.005532
3        0.729924       0.001927       0.719409      0.005051
4        0.733852       0.002252       0.722567      0.004702
5        0.738533       0.001697       0.726365      0.004990
6        0.742360       0.001394       0.729371      0.005238
7        0.745623       0.001581       0.731764      0.005240
8        0.749460       0.001838       0.734793      0.005099
9        0.752700       0.001473       0.737192      0.005509


(XGBClassifier(alpha=9.6523, base_score=0.5, booster='gbtree',
        colsample_bylevel=1, colsample_bytree=1, cosample_bytree=0.9604,
        eta=0.1171, eval_metric='auc', gamma=0.179, learning_rate=0.1,
        max_delta_step=0, max_depth=7, min_child_weight=13, missing=None,
        n_estimators=100, n_jobs=1, nthread=4, objective='binary:logistic',
        random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
        seed=918, silent=1, subsample=0.9609, verbose_eval=True),
 array([[0.9400627 , 0.05993729],
        [0.8798291 , 0.12017091],
        [0.9652743 , 0.03472573],
        ...,
        [0.9862132 , 0.01378676],
        [0.9527332 , 0.04726679],
        [0.8011434 , 0.19885659]], dtype=float32))

In [46]:
rs = RobustScaler()

In [48]:
train2 = rs.fit_transform(train, y_train)
test2 = rs.transform(test)

In [50]:
xgb_no_feature_select(train2, test2, y_train, cv=True)

   train-auc-mean  train-auc-std  test-auc-mean  test-auc-std
0        0.705973       0.001680       0.700744      0.004082
1        0.714776       0.002003       0.708041      0.005586
2        0.721093       0.001433       0.712055      0.005532
3        0.729924       0.001927       0.719408      0.005051
4        0.733852       0.002252       0.722566      0.004702
5        0.738533       0.001697       0.726364      0.004990
6        0.742360       0.001394       0.729370      0.005237
7        0.745623       0.001581       0.731763      0.005240
8        0.749460       0.001838       0.734792      0.005098
9        0.752701       0.001473       0.737192      0.005509


(XGBClassifier(alpha=9.6523, base_score=0.5, booster='gbtree',
        colsample_bylevel=1, colsample_bytree=1, cosample_bytree=0.9604,
        eta=0.1171, eval_metric='auc', gamma=0.179, learning_rate=0.1,
        max_delta_step=0, max_depth=7, min_child_weight=13, missing=None,
        n_estimators=100, n_jobs=1, nthread=4, objective='binary:logistic',
        random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
        seed=918, silent=1, subsample=0.9609, verbose_eval=True),
 array([[0.9540579 , 0.04594211],
        [0.88835865, 0.11164133],
        [0.96245784, 0.03754215],
        ...,
        [0.9888067 , 0.01119326],
        [0.96069896, 0.03930106],
        [0.79613644, 0.20386358]], dtype=float32))

In [55]:

from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

gnb = RandomForestClassifier()
cross_val_score(gnb, X=train, y=y_train, cv=10, scoring='roc_auc')

array([0.64019574, 0.63121106, 0.64117802, 0.64215522, 0.6330202 ,
       0.6299061 , 0.63554476, 0.63466676, 0.6340634 , 0.64615358])

In [56]:
?RandomForestClassifier

In [58]:
def get_rf_result(n_estimators, 
                  max_depth, 
                  min_samples_split, 
                  min_samples_leaf, 
                  max_leaf_nodes,
                 ):
    rf = RandomForestClassifier(n_estimators=int(n_estimators),
                                max_depth = max(0, int(max_depth)),
                                min_samples_split = max(0, int(min_samples_split)),
                                min_samples_leaf = max(0, int(min_samples_leaf)),
                                max_leaf_nodes = max(0, int(max_leaf_nodes))
                               )
    return np.mean(cross_val_score(rf, train, y_train, cv=10, scoring='roc_auc'))



In [None]:
rfBO = BayesianOptimization(get_rf_result,
        {'n_estimators': (10, 500), 
         'max_depth': (5, 20),
        })