XGBoost hyperparameter search with Bayesian optimization on default features for https://www.kaggle.com/c/allstate-claims-severity

In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error
from bayes_opt import BayesianOptimization
from tqdm import tqdm

In [29]:
def xgb_evaluate(min_child_weight,
                 colsample_bytree,
                 max_depth,
                 subsample,
                 gamma,
                 alpha):
    """Objective function for the Bayesian optimiser: cross-validated AUC.

    Writes the proposed hyperparameters into the module-level ``params``
    dict, runs 5-fold ``xgb.cv`` on the module-level ``xgtrain`` and returns
    the last ``test-auc-mean`` value of the resulting history.

    Parameters
    ----------
    min_child_weight, max_depth : float
        Truncated to ``int`` before being passed to xgboost.
    colsample_bytree, subsample : float
        Clamped to the closed interval [0, 1].
    gamma, alpha : float
        Clamped to be non-negative.

    Returns
    -------
    float
        Final ``test-auc-mean`` entry of the cross-validation history.

    Notes
    -----
    Relies on the globals ``params``, ``xgtrain``, ``num_rounds`` and
    ``random_state`` being defined before the first call, and mutates
    ``params`` in place.
    """
    # The optimiser proposes real-valued points, so coerce/clamp each value
    # into the domain xgboost accepts.
    params['min_child_weight'] = int(min_child_weight)
    # Key was previously misspelled 'cosample_bytree', so the tuned value was
    # never actually applied.
    params['colsample_bytree'] = max(min(colsample_bytree, 1), 0)
    params['max_depth'] = int(max_depth)
    params['subsample'] = max(min(subsample, 1), 0)
    params['gamma'] = max(gamma, 0)
    params['alpha'] = max(alpha, 0)

    # `early_stopping_rounds` replaces the legacy `xgb.callback.early_stop`
    # helper, which is not available in newer xgboost releases.
    cv_result = xgb.cv(params, xgtrain, num_boost_round=num_rounds, nfold=5,
                       seed=random_state,
                       early_stopping_rounds=50)

    print(cv_result)
    return cv_result['test-auc-mean'].values[-1]

In [15]:
def prepare_data(path='../raw/application_train.csv'):
    """Load a raw training CSV and return it as an ``xgb.DMatrix``.

    The CSV is read with malformed lines skipped, its final row is dropped,
    every object-dtype column is label-encoded, ``loss`` becomes the label
    and ``loss``/``id`` are removed from the feature matrix.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to the path that was previously
        hard-coded.

    Returns
    -------
    xgb.DMatrix
        Feature matrix with ``loss`` attached as the label.

    Raises
    ------
    KeyError
        If the file has no ``loss`` or ``id`` column.

    Notes
    -----
    NOTE(review): the other cells of this notebook use a ``TARGET`` label,
    while this loader expects ``loss``/``id`` — confirm the default path
    points at a file that actually has those columns.
    """
    try:
        # pandas >= 1.3 spelling; 'warn' skips bad lines and reports them,
        # matching the old error_bad_lines=False behaviour.
        train = pd.read_csv(path, on_bad_lines='warn')
    except TypeError:
        # Older pandas does not know `on_bad_lines`.
        train = pd.read_csv(path, error_bad_lines=False)

    # Drop the last row (presumably a truncated record — TODO confirm).
    # Copy so the column assignments below do not write into a slice view.
    train = train.iloc[:-1].copy()
    categorical_columns = train.select_dtypes(include=['object']).columns

    for column in tqdm(categorical_columns):
        train[column] = LabelEncoder().fit_transform(train[column])

    y = train['loss']

    # Keyword form: the positional `axis` argument is rejected by pandas 2.x.
    X = train.drop(columns=['loss', 'id'])

    return xgb.DMatrix(X, label=y)

In [6]:
# Load the baseline feature table from CSV into a DataFrame.
train = pd.read_csv('./data/train_feature_baseline.csv')

In [8]:
# Keep only the rows that have a TARGET value (presumably the labelled rows — TODO confirm).
train_df = train[train['TARGET'].notnull()]

In [None]:
# Free memory: drop the unfiltered frame now that train_df holds the rows we keep.
import gc

del train
gc.collect()

In [22]:
# Label vector: the TARGET column of the filtered frame.
y = train_df['TARGET']

In [24]:
# Feature columns: every column except the label and the listed ID/index columns.
feats = [
    column
    for column in train_df.columns
    if column not in ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index']
]

In [26]:
# Feature matrix: the filtered frame restricted to the columns listed in `feats`.
X = train_df[feats]

In [28]:
# Wrap features and labels in a DMatrix; xgb_evaluate reads this as the global `xgtrain`.
xgtrain = xgb.DMatrix(X, label=y)

In [None]:
if __name__ == '__main__':
    # Alternative data source, currently disabled: build the DMatrix from the
    # raw CSV instead of the cells above.
    #     xgtrain = prepare_data()

    # --- Globals read by xgb_evaluate -------------------------------------
    random_state = 42
    num_rounds = 4000

    # Fixed xgboost settings; xgb_evaluate adds the tuned values on each call.
    # NOTE(review): 'verbose_eval' looks like an argument of xgb.cv rather
    # than a booster parameter — confirm it has any effect in this dict.
    params = {
        'eta': 0.1,
        'silent': 1,
        'eval_metric': 'auc',
        'verbose_eval': True,
        'seed': random_state
    }

    # --- Optimiser budget --------------------------------------------------
    init_points = 5
    num_iter = 25

    # (lower, upper) bounds for every argument of xgb_evaluate.
    search_bounds = {
        'min_child_weight': (1, 20),
        'colsample_bytree': (0.1, 1),
        'max_depth': (5, 15),
        'subsample': (0.5, 1),
        'gamma': (0, 10),
        'alpha': (0, 10),
    }

    xgbBO = BayesianOptimization(xgb_evaluate, search_bounds)
    xgbBO.maximize(init_points=init_points, n_iter=num_iter)

[31mInitialization[0m
[94m---------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |     alpha |   colsample_bytree |     gamma |   max_depth |   min_child_weight |   subsample | 
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 50 rounds.
