In [65]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import auc, roc_curve, roc_auc_score
from sklearn.ensemble import GradientBoostingClassifier
from bayes_opt import BayesianOptimization
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [66]:
df_main_train = pd.read_csv('../data/handled/main_train.csv', index_col=0)
df_main_test = pd.read_csv('../data/handled/main_test.csv', index_col=0)

df_bureau_train = pd.read_csv('../data/handled/bureau_train.csv', index_col=0)
df_bureau_test = pd.read_csv('../data/handled/bureau_test.csv', index_col=0)

In [67]:
df_bureau_test.index = df_bureau_test.SK_ID_CURR
df_bureau_train.index = df_bureau_train.SK_ID_CURR

In [68]:
df_bureau_train = df_bureau_train.drop(columns='SK_ID_CURR')
df_bureau_test = df_bureau_test.drop(columns='SK_ID_CURR')

In [69]:
y_train = df_main_train.TARGET

df_main_train = df_main_train.drop(columns='TARGET')
#df_main_test = df_main_test.drop(columns='TARGET')
df_bureau_train = df_bureau_train.drop(columns='TARGET')
df_bureau_test = df_bureau_test.drop(columns='TARGET')

In [70]:
df_train = pd.merge(df_main_train, df_bureau_train, left_index=True, right_index=True)
df_test = pd.merge(df_main_test, df_bureau_test, left_index=True, right_index=True)

In [71]:
df_train = pd.get_dummies(df_train, dummy_na=True)
df_test = pd.get_dummies(df_test, dummy_na=True)

In [72]:
df_train, df_test = df_train.align(df_test, join='left', fill_value=0, axis=1)

In [73]:
df_train = df_train.drop(columns='SK_ID_CURR.1')
df_test = df_test.drop(columns='SK_ID_CURR.1')

In [76]:
im = Imputer()
im.fit(df_train)
train = im.transform(df_train)
test = im.transform(df_test)

In [77]:
lr = LogisticRegression()
lr.fit(train, y_train)
res = lr.predict_proba(test)

In [12]:
test_id = df_bureau_test.index

In [13]:
def output(test_id, test_prob, sid=0):
    result = pd.DataFrame(np.column_stack((test_id, test_prob)))
    result.columns = ['SK_ID_CURR', 'TARGET']
    result['SK_ID_CURR'] = result['SK_ID_CURR'].astype('int')
    result.to_csv('submission' + str(sid) + '.csv', header=True, index=False)

In [14]:
output(test_id, res[:, 1], sid='_add_bureau')

## GBDT test

In [60]:
def cross_val(est, x, y, support=None):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
    #sfm = SelectFromModel(LogisticRegression(penalty='l1'), 0.02)
    #sfm.fit(x_train, y_train)
    if support:
        x_train = x_train[:, support]
        x_test = x_test[:, support]
    
    est.fit(x_train, y_train)
    y_predict = est.predict_proba(x_test)
    auc_score = get_auc_score(y_test, y_predict[:, 1])
    return auc_score


def get_auc_score(y_true, y_predict_proba):
    f, t, _ = roc_curve(y_true, y_predict_proba, pos_label=1)
    return auc(f, t)
    

### 特征选择

In [16]:
sfm = SelectFromModel(LogisticRegression(penalty='l1'), 0.2)

In [17]:
sfm.fit(train, y_train)

SelectFromModel(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
        norm_order=1, prefit=False, threshold=0.2)

In [18]:
sfm.get_support().sum()

55

### 相关系数和sfm同时进行特征选择


In [37]:
a = pd.DataFrame(np.column_stack((train.T[0], y_train)))
a.corr().values[0, 1]

0.019187133596327993

In [38]:
def corr(x, y):
    df = pd.DataFrame(np.column_stack((x, y)))
    df = df.dropna()
    return df.corr().values[0, 1]

corr = list(map(lambda x: corr(x, y_train), train.T))

## XGBoost测试

In [42]:
from xgboost.sklearn import XGBClassifier


In [95]:
xgb_model = XGBClassifier()

In [97]:
cross_val(xgb_model, train, y_train)

0.7530502060786818

In [98]:
res = pd.DataFrame(np.column_stack((df_train.columns, xgb_model.feature_importances_)))
res = res.loc[res[1] !=0]
#res.sort_values(by=1, ascending=False)

In [99]:
support_vec = xgb_model.feature_importances_ != 0

## XGBoost 贝叶斯调参

In [124]:
from bayes_opt import BayesianOptimization
import xgboost as xgb

def xgb_evaluate(eta,
                            min_child_weight,
                            cosample_bytree,
                            max_depth,
                            subsample,
                            gamma,
                            alpha):
    global params, xgb, xgtrain, num_rounds, random_state
    params['eta'] = max(eta, 0)
    params['min_child_weight'] = int(min_child_weight)
    params['cosample_bytree'] = max(min(cosample_bytree, 1), 0)
    params['max_depth'] = int(max_depth)
    params['subsample'] = max(min(subsample, 1), 0)
    params['min_child_weight'] = int(min_child_weight)
    params['gamma'] = max(gamma, 0)
    params['alpha'] = max(alpha, 0)
    
    cv = xgb.cv(params, xgtrain, num_boost_round=num_rounds, metrics='auc', early_stopping_rounds=50,
                nfold=5, seed=random_state, callbacks=[xgb.callback.early_stop(50)])
    return cv['test-auc-mean'].values[-1]
    


In [127]:
xgtrain = xgb.DMatrix(train[:, support_vec], label=y_train)

In [None]:
num_rounds = 3000
random_state = 918
num_iter = 25
init_points = 5
params = {
    'silent' : 1,
    'nthread': 4,
    'eval_metric' : 'auc',
    'verbose_eval' : True,
    'seed': random_state,
}

xgbBO = BayesianOptimization(xgb_evaluate, {
                                            'eta':(0.1, 0.5),
                                            'min_child_weight' : (1, 20),
                                            'cosample_bytree' : (0.1, 1),
                                            'max_depth' : (5, 15),
                                            'subsample' : (0.5, 1),
                                            'gamma': (0, 10),
                                            'alpha': (0, 10)
                                        })
xgbBO.maximize(init_points=init_points, n_iter=num_iter)

[31mInitialization[0m
[94m--------------------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |     alpha |   cosample_bytree |       eta |     gamma |   max_depth |   min_child_weight |   subsample | 
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 50 rounds.
Stopping. Best iteration:
[696]	train-auc:0.761426+0.00127904	test-auc:0.75355+0.00340667

    1 | 29m46s | [35m   0.75355[0m | [32m   7.5220[0m | [32m           0.9244[0m | [32m   0.2803[0m | [32m   2.5626[0m | [32m     7.3636[0m | [32m           15.9469[0m | [32m     0.5370[0m | 
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 50 rounds.
Stopping. Best iteration:
[262]	train-auc:0.762515+0.000899799	test-auc:0.75073+0.00339422

    2 | 19m46s |    0.7

  " state: %s" % convergence_dict)


Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 50 rounds.
Stopping. Best iteration:
[57]	train-auc:0.77252+0.000910023	test-auc:0.751984+0.0030076

    9 | 03m36s |    0.75198 |    0.0920 |            0.1406 |    0.2458 |    1.2795 |      5.1447 |             1.4981 |      0.5186 | 
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 50 rounds.
Stopping. Best iteration:
[194]	train-auc:0.817735+0.000626569	test-auc:0.760405+0.00238065

   10 | 07m38s | [35m   0.76040[0m | [32m   9.6523[0m | [32m           0.9604[0m | [32m   0.1171[0m | [32m   0.1790[0m | [32m     6.9201[0m | [32m           12.9632[0m | [32m     0.9609[0m | 
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 50 rounds.
Stopping. Best iteration:
[34]	train-auc:0.779062+

  " state: %s" % convergence_dict)


Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 50 rounds.
Stopping. Best iteration:
[250]	train-auc:0.741361+0.00133717	test-auc:0.738356+0.00483395

   15 | 09m15s |    0.73836 |    0.0000 |            1.0000 |    0.1000 |   10.0000 |      5.0000 |             1.0000 |      0.5000 | 
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 50 rounds.
Stopping. Best iteration:
[41]	train-auc:0.856648+0.00128225	test-auc:0.752513+0.00240984

   16 | 04m35s |    0.75251 |    1.0320 |            0.9683 |    0.1452 |    0.4069 |      9.6462 |             1.3375 |      0.9985 | 


  " state: %s" % convergence_dict)


Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 50 rounds.
Stopping. Best iteration:
[222]	train-auc:0.744103+0.000977139	test-auc:0.740653+0.00493614

   17 | 22m43s |    0.74065 |   10.0000 |            1.0000 |    0.1000 |    9.7253 |     13.9751 |            12.8517 |      0.6927 | 


  " state: %s" % convergence_dict)


Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 50 rounds.
Stopping. Best iteration:
[510]	train-auc:0.758407+0.000857176	test-auc:0.751716+0.0038372

   18 | 20m21s |    0.75172 |    9.7325 |            0.5763 |    0.1103 |    3.9493 |      7.6420 |             6.7425 |      0.9093 | 
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 50 rounds.
Stopping. Best iteration:
[72]	train-auc:0.852982+0.000934866	test-auc:0.756444+0.00293179

   19 | 07m56s |    0.75644 |    9.8838 |            0.8963 |    0.1011 |    0.0555 |     11.6548 |            19.6546 |      0.8368 | 
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 50 rounds.
Stopping. Best iteration:
[86]	train-auc:0.828411+0.00095252	test-auc:0.756735+0.00251677

   20 | 07m52s |    0.75674 | 

  " state: %s" % convergence_dict)


Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 50 rounds.
Stopping. Best iteration:
[936]	train-auc:0.783759+0.000371222	test-auc:0.760161+0.00314282

   22 | 84m03s |    0.76016 |    9.8471 |            0.9138 |    0.1259 |    1.6029 |     14.9452 |             9.0917 |      0.8392 | 




Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 50 rounds.


In [None]:
Step |   Time |      Value |     alpha |   cosample_bytree |       eta |     gamma |   max_depth |   min_child_weight |   subsample | 
10 | 07m38s |    0.76040 |    9.6523 |            0.9604 |    0.1171 |    0.1790 |      6.9201 |            12.9632 |      0.9609 | 
22 | 84m03s |    0.76016 |    9.8471 |            0.9138 |    0.1259 |    1.6029 |     14.9452 |             9.0917 |      0.8392 | 

In [109]:
cv['test-auc-mean'].mean()


0.6480827684210527

In [113]:
cv


Unnamed: 0,test-auc-mean,test-auc-std,train-auc-mean,train-auc-std
0,0.560961,0.033322,0.830652,0.048688
1,0.557098,0.028375,0.853743,0.061532
2,0.550629,0.046166,0.892028,0.040672
3,0.608626,0.059304,0.920884,0.028417
4,0.623067,0.046418,0.947886,0.028996
5,0.642423,0.066361,0.959671,0.020887
6,0.639852,0.070188,0.962949,0.018274
7,0.634023,0.065931,0.96936,0.015126
8,0.657391,0.043328,0.977468,0.014496
9,0.662393,0.048306,0.983121,0.010186
