In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import auc, roc_curve, roc_auc_score
from sklearn.ensemble import GradientBoostingClassifier
from bayes_opt import BayesianOptimization
from sklearn.cross_validation import cross_val_score
from sklearn.

In [8]:
class DataHandle:
    def __init__(self, missing_rate=0.3):
        self.missing_rate = missing_rate
        self.clear()
    
    def clear(self):
        self.imputer = None
        self.ss = None
        self.sfm = None
        self.x_before = None
        self.x_after = None
        self.feature_by_missing = None
        
    def fit(self, x, y):
        self.clear()
        if not isinstance(x, pd.DataFrame):
            raise TypeError
        self.x_before = x
        self.y_before = y
        self.y_after = y
        self.feature_by_missing = x.columns[x.isnull().sum()/x.shape[0] < self.missing_rate]
        self.x_before = self.x_before[self.feature_by_missing]
        self.x_before = pd.get_dummies(self.x_before)
        self.x_after = self.data_preprocess(self.x_before.values, y)
        
    def data_preprocess(self, x, y=None):
        assert isinstance(x, np.ndarray)
        # 异常值
        pass

        # 缺失值
        if self.imputer is None:
            self.imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
            self.imputer.fit(x)
        x =  self.imputer.transform(x)

        # 标准化
        if self.ss is None:
            self.ss = StandardScaler()
            self.ss.fit(x)
        x = self.ss.transform(x)

        # 特征选择
        if self.sfm is None:
            self.sfm = SelectFromModel(LogisticRegression(penalty="l1", C=0.01))
            self.sfm.fit(x, y)
        x = self.sfm.transform(x)

        return x

    def transform(self, x_test):
        x_test = x_test[self.feature_by_missing]
        x_test = pd.get_dummies(x_test)
        # 统一train和test的特征
        _, x_test = self.x_before.align(x_test, join='left', axis=1, fill_value=0)
        x_test = self.data_preprocess(x_test.values)
        return self.x_after, self.y_after, x_test


def get_auc_score(y_true, y_predict_proba):
    f, t, _ = roc_curve(y_true, y_predict_proba, pos_label=1)
    return auc(f, t)

def output(test_id, test_prob, sid=0):
    result = pd.DataFrame(np.column_stack((test_id, test_prob)))
    result.columns = ['SK_ID_CURR', 'TARGET']
    result['SK_ID_CURR'] = result['SK_ID_CURR'].astype('int')
    result.to_csv('submission' + str(sid) + '.csv', header=True, index=False)

In [30]:
df_train = pd.read_csv("../data/application_train.csv")
df_test = pd.read_csv("../data/application_test.csv")

# 全部数据集，用以输出并且submit

In [20]:
x_train_all = df_train.iloc[:, 2:]
y_train_all = df_train['TARGET'].values

x_test_all = df_test.iloc[:, 1:]
test_id  = df_test.iloc[:, 0]

In [21]:
dh_all = DataHandle()
dh_all.fit(x_train_all, y_train_all)
x_train_all, y_train_all, x_test_all = dh_all.transform(x_test_all)

In [23]:
x_train_all.shape

(307511, 118)

# 切分数据集，用以获得模型参数并且交叉验证

In [26]:
df_x = df_train.iloc[:, 2:]
df_y = df_train['TARGET']
df_x_train, df_x_test, df_y_train, df_y_test = train_test_split(df_x, df_y)

In [27]:
dh = DataHandle()
dh.fit(df_x_train, df_y_train)
x_train, y_train, x_test = dh.transform(df_x_test)

## GBDT模型

In [14]:
# GBDT调参

y_train = np.array(y_train, dtype='int')
y_test = np.array(df_y_test, dtype='int')

gp_params = {"alpha": 1e-5}


def gbt_cv(learning_rate, n_estimators, max_depth, min_samples_split, min_samples_leaf,sub_sample):
    val = cross_val_score(
        GradientBoostingClassifier(learning_rate=learning_rate, n_estimators=int(n_estimators),
                                  max_depth=int(max_depth), min_samples_split=int(min_samples_split),
                                  min_samples_leaf=int(min_samples_leaf), subsample=sub_sample),
        x_train, y_train, 'roc_auc', cv=2
    ).mean()

    return val

svcBO = BayesianOptimization(gbt_cv,
                             {'learning_rate': (0.001, 1), 'n_estimators': (10, 500),
                             'max_depth': (2, 8), 'min_samples_split': (10, 100), 
                             'min_samples_leaf':(10, 100), 'sub_sample': (0.01, 1.0)})

svcBO.explore({'learning_rate': [0.001, 0.1, 0.5], 'n_estimators': [20, 100, 200],
                             'max_depth': [2,3,4], 'min_samples_split': [10, 50, 80], 
                             'min_samples_leaf': [10, 40, 80], 'sub_sample':[0.1,0.5,0.8]})
svcBO.maximize(n_iter=10, **gp_params)

[31mInitialization[0m
[94m----------------------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   learning_rate |   max_depth |   min_samples_leaf |   min_samples_split |   n_estimators |   sub_sample | 
    1 | 00m12s | [35m   0.68233[0m | [32m         0.0010[0m | [32m     2.0000[0m | [32m           10.0000[0m | [32m            10.0000[0m | [32m       20.0000[0m | [32m      0.1000[0m | 
    2 | 02m49s | [35m   0.74365[0m | [32m         0.1000[0m | [32m     3.0000[0m | [32m           40.0000[0m | [32m            50.0000[0m | [32m      100.0000[0m | [32m      0.5000[0m | 
    3 | 14m00s |    0.72152 |          0.5000 |      4.0000 |            80.0000 |             80.0000 |       200.0000 |       0.8000 | 
    4 | 26m02s |    0.68976 |          0.5825 |      5.8895 |            60.9732 |             34.6326 |       251.1589 |       0.8872 | 
    5 | 04m50s

In [16]:
print(svcBO.res['max'])

{'max_val': 0.7459098616411393, 'max_params': {'learning_rate': 0.13349744579104986, 'n_estimators': 368.16555294973546, 'max_depth': 4.323369081020995, 'min_samples_split': 99.55056500297515, 'min_samples_leaf': 99.62824252348736, 'sub_sample': 0.9438267090259804}}


In [18]:
gbdt = GradientBoostingClassifier(learning_rate=0.1335, n_estimators=368, max_depth=4,
                                  min_samples_leaf=100, min_samples_split=100, subsample=0.9438)
gbdt.fit(x_train, y_train)
y_pred = gbdt.predict_proba(x_test)[:, 1]
print(get_auc_score(y_test, y_pred))

0.7521702069313638


In [24]:
# 当前最好的GBDT参数
gbdt = GradientBoostingClassifier(learning_rate=0.1335, n_estimators=368, max_depth=4,
                                  min_samples_leaf=100, min_samples_split=100, subsample=0.9438)
gbdt.fit(x_train_all, y_train_all)
y_pred = gbdt.predict_proba(x_test_all)[:, 1]

In [25]:
output(test_id, y_pred, sid=1)

## 随机森林模型

In [None]:
# 随机森林调参

gp_params = {"alpha": 1e-5}

def rf_cv(n_estimators, max_depth, max_features, min_samples_leaf, min_samples_split):
    val = cross_val_score(
        RandomForestClassifier(criterion="gini", n_estimators=int(n_estimators),
                               max_depth=int(max_depth),
                               max_features=int(max_features),
                               min_samples_leaf=int(min_samples_leaf), 
                               min_samples_split=int(min_samples_split)),
        x_train, y_train, 'roc_auc', cv=2
    ).mean()
    return val

svcBO = BayesianOptimization(rf_cv,
                             {'n_estimators':(10,500), 'max_depth': (2, 20), 'max_features': (1, x_train.shape[1]-1),
                             'min_samples_leaf': (2, 100), 'min_samples_split': (2, 100)})

svcBO.explore({'n_estimators': [10, 200,400], 'max_depth': [2, 10, 20], 'max_features': [1, x_train.shape[1]//2, x_train.shape[1]-1],
                             'min_samples_leaf': [2,50,100], 'min_samples_split': [2, 50, 100]})
svcBO.maximize(n_iter=10, **gp_params)

## LightGBM

In [29]:
import lightgbm as lgb

clf = lgb.LGBMClassifier( 
            nthread=4,
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=34,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.041545473,
            reg_lambda=0.0735294,
            min_split_gain=0.0222415,
            min_child_weight=39.3259775,
            silent=-1,
            verbose=-1, )

In [30]:
clf.fit(x_train, y_train)
pred = clf.predict_proba(x_test)

In [34]:
get_auc_score(y_test, pred[:, 1])

0.49833144085681846