# Try out various sklearn models for later use in model ensembling




In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import auc, roc_curve, roc_auc_score
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
from bayes_opt import BayesianOptimization
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LassoCV
import gc
from xgboost.sklearn import XGBClassifier
import xgboost as xgb
import lightgbm as lgb


In [12]:

def data_prepare(num=None):
    """Load the pre-processed training features, test features and labels.

    Args:
        num: Maximum number of data rows to read from each CSV file.
            ``None`` (the default) reads every row.

    Returns:
        A ``(train, test, y_train)`` tuple. ``train`` and ``test`` are
        DataFrames indexed by the first CSV column; ``y_train`` is the
        content of ``y_train.csv`` flattened to a 1-D NumPy array.
    """
    train = pd.read_csv('../data/handled/train.csv', nrows=num, index_col=0)
    test = pd.read_csv('../data/handled/test.csv', nrows=num, index_col=0)
    # The label file is read without a header row. ``header=None`` is the
    # documented way to request that; the former ``header=-1`` is rejected
    # by current pandas releases.
    y_train = pd.read_csv('../data/handled/y_train.csv', nrows=num,
                          header=None, index_col=0)
    return train, test, y_train.values.ravel()

def imput_by_mean(train, test):
    """Fill missing values in both feature sets with a default ``Imputer``.

    The imputer is fitted on ``train`` only and the fitted imputer is then
    applied to ``test``.

    Args:
        train: Training feature matrix.
        test: Test feature matrix.

    Returns:
        ``(train, test)`` after imputation.
    """
    # NOTE(review): relies on the default Imputer settings for the "mean"
    # behaviour the function name promises - confirm against the installed
    # scikit-learn version.
    imputer = Imputer()
    imputed_train = imputer.fit_transform(train)
    imputed_test = imputer.transform(test)
    return imputed_train, imputed_test

def read_importance():
    """Load the importance vector stored in ``./importance.txt``.

    Only the first line of the file is read; it is split on commas and
    every piece is converted with ``float``.

    Returns:
        1-D NumPy array holding one float per comma-separated value.
    """
    with open('./importance.txt', 'r') as handle:
        first_line = handle.readline()
    return np.array(list(map(float, first_line.split(","))))

def get_select_ids(importance, top_num=None):
    """Build a boolean mask that keeps the highest-scoring entries.

    Args:
        importance: 1-D sequence of numeric scores.
        top_num: Number of highest-scoring entries to keep. When falsy
            (``None`` or ``0``) every entry with a strictly positive score
            is kept instead. When it is at least ``len(importance)`` every
            entry is kept.

    Returns:
        List of Python ``bool`` values, one per entry of ``importance``.
        The cut-off comparison is strict, so entries tied with the
        ``(top_num + 1)``-th largest score are dropped and fewer than
        ``top_num`` entries may be ``True``.
    """
    scores = np.asarray(importance)
    # Asking for at least as many entries as exist used to raise IndexError
    # on the threshold lookup below; keep everything instead.
    if top_num and top_num >= scores.size:
        return [True] * scores.size
    # Threshold is the (top_num + 1)-th largest score, or 0 with no top_num.
    threshold = np.sort(scores)[-top_num - 1] if top_num else 0
    return (scores > threshold).tolist()

In [8]:
# Load the full data set (no row limit) into module-level names that the
# tuning functions below read directly.
df_train, df_test, y_train = data_prepare()
# Work on the raw value arrays of both frames after imputation.
train, test = imput_by_mean(df_train.values, df_test.values)
# Importance vector, passed to get_select_ids() by the tuning functions below.
# NOTE(review): presumably one score per feature column of ``train`` - confirm.
imp = read_importance()

# Random Forest

In [9]:
def get_rf_result(n_estimators,
                  max_depth,
                  min_samples_split,
                  min_samples_leaf,
                  max_leaf_nodes,
                 ):
    """Score one random-forest configuration for the Bayesian optimiser.

    The optimiser proposes real-valued hyper-parameters, so each value is
    truncated to an ``int`` and clamped to the smallest value scikit-learn
    accepts for that parameter. The model is evaluated on the module-level
    ``train`` / ``y_train`` arrays.

    Args:
        n_estimators: Proposed number of trees.
        max_depth: Proposed maximum tree depth.
        min_samples_split: Proposed minimum samples needed to split a node.
        min_samples_leaf: Proposed minimum samples per leaf.
        max_leaf_nodes: Proposed maximum number of leaves per tree.

    Returns:
        Mean ROC AUC over 5 cross-validation folds.
    """
    rf = RandomForestClassifier(
        n_estimators=max(1, int(n_estimators)),
        # The previous floor of 0 let through values scikit-learn rejects;
        # the floors below are the documented minimums for int parameters.
        max_depth=max(1, int(max_depth)),
        min_samples_split=max(2, int(min_samples_split)),
        min_samples_leaf=max(1, int(min_samples_leaf)),
        max_leaf_nodes=max(2, int(max_leaf_nodes)),
        # Fixed seed, as in the other tuning objectives in this file, so the
        # same hyper-parameters always build the same forest.
        random_state=918,
    )
    return np.mean(cross_val_score(rf, train, y_train, cv=5, scoring='roc_auc'))



In [13]:
# Bayesian search over the random-forest hyper-parameters; every entry of
# the bounds mapping is ``name: (lower, upper)``.
rfBO = BayesianOptimization(
    f=get_rf_result,
    pbounds={
        'n_estimators': (10, 500),
        'max_depth': (5, 20),
        'min_samples_split': (5, 100),
        'min_samples_leaf': (5, 100),
        'max_leaf_nodes': (5, 100),
    },
)

# Run the search with 5 initial points followed by 30 iterations.
rfBO.maximize(init_points=5, n_iter=30)

In [None]:
# Hand-written random-forest configuration; the expression is evaluated but
# its result is not bound to a name.
# NOTE(review): presumably a configuration chosen from the search above, but
# max_depth=99 and max_leaf_nodes=390 lie outside the bounds given to rfBO
# (5-20 and 5-100) - confirm where these numbers come from.
RandomForestClassifier(n_estimators=19,
                                max_depth = 99,
                                min_samples_split = 100,
                                min_samples_leaf =21,
                                max_leaf_nodes = 390)

# CART

In [None]:
def get_cls_result(
                  max_depth,
                  min_samples_split,
                  min_samples_leaf,
                  max_leaf_nodes,
                 ):
    """Score one decision-tree (CART) configuration for the Bayesian optimiser.

    The optimiser proposes real-valued hyper-parameters, so each value is
    truncated to an ``int`` and clamped to the smallest value scikit-learn
    accepts for that parameter. The tree is evaluated on the columns of the
    module-level ``train`` array selected by ``get_select_ids(imp, 50)``.

    NOTE: a later cell in this file defines another function with the same
    name (for Extra Trees), which replaces this one once that cell has run.

    Args:
        max_depth: Proposed maximum tree depth.
        min_samples_split: Proposed minimum samples needed to split a node.
        min_samples_leaf: Proposed minimum samples per leaf.
        max_leaf_nodes: Proposed maximum number of leaves.

    Returns:
        Mean ROC AUC over 5 cross-validation folds.
    """
    cls = DecisionTreeClassifier(
        # The previous floor of 0 let through values scikit-learn rejects;
        # the floors below are the documented minimums for int parameters.
        max_depth=max(1, int(max_depth)),
        min_samples_split=max(2, int(min_samples_split)),
        min_samples_leaf=max(1, int(min_samples_leaf)),
        max_leaf_nodes=max(2, int(max_leaf_nodes)),
        random_state=918,
    )
    selected = get_select_ids(imp, 50)
    return np.mean(cross_val_score(cls, train[:, selected], y_train, cv=5, scoring='roc_auc'))

# Bayesian search over the decision-tree hyper-parameters; every entry of
# the bounds mapping is ``name: (lower, upper)``.
BO = BayesianOptimization(
    f=get_cls_result,
    pbounds={
        'max_depth': (5, 20),
        'min_samples_split': (5, 100),
        'min_samples_leaf': (5, 100),
        'max_leaf_nodes': (5, 100),
    },
)

# Run the search with 5 initial points followed by 30 iterations.
BO.maximize(init_points=5, n_iter=30)

In [None]:
def get_cls_result2(
                  max_depth,
                  min_samples_split,
                  min_samples_leaf,
                  max_leaf_nodes,
                 ):
    """Score one entropy-criterion decision tree for the Bayesian optimiser.

    The optimiser proposes real-valued hyper-parameters, so each value is
    truncated to an ``int`` and clamped to the smallest value scikit-learn
    accepts for that parameter. The tree is evaluated on the columns of the
    module-level ``train`` array selected by ``get_select_ids(imp, 50)``.

    Args:
        max_depth: Proposed maximum tree depth.
        min_samples_split: Proposed minimum samples needed to split a node.
        min_samples_leaf: Proposed minimum samples per leaf.
        max_leaf_nodes: Proposed maximum number of leaves.

    Returns:
        Mean ROC AUC over 5 cross-validation folds.
    """
    cls = DecisionTreeClassifier(
        criterion='entropy',
        # The previous floor of 0 let through values scikit-learn rejects;
        # the floors below are the documented minimums for int parameters.
        max_depth=max(1, int(max_depth)),
        min_samples_split=max(2, int(min_samples_split)),
        min_samples_leaf=max(1, int(min_samples_leaf)),
        max_leaf_nodes=max(2, int(max_leaf_nodes)),
        random_state=918,
    )
    selected = get_select_ids(imp, 50)
    return np.mean(cross_val_score(cls, train[:, selected], y_train, cv=5, scoring='roc_auc'))



In [None]:
# Bayesian search over the entropy-criterion decision-tree hyper-parameters;
# every entry of the bounds mapping is ``name: (lower, upper)``.
BO = BayesianOptimization(
    f=get_cls_result2,
    pbounds={
        'max_depth': (5, 20),
        'min_samples_split': (5, 100),
        'min_samples_leaf': (5, 100),
        'max_leaf_nodes': (5, 100),
    },
)

# Run the search with 5 initial points followed by 30 iterations.
BO.maximize(init_points=5, n_iter=30)

In [None]:
# Hand-written decision-tree configuration; the expression is evaluated but
# its result is not bound to a name.
# NOTE(review): presumably the best setting found by the searches above - confirm.
DecisionTreeClassifier(max_depth=9, max_leaf_nodes= 92, min_samples_leaf=98, min_samples_split=26)

# Extra Tree

In [None]:
def get_cls_result(n_estimators,
                  max_depth,
                  min_samples_split,
                  min_samples_leaf,
                  max_leaf_nodes,
                 ):
    """Score one Extra-Trees configuration for the Bayesian optimiser.

    The optimiser proposes real-valued hyper-parameters, so each value is
    truncated to an ``int`` and clamped to the smallest value scikit-learn
    accepts for that parameter. The model is evaluated on the columns of the
    module-level ``train`` array selected by ``get_select_ids(imp, 10)``.

    NOTE: this reuses the name of the decision-tree objective defined in an
    earlier cell and replaces it once this cell has run.

    Args:
        n_estimators: Proposed number of trees.
        max_depth: Proposed maximum tree depth.
        min_samples_split: Proposed minimum samples needed to split a node.
        min_samples_leaf: Proposed minimum samples per leaf.
        max_leaf_nodes: Proposed maximum number of leaves per tree.

    Returns:
        Mean ROC AUC over 5 cross-validation folds.
    """
    cls = ExtraTreesClassifier(
        n_estimators=max(1, int(n_estimators)),
        # The previous floor of 0 let through values scikit-learn rejects;
        # the floors below are the documented minimums for int parameters.
        max_depth=max(1, int(max_depth)),
        min_samples_split=max(2, int(min_samples_split)),
        min_samples_leaf=max(1, int(min_samples_leaf)),
        max_leaf_nodes=max(2, int(max_leaf_nodes)),
        random_state=918,
    )
    selected = get_select_ids(imp, 10)
    return np.mean(cross_val_score(cls, train[:, selected], y_train, cv=5, scoring='roc_auc'))


# Bayesian search over the Extra-Trees hyper-parameters; every entry of the
# bounds mapping is ``name: (lower, upper)``.
BO = BayesianOptimization(
    f=get_cls_result,
    pbounds={
        'n_estimators': (10, 500),
        'max_depth': (5, 20),
        'min_samples_split': (5, 100),
        'min_samples_leaf': (5, 100),
        'max_leaf_nodes': (5, 100),
    },
)

# Run the search with 5 initial points followed by 30 iterations.
BO.maximize(init_points=5, n_iter=30)

In [None]:
# Hand-written Extra-Trees configuration; the expression is evaluated but its
# result is not bound to a name.
# NOTE(review): presumably the best setting found by the search above - confirm.
ExtraTreesClassifier(max_depth=20, max_leaf_nodes = 93,min_samples_leaf=81, min_samples_split=5, n_estimators = 248)