In [1]:
import numpy as np
import pandas as pd
import gc
import time
from contextlib import contextmanager
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.naive_bayes import  GaussianNB

warnings.simplefilter(action='ignore', category=FutureWarning)


In [2]:
# Load the pre-built feature table; the first CSV column becomes the row index.
df = pd.read_csv(
    "../../data/handled/kernel_1/df_all.csv",
    header=0,
    index_col=0,
)


In [3]:

# Rows with a TARGET value are the labelled training set; rows without one are
# the rows to be scored.
train_df = df.loc[df['TARGET'].notnull()]
test_df = df.loc[df['TARGET'].isnull()]
del df
gc.collect()

# Keep ids and labels aside before the frames are reduced to model features.
train_id = train_df['SK_ID_CURR']
test_id = test_df['SK_ID_CURR']
y_train = train_df['TARGET'].values

# Everything except the label and the id/index columns is a model feature.
feats = [
    col
    for col in train_df.columns
    if col not in ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index']
]
train_df = train_df[feats]
test_df = test_df[feats]

In [4]:
# cross_val_score(clf_1, train_df, y_train, scoring='roc_auc')

In [5]:
# Rank features by their mean recorded importance and keep the 50 strongest.
importance = pd.read_csv("feature_importance_330.csv", header=0, index_col=0)
select_feats = (
    importance
    .groupby('feature')['importance']
    .mean()
    .sort_values(ascending=False)
    .head(50)
    .index
)

In [6]:
# Reduced views of both frames restricted to the selected features.
train_df_tiny, test_df_tiny = (
    frame.loc[:, select_feats] for frame in (train_df, test_df)
)

In [7]:
# `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22; its
# replacement, `sklearn.impute.SimpleImputer`, exists since 0.20. Prefer the new
# class and fall back to the legacy one so this cell runs on either version.
# Both default to mean imputation per column.
# NOTE(review): SimpleImputer rejects infinite values, which the legacy class
# may have tolerated - confirm the selected features contain no inf.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:  # scikit-learn < 0.20
    from sklearn.preprocessing import Imputer

# Fit the column statistics on the training rows only, then reuse them for the
# test rows. Both results are plain numpy arrays (column labels are dropped).
im = Imputer()
train_df_tiny = im.fit_transform(train_df_tiny)
test_df_tiny = im.transform(test_df_tiny)

In [8]:
# Hyper-parameters for the primary LightGBM model.
lgb_params = dict(
    nthread=8,
    n_estimators=10000,
    learning_rate=0.02,
    num_leaves=32,
    colsample_bytree=0.9497036,
    subsample=0.8715623,
    max_depth=8,
    reg_alpha=0.04,
    reg_lambda=0.073,
    min_split_gain=0.0222415,
    min_child_weight=40,
    silent=-1,
    verbose=-1,
)

# Hyper-parameters for the random-forest model.
rf_params = dict(
    n_jobs=8,
    n_estimators=200,
    max_depth=14,
    max_leaf_nodes=100,
    min_samples_leaf=50,
)

# Hyper-parameters for the extra-trees model.
et_params = dict(
    n_jobs=8,
    n_estimators=400,
    max_depth=8,
    max_leaf_nodes=30,
    min_samples_leaf=80,
)


## TODO: an XGBoost model could be added here as well

In [9]:
# Display the class docstring to see its exact text.
LGBMClassifier.__doc__

'LightGBM classifier.'

In [11]:
class wrapper:
    """K-fold ensemble of a single estimator class.

    One estimator is created per fold. ``fit`` trains each of them on the
    training part of its fold and prints the validation AUC; ``predict_proba``
    averages the positive-class probability over all fold estimators.

    Parameters
    ----------
    cls : type
        Estimator class exposing ``fit`` and ``predict_proba``.
    params : dict
        Keyword arguments used to build every fold estimator.
    n_folder : int, default 5
        Number of folds, and therefore of estimators.
    """

    def __init__(self, cls, params, n_folder=5):
        self.cls_list = [cls(**params) for _ in range(n_folder)]
        self.n_folder = n_folder
        # Kept so existing code reading ``.doc`` keeps working; it is no longer
        # used to decide how the estimator is trained.
        self.doc = cls.__doc__
        # Detect LightGBM by type instead of comparing the docstring text: the
        # docstring can change between releases, is None under ``python -OO``
        # and differs for subclasses that define their own.
        self._is_lgbm = isinstance(cls, type) and issubclass(cls, LGBMClassifier)

    def _positive_proba(self, model, x):
        """Positive-class probabilities of one fitted fold estimator."""
        if self._is_lgbm:
            # Predict with the iteration selected by early stopping.
            return model.predict_proba(x, num_iteration=model.best_iteration_)[:, 1]
        return model.predict_proba(x)[:, 1]

    def fit(self, x_train, y_train):
        """Train one estimator per fold and print each fold's validation AUC.

        ``x_train`` and ``y_train`` are indexed with integer index arrays, so
        they must support that form of indexing (e.g. numpy arrays).
        """
        folds = KFold(n_splits=self.n_folder, shuffle=True, random_state=47)
        for n_fold, (train_idx, valid_idx) in enumerate(folds.split(x_train, y_train)):
            model = self.cls_list[n_fold]
            fold_x, fold_y = x_train[train_idx], y_train[train_idx]
            valid_x, valid_y = x_train[valid_idx], y_train[valid_idx]

            if self._is_lgbm:
                # Early stopping watches the last eval set, i.e. the validation part.
                model.fit(fold_x, fold_y, eval_set=[(fold_x, fold_y), (valid_x, valid_y)],
                          eval_metric='auc', verbose=1000, early_stopping_rounds=200)
            else:
                model.fit(fold_x, fold_y)

            y_valid_predict = self._positive_proba(model, valid_x)
            roc = roc_auc_score(valid_y, y_valid_predict)
            print("------------fold:", n_fold, " Done, ROC: ", roc)

    def predict_proba(self, x_test):
        """Return the fold-averaged positive-class probability as a 1-D array.

        Unlike scikit-learn estimators this returns shape ``(n_samples,)``,
        not ``(n_samples, n_classes)``.
        """
        res = np.zeros(x_test.shape[0])
        for model in self.cls_list:
            res += self._positive_proba(model, x_test)
        return res / self.n_folder
        


In [12]:
# One fold-ensemble wrapper per base learner, each with its own parameter set.
lgb, rf, et = (
    wrapper(estimator_cls, estimator_params)
    for estimator_cls, estimator_params in (
        (LGBMClassifier, lgb_params),
        (RandomForestClassifier, rf_params),
        (ExtraTreesClassifier, et_params),
    )
)

In [13]:
 # Second LightGBM configuration: lower learning rate, more leaves, deeper
 # trees and heavier regularisation than `lgb_params`.
 lgb_params2 = dict(
     nthread=8,
     n_estimators=10000,
     learning_rate=0.01,
     num_leaves=43,
     colsample_bytree=0.6558,
     subsample=0.8927,
     max_depth=14,
     reg_alpha=1.4957,
     reg_lambda=8.5621,
     min_split_gain=0.4919,
     min_child_weight=30,
     silent=-1,
     verbose=-1,
 )

In [14]:
# Level-1 models for stacking as (estimator, train matrix, test matrix) tuples:
# the full feature set once, and the reduced feature set with both LightGBM
# parameter sets.
model_list = [
    (LGBMClassifier(**params), train_x, test_x)
    for params, train_x, test_x in (
        (lgb_params, train_df.values, test_df.values),
        (lgb_params, train_df_tiny, test_df_tiny),
        (lgb_params2, train_df_tiny, test_df_tiny),
    )
]

In [15]:
def _positive_class_proba(proba):
    """Return positive-class probabilities as a 1-D array.

    Accepts the usual 2-D ``(n_samples, n_classes)`` output of a scikit-learn
    style ``predict_proba`` (column 1 is taken) as well as an already 1-D
    vector, which is what ``wrapper.predict_proba`` returns.
    """
    proba = np.asarray(proba)
    return proba[:, 1] if proba.ndim == 2 else proba


def model_fit(model_tuple, train_ids, valid_ids, y_train):
    """Fit one stacking member on a fold and predict validation and test rows.

    Parameters
    ----------
    model_tuple : tuple
        ``(model, train_matrix, test_matrix)``. The matrices are indexed with
        integer index arrays, so they must support that (e.g. numpy arrays).
    train_ids, valid_ids : array-like of int
        Row positions of the fold's training and validation parts.
    y_train : array
        Labels aligned with the rows of ``train_matrix``.

    Returns
    -------
    tuple
        ``(validation probabilities, test probabilities, validation AUC)``;
        both probability vectors are 1-D positive-class probabilities.
    """
    model, train_x, test_x = model_tuple
    fold_x, fold_y = train_x[train_ids], y_train[train_ids]
    valid_x, valid_y = train_x[valid_ids], y_train[valid_ids]

    model.fit(fold_x, fold_y)

    # The helper accepts both 2-D estimator output and the 1-D output of
    # `wrapper`, so either kind of model can be stacked.
    valid_pred = _positive_class_proba(model.predict_proba(valid_x))
    auc = roc_auc_score(valid_y, valid_pred)
    test_pred = _positive_class_proba(model.predict_proba(test_x))
    return valid_pred, test_pred, auc

In [16]:
# `sklearn.cross_validation` was removed in scikit-learn 0.20. `StratifiedKFold`
# is taken from the `sklearn.model_selection` import at the top of the notebook,
# which the old import here used to shadow.
def stacking(model_list, y_train, sample_num, test_sample_num, meta_model=LogisticRegression(), k_folder=5):
    """Build out-of-fold (level-1) prediction features for stacking.

    Parameters
    ----------
    model_list : list of tuple
        ``(model, train_matrix, test_matrix)`` tuples as consumed by ``model_fit``.
    y_train : array
        Training labels; also used to stratify the folds.
    sample_num : int
        Number of training rows.
    test_sample_num : int
        Number of test rows.
    meta_model : estimator
        Currently unused; the meta-model step is commented out below.
    k_folder : int, default 5
        Number of stratified folds.

    Returns
    -------
    X_train_stack : ndarray, shape (sample_num, n_models)
        Out-of-fold positive-class predictions of every model.
    X_test_stack : ndarray, shape (test_sample_num, n_models)
        Test predictions of every model, averaged over the folds.
    auc_score_mat : ndarray, shape (n_models, k_folder)
        Validation AUC of every model on every fold.
    """
    clfs_num = len(model_list)

    splitter = StratifiedKFold(n_splits=k_folder, shuffle=True, random_state=47)
    # split() uses X only for its length; stratification comes from y_train.
    folds = list(splitter.split(np.zeros(len(y_train)), y_train))

    X_train_stack = np.zeros((sample_num, clfs_num))
    X_test_stack = np.zeros((test_sample_num, clfs_num))
    auc_score_mat = np.zeros((clfs_num, k_folder))

    for i, (train_idx, valid_idx) in enumerate(folds):
        print('fold:', i)
        for j, model_tuple in enumerate(model_list):
            print("model:", j)
            valid_y, test_y, auc = model_fit(model_tuple, train_idx, valid_idx, y_train)
            # The training matrix is filled fold by fold; after k_folder passes
            # every row holds its out-of-fold prediction for model j.
            X_train_stack[valid_idx, j] = valid_y
            X_test_stack[:, j] += test_y
            auc_score_mat[j, i] = auc

    # Average the per-fold test predictions.
    X_test_stack = X_test_stack / k_folder
    return X_train_stack, X_test_stack, auc_score_mat
#     meta_model.fit(X_train_stack, y_train)
#     return meta_model.predict_proba(X_test_stack)[:, 1], auc_score_mat



In [17]:
# Run the stacking pass: out-of-fold train features, fold-averaged test
# features and the per-model / per-fold AUC table.
X_train_stack, X_test_stack, auc_score_mat = stacking(
    model_list,
    y_train,
    sample_num=len(train_df),
    test_sample_num=len(test_df),
)

fold: 0
model: 0
model: 1
model: 2
fold: 1
model: 0
model: 1
model: 2
fold: 2
model: 0
model: 1
model: 2
fold: 3
model: 0
model: 1
model: 2
fold: 4
model: 0
model: 1
model: 2


In [20]:
# 10-fold cross-validated AUC of a default LightGBM meta-model on the stacked features.
cross_val_score(
    estimator=LGBMClassifier(),
    X=X_train_stack,
    y=y_train,
    scoring='roc_auc',
    cv=10,
)

array([0.77992977, 0.78799419, 0.78760576, 0.79212439, 0.78430455,
       0.78047858, 0.78801144, 0.78466757, 0.79059221, 0.78856046])

In [42]:
# Same check with the library's default number of folds.
cross_val_score(
    estimator=LGBMClassifier(),
    X=X_train_stack,
    y=y_train,
    scoring='roc_auc',
)

array([0.78460618, 0.78400739, 0.78561764])

In [61]:
# train_df_stacking = pd.DataFrame(X_train_stack, index=train_id)
# test_df_stacking = pd.DataFrame(X_test_stack, index=test_id)

In [63]:
# output = pd.concat((train_df_stacking, test_df_stacking))
# output.columns = ['MY_FEATURE_1', 'MY_FEATURE_2', 'MY_FEATURE_3']


In [64]:
# output.to_csv("../../data/handled/kernel_1/my_features.csv")

In [48]:
# Level-2 model: a default LightGBM classifier trained on the stacked
# out-of-fold features, then used to score the stacked test features.
lgb = LGBMClassifier().fit(X_train_stack, y_train)
test_proba = lgb.predict_proba(X_test_stack)
y_predict = test_proba[:, 1]
del test_proba

In [49]:
# Wrap the predictions in a frame indexed by the test ids.
res = pd.DataFrame(data=y_predict, index=test_id)

In [50]:
# Name the index so the id column gets this header when the frame is saved.
res.index.name = 'SK_ID_CURR'

In [51]:
# Name the single prediction column.
res.columns = ['TARGET']

In [52]:
# Write the predictions, index included, to the working directory.
res.to_csv('submission_stacking.csv')

In [53]:
# Inspect the ids that index the submission frame.
res.index

Int64Index([100001, 100005, 100013, 100028, 100038, 100042, 100057, 100065,
            100066, 100067,
            ...
            456168, 456169, 456170, 456189, 456202, 456221, 456222, 456223,
            456224, 456250],
           dtype='int64', name='SK_ID_CURR', length=48744)

In [60]:
# Validation AUCs collected by stacking(): one row per model, one column per fold.
auc_score_mat

array([[0.78667354, 0.78238815, 0.78157611, 0.78620955, 0.78620213],
       [0.74709485, 0.7484572 , 0.74579839, 0.74225294, 0.74745149],
       [0.73404863, 0.73476041, 0.73187642, 0.72865263, 0.7359247 ]])

In [65]:
# (rows, columns) of the training feature frame.
train_df.shape

(307507, 718)