## model ensemble
1.准备数据-载入各模型的预测概率  
2.融合训练-找到各模型的权重  
3.模型融合-Random Forest,LightGBM,XGBoost_LR,Wide and Deep

In [1]:
from sklearn.tree import DecisionTreeClassifier
from lightgbm.sklearn import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
import lightgbm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

### 1.准备数据

In [2]:
rf_pred = pickle.load(open("./model/rf_pred.pkl","rb")).astype('float32')[:,1]
lgbm_pred = pickle.load(open("./model/lgbm_pred.pkl","rb")).astype('float32')[:,1]
xgb_lr_pred = pickle.load(open("./model/xgb_lr_pred.pkl","rb")).astype('float32')[:,1]
wd_pred = pickle.load(open("./model/wd_pred.pkl","rb")).astype('float32')[:,1]

In [3]:
%%time
X_train = np.hstack((rf_pred.reshape(-1,1),lgbm_pred.reshape(-1,1),\
                     xgb_lr_pred.reshape(-1,1),wd_pred.reshape(-1,1)))

Wall time: 120 ms


In [4]:
X_train.shape

(7377403, 4)

In [5]:
y_train = pickle.load(open("./model/y_train.pkl","rb")).astype('int32')

### 2.融合训练

2.1 LR

In [17]:
%%time
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(n_jobs=4)
lr.fit(X_train,y_train)

  " = {}.".format(effective_n_jobs(self.n_jobs)))


Wall time: 13.8 s


In [18]:
y_proba = lr.predict_proba(X_train)
auc = roc_auc_score(y_train,y_proba[:,1])
print(auc)

0.8005876565437465


In [19]:
y_pred = lr.predict(X_train)
accuracy = accuracy_score(y_train,y_pred)
print(accuracy)

0.7208178813059284


In [21]:
lr.coef_

array([[-0.492108  ,  6.74121672, -1.24454395,  0.06377891]])

2.2 LRCV

In [23]:
%%time
from sklearn.linear_model import LogisticRegressionCV
lrcv = LogisticRegressionCV(n_jobs=4,cv=5,random_state=6)
lrcv.fit(X_train,y_train)

Wall time: 2min 18s


In [24]:
y_proba_cv = lrcv.predict_proba(X_train)
auc_cv = roc_auc_score(y_train,y_proba_cv[:,1])
print(auc_cv)

0.8005876565437465


2.3 Tree

In [6]:
%%time
tree = DecisionTreeClassifier(max_depth=8,min_samples_split=50,random_state=6)
tree.fit(X_train,y_train)

Wall time: 55.3 s


In [7]:
y_proba_tree = tree.predict_proba(X_train)

In [8]:
auc_tree = roc_auc_score(y_train,y_proba_tree[:,1])
print(auc_tree)

0.8006886022627495


In [9]:
tree.feature_importances_

array([0.00217747, 0.99290451, 0.00383019, 0.00108782])

### 3.模型融合

In [20]:
blend_w = dict(rf=0.12,lgbm=0.6,xgb_lr=0.22,wd=0.06)
blend_pred = rf_pred*blend_w['rf']+lgbm_pred*blend_w['lgbm']+\
       xgb_lr_pred*blend_w['xgb_lr']+wd_pred*blend_w['wd']

In [21]:
blend_auc = roc_auc_score(y_train,blend_pred)
print(blend_auc)

0.7963020748786793
