<a href="https://colab.research.google.com/github/LoosonWu/geek_homework/blob/main/stacking_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:

# Fetch the CSV files used below: this clones the LoosonWu/data repository into
# ./data, the directory load_data() reads train_final.csv and test_final.csv from.
# NOTE(review): re-running this cell when ./data already exists makes git report
# an error instead of refreshing the checkout.
!git clone https://github.com/LoosonWu/data.git


Cloning into 'data'...
remote: Enumerating objects: 8, done.[K
remote: Counting objects: 100% (8/8), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 8 (delta 1), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (8/8), done.


In [3]:
import lightgbm as lgb  
import pandas as pd  
import numpy as np  
import pickle  
from sklearn.metrics import roc_auc_score  
from sklearn.model_selection import train_test_split  
from sklearn.metrics import accuracy_score 
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.datasets.samples_generator import make_blobs




In [125]:
def load_data(train_path='./data/train_final.csv', test_path='./data/test_final.csv'):
    """Load the train and test CSV files and split each into features and labels.

    For each file, the first 15 columns are taken as candidate features, the two
    ``*_joint`` columns (``continuous_annual_inc_joint`` and
    ``continuous_dti_joint``) are dropped from them, and any remaining missing
    feature values are replaced with 0. The column at position 15 is the label.

    Parameters
    ----------
    train_path : str, optional
        Path of the training CSV. Defaults to the file inside the cloned
        ``./data`` directory.
    test_path : str, optional
        Path of the test CSV. Defaults to the file inside the cloned
        ``./data`` directory.

    Returns
    -------
    tuple
        ``(train_x, train_y, test_x, test_y)`` where the ``*_x`` items are
        DataFrames of features and the ``*_y`` items are Series of labels.
    """
    dropped_columns = ['continuous_annual_inc_joint', 'continuous_dti_joint']

    def _read_split(csv_path):
        # One shared code path so train and test are always prepared identically.
        frame = pd.read_csv(csv_path, engine='python')
        labels = frame.iloc[:, 15]
        # Use `columns=` rather than a positional axis (`drop(labels, 1)`): the
        # positional form was deprecated in pandas 1.3 and is rejected from 2.0.
        features = frame.iloc[:, 0:15].drop(columns=dropped_columns).fillna(0)
        return features, labels

    train_x, train_y = _read_split(train_path)
    test_x, test_y = _read_split(test_path)
    return train_x, train_y, test_x, test_y

In [139]:
# Read both CSV splits; load_data() returns feature frames and label series
# for train and test, in that order.
print("Loading Data ... ")
train_x, train_y, test_x, test_y = load_data()

Loading Data ... 


In [128]:
# Add the same derived feature to both splits:
#   (annual income / 12) / (installment + funded amount * interest rate / 100 / 12)
# NOTE(review): a row whose denominator is 0 yields inf/NaN here - confirm the
# data cannot contain such rows.
for frame in (train_x, test_x):
    monthly_income = frame['continuous_annual_inc'] / 12
    monthly_rate_charge = frame['continuous_funded_amnt'] * frame['continuous_int_rate'] / 100 / 12
    monthly_outgoings = frame['continuous_installment'] + monthly_rate_charge
    frame['continuous_income_expense_ratio'] = monthly_income / monthly_outgoings

In [130]:
# First-level learners consumed by stacking(), in a fixed order:
# random forest (gini, entropy), extra trees (gini, entropy), gradient boosting.
clfs = [
    ensemble_cls(n_estimators=5, n_jobs=-1, criterion=split_criterion)
    for ensemble_cls in (RandomForestClassifier, ExtraTreesClassifier)
    for split_criterion in ('gini', 'entropy')
]
clfs.append(
    GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=5)
)

In [131]:
def stacking(train_x, train_y, x_predict, y_predict, n_folds):
    """Fit a two-level stacking ensemble and print its AUC on the prediction set.

    Level 1: every classifier in the module-level ``clfs`` list is trained with
    stratified ``n_folds``-fold cross-validation on the training data. Its
    out-of-fold positive-class probabilities form one column of the level-2
    training matrix. Its probabilities for ``x_predict``, averaged over the
    folds, form the matching column of the level-2 test matrix.

    Level 2: a GradientBoostingClassifier is fitted on the level-2 training
    matrix and scored on the level-2 test matrix.

    Parameters
    ----------
    train_x : numpy.ndarray
        Training features, one row per sample. Rows are selected with integer
        index arrays, so this must be a NumPy array rather than a DataFrame.
    train_y : numpy.ndarray
        Training labels, aligned with ``train_x``.
    x_predict : numpy.ndarray
        Features of the held-out set to score.
    y_predict : array-like
        True labels of the held-out set, used only for the printed AUC values.
    n_folds : int
        Number of stratified folds for the first level.

    Returns
    -------
    None
        Results are reported via ``print``: the fold index for every fit, one
        AUC line per first-level model, and a final AUC line for the
        second-level model.
    """
    n_train = train_x.shape[0]
    n_predict = x_predict.shape[0]

    dataset_blend_train = np.zeros((n_train, len(clfs)))
    # These hold predictions for x_predict, so they are sized by the prediction
    # set. Sizing them by the training labels only works when both sets happen to
    # have the same number of rows.
    dataset_blend_test = np.zeros((n_predict, len(clfs)))
    skf = StratifiedKFold(n_splits=n_folds)

    # Level 1: one column per base model.
    for j, clf in enumerate(clfs):
        dataset_blend_test_j = np.zeros((n_predict, n_folds))
        for i, (train, test) in enumerate(skf.split(train_x, train_y)):
            print('i=',i)
            clf.fit(train_x[train], train_y[train])
            # Out-of-fold predictions: each training row is scored by a model
            # that did not see it during fitting.
            dataset_blend_train[test, j] = clf.predict_proba(train_x[test])[:, 1]
            # Score the held-out set with this fold's model; one column per fold.
            dataset_blend_test_j[:, i] = clf.predict_proba(x_predict)[:, 1]
        # Average the per-fold predictions to get this model's level-2 feature.
        dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)
        print("val auc Score: %f" % roc_auc_score(y_predict, dataset_blend_test[:, j]))

    # Level 2: a single model fitted on the first-level predictions.
    clf = GradientBoostingClassifier(learning_rate=0.02, subsample=0.5, max_depth=6, n_estimators=30)
    clf.fit(dataset_blend_train, train_y)
    y_submission = clf.predict_proba(dataset_blend_test)[:, 1]
    print("val auc Score: %f" % (roc_auc_score(y_predict, y_submission)))

In [140]:
# Print small floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)
# stacking() selects rows with integer index arrays, so pass plain NumPy arrays
# rather than the pandas objects returned by load_data().
npa_train_x, npa_train_y, npa_test_x, npa_test_y = (
    np.array(part) for part in (train_x, train_y, test_x, test_y)
)
# Run the two-level ensemble with 5 stratified folds.
stacking(npa_train_x, npa_train_y, npa_test_x, npa_test_y, 5)

i= 0
i= 1
i= 2
i= 3
i= 4
val auc Score: 0.947179
i= 0
i= 1
i= 2
i= 3
i= 4
val auc Score: 0.947809
i= 0
i= 1
i= 2
i= 3
i= 4
val auc Score: 0.946946
i= 0
i= 1
i= 2
i= 3
i= 4
val auc Score: 0.947526
i= 0
i= 1
i= 2
i= 3
i= 4
val auc Score: 0.955308
val auc Score: 0.954491


In [132]:
# Wrap features and labels as LightGBM datasets; the test set is linked to the
# training set through `reference`.
lgb_train = lgb.Dataset(data=train_x, label=train_y)
lgb_test = lgb.Dataset(data=test_x, label=test_y, reference=lgb_train)

In [133]:
# LightGBM training configuration, passed unchanged to lgb.train().
params = dict(
    boosting_type='gbdt',
    objective='binary',
    metric={'binary_logloss', 'auc'},  # both metrics are reported on the validation set
    num_leaves=256,
    max_depth=8,
    learning_rate=0.005,
    lambda_l1=1,
    is_unbalance=True,
)


In [134]:
# train
# Boost for at most 10000 rounds using `params`, evaluating on lgb_test, and stop
# early once the validation scores have not improved for 1000 rounds.
# NOTE(review): the validation set here is the test set, and the accuracy
# reported later is computed on that same test set, so the stopping point is
# tuned on the data used for the final score.
# NOTE(review): the `early_stopping_rounds` keyword of lgb.train was removed in
# LightGBM 4.0 (replaced by callbacks=[lgb.early_stopping(...)]) - confirm the
# installed version before re-running.
print('Start training...')  
gbm = lgb.train(params,lgb_train,num_boost_round=10000,valid_sets=lgb_test,early_stopping_rounds=1000)  
print('Finished training...')  

Start training...
[1]	valid_0's binary_logloss: 0.490925	valid_0's auc: 0.948834
Training until validation scores don't improve for 1000 rounds.
[2]	valid_0's binary_logloss: 0.487616	valid_0's auc: 0.951208
[3]	valid_0's binary_logloss: 0.484373	valid_0's auc: 0.951877
[4]	valid_0's binary_logloss: 0.481203	valid_0's auc: 0.951862
[5]	valid_0's binary_logloss: 0.478096	valid_0's auc: 0.951571
[6]	valid_0's binary_logloss: 0.475053	valid_0's auc: 0.951646
[7]	valid_0's binary_logloss: 0.472068	valid_0's auc: 0.95181
[8]	valid_0's binary_logloss: 0.469143	valid_0's auc: 0.951714
[9]	valid_0's binary_logloss: 0.466273	valid_0's auc: 0.9517
[10]	valid_0's binary_logloss: 0.463457	valid_0's auc: 0.951536
[11]	valid_0's binary_logloss: 0.460693	valid_0's auc: 0.951829
[12]	valid_0's binary_logloss: 0.45798	valid_0's auc: 0.95195
[13]	valid_0's binary_logloss: 0.455322	valid_0's auc: 0.951867
[14]	valid_0's binary_logloss: 0.452706	valid_0's auc: 0.95201
[15]	valid_0's binary_logloss: 0.4501

In [135]:
# Score the test features with the trained booster, limited to its best iteration.
preds = gbm.predict(test_x, num_iteration=gbm.best_iteration)
# Number of predictions produced (also used by the thresholding cell below).
size = preds.size
print(size)

50000


In [136]:
# Turn predicted scores into hard 0/1 labels: anything at or above the threshold
# becomes 1, everything else 0.
threshold = 0.5
# A single vectorised comparison replaces the element-by-element Python loop.
# Assigning through `preds[:]` writes into the existing array, so `preds` keeps
# its dtype and every reference to it sees the binarised values, as before.
preds[:] = preds >= threshold

In [137]:
# Fraction of test rows whose thresholded prediction matches the true label.
accuracy = accuracy_score(y_true=test_y, y_pred=preds)
print(accuracy)

0.9014
