### Basic stacking model

In [2]:
import numpy as np
import pandas as pd

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the breast-cancer dataset and keep the feature matrix and the
# target vector under separate names.
cancer_data = load_breast_cancer()
X_data, y_label = cancer_data.data, cancer_data.target

# Hold out 20% of the rows as the test set; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_label,
    test_size=.2,
    random_state=0,
)

In [3]:
# Base (level-0) learners. Every estimator that accepts a seed gets one so
# that a Restart & Run All reproduces the same scores.
knn_clf = KNeighborsClassifier(n_neighbors=4)
rf_clf = RandomForestClassifier(n_estimators=100, random_state=0)
dt_clf = DecisionTreeClassifier(random_state=0)
ada_clf = AdaBoostClassifier(n_estimators=100, random_state=0)
xgb_clf = XGBClassifier(n_estimators=300)

# Meta (level-1) learner, trained later on the base models' predictions.
lr_clf = LogisticRegression()

# Fit each base learner on the full training split.
knn_clf.fit(X_train, y_train)
rf_clf.fit(X_train, y_train)
dt_clf.fit(X_train, y_train)
ada_clf.fit(X_train, y_train)
xgb_clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=300, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [4]:
# Test-set predictions of every base learner; these are reused below as the
# input features of the meta-learner.
knn_pred = knn_clf.predict(X_test)
rf_pred = rf_clf.predict(X_test)
dt_pred = dt_clf.predict(X_test)
ada_pred = ada_clf.predict(X_test)
xgb_pred = xgb_clf.predict(X_test)

# Report each base learner's accuracy on the held-out split.
for report_format, base_pred in (
    ("knn : %.4f", knn_pred),
    ("rf  : %.4f", rf_pred),
    ("dt  : %.4f", dt_pred),
    ("ada : %.4f", ada_pred),
    ("xgb : %.4f", xgb_pred),
):
    print(report_format % accuracy_score(y_test, base_pred))

knn : 0.9211
rf  : 0.9649
dt  : 0.9123
ada : 0.9561
xgb : 0.9912


In [5]:
# Stack the five prediction vectors as rows, then transpose so that each row
# is one test sample and each column is one base learner.
base_pred_rows = [knn_pred, rf_pred, dt_pred, ada_pred, xgb_pred]
preds = np.array(base_pred_rows).T

In [6]:
# NOTE(review): `preds` holds predictions made on X_test, so the meta-learner
# is both fitted and scored on the same test rows and labels. The accuracy
# printed here is therefore an optimistic figure, not a true held-out
# estimate.
results = lr_clf.fit(preds, y_test).predict(preds)

stacked_accuracy = accuracy_score(y_test, results)
print("Results : %.4f" % stacked_accuracy)

Results : 0.9737


### Stacking model based on a cross-validation (CV) set

In [7]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

def get_stacking_base_datasets(model, X_train_n, y_train_n, X_test_n, n_folds):
  """Build the stacking meta-features that one base model contributes.

  The training rows are split into ``n_folds`` consecutive folds. For each
  fold the model is refitted on the remaining folds, and then used to
  predict (a) the held-out fold and (b) the whole test set.

  Parameters
  ----------
  model : estimator exposing ``fit(X, y)`` and ``predict(X)``
      Refitted in place on every fold; on return it holds the fit from the
      last fold.
  X_train_n : array indexable by an integer index array along rows
      Training features.
  y_train_n : array indexable by an integer index array
      Training labels.
  X_test_n : array
      Test features.
  n_folds : int
      Number of folds.

  Returns
  -------
  train_fold_pred : ndarray of shape (n_train, 1)
      Out-of-fold predictions for every training row.
  test_fold_pred_mean : ndarray of shape (n_test, 1)
      Test-set predictions averaged over the ``n_folds`` fitted models.
  """
  # No random_state here: it has no effect when shuffle=False, and newer
  # scikit-learn releases raise ValueError if it is supplied in that case.
  kf = KFold(n_splits=n_folds, shuffle=False)
  train_fold_pred = np.zeros((X_train_n.shape[0], 1))
  test_fold_pred = np.zeros((X_test_n.shape[0], n_folds))
  print(model.__class__.__name__, ' model starts')

  for folder_index, (train_index, test_index) in enumerate(kf.split(X_train_n)):
    print(folder_index,' folder start =========')
    X_cv_train = X_train_n[train_index]
    y_cv_train = y_train_n[train_index]
    X_cv_test = X_train_n[test_index]

    model.fit(X_cv_train, y_cv_train)
    # Each training row is predicted by a model that never saw it.
    train_fold_pred[test_index] = model.predict(X_cv_test).reshape(-1, 1)
    # One column of test predictions per fold; averaged after the loop.
    test_fold_pred[:, folder_index] = model.predict(X_test_n)

  test_fold_pred_mean = np.mean(test_fold_pred, axis=1).reshape(-1, 1)

  return train_fold_pred, test_fold_pred_mean

In [8]:
# Fresh, unfitted base learners for the CV-based stacking run. Every
# estimator that accepts a seed gets one so the results are reproducible.
knn_clf = KNeighborsClassifier(n_neighbors=4)
rf_clf = RandomForestClassifier(n_estimators=100, random_state=0)
dt_clf = DecisionTreeClassifier(random_state=0)
ada_clf = AdaBoostClassifier(n_estimators=100, random_state=0)
xgb_clf = XGBClassifier(n_estimators=300)

# Number of CV folds shared by all base learners.
N_FOLDS = 7

# For each base learner: out-of-fold predictions on the training split and
# fold-averaged predictions on the test split.
knn_train, knn_test = get_stacking_base_datasets(knn_clf, X_train, y_train, X_test, N_FOLDS)
rf_train, rf_test = get_stacking_base_datasets(rf_clf, X_train, y_train, X_test, N_FOLDS)
dt_train, dt_test = get_stacking_base_datasets(dt_clf, X_train, y_train, X_test, N_FOLDS)
ada_train, ada_test = get_stacking_base_datasets(ada_clf, X_train, y_train, X_test, N_FOLDS)
xgb_train, xgb_test = get_stacking_base_datasets(xgb_clf, X_train, y_train, X_test, N_FOLDS)

KNeighborsClassifier  model starts
RandomForestClassifier  model starts








DecisionTreeClassifier  model starts
AdaBoostClassifier  model starts




XGBClassifier  model starts


In [9]:
# Join the per-model prediction columns side by side: one column per base
# learner, one row per sample.
train_meta_columns = (knn_train, rf_train, dt_train, ada_train, xgb_train)
test_meta_columns = (knn_test, rf_test, dt_test, ada_test, xgb_test)

stack_final_X_train = np.concatenate(train_meta_columns, axis=1)
stack_final_X_test = np.concatenate(test_meta_columns, axis=1)

# Original feature shapes, followed by the stacked meta-feature shapes.
print(X_train.shape, X_test.shape)
print(stack_final_X_train.shape, stack_final_X_test.shape)

(455, 30) (114, 30)
(455, 5) (114, 5)


In [10]:
# Meta-learner: fitted on the training meta-features, then evaluated on the
# test meta-features.
lr_clf = LogisticRegression()
result = lr_clf.fit(stack_final_X_train, y_train).predict(stack_final_X_test)

print("Accuracy : %.4f" % accuracy_score(y_test, result))

Accuracy : 0.9825
