<a href="https://colab.research.google.com/github/Gaurav-Jagnani/ML_micro_projects/blob/master/Stacking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

In [300]:
# Fetch the Iris dataset bundled with scikit-learn and list its feature names
iris = load_iris()
print(iris["feature_names"])

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [301]:
# Y holds the dataset's target values; X wraps the measurements in a DataFrame
# whose columns carry the feature names.
Y = iris.target
X = pd.DataFrame(data=iris.data, columns=iris.feature_names)
print(X.head(5))

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2


In [302]:
# Find columns that are strongly correlated with a column to their left.
# Only the strict upper triangle of the absolute correlation matrix is kept,
# so each pair is inspected once and the diagonal (always 1) is ignored.
corr_df = X.corr().abs()
up_tri = np.triu(np.full(corr_df.shape, True), k=1)
corr_df = corr_df.where(up_tri)
# Comparisons against the masked-out NaN cells are False, so they never match.
exceeds_threshold = (corr_df > 0.75).any(axis=0).to_numpy()
correlated_cols = corr_df.columns[exceeds_threshold].tolist()
print(correlated_cols)

['petal length (cm)', 'petal width (cm)']


In [303]:
# Rebind X to a reduced copy instead of dropping in place: an in-place drop
# raises KeyError when this cell is executed a second time, because the
# columns are already gone. errors="ignore" keeps the cell safely re-runnable.
X = X.drop(columns=correlated_cols, errors="ignore")
print(X.head())

   sepal length (cm)  sepal width (cm)
0                5.1               3.5
1                4.9               3.0
2                4.7               3.2
3                4.6               3.1
4                5.0               3.6


In [0]:
# Hold out 20% of the rows for the final evaluation.
# random_state pins the shuffle so the split, and every score derived from it,
# is reproducible after a kernel restart; stratify=Y keeps the class
# proportions the same in the training and test parts.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=Y)

In [0]:
def stacking(model, n_folds, X_train, X_test, y=None):
    """Build the level-one stacking features produced by one base model.

    Out-of-fold predictions for the training rows are collected with
    stratified K-fold cross-validation. Test-set predictions come from the
    model refitted on the complete training set, so they do not depend on
    which fold happened to be processed last.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is refitted once
        per fold and is left fitted on all of ``X_train`` when the call returns.
    n_folds : int
        Number of folds passed to ``StratifiedKFold``.
    X_train : pandas.DataFrame
        Training features; rows are selected positionally with ``.iloc``.
    X_test : pandas.DataFrame
        Features of the hold-out rows to predict.
    y : array-like, optional
        Training labels aligned row-for-row with ``X_train``. When omitted,
        the notebook-level ``Y_train`` is used (the original behaviour).

    Returns
    -------
    val_preds : numpy.ndarray of shape (n_train, 1)
        Out-of-fold predictions, concatenated in fold order.
    test_pred : array
        Whatever ``model.predict(X_test)`` returns after the final refit.
    Y_reindexed : list
        Training labels in the same fold order as ``val_preds``; fit the
        meta-model against these rather than the original label order.
    """
    # Convert to a plain array so the positional fold indices below cannot be
    # misread as index labels (as they would be on a pandas Series).
    labels = np.asarray(Y_train if y is None else y)

    sFold = StratifiedKFold(n_folds)
    val_preds = []
    Y_reindexed = []
    for train_idx, val_idx in sFold.split(X_train, labels):
        model.fit(X_train.iloc[train_idx], labels[train_idx])
        val_preds.extend(model.predict(X_train.iloc[val_idx]))
        Y_reindexed.extend(labels[val_idx])

    # Refit on every training row before touching the test set; otherwise the
    # test predictions would come from a model trained on only the last
    # fold's training portion.
    model.fit(X_train, labels)
    test_pred = model.predict(X_test)

    # Column vector so several models' outputs can be stacked side by side.
    val_preds = np.array(val_preds).reshape(-1, 1)

    return val_preds, test_pred, Y_reindexed

In [0]:
# Base learner 1: a decision tree. random_state is pinned because the tree
# builder is randomised, so an unseeded tree can give different stacking
# features on every run.
model1 = DecisionTreeClassifier(random_state=42)
val_pred1, test_pred1, Y_reindexed = stacking(
                        model1, 5, X_train, X_test)

In [0]:
# Base learner 2: k-nearest neighbours, run through the same 5-fold stacking
# routine as the first model.
model2 = KNeighborsClassifier()
val_pred2, test_pred2, Y_reindexed = stacking(
    model=model2, n_folds=5, X_train=X_train, X_test=X_test)

In [0]:
# Assemble the meta-model inputs: one column per base model, first for the
# out-of-fold (training) predictions and then for the test predictions.
val_pred, test_pred = (
    np.column_stack(per_model)
    for per_model in ((val_pred1, val_pred2), (test_pred1, test_pred2))
)

In [309]:
# Level-two learner: logistic regression trained on the base models'
# out-of-fold predictions, then scored on the held-out test rows.
meta_model = LogisticRegression()
meta_model.fit(val_pred, Y_reindexed)
stacking_pred = meta_model.predict(test_pred)
# accuracy_score expects (y_true, y_pred); the result is the same either way,
# but the documented order keeps this consistent with other metrics that are
# not symmetric.
print(accuracy_score(Y_test, stacking_pred))

0.8
