<a href="https://colab.research.google.com/github/Gaurav-Jagnani/ML_micro_projects/blob/master/Stacking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [0]:
iris = load_iris()
print(iris.feature_names)

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [0]:
X = pd.DataFrame(iris.data, columns=iris.feature_names)
Y = iris.target
print(X.head())

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2


In [0]:
# Remove correlated columns
corr_df = X.corr().abs()
up_tri = np.triu(
	np.full(corr_df.shape, 1), k=1)
up_tri = up_tri.astype(bool)
corr_df = corr_df.where(up_tri)
correlated_cols = [col for col in corr_df if any(corr_df[col] > 0.75)]
print(correlated_cols)

['petal length (cm)', 'petal width (cm)']


In [0]:
X.drop(correlated_cols, axis=1, inplace=True)
print(X.head())

   sepal length (cm)  sepal width (cm)
0                5.1               3.5
1                4.9               3.0
2                4.7               3.2
3                4.6               3.1
4                5.0               3.6


In [0]:
X_train, X_test, Y_train, Y_test = train_test_split(
                                    X, Y, test_size=0.2)

In [0]:
X_train, X_val, Y_train, Y_val = train_test_split(
                                    X_train, Y_train, test_size=0.5)

In [0]:
model1 = RandomForestRegressor()
model2 = LinearRegression()

In [0]:
model1.fit(X_train, Y_train)
model2.fit(X_train, Y_train)
val_pred1 = model1.predict(X_val)
val_pred2 = model2.predict(X_val)
test_pred1 = model1.predict(X_test)
test_pred2 = model2.predict(X_test)
print(r2_score(test_pred1, Y_test))
print(r2_score(test_pred2, Y_test))

0.4591994811593182
0.5625685756995965


In [0]:
val_pred = np.column_stack((val_pred1, val_pred2))
test_pred = np.column_stack((test_pred1, test_pred2))

In [0]:
meta_model = LinearRegression()

In [0]:
meta_model.fit(val_pred, Y_val)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [0]:
blending_pred = meta_model.predict(test_pred)

In [0]:
r2_score(blending_pred, Y_test)

0.6650353944696372