# **Ensemble Learning in Practice**

This notebook covers implementations of main ensemble learning techniques: voting, averaging, stacking, bagging and boosting

In [146]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
%matplotlib inline
import seaborn as sns
from sklearn import datasets

from sklearn.datasets import make_classification, make_regression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

from mlxtend.classifier import StackingClassifier
from mlxtend.regressor import StackingCVRegressor
from sklearn.svm import SVR
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostRegressor

## **1. Voting (Hard voting)**

In [79]:
# Create dataset
X, y = make_classification(n_samples=10000, n_features=5)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize individual models
clf_1 = KNeighborsClassifier()
clf_2 = LogisticRegression()
clf_3 = DecisionTreeClassifier()

# Create voting classifier
voting_ens = VotingClassifier(
    estimators=[('knn', clf_1), ('lr', clf_2), ('dt', clf_3)], voting='hard')

for clf in (clf_1, clf_2, clf_3, voting_ens):
  clf.fit(X_train, y_train)
  y_pred = clf.predict(X_test)
  print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

KNeighborsClassifier 0.9385
LogisticRegression 0.929
DecisionTreeClassifier 0.932
VotingClassifier 0.942


## **2. Averaging (Soft voting)**

In [81]:
from sklearn.datasets import load_boston
X, y = load_boston(return_X_y=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20) 

# Initialize individual models
reg1 = DecisionTreeRegressor()
reg2 = LinearRegression()

# Create voting regressor
voting_ens = VotingRegressor(estimators=[('dt', reg1), ('lr', reg2)], weights=[2,1])

# Fit and predict with the models and enseble
for reg in (reg1, reg2, voting_ens):
   reg.fit(X_train, y_train)
   y_pred = reg.predict(X_test)
   print(reg.__class__.__name__, mean_absolute_error(y_test, y_pred))

DecisionTreeRegressor 3.0392156862745106
LinearRegression 3.2648171537970883
VotingRegressor 2.5586572390061875


## **3. Stacking**

In [105]:
# Standard Stacking for Classification
from mlxtend.classifier import StackingClassifier

In [104]:
# Create dataset
X, y = make_classification(n_samples=10000, n_features=15)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize individual models
clf_1 = KNeighborsClassifier()
clf_2 = GaussianNB()
clf_3 = DecisionTreeClassifier()

# Initialize meta-model
clf_meta = LogisticRegression()

# Create stacking classifier
clf_stack = StackingClassifier(classifiers=[clf_1, clf_2, clf_3], 
                               meta_classifier=clf_meta,
                               use_probas=False,  
                               use_features_in_secondary=False)

for clf in (clf_1, clf_2, clf_3, clf_meta, clf_stack):
  clf.fit(X_train, y_train)
  y_pred = clf.predict(X_test)
  print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

KNeighborsClassifier 0.84
GaussianNB 0.8395
DecisionTreeClassifier 0.895
LogisticRegression 0.8595
StackingClassifier 0.898


In [106]:
# Stacking with cross-validation for regression
from mlxtend.regressor import StackingCVRegressor

In [115]:
from sklearn.datasets import load_boston
X, y = load_boston(return_X_y=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20) 

# Initialize individual models
reg1 = DecisionTreeRegressor()
reg2 = SVR()

# Create meta-model 
meta_model = LinearRegression()

# Create stacking classifier
reg_stack = StackingCVRegressor(regressors=[reg1, reg2], 
                               meta_regressor=meta_model,
                               use_features_in_secondary=False)

for reg in (reg1, reg2, meta_model, reg_stack):
  reg.fit(X_train, y_train)
  y_pred = reg.predict(X_test)
  print(reg.__class__.__name__, mean_absolute_error(y_test, y_pred))

DecisionTreeRegressor 3.327450980392158
SVR 5.222470732130654
LinearRegression 3.2191778452433755
StackingCVRegressor 2.933358665677874


## **4. Bagging**

In [116]:
from sklearn.ensemble import BaggingClassifier

In [143]:
# Create dataset
X, y = make_classification(n_samples=10000, n_features=5)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize weak model 
base_model = DecisionTreeClassifier(max_depth=3)

# Create bagging classifier
clf_bagging = BaggingClassifier(base_estimator=base_model, n_estimators=100, oob_score=True)

clf_bagging.fit(X_train, y_train)
print(clf_bagging.oob_score_)


0.918625


In [144]:
pred = clf_bagging.predict(X_test)
print(accuracy_score(y_test, pred))

0.916


## **5. Boosting**

In [145]:
from sklearn.ensemble import AdaBoostRegressor

In [147]:
from sklearn.datasets import load_boston
X, y = load_boston(return_X_y=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20) 

In [151]:
# Initialize weak model 
base_model = LinearRegression(normalize=True)

# Create AdaBoost regressor
reg_adaboost = AdaBoostRegressor(base_estimator=base_model, n_estimators=100, random_state=500)
reg_adaboost.fit(X_train, y_train)

# Predict and compare with y_test
pred = reg_adaboost.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, pred))
print('RMSE:', rmse)

RMSE: 4.1854762804154415
