<a href="https://colab.research.google.com/github/HagarIbrahiem/Ensemble-Methods-in-Machine-Learning/blob/main/Ensemble_Methods_in_Machine_Learning_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Intro

The datasets used in this tutorial are Sklearn:
- `Make_moon` Dataset for Classification
- `Diabetes` Dataset for Regression


# Import Libs

In [16]:
# imports
import pandas as pd
import numpy as np
import time


## datasets
from sklearn import datasets
from sklearn.datasets import make_moons


## ML models
# classification
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Regression
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
# Ensemble
from sklearn.ensemble import RandomForestClassifier , RandomForestRegressor , GradientBoostingRegressor
from sklearn.ensemble import VotingClassifier,VotingRegressor,BaggingClassifier, StackingRegressor

## metrics
from sklearn.metrics import accuracy_score, f1_score ,  r2_score ,  mean_squared_error
from sklearn.model_selection import train_test_split

## visualizations
import matplotlib.pyplot as plt
import seaborn as sns

## ignore warnings
import warnings
warnings.filterwarnings('ignore')


# Load Data

In [17]:
# Load the make_moons dataset - classification
X, y =make_moons(n_samples=10000, noise=0.4)

# Split the data into training and testing sets
X_train_iris, X_test_iris, y_train_iris, y_test_iris = train_test_split(X, y, random_state = 10, test_size = 0.30)


In [18]:

# Load the diabetes dataset - regression
diabetes = datasets.load_diabetes(as_frame=True)

# Separate out the data
X_diabetes = diabetes['data']
y_diabetes = diabetes['target']

# Split the data into training and testing sets
X_train_diabetes, X_test_diabetes, y_train_diabetes, y_test_diabetes = train_test_split(X_diabetes, y_diabetes, random_state = 50, test_size = 0.2)

# Classification - Single Predictor - Iris Dataset

In [19]:
_decisionTreeClassifier = DecisionTreeClassifier()

t0 = time.time()
_decisionTreeClassifier.fit(X_train_iris, y_train_iris)
Acc_dt_iris = accuracy_score(y_test_iris, _decisionTreeClassifier.predict(X_test_iris))
t1 = time.time()

print("Acc-Decision Tree:",Acc_dt_iris )
print("Computation Time: {}".format(round(t1-t0 , 4))  , '\n')


Acc-Decision Tree: 0.8006666666666666
Computation Time: 0.0256 



In [20]:
_decisionTreeClassifier = DecisionTreeClassifier()

t0 = time.time()
_decisionTreeClassifier.fit(X_train_iris, y_train_iris)
Acc_dt_iris = accuracy_score(y_test_iris, _decisionTreeClassifier.predict(X_test_iris))
t1 = time.time()

print("Acc-Decision Tree:",Acc_dt_iris )
print("Computation Time: {}".format(round(t1-t0 , 4))  , '\n')


Acc-Decision Tree: 0.802
Computation Time: 0.0315 



# Regression - Single Predictor - Diabetes Dataset

In [21]:
dt_reg = DecisionTreeRegressor()

t0 = time.time()
dt_reg.fit(X_train_diabetes, y_train_diabetes)
y_pred = dt_reg.predict(X_test_diabetes)
t1 = time.time()

print("RMSE-DecisionTreeRegressor:",mean_squared_error(y_test_diabetes, y_pred , squared=False) )
print("Computation Time: {}".format(round(t1-t0 , 4))  , '\n')



RMSE-DecisionTreeRegressor: 80.05510742431804
Computation Time: 0.0103 



# **Ensemble Methods**

# Ensemble Voting

Train different ML Predictors on same dataset:
- Voting Classifier - Iris dataset
- Voting Regressor - Diabates dataset

## Voting Classifier

In [22]:
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
xgboost_clf = XGBClassifier()
voting_clf = VotingClassifier( estimators=[('lr', log_clf), ('rf', rnd_clf), ('xgb', xgboost_clf)], voting='hard')

for clf in (log_clf, rnd_clf, xgboost_clf, voting_clf ):
   t0 = time.time()
   clf.fit(X_train_iris, y_train_iris)
   y_pred = clf.predict(X_test_iris)
   t1 = time.time()

   print('Acc-',clf.__class__.__name__, ':', accuracy_score(y_test_iris, y_pred) )
   print("Computation Time: {}".format(round(t1-t0,4))  , '\n')


Acc- LogisticRegression : 0.8353333333333334
Computation Time: 0.0113 

Acc- RandomForestClassifier : 0.853
Computation Time: 1.0809 

Acc- XGBClassifier : 0.8606666666666667
Computation Time: 0.4673 

Acc- VotingClassifier : 0.8623333333333333
Computation Time: 1.5495 



**There you have it! The voting classifier slightly outperforms all the individual classifiers.**

### Hard Vs. Soft Voting

In [23]:
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC( probability = True) # to enable predict_propba()
xgboost_clf = XGBClassifier()
voting_clf = VotingClassifier( estimators=[('lr', log_clf), ('rf', rnd_clf), ('xgb', xgboost_clf)], voting='soft') ## soft voting


for clf in (log_clf, rnd_clf,xgboost_clf, voting_clf ):
   t0 = time.time()
   clf.fit(X_train_iris, y_train_iris)
   y_pred = clf.predict(X_test_iris)
   t1 = time.time()
   print('Acc-',clf.__class__.__name__,':', accuracy_score(y_test_iris, y_pred))
   print("Computation Time: {}".format(round(t1-t0,4))  , '\n')


Acc- LogisticRegression : 0.8353333333333334
Computation Time: 0.0113 

Acc- RandomForestClassifier : 0.855
Computation Time: 1.0847 

Acc- XGBClassifier : 0.8606666666666667
Computation Time: 0.4497 

Acc- VotingClassifier : 0.8633333333333333
Computation Time: 3.2065 



## Voting Regressor

In [24]:
lin_reg = LinearRegression()
rnd_reg = RandomForestRegressor()
svr_reg = SVR( )
dr_reg = DecisionTreeRegressor()
knn_reg = KNeighborsRegressor()
voting_reg = VotingRegressor( estimators=[ ('lr', lin_reg), ('rf', rnd_reg), ('svr', svr_reg) , ('dt', dr_reg) , ('knn',knn_reg)])


for reg in (lin_reg, rnd_reg, svr_reg,dr_reg,knn_reg, voting_reg ):
   t0 = time.time()
   reg.fit(X_train_diabetes, y_train_diabetes)
   y_pred = reg.predict(X_test_diabetes)
   t1 = time.time()

   print('RMSE-',reg.__class__.__name__,':', mean_squared_error(y_test_diabetes, y_pred , squared=False) )
   print("Computation Time: {}".format(round(t1-t0,4))  , '\n')

RMSE- LinearRegression : 51.487471054434295
Computation Time: 0.007 

RMSE- RandomForestRegressor : 56.740281858599
Computation Time: 0.3296 

RMSE- SVR : 67.43165774761839
Computation Time: 0.0139 

RMSE- DecisionTreeRegressor : 78.10573357940842
Computation Time: 0.0057 

RMSE- KNeighborsRegressor : 58.31968465258988
Computation Time: 0.0066 

RMSE- VotingRegressor : 55.74637330472197
Computation Time: 0.3383 



# Ensemble Bagging

Train same ML Predictors on different subsets (With Replacments)

In [25]:
# define base predictor and bagging classifier
bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators = 500 , bootstrap=True) #base predictor may be = XGBClassifier() or any other.

t0 = time.time()
bag_clf.fit(X_train_iris, y_train_iris)
y_pred = bag_clf.predict(X_test_iris)
t1 = time.time()

print("Acc-Bagging Classifier :",accuracy_score(y_test_iris, y_pred ) )
print("Computation time:", round(t1-t0,4))

Acc-Bagging Classifier : 0.852
Computation time: 8.3594


# Ensemble Pasting

Train same ML Predictors on different subsets (Without Replacments)

In [26]:
# define base predictor and bagging classifier
pasting_clf = BaggingClassifier( XGBClassifier (), n_estimators = 300 , bootstrap=False) # if you want to use pasting instead, just set bootstrap=False

t0 = time.time()
pasting_clf.fit(X_train_iris, y_train_iris)
y_pred_pasting= pasting_clf.predict(X_test_iris)
t1 = time.time()
time_pasting_iris = round(t1-t0 , 4)

Acc_pasting_iris = accuracy_score(y_test_iris, y_pred_pasting )

print("Acc-Pasting Classifier :",Acc_pasting_iris )
print("Computation time:", time_pasting_iris)

Acc-Pasting Classifier : 0.8606666666666667
Computation time: 159.0112


Same code of Bagging/Pasting Classifiers using DecisionTreeClassifier as a base estimator can be replaced with the following code using Randome Forest.

 Random forest algorithm itself  is an ensemble learning technique combining the outputs of numerous decision trees classifiers to enhance a model’s performance, generally trained via the bagging method (or sometimes pasting) , thus instead of building a BaggingClassifier and passing
it a DecisionTreeClassifier, you can instead use the RandomForestClassifier
class, which is more convenient and optimized for Decision Trees


In [27]:

_randomForestClassifier = RandomForestClassifier(n_estimators=500)

t0 = time.time()
_randomForestClassifier.fit(X_train_iris, y_train_iris)
y_pred_rf= _randomForestClassifier.predict(X_test_iris)
t1 = time.time()

print("Acc-Random Forest :",accuracy_score(y_test_iris, y_pred_rf ) )
print("Computation time:", round(t1-t0 , 4))

Acc-Random Forest : 0.8536666666666667
Computation time: 5.1017


# Ensemble Stacking

#### **Implement Stacking using (np.column_stack)**

In [28]:
# Define the base models
models = [
    ('lr', LinearRegression()),
    ('dt', DecisionTreeRegressor()),
    ('gb', GradientBoostingRegressor(random_state=42))
]

# Train the base models and make predictions
base_predictions = []
for name, model in models:
    model.fit(X_train_diabetes, y_train_diabetes)
    predictions = model.predict(X_test_diabetes)
    base_predictions.append(predictions)

# Create a meta-model (second-level model) to combine the base model predictions
meta_model = LinearRegression()
meta_X = np.column_stack(base_predictions)

# Train the meta-model on the base model predictions
meta_model.fit(meta_X, y_test_diabetes)

# Make predictions using the stacked ensemble
stacked_predictions = []
for predictions in base_predictions:
    stacked_predictions.append(predictions)
stacked_predictions = np.column_stack(stacked_predictions)
ensemble_predictions = meta_model.predict(stacked_predictions)

# Calculate the RMSE for the ensemble prediction
ensemble_rmse = mean_squared_error(y_test_diabetes, ensemble_predictions,  squared=False)
print("Ensemble Staking RMSE:", ensemble_rmse)

Ensemble Staking RMSE: 50.449000640632555


**Implementing stacking, we do obtain a better result compared to individual models!**

### Ensemble of ensembles ( using 2 MetaModels)

In [29]:
# Define the base models
base_models = [
    ('lr', LinearRegression()),
    ('dt', DecisionTreeRegressor()),
    ('gb', GradientBoostingRegressor())
]


# Define the first-level meta-models
meta_models = [
    LinearRegression(),
    RandomForestRegressor()
]

# Train the base models and make predictions
base_predictions = []
for name, model in base_models:
    model.fit(X_train_diabetes, y_train_diabetes)
    predictions = model.predict(X_test_diabetes)
    base_predictions.append(predictions)

# Create the inputs for the second-level meta-models
meta_inputs = np.column_stack(base_predictions)

# Train the second-level meta-models
ensemble_predictions = []
for model in meta_models:
    model.fit(meta_inputs, y_test_diabetes)
    predictions = model.predict(meta_inputs)
    ensemble_predictions.append(predictions)

# Take the average of predictions from the second-level meta-models
ensemble_predictions = np.mean(ensemble_predictions, axis=0)

# Calculate the RMSE for the ensemble prediction
ensemble_rmse = np.sqrt(mean_squared_error(y_test_diabetes, ensemble_predictions))
print("Ensemble RMSE:", ensemble_rmse)

Ensemble RMSE: 34.41134490607928


### Ensemble of ensembles ( using 3 MetaModels)

In [30]:
# Define the base models
base_models = [
    ('lr', LinearRegression()),
    ('dt', DecisionTreeRegressor()),
    ('gb', GradientBoostingRegressor())
]


# Define the first-level meta-models
meta_models = [
    LinearRegression(),
    RandomForestRegressor(),
    GradientBoostingRegressor()
]

# Train the base models and make predictions
base_predictions = []
for name, model in base_models:
    model.fit(X_train_diabetes, y_train_diabetes)
    predictions = model.predict(X_test_diabetes)
    base_predictions.append(predictions)

# Create the inputs for the second-level meta-models
meta_inputs = np.column_stack(base_predictions)

# Train the second-level meta-models
ensemble_predictions = []
for model in meta_models:
    model.fit(meta_inputs, y_test_diabetes)
    predictions = model.predict(meta_inputs)
    ensemble_predictions.append(predictions)

# Take the average of predictions from the second-level meta-models
ensemble_predictions = np.mean(ensemble_predictions, axis=0)

# Calculate the RMSE for the ensemble prediction
ensemble_rmse = np.sqrt(mean_squared_error(y_test_diabetes, ensemble_predictions))
print("Ensemble RMSE:", ensemble_rmse)

Ensemble RMSE: 27.11790360620823


# Ensemble Boosting

 The general idea of most boosting methods is to train predictors sequentially, each trying to correct its predecessor. There are many boosting methods available, but by far the most popular are :
-	AdaBoost
-	Gradient Boost
-	Light GBM
-	XGBoost
-	CatBoost
