In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.metrics import r2_score, mean_absolute_error , mean_squared_error, root_mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split


#### CART Models - Classification and Regression Models
#### CART Models - KNN, DT, RF, NB, SVM

##### Each model inside the ensemble technique predicts a value and then the average of all predicted values is taken and returned.

In [2]:
X,y = make_regression(n_features=10, n_samples=10000,n_informative=3)

In [3]:
X_train,X_test,y_train,y_test = train_test_split(X,y)

#### Decision Tree

In [10]:
model_dt = DecisionTreeRegressor(random_state =42)
model_dt.fit(X_train,y_train)

y_pred_dt = model_dt.predict(X_test)


print("Decision Tree r2 Score:",r2_score(y_test,y_pred_dt))
print("Decision Tree MAE Score:",mean_absolute_error(y_test,y_pred_dt))
print("Decision Tree MSE Score:",mean_squared_error(y_test,y_pred_dt))
print("Decision Tree RMSE Score:",root_mean_squared_error(y_test,y_pred_dt))

Decision Tree r2 Score: 0.9955548870728446
Decision Tree MAE Score: 2.102427298437536
Decision Tree MSE Score: 8.539437171495315
Decision Tree RMSE Score: 2.9222315396791054


#### Bagging

In [None]:
bag = BaggingRegressor(
    estimator = DecisionTreeRegressor(), n_estimators = 500 ,  # 500 dts or bags
    max_features=0.5,     # random subset of half records
    bootstrap=True,       # random half and repetition
    random_state=42       # random seeding
    )

bag.fit(X_train,y_train)

y_pred_bag = bag.predict(X_test)


print("Bagging DT r2 Score:",r2_score(y_test,y_pred_bag))
print("Bagging DT MAE Score:",mean_absolute_error(y_test,y_pred_bag))
print("Bagging DT MSE Score:",mean_squared_error(y_test,y_pred_bag))
print("Bagging DT RMSE Score:",root_mean_squared_error(y_test,y_pred_bag))

Bagging DT r2 Score: 0.7502893972262397
Bagging DT MAE Score: 17.558228277458245
Bagging DT MSE Score: 479.7151474861024
Bagging DT RMSE Score: 21.902400495975375


#### Random Forest

In [15]:
model_rf = RandomForestRegressor(n_estimators = 500)
model_rf.fit(X_train,y_train)

y_pred_rf = model_rf.predict(X_test)

print("RandomForest r2 Score:",r2_score(y_test,y_pred_rf))
print("RandomForest MAE Score:",mean_absolute_error(y_test,y_pred_rf))
print("RandomForest MSE Score:",mean_squared_error(y_test,y_pred_rf))
print("RandomForest RMSE Score:",root_mean_squared_error(y_test,y_pred_rf))

RandomForest r2 Score: 0.9983516390875393
RandomForest MAE Score: 1.0689213411218574
RandomForest MSE Score: 3.1666404607890644
RandomForest RMSE Score: 1.7795056787740422


#### Bagging using SVM

In [16]:
bag_svm = BaggingRegressor(
    estimator = SVR(), n_estimators = 500 ,  # 500 dts or bags
    max_features=0.25,     # random subset of half records
    bootstrap=True,       # random half and repetition
    random_state=42       # random seeding
    )

bag_svm.fit(X_train,y_train)

y_pred_bag_svm = bag.predict(X_test)

print("Bagging Regressor using SVM r2 Score:",r2_score(y_test,y_pred_bag_svm))
print("Bagging Regressor using SVM MAE Score:",mean_absolute_error(y_test,y_pred_bag_svm))
print("Bagging Regressor using SVM MSE Score:",mean_squared_error(y_test,y_pred_bag_svm))
print("Bagging Regressor using SVM RMSE Score:",root_mean_squared_error(y_test,y_pred_bag_svm))

KeyboardInterrupt: 

Takeaways:

- Random Forests are better than bagged models, and further bagged models are better than Pasting
- Good results come around 25% to 50% row sampling
- In order to ind best aprameters we need to do hyper parameter optimisation

