# Ensemble methods

work best when the predictors are as independent
from one another as possible. One way to get diverse classifiers
is to train them using very different algorithms. This increases the
chance that they will make very different types of errors, improving
the ensemble’s accuracy.

In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

In [2]:
x, y = make_moons(n_samples=50000, noise=0.20)

In [7]:
X_train , x_test , y_train , y_test = train_test_split(
                            x,y , test_size =0.3
                            , random_state = 42
)

In [8]:
log_clf = LogisticRegression(solver="lbfgs", random_state=42)
rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)
svm_clf = SVC(gamma="scale", random_state=42)

In [9]:
votti_clf = VotingClassifier(
    estimators = [('lr',log_clf),('rf',rnd_clf),('svc',svm_clf)] 
    ,voting = 'hard'
)

In [10]:
models = [log_clf , rnd_clf , svm_clf , votti_clf]

In [11]:
for clf in models :
    clf.fit(X_train , y_train)
    y_pred = clf.predict(x_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.8710666666666667
RandomForestClassifier 0.9668666666666667
SVC 0.9714666666666667
VotingClassifier 0.9699333333333333


If all classifiers are able to estimate class probabilities (i.e., they all have a pre
dict_proba() method), then you can tell Scikit-Learn to predict the class with the
highest class probability, averaged over all the individual classifiers. This is called soft
voting\

In [12]:
svm_clf = SVC(gamma="scale", probability=True, random_state=42)
log_clf = LogisticRegression(solver="lbfgs", random_state=42)
rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)

In [13]:
votti_clf = VotingClassifier(
    estimators = [('lr',log_clf),('rf',rnd_clf),('svc',svm_clf)] 
    ,voting = 'soft'
)

In [14]:
models2 = [log_clf , rnd_clf , svm_clf , votti_clf]

In [15]:
for clf in models2 :
    clf.fit(X_train , y_train)
    y_pred = clf.predict_proba(x_test)
    y_pred = np.argmax (y_pred , axis = 1)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.8710666666666667
RandomForestClassifier 0.9668666666666667
SVC 0.9714666666666667
VotingClassifier 0.9686666666666667


# Bagging and Pasting in Scikit-Learn

In [16]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [17]:
bag_clf = BaggingClassifier(
            DecisionTreeClassifier(),
            max_samples = 100 ,
            bootstrap = True ,
            n_estimators=500,
            n_jobs = -1 ,
            oob_score= True,
            random_state=42
)

In [18]:
bag_clf.fit(X_train , y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=100,
                  n_estimators=500, n_jobs=-1, oob_score=True, random_state=42)

In [19]:
bag_clf.oob_score_

0.9563714285714285

In [20]:
y_pred = bag_clf.predict(x_test)

In [21]:
accuracy_score(y_test, y_pred)

0.9586666666666667

In [22]:
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train, y_train)
y_pred_tree = tree_clf.predict(x_test)
print(accuracy_score(y_test, y_pred_tree))

0.9571333333333333


# Random Forests

In [23]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(max_features="sqrt", max_leaf_nodes=16),
    n_estimators=500, random_state=42)

In [34]:
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(x_test)

In [35]:
accuracy_score(y_test,y_pred)

0.9680666666666666

In [26]:
from sklearn.ensemble import RandomForestClassifier

In [27]:
rnd_clf = RandomForestClassifier(n_estimators=500,
                                 max_leaf_nodes=16,
                                 random_state=42)

In [28]:
rnd_clf.fit(X_train, y_train)

RandomForestClassifier(max_leaf_nodes=16, n_estimators=500, random_state=42)

In [31]:
y_pred_rf = rnd_clf.predict(x_test)

In [32]:
accuracy_score(y_test,y_pred_rf)

0.9686

In [37]:
np.sum(y_pred == y_pred_rf) / len(y_pred)

0.9982666666666666

 very similar predictions

# Extra-Trees
this technique trades more bias for
a lower variance. It also makes Extra-Trees much faster to train than regular Random
Forests, because finding the best possible threshold for each feature at every node is
one of the most time-consuming tasks of growing a tree.

In [40]:
from sklearn.ensemble import ExtraTreesClassifier

In [49]:
extra_trees = ExtraTreesClassifier(n_estimators=500,
                                 random_state=42)

In [58]:
extra_trees.fit(X_train, y_train)

ExtraTreesClassifier(n_estimators=500, random_state=42)

In [62]:
y_pred = extra_trees.predict(x_test)

In [63]:
accuracy_score(y_test,y_pred)

0.9678

In [60]:
from sklearn.model_selection import cross_val_score

In [64]:
cross_val_score(extra_trees,X_train, y_train)

array([0.96771429, 0.96542857, 0.965     , 0.96428571, 0.96471429])

# Feature Importance

In [86]:
from sklearn.datasets import load_iris 
import matplotlib.pyplot as plt
import matplotlib as mpl

In [76]:
iris = load_iris()

In [77]:
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)

In [78]:
rnd_clf.fit(iris["data"], iris["target"])

RandomForestClassifier(n_estimators=500, n_jobs=-1)

In [79]:
for name, score in zip(iris["feature_names"], rnd_clf.feature_importances_):
     print(name, score)

sepal length (cm) 0.10567655384314006
sepal width (cm) 0.024005064901658955
petal length (cm) 0.4492718909570577
petal width (cm) 0.42104649029814323


# AdaBoost
One way for a new predictor to correct its predecessor is to pay a bit more attention
to the training instances that the predecessor underfitted. This results in new predictors
focusing more and more on the hard cases. This is the technique used by
AdaBoost.

## Note
There is one important drawback to this sequential learning technique:
it cannot be parallelized (or only partially), since each predictor
can only be trained after the previous predictor has been
trained and evaluated. As a result, it does not scale as well as bagging
or pasting.

In [95]:
from sklearn.ensemble import AdaBoostClassifier

In [104]:
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=200,
    algorithm="SAMME.R", learning_rate=0.5)

In [105]:
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),
                   learning_rate=0.5, n_estimators=200)

In [106]:
y_pred = ada_clf.predict(x_test)

In [107]:
accuracy_score(y_test,y_pred)

0.9688

## Note
If your AdaBoost ensemble is overfitting the training set, you can
try reducing the number of estimators or more strongly regularizing
the base estimator

# Gradient Boosting
Another very popular boosting algorithm is Gradient Boosting. Just like AdaBoost,
Gradient Boosting works by sequentially adding predictors to an ensemble, each one
correcting its predecessor. However, instead of tweaking the instance weights at every
iteration like AdaBoost does, this method tries to fit the new predictor to the residual
errors made by the previous predictor.

In [110]:
np.random.seed(42)
x = np.random.rand(10000, 1) - 0.5
y = 3*X[:, 0]**2 + 0.05 * np.random.randn(10000)

In [111]:
X_train , x_test , y_train , y_test = train_test_split(
                            x,y , test_size =0.3
                            , random_state = 42)

In [112]:
from sklearn.tree import DecisionTreeRegressor

In [113]:
tree_reg1 = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg1.fit(X_train, y_train)

DecisionTreeRegressor(max_depth=2, random_state=42)

In [114]:
y2 = y_train - tree_reg1.predict(X_train)
tree_reg2 = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg2.fit(X_train, y2)

DecisionTreeRegressor(max_depth=2, random_state=42)

In [116]:
y3 = y2 - tree_reg2.predict(X_train)
tree_reg3 = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg3.fit(X_train, y3)

DecisionTreeRegressor(max_depth=2, random_state=42)

In [117]:
y_pred = sum(tree.predict(x_test) for tree in (tree_reg1, tree_reg2, tree_reg3))

In [122]:
from sklearn.metrics import mean_squared_error

In [123]:
mean_squared_error (y_test ,y_pred)

0.00571677239717004

In [124]:
from sklearn.ensemble import GradientBoostingRegressor

In [133]:
gbrt = GradientBoostingRegressor(max_depth=2, 
                                 n_estimators=3, 
                                 learning_rate=1.0,
                                 random_state=42)


In [134]:
gbrt.fit(X_train, y_train)

GradientBoostingRegressor(learning_rate=1.0, max_depth=2, n_estimators=3,
                          random_state=42)

In [135]:
y_pred = gbrt.predict(x_test)

In [136]:
mean_squared_error (y_test ,y_pred)

0.005716772397170041

# Gradient Boosting with Early stopping

In [137]:
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120)

In [138]:
gbrt.fit(X_train, y_train)
errors = [mean_squared_error(y_test, y_pred)
          for y_pred in gbrt.staged_predict(x_test)]

In [139]:
bst_n_estimators = np.argmin(errors) + 1

In [140]:
gbrt_best = GradientBoostingRegressor(max_depth=2,
                                      n_estimators=bst_n_estimators)
gbrt_best.fit(X_train, y_train)

GradientBoostingRegressor(max_depth=2, n_estimators=96)

In [141]:
y_pred = gbrt.predict(x_test)

In [142]:
mean_squared_error (y_test ,y_pred)

0.002499400942088274

# Using XGBoost

In [144]:
import xgboost

In [145]:
xgb_reg = xgboost.XGBRegressor()

In [146]:
xgb_reg.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [147]:
y_pred = xgb_reg.predict(x_test)

In [148]:
mean_squared_error (y_test ,y_pred)

0.0026692760105529604

In [149]:
xgb_reg.fit(X_train, y_train,
eval_set=[(x_test, y_test)], early_stopping_rounds=2)

[0]	validation_0-rmse:0.24230
[1]	validation_0-rmse:0.17398
[2]	validation_0-rmse:0.12760
[3]	validation_0-rmse:0.09665
[4]	validation_0-rmse:0.07687
[5]	validation_0-rmse:0.06499
[6]	validation_0-rmse:0.05811
[7]	validation_0-rmse:0.05445
[8]	validation_0-rmse:0.05252
[9]	validation_0-rmse:0.05158
[10]	validation_0-rmse:0.05112
[11]	validation_0-rmse:0.05088
[12]	validation_0-rmse:0.05073
[13]	validation_0-rmse:0.05072
[14]	validation_0-rmse:0.05068
[15]	validation_0-rmse:0.05065
[16]	validation_0-rmse:0.05065
[17]	validation_0-rmse:0.05066


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [150]:
y_pred = xgb_reg.predict(x_test)

In [151]:
mean_squared_error (y_test ,y_pred)

0.002565526898865579