### Ensembling by Voting Classifiers

Prepare the dataset.

In [4]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load the iris dataset that ships with scikit-learn.
iris = load_iris()

x = iris.data[:, 2:] # petal length and width
y = iris.target

# Hold out 20% of the samples for testing. The seed is fixed so that
# "Restart & Run All" reproduces the same split (and therefore the same scores).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Three different kinds of base learner, each created with library defaults.
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()

# Hard voting: the ensemble predicts the class chosen by most base learners.
voting_clf = VotingClassifier(
    estimators=[
        ('lr', log_clf),
        ('rf', rnd_clf),
        ('svc', svm_clf),
    ],
    voting='hard',
)
voting_clf.fit(x_train, y_train)

In [6]:
from sklearn.metrics import accuracy_score
# Fit each individual model and the voting ensemble on the training split,
# then report its accuracy on the held-out test split.
for model in (log_clf, rnd_clf, svm_clf, voting_clf):
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    print(type(model).__name__, test_accuracy)

LogisticRegression 0.9333333333333333
RandomForestClassifier 0.9333333333333333
SVC 0.9
VotingClassifier 0.9333333333333333


# Bagging & Pasting

In [7]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Bagging ensemble of 500 decision trees. Each tree is trained on 100 training
# instances sampled with replacement (bootstrap=True); n_jobs=-1 uses all cores.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,
)
# fit() returns the fitted estimator, so training and prediction can be chained.
y_pred = bag_clf.fit(x_train, y_train).predict(x_test)

### Out-of-bag (OOB)

In [8]:
# Same bagging setup, but with oob_score=True so that after training the
# ensemble is automatically evaluated on the instances each tree did not see.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    bootstrap=True,
    n_jobs=-1,
    oob_score=True,
)
bag_clf.fit(x_train, y_train)
# Out-of-bag accuracy estimate (displayed as the cell output).
bag_clf.oob_score_

0.95

In [9]:
from sklearn.metrics import accuracy_score

# Accuracy on the held-out test split, for comparison with the OOB estimate.
y_pred = bag_clf.predict(x_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

1.0

In [10]:
# Out-of-bag decision function for the training instances: one row per
# instance, one column per class (the rows shown below each sum to 1).
bag_clf.oob_decision_function_

array([[1.        , 0.        , 0.        ],
       [0.        , 1.        , 0.        ],
       [0.        , 0.78918919, 0.21081081],
       [1.        , 0.        , 0.        ],
       [0.        , 0.06898148, 0.93101852],
       [1.        , 0.        , 0.        ],
       [0.        , 1.        , 0.        ],
       [0.        , 0.        , 1.        ],
       [1.        , 0.        , 0.        ],
       [0.        , 0.        , 1.        ],
       [0.        , 1.        , 0.        ],
       [0.        , 1.        , 0.        ],
       [0.        , 1.        , 0.        ],
       [0.        , 0.        , 1.        ],
       [1.        , 0.        , 0.        ],
       [0.        , 1.        , 0.        ],
       [0.        , 1.        , 0.        ],
       [0.        , 1.        , 0.        ],
       [0.        , 1.        , 0.        ],
       [0.        , 0.        , 1.        ],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.

## Random Patches and Random Subspaces

## Random Forests

## Boosting

Boosting (originally called hypothesis boosting) refers to any ensemble method that can combine several weak learners into a strong learner.
The most popular boosting methods are AdaBoost and Gradient Boosting.

### AdaBoost

In [12]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost over 200 decision stumps (max_depth=1 trees).
# The `algorithm` argument is intentionally not passed: "SAMME.R" is deprecated
# and rejected by recent scikit-learn releases, while older releases already
# use it as their default, so omitting it works across versions.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=200,
    learning_rate=0.5,
)
ada_clf.fit(x_train, y_train)

### Gradient Boosting

In [13]:
tree_reg1 = DecisionTreeClassifier(max_depth=2)
tree_reg1.fit(x, y)

In [14]:
y2 = y - tree_reg1.predict(x)
tree_reg2 = DecisionTreeClassifier(max_depth=2)
tree_reg2.fit(x, y2)

In [15]:
y3 = y2 - tree_reg2.predict(x)
tree_reg3 = DecisionTreeClassifier(max_depth=2)
tree_reg3.fit(x, y3)

In [17]:
y_pred = sum(tree.predict(x_test) for tree in (tree_reg1, tree_reg2, tree_reg3))

In [18]:
accuracy_score(y_test, y_pred)

0.9666666666666667

Using sklearn

In [19]:
from sklearn.ensemble import GradientBoostingRegressor

# Library equivalent of the manual ensemble above: three depth-2 trees,
# with learning_rate=1.0 so each tree's contribution is not shrunk.
gbrt = GradientBoostingRegressor(
    n_estimators=3,
    max_depth=2,
    learning_rate=1.0,
)
gbrt.fit(x, y)

The following code trains a GBRT ensemble with 120 trees, then measures the validation error at each stage of training to find the optimal number of trees, and finally trains another GBRT ensemble using that optimal number of trees.

In [21]:
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# NOTE: this re-splits the full dataset and overwrites the earlier
# x_train / y_train; the seed is fixed so the split is reproducible.
x_train, x_val, y_train, y_val = train_test_split(x, y, random_state=42)

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120)
gbrt.fit(x_train, y_train)

# staged_predict() yields the ensemble's predictions after 1, 2, ..., 120 trees,
# so errors[i] is the validation MSE of the ensemble made of the first i + 1 trees.
errors = [
    mean_squared_error(y_val, y_pred) for y_pred in gbrt.staged_predict(x_val)
]
# argmin returns a zero-based index; add 1 to turn it into a number of trees
# (without the +1 the count is off by one and can be 0, which is invalid).
bst_n_estimators = int(np.argmin(errors)) + 1

gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=bst_n_estimators)
gbrt_best.fit(x_train, y_train)

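Manual early stopping: with `warm_start=True`, each call to `fit()` keeps the existing trees and only adds new ones, so training can be stopped as soon as the validation error stops improving.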

In [22]:
# warm_start=True makes fit() keep the trees it already has and only add more.
gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True)

min_val_error = float("inf")
error_going_up = 0  # consecutive iterations without a new best validation error
for n_estimators in range(1, 120):
    # Set the hyperparameter `n_estimators` (no trailing underscore). The
    # underscored `n_estimators_` is a fitted attribute, and assigning to it
    # does not change how many trees fit() builds.
    gbrt.n_estimators = n_estimators
    gbrt.fit(x_train, y_train)
    y_pred = gbrt.predict(x_val)
    val_error = mean_squared_error(y_val, y_pred)
    if val_error < min_val_error:
        min_val_error = val_error
        error_going_up = 0
    else:
        error_going_up += 1
        if error_going_up == 5:
            # Note: at this point the ensemble holds 5 trees more than the best one found.
            break # early stopping

Using an already-optimized Gradient Boosting implementation

In [23]:
### install it

## Stacking