# Ensemble Methods

In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [2]:
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC(probability=True)

voting_clf = VotingClassifier(estimators = [('lr', log_clf), ('rf', rnd_clf), ('sv', svm_clf)], voting='soft')

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons

X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [4]:
from sklearn.metrics import accuracy_score

for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.864
RandomForestClassifier 0.912
SVC 0.888
VotingClassifier 0.912


In [5]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators = 500,
    max_samples=100, bootstrap=True, n_jobs=-1)
bag_clf.fit(X_train, y_train)

predictions = bag_clf.predict(X_test)
accuracy_score(y_test, predictions)

0.90400000000000003

In [6]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    bootstrap=True, n_jobs=-1, oob_score=True)

bag_clf.fit(X_train, y_train)
bag_clf.oob_score_

0.89600000000000002

In [7]:
from sklearn.metrics import accuracy_score
predictions = bag_clf.predict(X_test)
accuracy_score(y_test, predictions)

0.90400000000000003

In [8]:
bag_clf.oob_decision_function_

array([[ 0.34183673,  0.65816327],
       [ 0.32954545,  0.67045455],
       [ 1.        ,  0.        ],
       [ 0.        ,  1.        ],
       [ 0.        ,  1.        ],
       [ 0.11377246,  0.88622754],
       [ 0.33742331,  0.66257669],
       [ 0.0049505 ,  0.9950495 ],
       [ 0.99450549,  0.00549451],
       [ 0.97938144,  0.02061856],
       [ 0.7826087 ,  0.2173913 ],
       [ 0.        ,  1.        ],
       [ 0.77486911,  0.22513089],
       [ 0.81325301,  0.18674699],
       [ 0.95652174,  0.04347826],
       [ 0.07555556,  0.92444444],
       [ 0.005     ,  0.995     ],
       [ 0.96923077,  0.03076923],
       [ 0.96987952,  0.03012048],
       [ 0.99453552,  0.00546448],
       [ 0.01081081,  0.98918919],
       [ 0.27777778,  0.72222222],
       [ 0.92307692,  0.07692308],
       [ 1.        ,  0.        ],
       [ 0.96132597,  0.03867403],
       [ 0.        ,  1.        ],
       [ 1.        ,  0.        ],
       [ 1.        ,  0.        ],
       [ 0.        ,

In [9]:
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes = 16, n_jobs = -1)
rnd_clf.fit(X_train, y_train)

y_pred_rf = rnd_clf.predict(X_test)

In [10]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(splitter="random", max_leaf_nodes=16),
    n_estimators=500, max_samples = 1.0, bootstrap=True,n_jobs=-1
)

bag_clf.fit(X_train, y_train)
y_bag_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_bag_pred)

0.90400000000000003

In [11]:
from sklearn.ensemble import ExtraTreesClassifier

extra_clf = ExtraTreesClassifier(n_estimators = 500, max_leaf_nodes = 16, n_jobs = -1)
extra_clf.fit(X_train, y_train)

extra_pred = extra_clf.predict(X_test)
accuracy_score(y_test, extra_pred)

0.91200000000000003

In [12]:
from sklearn.datasets import load_iris

iris_data = load_iris()
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
rnd_clf.fit(iris_data["data"],iris_data["target"])
for name, score in zip(iris_data["feature_names"], rnd_clf.feature_importances_):
    print(name, score)

sepal length (cm) 0.118261111034
sepal width (cm) 0.0253709507313
petal length (cm) 0.4518342915
petal width (cm) 0.404533646735


In [13]:
from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=200,
    algorithm="SAMME.R", learning_rate=0.5)
ada_clf.fit(X_train, y_train)
predictions = ada_clf.predict(X_test)
accuracy_score(y_test, predictions)

0.89600000000000002

In [14]:
from sklearn.tree import DecisionTreeRegressor
import numpy as np
m = 1000
X = 6 * np.random.randn(m,1) - 3
y = 0.5 * X **2 + X + 2 + np.random.randn(m, 1)

tree_reg1 = DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(X, y)

y2 = y - tree_reg1.predict(X)
tree_reg2 = DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(X, y2)

y3 = y2 - tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(X, y3)

y_pred = sum(tree.predict(X) for tree in (tree_reg1, tree_reg2, tree_reg3))

In [15]:
from sklearn.ensemble import GradientBoostingRegressor

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0)
gbrt.fit(X,y)

  y = column_or_1d(y, warn=True)


GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=1.0, loss='ls', max_depth=2, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=3, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

In [16]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X_train, X_val, y_train, y_val = train_test_split(X, y)

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=200)
gbrt.fit(X, y)

errors = [mean_squared_error(y_val, y_pred) for y_pred in gbrt.staged_predict(X_val)]
best_n_estimators = np.argmin(errors)

gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators = best_n_estimators)
gbrt_best.fit(X, y)

  y = column_or_1d(y, warn=True)


GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=2, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=199, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

In [17]:
gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True)

min_val_error = float("inf")
for n_estimators in range(1, 120):
    gbrt.n_estimators = n_estimators
    gbrt.fit(X, y)
    val_error = mean_squared_error(y, gbrt.predict(X))
    if val_error < min_val_error:
        min_val_error = val_error
        error_going_up = 0
    else:
        error_going_up += 1
        if error_going_up == 5:
            break   

  y = column_or_1d(y, warn=True)


In [18]:
from sklearn.datasets import fetch_mldata

mnist = fetch_mldata("MNIST Original")
X = mnist["data"]
y = mnist["target"]

In [19]:
from sklearn.model_selection import train_test_split

X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size = 1.0/7)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size = 1.0/3)

In [20]:
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, VotingClassifier
from sklearn.svm import LinearSVC

extra_clf = ExtraTreesClassifier()
forest_clf = RandomForestClassifier()
svc_clf = LinearSVC()
voting_clf = VotingClassifier(estimators = [('extra', extra_clf), ('forest', forest_clf), ('svc', svc_clf)], voting='hard')

In [22]:
models = [('Extra Trees', extra_clf), ('Random Forests' ,forest_clf), ('Support Vector', svc_clf), ('Voting', voting_clf)]

extra_clf.fit(X_train, y_train)
forest_clf.fit(X_train, y_train)
svc_clf.fit(X_train, y_train)
voting_clf.fit(X_train, y_train)

extra_pred = extra_clf.predict(X_val)
forest_pred = forest_clf.predict(X_val)
svc_pred = svc_clf.predict(X_val)
voting_pred = voting_clf.predict(X_val)

In [57]:
X_new = list(zip(extra_pred, forest_pred, svc_pred, voting_pred))
new_clf = RandomForestClassifier()
new_clf.fit(X_new, y_val)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [58]:
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.base import BaseEstimator,TransformerMixin

models = [extra_clf, forest_clf, svc_clf, voting_clf]

class StagedPredict(BaseEstimator, TransformerMixin):
    def __init__(self,models):
        self.models = models
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X_new = list(zip(*[model.predict(X) for model in models]))
        return X_new

trans = StagedPredict(models)
X_test_trans = trans.fit_transform(X_test)

staged_predictions = new_clf.predict(X_test_trans)
accuracy_score(y_test, staged_predictions)

0.95130000000000003