In [1]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [2]:
from sklearn.datasets import make_moons
moons = make_moons(n_samples=10000, noise=0.4)

In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(moons[0], moons[1], test_size=0.2, random_state=42)

In [4]:
log_clf = LogisticRegression()
svm_clf = SVC(probability=True)
rnd_clf = RandomForestClassifier()

In [5]:
voting_clf = VotingClassifier(
    estimators= [("lr", log_clf), ("rf", rnd_clf),("svc", svm_clf),],
    voting= "soft"
)
voting_clf.fit(X_train, y_train)

In [6]:
from sklearn.metrics import accuracy_score
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.8265
RandomForestClassifier 0.8335
SVC 0.853
VotingClassifier 0.848


In [7]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators =500,
    max_samples=100, bootstrap = True, n_jobs = -1
)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

In [8]:
accuracy_score(y_test, y_pred)

0.852

In [9]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    bootstrap=True, n_jobs=-1, oob_score=True
)
bag_clf.fit(X_train, y_train)

In [10]:
bag_clf.oob_score_

0.83375

In [11]:
y_pred = bag_clf.predict(X_test)

accuracy_score(y_test, y_pred)

0.8285

In [12]:
bag_clf.oob_decision_function_

array([[0.90502793, 0.09497207],
       [0.94652406, 0.05347594],
       [0.88888889, 0.11111111],
       ...,
       [0.19021739, 0.80978261],
       [0.84357542, 0.15642458],
       [0.        , 1.        ]])

In [13]:
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_estimators=100, max_leaf_nodes=16, n_jobs=-1)
rnd_clf.fit(X_train, y_train)

y_pred_rf = rnd_clf.predict(X_test)

In [14]:
accuracy_score(y_test, y_pred)

0.8285

In [15]:
# roughly equivalent to the previous example
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(splitter="random", max_leaf_nodes=16),
    n_estimators=500, max_samples=1.0, bootstrap=True, n_jobs=-1
)

In [16]:
from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=200,
    algorithm="SAMME.R", learning_rate=0.5
)
ada_clf.fit(X_train, y_train)

In [17]:
y_pred = ada_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.851

In [18]:
from sklearn.tree import DecisionTreeRegressor
tree_reg1 = DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(X_train, y_train)
y2 = y_train - tree_reg1.predict(X_train)
tree_reg2 = DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(X_train, y2)
y3 = y2 - tree_reg2.predict(X_train)
tree_reg3 = DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(X_train, y3)

In [19]:
y_pred = sum(tree.predict(X_test) for tree in (tree_reg1,tree_reg2, tree_reg3))

In [20]:
accuracy_score(y_test, np.round(y_pred))

0.843

In [21]:
# same code as above
from sklearn.ensemble import GradientBoostingRegressor
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1)
gbrt.fit(X_train, y_train)

In [22]:
from sklearn.metrics import mean_squared_error
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train)

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120)
gbrt.fit(X_train, y_train)
errors = [mean_squared_error(y_val, y_pred) 
          for y_pred in gbrt.staged_predict(X_val)]
best_n_estimators = np.argmin(errors)+1
gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=best_n_estimators)
gbrt_best.fit(X_train, y_train)


In [23]:
gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True)
min_val_error = float("inf")
error_going_up = 0
for n_estimators in range(1,120):
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train, y_train)
    y_pred = gbrt.predict(X_val)
    val_error = mean_squared_error(y_val, y_pred)
    if val_error < min_val_error:
        min_val_error = val_error
        error_going_up = 1
    else:
        error_going_up += 1
        if error_going_up == 5:
            break

In [24]:
gbrt.n_estimators_

107

In [25]:
import xgboost
xgb_reg = xgboost.XGBRegressor()
xgb_reg.fit(X_train, y_train)
y_pred = xgb_reg.predict(X_val)

In [26]:
accuracy_score(y_val, np.round(y_pred))

0.837

EXERCISES

In [29]:
from sklearn.datasets import fetch_openml
mnist = fetch_openml("mnist_784", version=1)

In [94]:
mnist.target = mnist.target.astype(np.uint8)

In [95]:
X, y = mnist["data"],mnist["target"]

In [96]:
X_train, X_test, y_train, y_test = train_test_split(X, y
                    , test_size=10000, random_state=42)

In [97]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train
                    , test_size=10000, random_state=42)

In [134]:
from sklearn.ensemble import  ExtraTreesClassifier
from sklearn.svm import LinearSVC
forest_clf = RandomForestClassifier(n_estimators=100
            , n_jobs=-1, random_state=42)
xtra_clf = ExtraTreesClassifier(n_estimators=100
            , n_jobs=-1, random_state=42)
svm_clf = LinearSVC(max_iter=100, tol=20, random_state=42)


In [135]:
estimators = [forest_clf, xtra_clf, svm_clf]
for estimator in estimators:
    print("Training the ", estimator)
    estimator.fit(X_train, y_train)

Training the  RandomForestClassifier(n_jobs=-1, random_state=42)
Training the  ExtraTreesClassifier(n_jobs=-1, random_state=42)
Training the  LinearSVC(max_iter=100, random_state=42, tol=20)


In [136]:
[estimator.score(X_val,y_val) for estimator in estimators]

[0.9692, 0.9715, 0.859]

In [101]:
from sklearn.ensemble import VotingClassifier

named_estimators = [
    ("forest_clf", forest_clf),
    ("xtra_clf", xtra_clf),
    ("svm_clf", svm_clf),
]

In [102]:
voting_clf = VotingClassifier(named_estimators)



In [103]:
voting_clf.fit(X_train, y_train)

In [104]:
voting_clf.score(X_val, y_val)

0.9693

In [105]:
[estimator.score(X_val, y_val) for estimator in voting_clf.estimators_]

[0.9692, 0.9715, 0.859]

In [106]:
voting_clf.set_params(svm_clf=None)

In [125]:
del y_val_pred_forest, y_val_pred_svm, y_val_pred_xtra

In [110]:
voting_clf.score(X_val, y_val)

0.9719

In [109]:
voting_clf.voting = "soft"

In [111]:
[estimator.score(X_test, y_test) for estimator in voting_clf.estimators_]

[0.9645, 0.9691]

EXERCISE 9

In [137]:
X_val_preditions = np.empty((len(X_val),len(estimators)), dtype=np.float32)

In [138]:
for index, estimator in enumerate(estimators):
    X_val_preditions[:,index]= estimator.predict(X_val)

In [139]:
X_val_preditions

array([[5., 5., 5.],
       [8., 8., 8.],
       [2., 2., 3.],
       ...,
       [7., 7., 7.],
       [6., 6., 6.],
       [7., 7., 7.]], dtype=float32)

In [140]:
forest_blender =  RandomForestClassifier(n_estimators=200, oob_score=True, random_state=42)
forest_blender.fit(X_val_preditions, y_val)

In [142]:
forest_blender.oob_score_

0.9703

In [143]:


X_test_predictions = np.empty((len(X_test), len(estimators)), dtype=np.float32)

for index, estimator in enumerate(estimators):
    X_test_predictions[:, index] = estimator.predict(X_test)



In [148]:
y_pred = forest_blender.predict(X_test_predictions)

In [149]:
accuracy_score(y_test, y_pred)

0.9661

In [146]:
y_pred = voting_clf.predict(X_test)