### P245 - P 275

### Voting Classifiers 

In [2]:
# set up 

# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "ensembles"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [4]:
# import the dataset 
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons

X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    
    
# three classifiers
from sklearn.ensemble import VotingClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import SVC 

log_clf = LogisticRegression() 
rnd_clf = RandomForestClassifier()
svm_clf = SVC() 

voting_clf = VotingClassifier(
    estimators = [('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
)

voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('rf', RandomForestClassifier()), ('svc', SVC())])

In [5]:
### look at the accuracy_score 

from sklearn.metrics import accuracy_score 
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.864
RandomForestClassifier 0.904
SVC 0.896
VotingClassifier 0.904


In [6]:
### soft voting 

# average over the prob results 

### Bagging and Pasting

In [10]:
from sklearn.ensemble import BaggingClassifier 
from sklearn.tree import DecisionTreeClassifier 

bag_clf = BaggingClassifier( 
    DecisionTreeClassifier(), n_estimators = 500,
    max_samples = 100, bootstrap = True, n_jobs = -1
)

bag_clf.fit(X_train, y_train) 
y_pred = bag_clf.predict(X_test)

# test 
accuracy_score(y_test, y_pred)

0.928

### Out-of-Bag Evaluation

In [13]:
# set oob_score = True
bag_clf = BaggingClassifier( 
    DecisionTreeClassifier(), n_estimators = 500,
    bootstrap = True, n_jobs = -1, oob_score = True
)

bag_clf.fit(X_train, y_train) 
bag_clf.oob_score_

0.8986666666666666

In [14]:
# test 
accuracy_score(y_test, y_pred)

0.928

In [15]:
### the decision function for each training instance 

bag_clf.oob_decision_function_

array([[0.35326087, 0.64673913],
       [0.40306122, 0.59693878],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.09876543, 0.90123457],
       [0.34636872, 0.65363128],
       [0.        , 1.        ],
       [0.99462366, 0.00537634],
       [0.98265896, 0.01734104],
       [0.81714286, 0.18285714],
       [0.        , 1.        ],
       [0.77941176, 0.22058824],
       [0.80327869, 0.19672131],
       [0.95675676, 0.04324324],
       [0.04395604, 0.95604396],
       [0.00571429, 0.99428571],
       [0.98816568, 0.01183432],
       [0.93582888, 0.06417112],
       [0.9947644 , 0.0052356 ],
       [0.04191617, 0.95808383],
       [0.36094675, 0.63905325],
       [0.88888889, 0.11111111],
       [1.        , 0.        ],
       [0.96373057, 0.03626943],
       [0.        , 1.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.59116022, 0.40883978],
       [0.

### Random Patches and Random Subspeace 

In [16]:
# sampling both instance and features is called Random Patches method

# sampling features is Random Subspaces method

In [18]:
# Random forest

# RandomForestClassifier has almost all the params 

from sklearn.ensemble import RandomForestClassifier 

rnd_clf = RandomForestClassifier(n_estimators = 500, max_leaf_nodes = 16, n_jobs = -1) 
rnd_clf.fit(X_train, y_train) 

y_pred_rf = rnd_clf.predict(X_test)

In [19]:
# splitter = "random" : feature select when growing trees 

In [20]:
###  random thershold for each tree 

# API: ExtraTreesClassifier 

### Feature Importance

In [22]:
from sklearn.datasets import load_iris 
iris = load_iris() 
rnd_clf = RandomForestClassifier(n_estimators = 500, n_jobs = -1)
rnd_clf.fit(iris["data"], iris["target"])
for name, score in zip(iris["feature_names"], rnd_clf.feature_importances_):
    print(name, score)

sepal length (cm) 0.09854232652974981
sepal width (cm) 0.023494142693854532
petal length (cm) 0.4225590684351814
petal width (cm) 0.45540446234121446


# Boosting

In [23]:
# AdaBoost 

# Gradient Boosting 


In [24]:
# Adaboost 

# squential training with instance weight updates 


In [25]:
# SAMME 

# Adaboost 

In [26]:
from sklearn.ensemble import AdaBoostClassifier 

ada_clf = AdaBoostClassifier( 
    DecisionTreeClassifier(max_depth = 1), n_estimators = 200,
    algorithm = "SAMME.R", learning_rate = 0.5
)

ada_clf.fit(X_train, y_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),
                   learning_rate=0.5, n_estimators=200)

In [27]:
# Gradient Boosting 

# Sequentially fit by the residual errors 

In [28]:
from sklearn.ensemble import GradientBoostingRegressor 

gdrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0)
gdrt.fit(X, y)

GradientBoostingRegressor(learning_rate=1.0, max_depth=2, n_estimators=3)

In [29]:
# The learning rate 

# control the contribution of each tree 

In [30]:
 ### early stopping 
    
    

In [31]:
# train a second model using the trees with the min mse 

import numpy as np 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error 

X_train, X_val, y_train, y_val = train_test_split(X, y) 

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120)
gbrt.fit(X_train, y_train) 

errors = [mean_squared_error(y_val, y_pred)
         for y_pred in gbrt.staged_predict(X_val)]
bst_n_estimators = np.argmin(errors) 

gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=bst_n_estimators) 
gbrt_best.fit(X_train, y_train)

GradientBoostingRegressor(max_depth=2, n_estimators=46)

In [32]:
# actual early stopping, stop when validation stop improving 

gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True) 

min_val_error = float("inf")
error_going_up = 0 
for n_estimators in range(1, 120):
    gbrt.n_estimators = n_estimators 
    gbrt.fit(X_train, y_train)
    y_pred = gbrt.predict(X_val) 
    val_error = mean_squared_error(y_val, y_pred) 
    if val_error < min_val_error: 
        min_val_error = val_error
        error_going_up = 0 
    else:
        error_going_up += 1
        if error_going_up == 5:
            break 

In [34]:
# also could change the training instances fraction 

# using subsample = 0.25 e.g.

# called Stochastic Gradient Boosting 

# Stacking 

In [35]:
# train a model to do the aggregattion 

In [36]:
# could use open source software: brew  https://github.com/viisar/brew