In [62]:
import numpy as np
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

In [72]:
X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X = moons[0]
y = moons[1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [73]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [74]:
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC(probability=True)

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('forest', rnd_clf), ('svm', svm_clf)],
    voting='soft'
)

voting_clf.fit(X_train, y_train);

In [75]:
from sklearn.metrics import accuracy_score, f1_score

In [76]:
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))    

LogisticRegression 0.9
RandomForestClassifier 0.95
SVC 0.9
VotingClassifier 0.95


# BAGGING


In [77]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [78]:
bag_clf = BaggingClassifier(base_estimator=DecisionTreeClassifier(), 
                            n_estimators=100, 
                            n_jobs=-1, 
                            oob_score=True)

bag_clf.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                        class_weight=None,
                                                        criterion='gini',
                                                        max_depth=None,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                                                        presort='deprecated',
                                                        random_state=None,


In [79]:
bag_clf.oob_score_

0.9625

In [80]:
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.95

# FEATURE IMPORTANCE


In [81]:
from sklearn.datasets import load_iris

In [83]:
iris = load_iris()
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
rnd_clf.fit(iris['data'], iris['target'])
for name, score in zip(iris['feature_names'], rnd_clf.feature_importances_):
    print(name, score)

sepal length (cm) 0.1059694913231804
sepal width (cm) 0.023996807040731143
petal length (cm) 0.4361751799099859
petal width (cm) 0.43385852172610256


# GRADIENT BOOSTING

* is a sequential process where each new model is fit to the residual errors made by the 
 previous predictor

In [160]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

m = 1000
X = np.random.rand(m, 1) * 10 - 5
y = X**2 + 5 + np.random.randn(m, 1) * 5

In [161]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [156]:
class GradientBoostedTrees:
    def __init__(self, n_estimators=100, max_depth=2):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.estimators = []
    
    def fit(self, X, y):
        target = y
        
        # at each step, train new estimator using residuals from the previous step as 
        # a target
        for i in range(self.n_estimators):
            tree = DecisionTreeRegressor(max_depth=self.max_depth)
            tree.fit(X, target)
            y_pred = tree.predict(X)
            target = target - y_pred.reshape((-1, 1))
            self.estimators.append(tree)
            
    def predict(self, X):
        return sum(clf.predict(X) for clf in self.estimators)

In [183]:
gbtree = GradientBoostedTrees(n_estimators=10, max_depth=3)
gbtree.fit(X_train, y_train)
y_pred = gbtree.predict(X_test)
print(mean_squared_error(y_test, y_pred))

26.929074055822014


# MNIST ENSEMBLE CHALLENGE

In [191]:
from sklearn.datasets import fetch_openml

In [192]:
mnist = fetch_openml('mnist_784', version=1)

In [193]:
X, y = mnist['data'], mnist['target']
X_train, X_val, X_test = X[:50000], X[50000:60000], X[60000:]
y_train, y_val, y_test = y[:50000], y[50000:60000], y[60000:]

In [210]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, f1_score

forest_clf = RandomForestClassifier()
forest_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [211]:
y_train_pred = forest_clf.predict(X_train)
y_val_pred = forest_clf.predict(X_val)

print(f'f1: {f1_score(y_train, y_train_pred, average="macro")}') 
print(f'f1: {f1_score(y_val, y_val_pred, average="macro")}') 

f1: 1.0
f1: 0.9721039004920712


In [213]:
from sklearn.ensemble import GradientBoostingClassifier

gb_clf = GradientBoostingClassifier()
gb_clf.fit(X_train, y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [215]:
y_val_pred = gb_clf.predict(X_val)
print(f'f1: {f1_score(y_val, y_val_pred, average="macro")}') 

f1: 0.9493926155201908


In [217]:
from xgboost import XGBClassifier

In [218]:
xgb_clf = XGBClassifier()
xgb_clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [220]:
y_pred = xgb_clf.predict(X_val)
print(f'f1: {f1_score(y_val, y_pred, average="macro")}')

f1: 0.9380728728328194
