In [18]:
from sklearn.datasets import make_moons
from sklearn.datasets import load_iris
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor

import numpy as np

##Hard voting classifiers in Scikit-Learn three classifiers on moons dataset
X, y = make_moons()
X_train, X_test, y_train, y_test = X[:80], X[20:], y[:80], y[20:]
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()
voting_clf = VotingClassifier(
 estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
 voting='hard'
 )
voting_clf.fit(X_train, y_train)
#looking at each classifier’s accuracy on test set
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
 clf.fit(X_train, y_train)
 y_pred = clf.predict(X_test)
 print(clf.__class__.__name__, accuracy_score(y_test, y_pred))
#bagging (similar to hard voting but using the same 
#classifier and sampling different subsets without replacement)
bag_clf = BaggingClassifier(
 DecisionTreeClassifier(), n_estimators=500,
 max_samples=60, bootstrap=True, n_jobs=-1
 )
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

#automatic oob evaluation after training
bag_clf = BaggingClassifier(
 DecisionTreeClassifier(), n_estimators=500,
 bootstrap=True, n_jobs=-1, oob_score=True)
bag_clf.fit(X_train, y_train)
print ( bag_clf.oob_score_ )
y_pred = bag_clf.predict(X_test)
print ( accuracy_score(y_test, y_pred) )
##

##Random forest classifier
rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
rnd_clf.fit(X_train, y_train)
y_pred_rf = rnd_clf.predict(X_test)
##

##Feature importances
#single decision tree
iris = load_iris()
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
rnd_clf.fit(iris["data"], iris["target"])
for name, score in zip(iris["feature_names"], rnd_clf.feature_importances_):
 print(name, score)
##

##Ada Boost Classifier
ada_clf = AdaBoostClassifier(
 DecisionTreeClassifier(max_depth=1), n_estimators=200,
 algorithm="SAMME.R", learning_rate=0.5
 )
ada_clf.fit(X_train, y_train)
##

##Gradient Boost Classifier
#fits consecutive predictor to the residual errors made my previous
#first
tree_reg1 = DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(X, y)
#second
y2 = y - tree_reg1.predict(X)
tree_reg2 = DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(X, y2)
#third
y3 = y2 - tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(X, y3)
#sum
y_pred = sum(tree.predict(X) for tree in (tree_reg1, tree_reg2, tree_reg3))

#all equivalently
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0)
gbrt.fit(X, y)
#early stopping with Gradient Boosting Regressor 
X_train, X_val, y_train, y_val = train_test_split(X, y)
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120)
gbrt.fit(X_train, y_train)
errors = [mean_squared_error(y_val, y_pred)
 for y_pred in gbrt.staged_predict(X_val)]
bst_n_estimators = np.argmin(errors)
gbrt_best = GradientBoostingRegressor(max_depth=2,n_estimators=bst_n_estimators)
gbrt_best.fit(X_train, y_train)
#early stopping by setting warm_start=true 
#causing skit to keep existing trees when fit is called
gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True)
min_val_error = float("inf")
error_going_up = 0
for n_estimators in range(1, 120):
 gbrt.n_estimators = n_estimators
 gbrt.fit(X_train, y_train)
 y_pred = gbrt.predict(X_val)
 val_error = mean_squared_error(y_val, y_pred)
 if val_error < min_val_error:
  min_val_error = val_error
  error_going_up = 0
 else:
  error_going_up += 1
  if error_going_up == 5:
   break # early stopping
##







LogisticRegression 0.8625
RandomForestClassifier 1.0
SVC 0.975
VotingClassifier 0.975


  if diff:


0.9625
1.0
sepal length (cm) 0.08351658412150206
sepal width (cm) 0.02268252014976099
petal length (cm) 0.44899668310932606
petal width (cm) 0.4448042126194099


GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=1.0, loss='ls', max_depth=2, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=3, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)