In [53]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingRegressor, StackingRegressor
from sklearn.linear_model import LogisticRegression, ElasticNet
from sklearn.svm import SVC, SVR
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.datasets import load_iris, make_moons, load_boston
from sklearn.model_selection import train_test_split

# Voting Classifier
Aggregates the predictions of the classifiers being used for voting and predicts the class with the most number of votes.

In [2]:
#Using the iris dataset for features and labels
iris = load_iris()

In [3]:
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [4]:
#Creating models for different classifiers to be used for Voting
logistic_reg = LogisticRegression()
forest_classifier = RandomForestClassifier()
svm_classifier = SVC()

In [5]:
#Creating a voting classifier from the classifiers
vote_classifier = VotingClassifier(
    estimators = [('lr',logistic_reg), ('fc',forest_classifier), ('svc',svm_classifier)],
    voting="hard"
)

In [6]:
#Training the voting classifier
vote_classifier.fit(X_train, y_train)

VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('fc',
                              RandomForestClassifier(bootstrap=True,
                                                     ccp_alpha=0.0,
                                                     class_weight=None,
                                             

In [7]:
#Comparing the accuracy of each classifier with the voting classifier
for classifier in (logistic_reg, forest_classifier, svm_classifier, vote_classifier):
  classifier.fit(X_train, y_train)
  y_pred = classifier.predict(X_test)
  print(classifier.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.9333333333333333
RandomForestClassifier 0.9111111111111111
SVC 0.9333333333333333
VotingClassifier 0.9333333333333333


# Bagging and Pasting
Aggregrates predictions by training the same algorithm on different random subsets of the training data. Bagging refers to the aggregating process in which sampling is performed via replacement and the one without the replacement is termed as Paging

In [21]:
#Using the Bagging Classifier for performing classification using 500 Decision Tree Classifiers
bag_classifier = BaggingClassifier(
    DecisionTreeClassifier(), #The classifier to use 
    n_estimators = 500, #The number of classifiers to ensemble
    max_samples = 100, #The number of samples to use for training each classifier
    bootstrap = True, #To allow replacement (bagging) or not (pasting)
    oob_score = True,
    n_jobs = -1 #Number of cores to utilize (-1 refers to all available cores)
)

In [14]:
#Training the bagging classifier
bag_classifier.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                        class_weight=None,
                                                        criterion='gini',
                                                        max_depth=None,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                                                        presort='deprecated',
                                                        random_state=None,


In [15]:
#Predicting the values using the bagging classifier
y_pred = bag_classifier.predict(X_test)

In [16]:
#Viewing OOB (Out Of Bag) score for the classifier
bag_classifier.oob_score_

0.9714285714285714

In [11]:
#Viewing the accuracy of the classifier
accuracy_score(y_pred, y_test)

0.8888888888888888

In [19]:
#Viewing the OOB decision function values (starting 5 rows)
bag_classifier.oob_decision_function_[:5]

array([[0.        , 0.01621622, 0.98378378],
       [0.        , 0.        , 1.        ],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.        , 0.        , 1.        ]])

# Random Patches and Random Subspaces
Aggregates predictions by sampling features for given training instances. When both training instances and features are sampled, its called "Random Patches Method" whereas if sampling is performed only for features its called "Random Subspaces Method"

In [24]:
#Using the Bagging Classifier for performing classification using Random Patches Method
patches_classifier = BaggingClassifier(
    DecisionTreeClassifier(), #The classifier to use 
    n_estimators = 500, #The number of classifiers to ensemble
    max_samples = 1.0, #The number of samples to use for training each classifier
    bootstrap = False, #To allow replacement (bagging) or not (pasting)
    n_jobs = -1 #Number of cores to utilize (-1 refers to all available cores)
)

In [25]:
#Training the patches classifier
patches_classifier.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                        class_weight=None,
                                                        criterion='gini',
                                                        max_depth=None,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                                                        presort='deprecated',
                                                        random_state=None,


In [26]:
#Predicting values using random patches classifier and viewing the accuracy score
y_pred = patches_classifier.predict(X_test)
accuracy_score(y_pred, y_test)

0.9333333333333333

# Random Forests
Aggregates predictions by training many Decision Tree classifiers/regressors, mostly using Bagging method (sometimes using Pasting as well)

In [27]:
#Creating a Random Forest Classifier
forest_classifier = RandomForestClassifier(
    n_estimators = 500, #Number of Decision Tree Classifiers to ensemble 
    max_leaf_nodes = 10, #Number of max leaf nodes for each Decision Tree
    n_jobs = -1 #Number of cores to utilize for training and prediction
)

In [28]:
#Training the Random Forest Classifier
forest_classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=10, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [29]:
#Prediciting results using the Random Forest Classifier
y_forest_pred = forest_classifier.predict(X_test)

In [30]:
#Viewing the accuracy of the classifier
accuracy_score(y_forest_pred, y_test)

0.9111111111111111

# Extra Trees 
Aggregates the predictions by using the same procedure as Random Forests but using random thresholds for each feature rather than finding the best one for each.

In [32]:
#Creating an Extra Trees Classifier
extra_classifier = ExtraTreesClassifier(
    n_estimators = 500, #Number of Decision Tree Classifiers to ensemble 
    max_leaf_nodes = 10, #Number of max leaf nodes for each Decision Tree
    n_jobs = -1 #Number of cores to utilize for training and prediction
)

In [33]:
#Training the Extra Trees Classifier
extra_classifier.fit(X_train, y_train)

ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=10, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
                     oob_score=False, random_state=None, verbose=0,
                     warm_start=False)

In [34]:
#Prediciting results using the Extra Trees Classifier
y_extra_pred = extra_classifier.predict(X_test)

In [35]:
#Viewing the accuracy of the classifier
accuracy_score(y_extra_pred, y_test)

0.9111111111111111

# Boosting Methods
Aggregates the predictions by training predictors sequentially such that the new predictor corrects the errors of its predecessors

## Ada Boost
In Ada Boost, the new predictor focuses more on the training instances than the underfitted predecessor

In [36]:
#Using SAMME.R algorithm for Ada Boost Classifier
ada_classifier = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators = 200,
    algorithm = "SAMME.R",
    learning_rate = 0.5
)

In [37]:
#Training the Ada Boost Classifer
ada_classifier.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                         class_weight=None,
                                                         criterion='gini',
                                                         max_depth=1,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort='deprecated',
                          

In [38]:
#Predicting using trained Ada Boost Classifier and viewing accuracy
y_ada_pred = ada_classifier.predict(X_test)
accuracy_score(y_ada_pred, y_test)

0.9111111111111111

## Gradient Boost
Applies boosting by fitting new predictor on residual errors of predecessors

In [41]:
#Using the Boston House Prices Dataset
boston = load_boston()

X = boston.data
y = boston.target

In [42]:
'''
tree_reg1 = DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(X, y)

y2 = y - tree_reg1.predict(X)
tree_reg2 = DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(X, y2)

y3 = y2 - tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(X, y3)

y_pred = sum(tree.predict(X_new) for tree in (tree_reg1, tree_reg2, tree_reg3))
'''

#Using the gradient boost regressor for performing recursive Decision Tree Regressions as commented above
gbrt = GradientBoostingRegressor(
    max_depth = 2,
    n_estimators = 3,
    learning_rate = 1.0
)

In [43]:
#Training the Gradient Boost Regressor on the dataset
gbrt.fit(X, y)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=1.0, loss='ls', max_depth=2,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=3,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

### Using GBRT Stage-Predict to perform Early Stopping

In [44]:
#Using early stopping to train Gradient Boost Regressor 

#Splitting data into training and validation datasets
X_train, X_val, y_train, y_val = train_test_split(X, y)

#Training first GBRT for finding optimal number of trees
gbrt = GradientBoostingRegressor(
    max_depth = 2, 
    n_estimators = 120
)

gbrt.fit(X_train, y_train)

#Computing the validation errors for the Regressor for each prediction stage
gbrt_errors = [mean_squared_error(y_val, y_pred) for y_pred in gbrt.staged_predict(X_val)]

#Computing best estimator for which the error is minimum
best_estimator = np.argmin(gbrt_errors) 

In [45]:
#Training a new GBRT using computed best estimator
best_gbrt = GradientBoostingRegressor(
    max_depth = 2,
    n_estimators=best_estimator
)

best_gbrt.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=2,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=118,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [46]:
#Viewing prediction results for the best GBRT and computing their mean squared error result
y_gbrt_pred = best_gbrt.predict(X_val)
mean_squared_error(y_val, y_gbrt_pred)

7.576964840518322

### Using Normal Early Stopping 

In [47]:
#Using a GBRT on the training set until the validation error does not improve for five iterations
gbrt = GradientBoostingRegressor(
    max_depth = 2,
    warm_start = True
)

#Defining pre-variables for early stopping process
min_val_error = float("inf")
error_going_up = 0

#Implementing the early stopping procedure
for n_estimators in (1, 120):
  gbrt.n_estimators = n_estimators
  gbrt.fit(X_train, y_train)
  y_gbrt_pred = gbrt.predict(X_val)
  val_error = mean_squared_error(y_val, y_gbrt_pred)

  if val_error < min_val_error:
    min_val_error = val_error
    error_going_up = 0

  else:
    error_going_up += 1
  
  if error_going_up == 5: #Performing stopping after 5 iterations of unchanging
    break

In [48]:
#Viewing the error rate for the trained GBRT regressor
min_val_error

7.60969292843212

# Stacking (Stack Generalization)
Aggregates predictions by training an aggregation model. It uses a predictor called Blender which takes the predictions of the individual models as inputs and gives an aggregate prediction value

In [51]:
#Defining the estimators for Stacking Regressor
estimators = [
              ('dtr', DecisionTreeRegressor()),
              ('svr', SVR())
]

In [54]:
#Creating Stacking Regressor from the estimators with Elastic Net as final estimator
stack_reg = StackingRegressor(
    estimators = estimators,
    final_estimator = ElasticNet(alpha = 0.1, l1_ratio=0.5)
)

In [55]:
#Training the Stacking Regressor for the data
stack_reg.fit(X_train, y_train)

StackingRegressor(cv=None,
                  estimators=[('dtr',
                               DecisionTreeRegressor(ccp_alpha=0.0,
                                                     criterion='mse',
                                                     max_depth=None,
                                                     max_features=None,
                                                     max_leaf_nodes=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=1,
                                                     min_samples_split=2,
                                                     min_weight_fraction_leaf=0.0,
                                                     presort='deprecated',
                                                     random_state=None,
                                               

In [56]:
#Prediciting values from the Regressor and viewing the error score
y_stack_pred = stack_reg.predict(X_val)
mean_squared_error(y_val, y_stack_pred)

13.175465499609226