# Ensemble Learning: 
## Classification using Bagging and Random Forest


### Import Dependencies

In [0]:
import pandas as pd
import numpy as np

from sklearn import datasets

from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier, BaggingClassifier

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score


### Load IRIS Data

In [0]:
# Load the iris datasets
dataset = datasets.load_iris ()

X=dataset.data
y=dataset.target

# Split dataset into training set and test set
# Split also shuffles data before splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) # 70% training and 30% test



### Create a Random Forest Classifier

A random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting. 

The sub-sample size is always the same as the original input sample size but the samples are drawn with replacement if bootstrap=True (default).

In [0]:

# Create the Classifier...for a Random Forest Classifier a decision tree/stump is always the base estimator
rf_clf = RandomForestClassifier(n_estimators=100, bootstrap=True)

# Train the model using the training sets y_pred=clf.predict(X_test)
rf_clf.fit(X_train, y_train)

# This section to be uncommented if we want to use the random forest for classification
# Test
y_pred = rf_clf.predict(X_test)
print('\n All predicted labels in the test set : ')
print(y_pred)

# Model Accuracy, how often is the classifier correct
print('\n Accuracy on Test set = ')
print(accuracy_score(y_test, y_pred)*100)

print('\n  Confusion Matrix = ')
print(metrics.confusion_matrix(y_test, y_pred))


# Individual prediction
pred = rf_clf.predict([[1.5, 0.2, 4.6, 3.1]])
print('\n Individual prediction :')
print(pred)

# Feature Importance
print('\n Feature Names: ')
print(dataset.feature_names)

print('\n Feature Importances (by score) : ')
print(rf_clf.feature_importances_)


 All predicted labels in the test set : 
[1 1 1 1 1 0 0 0 1 2 1 2 1 2 0 2 1 0 1 0 2 2 0 0 2 1 1 2 2 2 1 2 2 1 0 1 1
 2 1 0 1 2 2 2 2]

 Accuracy = 
93.33333333333333

 Individual prediction :
[2]

 Feature Names: 
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']

 Feature Importances (by score) : 
[0.13443032 0.02422453 0.36415611 0.47718904]


### Bagging Classifier

A Bagging classifier is an ensemble meta-estimator that fits base classifiers each on random subsets of the original dataset and then aggregate their individual predictions (either by voting or by averaging) to form a final prediction. 

Such a meta-estimator can typically be used as a way to reduce the variance of a black-box estimator (e.g., a decision tree), by introducing randomization into its construction procedure and then making an ensemble out of it.

In [0]:

# First crea a base estimator - Decision Tree Classifier
dt_clf = DecisionTreeClassifier()

# Create the Bagging Classifier
bg_clf = BaggingClassifier(base_estimator=dt_clf, n_estimators=100, bootstrap=True, max_features=3, oob_score=True)

# Train the model using the training sets y_pred=clf.predict(X_test)
bg_clf.fit(X_train, y_train)

# Test
y_pred = bg_clf.predict(X_test)

print('\n Actual labels in the Test set : ')
print(y_test)

print('\n All predicted labels on the Test set : ')
print(y_pred)

# Model Accuracy, how often is the classifier correct
print('\n Accuracy on the Test set = ')
print(accuracy_score(y_test, y_pred)*100)

print('\n  Confusion Matrix = ')
print(metrics.confusion_matrix(y_test, y_pred))


#-----------------------------------------------
# Prediction on individual test data
pred = bg_clf.predict([[1.5, 0.2, 4.6, 3.1]])
print('\n Individual prediction :')
print(pred)

#-----------------------------------------------

# Feature Names
print('\n Feature Names: ')
print(dataset.feature_names)


### --------------- Model Parameters ---------------

print('\n Features used by the base estimators : ')
print(bg_clf.estimators_features_)

print('\n Out-of-bag score of the base estimators : ')
print(bg_clf.oob_score_)

print('\n Base estimator : ')
print(bg_clf.base_estimator_)




 All predicted labels in the test set : 
[1 1 1 1 1 0 0 0 1 2 1 2 1 2 0 2 1 0 1 0 2 2 0 0 2 1 1 2 2 2 1 2 2 1 0 1 1
 2 1 0 1 2 2 2 2]

 Accuracy = 
93.33333333333333

 Individual prediction :
[2]

 Feature Names: 
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']

 Features used by the base estimators : 
[array([1, 3, 0]), array([0, 2, 3]), array([1, 2, 3]), array([3, 2, 1]), array([3, 1, 2]), array([0, 1, 3]), array([2, 3, 0]), array([0, 1, 2]), array([0, 1, 2]), array([2, 1, 3]), array([0, 1, 2]), array([1, 3, 2]), array([0, 3, 2]), array([2, 1, 0]), array([2, 1, 3]), array([2, 0, 1]), array([2, 1, 0]), array([1, 2, 3]), array([3, 0, 1]), array([3, 0, 1]), array([3, 1, 2]), array([0, 1, 2]), array([2, 1, 3]), array([1, 3, 0]), array([3, 2, 0]), array([1, 2, 0]), array([3, 1, 2]), array([3, 1, 0]), array([2, 3, 0]), array([3, 1, 0]), array([1, 3, 0]), array([3, 2, 0]), array([3, 0, 1]), array([3, 0, 2]), array([2, 1, 3]), array([1, 0, 3]), array([1, 0