# Examples of using Boosted Decision Trees with scikit-learn 

Using BDTs in sklearn:  
1. BDT regressor
2. BDT classifier

This notebook shows the interface for setting up, training, and applying a BDT model.

# Decision Tree Regression with AdaBoost


A decision tree is boosted using the AdaBoost algorithm on a 1D
sinusoidal dataset with a small amount of Gaussian noise.
A boosted regressor with 299 boosts (300 decision trees) is compared with a single
decision tree regressor. The boosted regressor can fit more detail than the
single tree.

In [None]:
# Adapted from Noel Dawe <noel.dawe@gmail.com>
#
# License: BSD 3 clause

# First: import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor

# Build a small 1D toy dataset: inputs X and targets y that the models should describe

# A fixed seed makes the generated noise, and hence the results, repeatable
rng = np.random.RandomState(1)

# 100 evenly spaced x-axis values from 0 to 6, stored as a single column
# (one row per sample, one column per feature)
X = np.linspace(0, 6, 100).reshape(-1, 1)
print("Type of X is ", type(X))
print("X = ", X)

# For each x value the target is the sum of two sine functions
# plus Gaussian noise (mean 0, standard deviation 0.1) drawn from rng
x_flat = X.ravel()
noise = rng.normal(0, 0.1, x_flat.shape[0])
y = np.sin(x_flat) + np.sin(6 * x_flat) + noise
print("Type of y is ", type(y))
print("y = ", y)

In [None]:
# Take a first look at the generated data points before fitting anything
plt.scatter(X, y)
plt.xlabel("x value (data input)")
plt.ylabel("y value (target)")

In [None]:
# Define, train and apply two regression models on the same data

# regr_1: a single decision tree with a maximum depth of 4
regr_1 = DecisionTreeRegressor(max_depth=4)

# regr_2: a boosted decision tree (AdaBoost) built from up to 300
# individual trees, each with a maximum depth of 4
regr_2 = AdaBoostRegressor(
    DecisionTreeRegressor(max_depth=4),
    n_estimators=300,
    random_state=rng,
)

# Training is a single fit() call per model - that is all!
regr_1.fit(X, y)
regr_2.fit(X, y)

# predict() takes every row of X and returns the y value the trained model
# assigns to it; here the models are evaluated on their own training inputs
y_1, y_2 = regr_1.predict(X), regr_2.predict(X)

# Draw the single-tree prediction on top of the training points
plt.figure()
plt.scatter(X, y, c="k", label="training sample")
plt.plot(X, y_1, c="g", label="n_estimators=1", linewidth=2)
plt.title("Boosted Decision Tree Regression")
plt.xlabel("x value (data input)")
plt.ylabel("y value (target)")
plt.legend()
plt.show()


In [None]:
# Overlay the AdaBoost BDT prediction (red) on the single-tree prediction (green)
# so the two models can be compared against the same training points
plt.figure()
plt.scatter(X, y, c="k", label="training sample")
plt.plot(X, y_1, c="g", label="n_estimators=1", linewidth=2)
plt.plot(X, y_2, c="r", label="n_estimators=300", linewidth=2)
plt.title("Boosted Decision Tree Regression")
plt.xlabel("x value (data input)")
plt.ylabel("y value (target)")
plt.legend()
plt.show()

In [None]:
# The BDT gives a better description than the simple depth=4 single tree.
# What if we instead allow the single tree a much larger depth (max_depth=300)?

regr_3 = DecisionTreeRegressor(max_depth=300)

# Train the deep tree

regr_3.fit(X, y)

# Use the model to predict values on the training inputs
y_3 = regr_3.predict(X)


# Plot the prediction of the deep single tree on top of the training points
plt.figure()
plt.scatter(X, y, c="k", label="training sample")
plt.plot(X, y_3, c="b", label="n_estimators=1", linewidth=2)

plt.xlabel("x value (data input)")
plt.ylabel("y value (target)")
# Read the depth from the model so the title always matches the actual setting
plt.title(f"Decision Tree Regression for max depth = {regr_3.max_depth}")
plt.legend()
plt.show()

In [None]:
# How does this regressor do when we give it some data that it has not seen before?

# Shift X values by 0.1 compared to the training sample
X2 = np.linspace(0.1, 6.1, 100)[:, np.newaxis]

# New target values from the same function, with different random noise
y2 = np.sin(X2).ravel() + np.sin(6 * X2).ravel() + rng.normal(0, 0.1, X2.shape[0])

# Predict on the unseen inputs (note: this overwrites the earlier y_3)
y_3 = regr_3.predict(X2)


# Plot the prediction of the deep single tree against the unseen data.
# Only one figure is opened, so no empty extra figure is produced.
plt.figure()
plt.scatter(X2, y2, c="k", label="test sample (unseen data)")
plt.plot(X2, y_3, c="b", label="n_estimators=1", linewidth=2)

plt.xlabel("x value (data input)")
plt.ylabel("y value (target)")
# Read the depth from the model so the title always matches the actual setting
plt.title(f"Decision Tree Regression for max depth = {regr_3.max_depth}")
plt.legend()
plt.show()

# Example of a BDT classifier

__First, load the libraries we need__

In [None]:
# Load libraries
from sklearn.ensemble import AdaBoostClassifier

# Import train_test_split function
from sklearn.model_selection import train_test_split
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

__Load the "iris" dataset that comes with Scikit learn__

In [None]:
# sklearn comes with some example data sets
from sklearn import datasets

# Load the "iris" data set that ships with scikit-learn
iris = datasets.load_iris()

# Summarise the contents: number of objects, feature names, class names,
# followed by the raw feature values and the class label of every object
print(f"Iris data set contains {len(iris.data)} objects (different flowers)")
print("Each object has 4 features:", iris.feature_names)
print("and belongs in one of three classes:", iris.target_names)
print("Feature data: \n", iris.data)
print("Object classes:\n", iris.target)

# Features and class labels under the X / y names used for training below
X, y = iris.data, iris.target



In [None]:
# Scatter plot of feature 0 against feature 1, one colour per flower class
fig, ax = plt.subplots()
scatter = ax.scatter(iris.data[:, 0], iris.data[:, 1], c=iris.target)
ax.set_xlabel(iris.feature_names[0])
ax.set_ylabel(iris.feature_names[1])
# One legend entry per class, labelled with the class names
class_handles = scatter.legend_elements()[0]
ax.legend(class_handles, iris.target_names, loc="lower right", title="Classes")
ax.set_title("3 types of flowers; features 0, 1")

In [None]:
# Scatter plot of feature 2 against feature 3, one colour per flower class
fig, ax = plt.subplots()
scatter = ax.scatter(iris.data[:, 2], iris.data[:, 3], c=iris.target)
ax.set_xlabel(iris.feature_names[2])
ax.set_ylabel(iris.feature_names[3])
# One legend entry per class, labelled with the class names
class_handles = scatter.legend_elements()[0]
ax.legend(class_handles, iris.target_names, loc="lower right", title="Classes")
ax.set_title("3 types of flowers; features 2, 3")

In [None]:
# Split dataset into training set (70%) and test set (30%).
# A fixed random_state makes the shuffle, and therefore the split,
# identical on every run, in line with the seeded regression example
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

In [None]:
# Create the AdaBoost classifier object.
# n_estimators=1 means the "ensemble" consists of a single weak learner
abc = AdaBoostClassifier(n_estimators=1, learning_rate=1)

# Train the AdaBoost classifier on the training part of the data;
# the value returned by fit() is kept as `model` and used for predictions
model = abc.fit(X_train, y_train)



In [None]:
# Predict the class of every object in the test dataset
y_pred = model.predict(X_test)

# Model accuracy: how often is the classifier correct?
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# True classes followed by predicted classes, for a side-by-side comparison
print(y_test)
print(y_pred)

In [None]:
# It is interesting to see which classes of flowers the BDT gets right and wrong.
# The "confusion matrix" counts, for each true class (row),
# how often each class was predicted (column)
confusion = metrics.confusion_matrix(y_test, y_pred)
print(confusion)

# Class names, in the same order as the rows/columns of the matrix
print(iris.target_names)