## Decision Trees

In [1]:
import re
import sys
# Python ≥3.5 is required
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
# Compare the numeric (major, minor) components. A plain string comparison is
# lexicographic and would wrongly accept e.g. "0.9" as being >= "0.20".
assert tuple(map(int, re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups())) >= (0, 20)

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Figures are written to <project root>/images/<chapter id>/
CHAPTER_ID = "decision_trees"
PROJECT_ROOT_DIR = "."
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
# Create the output directory up front; an already existing one is not an error
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure inside ``IMAGES_PATH``.

    Args:
        fig_id: Base file name without extension; also printed in the log line.
        tight_layout: When true, ``plt.tight_layout()`` is called before saving.
        fig_extension: File extension, also passed to ``savefig`` as ``format``.
        resolution: Passed to ``savefig`` as ``dpi``.
    """
    file_name = fig_id + "." + fig_extension
    destination = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(destination, format=fig_extension, dpi=resolution)

In [14]:
from sklearn.datasets import load_iris

iris = load_iris()
# Class labels, one integer per sample
y = iris.target
# Keep only the feature columns from index 2 onward
# (petal length and petal width in scikit-learn's iris column order)
X = iris.data[:, 2:]

In [15]:
# Peek at one sample: its two selected feature values and its class label
X[1],y[1]

(array([1.4, 0.2]), 0)

In [16]:
from sklearn.tree import DecisionTreeClassifier
# Shallow tree (at most two levels of splits). random_state is fixed because
# scikit-learn breaks ties between equally good splits at random, so the fitted
# tree could otherwise differ from one run to the next.
tree_cls = DecisionTreeClassifier(max_depth=2, random_state=42)

In [17]:
# Train the classifier on the current X (features) and y (labels)
tree_cls.fit(X,y)

In [18]:
# Classify one hand-written sample; values are given in the same column order as X
tree_cls.predict([[4,1.4]])

array([1])

In [19]:
# Classify a second hand-written sample (same column order as X)
tree_cls.predict([[3,2.5]])


array([2])

In [2]:
from sklearn.datasets import make_moons

In [4]:
# NOTE(review): the exercise statement below is a bare string literal, so the cell
# echoes it back as its output; a Markdown cell would be the more idiomatic home.
""". Train and fine-tune a Decision Tree for the moons dataset.
a. Generate a moons dataset using make_moons(n_samples=10000, noise=0.4).
b. Split it into a training set and a test set using train_test_split().
c. Use grid search with cross-validation (with the help of the GridSearchCV
class) to find good hyperparameter values for a DecisionTreeClassifier.
Hint: try various values for max_leaf_nodes.
d. Train it on the full training set using these hyperparameters, and measure
your model’s performance on the test set. You should get roughly 85% to 87%
accuracy."""


'. Train and fine-tune a Decision Tree for the moons dataset.\na. Generate a moons dataset using make_moons(n_samples=10000, noise=0.4).\nb. Split it into a training set and a test set using train_test_split().\nc. Use grid search with cross-validation (with the help of the GridSearchCV\nclass) to find good hyperparameter values for a DecisionTreeClassifier.\nHint: try various values for max_leaf_nodes.\nd. Train it on the full training set using these hyperparameters, and measure\nyour model’s performance on the test set. You should get roughly 85% to 87%\naccuracy.'

In [5]:
# Generate the noisy two-moons data set. random_state makes the cell idempotent:
# without it the points depend on how much the global NumPy RNG was used before.
X, y = make_moons(n_samples=10000, noise=0.4, random_state=42)

In [6]:
# Peek at one generated point and its label
X[1],y[1]

(array([ 0.12454026, -0.42477546]), 0)

In [7]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing. random_state fixes the shuffle so the
# same rows land in the test set on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
# Base estimator; random_state keeps tie-breaking between equally good splits
# identical across candidates and across runs.
dtcls = DecisionTreeClassifier(random_state=42)
# Search grid: 98 values of max_leaf_nodes x 3 values of min_samples_split
params = {'max_leaf_nodes': list(range(2, 100)), 'min_samples_split': [2, 3, 4]}
# 3-fold cross-validated exhaustive search over the grid
Gridcls = GridSearchCV(dtcls, params, verbose=1, cv=3)

In [9]:
# Run the cross-validated search on the training set (one fit per candidate and fold)
Gridcls.fit(X_train,y_train)

Fitting 3 folds for each of 294 candidates, totalling 882 fits


In [10]:
# Predict test-set labels through the fitted search object; with GridSearchCV's
# default refit=True this delegates to the best estimator refit on the whole
# training set.
ypred=Gridcls.predict(X_test)

In [11]:
from sklearn.metrics import accuracy_score
# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy happens to be
# symmetric, but keeping the documented order avoids mistakes with other metrics.
print(accuracy_score(y_test, ypred))

0.86


In [12]:
# Display the estimator (with its hyperparameters) that the grid search selected
Gridcls.best_estimator_

## Regression

In [13]:
# Synthetic regression data: a parabola centred at x = 0.5 plus Gaussian noise
np.random.seed(42)
m = 200
# m uniform samples in [0, 1), stored as a column vector
X = np.random.rand(m, 1)
# Targets: Gaussian noise scaled by 1/10, added to 4 * (x - 0.5)^2
y = np.random.randn(m, 1) / 10 + 4 * (X - 0.5) ** 2

In [14]:
from sklearn.tree import DecisionTreeRegressor
# Regression tree restricted to a maximum depth of 2
dtreg=DecisionTreeRegressor(max_depth=2)

In [15]:
# Train the regressor on the current X (inputs) and y (targets)
dtreg.fit(X,y)

In [16]:
# Peek at one training input and its target value
X[1],y[1]

(array([0.95071431]), array([0.83579891]))

In [18]:
# Predict the target for a single input value, x = 0.6
dtreg.predict([[0.6]])

array([0.11063973])

In [19]:
# Predict for x = 0.7; the recorded output equals the one for x = 0.6, which is
# what a tree gives when both inputs end up in the same leaf.
dtreg.predict([[0.7]])

array([0.11063973])