In [None]:
import pathlib
import requests

import graphviz
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets, metrics, model_selection
from sklearn import pipeline, preprocessing, tree

# Decision Trees

[Decision Trees](https://scikit-learn.org/stable/modules/tree.html) are a non-parametric supervised learning method used for [classification](https://scikit-learn.org/stable/modules/tree.html#tree-classification) and [regression](https://scikit-learn.org/stable/modules/tree.html#tree-regression). The goal is to create a model that predicts the value of a target variable by learning simple decision rules inferred from the data features. A tree can be seen as a piecewise constant approximation.

## Training and Visualizing a Decision Tree

In [None]:
# Ask for pandas containers (as_frame=True) so later cells can use column names.
diabetes_data = datasets.load_diabetes(as_frame=True)

In [None]:
# Show the description text that ships with the dataset.
print(diabetes_data.DESCR)

In [None]:
# Split the loaded bunch into the feature table and the value to predict.
features_df, target = diabetes_data.data, diabetes_data.target

In [None]:
# Preview the first rows of the feature table.
features_df.head()

In [None]:
# Display the regression target.
target

In [None]:
# Depth-limited tree with a fixed seed; the following cells export and render it.
regressor = tree.DecisionTreeRegressor(
    random_state=42,
    max_depth=2,
)
_ = regressor.fit(X=features_df, y=target)

In [None]:
# Write the fitted tree to a Graphviz DOT file, labelling splits with the
# DataFrame's column names.
_ = tree.export_graphviz(
    regressor,
    feature_names=features_df.columns,
    filled=True,
    rounded=True,
    out_file="diabetes-tree.dot",
)

In [None]:
# Load the exported DOT file; as the cell's last expression it is displayed.
graphviz.Source.from_file("diabetes-tree.dot")

In [None]:
# Print the built-in help for the object stored in the fitted estimator's `tree_`.
help(regressor.tree_)

## Making Predictions

In [None]:
# Fit a tree with no depth or leaf-size limits. The seed matches the one used
# for the first tree in this notebook so a fresh "Run All" rebuilds the same model.
regressor = tree.DecisionTreeRegressor(random_state=42)
_ = regressor.fit(features_df, target)

# Predict on the very data the tree was fitted to: this is the training error,
# not an estimate of performance on unseen data.
_predictions = regressor.predict(features_df)

metrics.root_mean_squared_error(
    target,
    _predictions,
)

## Regularization

In [None]:
# Seeded so that the cross-validation scores are the same on every run.
regressor = tree.DecisionTreeRegressor(random_state=42)

# 5-fold CV on all cores. The scorer returns the *negated* RMSE (scikit-learn
# scorers are "greater is better"), so the values in `cv_scores` are <= 0.
cv_scores = model_selection.cross_val_score(
    regressor,
    features_df,
    target,
    cv=5,
    n_jobs=-1,
    scoring="neg_root_mean_squared_error"
)

In [None]:
# Per-fold scores from the 5-fold CV above (negated RMSE).
cv_scores

In [None]:
# Flip the sign back and average over the folds to get a plain RMSE.
(-cv_scores).mean()

## Exercise

Regularize the decision tree using grid search with 5-fold CV to choose the best hyperparameters.

### Solution

In [None]:
# Print the estimator's documentation to look up its hyperparameters.
# Uses plain-Python `help()` (as done earlier for `regressor.tree_`) instead of
# the IPython-only `?` suffix, so the cell is valid Python when exported.
help(tree.DecisionTreeRegressor)

In [None]:
# Candidate regularization settings: first vary the depth limit on its own,
# then vary the minimum leaf size with the depth left unlimited.
param_grid = [
    dict(max_depth=[1, 2, 4, 8, None]),
    dict(max_depth=[None], min_samples_leaf=[1, 0.01, 0.05, 0.1]),
]

# 5-fold grid search on all cores, scored with the negated RMSE.
grid_search_cv = model_selection.GridSearchCV(
    estimator=regressor,
    param_grid=param_grid,
    scoring="neg_root_mean_squared_error",
    cv=5,
    n_jobs=-1,
)

In [None]:
# Display the configured search object (it is fitted in the next cell).
grid_search_cv

In [None]:
# Run the search; assigning to `_` keeps the fitted object's repr out of the output.
_ = grid_search_cv.fit(X=features_df, y=target)

In [None]:
# Best mean CV score, with the sign flipped back from negated RMSE to RMSE.
-grid_search_cv.best_score_

In [None]:
# Hyperparameter combination that produced the best CV score.
grid_search_cv.best_params_

## Exercise

Fit a decision tree regressor to the following dataset. Select an appropriate scoring metric and evaluate the performance of your regressor using cross-validation. Is your regressor under-fitting? Over-fitting? Tune the regularization hyperparameters to improve the performance of your regressor.

In [None]:
# Read the train and test CSVs of the California housing sample data.
train_df = pd.read_csv("./sample_data/california_housing_train.csv")
test_df = pd.read_csv("./sample_data/california_housing_test.csv")

# Separate the label column from the predictors (training split only).
target = train_df["median_house_value"]
features_df = train_df.drop(columns="median_house_value")


### Solution



In [None]:

# define and fit your pipeline
# (seeded so that every re-run of the notebook grows the same tree and the
# scores printed below are reproducible)
regression_pipeline = pipeline.make_pipeline(
    tree.DecisionTreeRegressor(random_state=42),
)
_ = regression_pipeline.fit(features_df, target)

# assess training performance
# (predictions are made on the same rows the pipeline was fitted to)
_predictions = regression_pipeline.predict(features_df)
training_score = metrics.root_mean_squared_error(
    target,
    _predictions,
)
print(f"Training RMSE score {training_score}")

# assess validation performance using cv
# (the scorer returns negated RMSE, so the sign is flipped before averaging)
validation_scores = model_selection.cross_val_score(
    regression_pipeline,
    features_df,
    target,
    cv=5,
    n_jobs=-1,
    scoring="neg_root_mean_squared_error"
)
print(f"Validation RMSE score {np.mean(-validation_scores)}")


In [None]:
# fine-tune regularization hyperparameters
param_grid = [
    {
      "max_depth": [1, 2, 4, 8, 16, None],
    },
    {
      "max_depth": [None],
      "min_samples_leaf": [1, 0.01, 0.05, 0.1]
    },
    {
      # One entry per feature column, by position (1 = prediction may not
      # decrease as the feature grows, 0 = unconstrained).
      # NOTE(review): positions must line up with features_df.columns --
      # confirm the 1s land on the intended columns.
      "monotonic_cst": [np.array([0, 0, 0, 1, 1, 0, 0, 1])],
    },
]

# Search over a fresh, seeded estimator created right here instead of whatever
# `regressor` happens to hold from an earlier section: the cell no longer
# depends on hidden notebook state, and the result is reproducible.
# The parameter names are unchanged, so `best_params_` keeps the same keys.
grid_search_cv = model_selection.GridSearchCV(
    tree.DecisionTreeRegressor(random_state=42),
    param_grid,
    cv=5,
    n_jobs=-1,
    scoring="neg_root_mean_squared_error"
)

_ = grid_search_cv.fit(features_df, target)
# best_score_ is a negated RMSE; flip the sign to report a plain RMSE.
fine_tuned_tree_score = -grid_search_cv.best_score_

print(f"Fine-tuned RMSE score {fine_tuned_tree_score}")


In [None]:
# Hyperparameter combination that produced the best CV score in the search above.
grid_search_cv.best_params_

## Understanding Feature Importance

One of the nice features of decision trees is that they provide a way to measure the importance of each feature. Understanding feature importance is a topic all unto itself. If you are interested in pulling this thread, then I recommend that you start with [SHapley Additive Explanations (SHAP)](https://shap.readthedocs.io/en/latest/index.html) and then take a look through [*Interpretable Machine Learning*](https://christophm.github.io/interpretable-ml-book/).

In [None]:
# Fit a tree on the current features/target to read its feature importances.
# Seeded so the importances shown in the next cell are the same on every run.
regressor = tree.DecisionTreeRegressor(random_state=42)
_ = regressor.fit(
    features_df,
    target
)

In [None]:
# Importance of each feature, labelled by column name, largest first.
(
    pd.Series(
        data=regressor.feature_importances_,
        index=features_df.columns,
        name="feature_importance",
    )
    .sort_values(ascending=False)
)

## Exercise

Grow a forest by following these steps:

1. Continuing the previous exercise, generate 1,000 subsets of the training set, each containing 80% of the training instances selected randomly. Hint: you can use Scikit-Learn’s ShuffleSplit class for this.

2. Train one decision tree on each subset, using the best hyperparameter values found in the previous exercise. Evaluate these 1,000 decision trees on the test set. Since they were trained on smaller sets, these decision trees will likely perform worse than the first decision tree.


3. Now comes the magic. For each test set instance, generate the predictions of the 1,000 decision trees, and average the predictions together to compute the overall prediction.

4. Evaluate these predictions on the test set: you should obtain a slightly lower error than your first model. Congratulations, you have trained a random forest regressor!