# СЕМИНАР. Деревья решений для задачи регрессии

---

Папулин С.Ю. (papulin.study@yandex.ru)

### Contents

1. [Загрузка набора данных](#Загрузка-набора-данных)
2. [Дерево решений](#Дерево-решений)
3. [Выбор лучших параметров](#Выбор-лучших-параметров)
4. [Ансамбли деревьев](#Ансамбли-деревьев)

In [None]:
import time

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from sklearn.utils import shuffle

In [None]:
from sklearn.model_selection import (
    train_test_split, 
    GridSearchCV, 
    KFold, 
    cross_validate
)
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
from sklearn.tree import plot_tree

In [None]:
from sklearn.tree import DecisionTreeRegressor

from sklearn.ensemble import (
    BaggingRegressor,
    RandomForestRegressor,
    ExtraTreesRegressor,
    GradientBoostingRegressor
)

## Загрузка набора данных

Данные были использованы в работе: Harrison, D. and Rubinfeld, D.L. "Hedonic prices and the demand for clean air", J. Environ. Economics & Management, vol.5, 81-102, 1978.

Данные на основе: 1970 U.S. Census

Признаки:
1. `CRIM`: per capita crime rate by town
2. `ZN`: proportion of residential land zoned for lots over 25,000 sq.ft.
3. `INDUS`: proportion of non-retail business acres per town
4. `CHAS`: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
5. `NOX`: nitric oxides concentration (parts per 10 million)
6. `RM`: average number of rooms per dwelling
7. `AGE`: proportion of owner-occupied units built prior to 1940
8. `DIS`: weighted distances to five Boston employment centres
9. `RAD`: index of accessibility to radial highways
10. `TAX`: full-value property-tax rate per \$10,000
11. `PTRATIO`: pupil-teacher ratio by town
12. `B`: 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
13. `LSTAT`: % lower status of the population
14. `MEDV`: Median value of owner-occupied homes in \$1000’s


Категории признаков:

1. Характеристики жилья: `RM` и `AGE`
2. Район: `CRIM`, `ZN`, `INDUS`, `CHAS`, `TAX`, `PTRATIO`, `B` и `LSTAT`
3. Доступность: `DIS` и `RAD`
4. Загрязненность: `NOX`

Целевое значение: `MEDV`

In [None]:
# Seed passed as random_state to the split and the estimators below.
RANDOM_STATE = 12345

In [None]:
# Path to the data set CSV (relative to the notebook's working directory).
DATA_FILE = "../data/boston-house-price.csv"

In [None]:
# Column names assigned to the CSV when it is loaded; the last one, "MEDV",
# is later used as the regression target.
CLMNS = [
    "CRIM",
    "ZN",
    "INDUS",
    "CHAS",
    "NOX",
    "RM",
    "AGE",
    "DIS",
    "RAD",
    "TAX",
    "PTRATIO",
    "B",
    "LSTAT",
    "MEDV"
]

In [None]:
# Load the data; header=None means no row of the file is treated as a header,
# and the column names are taken from CLMNS.
df_house = pd.read_csv(DATA_FILE, header=None, names=CLMNS)
# Show the first rows as a quick sanity check.
df_house.head()

In [None]:
# Summary statistics for every column.
df_house.describe()

## Дерево решений

In [None]:
# Build the feature matrix and the target vector.

# Name of the target column.
target_clmn = "MEDV"

# Feature columns: every column except the target, in their original order.
all_feature_clmns = [clmn for clmn in CLMNS if clmn != target_clmn]

# Feature matrix and target vector as NumPy arrays.
y = df_house[target_clmn].to_numpy()
X = df_house[all_feature_clmns].to_numpy()

# Optional explicit shuffle (left disabled):
# X, y = shuffle(X, y, random_state=RANDOM_STATE)

# Hold out 30% of the rows as the test subset.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=RANDOM_STATE,
)

In [None]:
# Training.
# The split criterion is left at its default, which is mean squared error.
# The literal "mse" was deprecated in scikit-learn 1.0 and removed in 1.2
# (renamed to "squared_error"), so relying on the default works on every version.
tick = time.time()
dtr_model = DecisionTreeRegressor(max_depth=3, random_state=RANDOM_STATE)
dtr_model.fit(X_train, y_train)
print("Time =", time.time()-tick)

# Predictions for the held-out test subset.
y_test__pred = dtr_model.predict(X_test)

# Evaluation on the test subset.
r2_dtr_model = dtr_model.score(X_test, y_test)
mse_dtr_model = mean_squared_error(y_test, y_test__pred)
mae_dtr_model = mean_absolute_error(y_test, y_test__pred)

print("R^2 =", r2_dtr_model)
print("MSE =", mse_dtr_model)
print("MAE =", mae_dtr_model)

In [None]:
# Draw the fitted decision tree; nodes are labelled with the feature names.
plt.figure(figsize=[14, 4])
plot_tree(dtr_model, filled=True, feature_names=all_feature_clmns)
plt.show()

In [None]:
# Feature importances of the fitted tree, in the order of all_feature_clmns.
feature_importances = dtr_model.feature_importances_
feature_importances

In [None]:
def plot_feature_importances(feature_importances, feature_names, figsize=(4, 6)):
    """Draw a horizontal bar chart of feature importances.

    Both sequences are reversed before plotting, so the first feature ends up
    at the top of the chart.

    Args:
        feature_importances: Sequence of importance values, one per feature.
            Must support slicing (e.g. a list or a NumPy array).
        feature_names: Sequence of feature labels, in the same order and of
            the same length as ``feature_importances``.
        figsize: Figure size passed to ``plt.figure``.

    Raises:
        ValueError: If the two sequences have different lengths.
    """
    if len(feature_importances) != len(feature_names):
        raise ValueError("Vectors have different dimensions.")

    positions = range(len(feature_names))

    plt.figure(figsize=figsize)
    plt.barh(positions, width=feature_importances[::-1], height=0.5, color="green")
    plt.yticks(positions, feature_names[::-1], rotation="horizontal")
    # tick_params expects booleans; the legacy "on"/"off" strings are only
    # truthy values, so "off" would not actually hide the labels.
    plt.tick_params(labelbottom=True, labeltop=True)
    plt.xlabel("Feature Importances")
    plt.grid(True)

    plt.show()


def plot_prediction_vs_true(y_true, y_pred):
    """Scatter true target values against predictions with a y = x reference line.

    Args:
        y_true: True target values (vertical axis).
        y_pred: Predicted values (horizontal axis).
    """
    plt.scatter(y_pred, y_true, color="slategrey")
    # Remember the x-limits chosen by the scatter so the reference line
    # does not widen the axis.
    xlim = plt.gca().get_xlim()
    plt.plot(xlim, xlim, '--', color="grey")
    plt.xlim(xlim)
    # Predictions are denoted with a hat; a bar would denote the mean.
    plt.xlabel("$\\hat{y}$")
    plt.ylabel("$y$")
    plt.grid(True)
    plt.show()

In [None]:
# Bar chart of the importances computed for the depth-limited tree.
plot_feature_importances(feature_importances, all_feature_clmns)

In [None]:
# True test targets against the most recent test predictions.
plot_prediction_vs_true(y_test, y_test__pred)

## Выбор лучших параметров

In [None]:
# Number of cross-validation folds.
SPLITS = 5

In [None]:
# Cross-validation splitter: SPLITS folds, no shuffling before splitting.
kf = KFold(n_splits=SPLITS, shuffle=False)

### Параметры дерева решений

In [None]:
# Parameter grid: tree depth 5..20 in steps of 5, minimum leaf size 100..10
# in steps of 10.
parameters = {
    "max_depth": np.arange(5, 21, 5),
    "min_samples_leaf": np.arange(100, 4, -10)
}

# The criterion is left at its default (mean squared error): the literal "mse"
# was removed in scikit-learn 1.2. A fixed random_state makes tie-breaking
# between equally good splits, and hence the search result, reproducible.
model = DecisionTreeRegressor(random_state=RANDOM_STATE)

# Training: exhaustive search over the grid with cross-validation.
tick = time.time()
grid_search = GridSearchCV(estimator=model, param_grid=parameters, cv=kf)
grid_search.fit(X_train, y_train)
print("Time =", time.time()-tick)

In [None]:
# TODO: plot

In [None]:
# Parameter combination selected by the grid search.
grid_search.best_params_

In [None]:
# Estimator refitted by the grid search with the best parameters.
best_model = grid_search.best_estimator_
best_model

In [None]:
# Evaluate the tuned tree on the held-out test subset.
y_test__pred = best_model.predict(X_test)

mae_dtr_model = mean_absolute_error(y_test, y_test__pred)
mse_dtr_model = mean_squared_error(y_test, y_test__pred)
r2_dtr_model = best_model.score(X_test, y_test)

# Report the metrics in a fixed order: R^2, MSE, MAE.
for metric_name, metric_value in (
    ("R^2", r2_dtr_model),
    ("MSE", mse_dtr_model),
    ("MAE", mae_dtr_model),
):
    print(metric_name + " =", metric_value)

In [None]:
# Feature importances of the tuned tree.
plot_feature_importances(best_model.feature_importances_, all_feature_clmns)

In [None]:
# True test targets against the most recent test predictions.
plot_prediction_vs_true(y_test, y_test__pred)

### Подрезка дерева решений

In [None]:
# TODO: control complexity parameter ccp_alpha that was introduced in version 0.22 of sklearn

## Ансамбли деревьев

### Оценка качества с отложенной выборкой

In [None]:
# Number of trees in every ensemble below.
NUM_TREES = 100

# Base learner: an unrestricted regression tree with the default (MSE)
# criterion. The literal "mse" was removed in scikit-learn 1.2.
model = DecisionTreeRegressor()

# Training.
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in 1.4),
# while the first positional slot is the same in both.
tick = time.time()
bagging = BaggingRegressor(model, n_estimators=NUM_TREES, max_samples=1.0,
                           max_features=1.0, bootstrap=True, bootstrap_features=False,
                           oob_score=False, random_state=RANDOM_STATE)
bagging.fit(X_train, y_train)
print("Time =", time.time()-tick)

# Predictions for the held-out test subset.
y_test__pred = bagging.predict(X_test)

# Evaluation on the test subset.
r2_bagging = bagging.score(X_test, y_test)
mse_bagging = mean_squared_error(y_test, y_test__pred)
mae_bagging = mean_absolute_error(y_test, y_test__pred)

# Same reporting order as the single-tree cells: R^2, MSE, MAE.
print("R^2 =", r2_bagging)
print("MSE =", mse_bagging)
print("MAE =", mae_bagging)

In [None]:
# True test targets against the most recent test predictions.
plot_prediction_vs_true(y_test, y_test__pred)

### Оценка качества посредством кросс-валидации

In [None]:
# Number of cross-validation folds.
SPLITS = 5

# Cross-validation splitter: consecutive folds, no shuffling.
# NOTE(review): X and y used below are in file order, so unshuffled folds may
# not be representative - consider shuffle=True with a fixed random_state.
kf = KFold(n_splits=SPLITS, shuffle=False)

# Wrap a fresh tree instead of reusing the current `model`: rebinding `model`
# to an ensemble built around itself would nest a BaggingRegressor inside a
# BaggingRegressor when this cell is executed a second time.
# The base learner is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2.
model = BaggingRegressor(
    DecisionTreeRegressor(),
    n_estimators=NUM_TREES,
    random_state=RANDOM_STATE,
)

tick = time.time()
scores = cross_validate(model, X, y, cv=kf,
                        scoring=["neg_mean_squared_error", "neg_mean_absolute_error", "r2"])
dt = time.time()-tick

# The error scorers return negated values, hence the absolute value of the
# mean; the standard deviation is unaffected by the sign.
mse_avg = np.abs(scores["test_neg_mean_squared_error"].mean())
mse_std = scores["test_neg_mean_squared_error"].std()
r2_avg = scores["test_r2"].mean()
r2_std = scores["test_r2"].std()
mae_avg = np.abs(scores["test_neg_mean_absolute_error"].mean())
mae_std = scores["test_neg_mean_absolute_error"].std()

# Report mean +/- two standard deviations across folds.
print("Time \t= {:0.5f}s".format(dt))
print("CI MSE \t= {:0.3f} +/- {:0.3f}".format(mse_avg, mse_std * 2.0))
print("CI R^2 \t= {:0.3f} +/- {:0.3f}".format(r2_avg, r2_std * 2.0))
print("CI MAE \t= {:0.3f} +/- {:0.3f}\n".format(mae_avg, mae_std * 2.0))

### Выбор модели/алгоритма посредством кросс-валидации

#### Использование `cross_validate`

In [None]:
# Candidate ensembles. The split criterion is left at its default (mean
# squared error): the literal "mse" was removed in scikit-learn 1.2.
models = [
#     BaggingRegressor(
#         DecisionTreeRegressor(),
#         n_estimators=NUM_TREES,
#         random_state=RANDOM_STATE
#     ),
    RandomForestRegressor(
        n_estimators=NUM_TREES,
        max_features="sqrt",
        random_state=RANDOM_STATE
    ),
    ExtraTreesRegressor(
        n_estimators=NUM_TREES,
        bootstrap=True,
        max_features="sqrt",
        random_state=RANDOM_STATE
    ),
    GradientBoostingRegressor(
        n_estimators=NUM_TREES,
        random_state=RANDOM_STATE
    )
]


# Track the candidate with the lowest mean cross-validated MSE.
min_mse = float("inf")
best_model = None

for model in models:

    tick = time.time()
    scores = cross_validate(model, X_train, y_train, cv=kf,
                            return_train_score=True,
                            scoring=["neg_mean_squared_error", "neg_mean_absolute_error", "r2"])
    dt = time.time()-tick

    # The error scorers return negated values, hence the absolute value.
    mse_avg = np.abs(scores["test_neg_mean_squared_error"].mean())
    mse_std = scores["test_neg_mean_squared_error"].std()
    r2_avg = scores["test_r2"].mean()
    r2_std = scores["test_r2"].std()
    mae_avg = np.abs(scores["test_neg_mean_absolute_error"].mean())
    mae_std = scores["test_neg_mean_absolute_error"].std()

    if mse_avg < min_mse:
        min_mse = mse_avg
        best_model = model

    # Report mean +/- two standard deviations across folds.
    print("{}".format(model.__class__.__name__))
    print("\tTime \t= {:0.5f}s".format(dt))
    print("\tCI MSE \t= {:0.3f} +/- {:0.3f}".format(mse_avg, mse_std * 2.0))
    print("\tCI R^2 \t= {:0.3f} +/- {:0.3f}".format(r2_avg, r2_std * 2.0))
    print("\tCI MAE \t= {:0.3f} +/- {:0.3f}\n".format(mae_avg, mae_std * 2.0))


print("Best model:", best_model.__class__.__name__)

# Refit the selected model on the whole training subset.
best_model.fit(X_train, y_train)

if hasattr(best_model, "feature_importances_"):
    print("Feature Importances:")
    plot_feature_importances(best_model.feature_importances_, all_feature_clmns)

y_test__pred = best_model.predict(X_test)

# Evaluation on the test subset.
r2_best_model = best_model.score(X_test, y_test)
mse_best_model = mean_squared_error(y_test, y_test__pred)
mae_best_model = mean_absolute_error(y_test, y_test__pred)

print("Test set:")
print("\tR^2 =", r2_best_model)
print("\tMSE =", mse_best_model)
print("\tMAE =", mae_best_model)

plot_prediction_vs_true(y_test, y_test__pred)

#### Использование `GridSearchCV`

In [None]:
# Single-step pipeline; the "model" step is a placeholder that the grid
# search swaps for each candidate estimator.
pipeline = Pipeline(steps=[("model", models[0])])

# Candidate models.

# General form (one grid entry per model):
# parameters = [
#     { "model": [models[0], ], "model__n_estimators": [50, 100, 200] },
#     { "model": [models[1], ], "model__n_estimators": [50, 100, 200] }
# ]

# Compact form for models that share the same parameter names.
parameters = [
    dict(model=models, model__n_estimators=[50, 100, 200]),
]

# Model search setup: several metrics are recorded, and the final refit
# uses the best negated-MSE score.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    cv=kf,
    scoring=["neg_mean_squared_error", "neg_mean_absolute_error", "r2"],
    refit="neg_mean_squared_error",
)

In [None]:
# Search for the best model on the training subset.
grid_search.fit(X_train, y_train)

In [None]:
def print_scoring(cv_results):
    """Print cross-validation results for every evaluated parameter set.

    For each entry of ``cv_results["params"]`` this prints the model class
    name with its ``model__n_estimators`` value, the mean fit time multiplied
    by the module-level ``SPLITS``, and the mean +/- two standard deviations
    of the test MSE, R^2 and MAE.

    Args:
        cv_results: Mapping shaped like ``GridSearchCV.cv_results_`` for a
            search scored with "neg_mean_squared_error",
            "neg_mean_absolute_error" and "r2".
    """
    for idx, params in enumerate(cv_results["params"]):
        print("{} ({})".format(params["model"].__class__.__name__,
                               params["model__n_estimators"]))

        total_fit_time = SPLITS * cv_results["mean_fit_time"][idx]
        print("\tTime \t= {:0.5f}s".format(total_fit_time))

        # Error scores are stored negated, so flip the sign of the mean.
        mse_mean = -cv_results["mean_test_neg_mean_squared_error"][idx]
        mse_spread = cv_results["std_test_neg_mean_squared_error"][idx] * 2.0
        print("\tCI MSE \t= {:0.3f} +/- {:0.3f}".format(mse_mean, mse_spread))

        r2_mean = cv_results["mean_test_r2"][idx]
        r2_spread = cv_results["std_test_r2"][idx] * 2.0
        print("\tCI R^2 \t= {:0.3f} +/- {:0.3f}".format(r2_mean, r2_spread))

        mae_mean = -cv_results["mean_test_neg_mean_absolute_error"][idx]
        mae_spread = cv_results["std_test_neg_mean_absolute_error"][idx] * 2.0
        print("\tCI MAE \t= {:0.3f} +/- {:0.3f}".format(mae_mean, mae_spread))

In [None]:
# Cross-validation scores for every evaluated model/parameter combination.
print_scoring(grid_search.cv_results_)

In [None]:
# Best model / parameter combination found by the search.
print(grid_search.best_params_)

# Feature importances of the refitted best model.
best_model = grid_search.best_estimator_.named_steps["model"]
if hasattr(best_model, "feature_importances_"):
    print("Feature Importances:")
    plot_feature_importances(best_model.feature_importances_, all_feature_clmns)

# Predictions of the refitted best pipeline for the test subset. They are
# recomputed here so the plot below shows this model rather than whatever
# an earlier cell left in y_test__pred.
y_test__pred = grid_search.predict(X_test)

# Metric values on the test subset.
# MSE is evaluated with the scorer on the test data; best_score_ is the mean
# cross-validation score and must not be reported as a test-set metric.
# The error scorers return negated values, hence the leading minus.
print("Test set:")
# Time is the duration of the final refit on the training subset.
print("\tTime = {:0.5f}s".format(grid_search.refit_time_))
print("\tR^2 =", grid_search.scorer_['r2'](grid_search, X_test, y_test))
print("\tMSE =", -grid_search.scorer_['neg_mean_squared_error'](grid_search, X_test, y_test))
print("\tMAE =", -grid_search.scorer_['neg_mean_absolute_error'](grid_search, X_test, y_test))
plot_prediction_vs_true(y_test, y_test__pred)