# Decision Tree

> - 분류와 회귀에 사용되는 지도학습 방식
> - 데이터의 특성으로부터 추론된 결정 규칙을 통해 값을 예측한다.
> - if-then-else 결정 규칙을 통해 데이터를 학습한다.
> - 트리의 깊이가 깊을수록 복잡한 모델이 된다.

> ***parameter***
>- ***max_depth***: 트리의 최대 깊이  
>- ***max_leaf_nodes***: 리프 노드의 최대 개수  
>- ***min_samples_leaf***: 리프 노드가 가져야 하는 최소 샘플 수

In [65]:
import warnings
warnings.filterwarnings(action='ignore')
import numpy as np
import pandas as pd
import graphviz
import multiprocessing
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris, load_boston
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import make_pipeline

## Classification dataset(iris)

In [34]:
# Load the iris dataset and view it as a DataFrame: one column per feature,
# with the label appended as a trailing "target" column.
iris = load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(
    target=iris.target
)
iris_df  # last expression of the cell, so the notebook renders the frame

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


## Regression dataset(boston)

In [37]:
# Load the Boston housing dataset and view it as a DataFrame: one column per
# feature, with the regression target appended as a trailing "target" column.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell only runs on older releases.
boston = load_boston()
boston_df = pd.DataFrame(boston.data, columns=boston.feature_names).assign(
    target=boston.target
)
boston_df  # last expression of the cell, so the notebook renders the frame

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0


# DecisionTreeClassifier

> 두 개의 배열 X, y를 입력받는다.  
> - X는 [n_samples, n_features] 크기의 데이터 특성 배열  
> - y는 [n_samples] 크기의 정답 배열

## iris (전처리 X)

In [39]:
# Baseline classifier with default hyper-parameters, evaluated on the raw
# (unscaled) iris features.
iris_X = DecisionTreeClassifier()

In [40]:
# Score the unscaled tree on each of 5 cross-validation folds, using one
# worker per CPU reported by multiprocessing.
cross_val_score(
    iris_X,
    iris.data,
    iris.target,
    cv=5,
    n_jobs=multiprocessing.cpu_count(),
)

array([0.96666667, 0.96666667, 0.9       , 0.96666667, 1.        ])

## iris (전처리 O)

In [41]:
# Same default classifier, but with the features passed through
# StandardScaler first.
iris_O = make_pipeline(StandardScaler(), DecisionTreeClassifier())

In [42]:
# Score the scaled pipeline on the same 5-fold setup as the unscaled tree.
#
# Observation: the scores with and without preprocessing do not differ much.
# A decision tree learns splitting rules, so it is not strongly affected by
# this kind of preprocessing.
cross_val_score(
    iris_O,
    iris.data,
    iris.target,
    cv=5,
    n_jobs=multiprocessing.cpu_count(),
)

array([0.96666667, 0.96666667, 0.9       , 0.93333333, 1.        ])

## iris GridSearchCV

In [48]:
# Show the labels in their current row order (before the shuffle below).
iris_df["target"].to_numpy()

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [63]:
# Shuffle every row (frac=1 draws 100% of them); the fixed seed makes the
# resulting order repeatable.
iris_df = iris_df.sample(frac = 1, random_state = 24)

In [64]:
# Show the labels again to confirm the row order is now mixed.
iris_df["target"].to_numpy()

array([1, 2, 1, 1, 0, 0, 0, 0, 2, 1, 1, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 2,
       2, 0, 0, 2, 0, 1, 0, 0, 1, 0, 1, 0, 2, 2, 2, 0, 1, 2, 1, 2, 0, 2,
       1, 1, 1, 1, 1, 0, 1, 2, 2, 1, 0, 2, 1, 2, 1, 2, 1, 0, 0, 1, 0, 1,
       0, 0, 2, 0, 0, 0, 1, 2, 1, 0, 2, 2, 1, 2, 0, 1, 1, 1, 0, 1, 2, 2,
       1, 1, 1, 2, 2, 1, 0, 2, 0, 1, 2, 2, 2, 0, 1, 2, 0, 1, 2, 2, 0, 0,
       1, 0, 1, 2, 1, 2, 1, 2, 0, 0, 0, 1, 0, 1, 0, 0, 1, 2, 1, 1, 2, 1,
       0, 1, 0, 1, 2, 0, 0, 2, 2, 2, 2, 0, 0, 1, 2, 2, 1, 0])

In [60]:
# Hold out 20% of iris as a test set; the seed fixes the split.
# NOTE(review): this splits iris.data / iris.target directly, not the
# shuffled iris_df built earlier.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    random_state=24,
)

In [76]:
# Candidate tree depths: the even values 2, 4, ..., 30.
params = {"max_depth": range(2, 31, 2)}

# Exhaustive search over the grid with 5-fold CV, one worker per CPU.
gs = GridSearchCV(
    DecisionTreeClassifier(),
    params,
    cv=5,
    n_jobs=multiprocessing.cpu_count(),
    verbose=True,
)

# Fit on the training split only; the held-out test split is not touched here.
gs.fit(X_train, y_train)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


GridSearchCV(cv=5, estimator=DecisionTreeClassifier(), n_jobs=4,
             param_grid={'max_depth': range(2, 31, 2)}, verbose=True)

In [77]:
# Keep a handle on the best estimator found by the grid search.
iris_model = gs.best_estimator_

# Report the selected parameters and the corresponding best CV score.
print("best parameter : ", gs.best_params_)
print("best score : ", gs.best_score_)

best parameter :  {'max_depth': 18}
best score :  0.9416666666666667


# DecisionTreeRegressor

## boston (전처리 X)

In [66]:
# Baseline regressor with default hyper-parameters, evaluated on the raw
# (unscaled) Boston features.
boston_X = DecisionTreeRegressor()

In [67]:
# Score the unscaled regressor on each of 5 cross-validation folds, using one
# worker per CPU reported by multiprocessing.
# NOTE(review): with an integer cv and a regressor, scikit-learn is documented
# to use KFold without shuffling, so folds follow row order. The strongly
# negative last-fold score in the recorded output may reflect that; a shuffled
# KFold is worth trying. Confirm before relying on these numbers.
cross_val_score(
    boston_X,
    boston.data,
    boston.target,
    cv=5,
    n_jobs=multiprocessing.cpu_count(),
)

array([ 0.61980814,  0.42177526,  0.61948656,  0.39424626, -1.83849454])

## boston (전처리 O)

In [69]:
# Same default regressor, but with the features passed through
# StandardScaler first.
boston_O = make_pipeline(StandardScaler(), DecisionTreeRegressor())

In [70]:
# Score the scaled pipeline on the same 5-fold setup as the unscaled
# regressor, so the two result arrays can be compared fold by fold.
cross_val_score(
    boston_O,
    boston.data,
    boston.target,
    cv=5,
    n_jobs=multiprocessing.cpu_count(),
)

array([ 0.61310853,  0.43565063,  0.61987598,  0.36359459, -1.46502223])

## boston GridSearchCV

In [80]:
# Hold out 20% of Boston as a test set; the seed fixes the split.
# This rebinds X_train / X_test / y_train / y_test, replacing the iris split.
X_train, X_test, y_train, y_test = train_test_split(
    boston.data,
    boston.target,
    test_size=0.2,
    random_state=24,
)

In [73]:
# Print the class documentation to look up the available hyper-parameters.
help(DecisionTreeRegressor)

Help on class DecisionTreeRegressor in module sklearn.tree._classes:

class DecisionTreeRegressor(sklearn.base.RegressorMixin, BaseDecisionTree)
 |  DecisionTreeRegressor(*, criterion='squared_error', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, ccp_alpha=0.0)
 |  
 |  A decision tree regressor.
 |  
 |  Read more in the :ref:`User Guide <tree>`.
 |  
 |  Parameters
 |  ----------
 |  criterion : {"squared_error", "friedman_mse", "absolute_error",             "poisson"}, default="squared_error"
 |      The function to measure the quality of a split. Supported criteria
 |      are "squared_error" for the mean squared error, which is equal to
 |      variance reduction as feature selection criterion and minimizes the L2
 |      loss using the mean of each terminal node, "friedman_mse", which uses
 |      mean squared error with Friedman's improv

In [81]:
# Candidate grid: even depths 2, 4, ..., 30 crossed with leaf-node caps 2..30.
params = {
    "max_depth": range(2, 31, 2),
    "max_leaf_nodes": range(2, 31),
}

# Exhaustive search over the grid with 5-fold CV, one worker per CPU.
gs = GridSearchCV(
    DecisionTreeRegressor(),
    params,
    cv=5,
    n_jobs=multiprocessing.cpu_count(),
    verbose=True,
)

# Fit on the training split only; the held-out test split is not touched here.
gs.fit(X_train, y_train)

Fitting 5 folds for each of 435 candidates, totalling 2175 fits


GridSearchCV(cv=5, estimator=DecisionTreeRegressor(), n_jobs=4,
             param_grid={'max_depth': range(2, 31, 2),
                         'max_leaf_nodes': range(2, 31)},
             verbose=True)

In [82]:
# Keep a handle on the best estimator found by the grid search.
boston_model = gs.best_estimator_

# Report the selected parameters and the corresponding best CV score.
print("best parameter : ", gs.best_params_)
print("best score : ", gs.best_score_)

best parameter :  {'max_depth': 20, 'max_leaf_nodes': 28}
best score :  0.7833195552280539
