In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the wine dataset from the rickiepark/hg-mldl GitHub repository.
# Requires network access when the cell is run.
wine = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/rickiepark/hg-mldl/master/wine.csv'
)

In [3]:
# Feature matrix: the alcohol, sugar and pH columns as a NumPy array.
wine_input = wine.loc[:, ['alcohol', 'sugar', 'pH']].to_numpy()
# Target vector: the `class` column as a NumPy array.
wine_target = wine.loc[:, 'class'].to_numpy()

## 훈련세트와 테스트세트 분리

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as the final test set; the fixed seed makes the
# split reproducible.
# NOTE(review): the split is not stratified on wine_target - consider passing
# stratify=wine_target if class proportions should be preserved.
X_train, X_test, y_train, y_test = train_test_split(
    wine_input,
    wine_target,
    test_size=0.2,
    random_state=42,
)

## 훈련세트에서 검증세트 분리

In [5]:
# Carve a validation set (20%) out of the training set so that the test set
# stays untouched while models are compared. train_test_split is already
# imported by the train/test split cell above, so it is not re-imported here.
sub_input, val_input, sub_target, val_target = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)


In [6]:
# Sanity check: (n_samples, n_features) of the sub-training and validation sets.
(sub_input.shape, val_input.shape)

((4157, 3), (1040, 3))

## 모델 구축(Decision Tree Classifier)

In [7]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree with default hyperparameters on the sub-training set
# (seed fixed for reproducibility); fit() returns the estimator itself.
dt = DecisionTreeClassifier(random_state=42).fit(sub_input, sub_target)

# Accuracy on the sub-training set, then on the validation set, one per line.
# A large gap between the two indicates overfitting.
print(dt.score(sub_input, sub_target), dt.score(val_input, val_target), sep='\n')

0.9971133028626413
0.864423076923077


## 교차 검증(k-fold cross validation)

In [8]:
from sklearn.model_selection import cross_validate

# Pass the estimator to evaluate together with the WHOLE training set:
# cross_validate builds its own train/validation folds, so no separate
# validation set should be carved out beforehand.
# With no `cv` argument it performs 5-fold cross-validation.
scores = cross_validate(estimator=dt, X=X_train, y=y_train)
scores

{'fit_time': array([0.00598335, 0.00600696, 0.00499797, 0.004987  , 0.00598359]),
 'score_time': array([0.00099874, 0.        , 0.        , 0.        , 0.        ]),
 'test_score': array([0.86923077, 0.84615385, 0.87680462, 0.84889317, 0.83541867])}

In [9]:
# Average validation accuracy over the cross-validation folds.
scores['test_score'].mean()

0.855300214703487

In [10]:
from sklearn.model_selection import StratifiedKFold

# Pass the splitter explicitly through `cv`. A default StratifiedKFold() gives
# the same mean score as the earlier call that passed no `cv` argument.
scores = cross_validate(estimator=dt, X=X_train, y=y_train, cv=StratifiedKFold())
scores['test_score'].mean()

0.855300214703487

In [11]:
# 10-fold stratified cross-validation: shuffle the samples before splitting,
# with a fixed seed so that the folds are reproducible.
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_validate(estimator=dt, X=X_train, y=y_train, cv=splitter)
scores['test_score'].mean()

0.8574181117533719

## 그리드 서치

In [19]:
from sklearn.model_selection import GridSearchCV

# Base estimator whose hyperparameters are tuned (seed fixed for reproducibility).
model = DecisionTreeClassifier(random_state=2022)

# Exhaustive grid: 25 x 20 x 19 = 9,500 candidates, each evaluated on 3 folds.
params = {
    'max_depth': list(range(2, 26)) + [None],   # 2..25, then None (no depth limit)
    'min_samples_leaf': list(range(1, 21)),     # 1..20
    'min_samples_split': list(range(2, 21)),    # 2..20
}

# refit=True retrains the best candidate on the full training set and exposes
# it as gs.best_estimator_; n_jobs=-1 runs the search on all available cores.
gs = GridSearchCV(estimator=model, param_grid=params, cv=3, refit=True, n_jobs=-1)
gs.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=DecisionTreeClassifier(random_state=2022),
             n_jobs=-1,
             param_grid={'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                                       14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
                                       24, 25, None],
                         'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                              12, 13, 14, 15, 16, 17, 18, 19,
                                              20],
                         'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                               12, 13, 14, 15, 16, 17, 18, 19,
                                               20]})

In [20]:
# Take the estimator that GridSearchCV refit with the best parameter combination.
# NOTE: this rebinds `model`, which until now held the unfitted base estimator.
model = gs.best_estimator_

# Accuracy on the training set, then on the held-out test set, one per line.
print(model.score(X_train, y_train), model.score(X_test, y_test), sep='\n')

0.8885895709062921
0.8569230769230769


In [21]:
# Hyperparameter combination that achieved the highest mean cross-validated score.
print(gs.best_params_)

{'max_depth': 9, 'min_samples_leaf': 17, 'min_samples_split': 2}


In [22]:
# Mean cross-validated score of the best hyperparameter combination.
print(gs.best_score_)

0.8716573448349235


In [23]:
# Cross-check: the highest per-candidate mean test score equals gs.best_score_.
gs.cv_results_['mean_test_score'].max()

0.8716573448349235

In [26]:
# Rebuild the tuned tree from the grid-search result instead of hand-copying
# the winning values, so this cell cannot drift out of sync when the grid, the
# data or the seed changes. gs.best_params_ holds the max_depth,
# min_samples_leaf and min_samples_split chosen by the search.
model = DecisionTreeClassifier(random_state=2022, **gs.best_params_)
model.fit(X_train, y_train)

# Training and test accuracy; these should match the scores obtained from
# gs.best_estimator_, which was refit with the same parameters and seed.
print(model.score(X_train, y_train))
print(model.score(X_test, y_test))

0.8885895709062921
0.8569230769230769
