In [5]:
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

from sklearn.tree import export_graphviz
import graphviz

# Utilities for hyperparameter optimization (HPO)
from sklearn.model_selection import train_test_split, GridSearchCV

In [6]:
# Load the wine dataset
wine = load_wine()

In [7]:
# Build a DataFrame from the feature matrix (one column per feature name)
# and append the class labels as the 'target' column.
df = pd.DataFrame(wine.data, columns=wine.feature_names).assign(target=wine.target)

In [8]:
# Model training: separate the target column from the feature columns.
y = df["target"]
X = df.drop(columns="target")

In [9]:
# Split into training and test sets: 20% of the rows are held out for testing,
# the remaining 80% are used for training.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,  # fixed seed so the split is the same on every run
)

In [14]:
# Baseline: hyperparameters chosen by hand (no search), fitted in one step.
# NOTE(review): the "mannual" spelling is kept as-is because these names may be
# referenced by other cells and the printed label matches the recorded output.
clf_mannual = DecisionTreeClassifier(
    criterion="entropy",
    splitter="random",
    max_depth=3,
    min_samples_split=3,
    min_samples_leaf=1,
    random_state=4,
).fit(X_train, y_train)

# Accuracy of the hand-tuned tree on the test split
y_pred_mannual = clf_mannual.predict(X_test)
accuracy_mannual = accuracy_score(y_true=y_test, y_pred=y_pred_mannual)
print("accuracy_mannual:", accuracy_mannual)

accuracy_mannual: 0.8611111111111112


In [23]:
# Hyperparameter tuning
# Restrict the candidate values that the grid search may try for each
# hyperparameter; every combination of these lists is one candidate.

param_grid = dict(
    criterion=["gini", "entropy"],
    max_depth=list(range(2, 6)),  # 2, 3, 4, 5
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

In [24]:
# Hyperparameter optimization (HPO) and fitting
clf_grid = DecisionTreeClassifier(random_state=42)

# Core step: search over param_grid, scoring each candidate with cv=5
# cross-validation.
grid_search = GridSearchCV(estimator=clf_grid, param_grid=param_grid, cv=5)

# fit() performs the search for the best hyperparameters and the fitting itself.
grid_search.fit(X_train, y_train)

print("Best Hyper-parameter", grid_search.best_params_)
print("Best Score", grid_search.best_score_)

Best Hyper-parameter {'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best Score 0.9224137931034484


In [None]:
# Accuracy on the test split of the model selected by the grid search
best_model = grid_search.best_estimator_
y_pred_grid = best_model.predict(X_test)

accuracy_grid = accuracy_score(y_true=y_test, y_pred=y_pred_grid)
print("Accuracy Grid:", accuracy_grid)


Accuracy Grid: 0.9444444444444444


In [27]:
# IPython help lookup: displays GridSearchCV's signature and docstring.
# Interactive aid only; it can be dropped from a finished notebook.
GridSearchCV?

Init signature:
GridSearchCV(
    estimator,
    param_grid,
    *,
    scoring=None,
    n_jobs=None,
    refit=True,
    cv=None,
    verbose=0,
    pre_dispatch='2*n_jobs',
    error_score=nan,
    return_train_score=False,
)
Docstring:
Exhaustive search over specified parameter values for an estimator.

Important members are fit, predict.

GridSearchCV implements a "fit" and a "score" method.
It also implements "score_samples", "predict", "predict_proba",
"decision_function", "transform" and "inverse_transform" if they are
implemented in the estimator used.

The parameters of the estimator used to apply these methods are optimized
by cross-validated grid-search over a parameter grid.

Read more in the :ref:`User Guide <grid_search>`.

Parameters
----------
estimator : estimator object
    This is assumed to implement the scikit-learn 