In [13]:
import os
import joblib
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV, GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# 데이터 불러오기
wine = pd.read_csv('https://raw.githubusercontent.com/rickiepark/hg-mldl/master/wine.csv')
X = wine.drop(['class'], axis=1).values
y = wine['class'].values

# 데이터 분리
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=50)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=50)

# 하이퍼파라미터 범위 지정 (RandomizedSearch용)
param_dist = {
    'min_impurity_decrease': np.arange(0.0001, 0.01, 0.0005),
    'max_depth': range(5, 30, 2),
    'min_samples_split': range(2, 30, 2),
    'min_samples_leaf': range(1, 20, 2),
    'splitter': ['best', 'random'],
    'class_weight': ['balanced', None]
}

splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=50)
dt_cls = DecisionTreeClassifier(random_state=50)

# 1단계) RandomizedSearchCV로 넓게 탐색
random_search = RandomizedSearchCV(
    dt_cls, param_dist, n_iter=30, cv=splitter, n_jobs=-1, random_state=50
)
random_search.fit(X_train, y_train)
best_random_params = random_search.best_params_
print('[RandomizedSearch 최적 파라미터]')
print(best_random_params)

# 2단계) GridSearchCV로 정밀 탐색 (최적값 근처로 범위 좁힘)
param_grid = {
    'min_impurity_decrease': [best_random_params['min_impurity_decrease'] - 0.0002,
                              best_random_params['min_impurity_decrease'],
                              best_random_params['min_impurity_decrease'] + 0.0002],
    'max_depth': [best_random_params['max_depth']-1,
                  best_random_params['max_depth'],
                  best_random_params['max_depth']+1],
    'min_samples_split': [best_random_params['min_samples_split']-1,
                          best_random_params['min_samples_split', 6],
                          best_random_params['min_samples_split']+1],
    'min_samples_leaf': [best_random_params['min_samples_leaf']-1,
                         best_random_params['min_samples_leaf'],
                         best_random_params['min_samples_leaf']+1],
    'splitter': [best_random_params['splitter']],
    'class_weight': [best_random_params['class_weight']]
}
# 범위에서 0 이하 값이 들어가면 오류나니, 실제 실행 전에 수동으로 체크/수정

# 음수/0 방지
for k in ['min_impurity_decrease','max_depth','min_samples_split','min_samples_leaf']:
    param_grid[k] = [x for x in param_grid[k] if (type(x) == float and x >= 0) or (type(x) == int and x >= 1)]

grid_search = GridSearchCV(
    dt_cls, param_grid, cv=splitter, n_jobs=-1
)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
print("[GridSearch 최적 파라미터]")
print(grid_search.best_params_)
print("[CV 최고 점수]", grid_search.best_score_)

# 변수별 min/max값 출력(데이터 체크)
print("alcohol:", wine['alcohol'].min(), "~", wine['alcohol'].max())
print("sugar:", wine['sugar'].min(), "~", wine['sugar'].max())
print("pH:", wine['pH'].min(), "~", wine['pH'].max())

# 모델 저장(폴더 생성후 저장)
os.makedirs('./model', exist_ok=True)
model_filename = './model/wine_optimized.joblib'
joblib.dump(best_model, model_filename)
print('최적화 모델이 저장되었습니다:', model_filename)

[RandomizedSearch 최적 파라미터]
{'splitter': 'best', 'min_samples_split': 20, 'min_samples_leaf': 17, 'min_impurity_decrease': 0.0016, 'max_depth': 13, 'class_weight': None}


KeyError: ('min_samples_split', 6)