In [3]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier

# Load the wine dataset (downloaded over HTTP, so this step needs network access).
wine = pd.read_csv(
    'https://raw.githubusercontent.com/rickiepark/hg-mldl/master/wine.csv'
)

# Target is the 'class' column; the features are every remaining column.
y = wine['class'].to_numpy()
X = wine.drop(columns=['class']).to_numpy()

# First split: hold out 20% of the rows as the test set.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=50,
)

# Second split: carve 20% of the remaining rows off as a validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.2,
    random_state=50,
)

# Candidate values the randomized search samples from.
params = {
    'min_impurity_decrease': np.arange(0.0001, 0.1000, 0.0001),
    'max_depth': range(5, 100),
    'min_samples_split': range(2, 100),
}

# Base estimator and the cross-validation scheme (10 shuffled, stratified
# folds); every random_state is pinned to 50 so the run is reproducible.
dt_cls = DecisionTreeClassifier(splitter='best', random_state=50)
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=50)

# Evaluate 100 sampled parameter combinations on the training split only.
gs = RandomizedSearchCV(
    estimator=dt_cls,
    param_distributions=params,
    n_iter=100,
    cv=splitter,
    n_jobs=-1,
    random_state=50,
)
gs.fit(X_train, y_train)

# Best parameter combination found by the search.
print(gs.best_params_)

# Print the observed minimum and maximum of each input feature, one per line.
for _feature in ('alcohol', 'sugar', 'pH'):
    print(wine[_feature].min(), wine[_feature].max())

# Keep the estimator that the search selected as best.
dt = gs.best_estimator_

# Cross-validated score achieved by the best parameter combination.
print(gs.best_score_)

# Create the model folder, then persist the best estimator into it.
# joblib.dump does not create missing parent directories, so on a fresh
# checkout without ./model the dump would fail with FileNotFoundError.
model_filename = './model/wine.joblib'
Path(model_filename).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(dt, model_filename)

{'min_samples_split': 74, 'min_impurity_decrease': 0.0008, 'max_depth': 22}
8.0 14.9
0.6 65.8
2.72 4.01
0.8633584337349397


['./model/wine.joblib']