In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc
import seaborn as sns
import platform

# seaborn 설정 리셋
sns.reset_defaults()

# 폰트설정
if platform.system() == 'Windows' :
    path = 'c:/Windows/Fonts/malgun.ttf'
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
elif platform.system() == 'Darwin':
    rc('font', family='AppleGothic')
else :
    print('Check your OS System')
    
# 그래프에 마이너스 표시
matplotlib.rcParams['axes.unicode_minus'] = False

In [4]:
wine = pd.read_csv('./data/08_wine.csv')

In [5]:
wine

Unnamed: 0,alcohol,sugar,pH,class
0,9.4,1.9,3.51,0.0
1,9.8,2.6,3.20,0.0
2,9.8,2.3,3.26,0.0
3,9.8,1.9,3.16,0.0
4,9.4,1.9,3.51,0.0
...,...,...,...,...
6492,11.2,1.6,3.27,1.0
6493,9.6,8.0,3.15,1.0
6494,9.4,1.2,2.99,1.0
6495,12.8,1.1,3.34,1.0


In [11]:
# 독립변수, 종속변수 분리
data = wine[['alcohol','sugar','pH']].to_numpy()
target = wine['class'].to_numpy()

In [12]:
print(data.shape, target.shape)

(6497, 3) (6497,)


## 교차검증 - train_test_split() 방식

In [None]:
# 훈련데이터 : 검증데이터 : 테스트데이터로 분리
    # 6 : 2 : 2 비율
    # 일반적으로 많이 사용하는 방식

In [15]:
from sklearn.model_selection import train_test_split
train_input, test_input, train_target, test_target = \
    train_test_split(data, target, test_size= 0.2, random_state=42)

print(train_input.shape, train_target.shape)
print(test_input.shape, test_target.shape)

(5197, 3) (5197,)
(1300, 3) (1300,)


In [26]:
# 훈련데이터 > 검증데이터 생성
from sklearn.model_selection import train_test_split

sub_input, val_input, sub_target, val_target = \
    train_test_split(train_input, train_target, test_size= 0.2, random_state=42)

print(sub_input.shape, sub_target.shape)
print(val_input.shape, val_target.shape)

(4157, 3) (4157,)
(1040, 3) (1040,)


In [27]:
# 결정트리를 통한 훈련 및 정확도 확인
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=42)
dt.fit(sub_input, sub_target)

print(round(dt.score(sub_input, sub_target),4))
print(round(dt.score(val_input, val_target),4))

0.9971
0.8644


In [28]:
print(round(dt.score(test_input, test_target),4))

0.8569


## 교차검증

In [29]:
# 훈련데이터와 검증데이터를 구분할 필요없이 내부적으로 구분해서 사용
    # - 내부적으로 구분에 사용하는 데이터는 훈련(train) 데이터
    # - 데이터 준비는 기존처럼 훈련과 테스트 데이터만 준비
    # - 검증데이터는 별도로 준비하지않아도 됨

In [30]:
# 교차검증에 사용되는 주요 키워드
    # - 3-폴드(fold) 교차검증
        # - 훈련데이터를 세부분으로 나눠서 데이터를 분리하고 수행
        # - 구간마다 훈련데이터를 검증데이터로 바꿔가면서 수행
        # - 명칭 : k-폴드 교차검증 or k-겹 교차검증

    # - 5-폴드 교차검증 또는 10-폴드 교차검증 사용
        # - 훈련데이터로 80%이상을 모두 모델 훈련에 사용가능

In [31]:
# 사용하는 데이터 확인
print(train_input.shape, train_target.shape)
print(test_input.shape, test_target.shape)

(5197, 3) (5197,)
(1300, 3) (1300,)


## 교차검증 모듈 : cross_validate

In [47]:
from sklearn.model_selection import cross_validate

# 첫번쨰 값 : dt = 결정트리 훈련모델(다른 모델을 사용한 경우 해당모델)
# 두번째 값 : 훈련데이터(fold 에서 훈련데이터 분리시 사용)
# 세번째 값 : 검증데이터(fold 에서 검증데이터 분리시 사용)
scores = cross_validate(dt, train_input, train_target)
# print(scores)

# test_score의 평균값 = 최종 훈련모델의 평가점수(정확도)
print('최종 평가점수 :',round(scores['test_score'].mean(),3))

최종 평가점수 : 0.855


## 분할기 : StratifiedKFold

In [64]:
# 훈련데이터를 셔플하거나 폴드의 갯수를 지정할 수 있는 클래스
from sklearn.model_selection import StratifiedKFold

# 교차검증 함수 그대로 사용
# cv : 분할기 속성
    # 분할기 : StratifiedKFold
    # 디폴트 : fold 5, 셔플하지않음
    # n_splits = fold 갯수
    # shuffle = True : 셔플하기
splitter = StratifiedKFold(n_splits=50, shuffle=True, random_state=42)

scores = cross_validate(dt, train_input, train_target,
                        cv = splitter)
print(scores)
print()
print('최종 평가점수 :',round(scores['test_score'].mean(),3))

{'fit_time': array([0.00897717, 0.00698137, 0.00797796, 0.0067749 , 0.00698161,
       0.00695992, 0.00698113, 0.00600457, 0.00596166, 0.00698137,
       0.00601864, 0.00601602, 0.        , 0.        , 0.01773119,
       0.        , 0.        , 0.0177331 , 0.        , 0.01469016,
       0.00708246, 0.00201178, 0.0110898 , 0.        , 0.00903177,
       0.00919676, 0.00099802, 0.00995541, 0.00706506, 0.00205755,
       0.01010585, 0.        , 0.00698209, 0.00598311, 0.00616455,
       0.00598407, 0.00498676, 0.00586557, 0.00404334, 0.00710607,
       0.        , 0.01010704, 0.00808287, 0.00154638, 0.00960684,
       0.        , 0.01107669, 0.00710034, 0.0009675 , 0.0091536 ]), 'score_time': array([0.        , 0.        , 0.0009985 , 0.00096941, 0.00133204,
       0.        , 0.0009985 , 0.00099874, 0.        , 0.00099778,
       0.00052977, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.

## 하이퍼파라미터 튜닝(AutoML)
- 교차검증, 하이퍼파라미터 찾기, 모델훈련을 한번에 자동으로 수행
- 사용패키지 : sklearn.model_selection
- 사용클래스 : GridSearchCV (그리드서치)

In [78]:
# 결정트리의 max_depth(트리의 깊이)값 찾기
# 그리드서치(GridSearchCV) 사용
from sklearn.model_selection import GridSearchCV

# 찾을 하이퍼파라미터값(딕셔너리)
params = {'max_depth' : range(5, 20, 1)}

# 객체(모델)생성
    # 첫번째 값 : 훈련모델
    # 두번째 값 : 하이퍼파라미터 값(딕셔너리)
    # 세번째 값 : CPU 코어 갯수(-1은 모든 코어사용, 병렬처리)
gs = GridSearchCV(DecisionTreeClassifier(random_state=42),
                  params, n_jobs= -1)

# 훈련시키기
gs.fit(train_input, train_target)

GridSearchCV(estimator=DecisionTreeClassifier(random_state=42), n_jobs=-1,
             param_grid={'max_depth': range(5, 20)})

In [80]:
# 가장 좋은 하이퍼파라미터 값
print(gs.best_estimator_)

DecisionTreeClassifier(max_depth=8, random_state=42)


In [88]:
dt = gs.best_estimator_
print(dt.score(train_input,train_target))

0.9003271117952665


In [90]:
print(gs.best_params_)

{'max_depth': 8}


In [93]:
print(gs.cv_results_.keys())

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_max_depth', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])


In [94]:
# params 값 확인하기
print(gs.cv_results_['params'])

[{'max_depth': 5}, {'max_depth': 6}, {'max_depth': 7}, {'max_depth': 8}, {'max_depth': 9}, {'max_depth': 10}, {'max_depth': 11}, {'max_depth': 12}, {'max_depth': 13}, {'max_depth': 14}, {'max_depth': 15}, {'max_depth': 16}, {'max_depth': 17}, {'max_depth': 18}, {'max_depth': 19}]


In [96]:
# 0번째 fold 값 확인하기
print(gs.cv_results_['split0_test_score'])

[0.84711538 0.84807692 0.85769231 0.85288462 0.85769231 0.84519231
 0.85865385 0.86730769 0.86538462 0.86826923 0.86730769 0.86442308
 0.86346154 0.86634615 0.87211538]


In [99]:
# 훈련모델 최종 검증
print(round(dt.score(test_input, test_target),4))

0.8585


In [108]:
params = gs.cv_results_['params']
split_0 = gs.cv_results_['split0_test_score']
split_1 = gs.cv_results_['split1_test_score']
split_2 = gs.cv_results_['split2_test_score']
split_3 = gs.cv_results_['split3_test_score']
split_4 = gs.cv_results_['split4_test_score']

In [120]:
gs_data = [[par, spl_0, spl_1, spl_2, spl_3, spl_4] 
           for par, spl_0, spl_1, spl_2, spl_3, spl_4 
           in zip(params,split_0,split_1,split_2,split_3,split_4)]

In [121]:
gs_df = pd.DataFrame(gs_data, columns=['params','split_0','split_1','split_2','split_3','split_4'])

In [122]:
gs_df

Unnamed: 0,params,split_0,split_1,split_2,split_3,split_4
0,{'max_depth': 5},0.847115,0.863462,0.880654,0.836381,0.861405
1,{'max_depth': 6},0.848077,0.854808,0.873917,0.843118,0.85948
2,{'max_depth': 7},0.857692,0.851923,0.875842,0.843118,0.847931
3,{'max_depth': 8},0.852885,0.854808,0.87873,0.846968,0.865255
4,{'max_depth': 9},0.857692,0.859615,0.87103,0.842156,0.854668
5,{'max_depth': 10},0.845192,0.847115,0.876805,0.846968,0.852743
6,{'max_depth': 11},0.858654,0.850962,0.880654,0.852743,0.848893
7,{'max_depth': 12},0.867308,0.848077,0.882579,0.848893,0.848893
8,{'max_depth': 13},0.865385,0.853846,0.873917,0.853705,0.839269
9,{'max_depth': 14},0.868269,0.851923,0.870067,0.858518,0.841193


## 하이퍼 파라미터가 여러개일 경우 

In [124]:
params = {'min_impurity_decrease' : np.arange(0.0001, 0.001, 0.0001),
          'max_depth' : range(5, 20, 1),
          'min_samples_split' : range(2, 100, 10)}

# 그리드서치 훈련시키기
gs = GridSearchCV(DecisionTreeClassifier(random_state=42),
                  params, n_jobs= -1)
gs.fit(train_input, train_target)

GridSearchCV(estimator=DecisionTreeClassifier(random_state=42), n_jobs=-1,
             param_grid={'max_depth': range(5, 20),
                         'min_impurity_decrease': array([0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008,
       0.0009]),
                         'min_samples_split': range(2, 100, 10)})

In [126]:
print(gs.best_estimator_)

DecisionTreeClassifier(max_depth=14, min_impurity_decrease=0.0004,
                       min_samples_split=12, random_state=42)


In [128]:
print(round(gs.score(train_input, train_target),4))
print(round(gs.score(test_input, test_target),4))

0.8921
0.8615


In [192]:
# 10-fold 설정하기
splitter = StratifiedKFold(n_splits=7, shuffle=True, random_state=42)
params = {'min_impurity_decrease' : np.arange(0.0001, 0.001, 0.0001),
          'max_depth' : range(5, 20, 1),
          'min_samples_split' : range(2, 100, 10)}

# 그리드서치 훈련시키기
gs = GridSearchCV(DecisionTreeClassifier(random_state=42),
                  params, cv=splitter, n_jobs= -1)
gs.fit(train_input, train_target)

GridSearchCV(cv=StratifiedKFold(n_splits=7, random_state=42, shuffle=True),
             estimator=DecisionTreeClassifier(random_state=42), n_jobs=-1,
             param_grid={'max_depth': range(5, 20),
                         'min_impurity_decrease': array([0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008,
       0.0009]),
                         'min_samples_split': range(2, 100, 10)})

In [195]:
print(gs.cv_results_.keys())

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_max_depth', 'param_min_impurity_decrease', 'param_min_samples_split', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'split5_test_score', 'split6_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])


In [197]:
print(gs.cv_results_['std_test_score'])

[0.01017227 0.01040485 0.01040485 ... 0.00522866 0.0051469  0.0051469 ]


In [191]:
print(np.mean(gs.cv_results_['mean_test_score']))
print(round(gs.score(test_input, test_target),4))

0.8618127815668565
0.8692


## 랜덤서치(RandomSearch)

In [202]:
# 랜덤서치 사용시 만족해야할 조건
    # - 하이퍼파라미터 값이 수치형 데이터
    # - 범위나 간격을 미리 정하기 어려울 경우
    # - 너무많은 매개변수가 있어서 그리드서치 수행시간이 오래 걸리는 경우
    # - 매개변수를 샘플링 할 때 확률 분포 객체를 전달

# 사용 모듈 : uniform(실수값), randint(정수값)
# 사용 클래스 : RandomizedSearchCV

In [136]:
from scipy.stats import uniform, randint

In [147]:
# 정수값 범위설정
rgen = randint(0, 10)
rgen

# 범위 내에서 임의값 출력하기
print(rgen.rvs(100))

[2 4 0 8 5 6 3 3 9 8 6 0 4 8 8 1 6 7 2 2 5 1 4 6 2 2 7 8 6 0 9 5 5 1 1 4 3
 5 4 8 4 7 8 9 5 8 6 7 9 7 3 1 5 8 0 5 2 8 3 8 6 9 3 5 3 1 5 2 5 6 4 1 9 4
 8 7 5 1 9 6 8 6 7 8 5 4 8 9 2 5 3 7 5 2 9 1 7 0 4 4]


In [146]:
# 실수값 범위 설정
ugen = uniform(0,1)
print(ugen.rvs(30))

[0.63474093 0.44271537 0.08996715 0.82724314 0.77572931 0.97708884
 0.34421523 0.29005376 0.2022359  0.32371915 0.32801225 0.98443104
 0.54457463 0.23865335 0.35167341 0.50667483 0.88032154 0.97557024
 0.1666297  0.62907859 0.02471782 0.95735602 0.62287428 0.88882906
 0.14915696 0.44093374 0.34441166 0.37687735 0.45865891 0.69360819]


In [172]:
# 하이퍼파라미터 찾을 매개변수 정의하기
params = {'min_impurity_decrease' : uniform(0.0001, 0.001),
          'max_depth' : randint(20, 50),
          'min_samples_split' : randint(2, 25)}

In [198]:
# 클래스 호출
from sklearn.model_selection import RandomizedSearchCV

# 그리드서치와 동일하게 사용
# 결정트리모델, 훈련반복횟수 100회, 코어전체, 시드 42
rs = RandomizedSearchCV(DecisionTreeClassifier(random_state=42),
                        params, n_iter=100 ,n_jobs=-1)
rs.fit(train_input, train_target)

RandomizedSearchCV(estimator=DecisionTreeClassifier(random_state=42),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'max_depth': range(5, 20),
                                        'min_impurity_decrease': array([0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008,
       0.0009]),
                                        'min_samples_split': range(2, 100, 10)})

In [199]:
print(rs.best_estimator_)

DecisionTreeClassifier(max_depth=16, min_impurity_decrease=0.0004,
                       min_samples_split=22, random_state=42)


In [181]:
print(rs.cv_results_.keys())

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_max_depth', 'param_min_impurity_decrease', 'param_min_samples_split', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])


In [200]:
print(len(rs.cv_results_['mean_test_score']))

100


In [187]:
print(round(max(rs.cv_results_['mean_test_score']),4))
print(round(rs.score(test_input, test_target),4))

0.8695
0.8638
