# Model Selection 모듈 

- train/Test 데이터를 분리하지 않고 머신러닝 수행

In [5]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [6]:
iris = load_iris()
dt_clf = DecisionTreeClassifier()
dt_clf.fit(iris.data, iris.target)

DecisionTreeClassifier()

In [9]:
pred = dt_clf.predict(iris.data)
accuracy_score(iris.target,pred)

1.0

- cross_validate() method

In [12]:
from sklearn.model_selection import cross_validate

In [16]:
dtc= DecisionTreeClassifier()
cross_validate(dtc, iris.data, iris.target)

{'fit_time': array([0.00133657, 0.0009985 , 0.00112247, 0.00099707, 0.00100255]),
 'score_time': array([0.00065351, 0.00099707, 0.00087214, 0.00099874, 0.00100183]),
 'test_score': array([0.96666667, 0.96666667, 0.9       , 0.96666667, 1.        ])}

In [17]:
dtc= DecisionTreeClassifier()
cross_validate(dtc, iris.data, iris.target, return_train_score =True) #옵션을 바꾸면 이렇게 나옴 why 과적합 방지를 위해 

{'fit_time': array([0.00099754, 0.00099969, 0.00099659, 0.        , 0.00100017]),
 'score_time': array([0.00099707, 0.00099254, 0.        , 0.        , 0.00099564]),
 'test_score': array([0.96666667, 0.96666667, 0.9       , 0.96666667, 1.        ]),
 'train_score': array([1., 1., 1., 1., 1.])}

- train_test_split

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
X_train, X_test, y_train,y_test = train_test_split(
    iris.data, iris.target, train_size=0.2, random_state=2021
)

In [21]:
import pandas as pd 
pd.Series(y_test).value_counts()

0    41
1    41
2    38
dtype: int64

- Stratifide 분리

In [22]:
X_train, X_test, y_train,y_test = train_test_split(
    iris.data, iris.target, train_size=0.2, random_state=2021,
    stratify=iris.target #비율을 맞춰주기 stratify
)
pd.Series(y_test).value_counts

<bound method IndexOpsMixin.value_counts of 0      2
1      1
2      1
3      1
4      2
      ..
115    1
116    2
117    1
118    0
119    1
Length: 120, dtype: int32>

- cross_val_score() method

In [23]:
from sklearn.model_selection import cross_val_score

In [24]:
# 성능 지표는 정확도(accuracy), 교차검증 세트는 5개 
cross_val_score(dtc,iris.data, iris.target, cv =5 )

array([0.96666667, 0.96666667, 0.9       , 0.96666667, 1.        ])

In [None]:
import numpy as np 
scores = cross_val_score(dtc, iris.data, iris.target, cv = 5)

### GridSearchCV
- 교차 검증과 최적 하이퍼 파라메터 튜닝을 한꺼번에 수행

In [25]:
dtc = DecisionTreeClassifier()

In [26]:
# parameter를 dictionary형태로 설정 
params = {
    'max_depth':[2,3,4,5],
    'min_samples_split': [2,3]
}

In [27]:
from sklearn.model_selection import GridSearchCV
grid_dtc = GridSearchCV(dtc,param_grid=params, cv=3) #cv교차검정의 갯수 3개 #이런식으로 처리를.... 

In [28]:
grid_dtc.fit(X_train,y_train)

GridSearchCV(cv=3, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': [2, 3, 4, 5],
                         'min_samples_split': [2, 3]})

In [29]:
grid_dtc.cv_results_

{'mean_fit_time': array([0.00066018, 0.00033251, 0.00066551, 0.00034356, 0.        ,
        0.        , 0.00066479, 0.        ]),
 'std_fit_time': array([0.00046684, 0.00047025, 0.00047059, 0.00048587, 0.        ,
        0.        , 0.00047008, 0.        ]),
 'mean_score_time': array([0.00033394, 0.00066527, 0.00033267, 0.00066551, 0.00033124,
        0.00034412, 0.00033267, 0.0009853 ]),
 'std_score_time': array([4.72269487e-04, 4.70415116e-04, 4.70471221e-04, 4.71944838e-04,
        4.68448172e-04, 4.86655611e-04, 4.70471221e-04, 1.71422056e-05]),
 'param_max_depth': masked_array(data=[2, 2, 3, 3, 4, 4, 5, 5],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_min_samples_split': masked_array(data=[2, 3, 2, 3, 2, 3, 2, 3],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'max_depth': 2, 'min_samples_split'

In [30]:
# 최적 파라미터
grid_dtc.best_params_

{'max_depth': 2, 'min_samples_split': 2}

In [31]:
# 최고 정확도
grid_dtc.best_score_

0.9666666666666667

In [32]:
# 최고 정확도를 가지는 최적 파라미터로 학습한 estimator
best_estimator = grid_dtc.best_estimator_
best_estimator.score(X_test,y_test)

0.9583333333333334