## 다양한 교차검증
---
- model_selection 모듈
    * cross_val_score()
    * cross_validate()
    * cross_val_predict()

In [1]:
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
import numpy as np
import pandas as pd

### [1] 데이터 로딩

In [2]:
# Load the iris dataset through sklearn.datasets.load_iris.
iris=load_iris()

In [3]:
# The loaded object is a Bunch, which can be indexed by key like a dict.
data, target = iris['data'], iris['target']
# feature_names -> column names, target_names -> class labels
featureName, className = iris['feature_names'], iris['target_names']

In [4]:
# Display the feature (column) names together with the class names.
featureName, className

(['sepal length (cm)',
  'sepal width (cm)',
  'petal length (cm)',
  'petal width (cm)'],
 array(['setosa', 'versicolor', 'virginica'], dtype='<U10'))

### [2] 모델 생성

In [5]:
# max_iter : upper limit on the number of training iterations of the solver
#            (set to 500 here; the default is 100)
lrModel=LogisticRegression(max_iter=500)

In [6]:
## Train and validate the model with cross-validation
## => cv=10: the data is split into 10 folds, giving one score per fold
result = cross_val_score(
    estimator=lrModel,
    X=data,
    y=target,
    cv=10,
)

In [7]:
# Accuracy of each of the 10 per-fold models (one score per fold, cv=10)
result

array([1.        , 0.93333333, 1.        , 1.        , 0.93333333,
       0.93333333, 0.93333333, 1.        , 1.        , 1.        ])

In [8]:
# Run 7-fold cross-validation and also keep the training-set scores.
allResult = cross_validate(
    estimator=lrModel,
    X=data,
    y=target,
    cv=7,
    return_train_score=True,
)

In [9]:
# Turn the cross_validate result dict into a table, one row per fold.
resultDF = pd.DataFrame(data=allResult)
resultDF

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.015058,0.000257,0.954545,0.96875
1,0.014184,0.000805,1.0,0.96875
2,0.013963,0.0,0.909091,0.976562
3,0.014179,0.0,0.952381,0.976744
4,0.012976,0.000983,0.952381,0.992248
5,0.014922,3.7e-05,1.0,0.968992
6,0.013964,0.0,1.0,0.976744


In [10]:
## Splitter 객체 생성 - KFold()
from sklearn.model_selection import KFold, StratifiedKFold

In [11]:
# shuffle=True draws a new random fold assignment on every run unless
# random_state is fixed; pin it so the scores can be reproduced.
# NOTE: kSplitter is created here but not passed to cross_validate in this cell.
kSplitter = KFold(n_splits=7, shuffle=True, random_state=42)
sSplitter = StratifiedKFold(n_splits=7, shuffle=True, random_state=42)

# cross_validate parameters -----------------------
# return_train_score : whether to also return the scores on the training data
# return_estimator   : whether to return the fitted model object of each split
# cv                 : the splitter object that generates the folds
allResult = cross_validate(lrModel, data, target,
                           return_train_score=True,
                           return_estimator=True,
                           cv=sSplitter)

In [12]:
# Tabulate the per-fold results, including the returned estimator column.
resultDF = pd.DataFrame(data=allResult)
resultDF

Unnamed: 0,fit_time,score_time,estimator,test_score,train_score
0,0.018037,0.0,LogisticRegression(max_iter=500),0.909091,0.992188
1,0.016955,0.0,LogisticRegression(max_iter=500),1.0,0.96875
2,0.013004,0.0,LogisticRegression(max_iter=500),0.954545,0.960938
3,0.015955,0.0,LogisticRegression(max_iter=500),1.0,0.976744
4,0.017967,0.0,LogisticRegression(max_iter=500),1.0,0.968992
5,0.013816,0.0,LogisticRegression(max_iter=500),0.904762,0.976744
6,0.015352,0.0,LogisticRegression(max_iter=500),0.952381,0.976744
