# 교차 검증 (Cross Validation)
- 모델을 더욱 신뢰성 있게 평가하는 방법
- 데이터셋을 여러개로 나누고, 각 부분이 한번씩 검증 데이터로 사용되도록 하는 방법
- 훈련-검증을 반복하며 학습 진행
- 과대적합 방지

# K-fold

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris

iris_input, iris_target = load_iris(return_X_y=True)
np.unique(iris_target, return_counts=True) # 클래스간 개수 같음

(array([0, 1, 2]), array([50, 50, 50]))

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

lr_clf = LogisticRegression()

kfold = KFold(n_splits=5, shuffle=True, random_state=42) # 데이터를 나누기만 함

cv_accuracy = []

for train_index, val_index in kfold.split(iris_input):
    X_train, y_train = iris_input[train_index], iris_target[train_index]
    X_val, y_val = iris_input[val_index], iris_target[val_index]    # fancy indexing
    
    print(np.unique(y_train, return_counts=True))
    print(np.unique(y_val, return_counts=True))
    print('================================================')

    lr_clf.fit(X_train, y_train)
    y_pred = lr_clf.predict(X_val)
    acc_score = accuracy_score(y_val, y_pred)
    cv_accuracy.append(acc_score)
    
print(f"훈련별 정확도:", cv_accuracy)
print("분류모델 정확도:", np.mean(cv_accuracy))

(array([0, 1, 2]), array([40, 41, 39]))
(array([0, 1, 2]), array([10,  9, 11]))
(array([0, 1, 2]), array([37, 40, 43]))
(array([0, 1, 2]), array([13, 10,  7]))
(array([0, 1, 2]), array([38, 40, 42]))
(array([0, 1, 2]), array([12, 10,  8]))
(array([0, 1, 2]), array([42, 40, 38]))
(array([0, 1, 2]), array([ 8, 10, 12]))
(array([0, 1, 2]), array([43, 39, 38]))
(array([0, 1, 2]), array([ 7, 11, 12]))
훈련별 정확도: [1.0, 1.0, 0.9333333333333333, 0.9666666666666667, 0.9666666666666667]
분류모델 정확도: 0.9733333333333334


### Stratified-K-Fold

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold # 타겟 속성값 (라벨) 개수 동일하게 해줌
from sklearn.metrics import accuracy_score

lr_clf = LogisticRegression()

stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) # 데이터를 나누기만 함

cv_accuracy = []

for train_index, val_index in stratified_kfold.split(iris_input, iris_target):
    X_train, y_train = iris_input[train_index], iris_target[train_index]
    X_val, y_val = iris_input[val_index], iris_target[val_index]    # fancy indexing
    
    print(np.unique(y_train, return_counts=True))
    print(np.unique(y_val, return_counts=True))
    print('================================================')

    lr_clf.fit(X_train, y_train)
    y_pred = lr_clf.predict(X_val)
    acc_score = accuracy_score(y_val, y_pred)
    cv_accuracy.append(acc_score)
    
print(f"훈련별 정확도:", cv_accuracy)
print("분류모델 정확도:", np.mean(cv_accuracy))

(array([0, 1, 2]), array([40, 40, 40]))
(array([0, 1, 2]), array([10, 10, 10]))
(array([0, 1, 2]), array([40, 40, 40]))
(array([0, 1, 2]), array([10, 10, 10]))
(array([0, 1, 2]), array([40, 40, 40]))
(array([0, 1, 2]), array([10, 10, 10]))
(array([0, 1, 2]), array([40, 40, 40]))
(array([0, 1, 2]), array([10, 10, 10]))
(array([0, 1, 2]), array([40, 40, 40]))
(array([0, 1, 2]), array([10, 10, 10]))
훈련별 정확도: [1.0, 0.9666666666666667, 0.9333333333333333, 1.0, 0.9333333333333333]
분류모델 정확도: 0.9666666666666668


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### corss_val_score
- 교차 검증을 통해 모델 성능을 평가하는 함수
- 내부적으로 지정한 횟수만큼 학습/검증을 나누어 반복처리

In [None]:
from sklearn.model_selection import cross_val_score

# 첫 번째 인자: 모델
# 두 번째 인자: 입력 데이터 (X 데이터)
# 세 번째 인자: 라벨 데이터 (y 데이터)
# cv: 반복 횟수 (KFold 사용, StratifiedKFlod 객체 전달 가능)
# scoring: 퍙가 지표 (accuracy, precision, recall, f1)
# 반환값 = 반복한 훈련별 검증 점수 '배열'01_evaluation.ipynb

scores = cross_val_score(lr_clf, iris_input, iris_target, cv=5, scoring='accuracy')

print('훈련별 정확도:', scores)
print('모델 정확도:', np.mean(scores))

훈련별 정확도: [0.96666667 1.         0.93333333 0.96666667 1.        ]
모델 정확도: 0.9733333333333334


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
from sklearn.model_selection import cross_validate

lr_clf = LogisticRegression(max_iter=1000)

# 과적합 방지용으로 train score도 볼 수 있음
scores = cross_validate(lr_clf, iris_input, iris_target, cv=5, scoring=['accuracy', 'f1_macro'], return_train_score=True)

scores

{'fit_time': array([0.03416896, 0.0431776 , 0.01788783, 0.01017499, 0.01275635]),
 'score_time': array([0.00200915, 0.        , 0.00101781, 0.        , 0.        ]),
 'test_accuracy': array([0.96666667, 1.        , 0.93333333, 0.96666667, 1.        ]),
 'train_accuracy': array([0.96666667, 0.96666667, 0.98333333, 0.98333333, 0.975     ]),
 'test_f1_macro': array([0.96658312, 1.        , 0.93265993, 0.96658312, 1.        ]),
 'train_f1_macro': array([0.96664582, 0.96664582, 0.98333333, 0.98332291, 0.97499609])}