# 교차 검증 (Cross Validation)

- 모델을 더욱 신뢰성 있게 평가하는 방법
- 데이터셋을 여러(k) 개로 나누고, 각 부분이 한번씩 검증 데이터로 사용되도록 하는 방법
- 훈련-검증을 반복하며 학습 진행
- 과대적합 방지 효과

In [35]:
import numpy as np
import pandas as pd

### K-fold

In [36]:
# 데이터 로드
from sklearn.datasets import load_iris

iris_input, iris_target = load_iris(return_X_y=True)

In [37]:
np.unique(iris_target, return_counts=True)

(array([0, 1, 2]), array([50, 50, 50]))

In [38]:
# 모델 생성
from sklearn.linear_model import LogisticRegression

lr_clf = LogisticRegression()

In [39]:
# 교차 검증
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

# KFold()
# - n_splits: 폴드 개수 (K)
# - shuffle: 폴드로 나누기 전에 섞을건지 여부
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

cv_accuracy = []

for train_idx, eval_idx in kfold.split(iris_input):
    X_train, y_train = iris_input[train_idx], iris_target[train_idx]
    X_eval, y_eval = iris_input[eval_idx], iris_target[eval_idx]

    # print(np.unique(y_train, return_counts=True))
    # print(np.unique(y_eval, return_counts=True))
    # print('==================================================')

    lr_clf.fit(X_train, y_train)
    y_pred = lr_clf.predict(X_eval)
    # print(accuracy_score(y_eval, y_pred))
    cv_accuracy.append(accuracy_score(y_eval, y_pred))

In [40]:
# KFold 교차 검증에 따른 분류모델 정확도
print(cv_accuracy)
np.mean(cv_accuracy)

[1.0, 1.0, 0.9333333333333333, 0.9666666666666667, 0.9666666666666667]


np.float64(0.9733333333333334)

### Stratified-K-Fold

In [41]:
from sklearn.model_selection import StratifiedKFold

lr_clf = LogisticRegression()

stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cv_accuracy = []

for train_idx, eval_idx in stratified_kfold.split(iris_input, iris_target):
    X_train, y_train = iris_input[train_idx], iris_target[train_idx]
    X_eval, y_eval = iris_input[eval_idx], iris_target[eval_idx]

    print(np.unique(y_train, return_counts=True))
    print(np.unique(y_eval, return_counts=True))
    print('==================================================')

    lr_clf.fit(X_train, y_train)
    y_pred = lr_clf.predict(X_eval)
    cv_accuracy.append(accuracy_score(y_eval, y_pred))

(array([0, 1, 2]), array([40, 40, 40]))
(array([0, 1, 2]), array([10, 10, 10]))
(array([0, 1, 2]), array([40, 40, 40]))
(array([0, 1, 2]), array([10, 10, 10]))
(array([0, 1, 2]), array([40, 40, 40]))
(array([0, 1, 2]), array([10, 10, 10]))
(array([0, 1, 2]), array([40, 40, 40]))
(array([0, 1, 2]), array([10, 10, 10]))
(array([0, 1, 2]), array([40, 40, 40]))
(array([0, 1, 2]), array([10, 10, 10]))


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [42]:
print(cv_accuracy)
np.mean(cv_accuracy)

[1.0, 0.9666666666666667, 0.9333333333333333, 1.0, 0.9333333333333333]


np.float64(0.9666666666666668)

### Scikit-learn 교차검증 함수

**cross_val_score & cross_validate**

- 교차 검증을 통해 모델 성능을 평가하는 함수
- 내부적으로 지정한 횟수만큼 학습/검증을 나누어 반복 처리

- 공통되는 파라미터
    - 첫 번째 인자: 학습/검증 대상 모델
    - 두 번째 인자: feature data
    - 세 번째 인자: label(target) data
    - scoring 키워드 인자: 평가 지표
    - cv 키워드 인자: 반복 횟수 (기본적으로 KFold 사용, StratifiedKFold 객체 전달 가능)

##### cross_val_score()

In [43]:
from sklearn.model_selection import cross_val_score

lr_clf = LogisticRegression()

# 반복한 훈련별 검증 점수 배열 반환
scores = cross_val_score(lr_clf, iris_input, iris_target, scoring='accuracy', cv=5)
np.mean(scores)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


np.float64(0.9733333333333334)

In [44]:
# StratifiedKFold 객체 사용 가능
scores = cross_val_score(lr_clf, iris_input, iris_target, scoring='accuracy', cv=stratified_kfold)
np.mean(scores)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


np.float64(0.9666666666666668)

##### cross_validate()

In [45]:
from sklearn.model_selection import cross_validate

lr_clf = LogisticRegression()

# 반복한 훈련별 학습시간, 평가시간, 검증점수 반환
cross_validate(
    lr_clf, iris_input, iris_target,
    scoring=['accuracy', 'f1_macro'],   # 다중 지표 사용 가능
    cv=5,
    return_train_score=True             # 학습 데이터에 대한 검증점수 반환 가능
)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'fit_time': array([0.0244801 , 0.01503539, 0.01459718, 0.00850534, 0.01855779]),
 'score_time': array([0.        , 0.        , 0.00102377, 0.        , 0.        ]),
 'test_accuracy': array([0.96666667, 1.        , 0.93333333, 0.96666667, 1.        ]),
 'train_accuracy': array([0.96666667, 0.96666667, 0.98333333, 0.98333333, 0.975     ]),
 'test_f1_macro': array([0.96658312, 1.        , 0.93265993, 0.96658312, 1.        ]),
 'train_f1_macro': array([0.96664582, 0.96664582, 0.98333333, 0.98332291, 0.97499609])}

### [한번 해보기] 생선 다중 분류 with cross_val_score

- 생선의 Weight, Length, Diagonal, Height, Width를 사용해 생선의 Species를 예측하는 문제를 LogisticRegression 모델로 해결해 보세요.
    - cross_val_score() 함수를 사용해 교차 검증을 적용
    - LogisticRegression의 Hyper Parameter를 바꿔가며 테스트
    - 최적의 성능을 내는 Hyper Parameter를 찾아보세요 ^_^@

In [None]:
# 데이터 로드
fish = pd.read_csv('./data/fish.csv')

# 특성 데이터(Weight, Length, Diagonal, Height, Width)와 타깃 데이터(Species) 분리
fish_input = fish[['Weight', 'Length', 'Diagonal', 'Height', 'Width']].to_numpy()
fish_target = fish['Species'].to_numpy()

fish_input, fish_target

(array([[2.42000e+02, 2.54000e+01, 3.00000e+01, 1.15200e+01, 4.02000e+00],
        [2.90000e+02, 2.63000e+01, 3.12000e+01, 1.24800e+01, 4.30560e+00],
        [3.40000e+02, 2.65000e+01, 3.11000e+01, 1.23778e+01, 4.69610e+00],
        [3.63000e+02, 2.90000e+01, 3.35000e+01, 1.27300e+01, 4.45550e+00],
        [4.30000e+02, 2.90000e+01, 3.40000e+01, 1.24440e+01, 5.13400e+00],
        [4.50000e+02, 2.97000e+01, 3.47000e+01, 1.36024e+01, 4.92740e+00],
        [5.00000e+02, 2.97000e+01, 3.45000e+01, 1.41795e+01, 5.27850e+00],
        [3.90000e+02, 3.00000e+01, 3.50000e+01, 1.26700e+01, 4.69000e+00],
        [4.50000e+02, 3.00000e+01, 3.51000e+01, 1.40049e+01, 4.84380e+00],
        [5.00000e+02, 3.07000e+01, 3.62000e+01, 1.42266e+01, 4.95940e+00],
        [4.75000e+02, 3.10000e+01, 3.62000e+01, 1.42628e+01, 5.10420e+00],
        [5.00000e+02, 3.10000e+01, 3.62000e+01, 1.43714e+01, 4.81460e+00],
        [5.00000e+02, 3.15000e+01, 3.64000e+01, 1.37592e+01, 4.36800e+00],
        [3.40000e+02, 3.2

In [63]:
# 학습-평가 데이터 분리
from sklearn.model_selection import train_test_split

train_input, test_input, train_target, test_target = train_test_split(
    fish_input, fish_target, test_size=0.2, random_state=42
)

In [66]:
# 데이터 스케일링
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(train_input)

train_scaled = scaler.transform(train_input)
test_scaled = scaler.transform(test_input)

In [73]:
# 모델 생성(하이퍼 파라미터 조작) 및 교차검증
lr_clf = LogisticRegression(max_iter=1000)

# 교차 검증 수행 (cv=5)
scores = cross_val_score(lr_clf, train_scaled, train_target, cv=5)

print(f"평균 교차 검증 정확도: {np.mean(scores)}")

평균 교차 검증 정확도: 0.7864615384615384




In [None]:
# 테스트 데이터에 대한 추론 결과 평가
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cv_accuracy = []

for train_idx, eval_idx in stratified_kfold.split(iris_input, iris_target):
    X_train, y_train = iris_input[train_idx], iris_target[train_idx]
    X_eval, y_eval = iris_input[eval_idx], iris_target[eval_idx]

    print(np.unique(y_train, return_counts=True))
    print(np.unique(y_eval, return_counts=True))
    print('==================================================')

    lr_clf.fit(X_train, y_train)
    y_pred = lr_clf.predict(X_eval)
    cv_accuracy.append(accuracy_score(y_eval, y_pred))

최종 학습 세트 점수: 0.9921
최종 테스트 세트 점수: 0.9375
