### 교차 검증
- 데이터셋이 부족한 문제와 특정 데이터에 과대적합되는 문제를 해결하기 위한 방안
- 학습 데이터셋을 일정 크기의 데이터로 n개 분리 후 1/n은 검증용, 나머지는 학습용으로 사용

[1] 모듈 로딩 및 데이터 준비 <hr>

In [255]:
import numpy as np
from sklearn.model_selection import KFold

# Toy data: the two feature rows [1, 2] and [3, 4] repeated twice (4 samples),
# with targets 1..4.
X = np.tile(np.array([[1, 2], [3, 4]]), (2, 1))
y = np.arange(1, 5)

In [256]:
# Create a KFold instance => an object that splits the data into 2 folds.
# shuffle=False is the default: folds are taken in the original row order.
k_fold = KFold(n_splits=2, shuffle=False)

In [257]:
# Split the data: each fold is a (train indices, test indices) pair
datasets = k_fold.split(X, y)

for train_idx, test_idx in datasets:
    print((train_idx, test_idx))

(array([2, 3]), array([0, 1]))
(array([0, 1]), array([2, 3]))


In [258]:
# Load perch3.csv into a DataFrame; this cell only reads the file
import pandas as pd
perchDF = pd.read_csv('../data/perch3.csv')
# Print the column names, non-null counts and dtypes
perchDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56 entries, 0 to 55
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Weight  56 non-null     float64
 1   Length  56 non-null     float64
 2   Height  56 non-null     float64
 3   Width   56 non-null     float64
dtypes: float64(4)
memory usage: 1.9 KB


In [259]:
# perchDF => split into 3 folds (n_splits=3)
fold_3 = KFold(n_splits=3)

datasets = fold_3.split(perchDF)

# Report how many rows fall into the train / test part of each fold
for index, (train, test) in enumerate(datasets):
    print(f'{index} => {train.shape} {test.shape}')

0 => (37,) (19,)
1 => (37,) (19,)
2 => (38,) (18,)


In [260]:
# perchDF => split into 5 folds (KFold's default n_splits)
fold_5 = KFold()

# Iterate the folds straight from the splitter and report each part's size
for index, (train, test) in enumerate(fold_5.split(perchDF)):
    print(f'{index} => {train.shape} {test.shape}')

0 => (44,) (12,)
1 => (45,) (11,)
2 => (45,) (11,)
3 => (45,) (11,)
4 => (45,) (11,)


In [261]:
# 타겟이 분류인 경우
irisDF = pd.read_csv('../data/iris.csv')
irisDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal.length  150 non-null    float64
 1   sepal.width   150 non-null    float64
 2   petal.length  150 non-null    float64
 3   petal.width   150 non-null    float64
 4   variety       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [262]:
# 3-fold splitter that shuffles the rows before splitting
k_fold = KFold(shuffle=True, n_splits=3)
# Split on the feature columns only (every column except the last one)
ret = k_fold.split(irisDF.iloc[:, :-1])

In [263]:
from sklearn.linear_model import LogisticRegression

In [264]:
# Per-fold accuracy on the training part and on the held-out part
train_score3 = []
test_score3 = []

# Build the folds here instead of reusing `ret`: the object returned by
# split() can be consumed only once, so re-running this cell with `ret`
# would loop zero times and silently leave the score lists empty.
folds = k_fold.split(irisDF[irisDF.columns[:-1]])

for idx, (train, test) in enumerate(folds):
    # Select the rows of this fold by position (iloc accepts index arrays)
    trainDF = irisDF.iloc[train]
    testDF = irisDF.iloc[test]
    # Class proportions in each part; plain KFold does not stratify, so
    # they can differ between the train and test parts
    print(trainDF['variety'].value_counts()/trainDF.shape[0])
    print(testDF['variety'].value_counts()/testDF.shape[0])

    # Features are every column but the last; the last column is the target
    X_train = trainDF[trainDF.columns[:-1]]
    y_train = trainDF[trainDF.columns[-1]]

    X_test = testDF[testDF.columns[:-1]]
    y_test = testDF[testDF.columns[-1]]

    # Train the classification model
    log_model = LogisticRegression(max_iter=1000, solver='liblinear')
    log_model.fit(X_train, y_train)

    # Accuracy on the training part and on the validation part
    train_score = log_model.score(X_train, y_train)
    test_score = log_model.score(X_test, y_test)

    # Predictions for the held-out rows
    pre_y = log_model.predict(X_test)

    # Keep both scores: the validation score is what cross-validation
    # is meant to estimate
    train_score3.append(train_score)
    test_score3.append(test_score)

variety
Setosa        0.35
Versicolor    0.34
Virginica     0.31
Name: count, dtype: float64
variety
Virginica     0.38
Versicolor    0.32
Setosa        0.30
Name: count, dtype: float64
variety
Virginica     0.34
Setosa        0.33
Versicolor    0.33
Name: count, dtype: float64
variety
Setosa        0.34
Versicolor    0.34
Virginica     0.32
Name: count, dtype: float64
variety
Virginica     0.35
Versicolor    0.33
Setosa        0.32
Name: count, dtype: float64
variety
Setosa        0.36
Versicolor    0.34
Virginica     0.30
Name: count, dtype: float64


In [265]:
# Mean training accuracy over however many folds were actually run
# (dividing by a literal 3 would be wrong whenever n_splits changes)
sum(train_score3) / len(train_score3)

0.9633333333333333