#### 붓꽃 품종 분류
- 목표 : 붓꽃의 3개 품종을 분류하기
- 데이터셋 : 내장 데이터셋 사용
- 피쳐 : 4개
- 타겟 : 품종 1개
- 학습 : 지도학습 > 분류

[1] 데이터 준비

In [1]:
# 모듈 로딩
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# 내장 데이터셋 로딩
data=load_iris(as_frame=True)

In [3]:
# Bunch 인스턴스 => dict와 유사한 형태
data.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [4]:
featureDF=data['data']
targetSR=data['target']

In [5]:
featureDF.shape, targetSR.shape

((150, 4), (150,))

In [6]:
featureDF.head(), targetSR.head()

(   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
 0                5.1               3.5                1.4               0.2
 1                4.9               3.0                1.4               0.2
 2                4.7               3.2                1.3               0.2
 3                4.6               3.1                1.5               0.2
 4                5.0               3.6                1.4               0.2,
 0    0
 1    0
 2    0
 3    0
 4    0
 Name: target, dtype: int32)

[2] 학습을 위한 데이터셋 준비 => 학습용, 검증용, 테스트용

In [7]:
# 학습용, 테스트용 분리
X_train, X_test, y_train, y_test = train_test_split(featureDF, targetSR, stratify=targetSR)

In [8]:
# 학습용, 검증용 분리
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, stratify=y_train)

In [14]:
print(f'Train DS : {X_train.shape[0]}  {X_train.shape[0]/featureDF.shape[0]}%')
print(f'Val DS : {X_val.shape[0]}  {X_val.shape[0]/featureDF.shape[0]:.2f}%')
print(f'Test DS : {X_test.shape[0]}  {X_test.shape[0]/featureDF.shape[0]:.2f}%')

Train DS : 84  0.56%
Val DS : 28  0.19%
Test DS : 38  0.25%


[3] 교차검증 방식

In [15]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier

In [16]:
# 모델 인스턴스 생성
dtc_model=DecisionTreeClassifier()

# [3-1] KFold 기반-----------------
# 정확도 저장 리스트
accuracys=[]

# KFold 인스턴스 생성
kfold=KFold()

In [21]:
# K번 만큼 K개 데이터셋으로 학습 진행
# -> K등분 후 학습용 데이터셋 인덱스, 검증용 데이터셋 인덱스

for train_index, val_index in kfold.split(featureDF):
    print(f'train_index : {train_index.tolist()}')
    # print('val_index : {val_index.tolist()})

    # X_train, X_val 데이터셋 설정
    X_train, y_train = featureDF.loc[train_index.tolist()], targetSR[train_index.tolist()]
    X_val, y_val= featureDF.iloc[val_index.tolist()], targetSR[val_index.tolist()]

    # 학습 진행
    dtc_model.fit(X_train, y_train)

    # 평가
    accuracy=dtc_model.score(X_val, y_val)
    accuracys.append(accuracy)


train_index : [30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149]
train_index : [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 

In [26]:
# K번 만큼 K개 데이터셋으로 학습 진행
# -> K등분 후 학습용 데이터셋 인덱스, 검증용 데이터셋 인덱스

for idx, (train_index, val_index) in enumerate (kfold.split(featureDF), 1):
    print(f'train_index : {train_index.tolist()}')
    # print('val_index : {val_index.tolist()})

    # X_train, X_val 데이터셋 설정
    X_train, y_train = featureDF.loc[train_index.tolist()], targetSR[train_index.tolist()]
    X_val, y_val= featureDF.iloc[val_index.tolist()], targetSR[val_index.tolist()]

    # 학습 진행
    dtc_model.fit(X_train, y_train)

    # 평가 => 분류의 경우 score() 메서드 => 정확도 반환
    train_accuracy=dtc_model.score(X_train, y_train)
    val_accuracy=dtc_model.score(X_val, y_val)
    
    accuracys.append([train_accuracy, val_accuracy])
    print(f'[{idx}번째] Train 정확도 : {train_accuracy}, Val 정확도 : {val_accuracy}')

train_index : [30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149]
[1번째] Train 정확도 : 1.0, Val 정확도 : 1.0
train_index : [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126