# Iris Dataset을 활용한 실습

## k-Nearest Neighbor를 활용하여 분류 문제를 해결

### 1. 필요한 패키지를 Import

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Seed NumPy's global random number generator with a fixed value.
RANDOM_SEED = 20
np.random.seed(RANDOM_SEED)

### 2. 데이터 불러오기 및 시각화

In [None]:
# Read the Iris CSV from its location under the /content/drive mount point.
csv_path = '/content/drive/MyDrive/Colab Notebooks/opensource/week11/Iris_Data.csv'
df = pd.read_csv(csv_path)

# Print the first rows (head() defaults to 5) as a quick sanity check.
print(df.head())

In [None]:
# Plot pairwise relationships between the columns of df, coloured by the
# 'species' column, then render the figure.
sns.pairplot(data=df, hue='species')
plt.show()

### 3. 데이터에서 정답을 분리, 모델 선언
k-Nearest Neighbor에서 k값은 1, 3, 5로 설정

데이터를 (학습 + 검증 데이터셋) (테스트 데이터셋)으로 분리

In [None]:
# Separate the label column ('species') from the remaining feature columns.
y = df['species']
X = df.drop(columns='species')

# One k-NN classifier per neighbourhood size being compared (k = 1, 3, 5).
knn_one, knn_three, knn_five = (
    KNeighborsClassifier(n_neighbors=k) for k in (1, 3, 5)
)

# Hold out 20% of the samples as the test set; the remaining 80% is kept
# together as the train + validation pool.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=20,
)


### 4. 시나리오 (1) : Holdout Cross-validation
75% 학습 데이터셋, 25% 검증 데이터셋
10번 반복 실험하여 정확도의 평균을 계산

In [None]:
# Repeated holdout validation: each repetition draws a different
# 75% / 25% train/validation split from the train + validation pool, and the
# per-repetition accuracies are averaged for each k.
repeat = 10
base_seed = 20
accuracy_one = np.zeros(repeat)
accuracy_three = np.zeros(repeat)
accuracy_five = np.zeros(repeat)

for i in range(repeat):
    # Derive the seed from the repetition index. A constant random_state
    # returns the identical split on every iteration, which makes the
    # repetitions (and the mean below) redundant. Using base_seed + i keeps
    # the experiment reproducible while still varying the split.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val, test_size=0.25, random_state=base_seed + i
    )

    # train the classifiers on this repetition's training part
    knn_one.fit(X_train, y_train)
    knn_three.fit(X_train, y_train)
    knn_five.fit(X_train, y_train)

    # make predictions on this repetition's validation part
    y_pred_one = knn_one.predict(X_val)
    y_pred_three = knn_three.predict(X_val)
    y_pred_five = knn_five.predict(X_val)

    # record validation accuracy for this repetition
    accuracy_one[i] = accuracy_score(y_val, y_pred_one)
    accuracy_three[i] = accuracy_score(y_val, y_pred_three)
    accuracy_five[i] = accuracy_score(y_val, y_pred_five)

# mean validation accuracy over all repetitions, per k
print('K=1 Accuracy:', accuracy_one.mean())
print('K=3 Accuracy:', accuracy_three.mean())
print('K=5 Accuracy:', accuracy_five.mean())


k=1 모델을 (학습 + 검증) 데이터 전체로 다시 학습한 뒤, 테스트 데이터셋에서의 정확도를 측정하여 모델의 일반화 능력을 평가

In [None]:
# Refit the k=1 classifier on the full train + validation pool, then score
# its predictions on the held-out test set.
knn_one.fit(X_train_val, y_train_val)
y_test_pred = knn_one.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print('Test Accuracy:', test_accuracy)

### 4. 시나리오 (2) : K-Fold Cross-validation
K-Fold의 K(폴드 수)를 10으로 설정하여 실험 (k-NN의 이웃 수 k는 앞과 동일하게 1, 3, 5를 비교)

In [None]:
# 10-fold cross-validation over the train + validation pool: every fold
# serves once as the validation part while the other folds form the training
# part, and the per-fold accuracies are averaged for each k.
n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=20)
accuracy_one = np.zeros(n_folds)
accuracy_three = np.zeros(n_folds)
accuracy_five = np.zeros(n_folds)

# Pair each classifier with the array that collects its per-fold accuracy.
model_scores = (
    (knn_one, accuracy_one),
    (knn_three, accuracy_three),
    (knn_five, accuracy_five),
)

for fold, (train_index, val_index) in enumerate(kf.split(X_train_val)):
    # KFold yields positional indices, hence .iloc for row selection.
    X_train = X_train_val.iloc[train_index]
    X_val = X_train_val.iloc[val_index]
    y_train = y_train_val.iloc[train_index]
    y_val = y_train_val.iloc[val_index]

    for model, scores in model_scores:
        model.fit(X_train, y_train)
        scores[fold] = accuracy_score(y_val, model.predict(X_val))

# mean validation accuracy over all folds, per k
print('K=1 Accuracy:', accuracy_one.mean())
print('K=3 Accuracy:', accuracy_three.mean())
print('K=5 Accuracy:', accuracy_five.mean())

k=1 모델을 (학습 + 검증) 데이터 전체로 다시 학습한 뒤, 테스트 데이터셋에서의 정확도를 측정하여 모델의 일반화 능력을 평가

In [None]:
# Refit the k=1 classifier on the full train + validation pool, then score
# its predictions on the held-out test set.
knn_one.fit(X_train_val, y_train_val)
y_test_pred = knn_one.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print('Test Accuracy:', test_accuracy)