<a href="https://colab.research.google.com/github/JakeOh/202011_itw_bd21/blob/main/lab_ml/ml02_train_set_test_set.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.neighbors import KNeighborsClassifier  # 클래스
from sklearn.metrics import confusion_matrix, classification_report  # 함수, 속성
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 데이터 준비

Google Drive에 저장된 fish.pickle 파일에서 데이터를 로드.

In [2]:
# Location of the pickled fish dataset on Google Drive as seen from Colab.
# NOTE(review): presumably requires Drive to be mounted at /content/drive first — confirm.
file_path = '/content/drive/MyDrive/Colab Notebooks/lab_ml/fish.pickle'

In [3]:
# Deserialize the fish dataset from the pickle file (opened in binary mode).
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open(file_path, mode='rb') as f:
    fish = pickle.load(f)

In [4]:
type(fish)

dict

In [5]:
fish.keys()

dict_keys(['data', 'target'])

In [7]:
fish_data = fish['data']  # feature values stored under the 'data' key

In [8]:
fish_data.shape  
# rows: 35 bream + 14 smelt
# columns: Weight + Length

(49, 2)

In [9]:
fish_data[:5]

array([[242. ,  25.4],
       [290. ,  26.3],
       [340. ,  26.5],
       [363. ,  29. ],
       [430. ,  29. ]])

In [10]:
fish_label = fish['target']  # class labels stored under the 'target' key

In [11]:
fish_label.shape

(49,)

In [12]:
fish_label[:5]

array([1, 1, 1, 1, 1])

# Train/Test 분리, 훈련, 평가

## 순차적인 샘플링

In [13]:
num_train = 35  # number of samples in the training set

In [16]:
# Sequential split: the first num_train rows become the training set,
# the remaining rows become the test set.
X_train = fish_data[:num_train]
X_test = fish_data[num_train:]

In [17]:
X_train.shape, X_test.shape

((35, 2), (14, 2))

In [18]:
# Split the labels at the same position as the features so rows stay aligned.
y_train = fish_label[:num_train]
y_test = fish_label[num_train:]

In [19]:
y_train.shape, y_test.shape

((35,), (14,))

In [20]:
knn_clf = KNeighborsClassifier()  # create the kNN model object

In [21]:
knn_clf.fit(X=X_train, y=y_train)  # fit the model on the 35 training-set samples

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [22]:
knn_clf.score(X_train, y_train)  #> training-set score (accuracy): 1.0

1.0

In [23]:
test_predicts = knn_clf.predict(X_test)  # compute predictions for the 14 test-set samples

In [24]:
test_predicts  # every prediction is 1 (bream)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

**샘플링 편향(sampling bias)**: 훈련 셋과 테스트 셋으로 나눌 때, 훈련 셋이 타겟들을 골고루 갖지 못하고, 특정 타겟이 너무 많이 샘플링되는 경우. 위 예에서는 훈련 셋에 도미(1)만, 테스트 셋에 빙어(0)만 포함되어 있음.

In [25]:
y_test  # actual targets (labels) of the test set

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [26]:
knn_clf.score(X_test, y_test)  #> test-set score (accuracy): 0

0.0

## 무작위 샘플링(Random Sampling)

데이터 셋을 무작위로 섞은 후, 훈련/테스트 셋을 분리

In [32]:
idx = list(range(4))  # [0, 1, 2, 3]
np.random.shuffle(idx)  # shuffles the list in place
idx  # the original list idx now holds its items in a random order

[3, 2, 0, 1]

In [33]:
np.random.seed(1)  # fix the global NumPy RNG seed so the shuffle below is reproducible
idx = np.arange(49)  # [0, 1, 2, ..., 48]
np.random.shuffle(idx)  # shuffle the index array in place
idx

array([27, 34, 39, 48,  2,  3, 42, 29, 45, 30, 31, 38, 21, 35, 19, 41, 36,
       26, 22, 13, 40, 17, 44, 24, 23,  4, 32, 14, 10, 28, 47, 33, 18, 20,
       25,  6,  7, 46,  1, 16,  0, 15,  5, 11,  9,  8, 12, 43, 37])

In [34]:
# The first num_train shuffled indices select the training samples;
# the remaining indices select the test samples.
train_idx = idx[:num_train]
test_idx = idx[num_train:]

In [35]:
train_idx

array([27, 34, 39, 48,  2,  3, 42, 29, 45, 30, 31, 38, 21, 35, 19, 41, 36,
       26, 22, 13, 40, 17, 44, 24, 23,  4, 32, 14, 10, 28, 47, 33, 18, 20,
       25])

In [36]:
test_idx

array([ 6,  7, 46,  1, 16,  0, 15,  5, 11,  9,  8, 12, 43, 37])

In [37]:
# Pick feature rows by the shuffled index arrays.
X_train = fish_data[train_idx]
X_test = fish_data[test_idx]

In [39]:
X_train.shape, X_test.shape

((35, 2), (14, 2))

In [40]:
# Pick labels with the same index arrays used for the features so rows stay aligned.
y_train = fish_label[train_idx]
y_test = fish_label[test_idx]

In [41]:
y_train.shape, y_test.shape

((35,), (14,))

In [42]:
y_train

array([1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1])

In [43]:
y_test

array([1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0])

numpy 배열의 unique한 값의 개수 확인

In [45]:
np.unique(y_train, return_counts=True)

(array([0, 1]), array([11, 24]))

In [46]:
11/35, 24/35  # proportions of smelt and bream (counts taken from the np.unique output above)

(0.3142857142857143, 0.6857142857142857)

In [47]:
np.unique(y_test, return_counts=True)

(array([0, 1]), array([ 3, 11]))

In [48]:
3/14, 11/14

(0.21428571428571427, 0.7857142857142857)