<a href="https://colab.research.google.com/github/JakeOh/202110_itw_lab_python/blob/main/ml08_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Fish 데이터 셋

In [2]:
# URL of the fish CSV file hosted on GitHub.
fish_csv = 'https://github.com/JakeOh/202110_itw_lab_python/raw/main/fish.csv'

In [3]:
# Load the CSV at fish_csv into a DataFrame.
fish = pd.read_csv(fish_csv)

## 이진 분류(binary classification)

* 생선의 종류들 중에서 도미(Bream)와 빙어(Smelt)를 분류
* 생선의 모든 특성(Weight, Length, Diagonal, Height, Width)을 사용


In [5]:
# Restrict the data to the two species used for binary classification
# (Bream and Smelt).
df = fish.loc[fish['Species'].isin(['Bream', 'Smelt'])]

In [6]:
# Preview the first rows of the filtered frame.
df.head()

Unnamed: 0,Species,Weight,Length,Diagonal,Height,Width
0,Bream,242.0,25.4,30.0,11.52,4.02
1,Bream,290.0,26.3,31.2,12.48,4.3056
2,Bream,340.0,26.5,31.1,12.3778,4.6961
3,Bream,363.0,29.0,33.5,12.73,4.4555
4,Bream,430.0,29.0,34.0,12.444,5.134


In [7]:
# Preview the last rows of the filtered frame.
df.tail()

Unnamed: 0,Species,Weight,Length,Diagonal,Height,Width
154,Smelt,12.2,12.2,13.4,2.0904,1.3936
155,Smelt,13.4,12.4,13.5,2.43,1.269
156,Smelt,12.2,13.0,13.8,2.277,1.2558
157,Smelt,19.7,14.3,15.2,2.8728,2.0672
158,Smelt,19.9,15.0,16.2,2.9322,1.8792


In [8]:
# Select the feature columns by name rather than by position, so the feature
# matrix does not silently change if the CSV column order changes or a column
# is added.
feature_columns = ['Weight', 'Length', 'Diagonal', 'Height', 'Width']
X = df[feature_columns].to_numpy()  # feature matrix (2-D array)
y = df['Species'].to_numpy()  # target array (species labels)

In [9]:
# First five rows of the feature matrix.
X[:5, :]

array([[242.    ,  25.4   ,  30.    ,  11.52  ,   4.02  ],
       [290.    ,  26.3   ,  31.2   ,  12.48  ,   4.3056],
       [340.    ,  26.5   ,  31.1   ,  12.3778,   4.6961],
       [363.    ,  29.    ,  33.5   ,  12.73  ,   4.4555],
       [430.    ,  29.    ,  34.    ,  12.444 ,   5.134 ]])

In [10]:
# First five target labels.
y[:5]

array(['Bream', 'Bream', 'Bream', 'Bream', 'Bream'], dtype=object)

In [11]:
# Shapes of the feature matrix and the target array.
X.shape, y.shape

((49, 5), (49,))

# 훈련 셋, 테스트 셋 분리

In [12]:
# Hold out 20% of the samples as a test set. The split is stratified on the
# labels, and random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [13]:
# Shapes of the train and test feature matrices.
X_train.shape, X_test.shape

((39, 5), (10, 5))

In [14]:
# Shapes of the train and test target arrays.
y_train.shape, y_test.shape

((39,), (10,))

In [15]:
# Class labels and their counts in the training targets.
np.unique(y_train, return_counts=True)

(array(['Bream', 'Smelt'], dtype=object), array([28, 11]))

In [16]:
# Class proportions in the training set, computed from the labels themselves
# instead of hard-coded counts so they stay correct if the split changes.
train_classes, train_counts = np.unique(y_train, return_counts=True)
# .tolist() converts to plain Python floats so the cell displays a simple tuple.
tuple((train_counts / train_counts.sum()).tolist())

(0.717948717948718, 0.28205128205128205)

In [17]:
# Class labels and their counts in the test targets.
np.unique(y_test, return_counts=True)

(array(['Bream', 'Smelt'], dtype=object), array([7, 3]))

# KNN Classifier

## 모델 생성

In [18]:
# Build the KNN model as a pipeline: standardize the features, then classify
# with a k-nearest-neighbours classifier created with its default arguments.
scaler = StandardScaler()
clf = KNeighborsClassifier()
knn_model = Pipeline(steps=[
    ('scaler', scaler),
    ('clf', clf),
])

## 모델 훈련

In [19]:
knn_model.fit(X_train, y_train)  # train the model on the training set

Pipeline(steps=[('scaler', StandardScaler()), ('clf', KNeighborsClassifier())])

## 훈련 셋 평가

In [20]:
# Predicted labels for the training set
knn_train_pred = knn_model.predict(X_train)
print(knn_train_pred)

['Bream' 'Smelt' 'Bream' 'Bream' 'Smelt' 'Smelt' 'Smelt' 'Bream' 'Bream'
 'Smelt' 'Smelt' 'Bream' 'Bream' 'Bream' 'Bream' 'Bream' 'Bream' 'Smelt'
 'Bream' 'Bream' 'Bream' 'Smelt' 'Bream' 'Smelt' 'Bream' 'Smelt' 'Bream'
 'Bream' 'Bream' 'Smelt' 'Bream' 'Bream' 'Bream' 'Bream' 'Bream' 'Bream'
 'Bream' 'Bream' 'Bream']


In [21]:
# Actual labels of the training set
print(y_train)

['Bream' 'Smelt' 'Bream' 'Bream' 'Smelt' 'Smelt' 'Smelt' 'Bream' 'Bream'
 'Smelt' 'Smelt' 'Bream' 'Bream' 'Bream' 'Bream' 'Bream' 'Bream' 'Smelt'
 'Bream' 'Bream' 'Bream' 'Smelt' 'Bream' 'Smelt' 'Bream' 'Smelt' 'Bream'
 'Bream' 'Bream' 'Smelt' 'Bream' 'Bream' 'Bream' 'Bream' 'Bream' 'Bream'
 'Bream' 'Bream' 'Bream']


In [22]:
# Confusion matrix of the actual training labels vs. the KNN predictions.
confusion_matrix(y_train, knn_train_pred)

array([[28,  0],
       [ 0, 11]])

In [23]:
# Per-class precision, recall and F1 of the KNN model on the training set.
print(classification_report(y_train, knn_train_pred))

              precision    recall  f1-score   support

       Bream       1.00      1.00      1.00        28
       Smelt       1.00      1.00      1.00        11

    accuracy                           1.00        39
   macro avg       1.00      1.00      1.00        39
weighted avg       1.00      1.00      1.00        39



## 테스트 셋 평가

In [24]:
# Predicted labels for the test set
knn_test_pred = knn_model.predict(X_test)
print(knn_test_pred)

['Bream' 'Bream' 'Smelt' 'Bream' 'Bream' 'Smelt' 'Smelt' 'Bream' 'Bream'
 'Bream']


In [25]:
# Actual labels of the test set
print(y_test)

['Bream' 'Bream' 'Smelt' 'Bream' 'Bream' 'Smelt' 'Smelt' 'Bream' 'Bream'
 'Bream']


In [26]:
# Confusion matrix of the actual test labels vs. the KNN predictions.
confusion_matrix(y_test, knn_test_pred)

array([[7, 0],
       [0, 3]])

In [27]:
# Per-class precision, recall and F1 of the KNN model on the test set.
print(classification_report(y_test, knn_test_pred))

              precision    recall  f1-score   support

       Bream       1.00      1.00      1.00         7
       Smelt       1.00      1.00      1.00         3

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10



# Logistic Regression

## 모델 생성

In [28]:
# Build the logistic regression model as a pipeline: standardize the features,
# then classify with LogisticRegression created with its default arguments.
# Note: this rebinds the names `scaler` and `clf` used for the KNN pipeline.
scaler = StandardScaler()
clf = LogisticRegression()
logit_model = Pipeline(steps=[
    ('scaler', scaler),
    ('clf', clf),
])

## 모델 훈련

In [29]:
# Train the logistic regression pipeline on the training set.
logit_model.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler()), ('clf', LogisticRegression())])

## 훈련 셋에서 모델 평가

In [30]:
logit_train_pred = logit_model.predict(X_train)  # predicted labels for the training set

In [31]:
# Confusion matrix of the actual training labels vs. the logistic regression predictions.
confusion_matrix(y_train, logit_train_pred)

array([[28,  0],
       [ 0, 11]])

## 테스트 셋에서 모델 평가

In [32]:
logit_test_pred = logit_model.predict(X_test)  # predicted labels for the test set

In [35]:
# Show the logistic regression predictions for the test set.
print(logit_test_pred)

['Bream' 'Bream' 'Smelt' 'Bream' 'Bream' 'Smelt' 'Smelt' 'Bream' 'Bream'
 'Bream']


In [33]:
# Confusion matrix of the actual test labels vs. the logistic regression predictions.
confusion_matrix(y_test, logit_test_pred)

array([[7, 0],
       [0, 3]])

In [34]:
# Logistic regression is an algorithm that predicts the class by computing the
# probability of a sample being classified into each class.
logit_model.predict_proba(X_test)

array([[9.90960690e-01, 9.03931034e-03],
       [9.94320123e-01, 5.67987700e-03],
       [5.00458495e-02, 9.49954150e-01],
       [9.99481616e-01, 5.18384317e-04],
       [9.97495022e-01, 2.50497777e-03],
       [3.16773760e-02, 9.68322624e-01],
       [9.56001338e-02, 9.04399866e-01],
       [9.98456978e-01, 1.54302243e-03],
       [9.83451744e-01, 1.65482562e-02],
       [9.70261071e-01, 2.97389290e-02]])