## 목표 : 생선 분류 모델
- 데이터 : fish.csv
- 피쳐 : 5개 Weight, Length, Diagonal, Height, Width
- 타겟 : 1개 Species
- 방법 : 지도학습 + LogisticRegression

In [118]:

from sklearnex import patch_sklearn, unpatch_sklearn
patch_sklearn()

import sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
# 나눔고딕
mpl.rc('font', family='NanumGothic')
import seaborn as sns
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline



Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [119]:
data_file = '../data/fish.csv'
df = pd.read_csv(data_file)
df['Species'] = df['Species'].astype('category')
# df['Species'] = df['Species'].cat.codes

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   Species   159 non-null    category
 1   Weight    159 non-null    float64 
 2   Length    159 non-null    float64 
 3   Diagonal  159 non-null    float64 
 4   Height    159 non-null    float64 
 5   Width     159 non-null    float64 
dtypes: category(1), float64(5)
memory usage: 6.8 KB


### (2) 학습 위한 데이터 준비
<hr>


#### (2-1) 피쳐 / 타켓 분리

In [120]:
featuredf = df[['Weight', 'Length', 'Diagonal', 'Height', 'Width']]
targetf = df['Species']


In [121]:
print(featuredf.shape, targetf.shape)

(159, 5) (159,)


In [122]:
round(targetf.value_counts()/targetf.shape[0]*100,2)

Perch        35.22
Bream        22.01
Roach        12.58
Pike         10.69
Smelt         8.81
Parkki        6.92
Whitefish     3.77
Name: Species, dtype: float64

In [123]:
# from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# le = LabelEncoder()
# targetf = le.fit_transform(targetf)
# # numpy unique element
# print(np.unique(
# targetf))


#### (2-2) 학습용/테스트용 데이터셋 준비

In [124]:
from sklearn.model_selection import train_test_split

In [125]:
X_train, X_test, y_train, y_test = train_test_split(
    featuredf, 
    targetf, 
    test_size=0.2, 
    random_state=42, 
    stratify=targetf
)

In [126]:
print(f"[Train Dataset] {X_train.shape}, {y_train.shape}")
print(f"[Test Dataset] {X_test.shape}, {y_test.shape}")


[Train Dataset] (127, 5), (127,)
[Test Dataset] (32, 5), (32,)


#### (3) 학습 진행

In [127]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=10**4, solver='liblinear')
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.96875

In [128]:
# 모델 파라메터 확인 
attributes = ['intercept_', 'coef_', 'n_iter_', 'classes_', 'feature_names_in_'] 
for attr in attributes:
    print(f"{attr}: {getattr(model, attr)}")



intercept_: [-0.44714905  0.0157149  -0.21076237 -1.29436017 -1.46575477  0.30909346
 -0.43485959]
coef_: [[ 1.28739252e-02 -1.79260663e+00  1.01225040e+00  1.32066452e+00
  -4.72412257e-01]
 [-1.92989443e-02  4.76638517e-01 -1.11348797e+00  2.18920063e+00
   3.98172820e-02]
 [-2.08914578e-03  2.71756398e+00 -2.77974648e+00 -1.07689131e-02
   1.97871899e+00]
 [ 1.05220007e-02  3.46506792e-01  7.60637048e-02 -1.97989150e+00
  -8.84042982e-01]
 [-8.80359352e-03 -1.73998982e+00  1.54152420e+00 -6.06878924e-01
   1.88062298e+00]
 [-7.19409896e-02  2.64464766e-01  2.86969882e-01 -1.58463377e+00
  -7.21863827e-01]
 [ 6.36648444e-03 -5.31173756e-01  2.71526723e-01 -2.63074980e-01
   7.63853632e-01]]
n_iter_: [16 22 19 17 17 17 22]
classes_: ['Bream' 'Parkki' 'Perch' 'Pike' 'Roach' 'Smelt' 'Whitefish']
feature_names_in_: ['Weight' 'Length' 'Diagonal' 'Height' 'Width']


### (4) 평가
<hr>


In [129]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred), accuracy_score(y_train, model.predict(X_train)))

              precision    recall  f1-score   support

       Bream       1.00      1.00      1.00         7
      Parkki       1.00      1.00      1.00         2
       Perch       0.92      1.00      0.96        11
        Pike       1.00      1.00      1.00         4
       Roach       1.00      1.00      1.00         4
       Smelt       1.00      1.00      1.00         3
   Whitefish       0.00      0.00      0.00         1

    accuracy                           0.97        32
   macro avg       0.85      0.86      0.85        32
weighted avg       0.94      0.97      0.95        32

[[ 7  0  0  0  0  0  0]
 [ 0  2  0  0  0  0  0]
 [ 0  0 11  0  0  0  0]
 [ 0  0  0  4  0  0  0]
 [ 0  0  0  0  4  0  0]
 [ 0  0  0  0  0  3  0]
 [ 0  0  1  0  0  0  0]]
0.96875 0.952755905511811


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### (5) 모델 활용
<hr>

In [130]:
y_pre = model.predict(X_test.iloc[[0]])
print(y_pre)
print(y_test.iloc[0])

['Bream']
Bream


In [131]:
np.round(model.predict_proba(X_test.iloc[:5]), 3), y_test[:5]



(array([[0.93 , 0.   , 0.   , 0.   , 0.002, 0.   , 0.067],
        [0.009, 0.003, 0.057, 0.001, 0.872, 0.   , 0.058],
        [0.001, 0.005, 0.101, 0.129, 0.187, 0.538, 0.038],
        [0.   , 0.   , 0.945, 0.001, 0.013, 0.   , 0.041],
        [0.001, 0.029, 0.783, 0.002, 0.16 , 0.   , 0.026]]),
 28     Bream
 50     Roach
 154    Smelt
 110    Perch
 85     Perch
 Name: Species, dtype: category
 Categories (7, object): ['Bream', 'Parkki', 'Perch', 'Pike', 'Roach', 'Smelt', 'Whitefish'])

In [132]:
result = model.predict_proba(X_test.iloc[:5]).argmax(axis=1)
result



array([0, 4, 5, 2, 2], dtype=int64)

In [133]:
data = {
    'Pre Y' : [model.classes_[idx] for idx in result],
    "True Y" : y_test[:5]
}

pd.DataFrame(data)

Unnamed: 0,Pre Y,True Y
28,Bream,Bream
50,Roach,Roach
154,Smelt,Smelt
110,Perch,Perch
85,Perch,Perch


### (6) 모델 성능 평가
<hr>

- 정확도
- 정밀도
- 재현율
- f1 score
- Confusion Matrix
- Classification Report

In [134]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, zero_division=0))
print(confusion_matrix(y_test, y_pred, labels=df['Species'].tolist()))
print(accuracy_score(y_test, y_pred))
print(f1_score(y_test, y_pred, average='macro'))
print(precision_score(y_test, y_pred, average='macro'))
print(recall_score(y_test, y_pred, average='macro'))

              precision    recall  f1-score   support

       Bream       1.00      1.00      1.00         7
      Parkki       1.00      1.00      1.00         2
       Perch       0.92      1.00      0.96        11
        Pike       1.00      1.00      1.00         4
       Roach       1.00      1.00      1.00         4
       Smelt       1.00      1.00      1.00         3
   Whitefish       0.00      0.00      0.00         1

    accuracy                           0.97        32
   macro avg       0.85      0.86      0.85        32
weighted avg       0.94      0.97      0.95        32

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 3]]
0.96875
0.8509316770186335
0.8452380952380951
0.8571428571428571


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
