In [None]:
# 로지스틱 회귀
import numpy as np
import matplotlib.pyplot as plt

In [20]:
import pandas as pd
url="https://bit.ly/fish_csv_data"
fish=pd.read_csv(url)

In [21]:
fish.head()

Unnamed: 0,Species,Weight,Length,Diagonal,Height,Width
0,Bream,242.0,25.4,30.0,11.52,4.02
1,Bream,290.0,26.3,31.2,12.48,4.3056
2,Bream,340.0,26.5,31.1,12.3778,4.6961
3,Bream,363.0,29.0,33.5,12.73,4.4555
4,Bream,430.0,29.0,34.0,12.444,5.134


In [22]:
pd.unique(fish["Species"])

array(['Bream', 'Roach', 'Whitefish', 'Parkki', 'Perch', 'Pike', 'Smelt'],
      dtype=object)

In [23]:
fish_input=fish[["Weight",	"Length",	"Diagonal",	"Height",	"Width"]]
fish_target=fish["Species"]

In [24]:
fish_target=fish_target.to_numpy()
fish_input=fish_input.to_numpy()

In [25]:
from sklearn.model_selection import train_test_split
train_input, test_input, train_target, test_target=train_test_split(fish_input,fish_target,random_state=42)

In [26]:

# 정규화 -> 데이터의 속성 값의 척도(scale)이 다른 경우, 일관된 크기로 변경하기 위해서 한다.
from sklearn.preprocessing import StandardScaler
# StandardScaler 초기화
ss= StandardScaler()
# 입력 값 train_input를 받아서 정규화에  사용할 평균, 표준편차를 구한다.
ss.fit(train_input)
#각 컬럼의 값을 표준화 한다.
train_scaled=ss.transform(train_input) ## 각 칼럼의 (실제 값- 평균)/ (표준편차)를 구해준다.
test_scaled=ss.transform(test_input)


# 모델링

In [None]:
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression(C=20,max_iter=1000) ## 1000번 까지 최적의 값을 찾아 보는것 
lr.fit(train_scaled,train_target)

In [37]:
print(" Train Score:",lr.score(train_scaled,train_target))
print(" Test Score:",lr.score(test_scaled,test_target))


 Train Score: 0.9327731092436975
 Test Score: 0.925


In [39]:
test_input[:5] ## 정규화 되지 않은 값

array([[ 78.    ,  18.7   ,  19.4   ,   5.1992,   3.1234],
       [ 13.4   ,  12.4   ,  13.5   ,   2.43  ,   1.269 ],
       [200.    ,  32.3   ,  34.8   ,   5.568 ,   3.3756],
       [270.    ,  26.    ,  28.7   ,   8.3804,   4.2476],
       [150.    ,  23.    ,  24.5   ,   5.2185,   3.626 ]])

In [None]:
test_scaled[:5] ## 정규화된 값

array([[-0.88741352, -0.91804565, -1.03098914, -0.90464451, -0.80762518],
       [-1.06924656, -1.50842035, -1.54345461, -1.58849582, -1.93803151],
       [-0.54401367,  0.35641402,  0.30663259, -0.8135697 , -0.65388895],
       [-0.34698097, -0.23396068, -0.22320459, -0.11905019, -0.12233464],
       [-0.68475132, -0.51509149, -0.58801052, -0.8998784 , -0.50124996]])

In [None]:
lr.predict(test_input[:5]) ## test_input에 클래스를 예측해본다.

array(['Perch', 'Pike', 'Perch', 'Perch', 'Perch'], dtype=object)

In [None]:
print(lr.classes_) ## 클래스의 종류를 나타낸다.

['Bream' 'Parkki' 'Perch' 'Pike' 'Roach' 'Smelt' 'Whitefish']


In [None]:
proba=lr.predict_proba(test_scaled[:5]) ## 예측하는 값의 확률을 구해준다.

In [50]:
print(np.round(proba,decimals=3)) ## 소수점 셋째 자리까지 확률을 보여준다.

[[0.    0.014 0.842 0.    0.135 0.007 0.003]
 [0.    0.003 0.044 0.    0.007 0.946 0.   ]
 [0.    0.    0.034 0.934 0.015 0.016 0.   ]
 [0.011 0.034 0.305 0.006 0.567 0.    0.076]
 [0.    0.    0.904 0.002 0.089 0.002 0.001]]


In [None]:
#z1,z2,z3...z7
decision=lr.decision_function(test_scaled[:5])
print(np.round(decision,decimals=2))

[[ -6.51   1.04   5.17  -2.76   3.34   0.35  -0.63]
 [-10.88   1.94   4.78  -2.42   2.99   7.84  -4.25]
 [ -4.34  -6.24   3.17   6.48   2.36   2.43  -3.87]
 [ -0.69   0.45   2.64  -1.21   3.26  -5.7    1.26]
 [ -6.4   -1.99   5.82  -0.13   3.5   -0.09  -0.7 ]]


In [56]:
# Softmax 계산 => proba가 Softmax의 값인것을 알 수 있다.
from scipy.special import softmax

proba = softmax(decision,axis=1)
print(np.round(proba,decimals=3))

[[0.    0.014 0.842 0.    0.135 0.007 0.003]
 [0.    0.003 0.044 0.    0.007 0.946 0.   ]
 [0.    0.    0.034 0.934 0.015 0.016 0.   ]
 [0.011 0.034 0.305 0.006 0.567 0.    0.076]
 [0.    0.    0.904 0.002 0.089 0.002 0.001]]
