In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#   패키지 설치
#   pip install sklearn

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split

# 모델 학습
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import SGDRegressor

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.ensemble import VotingClassifier, BaggingClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# 성능 조절
from sklearn.model_selection import cross_validate, StratifiedKFold, KFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import uniform, randint

# 성능 평가
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import roc_auc_score

from sklearn.metrics import mean_squared_error, r2_score

from sklearn.metrics import silhouette_score

# 경고 메세지 숨김
import warnings     
warnings.filterwarnings('ignore')

## 테이블 열기

In [2]:
fish = pd.read_csv('https://bit.ly/fish_csv_data')
fish[:5]

Unnamed: 0,Species,Weight,Length,Diagonal,Height,Width
0,Bream,242.0,25.4,30.0,11.52,4.02
1,Bream,290.0,26.3,31.2,12.48,4.3056
2,Bream,340.0,26.5,31.1,12.3778,4.6961
3,Bream,363.0,29.0,33.5,12.73,4.4555
4,Bream,430.0,29.0,34.0,12.444,5.134


In [3]:
# 특성
fish_data = fish.iloc[:, 1:]

# 타겟
fish_target = fish.iloc[:, 0]

In [4]:
# 특성 확인
fish_data.head()

Unnamed: 0,Weight,Length,Diagonal,Height,Width
0,242.0,25.4,30.0,11.52,4.02
1,290.0,26.3,31.2,12.48,4.3056
2,340.0,26.5,31.1,12.3778,4.6961
3,363.0,29.0,33.5,12.73,4.4555
4,430.0,29.0,34.0,12.444,5.134


In [5]:
# 타겟 확인
fish_target.value_counts()

Perch        56
Bream        35
Roach        20
Pike         17
Smelt        14
Parkki       11
Whitefish     6
Name: Species, dtype: int64

# 데이터 분할

In [6]:
data = fish_data       # 특성
target = fish_target     # 정답

xtrain, xtest, ytrain, ytest = train_test_split(
 
    data, target             # x: 특성, y: 정답
    , test_size = 0.25       # 테스트 데이터 비율(0~1)
#    , stratify = target      # 계층화(훈련＆테스트 데이터 비율: x=y)
    , random_state = 42     # 랜덤 시드
    )

# 데이터 스케일링(표준화)

In [7]:
ss = StandardScaler()

xtrain_scaled = ss.fit_transform(xtrain)    # 훈련 데이터 → 표준화
xtest_scaled = ss.transform(xtest)          # 테스트 데이터 → 표준화

# 로지스틱 회귀(이진 분류)

In [8]:
# 타겟 - 도미, 빙어
bream_smelt = (ytrain == 'Bream') | (ytrain == 'Smelt')

xtrain_bream_smelt = xtrain_scaled[bream_smelt]
ytrain_bream_smelt = ytrain[bream_smelt]

In [9]:
lr = LogisticRegression()                               # 모델 생성

# (분류)변수.fit( xtrain, ytrain )              # 모델 학습
lr.fit( xtrain_bream_smelt, ytrain_bream_smelt )         # 모델 학습(스케일링)

# (분류)변수.score( xtrain, ytrain )            # 훈련 데이터 정확도
# (분류)변수.score( xtest, ytest )              # 테스트 데이터 정확도

# (분류)변수.score( xtrain_scaled, ytrain )     # 훈련 데이터 정확도(스케일링)
# (분류)변수.score( xtest_scaled, ytest )       # 테스트 데이터 정확도(스케일링)

# (분류)변수.classes_       # 타겟 목록??
# (분류)변수.predict_proba( xtest_scaled )      # 타겟 예측 확률

lr.classes_  # 타겟 목록

array(['Bream', 'Smelt'], dtype=object)

In [10]:
lr.predict( xtrain_bream_smelt[:5] )

array(['Bream', 'Smelt', 'Bream', 'Bream', 'Bream'], dtype=object)

In [11]:
lr.predict_proba( xtrain_bream_smelt[:5] )

array([[0.99759855, 0.00240145],
       [0.02735183, 0.97264817],
       [0.99486072, 0.00513928],
       [0.98584202, 0.01415798],
       [0.99767269, 0.00232731]])

# 시그모이드 함수

In [12]:
print( lr.coef_, lr.intercept_ )

[[-0.4037798  -0.57620209 -0.66280298 -1.01290277 -0.73168947]] [-2.16155132]


In [13]:
# z값(선형방정식)
lr.decision_function( xtrain_bream_smelt[:5] )

array([-6.02927744,  3.57123907, -5.26568906, -4.24321775, -6.0607117 ])

In [14]:
# z값 -> 시그모이드
from scipy.special import expit
expit( lr.decision_function( xtrain_bream_smelt[:5] ) )

array([0.00240145, 0.97264817, 0.00513928, 0.01415798, 0.00232731])

# 로지스틱 회귀(다중 분류)

In [15]:
lr = LogisticRegression(
    C = 20            # 하이퍼 파라미터
    , max_iter = 1000
)

# lr.coef_         # 기울기
# lr.intercept_    # 절편                               

# (분류)변수.fit( xtrain, ytrain )              # 모델 학습
lr.fit( xtrain_scaled, ytrain )         # 모델 학습(스케일링)

# (분류)변수.score( xtrain, ytrain )            # 훈련 데이터 정확도
# (분류)변수.score( xtest, ytest )              # 테스트 데이터 정확도

lr.classes_       # 타겟 목록

array(['Bream', 'Parkki', 'Perch', 'Pike', 'Roach', 'Smelt', 'Whitefish'],
      dtype=object)

In [16]:
lr.score( xtrain_scaled, ytrain )     # 훈련 데이터 정확도(스케일링)

0.9327731092436975

In [17]:
lr.score( xtest_scaled, ytest )       # 테스트 데이터 정확도(스케일링)

0.925

In [18]:
# 예측
lr.predict( xtest_scaled[:5] )

array(['Perch', 'Smelt', 'Pike', 'Roach', 'Perch'], dtype=object)

In [19]:
proba = lr.predict_proba( xtest_scaled[:5] )      # 타겟 예측 확률
np.round(proba, decimals=3)

array([[0.   , 0.014, 0.841, 0.   , 0.136, 0.007, 0.003],
       [0.   , 0.003, 0.044, 0.   , 0.007, 0.946, 0.   ],
       [0.   , 0.   , 0.034, 0.935, 0.015, 0.016, 0.   ],
       [0.011, 0.034, 0.306, 0.007, 0.567, 0.   , 0.076],
       [0.   , 0.   , 0.904, 0.002, 0.089, 0.002, 0.001]])

# 소프트맥스 함수

In [20]:
print(lr.coef_, lr.intercept_)
print(lr.coef_.shape, lr.intercept_.shape)

[[-1.49002465 -1.02915033  2.59347373  7.70356653 -1.20069469]
 [ 0.19618504 -2.01065738 -3.7797884   6.50490975 -1.99484019]
 [ 3.56279355  6.343586   -8.48971913 -5.75757009  3.79307946]
 [-0.10458098  3.60318511  3.93067682 -3.61734668 -1.75069896]
 [-1.40060785 -6.07504053  5.25968833 -0.87221069  1.86043573]
 [-1.38526797  1.49215576  1.39226981 -5.67733578 -4.40096933]
 [ 0.62150286 -2.32407863 -0.90660116  1.71598695  3.69368798]] [-0.09205087 -0.26290884  3.25101307 -0.14743221  2.65497631 -6.78781848
  1.38422102]
(7, 5) (7,)


In [21]:
# z값(선형방정식)
lr.decision_function( xtest_scaled[:5] )

array([[ -6.49809703,   1.03224223,   5.16361628,  -2.72867779,
          3.33888351,   0.32650921,  -0.63447641],
       [-10.85946393,   1.92721051,   4.77097878,  -2.39849854,
          2.97811596,   7.84131255,  -4.25965533],
       [ -4.33528446,  -6.23306253,   3.17445488,   6.48668696,
          2.35757423,   2.42118037,  -3.87154945],
       [ -0.68335973,   0.45272685,   2.64700088,  -1.1866764 ,
          3.26453577,  -5.75274473,   1.25851735],
       [ -6.39706271,  -1.99273317,   5.81573446,  -0.11035964,
          3.50283425,  -0.11162293,  -0.70679027]])

In [22]:
# z값 -> 시그모이드
from scipy.special import softmax
proba = softmax( lr.decision_function( xtest_scaled[:5] ), axis=1 )
np.round(proba, decimals=3)

array([[0.   , 0.014, 0.841, 0.   , 0.136, 0.007, 0.003],
       [0.   , 0.003, 0.044, 0.   , 0.007, 0.946, 0.   ],
       [0.   , 0.   , 0.034, 0.935, 0.015, 0.016, 0.   ],
       [0.011, 0.034, 0.306, 0.007, 0.567, 0.   , 0.076],
       [0.   , 0.   , 0.904, 0.002, 0.089, 0.002, 0.001]])

## 그리드 서치

In [50]:
lr = LogisticRegression()

# lr.coef_         # 기울기
# lr.intercept_    # 절편                               

# (분류)변수.fit( xtrain, ytrain )              # 모델 학습
lr.fit( xtrain_scaled, ytrain )         # 모델 학습(스케일링)

# (분류)변수.score( xtrain, ytrain )            # 훈련 데이터 정확도
# (분류)변수.score( xtest, ytest )              # 테스트 데이터 정확도

lr.classes_       # 타겟 목록

array(['Bream', 'Parkki', 'Perch', 'Pike', 'Roach', 'Smelt', 'Whitefish'],
      dtype=object)

In [51]:
from sklearn.model_selection import GridSearchCV

params = {
    'C': np.arange(1, 20+1),
    'max_iter': np.arange(100, 1000+1, 100)
    }

gscv = GridSearchCV(
    lr       # 모델
    , params        # 하이퍼 파라미터 목록
    , cv = 5        # 교차 검증 횟수(K번)
    , scoring = 'accuracy'         # 평가 지표(정확도)
    , return_train_score = True    # 훈련 데이터 정확도 추가
    , n_jobs = -1    # 사용할 CPU 코어 개수(-1: 모두, 1: 한 개)
    )

# gscv.fit(xtrain, ytrain)
gscv.fit(xtrain_scaled, ytrain)

lr = gscv.best_estimator_    # 하이퍼 파라미터 최적화 값 저장
gscv.best_params_    # 하이퍼 파라미터 최적화 값

{'C': 11, 'max_iter': 100}

In [25]:
gscv.cv_results_['mean_test_score']            # 하이퍼 파라미터 교차 검증 정확도

array([0.78913043, 0.79818841, 0.81521739, 0.84057971, 0.84891304,
       0.84891304, 0.86594203, 0.88297101, 0.89130435, 0.89130435,
       0.89963768, 0.89963768, 0.89963768, 0.89963768, 0.89963768,
       0.89963768, 0.89963768, 0.89963768, 0.89963768, 0.89963768])

In [26]:
np.max(gscv.cv_results_['mean_test_score'])    # 하이퍼 파라미터 최적화 교차 검증 정확도

0.8996376811594203

In [52]:
lr.score( xtrain_scaled, ytrain )     # 훈련 데이터 정확도(스케일링)

0.9159663865546218

In [53]:
lr.score( xtest_scaled, ytest )       # 테스트 데이터 정확도(스케일링)

0.925