In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#   패키지 설치
#   pip install sklearn

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score

# 모델 학습
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import SGDRegressor

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.cluster import KMeans

from sklearn.decomposition import PCA

# 성능 평가
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import roc_auc_score

from sklearn.metrics import mean_squared_error, r2_score

from sklearn.metrics import silhouette_score

# 경고 메세지 숨김
import warnings     
warnings.filterwarnings('ignore')

# 테이블 열기

In [2]:
wine = pd.read_csv('https://bit.ly/wine_csv_data')
wine.head()

Unnamed: 0,alcohol,sugar,pH,class
0,9.4,1.9,3.51,0.0
1,9.8,2.6,3.2,0.0
2,9.8,2.3,3.26,0.0
3,9.8,1.9,3.16,0.0
4,9.4,1.9,3.51,0.0


In [3]:
# 특성
wine_data = wine.iloc[:, 0:3]

# 타겟
wine_target = wine.iloc[:, 3]

In [4]:
wine_data.head()

Unnamed: 0,alcohol,sugar,pH
0,9.4,1.9,3.51
1,9.8,2.6,3.2
2,9.8,2.3,3.26
3,9.8,1.9,3.16
4,9.4,1.9,3.51


In [5]:
wine_target.value_counts()
# 1: 화이트, 0: 레드

1.0    4898
0.0    1599
Name: class, dtype: int64

# 데이터 분할(훈련 + 테스트)

In [6]:
data = wine_data       # 특성
target = wine_target     # 타겟

xtrain, xtest, ytrain, ytest = train_test_split(
 
    data, target             # x: 특성, y: 타겟
    , test_size = 0.2       # 테스트 데이터 비율(0~1)
#   , stratify = target      # 계층화(훈련＆테스트 데이터 비율: x=y)
    , random_state = 42   # 랜덤 시드
    )

print(xtrain.shape, xtest.shape)

(5197, 3) (1300, 3)


# 교차 검증(훈련 + 검증)

In [7]:
# 결정 트리
dtc = DecisionTreeClassifier(
    criterion = 'gini'    # 불순도
    , random_state=42
    )

In [8]:
from sklearn.model_selection import cross_validate, StratifiedKFold, KFold

spliter = StratifiedKFold(
    n_splits=10    # 겹 갯수
    , shuffle=True
    , random_state=42
    )

scores = cross_validate(
    dtc
    , xtrain, ytrain    # 특성, 타겟
    , scoring='accuracy'    # 평가 지표(분류/회귀)
    , cv=spliter   # 분할 종류(회귀: KFold, 분류: StratifiedKFold)
    , n_jobs=1    # 사용할 CPU 코어 개수(-1: 모두, 1: 한 개)
    )

scores # 모델 학습 시간, 모델 검증 시간, 검증 점수

{'fit_time': array([0.02208328, 0.00942945, 0.00948453, 0.00912404, 0.00871086,
        0.00888515, 0.00901556, 0.00873518, 0.00882792, 0.00872231]),
 'score_time': array([0.0020833 , 0.00163507, 0.00169396, 0.00165462, 0.00168228,
        0.00200248, 0.00221872, 0.00196576, 0.00231767, 0.00186658]),
 'test_score': array([0.83461538, 0.87884615, 0.85384615, 0.85384615, 0.84615385,
        0.87307692, 0.85961538, 0.85549133, 0.85163776, 0.86705202])}

In [9]:
np.mean(scores['test_score'])  # 성능 평가(평균)

0.8574181117533719

# 그리드 서치
하이퍼 파마리터 + 교차 검증

In [10]:
from sklearn.model_selection import GridSearchCV

params = {
    'min_impurity_decrease': np.arange(0.0001, 0.001, 0.0001)  # 불순도 감소 최소량
    , 'max_depth': np.arange(5, 20, 1)
    , 'min_samples_split': np.arange(2, 100, 10)
    }

In [11]:
# 결정 트리
dtc = DecisionTreeClassifier(
    criterion = 'gini'    # 불순도
    , random_state=42
    )

In [12]:
gscv = GridSearchCV(
    dtc       # 모델
    , params        # 하이퍼 파라미터 목록
    , cv = 5        # 교차 검증 횟수(K번)
    , scoring = 'accuracy'         # 평가 지표(정확도)
    , return_train_score = True    # 훈련 데이터 정확도 추가
    , n_jobs = -1    # 사용할 CPU 코어 개수(-1: 모두, 1: 한 개)
    )

gscv.fit(xtrain, ytrain)

gscv.best_params_  # 하이퍼 파라미터 최적화 값

{'max_depth': 14, 'min_impurity_decrease': 0.0004, 'min_samples_split': 12}

In [25]:
gscv.cv_results_['mean_train_score']    # 교차 검증 점수(cv=5)

array([0.86968438, 0.86963628, 0.86963628, ..., 0.8779107 , 0.87781448,
       0.87637131])

In [26]:
np.max(gscv.cv_results_['mean_train_score'])    # 최적화 교차 검증 점수(cv=5)

0.97195503380252

In [13]:
gscv.cv_results_['mean_test_score']    # 교차 검증 점수(cv=5)

array([0.85780355, 0.85799604, 0.85799604, ..., 0.86126601, 0.86165063,
       0.86357629])

In [14]:
np.max(gscv.cv_results_['mean_test_score'])    # 최적화 교차 검증 점수(cv=5)

0.8683865773302731

In [15]:
dtc = gscv.best_estimator_    # 하이퍼 파라미터 최적화 값 모델 저장

### 결정 트리

In [16]:
dtc.score( xtrain, ytrain )            # 훈련 데이터 정확도

0.892053107562055

In [17]:
dtc.score( xtest, ytest )              # 테스트 데이터 정확도

0.8615384615384616

# 랜덤 서치

In [18]:
from scipy.stats import uniform, randint

params = {
    'min_impurity_decrease': uniform(0.0001, 0.001)  # 불순도 감소 최소량
    , 'max_depth': randint(20, 50)
    , 'min_samples_split': randint(2, 25)
    , 'min_samples_leaf': randint(1, 25)
    }

In [19]:
from sklearn.model_selection import RandomizedSearchCV
rscv = RandomizedSearchCV(
    DecisionTreeClassifier(random_state=42)
    , params
    , n_iter=100    # 샘플링 횟수
    , n_jobs=-1
    , random_state=42)

rscv.fit(xtrain, ytrain)

rscv.best_params_  # 하이퍼 파라미터 최적화 값

{'max_depth': 39,
 'min_impurity_decrease': 0.00034102546602601173,
 'min_samples_leaf': 7,
 'min_samples_split': 13}

In [20]:
rscv.cv_results_['mean_test_score']    # 교차 검증 점수(cv=5)

array([0.86511513, 0.86261235, 0.86838528, 0.86588547, 0.86376731,
       0.86434497, 0.86280503, 0.86280484, 0.86357592, 0.86357555,
       0.86280503, 0.8626142 , 0.86472977, 0.86954283, 0.86203543,
       0.86761827, 0.86222884, 0.86473033, 0.86877082, 0.86184423,
       0.86126657, 0.86511494, 0.8626142 , 0.86203543, 0.86511476,
       0.86607722, 0.86222773, 0.86684682, 0.86261309, 0.86338436,
       0.8629977 , 0.86242171, 0.86184478, 0.86165211, 0.86049808,
       0.86530706, 0.86280521, 0.86684775, 0.86203524, 0.86318983,
       0.86780947, 0.86761624, 0.86126694, 0.86934867, 0.86857889,
       0.86530743, 0.86434497, 0.86415303, 0.86838602, 0.86530688,
       0.86145813, 0.86684626, 0.8618446 , 0.86145961, 0.86338454,
       0.86569131, 0.86242152, 0.86376805, 0.86203543, 0.86376916,
       0.86511457, 0.86184275, 0.86338454, 0.86242004, 0.86107481,
       0.86203654, 0.86184478, 0.86434552, 0.86184478, 0.86338473,
       0.86299993, 0.8641534 , 0.86338269, 0.85972662, 0.86415

In [21]:
np.max(rscv.cv_results_['mean_test_score'])    # 최적화 교차 검증 점수(cv=5)

0.8695428296438884

In [22]:
dtc = rscv.best_estimator_    # 하이퍼 파라미터 최적화 값 모델 저장

In [23]:
dtc.score( xtest, ytest )              # 테스트 데이터 정확도

0.86

In [24]:
dtc.score( xtrain, ytrain )              # 테스트 데이터 정확도

0.8928227823744468