In [85]:
import numpy as np
import pandas as pd

titanic_df = pd.read_csv('./csv-data/titanic_train.csv')
titanic_df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [86]:
from sklearn import preprocessing

def encode_features(dataDF):
    features = ['Cabin', 'Sex', 'Embarked']
    for feature in features:
        le = preprocessing.LabelEncoder()
        le = le.fit(dataDF[feature])
        dataDF[feature] = le.transform(dataDF[feature])
        
    return dataDF

titanic_df = encode_features(titanic_df)
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,147,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,81,0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,147,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,55,2
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,147,2


In [87]:
from sklearn.preprocessing import LabelEncoder

# Null 처리 함수
def fillna(df):
    df['Age'].fillna(df['Age'].mean(),inplace=True)
    df['Cabin'].fillna('N',inplace=True)
    df['Embarked'].fillna('N',inplace=True)
    df['Fare'].fillna(0,inplace=True)
    return df

# 머신러닝 알고리즘에 불필요한 속성 제거
def drop_features(df):
    df.drop(['PassengerId','Name','Ticket'],axis=1,inplace=True)
    return df

# 레이블 인코딩 수행. 
def format_features(df):
    df['Cabin'] = df['Cabin'].str[:1]
    features = ['Cabin','Sex','Embarked']
    for feature in features:
        le = LabelEncoder()
        le = le.fit(df[feature])
        df[feature] = le.transform(df[feature])
    return df

# 앞에서 설정한 Data Preprocessing 함수 호출
def transform_features(df):
    df = fillna(df)
    df = drop_features(df)
    df = format_features(df)
    return df

In [88]:
# 원본 데이터를 재로딩 하고, feature데이터 셋과 Label 데이터 셋 추출. 
titanic_df = pd.read_csv('./csv-data/titanic_train.csv')
y_titanic_df = titanic_df['Survived']
X_titanic_df= titanic_df.drop('Survived',axis=1)

X_titanic_df = transform_features(X_titanic_df)

In [89]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(X_titanic_df, y_titanic_df, \
                                                  test_size=0.2, random_state=11)

In [90]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# 결정트리, Random Forest, 로지스틱 회귀를 위한 사이킷런 Classifier 클래스 생성
dt_clf = DecisionTreeClassifier(random_state=11)
rf_clf = RandomForestClassifier(random_state=11)
lr_clf = LogisticRegression()
svm_svc = SVC() ##추가해보자.!!

# DecisionTreeClassifier 학습/예측/평가
dt_clf.fit(X_train , y_train)
dt_pred = dt_clf.predict(X_test)
print('DecisionTreeClassifier 정확도: {0:.4f}'.format(accuracy_score(y_test, dt_pred)))

# RandomForestClassifier 학습/예측/평가
rf_clf.fit(X_train , y_train)
rf_pred = rf_clf.predict(X_test)
print('RandomForestClassifier 정확도:{0:.4f}'.format(accuracy_score(y_test, rf_pred)))

# LogisticRegression 학습/예측/평가
lr_clf.fit(X_train , y_train)
lr_pred = lr_clf.predict(X_test)
print('LogisticRegression 정확도: {0:.4f}'.format(accuracy_score(y_test, lr_pred)))


# SVM 학습/예측/평가
svm_svc.fit(X_train , y_train)
svm_pred = lr_clf.predict(X_test)
print('SVM 정확도: {0:.4f}'.format(accuracy_score(y_test, lr_pred)))

DecisionTreeClassifier 정확도: 0.7877
RandomForestClassifier 정확도:0.8547
LogisticRegression 정확도: 0.8492
SVM 정확도: 0.8492


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### k-fold

In [91]:
from sklearn.model_selection import KFold
kfold = KFold(n_splits=5, shuffle=True)
# kfold 의 결과는 튜플형태 

In [92]:
for iter_count , (train_index, test_index) in enumerate(kfold.split(X_titanic_df)) :
    print(iter_count)

0
1
2
3
4


In [93]:
X_titanic_df.shape

(891, 8)

In [94]:
for iter_count , (train_index, test_index) in enumerate(kfold.split(X_titanic_df)):
        print(iter_count, train_index) #인덱스만 추출한 것임.
        #test데이터가 변경됨! => iter_count별로 train데이터, test데이터 구성 비교해볼 것.

0 [  0   1   2   4  11  12  13  14  17  19  20  22  25  26  27  30  31  32
  33  35  36  37  39  40  41  42  43  44  45  46  48  49  50  52  53  54
  55  56  57  58  59  60  62  63  64  66  68  69  70  71  72  74  76  77
  78  81  82  83  84  85  86  87  88  89  90  91  94  95  96  97  98  99
 100 101 102 103 104 105 108 109 110 111 113 114 115 116 118 119 120 121
 122 123 124 125 126 128 129 130 131 132 133 134 136 137 138 139 140 141
 142 143 144 145 146 147 148 149 150 152 154 155 156 157 158 160 161 163
 164 165 166 167 168 169 172 174 175 177 178 179 180 181 182 183 184 185
 186 187 188 189 190 191 192 193 195 196 198 199 201 204 205 206 207 209
 210 211 212 213 214 215 216 217 218 219 220 221 223 224 225 226 227 228
 229 231 232 233 235 237 238 239 240 242 243 244 245 246 248 249 250 252
 253 254 255 256 257 258 259 260 261 262 263 264 265 266 269 270 271 272
 273 274 275 276 277 278 279 283 284 285 287 288 289 290 291 293 294 295
 296 297 299 300 301 302 303 304 305 307 311 313 

In [95]:
X_titanic_df.values[train_index]

array([[ 3.        ,  1.        , 22.        , ...,  7.25      ,
         7.        ,  3.        ],
       [ 1.        ,  0.        , 38.        , ..., 71.2833    ,
         2.        ,  0.        ],
       [ 3.        ,  0.        , 26.        , ...,  7.925     ,
         7.        ,  3.        ],
       ...,
       [ 1.        ,  0.        , 19.        , ..., 30.        ,
         1.        ,  3.        ],
       [ 3.        ,  0.        , 29.69911765, ..., 23.45      ,
         7.        ,  3.        ],
       [ 3.        ,  1.        , 32.        , ...,  7.75      ,
         7.        ,  2.        ]])

In [96]:
# kfold 교차검증 함수 생성 

def exec_kfold(clf, folds=5) :
    #clf = classification ; 모델객체
    # 폴드 세트를 5개인 KFold 객체를 생성, 폴드 수 만큼 예측결과 저장을 위한 리스트 객체 생성
    kfold = KFold(n_splits=folds)
    scores=[]
    
    # kfold 교차검증수행
    for iter_count, (train_index, test_index) in enumerate(kfold.split(X_titanic_df)) :
        X_train, X_test = X_titanic_df.values[train_index], X_titanic_df.values[test_index]
        y_train, y_test = y_titanic_df.values[train_index], y_titanic_df.values[test_index]
        
        #classifier 학습, 예측, 정확도 계산
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        scores.append(accuracy)
        print("교차검증 {0} 정확도 :{1:.4f}".format(iter_count, accuracy))
        
    mean_score = np.mean(scores)
    print("평균 정확도 : {0:4f}".format(mean_score))
    return mean_score

In [97]:
# decision tree classifier
exec_kfold(dt_clf, folds=5)

교차검증 0 정확도 :0.7542
교차검증 1 정확도 :0.7809
교차검증 2 정확도 :0.7865
교차검증 3 정확도 :0.7697
교차검증 4 정확도 :0.8202
평균 정확도 : 0.782299


0.782298662984119

In [98]:
# randomforecast classifier
exec_kfold(rf_clf, folds=5)

교차검증 0 정확도 :0.7933
교차검증 1 정확도 :0.8090
교차검증 2 정확도 :0.8371
교차검증 3 정확도 :0.7753
교차검증 4 정확도 :0.8596
평균 정확도 : 0.814839


0.8148389931579938

In [99]:
# logistic regression
exec_kfold(lr_clf, folds=5)

교차검증 0 정확도 :0.8045
교차검증 1 정확도 :0.7809
교차검증 2 정확도 :0.7753
교차검증 3 정확도 :0.7584
교차검증 4 정확도 :0.8202
평균 정확도 : 0.787860


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.7878601468834348

In [100]:
# svm
exec_kfold(svm_svc, folds=5)

교차검증 0 정확도 :0.5866
교차검증 1 정확도 :0.6685
교차검증 2 정확도 :0.6685
교차검증 3 정확도 :0.6629
교차검증 4 정확도 :0.7079
평균 정확도 : 0.658891


0.658891469462055

In [101]:
# all_result=[]
# for i in range(2,10) :
#     for j in method : 
#         all_result.append(exec_kfold(j,folds=i))
# max(all_result)

In [102]:
method=[svm_svc,dt_clf, rf_clf, lr_clf]
all_result=[]
for j in method : 
    all_result.append(exec_kfold(j,folds=5))

교차검증 0 정확도 :0.5866
교차검증 1 정확도 :0.6685
교차검증 2 정확도 :0.6685
교차검증 3 정확도 :0.6629
교차검증 4 정확도 :0.7079
평균 정확도 : 0.658891
교차검증 0 정확도 :0.7542
교차검증 1 정확도 :0.7809
교차검증 2 정확도 :0.7865
교차검증 3 정확도 :0.7697
교차검증 4 정확도 :0.8202
평균 정확도 : 0.782299
교차검증 0 정확도 :0.7933
교차검증 1 정확도 :0.8090
교차검증 2 정확도 :0.8371
교차검증 3 정확도 :0.7753
교차검증 4 정확도 :0.8596
평균 정확도 : 0.814839
교차검증 0 정확도 :0.8045
교차검증 1 정확도 :0.7809
교차검증 2 정확도 :0.7753
교차검증 3 정확도 :0.7584
교차검증 4 정확도 :0.8202
평균 정확도 : 0.787860


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [103]:
# accuracy 평균값 중 max값 출력함수 

def max_acc(score_list) :
    max_value = 0 
    index = 0 
    
    for i, model_result in enumerate(score_list):
        if model_result > max_value : 
            max_value = model_result 
            index = i 
    return max_value, index

In [104]:
max_acc(all_result)

(0.8148389931579938, 2)

In [105]:
print(method[2])

RandomForestClassifier(random_state=11)


In [106]:
def max_acc2(clf, folds=5) :
    kfold = KFold(n_splits=folds)
    scores=[]
    
    # kfold 교차검증수행
    for iter_count, (train_index, test_index) in enumerate(kfold.split(X_titanic_df)) :
        X_train, X_test = X_titanic_df.values[train_index], X_titanic_df.values[test_index]
        y_train, y_test = y_titanic_df.values[train_index], y_titanic_df.values[test_index]
        
        #classifier 학습, 예측, 정확도 계산
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        scores.append(accuracy)
    print("method :",clf, " max_accuracy :",max(scores))


In [107]:
max_acc2(dt_clf, folds=5)

method : DecisionTreeClassifier(random_state=11)  max_accuracy : 0.8202247191011236


In [108]:
def exec_kfold2(clf, folds=5) :
    #clf = classification ; 모델객체
    # 폴드 세트를 5개인 KFold 객체를 생성, 폴드 수 만큼 예측결과 저장을 위한 리스트 객체 생성
    kfold = KFold(n_splits=folds)
    scores=[]
    
    # kfold 교차검증수행
    for iter_count, (train_index, test_index) in enumerate(kfold.split(X_titanic_df)) :
        X_train, X_test = X_titanic_df.values[train_index], X_titanic_df.values[test_index]
        y_train, y_test = y_titanic_df.values[train_index], y_titanic_df.values[test_index]
        
        #classifier 학습, 예측, 정확도 계산
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        scores.append(accuracy)
    return scores


In [109]:
method=[svm_svc,dt_clf, rf_clf, lr_clf]

for i in method : 
    print(i, np.mean(exec_kfold2(i, folds=5)))
    

SVC() 0.658891469462055
DecisionTreeClassifier(random_state=11) 0.782298662984119
RandomForestClassifier(random_state=11) 0.8148389931579938
LogisticRegression() 0.7878601468834348


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [110]:
def callKFold_all(model, folds=5) : 
    kfold = KFold(n_splits=folds)
    mean_list=[]
    
    for clf in models :
        scores=[]
        # kfold 교차검증수행
        for iter_count, (train_index, test_index) in enumerate(kfold.split(X_titanic_df)) :
            X_train, X_test = X_titanic_df.values[train_index], X_titanic_df.values[test_index]
            y_train, y_test = y_titanic_df.values[train_index], y_titanic_df.values[test_index]
        
        #classifier 학습, 예측, 정확도 계산
            clf.fit(X_train, y_train)
            predictions = clf.predict(X_test)
            accuracy = accuracy_score(y_test, predictions)
            scores.append(accuracy)
        mean_score = np.mean(scores)
        mean_list.append(mean_score)

    print('최적의 model은 ', models[np.argmax(mean_list)], "최대 accuracy는 ", max(mean_list))

In [111]:
models = [svm_svc,dt_clf, rf_clf, lr_clf]
callKFold_all(models, folds=5)

최적의 model은  RandomForestClassifier(random_state=11) 최대 accuracy는  0.8148389931579938


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [112]:
#라이브러리 사용 
from sklearn.model_selection import cross_val_score

def callKFold_all2(models):
    result=[]
    for i in models : 
        scores = cross_val_score(i, X_titanic_df , y_titanic_df , cv=5)
        print(scores)
        result.append(np.mean(scores))
        
    print('\n-------------------------------')
    print('최적의 model은 ', models[np.argmax(result)])
    print('최대 accuracy는', max(result)) 
    
callKFold_all2(models)

[0.60335196 0.71348315 0.67977528 0.67977528 0.68539326]
[0.74301676 0.7752809  0.79213483 0.78651685 0.84269663]
[0.79329609 0.79775281 0.84831461 0.76404494 0.86516854]
[0.79888268 0.76966292 0.78089888 0.7752809  0.79775281]

-------------------------------
최적의 model은  RandomForestClassifier(random_state=11)
최대 accuracy는 0.8137153976523758


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [113]:
# 딕셔너리로 저장

from sklearn.model_selection import cross_val_score

def callKFold_all3(models):
    result=dict()
    for i in models : 
        scores = cross_val_score(i, X_titanic_df , y_titanic_df , cv=5)
        mean_score = np.mean(scores)
        result[i] = mean_score
    max_result =max(result.values())
    for key, value in result.items():
        if value == max_result :
            print("method :", key)
    print("accuracy :",max_result)
    
callKFold_all3(models)

method : RandomForestClassifier(random_state=11)
accuracy : 0.8137153976523758


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

### cross_val_score

In [114]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(dt_clf, X_titanic_df, y_titanic_df, cv=5)
scores

array([0.74301676, 0.7752809 , 0.79213483, 0.78651685, 0.84269663])

In [115]:
type(scores)

numpy.ndarray

In [116]:
for index, accuracy in enumerate(scores) :
    print("cross validation : {0} => 정확도 : {1:.4f}".format(index+1, accuracy))

cross validation : 1 => 정확도 : 0.7430
cross validation : 2 => 정확도 : 0.7753
cross validation : 3 => 정확도 : 0.7921
cross validation : 4 => 정확도 : 0.7865
cross validation : 5 => 정확도 : 0.8427


In [117]:
print("average accuracy : {0:.4f}".format(np.mean(scores)))

average accuracy : 0.7879


### Grid Search CV

In [118]:
from sklearn.model_selection import GridSearchCV

parameters = {'max_depth':[2,3,5,10], 
              'min_samples_split':[2,3,5], 'min_samples_leaf':[1,5,8]}

# depth : 어디까지 공부시킬지 ; 데이터의 성격이 명확한 데이터(iris)는 depth를 낮게,
# 속성의 수와 row의 수에 따라서 depth가 달라질 수 있다.

# overfitting 피하기 : depth를 낮추거나 decision tree의 경우 drop out 시킴

# dt_clf, rf_clf, lr_clf, svm_svc


In [119]:
grid_dclf = GridSearchCV(dt_clf, param_grid=parameters, scoring = 'accuracy', cv=5)
grid_dclf.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=11),
             param_grid={'max_depth': [2, 3, 5, 10],
                         'min_samples_leaf': [1, 5, 8],
                         'min_samples_split': [2, 3, 5]},
             scoring='accuracy')

In [120]:
#GridSearchCV 최적 하이퍼 파라미터
grid_dclf.best_params_

{'max_depth': 3, 'min_samples_leaf': 5, 'min_samples_split': 2}

In [121]:
best_dclf = grid_dclf.best_estimator_
best_dclf

DecisionTreeClassifier(max_depth=3, min_samples_leaf=5, random_state=11)

In [122]:
#GridSearchCV 최고 정확도
best_score = grid_dclf.best_score_
best_score

0.7991825076332119

In [123]:
#GridSearchCV의 최적 하이퍼 파라미터로 학습된 Estimator로 예측 및 평가 수행
dpredictions = best_dclf.predict(X_test)
dpredictions

array([1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0], dtype=int64)

In [124]:
#테스트 세트에서의 DecisionTreeClassifier 정확도
accuracy = accuracy_score(y_test, dpredictions)
accuracy

0.8715083798882681