In [1]:
import pandas as pd
import numpy as np
from IPython.display import display

import matplotlib.pyplot as plt
from matplotlib import rc # 한글 그래픽 처리
import seaborn as sns
from sklearn.datasets import load_breast_cancer
%matplotlib inline

In [2]:
# Load the breast-cancer dataset. `cancer.target` is the label vector y
# (binary classification, encoded as 0/1); `cancer.data` is the feature matrix.
# NOTE(review): per the scikit-learn docs the encoding is 0 = malignant,
# 1 = benign -- confirm before reading "1" as "has cancer".
cancer = load_breast_cancer()

# Hold-out 방법

In [5]:
# Hold-out split into train and test sets.
# Stratifying on the target keeps the class ratio the same in both parts;
# the fixed random_state makes the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    stratify=cancer.target,
    random_state=42
)

In [3]:
# Define the model: a decision-tree classifier.
from sklearn.tree import DecisionTreeClassifier
# For a regression task use instead: from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score
# random_state is pinned so the fitted tree is reproducible across runs.
tree = DecisionTreeClassifier(random_state=0)

In [34]:
# Train on the training split, then predict the held-out test split.
# (`fit` returns the estimator itself, so the two calls can be chained.)
pred_tree = tree.fit(X_train, y_train).predict(X_test)

# Hold-out accuracy; as the last expression it is shown as the cell output.
accuracy_score(y_test, pred_tree)

0.9370629370629371

# Cross-validation 방법

* cross_val_score(모델, X값, y값, cv=폴드 수): 데이터를 cv개의 폴드로 나누어 교차 검증을 cv번 수행하고, 각 폴드의 점수를 반환

In [19]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the tree on the full dataset;
# `scores` holds one score per fold.
scores = cross_val_score(
    estimator=tree,
    X=cancer.data,
    y=cancer.target,
    cv=5
)

In [20]:
# Report the per-fold scores (result of the 5 folds) followed by their mean,
# which gives a rough estimate of the model's overall performance.
print(
    'cross-val-score \n{}'.format(scores),
    'cross-val-score.mean \n{:.5f}'.format(scores.mean()),
    sep='\n'
)

cross-val-score 
[0.90434783 0.92173913 0.91150442 0.94690265 0.90265487]
cross-val-score.mean 
0.91743


# Parameter tuning

## 1. Grid search

In [16]:
from sklearn.model_selection import GridSearchCV

# Search space: every combination is tried (3 depths x 3 split thresholds = 9).
param_grid = dict(
    max_depth=[5, 6, 8],
    min_samples_split=[2, 3, 4]
)

# Exhaustive grid search, each candidate scored by accuracy under
# 5-fold cross-validation. n_jobs=-1 asks scikit-learn to use all CPU cores.
gd_sr = GridSearchCV(
    estimator=tree,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1
)

In [17]:
# Run the grid search (trains and cross-validates every candidate).
# NOTE(review): the search is fit on the full dataset, not on X_train, so
# X_test is not an untouched hold-out for the tuned model -- confirm intent.
gd_sr.fit(cancer.data, cancer.target)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=0,
                                              splitter='best'),
             iid='warn', n_jobs=-1,
             param_grid={'max_depth': [5, 6, 8],
                         'min_samples_split': [2, 3, 4]},
             pre_dispatch='2*n_job

In [18]:
#최고 점수와 파라미터 보기
best_parameters = gd_sr.best_params_
print(best_parameters)
best_result = gd_sr.best_score_
print(best_result)

{'max_depth': 6, 'min_samples_split': 4}
0.9173989455184535


## 2. Random search

In [10]:
from sklearn.model_selection import RandomizedSearchCV

# Search space to sample from: 6 depths x 5 split thresholds = 30 combinations.
# Unlike grid search, only a random subset of them is evaluated
# (n_iter is left at its scikit-learn default -- 10 per the docs).
param_grid = {'max_depth': [5, 6, 8, 10, 11, 9],
              'min_samples_split': [2, 3, 5, 8, 4]}

# Randomized search, each sampled candidate scored by accuracy under
# 5-fold cross-validation. n_jobs=-1 asks scikit-learn to use all CPU cores.
# random_state is fixed so the same candidates are sampled on every run;
# without it best_params_ can change each time the notebook is re-executed.
rd_sr = RandomizedSearchCV(tree,
                           param_distributions=param_grid,
                           scoring='accuracy',
                           cv=5,
                           n_jobs=-1,
                           random_state=0)

In [11]:
# Run the randomized search (trains and cross-validates each sampled candidate).
# NOTE(review): fit on the full dataset rather than X_train -- confirm intent.
rd_sr.fit(cancer.data, cancer.target)

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=DecisionTreeClassifier(class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features=None,
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    presort=False,
                                                    random_state=0,
                                                    splitter='best'),
 

In [13]:
#최고 점수와 파라미터 보기
best_parameters = rd_sr.best_params_
print(best_parameters)
best_result = rd_sr.best_score_
print(best_result)

{'min_samples_split': 3, 'max_depth': 11}
0.9173989455184535
