## [13주차-2번] 교차검증을 통한 과적합 분석하기

In [20]:
# https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_breast_cancer.html
# Breast cancer dataset, as described on the page above:
#   Dimensionality: 30
#   Classes: 2
#   212 malignant samples (label 0), 357 benign samples (label 1)

from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()

In [21]:
# Feature matrix and class labels taken from the loaded dataset object.
X, Y = cancer.data, cancer.target

In [22]:
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html?highlight=train_test_split#sklearn.model_selection.train_test_split

# Split the data: hold out 20% as a test set, stratified on the labels,
# with a fixed seed so the split is the same on every run.
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=1,
    stratify=Y,
)

In [23]:
# https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
# Build the model as a single pipeline: scaling -> PCA -> classifier.

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

pipeline = make_pipeline(
    StandardScaler(),        # standardize the features
    PCA(n_components=4),     # keep 4 principal components
    LogisticRegression(),    # classifier with default settings
)


In [24]:
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html#sklearn.model_selection.cross_validate

# 10-fold cross-validation of the pipeline on the training split.
# return_train_score=True also records the score on each fold's training part,
# which is needed to compare training vs. validation accuracy.
from sklearn.model_selection import cross_validate
scores = cross_validate(
    estimator=pipeline,
    X=X_train,
    y=Y_train,
    cv=10,
    return_train_score=True,
)


In [25]:
# List which result arrays the cross-validation produced (iterating a dict yields its keys).
sorted(scores)

['fit_time', 'score_time', 'test_score', 'train_score']

In [26]:
import numpy as np

# scores['train_score'] holds the accuracy on the *training* part of each CV fold
# (recorded because cross_validate was called with return_train_score=True).
# Label it as training accuracy so it is not confused with the validation
# accuracy in scores['test_score'].
train_scores = scores['train_score']
print('CV Training Accuracy scores: ', train_scores)
print('CV Training Accuracy: %.3f +/- %.3f' % (np.mean(train_scores), np.std(train_scores)))

CV Validation Accuracy scores:  [0.96577017 0.96577017 0.96577017 0.96577017 0.96821516 0.96585366
 0.97073171 0.96829268 0.97560976 0.96585366]
CV Validation Accuracy: 0.968 +/- 0.003


In [27]:
import numpy as np

# scores['test_score'] holds the accuracy on the held-out part of each CV fold.
val_scores = scores['test_score']
val_mean, val_std = np.mean(val_scores), np.std(val_scores)
print('CV Validation Accuracy scores: ', val_scores)
print('CV Validation Accuracy: %.3f +/- %.3f' % (val_mean, val_std))

CV Validation Accuracy scores:  [0.97826087 0.97826087 0.95652174 1.         0.95652174 0.97777778
 0.93333333 0.95555556 0.91111111 1.        ]
CV Validation Accuracy: 0.965 +/- 0.027


### 교차 검증 결과
- 훈련 정확도(train_score, 평균 0.968)와 검증 정확도(test_score, 평균 0.965)가 모두 높고 두 값의 차이가 거의 없으므로, 과대적합도 과소적합도 발생하지 않은 것으로 판단된다.

## [13주차-3번] 최적모델 선정

In [32]:
from sklearn.model_selection import GridSearchCV

# NOTE(review): the parameter grid is empty, so the search evaluates a single
# candidate (the pipeline with its current settings) under 10-fold CV.
# To actually compare models, add entries such as 'pca__n_components' or
# 'logisticregression__C' here — TODO confirm the intended search space.
parameters = {}
gs = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring='accuracy',
    cv=10,
)
gs.fit(X_train, Y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('standardscaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('pca',
                                        PCA(copy=True, iterated_power='auto',
                                            n_components=4, random_state=None,
                                            svd_solver='auto', tol=0.0,
                                            whiten=False)),
                                       ('logisticregression',
                                        LogisticRegression(C=1.0,
                                                           class_weight=None,
                                                           dual=False,
                                      

In [33]:
# Keep a handle to the estimator selected by the grid search for the final evaluation.
best = gs.best_estimator_

In [34]:
# Inspect the per-split and aggregated cross-validation results recorded by the grid search.
gs.cv_results_

{'mean_fit_time': array([0.00662093]),
 'mean_score_time': array([0.00051041]),
 'mean_test_score': array([0.9647343]),
 'params': [{}],
 'rank_test_score': array([1], dtype=int32),
 'split0_test_score': array([0.97826087]),
 'split1_test_score': array([0.97826087]),
 'split2_test_score': array([0.95652174]),
 'split3_test_score': array([1.]),
 'split4_test_score': array([0.95652174]),
 'split5_test_score': array([0.97777778]),
 'split6_test_score': array([0.93333333]),
 'split7_test_score': array([0.95555556]),
 'split8_test_score': array([0.91111111]),
 'split9_test_score': array([1.]),
 'std_fit_time': array([0.00164769]),
 'std_score_time': array([3.95498696e-05]),
 'std_test_score': array([0.02665336])}

In [30]:
from sklearn.metrics import accuracy_score

# Accuracy of the selected pipeline on the training split.
Y_train_Pred = best.predict(X_train)
accuracy_score(y_true=Y_train, y_pred=Y_train_Pred)


0.967032967032967

In [31]:
from sklearn.metrics import accuracy_score

# Accuracy of the selected pipeline on the held-out test split.
Y_test_Pred = best.predict(X_test)
accuracy_score(y_true=Y_test, y_pred=Y_test_Pred)


0.9736842105263158