# 교차검증!

In [1]:
import numpy as np
from sklearn.model_selection import KFold

In [2]:
# Toy feature matrix: the rows [1, 2] and [3, 4], repeated twice (4 samples x 2 features).
X = np.array([[1, 2], [3, 4]] * 2)

# One target value per sample.
y = np.array(range(1, 5))

In [3]:
X

array([[1, 2],
       [3, 4],
       [1, 2],
       [3, 4]])

In [4]:
y

array([1, 2, 3, 4])

In [7]:
# Two consecutive, unshuffled folds. The previously commented-out random_state=13
# is only meaningful together with shuffle=True, so it is not passed here.
kf = KFold(n_splits=2)

# Number of train/validation rounds the splitter will produce.
print(kf.get_n_splits(X))

2


In [8]:
print(kf)

KFold(n_splits=2, random_state=None, shuffle=False)


In [11]:
# Walk through every fold, showing the row positions and the rows they select.
for train_idx, test_idx in kf.split(X):
    print('-------- index')
    print(train_idx, test_idx)
    for banner, picked in (
        ('-------- train data', train_idx),
        ('-------- validation data', test_idx),
    ):
        print(banner)
        print(X[picked])

-------- index
[2 3] [0 1]
-------- train data
[[1 2]
 [3 4]]
-------- validation data
[[1 2]
 [3 4]]
-------- index
[0 1] [2 3]
-------- train data
[[1 2]
 [3 4]]
-------- validation data
[[1 2]
 [3 4]]


- 교차검증(K-Fold)은 KFold(n_splits=2)처럼 몇 등분으로 나눌지 정한 다음, kf.split(X)가 반환하는 인덱스로 학습/검증 데이터를 나누어 학습할 수 있도록 한다.

# 와인 교차검증 해보자

In [1]:
import pandas as pd

## Load the wine-quality data (semicolon-separated CSV files)
red_url = "https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-red.csv"
white_url = "https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-white.csv"

red_wine = pd.read_csv(red_url, sep=";")
white_wine = pd.read_csv(white_url, sep=";")

# Add a colour label column: 1 = red, 0 = white
red_wine['color'] = 1
white_wine['color'] = 0

# Combined wine data. ignore_index=True gives the result a fresh 0..n-1 index;
# without it each frame keeps its own row labels, so the same label would
# appear once for a red row and once for a white row.
wine = pd.concat([red_wine, white_wine], ignore_index=True)

In [2]:
# Binary target: 1 when quality is above 5, otherwise 0 (assigned row by row, by position).
wine['taste'] = [int(grade > 5) for grade in wine['quality']]

# Features: every column except the target and the quality score it was derived from.
y = wine['taste']
X = wine.drop(columns=['taste', 'quality'])

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

# Shallow tree (depth 2) with a fixed seed.
wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)
wine_tree.fit(X_train, y_train) # fit on the training split only

In [5]:
y_pred_tr = wine_tree.predict(X_train) # predictions for the rows the tree was fitted on
y_pred_test = wine_tree.predict(X_test) # predictions for the held-out test rows

print('Train Acc : ', accuracy_score(y_train, y_pred_tr)) # accuracy on the training split
print('Test Acc : ',accuracy_score(y_test, y_pred_test)) # accuracy on the test split

Train Acc :  0.7294593034442948
Test Acc :  0.7161538461538461


## 이제 k폴딩 5번

In [6]:
from sklearn.model_selection import KFold

# Five consecutive folds, no shuffling.
# NOTE(review): wine is built as red rows followed by white rows, so unshuffled
# folds are not colour-balanced. Consider KFold(n_splits=5, shuffle=True, random_state=13)
# if that is not intended; doing so will change the recorded scores.
kfold = KFold(n_splits=5)
# Same tree settings as the hold-out experiment (depth 2, fixed seed).
wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=13)

In [10]:
# Show how many rows land in the training and validation part of each fold.
for train_idx, test_idx in kfold.split(X):
    print(train_idx.shape[0], test_idx.shape[0])

5197 1300
5197 1300
5198 1299
5198 1299
5198 1299


In [11]:
# Accuracy of the tree on each validation fold.
cv_accuracy = []

# Fold-scoped names are used on purpose: reusing X_train / X_test / y_train / y_test
# here would silently overwrite the hold-out split created by train_test_split,
# so re-running the hold-out cells afterwards would score against the last fold.
for train_idx, test_idx in kfold.split(X):
    X_fold_train, y_fold_train = X.iloc[train_idx], y.iloc[train_idx]
    X_fold_valid, y_fold_valid = X.iloc[test_idx], y.iloc[test_idx]

    # Fit on this fold's training rows, then score on its validation rows.
    wine_tree_cv.fit(X_fold_train, y_fold_train)
    fold_pred = wine_tree_cv.predict(X_fold_valid)
    cv_accuracy.append(accuracy_score(y_fold_valid, fold_pred))

cv_accuracy

[0.6007692307692307,
 0.6884615384615385,
 0.7090069284064665,
 0.7628945342571208,
 0.7867590454195535]

- 각 acc의 분산값이 크지 않다면 평균을 대표값으로 사용한다.

In [14]:
import numpy as np

np.mean(cv_accuracy)

0.709578255462782

kfold + stratify : kfold할때 stratify를 사용해 라벨비율을 일정하게 유지
    
- kfold : 트레이닝데이터를 다시 일정 등분으로 나누어 학습, 검증을 하여 교차검증하는 방법
- stratify : 지정한 Data의 비율을 유지한다. 예를 들어, Label Set인 Y가 25%의 0과 75%의 1로 이루어진 Binary Set일 때, stratify=Y로 설정하면 나누어진 데이터셋들도 0과 1을 각각 25%, 75%로 유지한 채 분할된다.

In [15]:
from sklearn.model_selection import StratifiedKFold

# Stratified splitter: needs the labels (y) as well as X when splitting.
skfold = StratifiedKFold(n_splits=5)
wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=13)

# Accuracy of the tree on each validation fold.
cv_accuracy = []

# Fold-scoped names are used on purpose: reusing X_train / X_test / y_train / y_test
# here would silently overwrite the hold-out split created by train_test_split.
for train_idx, test_idx in skfold.split(X, y):
    X_fold_train, y_fold_train = X.iloc[train_idx], y.iloc[train_idx]
    X_fold_valid, y_fold_valid = X.iloc[test_idx], y.iloc[test_idx]

    # Fit on this fold's training rows, then score on its validation rows.
    wine_tree_cv.fit(X_fold_train, y_fold_train)
    fold_pred = wine_tree_cv.predict(X_fold_valid)
    cv_accuracy.append(accuracy_score(y_fold_valid, fold_pred))

cv_accuracy

[0.5523076923076923,
 0.6884615384615385,
 0.7143956889915319,
 0.7321016166281755,
 0.7567359507313318]

In [16]:
np.mean(cv_accuracy)

0.6888004974240539

# 반복문 안쓰고 더 간단한 방법

In [17]:
from sklearn.model_selection import cross_val_score

skfold = StratifiedKFold(n_splits=5)
wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=13)

# One call replaces the manual split / fit / predict / score loop and
# returns one score per fold.
cross_val_score(wine_tree_cv,X,y, cv=skfold)

array([0.55230769, 0.68846154, 0.71439569, 0.73210162, 0.75673595])

In [18]:
from sklearn.model_selection import cross_val_score

skfold = StratifiedKFold(n_splits=5)
# Same evaluation as the previous cell, but with a deeper tree (max_depth=5).
wine_tree_cv = DecisionTreeClassifier(max_depth=5, random_state=13)

cross_val_score(wine_tree_cv,X,y, cv=skfold)

array([0.50076923, 0.62615385, 0.69745958, 0.7582756 , 0.74903772])

In [23]:
def skfold_dt(depth):
    """Print the per-fold scores of a decision tree with the given max depth.

    The tree (fixed seed 13) is evaluated with 5-fold stratified
    cross-validation on the notebook-level ``X`` and ``y``.

    Parameters
    ----------
    depth : value passed to ``DecisionTreeClassifier(max_depth=...)``.

    Returns
    -------
    None. The array of fold scores is printed, not returned.
    """
    from sklearn.model_selection import cross_val_score

    fold_scores = cross_val_score(
        DecisionTreeClassifier(max_depth=depth, random_state=13),
        X,
        y,
        cv=StratifiedKFold(n_splits=5),
    )
    print(fold_scores)

In [24]:
skfold_dt(3)

[0.56846154 0.68846154 0.71439569 0.73210162 0.75673595]


- depth가 높다고 무조건 acc가 좋아지지는 않는다!

## 트레이닝 데이터 score와 함께 보는 방법

In [30]:
from sklearn.model_selection import cross_validate

# scoring=None is the default, so it may be omitted.
# wine_tree_cv and skfold are the objects assigned in earlier cells.
cross_validate(wine_tree_cv,X,y, scoring=None, cv=skfold, return_train_score=True)

{'fit_time': array([0.01895952, 0.0149734 , 0.01396203, 0.01499057, 0.01401401]),
 'score_time': array([0.00202084, 0.00198913, 0.00099802, 0.00198817, 0.0019803 ]),
 'test_score': array([0.50076923, 0.62615385, 0.69745958, 0.7582756 , 0.74903772]),
 'train_score': array([0.78795459, 0.78045026, 0.77568295, 0.76356291, 0.76279338])}

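- 뒤쪽 2개 폴드는 train score와 test score가 비슷해서 비교적 잘 학습된 것으로 보인다.

- 앞쪽 폴드는 차이가 20 ~ 30%p나 나므로 문제가 있다 -> 과적합 현상처럼 보인다.
- 다만 데이터가 레드 와인 뒤에 화이트 와인을 이어 붙인 순서 그대로이고 StratifiedKFold가 shuffle 없이 분할하므로, 앞쪽 검증 폴드에 레드 와인이 몰려 학습 데이터와 분포가 달라진 영향일 가능성도 있다.
- 트레이닝 데이터를 나눠서 학습과 검증을 모두 하므로 과적합이 생길 수도 있다.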

# 하이퍼파라미터 튜닝!
- 내가 활용하는 모델에서 파라미터를 바꿔가면서 학습, 결과 관찰
- 나무모델에서는 튜닝해볼만한게 depth
- 그런데 하이퍼파라미터가 조금만 많아져도 일일이 하나씩 바꿔 가며 실험하면 코드가 너무 길어진다.
- 이때 사용하는 것이 GridSearchCV!

In [34]:
import pandas as pd

## Load the wine-quality data (semicolon-separated CSV files)
red_url = "https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-red.csv"
white_url = "https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-white.csv"

red_wine = pd.read_csv(red_url, sep=";")
white_wine = pd.read_csv(white_url, sep=";")

# Add a colour label column: 1 = red, 0 = white
red_wine['color'] = 1
white_wine['color'] = 0

# Combined wine data. ignore_index=True gives the result a fresh 0..n-1 index;
# without it each frame keeps its own row labels, so the same label would
# appear once for a red row and once for a white row.
wine = pd.concat([red_wine, white_wine], ignore_index=True)

# Binary target: 1 when quality is above 5, otherwise 0.
wine['taste'] = [1 if grade>5 else 0 for grade in wine['quality']]

# Features: every column except the target and the quality score it was derived from.
X = wine.drop(['taste', 'quality'], axis=1)
y = wine['taste']

In [35]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Candidate tree depths to evaluate.
params = {'max_depth' : [2,4,7,10]}

# The max_depth given here is only a starting value; each candidate in params is tried in turn.
wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)

gridsearch = GridSearchCV(estimator=wine_tree, param_grid=params, cv=5) # n_jobs can be set to use more CPU cores
gridsearch.fit(X,y)

# 결과좀 보여줘 : gridsearch.cv_results_

In [36]:
import pprint

# Pretty-print the raw grid-search results dictionary with a 4-space indent.
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(gridsearch.cv_results_)

{   'mean_fit_time': array([0.01099377, 0.01496248, 0.02314835, 0.03190489]),
    'mean_score_time': array([0.00239062, 0.00298753, 0.00298104, 0.00299778]),
    'mean_test_score': array([0.6888005 , 0.66356523, 0.65340854, 0.64401587]),
    'param_max_depth': masked_array(data=[2, 4, 7, 10],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object),
    'params': [   {'max_depth': 2},
                  {'max_depth': 4},
                  {'max_depth': 7},
                  {'max_depth': 10}],
    'rank_test_score': array([1, 2, 3, 4]),
    'split0_test_score': array([0.55230769, 0.51230769, 0.50846154, 0.51615385]),
    'split1_test_score': array([0.68846154, 0.63153846, 0.60307692, 0.60076923]),
    'split2_test_score': array([0.71439569, 0.72363356, 0.68360277, 0.66743649]),
    'split3_test_score': array([0.73210162, 0.73210162, 0.73672055, 0.71054657]),
    'split4_test_score': array([0.75673595, 0.7182448 , 0.73518091, 0.72517321]),
    'std

# 가장 최고의 결과를 보여줘 : best_estimator_

In [37]:
gridsearch.best_estimator_

# 베스트 스코어는 : best_score_

In [38]:
gridsearch.best_score_

0.6888004974240539

In [39]:
gridsearch.best_params_

{'max_depth': 2}

## 파이프라인에서 파라미터 튜닝

In [41]:
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler

# Pipeline steps, in order: standardise the features, then fit the tree.
# The step names ('scaler', 'clf') are what a parameter grid refers to,
# e.g. 'clf__max_depth'.
estimators = [
    ('scaler', StandardScaler()),
    # Fixed seed, matching the other trees in this notebook, so that
    # re-running the search gives the same scores.
    ('clf', DecisionTreeClassifier(random_state=13))
]

pipe = Pipeline(estimators)

In [42]:
# '<step name>__<parameter>' addresses a parameter of a pipeline step;
# here it is max_depth of the 'clf' step.
param_grid = [{'clf__max_depth' : [2,4,7,10]}]

GridSearch = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)
GridSearch.fit(X,y)

In [47]:
GridSearch.best_estimator_

In [44]:
GridSearch.best_score_

0.6888004974240539

In [45]:
GridSearch.best_params_

{'clf__max_depth': 2}

## 잡기술 ㅋ

In [48]:
import pandas as pd

# Tabulate the grid-search results and show only the columns needed to compare candidates.
summary_cols = ['params', 'rank_test_score', 'mean_test_score', 'std_test_score']
score_df = pd.DataFrame(GridSearch.cv_results_)
score_df[summary_cols]

Unnamed: 0,params,rank_test_score,mean_test_score,std_test_score
0,{'clf__max_depth': 2},1,0.6888,0.071799
1,{'clf__max_depth': 4},2,0.663565,0.083905
2,{'clf__max_depth': 7},3,0.6531,0.08555
3,{'clf__max_depth': 10},4,0.64771,0.077542


스코어는 높고 표준편차는 낮아야 좋은것!