In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold

In [28]:
x_data = np.array([[2,1],[2,3],[4,1],[5,6],[7,7],[8,9],[9,10],[6,7],[6,10],[2,8],[7,5]])

kf = KFold(n_splits=5)

In [29]:
for train_index, test_index in kf.split(x_data):
    print(train_index)
    print(test_index)

[ 3  4  5  6  7  8  9 10]
[0 1 2]
[ 0  1  2  5  6  7  8  9 10]
[3 4]
[ 0  1  2  3  4  7  8  9 10]
[5 6]
[ 0  1  2  3  4  5  6  9 10]
[7 8]
[0 1 2 3 4 5 6 7 8]
[ 9 10]


### K-Fold 교차검증 -> 회귀 문제에서 사용
- 학습 data와 테스트 data를 n_splits개의 세트로 나누어 검증하는 방법
- data set이 적을 때 train data를 늘리기 위해 사용
- 여러 개의 train, test data 짝으로 검증과정을 거침

In [30]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [31]:
x_data = np.array([[2,1],[2,3],[4,1],[5,6],[7,7],[8,9],[9,10],[6,7],[6,10],[2,8],[7,5]])
y_data = np.array([3,5,9,7,10,2,15,6,4,7,8])

In [32]:
model = LinearRegression()

train_scores = []
test_scores = []

kf = KFold(n_splits=5)
for train_index, test_index in kf.split(x_data):
    x_train = np.array(x_data)[train_index]
    y_train = np.array(y_data)[train_index]
    x_test = np.array(x_data)[test_index]
    y_test = np.array(y_data)[test_index]

    model_kf = LinearRegression()
    model_kf.fit(x_train, y_train)
    score = model_kf.score(x_train, y_train)    # R2
    train_scores.append(score)
    score = model_kf.score(x_test, y_test)
    test_scores.append(score)

train_scores, test_scores

([0.09087045589617082,
  0.14634108316388406,
  0.2596984905781652,
  0.22523882013459684,
  0.35657970453058097],
 [0.33939878660391054,
  -0.28777247567478437,
  0.06722521416975036,
  -9.902863890644971,
  -157.7869721150263])

In [33]:
np.array(train_scores).mean()

0.21574571086067956

In [34]:
np.array(test_scores).mean()

-33.51419689611448

R2값
- -, + 부호는 그래프의 기울기
- 절대값은 data를 잘 설명하는 정도(-1~1 사이의 값을 가짐) : 너무 커지거나 작아지면 data가 쓰레기라는 뜻

In [35]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score

In [36]:
model = LinearRegression()
cv_results = cross_validate(model, x_data, y_data)
cv_results

{'fit_time': array([0.00099683, 0.        , 0.        , 0.        , 0.        ]),
 'score_time': array([0.        , 0.00099707, 0.        , 0.00099659, 0.        ]),
 'test_score': array([ 3.39398787e-01, -2.87772476e-01,  6.72252142e-02, -9.90286389e+00,
        -1.57786972e+02])}

In [37]:
cv_results['test_score'].mean()

-33.51419689611448

In [38]:
df = pd.DataFrame(cv_results)
df = df.sort_values(by='test_score', ascending=False)   # 내림차순
df

Unnamed: 0,fit_time,score_time,test_score
0,0.000997,0.0,0.339399
2,0.0,0.0,0.067225
1,0.0,0.000997,-0.287772
3,0.0,0.000997,-9.902864
4,0.0,0.0,-157.786972


In [39]:
model = LinearRegression()
model.fit(x_data, y_data)

LinearRegression()

In [40]:
cvs = cross_val_score(model, x_data, y_data, cv=5)
cvs

array([ 3.39398787e-01, -2.87772476e-01,  6.72252142e-02, -9.90286389e+00,
       -1.57786972e+02])

In [41]:
cvs.mean()

-33.51419689611448

분류
 - 분류문제에서는 범주형 값의 편향된 분산을 막기 위해  
 - 범주형의 비율에 따라 나눠준다.

 - 이것을 '계층적 K-Fold'라고 한다

In [42]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [43]:
x_data = np.array([
    [2,1],
    [3,2],
    [3,4],
    [5,5],
    [7,5],
    [2,5],
    [8,9],
    [9,10],
    [6,12],
    [9,2],
    [6,10],
    [2,4],    
])
y_data = np.array([2,2,2,1,1,2,0,0,0,1,0,2])

labels = ['A','B','C']

In [44]:
model = LogisticRegression()
cv_results = cross_validate(model, x_data, y_data, return_estimator=True)
cv_results



{'fit_time': array([0.00398684, 0.00299025, 0.0039866 , 0.00298977, 0.00299883]),
 'score_time': array([0.00099659, 0.        , 0.        , 0.        , 0.        ]),
 'estimator': [LogisticRegression(),
  LogisticRegression(),
  LogisticRegression(),
  LogisticRegression(),
  LogisticRegression()],
 'test_score': array([0.66666667, 1.        , 1.        , 1.        , 1.        ])}

In [45]:
cv_results['test_score'].mean()

0.9333333333333332

In [46]:
df = pd.DataFrame(cv_results)
df = df.sort_values(by='test_score', ascending=False)
df

Unnamed: 0,fit_time,score_time,estimator,test_score
1,0.00299,0.0,LogisticRegression(),1.0
2,0.003987,0.0,LogisticRegression(),1.0
3,0.00299,0.0,LogisticRegression(),1.0
4,0.002999,0.0,LogisticRegression(),1.0
0,0.003987,0.000997,LogisticRegression(),0.666667


### 계층적 k-겹 교차검증(stratified k-fold cross validation)
- 분류 모델에 적용
- k-겹 교차검증 모델은 k-fold가 원본 data 집합의 레이블 분포를 학습 및 검증 data set에 제대로 분배하지 못하는 문제를 해결
- target 값(= 레이블/클래스)의 속성값의 개수를 골고루 넣어주게 됨

In [47]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

In [48]:
from sklearn.datasets import load_iris

In [49]:
iris = load_iris()

In [50]:
x = iris.data
y = iris.target

skf = StratifiedKFold(n_splits=5, random_state=30, shuffle=True)

In [51]:
idx_iter = 0
cv_acc = []

# split으로 반환된 인덱스를 이용해서 학습데이터, 테스트데이터를 정의해줌
for train_index, test_index in skf.split(x,y):
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model.fit(x_train, y_train)
    pred = model.predict(x_test)

    idx_iter += 1
    acc = np.round(accuracy_score(y_test, pred),4)
    train_size = x_train.shape[0]
    test_size = x_test.shape[0]

    print('{0}번째 교차 검증 정확도 : {1} \n 학습 데이터 크기 : {2} \n 검증 데이터 크기 : {3}'.format(idx_iter,acc,train_size,test_size))
    cv_acc.append(acc)

1번째 교차 검증 정확도 : 0.9333 
 학습 데이터 크기 : 120 
 검증 데이터 크기 : 30
2번째 교차 검증 정확도 : 0.9667 
 학습 데이터 크기 : 120 
 검증 데이터 크기 : 30
3번째 교차 검증 정확도 : 1.0 
 학습 데이터 크기 : 120 
 검증 데이터 크기 : 30
4번째 교차 검증 정확도 : 0.9333 
 학습 데이터 크기 : 120 
 검증 데이터 크기 : 30
5번째 교차 검증 정확도 : 1.0 
 학습 데이터 크기 : 120 
 검증 데이터 크기 : 30


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt