### k-fold

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

In [3]:
x_data=np.array([
    [2,1],
    [3,2],
    [3,4],
    [5,5],
    [7,5],
    [2,5],
    [8,9],
    [9,10],
    [6,12],
    [9,2],
    [6,10],
    [2,4]  
])

In [4]:
kf = KFold(n_splits=5)

In [5]:
for train_index, test_index in kf.split(x_data):
    print('train_index: ', train_index)
    print('test_index: ', test_index)    

train_index:  [ 3  4  5  6  7  8  9 10 11]
test_index:  [0 1 2]
train_index:  [ 0  1  2  6  7  8  9 10 11]
test_index:  [3 4 5]
train_index:  [ 0  1  2  3  4  5  8  9 10 11]
test_index:  [6 7]
train_index:  [ 0  1  2  3  4  5  6  7 10 11]
test_index:  [8 9]
train_index:  [0 1 2 3 4 5 6 7 8 9]
test_index:  [10 11]


#### K-Fold 교차검증 -> 보통 회귀 문제에서 사용됨
- 학습 데이터와 테스트 데이터를 k개의 세트로 나누어 검증하는 방법
- 데이터셋이 굉장히 적을 때 훈련데이터를 어떻게든 최대한 늘려보려고 사용되기도 함(과소적합방지)
- 여러 개의 훈련 테스트 짝으로 검증과정을 거침

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

#### 2.데이터 수집

In [8]:
x_data=np.array([
    [2,1],
    [3,2],
    [3,4],
    [5,5],
    [7,5],
    [2,5],
    [8,9],
    [9,10],
    [6,12],
    [9,2],
    [6,10],
    [2,4]  
])

y_data = np.array([3,5,7,10,12,7,13,13,12,13,12,6])

### 3.데이터 전처리, 4.EDA 

### 5~7 모델링

In [9]:
lr = LinearRegression()

In [11]:
train_scores = []
test_scores = []

kf = KFold(n_splits=5)
for train_index, test_index in kf.split(x_data): #5번 검증을 하겠다.
    x_train = np.array(x_data)[train_index]
    y_train = np.array(y_data)[train_index]
    x_test = np.array(x_data)[test_index]
    y_test = np.array(y_data)[test_index]
    
    model_kf = LinearRegression()
    model_kf.fit(x_train, y_train)
    score = model_kf.score(x_train,y_train) #r2
    train_scores.append(score)
    score = model_kf.score(x_test, y_test) #r2
    test_scores.append(score)

In [12]:
display(train_scores, test_scores)

[0.9522707858769932,
 0.9469593697441799,
 0.9446524178499608,
 0.9232432525564045,
 0.9166499001004778]

[-1.1475590101753324,
 0.56847222331606,
 0.0,
 -11.7747639790487,
 0.9602035173350366]

In [13]:
print(np.array(train_scores).mean())

0.9367551452256032


In [14]:
print(np.array(test_scores).mean())

-2.278729449714587


#### cross-validation

In [16]:
from sklearn.model_selection import cross_validate

In [17]:
model = LinearRegression()

In [18]:
cv_results = cross_validate(model, x_data, y_data)

In [19]:
print(cv_results['test_score'].mean())

-2.278729449714587


In [20]:
df = pd.DataFrame(cv_results)
df = df.sort_values(by= 'test_score', ascending=False)

In [21]:
df

Unnamed: 0,fit_time,score_time,test_score
4,0.001,0.0,0.960204
1,0.0,0.0,0.568472
2,0.001,0.0,0.0
0,0.001,0.0,-1.147559
3,0.0,0.0,-11.774764


#### cross_val_score

In [22]:
from sklearn.model_selection import cross_val_score

In [24]:
model = LinearRegression()
model.fit(x_data, y_data)

LinearRegression()

In [25]:
cv_score = cross_val_score(model, x_data, y_data, cv= 5)

In [27]:
print('cv_mean_score: ',cv_score.mean())

cv_mean_score:  -2.278729449714587


## 분류

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

In [30]:
x_data = np.array([
    [2,1],
    [3,2],
    [3,4],
    [5,5],
    [7,5],
    [2,5],
    [8,9],
    [9,10],
    [6,12],
    [9,2],
    [6,10],
    [2,4],    
])
y_data = np.array([2,2,2,1,1,2,0,0,0,1,0,2])

labels = ['A','B','C']

In [31]:
model = LogisticRegression()

In [32]:
cv_results = cross_validate(model, x_data, y_data, return_estimator=True)



In [33]:
print(cv_results['test_score'].mean())

0.9333333333333332


In [34]:
df= pd.DataFrame(cv_results)
df = df.sort_values(by= 'test_score', ascending=False)
df

Unnamed: 0,fit_time,score_time,estimator,test_score
1,0.003,0.0,LogisticRegression(),1.0
2,0.003532,0.0,LogisticRegression(),1.0
3,0.003014,0.0,LogisticRegression(),1.0
4,0.003001,0.001,LogisticRegression(),1.0
0,0.014085,0.0,LogisticRegression(),0.666667


### 계층적 k-겹 교차검증(Stratified k-Fold cross Validation)
- 분류 모델에 적용
- k-fold 교차검증 모델은 k-fold가 원본 데이터 집합의레이블 분포를 학습 및 검증데이터 세트에 제대로 분배하지 못하는 문제를 해결해줌
- traget값(정답값) = 레이블/클래스의 속성값의 개수를 골고루 넣어주게 됨

In [35]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

In [36]:
from sklearn.datasets import load_iris

In [37]:
iris = load_iris()

In [38]:
x = iris.data
y = iris.target

In [39]:
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

In [40]:
idx_iter = 0
cv_accuracy = []

for train_index, test_index in skf.split(x,y):
    # split으로 반환된 인덱스를 이용, train,test split
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    #학습
    model.fit(x_train, y_train)
    pred = model.predict(x_test)
    
    idx_iter +=1
    accuracy = np.round(accuracy_score(y_test, pred), 4)
    train_size = x_train.shape[0]
    test_size = x_test.shape[0]
    
    print('{0}번째 교차 검증 정확도: {1}\n 학습데이터크기: {2}\n 검증 데이터 크기 : {3}'.format(idx_iter, accuracy,train_size, test_size))
    
    cv_accuracy.append(accuracy)
    

1번째 교차 검증 정확도: 1.0
 학습데이터크기: 120
 검증 데이터 크기 : 30
2번째 교차 검증 정확도: 0.9667
 학습데이터크기: 120
 검증 데이터 크기 : 30
3번째 교차 검증 정확도: 0.9333
 학습데이터크기: 120
 검증 데이터 크기 : 30
4번째 교차 검증 정확도: 1.0
 학습데이터크기: 120
 검증 데이터 크기 : 30
5번째 교차 검증 정확도: 0.9333
 학습데이터크기: 120
 검증 데이터 크기 : 30


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt