# 과적합, 분산 편향 트레이드오프, 교차 검증

In [1]:
#필요한 라이브러리 임포트

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 교차검증

## 1. 사이킷런의 model_selection의 KFold()를 사용하는 경우(For loop 사용)

#### 폴드를 분리할 객체 생성

In [8]:
from sklearn.model_selection import KFold

In [14]:
# 데이터셋 불러오기
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression

diab = load_diabetes()
X = diab.data
y = diab.target

In [9]:
[:5], y[:5]

(array([[ 0.03807591,  0.05068012,  0.06169621,  0.02187239, -0.0442235 ,
         -0.03482076, -0.04340085, -0.00259226,  0.01990749, -0.01764613],
        [-0.00188202, -0.04464164, -0.05147406, -0.02632753, -0.00844872,
         -0.01916334,  0.07441156, -0.03949338, -0.06833155, -0.09220405],
        [ 0.08529891,  0.05068012,  0.04445121, -0.00567042, -0.04559945,
         -0.03419447, -0.03235593, -0.00259226,  0.00286131, -0.02593034],
        [-0.08906294, -0.04464164, -0.01159501, -0.03665608,  0.01219057,
          0.02499059, -0.03603757,  0.03430886,  0.02268774, -0.00936191],
        [ 0.00538306, -0.04464164, -0.03638469,  0.02187239,  0.00393485,
          0.01559614,  0.00814208, -0.00259226, -0.03198764, -0.04664087]]),
 array([151.,  75., 141., 206., 135.]))

In [15]:
len(X)

442

#### 데이터를 준비하고 회귀 모형 객체를 생성

#### split()함수를 호출하여 폴드별로 분리될 행 인덱스 세트를 구함

In [16]:
kfold = KFold(5)

kfold.split(X)

<generator object _BaseKFold.split at 0x14e4927a0>

In [17]:
# 학습데이터와 데스테데이터를 5번으로 나눔
list(kfold.split(X))

[(array([ 89,  90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101,
         102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114,
         115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
         128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140,
         141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153,
         154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166,
         167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
         180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192,
         193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205,
         206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218,
         219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231,
         232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244,
         245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257,
         258, 259, 260, 261, 262, 263,

샘플들의 인덱스번호를 통해서 학습데이터와 테스트데이터를 불러서 나눔

    - 1번 fold: 학습데이터(89~441), 테스트데이터(0~88) 
    - 2번 fold: 학습데이터(0~88, 178~441), 테스트데이터(89~177)
    - 3번 fold: 학습데이터(0~177, 266~441), 테스트데이터(178~265)
    - 4번 fold: 학습데이터(0~265, 354~441), 테스트데이터(266~353)
    - 5번 fold: 학습데이터(0~353), 테스트데이터(354~441)

In [18]:
from sklearn.metrics import r2_score

r2_scores = []
lr = LinearRegression()

for train_idx, test_idx in kfold.split(X):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    reg = lr.fit(X_train, y_train)
    y_pred = reg.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)

In [19]:
r2_scores

[0.4295561538258379,
 0.5225993866099365,
 0.4826805413452825,
 0.42649776111040205,
 0.5502483366517519]

In [21]:
# r2_scores의 평균값을 예측데이터의 점수?

for i, r2 in enumerate(r2_scores):
    print(i+1, f'- R2 ={r2:.3f}')

print(f'average R2 = {np.round(np.mean(r2_scores),3)}')

1 - R2 =0.430
2 - R2 =0.523
3 - R2 =0.483
4 - R2 =0.426
5 - R2 =0.550
average R2 = 0.482


## 2. 사이킷런의 cross_val_score 함수를 사용하여 K폴드 교차 검증 수행 without shuffling:
- for loop 필요 없음

In [30]:
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

diab = load_diabetes()
X = diab.data
y = diab.target

kfold = KFold(5, shuffle=True)
lr = LinearRegression()

print(cross_val_score(lr,X,y, cv=5))
print(np.round(np.mean(cross_val_score(lr,X,y, cv=5)),3))

[0.42955615 0.52259939 0.48268054 0.42649776 0.55024834]
0.482


### iris 데이터셋

1. y값인 'kind'는 데이터가 꽃의 종류로 0,1,2값(분류)으로 50개씩 가지고 있어 편중되어있음
2. 그러므로 Kfold를 하기전에 shuffle로 데이터셋의 데이터를 섞어줘야한다.
3. stratify는 분류의 데이터값만 할수 있고, 분포를 가지게 할수 있음.

In [27]:
from sklearn.datasets import load_iris

iris = load_iris()
X = iris.data
y = iris.target

iris_df = pd.DataFrame(X, columns=iris.feature_names)
iris_df['kind'] = iris.target
iris_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),kind
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


## 3. 사이킷런의 cross_val_score 함수를 사용하여 K폴드 교차 검증 수행 with shuffling

In [29]:
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

diab = load_diabetes()
X = diab.data
y = diab.target

kfold = KFold(5, shuffle=True, random_state=29)
lr = LinearRegression()

print(cross_val_score(lr,X,y, cv=5))
print(np.round(np.mean(cross_val_score(lr,X,y, cv=kfold)),3))

[0.42955615 0.52259939 0.48268054 0.42649776 0.55024834]
0.489
