# 과적합, 편향-분산 트레이드오프, 교차 검증

In [1]:
#필요한 라이브러리 임포트

import numpy as np
import pandas as pd

# 교차검증

## 1. 사이킷런의 model_selection의 KFold()를 사용하는 경우(For loop 사용)

### 폴드를 분리할 객체 생성

In [2]:
from sklearn.model_selection import KFold

# Splitter that partitions the rows into 5 folds (KFold does not shuffle
# unless asked to).
kfold = KFold(n_splits=5)

### 데이터를 준비하고 회귀 모형 객체를 생성

In [3]:
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression

# Load the diabetes dataset and separate the feature matrix (X) from the
# regression target (y).
diab = load_diabetes()
X, y = diab.data, diab.target

# Linear regression model that will be evaluated with cross-validation.
lr = LinearRegression()

In [4]:
# Display the feature matrix to inspect its values.
X

array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
         0.01990842, -0.01764613],
       [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
        -0.06832974, -0.09220405],
       [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
         0.00286377, -0.02593034],
       ...,
       [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
        -0.04687948,  0.01549073],
       [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
         0.04452837, -0.02593034],
       [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
        -0.00421986,  0.00306441]])

### split() 함수를 호출하여 폴드별로 분리될 행 인덱스 세트를 구함

In [5]:
from sklearn.metrics import r2_score

# One R^2 score is collected per fold.
r2_scores = []

# kfold.split(X) yields a (train indices, test indices) pair for every fold,
# so the train/validation partition is produced automatically.
for train_idx, test_idx in kfold.split(X):
    # Use the index arrays to pull out this fold's training and test rows.
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    # Fit on the training rows only; the object returned by fit() is then
    # used to predict the held-out rows.
    reg = lr.fit(X_train, y_train)
    y_pred = reg.predict(X_test)

    # Score the held-out predictions and record the result for this fold.
    fold_r2 = r2_score(y_test, y_pred)
    r2_scores.append(fold_r2)

In [6]:
# Display the per-fold R^2 scores collected by the loop above.
r2_scores

[0.4295564286585777,
 0.5225982811135659,
 0.4826783998252703,
 0.4265082749941945,
 0.5502492259658609]

## 2. 사이킷런의 cross_val_score 함수를 사용하여 K폴드 교차 검증 수행 without shuffling


In [7]:
import numpy as np

# Report each fold's R^2 to three decimals, followed by the mean over folds.
for i, r2 in enumerate(r2_scores):
    print(i, f": R2 - {r2:.3f}")

mean_r2 = np.mean(r2_scores)
print("average R2: ", np.round(mean_r2, 3))

0 : R2 - 0.430
1 : R2 - 0.523
2 : R2 - 0.483
3 : R2 - 0.427
4 : R2 - 0.550
average R2:  0.482


#### 위의 과정을 이미 구현해 놓은 함수(cross_val_score)가 존재한다.

In [8]:
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

# Reload the data and rebuild the model inside this cell
# (np is expected to be imported by an earlier cell).
diab = load_diabetes()
X, y = diab.data, diab.target

lr = LinearRegression()

# cv=5 requests 5-fold cross-validation; one score is returned per fold.
r2_scores = cross_val_score(lr, X, y, cv=5)

# Print the per-fold scores and their mean, rounded to three decimals.
print("R2:", np.round(r2_scores, 3))
mean_r2 = np.mean(r2_scores)
print("average R2: ", np.round(mean_r2, 3))

R2: [0.43  0.523 0.483 0.427 0.55 ]
average R2:  0.482


## 3. 사이킷런의 cross_val_score 함수를 사용하여 K폴드 교차 검증 수행 with shuffling

In [9]:
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Reload the data and rebuild the model inside this cell
# (np is expected to be imported by an earlier cell).
diab = load_diabetes()
X, y = diab.data, diab.target

lr = LinearRegression()

# 3-fold splitter that shuffles the rows before splitting; random_state is
# fixed so the shuffle is reproducible.
kfold = KFold(n_splits=3, shuffle=True, random_state=0)

# Passing the splitter as cv makes cross_val_score use these shuffled folds.
r2_scores = cross_val_score(lr, X, y, cv=kfold)

# Print the per-fold scores and their mean, rounded to three decimals.
print("R2: ", np.round(r2_scores, 3))
mean_r2 = np.mean(r2_scores)
print("average R2: ", np.round(mean_r2, 3))

R2:  [0.404 0.521 0.544]
average R2:  0.49
