# Cross-Validation and the Bootstrap

## The Validation Set Approach

In [2]:
# Silence warnings for the rest of the session: a negative `warn` makes R
# ignore every warning.
# NOTE(review): this is global, so it also hides any warning raised by the
# model fits further down — consider dropping it once the notebook is stable.
options(warn = -1)

In [18]:
library(ISLR)
library(boot)

In [13]:
# Validation-set split: draw half of the row indices of Auto, without
# replacement, to serve as the training set. The sizes are taken from the
# data instead of being hard-coded as 392 / 196, so the split stays valid if
# the data set changes; for a 392-row Auto the same indices are drawn.
# Requires Auto to be loaded (library(ISLR)) before this cell runs.
set.seed(1)
train = sample(nrow(Auto), nrow(Auto) %/% 2)

In [14]:
# Linear fit of mpg on horsepower using only the training rows, followed by
# the mean squared prediction error over the rows left out of `train`.
lm.fit = lm(formula = mpg ~ horsepower, data = Auto, subset = train)
mean((Auto[["mpg"]] - predict(lm.fit, newdata = Auto))[-train]^2)

In [15]:
# Same validation-set estimate for a degree-2 polynomial in horsepower:
# fit on the training rows, score on the held-out rows.
lm.fit2 = lm(formula = mpg ~ poly(horsepower, 2), data = Auto, subset = train)
mean((Auto[["mpg"]] - predict(lm.fit2, newdata = Auto))[-train]^2)

In [16]:
# Same validation-set estimate for a degree-3 polynomial in horsepower:
# fit on the training rows, score on the held-out rows.
lm.fit3 = lm(formula = mpg ~ poly(horsepower, 3), data = Auto, subset = train)
mean((Auto[["mpg"]] - predict(lm.fit3, newdata = Auto))[-train]^2)

## Leave-One-Out Cross-Validation

In [17]:
# Fit mpg ~ horsepower with glm() on all of Auto. No `family` is given, so
# glm() uses its default (gaussian); the fit is kept as a glm object because
# cv.glm() below expects one. Show the estimated coefficients.
glm.fit = glm(formula = mpg ~ horsepower, data = Auto)
coefficients(glm.fit)

In [19]:
# Cross-validate the linear glm fit. K is not supplied, so cv.glm() uses its
# default number of folds (leave-one-out). Show the `delta` component, which
# holds the cross-validation estimates of prediction error.
cv.err = cv.glm(data = Auto, glmfit = glm.fit)
cv.err[["delta"]]

In [21]:
# Leave-one-out CV error for polynomial fits of degree 1 through 5.
# Slot i of cv.error receives the first element of cv.glm()'s `delta` for the
# degree-i fit. The loop is kept (rather than a functional map) so that
# `glm.fit` and `i` are left in the workspace exactly as before.
cv.error = numeric(5)
for (i in seq_along(cv.error)) {
  glm.fit = glm(formula = mpg ~ poly(horsepower, i), data = Auto)
  cv.error[i] = cv.glm(data = Auto, glmfit = glm.fit)[["delta"]][1]
}
cv.error

## k-Fold Cross-Validation

In [24]:
# 10-fold CV error for polynomial fits of degree 1 through 10.
# The seed is fixed first because the fold assignment is random. Slot i of
# cv.error.10 receives the first element of cv.glm()'s `delta` for the
# degree-i fit.
set.seed(1)
cv.error.10 = numeric(10)
for (i in seq_along(cv.error.10)) {
  glm.fit = glm(formula = mpg ~ poly(horsepower, i), data = Auto)
  cv.error.10[i] =
    cv.glm(data = Auto, glmfit = glm.fit, K = 10)[["delta"]][1]
}
cv.error.10

# The Bootstrap

## Estimating the Accuracy of a Statistic of Interest

In [25]:
# Compute alpha = (var(Y) - cov(X, Y)) / (var(X) + var(Y) - 2 * cov(X, Y))
# from the observations of `data` selected by `index`.
#
# data:  object with components `X` and `Y` (e.g. a data frame).
# index: positions of the observations to use; repeated positions are allowed,
#        which is what lets boot() call this function as its statistic.
# Returns a single numeric value.
alpha.fn = function(data, index) {
  x = data$X[index]
  y = data$Y[index]
  var.x = var(x)
  var.y = var(y)
  cov.xy = cov(x, y)
  (var.y - cov.xy) / (var.x + var.y - 2 * cov.xy)
}

In [26]:
# Alpha estimated from observations 1 through 100 of Portfolio, each used once.
alpha.fn(data = Portfolio, index = seq_len(100))

In [27]:
# One bootstrap replicate by hand: draw 100 indices from 1..100 with
# replacement and recompute alpha on that resample.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, which would silently turn this into sampling without replacement.
set.seed(1)
alpha.fn(Portfolio, sample(100, 100, replace = TRUE))

In [28]:
# Bootstrap alpha.fn over Portfolio with 1000 resamples; the printed result
# reports the original estimate, the bootstrap bias and the standard error.
boot(data = Portfolio, statistic = alpha.fn, R = 1000)


ORDINARY NONPARAMETRIC BOOTSTRAP


Call:
boot(data = Portfolio, statistic = alpha.fn, R = 1000)


Bootstrap Statistics :
     original       bias    std. error
t1* 0.5758321 -0.001695873  0.09366347

## Estimating the Accuracy of a Linear Regression Model

In [29]:
# Bootstrap statistic: intercept and slope of the least-squares fit of mpg on
# horsepower, computed on the rows of `data` selected by `index`.
#
# data:  data frame with columns `mpg` and `horsepower`.
# index: row positions to fit on; repeated positions are allowed, which is
#        what lets boot() call this function as its statistic.
# Returns the named coefficient vector from coef().
boot.fn = function(data, index) {
  # Select the rows explicitly instead of passing `subset = index` to lm():
  # lm() evaluates `subset` inside `data` first, so a column that happened to
  # be called `index` would silently override the function argument.
  resampled = data[index, , drop = FALSE]
  coef(lm(mpg ~ horsepower, data = resampled))
}

In [30]:
# Coefficients from rows 1 through 392 of Auto, each used exactly once.
boot.fn(data = Auto, index = seq_len(392))

In [33]:
# One bootstrap replicate by hand: draw 392 row indices from 1..392 with
# replacement and refit the regression on that resample.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, which would silently turn this into sampling without replacement.
set.seed(1)
boot.fn(Auto, sample(392, 392, replace = TRUE))

In [34]:
# Bootstrap the regression coefficients with 1000 resamples of Auto; the
# printed standard errors can be compared with the ones summary(lm()) reports.
boot(data = Auto, statistic = boot.fn, R = 1000)


ORDINARY NONPARAMETRIC BOOTSTRAP


Call:
boot(data = Auto, statistic = boot.fn, R = 1000)


Bootstrap Statistics :
      original        bias    std. error
t1* 39.9358610  0.0549915227 0.841925746
t2* -0.1578447 -0.0006210818 0.007348956

In [35]:
# Coefficient table (estimate, standard error, t value, p-value) of the linear
# fit on all of Auto. The component name is written in full: `$coef` only
# worked through partial matching of `coefficients`.
summary(lm(mpg ~ horsepower, data = Auto))$coefficients

,Estimate,Std. Error,t value,Pr(>|t|)
(Intercept),39.935861,0.717498656,55.65984,1.220362e-187
horsepower,-0.1578447,0.006445501,-24.48914,7.031989e-81


In [36]:
# Bootstrap statistic for the quadratic model: coefficients of the
# least-squares fit of mpg on horsepower and horsepower^2, computed on the
# rows of `data` selected by `index`.
#
# data:  data frame with columns `mpg` and `horsepower`.
# index: row positions to fit on; repeated positions are allowed, which is
#        what lets boot() call this function as its statistic.
# Returns the named coefficient vector from coef().
boot.fn = function(data, index) {
  # Select the rows explicitly instead of passing `subset = index` to lm():
  # lm() evaluates `subset` inside `data` first, so a column that happened to
  # be called `index` would silently override the function argument.
  resampled = data[index, , drop = FALSE]
  coef(lm(mpg ~ horsepower + I(horsepower ^ 2), data = resampled))
}

In [37]:
# Bootstrap the quadratic-model coefficients: fix the seed so the 1000
# resamples of Auto are reproducible, then run boot() with boot.fn.
set.seed(1)
boot(data = Auto, statistic = boot.fn, R = 1000)


ORDINARY NONPARAMETRIC BOOTSTRAP


Call:
boot(data = Auto, statistic = boot.fn, R = 1000)


Bootstrap Statistics :
        original        bias     std. error
t1* 56.900099702  3.511640e-02 2.0300222526
t2* -0.466189630 -7.080834e-04 0.0324241984
t3*  0.001230536  2.840324e-06 0.0001172164

In [39]:
# Coefficient table (estimate, standard error, t value, p-value) of the
# quadratic fit on all of Auto. The component name is written in full:
# `$coef` only worked through partial matching of `coefficients`.
summary(lm(mpg ~ horsepower + I(horsepower ^ 2), data = Auto))$coefficients

,Estimate,Std. Error,t value,Pr(>|t|)
(Intercept),56.900099702,1.8004268063,31.60367,1.740911e-109
horsepower,-0.46618963,0.0311246171,-14.97816,2.289429e-40
I(horsepower^2),0.001230536,0.0001220759,10.08009,2.19634e-21
