## Machine Learning

### Resampling - Cross Validation

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, LeaveOneOut, KFold
from sklearn.metrics import mean_squared_error, roc_auc_score

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import PolynomialFeatures

In [2]:
bd = pd.read_csv("http://www-bcf.usc.edu/~gareth/ISL/Auto.csv")
bd.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [3]:
bd.describe()

Unnamed: 0,mpg,cylinders,displacement,weight,acceleration,year,origin
count,397.0,397.0,397.0,397.0,397.0,397.0,397.0
mean,23.515869,5.458438,193.532746,2970.261965,15.555668,75.994962,1.574307
std,7.825804,1.701577,104.379583,847.904119,2.749995,3.690005,0.802549
min,9.0,3.0,68.0,1613.0,8.0,70.0,1.0
25%,17.5,4.0,104.0,2223.0,13.8,73.0,1.0
50%,23.0,4.0,146.0,2800.0,15.5,76.0,1.0
75%,29.0,8.0,262.0,3609.0,17.1,79.0,2.0
max,46.6,8.0,455.0,5140.0,24.8,82.0,3.0


In [15]:
bd.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower      float64
weight            int64
acceleration    float64
year              int64
origin            int64
name             object
dtype: object

In [14]:
bd.horsepower = pd.to_numeric(bd.horsepower,errors='coerce')

In [17]:
bd.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      5
weight          0
acceleration    0
year            0
origin          0
name            0
dtype: int64

In [19]:
bd = bd.dropna()

In [20]:
bd.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
year            0
origin          0
name            0
dtype: int64

In [21]:
y = bd.mpg
X = bd.horsepower
y = np.array(y)
X = np.array(X)
X = X.reshape(-1,1)

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size =.3, random_state=5)

In [28]:
reg = LinearRegression()
reg.fit(X_train, y_train)
yhat = reg.predict(X_test)
medida = mean_squared_error(y_test, yhat)
print('MSE for Simples Linear Regression: ', medida)

MSE for Simples Linear Regression:  34.61244348009657


## Estimando o MSE da Regressão com o LOOCV

In [33]:
loo = LeaveOneOut()

mse_list = []

for train, test in loo.split(X):
    X_train, X_test = X[train], X[test]
    y_train, y_test = y[train], y[test]
    
    reg.fit(X_train, y_train)
    yhat = reg.predict(X_test)
    medida = mean_squared_error(y_test, yhat)
    mse_list.append(medida)

print(mse_list)
mse_array = np.array(mse_list)

[2.020010024321651, 1.2509241240422704, 3.068051640163971, 0.06799019837204447, 0.7082556294244434, 41.35667450716614, 81.37553579730985, 67.14947672916045, 97.04988465707532, 26.34303682304336, 3.6742869739717334, 0.4707416905678176, 1.6050778867388202, 97.04988465707532, 0.889557151130716, 8.69418109789263, 44.12289653365883, 30.656222509959434, 0.9165497318131891, 45.31853784962301, 1.457052807537722, 3.0098355440288405, 0.0035461760456729618, 15.29640877266284, 22.502220796825593, 16.79054457227355, 2.7673535145602672, 18.535464818187215, 0.22995747384892581, 0.9165497318131891, 5.183799994543558, 0.0035461760456729618, 26.674551027218364, 54.47911198315081, 51.40783236884584, 49.94052560583025, 38.03600056861017, 0.011988458474189093, 2.9103300297208614, 3.2310436344767135, 5.166911181280772, 0.23248733238398803, 0.010667890815345261, 0.48261526388763243, 21.021111412644863, 43.558519729146035, 26.674551027218364, 65.12311616023943, 11.369041835374313, 5.183799994543558, 1.2508572

In [35]:
print('MSE with LOOCV:', mse_array.mean())

MSE with LOOCV: 24.231513517929226


## Fazendo com o K-Fold

In [37]:
kf = KFold(10)

mse_list = []

for train, test in kf.split(X):
    X_train, X_test = X[train], X[test]
    y_train, y_test = y[train], y[test]
    
    reg.fit(X_train, y_train)
    yhat = reg.predict(X_test)
    medida = mean_squared_error(y_test, yhat)
    mse_list.append(medida)

print(mse_list)
mse_array = np.array(mse_list)

[28.34783584097226, 17.226408542020682, 26.92535793423824, 23.36016121696417, 15.5576330367143, 17.893834560501954, 17.044768671534392, 22.836578723524884, 65.93489566837346, 39.271862328554384]


In [38]:
print('MSE with KFold:', mse_array.mean())

MSE with KFold: 27.439933652339867
