# Machine Learning Foundation

## Section 2, Part C: Cross Validation

In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt

In [2]:
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline

In [3]:
import requests

def download(url, filename):
    response = requests.get(url)
    
    print(f"content {response}")
    if response.status_code == 200:
        with open(filename,'wb') as f:
            f.write(response.content)

## Data loading

In [4]:
url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-ML240EN-SkillsNetwork/labs/data/boston_housing_clean.pickle"

download(url, 'boston_housing_clean.pickle')

content <Response [200]>


In [6]:
boston = pd.read_pickle('./boston_housing_clean.pickle')
boston.keys()

dict_keys(['dataframe', 'description'])

In [7]:
boston_data = boston['dataframe']
boston_data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


### Discussion: 

Suppose we want to do Linear Regression on our dataset to get an estimate, based on mean squared error, of how well our model will perform on data outside our dataset. 

Suppose also that our data is split into three folds: Fold 1, Fold 2, and Fold 3.

What would the steps be, in English, to do this?


In [8]:
y_col_name = 'MEDV'
X = boston_data.drop(y_col_name, axis=1)
y = boston_data[y_col_name]

In [9]:
kf = KFold(shuffle=True, random_state=72018, n_splits=3)

In [11]:
kf

KFold(n_splits=3, random_state=72018, shuffle=True)

In [10]:
for train_index, test_index in kf.split(X):
    print(f"train_index: {train_index[:10]}, {len(train_index)}")
    print(f"test_index: {test_index[:10]}, {len(test_index)}")
    print(f"\n")

train_index: [ 1  3  4  5  7  8 10 11 12 13], 337
test_index: [ 0  2  6  9 15 17 19 23 25 26], 169


train_index: [ 0  2  6  9 10 11 12 13 15 17], 337
test_index: [ 1  3  4  5  7  8 14 16 22 27], 169


train_index: [0 1 2 3 4 5 6 7 8 9], 338
test_index: [10 11 12 13 18 20 21 24 28 31], 168




In [12]:
scores = []
lr = LinearRegression()

for train_index, test_index in kf.split(X):
    X_train, X_test, y_train, y_test = (X.iloc[train_index, :],
                                        X.iloc[test_index, :],
                                        y[train_index],
                                        y[test_index])
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    score = r2_score(y_test.values, y_pred)
    scores.append(score)

scores

[0.6719348798472742, 0.7485020059212382, 0.6976807323597771]

In [15]:
scores = []

lr = LinearRegression()
sc = StandardScaler()

for train_index, test_index in kf.split(X):
    X_train, X_test, y_train, y_test = (X.iloc[train_index, :],
                                        X.iloc[test_index, :],
                                        y[train_index],
                                        y[test_index])
    X_train_s = sc.fit_transform(X_train)
    lr.fit(X_train_s, y_train)
    X_test_s = sc.transform(X_test)
    y_pred = lr.predict(X_test_s)
    score = r2_score(y_test.values, y_pred)
    scores.append(score)

scores

[0.6719348798472715, 0.7485020059212382, 0.6976807323597745]

## Using Pipeline and cross_val_predict

In [19]:
s = StandardScaler()
lr = LinearRegression()
kf = KFold(shuffle=True, random_state=72018, n_splits=3)

In [20]:
estimator = Pipeline([('scaler', s),
                      ('regression', lr)])

In [21]:
predictions = cross_val_predict(estimator, X, y, cv=kf)
r2_score(y, predictions)
np.mean(scores)

0.7060392060427615

## Hyperparameter Tuning

In [22]:
alphas = np.geomspace(1e-9, 1e0, num=10)
alphas

array([1.e-09, 1.e-08, 1.e-07, 1.e-06, 1.e-05, 1.e-04, 1.e-03, 1.e-02,
       1.e-01, 1.e+00])

In [23]:
scores = []
coefs = []

for alpha in alphas:
    las = Lasso(alpha=alpha, max_iter=100000)
    
    estimator = Pipeline([
        ('scaler', s),
        ('lasso_regression', las)])
    
    predictions = cross_val_predict(estimator, X, y, cv=kf)
    score = r2_score(y, predictions)
    scores.append(score)

In [24]:
list(zip(alphas, scores))

[(1e-09, 0.7063531064981925),
 (1e-08, 0.7063531072356071),
 (1e-07, 0.7063531145602442),
 (1e-06, 0.7063531882052063),
 (1e-05, 0.7063539165191507),
 (0.0001, 0.706361268093463),
 (0.001, 0.706433467041546),
 (0.01, 0.7070865958083233),
 (0.1, 0.705838151167185),
 (1.0, 0.6512724532884888)]

In [25]:
Lasso(alpha=1e-6).fit(X, y).coef_

array([-1.07170372e-01,  4.63952623e-02,  2.08588308e-02,  2.68854318e+00,
       -1.77954207e+01,  3.80475296e+00,  7.50802707e-04, -1.47575348e+00,
        3.05654279e-01, -1.23293755e-02, -9.53459908e-01,  9.39253013e-03,
       -5.25467196e-01])

In [26]:
Lasso(alpha=1.0).fit(X, y).coef_

array([-0.06342255,  0.04916867, -0.        ,  0.        , -0.        ,
        0.94678567,  0.02092737, -0.66900864,  0.26417501, -0.01520915,
       -0.72319901,  0.00829117, -0.76143296])

In [27]:
plt.figure(figsize=(10, 6))
plt.semilogx(alphas, scores, '-o')
plt.xlabel('$\\alpha$')
plt.ylabel('$R^2$')

AttributeError: module 'matplotlib.pyplot' has no attribute 'semilog'

<Figure size 1000x600 with 0 Axes>