# Machine Learning Foundation

## Section 2, Part e: Regularization LAB


## Lib Imports

In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Print arrays with 3 decimals and without scientific notation.
np.set_printoptions(precision=3, suppress=True)
# Seed NumPy's global RNG so the random samples drawn by the helper functions are repeatable.
np.random.seed(72081)

## Import Local Libs

In [5]:
import sys, os

In [6]:
# Put the parent directory on the import path (presumably so the `lib` package imported below resolves).
# NOTE(review): '../' is relative to the kernel's working directory — confirm where the notebook is run from.
sys.path.append('../')

In [9]:
from lib.helpers import download

## Helper Methods

In [49]:
def to_2d(array: np.ndarray) -> np.ndarray:
    """Reshape ``array`` to two dimensions, keeping the first axis.

    All axes after the first are flattened into one, so shape ``(n,)``
    becomes ``(n, 1)`` and shape ``(n, a, b)`` becomes ``(n, a * b)``.

    Parameters
    ----------
    array : np.ndarray
        Array with at least one dimension.

    Returns
    -------
    np.ndarray
        Array of shape ``(array.shape[0], -1)`` holding the same elements.
    """
    return array.reshape(array.shape[0], -1)

def plot_exponential_data() -> np.ndarray:
    """Sample exponentiated standard-normal data, plot its histogram, return it.

    Draws 1000 values ``Z`` from a normal distribution (NumPy defaults:
    mean 0, standard deviation 1) and returns ``exp(Z)``. The result is
    therefore log-normally distributed, not drawn from an exponential
    distribution.

    The samples come from NumPy's global random state, so the output
    depends on the current seed.

    Returns
    -------
    np.ndarray
        1-D array of 1000 positive floats.
    """
    data = np.exp(np.random.normal(size=1000))
    plt.hist(data)
    plt.show()
    return data

def plot_square_normal_data() -> np.ndarray:
    """Sample squared normal data, plot its histogram, return it.

    Draws 1000 values from a normal distribution with mean 5 (NumPy's
    default standard deviation of 1) and squares each of them.

    The samples come from NumPy's global random state, so the output
    depends on the current seed.

    Returns
    -------
    np.ndarray
        1-D array of 1000 non-negative floats.
    """
    data = np.square(np.random.normal(loc=5, size=1000))
    plt.hist(data)
    plt.show()
    return data

def X_and_y_from_df(df: pd.DataFrame,
                    y_col: str) -> tuple[pd.DataFrame, pd.Series]:
    """Split a data frame into features and target.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the feature columns and the target column.
        It is not modified.
    y_col : str
        Name of the target column.

    Returns
    -------
    tuple[pd.DataFrame, pd.Series]
        ``(X, y)`` where ``X`` is ``df`` without ``y_col`` and ``y`` is
        the ``y_col`` column.

    Raises
    ------
    KeyError
        If ``y_col`` is not a column of ``df``.
    """
    # Use the arguments rather than notebook globals, so the helper works
    # for any frame and does not depend on cell execution order.
    X = df.drop(y_col, axis=1)
    y = df[y_col]
    return X, y
    

## Load Boston Data

In [13]:
from sklearn.datasets import fetch_openml

# Download the cleaned Boston housing pickle hosted for the course; it is read from disk in the next cell.
# No OpenML fetch is done here: `boston` is assigned from the pickle before it is first used,
# so an extra network request would only add run time and a possible failure point.
path = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-ML240EN-SkillsNetwork/labs/data/boston_housing_clean.pickle"
download(path, 'boston_housing_clean.pickle')

content <Response [200]>


In [14]:
# Read the downloaded pickle; the loaded object is indexed by 'dataframe' and 'description' below.
# NOTE(security): unpickling can execute arbitrary code — only load pickle files from a trusted source.
with open('boston_housing_clean.pickle', 'rb') as to_read:
    boston = pd.read_pickle(to_read)
    
boston_data = boston['dataframe']
boston_description = boston['description']
# Preview the first rows of the data.
boston_data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


## Globals

In [47]:
# Name of the target column; passed as `y_col` when splitting the frame into X and y.
Y_COL = 'MEDV'

### Data Standardization

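**Standardizing** data refers to rescaling each variable so that it has mean 0 and standard deviation 1, the same location and scale as a **standard** normal distribution. Standardizing shifts and rescales the values but does not change the shape of their distribution.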

The [`StandardScaler`](http://scikit-learn.org/dev/modules/generated/sklearn.preprocessing.StandardScaler.html?utm_medium=Exinfluencer&utm_source=Exinfluencer&utm_content=000026UJ&utm_term=10006555&utm_id=NA-SkillsNetwork-Channel-SkillsNetworkCoursesIBMML240ENSkillsNetwork34171862-2022-01-01#sklearn.preprocessing.StandardScaler) object in scikit-learn can do this.


In [51]:
# Split the Boston frame into the feature matrix X and the target y.
X, y = X_and_y_from_df(boston_data, Y_COL)

In [19]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on X and transform X in one step; the result is a NumPy array of standardized columns.
s = StandardScaler()
X_ss = s.fit_transform(X)
X_ss

array([[-0.418,  0.285, -1.288, ..., -1.459,  0.441, -1.076],
       [-0.415, -0.488, -0.593, ..., -0.303,  0.441, -0.492],
       [-0.415, -0.488, -0.593, ..., -0.303,  0.396, -1.209],
       ...,
       [-0.411, -0.488,  0.116, ...,  1.176,  0.441, -0.983],
       [-0.406, -0.488,  0.116, ...,  1.176,  0.403, -0.865],
       [-0.413, -0.488,  0.116, ...,  1.176,  0.441, -0.669]])

### Exercise:

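Confirm how standard scaling works: for each column, `StandardScaler` subtracts the column mean and divides by the column's (population) standard deviation. First check this on a small array, then reproduce `X_ss` with plain NumPy.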


In [34]:
# Small 3x3 example whose column statistics are easy to check by hand.
a = np.array([[1, 2, 12],
              [8, 12, 2],
              [5, 32, 7]])

In [35]:
# Mean of each column (axis=0 reduces over the rows).
column_mean = a.mean(axis=0)
column_mean

array([ 4.667, 15.333,  7.   ])

In [36]:
# Center each column: the per-column means are broadcast across the rows and subtracted.
a_diff = a - a.mean(axis=0)
a_diff

array([[ -3.667, -13.333,   5.   ],
       [  3.333,  -3.333,  -5.   ],
       [  0.333,  16.667,   0.   ]])

In [39]:
# Standard deviation of each column; NumPy's default is the population form (ddof=0).
a_std = a.std(axis=0)
a_std

array([ 2.867, 12.472,  4.082])

In [37]:
# Mean of each row (axis=1), shown for contrast with axis=0; the cells shown here do not use it for scaling.
row_mean = a.mean(axis=1)
row_mean

array([ 5.   ,  7.333, 14.667])

In [38]:
# Scale the toy array with a fresh scaler so that `s`, which was fitted on X, keeps its fitted state.
a_ss = StandardScaler().fit_transform(a)
a_ss

array([[-1.279, -1.069,  1.225],
       [ 1.162, -0.267, -1.225],
       [ 0.116,  1.336,  0.   ]])

In [41]:
# Standardize X by hand with NumPy: subtract each column's mean, then divide by each column's population std.
X2 = np.array(X)
X2_ss = (X2 - X2.mean(axis=0)) / X2.std(axis=0)
X2_ss

array([[-0.418,  0.285, -1.288, ..., -1.459,  0.441, -1.076],
       [-0.415, -0.488, -0.593, ..., -0.303,  0.441, -0.492],
       [-0.415, -0.488, -0.593, ..., -0.303,  0.396, -1.209],
       ...,
       [-0.411, -0.488,  0.116, ...,  1.176,  0.441, -0.983],
       [-0.406, -0.488,  0.116, ...,  1.176,  0.403, -0.865],
       [-0.413, -0.488,  0.116, ...,  1.176,  0.441, -0.669]])

In [42]:
# Check that the manual result equals StandardScaler's output up to floating-point tolerance.
np.allclose(X2_ss, X_ss)

True

### Coefficients with and without Scaling

In [46]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

In [52]:
# Rebuild X and y with the same call used in the standardization section, so this section starts from unscaled data.
X, y = X_and_y_from_df(boston_data, Y_COL)

In [53]:
# Linear regression that will be fitted on the unscaled features.
lr = LinearRegression()

In [54]:
# Fit on the raw, unscaled features.
lr.fit(X, y)

In [55]:
# One coefficient per feature, expressed in that feature's original units, so magnitudes are not directly comparable.
lr.coef_

array([ -0.107,   0.046,   0.021,   2.689, -17.796,   3.805,   0.001,
        -1.476,   0.306,  -0.012,  -0.953,   0.009,  -0.525])

In [None]:
# Chain standard scaling ('ss') and linear regression ('lr') so the model is fitted on standardized features.
pipeline = Pipeline(
    steps=[
        ('ss', StandardScaler()),
        ('lr', LinearRegression())
    ]
)

In [None]:
# Fit the scaler on X, transform X, then fit the regression on the scaled values.
pipeline.fit(X, y)

In [None]:
# Take the fitted regression step out of the pipeline by its step name.
# NOTE: this rebinds `lr`, so the model fitted on unscaled features is no longer reachable by that name.
lr = pipeline['lr']

In [None]:
# Coefficients of the regression that was fitted on standardized features.
lr.coef_