# Exercise: Feature Scaling

Definition and exercise content.

REFS: https://www.baeldung.com/cs/normalization-vs-standardization

https://scikit-learn.org/stable/modules/preprocessing.html#standardization-or-mean-removal-and-variance-scaling


https://stats.stackexchange.com/questions/324369/feature-scaling-giving-reduced-output-linear-regression-using-gradient-descent


## Preparing data
......



In [1]:
# Import everything we will need for this unit
import pandas as pd
import numpy as np
import operator
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

import SimpleLinearRegression as slr

# Load data from our dataset file into a pandas dataframe
dataset = pd.read_csv('Data/auto-mpg-cleaned.csv')

# # Check what's in the dataset
print(dataset.head())
print(dataset.info())



    mpg  cylinders  displacement  horsepower  weight  acceleration  \
0  27.0          4          97.0          88    2130          14.5   
1  26.0          4         121.0         113    2234          12.5   
2  26.0          4          97.0          46    1835          20.5   
3  25.0          4         110.0          87    2672          17.5   
4  25.0          4         104.0          95    2375          17.5   

   model year  origin                      car name  
0          70       3                  datsun pl510  
1          70       2                      bmw 2002  
2          70       2  volkswagen 1131 deluxe sedan  
3          70       2                   peugeot 504  
4          70       2                      saab 99e  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392 entries, 0 to 391
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           392 non-null    float64
 1   cylinders     392

Explain dataset, 

why features need scaling

train unscaled model

In [2]:
# Select only numerical features for training and testing
# Convert both X and y to numpy arrays for processing
X = dataset[['horsepower', 'weight', 'acceleration']].to_numpy()
y = dataset['mpg'].to_numpy()

# Split test and Train sets
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0)



# print("X_train")
# print(X_train)

# Train model with unscaled features
# We have to set normalize and fit_transform to False
# to disable automatic scaling
# model = LinearRegression(normalize=False)
model = slr.SimpleLinearRegression()
weights, J_history = model.fit(X_train,y_train, 0.001, 400)

# Evaluate using test_set
y_hat = model.predict(X_test)

# Calculate metrics
rmse_0 = np.sqrt(mean_squared_error(y_test,y_hat))
r2_0 = r2_score(y_test,y_hat)

print(f"RMSE metrics: {rmse_0}")
print(f"R2 metrics: {r2_0}")


X in class
[[1.000e+00 6.300e+01 2.051e+03 1.700e+01]
 [1.000e+00 9.700e+01 2.405e+03 1.490e+01]
 [1.000e+00 1.250e+02 3.605e+03 1.500e+01]
 ...
 [1.000e+00 1.450e+02 3.988e+03 1.300e+01]
 [1.000e+00 1.000e+02 3.329e+03 1.550e+01]
 [1.000e+00 1.500e+02 4.498e+03 1.450e+01]]
  weights = weights - (learning_rate / m) * np.dot(h_vec - y, X)


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

Show how to scale features, train new  model

In [4]:
# Do a model comparison
scaler = StandardScaler()
X_norm = scaler.fit_transform(X)

# X_norm2 = np.concatenate([np.ones((X_norm.shape[0], 1)), X_norm], axis=1)

print(X_norm)

# Split test and Train sets
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_norm, y, test_size=0.2, random_state=0)

print("X_train")
print(X_train2)

# Train model with unscaled features
# model2 = LinearRegression(normalize=False)
# model2.fit(X_train,y_train)
model2 = slr.SimpleLinearRegression()
_, J_history2 = model2.fit(X_train2,y_train2, 0.001, 400)

# Evaluate using test_set
y_hat2 = model2.predict(X_test2)

# Calculate metrics
rmse_1 = np.sqrt(mean_squared_error(y_test,y_hat2))
r2_1 = r2_score(y_test2,y_hat2)

print(f"RMSE metrics: {rmse_1}")
print(f"R2 metrics: {r2_1}")

# # Use a dataframe to create a comparison table of metrics
# l = [["Unscaled Features", original_rmse, original_r2],
#     ["Custom Model", rmse, r2]]

# pd.DataFrame(l, columns=["", "RMSE", "R2"])

[[-0.42842136 -0.99913445 -0.37792992]
 [ 0.22190846 -0.87653898 -1.10379239]
 [-1.52097544 -1.34688122  1.79965748]
 ...
 [ 0.14386888 -0.03841032  0.31163942]
 [-0.32436858 -0.13271453  0.31163942]
 [ 0.19589527 -0.16807861 -0.30534367]]
X_train
[[-1.07875117e+00 -1.09225986e+00  5.29398160e-01]
 [-1.94302622e-01 -6.74963731e-01 -2.32757428e-01]
 [ 5.34066770e-01  7.39599403e-01 -1.96464305e-01]
 [-6.10513703e-01 -8.93042214e-01  4.93105037e-01]
 [-6.36526896e-01 -9.59055160e-01 -1.96464305e-01]
 [-5.06460933e-01 -4.60421656e-01  1.66466928e-01]
 [-5.06460933e-01  1.46357967e-02  9.64915640e-01]
 [ 1.70466044e+00  1.99266658e+00 -1.46672362e+00]
 [-5.32474126e-01 -5.74765509e-01  5.75875579e-02]
 [ 7.94198696e-01  1.00482999e+00 -1.23878059e-01]
 [ 6.64132733e-01  1.55297320e+00 -2.32757428e-01]
 [ 1.43868881e-01  6.39401181e-01  3.11639421e-01]
 [-1.31286990e+00 -8.52962925e-01  2.88845117e+00]
 [ 1.05433062e+00  1.37025880e+00 -6.68274908e-01]
 [-1.33888310e+00 -1.39403332e+00  7.1

In [None]:
fig = plt.figure()
axes = fig.add_subplot()
axes.set_title('Cost x Iterations for Different Learning Rates')

l0 = plt.plot(np.arange(len(J_history)), J_history, lw=2, color="red")
l1 = plt.plot(np.arange(len(J_history2)), J_history2, lw=2, color="blue")

# Add legend
red_patch = mpatches.Patch(color="red", label="High")
blue_patch = mpatches.Patch(color="blue", label="Low")
plt.legend(title = "Learning Rate", handles=[red_patch, blue_patch])
plt.xlabel("Number of iterations")
_ = plt.ylabel("Cost")

Conclusion



## Summary

.....
