# Exercise: Feature Scaling

Definition and exercise content.

REFS: https://www.baeldung.com/cs/normalization-vs-standardization

https://scikit-learn.org/stable/modules/preprocessing.html#standardization-or-mean-removal-and-variance-scaling


https://stats.stackexchange.com/questions/324369/feature-scaling-giving-reduced-output-linear-regression-using-gradient-descent


## Preparing data
......



In [36]:
# Import everything we will need for this unit
import pandas as pd
import numpy as np
import operator
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
# from sklearn.pipeline import make_pipeline

# Read the cleaned auto-mpg CSV (path is relative to the working directory)
dataset = pd.read_csv('Data/auto-mpg-cleaned.csv')

# Preview the first five rows to confirm the columns loaded as expected
print(dataset.head(5))



    mpg  cylinders  displacement  horsepower  weight  acceleration  \
0  27.0          4          97.0          88    2130          14.5   
1  26.0          4         121.0         113    2234          12.5   
2  26.0          4          97.0          46    1835          20.5   
3  25.0          4         110.0          87    2672          17.5   
4  25.0          4         104.0          95    2375          17.5   

   model year  origin                      car name  
0          70       3                  datsun pl510  
1          70       2                      bmw 2002  
2          70       2  volkswagen 1131 deluxe sedan  
3          70       2                   peugeot 504  
4          70       2                      saab 99e  


Explain the dataset and why its features need scaling, then train a model on the unscaled features.

In [37]:
# Predictors are four numeric columns; the target is fuel economy (mpg).
# X stays a DataFrame (so .describe() works below); y becomes a numpy array.
feature_columns = ['displacement', 'horsepower', 'weight', 'acceleration']
X = dataset[feature_columns]
y = dataset['mpg'].to_numpy()

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Summary statistics expose how different the feature ranges are before scaling
X_train.describe()



Unnamed: 0,displacement,horsepower,weight,acceleration
count,313.0,313.0,313.0,313.0
mean,193.891374,104.517572,2984.552716,15.596805
std,104.04767,38.378018,849.391248,2.829003
min,68.0,46.0,1613.0,8.0
25%,105.0,76.0,2226.0,13.7
50%,146.0,95.0,2807.0,15.5
75%,267.0,125.0,3632.0,17.2
max,455.0,230.0,5140.0,24.8


In [38]:
# Train a baseline model on the unscaled features.
# fit_intercept=False means no intercept (bias) term is fitted, so the model
# works directly on the raw feature values.
# The old `normalize` argument is intentionally not passed: False was its
# default, and the parameter was removed in scikit-learn 1.2 (passing it now
# raises TypeError).
# NOTE(review): the scaled model later in this notebook uses the default
# fit_intercept=True, so the two runs differ in more than scaling -- confirm
# this is the comparison the exercise intends.
model = LinearRegression(fit_intercept=False)
model.fit(X_train, y_train)

# Evaluate on the held-out test set
y_hat = model.predict(X_test)

# Root-mean-squared error (same units as mpg) and coefficient of determination
rmse = np.sqrt(mean_squared_error(y_test, y_hat))
r2 = r2_score(y_test, y_hat)

print(f"RMSE metrics: {rmse}")
print(f"R2 metrics: {r2}")


RMSE metrics: 6.252185905467142
R2 metrics: 0.4327943749210519


Show how to scale the features, then train a new model on the scaled data.

In [39]:

# Re-split with the same test_size and random_state as the unscaled run, so
# both models are trained and evaluated on the same rows
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, test_size=0.2, random_state=0)

# Fit the scaler on the training rows only, then apply that same transform to
# both sets so no test-set statistics influence the scaling
scaler = StandardScaler()
norm = scaler.fit(X_train2)  # fit() returns the scaler itself; `norm` is kept as an alias
X_train2_norm = norm.transform(X_train2)
X_test2_norm = norm.transform(X_test2)

# transform() returns a plain array; re-attach the column labels from the
# source frame (rather than repeating the list) to inspect the scaled features
df = pd.DataFrame(data=X_train2_norm, index=None, columns=X_train2.columns)
df.describe()



Unnamed: 0,displacement,horsepower,weight,acceleration
count,313.0,313.0,313.0,313.0
mean,6.242788000000001e-17,7.37784e-17,1.362063e-16,-3.859178e-16
std,1.001601,1.001601,1.001601,1.001601
min,-1.211877,-1.52721,-1.617334,-2.689629
25%,-0.8557012,-0.7442603,-0.8944846,-0.671559
50%,-0.46102,-0.2483925,-0.20937,-0.0342736
75%,0.7037706,0.5345567,0.7634692,0.567607
max,2.513528,3.274879,2.541701,3.258367


In [40]:
# Train a second model on the standardized (scaled) features.
# `normalize` is no longer passed: False was its default, and the argument
# was removed in scikit-learn 1.2 (passing it now raises TypeError).
model2 = LinearRegression()
model2.fit(X_train2_norm, y_train2)

# Evaluate on the test set that was scaled with the training-set statistics
y_hat2 = model2.predict(X_test2_norm)

# Score both metrics against this split's own targets (y_test2) so the cell
# does not depend on a variable from the unscaled run
rmse_2 = np.sqrt(mean_squared_error(y_test2, y_hat2))
r2_2 = r2_score(y_test2, y_hat2)

print(f"RMSE metrics: {rmse_2}")
print(f"R2 metrics: {r2_2}")



RMSE metrics: 5.026357848273459
R2 metrics: 0.6334075027239533


In [43]:
# Gather both runs' metrics column by column and display them side by side
comparison = {
    "": ["Without Feature Scaling", "With Feature Scaling"],
    "RMSE": [rmse, rmse_2],
    "R2": [r2, r2_2],
}

pd.DataFrame(comparison)

Unnamed: 0,Unnamed: 1,RMSE,R2
0,Without Feature Scaling,6.252186,0.432794
1,With Feature Scaling,5.026358,0.633408


Conclusion



## Summary

.....
