# Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

### Feature Scaling

In [3]:
# Load seaborn's bundled "mpg" example dataset and preview the first five rows.
df = sns.load_dataset(name='mpg')
df.head(5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino


In [4]:
# Summary statistics for the numeric columns; compare the mean/min/max rows
# to see how differently the columns are scaled.
df.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year
count,398.0,398.0,398.0,392.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,104.469388,2970.424623,15.56809,76.01005
std,7.815984,1.701004,104.269838,38.49116,846.841774,2.757689,3.697627
min,9.0,3.0,68.0,46.0,1613.0,8.0,70.0
25%,17.5,4.0,104.25,75.0,2223.75,13.825,73.0
50%,23.0,4.0,148.5,93.5,2803.5,15.5,76.0
75%,29.0,8.0,262.0,126.0,3608.0,17.175,79.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0


### Note that the features are on very different scales: weight is in the thousands, while horsepower is around 100

### Some models are sensitive to feature magnitude and may mistakenly treat weight as a more important factor simply because its values are large, so that a small change in weight appears to have a bigger impact on the target variable

### One may want to "standardize" all features so that they are of roughly the same magnitude. To do that, we use the StandardScaler

In [20]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [9]:
# Keep only the numeric predictor columns that will be rescaled below.
numerical_features = df.loc[
    :,
    [
        "cylinders",
        "displacement",
        "horsepower",
        "weight",
        "acceleration",
    ],
]
numerical_features.head()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration
0,8,307.0,130.0,3504,12.0
1,8,350.0,165.0,3693,11.5
2,8,318.0,150.0,3436,11.0
3,8,304.0,150.0,3433,12.0
4,8,302.0,140.0,3449,10.5


In [13]:
# Standardize each numeric column to zero mean and unit variance.
# StandardScaler disregards NaNs when fitting and keeps them as NaN in the
# output, so the missing horsepower values pass through unchanged.
scaler = StandardScaler()
normalized_values = scaler.fit_transform(numerical_features)

# fit_transform returns a plain NumPy array by default, so rebuild the
# DataFrame from the input's own columns and index. This keeps the labels in
# sync with `numerical_features` (no second hand-typed column list) and keeps
# the rows aligned with the source frame even if it is filtered later.
normalized_df = pd.DataFrame(
    normalized_values,
    columns=numerical_features.columns,
    index=numerical_features.index,
)
normalized_df.head()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration
0,1.498191,1.090604,0.664133,0.63087,-1.295498
1,1.498191,1.503514,1.574594,0.854333,-1.477038
2,1.498191,1.196232,1.184397,0.55047,-1.658577
3,1.498191,1.061796,1.184397,0.546923,-1.295498
4,1.498191,1.042591,0.924265,0.565841,-1.840117


In [14]:
# Verify the standardization. Note: describe() reports the *sample* standard
# deviation (ddof=1), whereas StandardScaler divides by the *population*
# standard deviation (ddof=0), so `std` shows as ~1.0013 (sqrt(n / (n - 1)))
# rather than exactly 1. The means are zero up to floating-point error.
normalized_df.describe()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration
count,398.0,398.0,392.0,398.0,398.0
mean,-5.171742e-16,-8.312725000000001e-17,-4.392745e-16,-9.902743000000001e-17,8.982206000000001e-17
std,1.001259,1.001259,1.001278,1.001259,1.001259
min,-1.444949,-1.204411,-1.520975,-1.604943,-2.747814
25%,-0.8563206,-0.8563178,-0.7665929,-0.8828266,-0.6328794
50%,-0.8563206,-0.431404,-0.2853488,-0.1973624,-0.02472221
75%,1.498191,0.6584879,0.56008,0.7538337,0.5834349
max,1.498191,2.511784,3.265452,2.565185,3.351912


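### Now all features have a mean of (approximately) 0 and a standard deviation of (approximately) 1. The `std` row shows 1.001 rather than exactly 1 because pandas reports the sample standard deviation, while StandardScaler uses the population standard deviation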

In [21]:
# Rescale each numeric column linearly so its minimum maps to 0 and its
# maximum maps to 1. Like StandardScaler, MinMaxScaler disregards NaNs when
# fitting and keeps them as NaN in the output.
minMaxScaler = MinMaxScaler()
minMax_values = minMaxScaler.fit_transform(numerical_features)

# fit_transform returns a plain NumPy array by default, so rebuild the
# DataFrame from the input's own columns and index rather than a second
# hand-typed column list. This keeps labels and row alignment tied to
# `numerical_features`.
minMax_df = pd.DataFrame(
    minMax_values,
    columns=numerical_features.columns,
    index=numerical_features.index,
)
minMax_df.head()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration
0,1.0,0.617571,0.456522,0.53615,0.238095
1,1.0,0.728682,0.646739,0.589736,0.208333
2,1.0,0.645995,0.565217,0.51687,0.178571
3,1.0,0.609819,0.565217,0.516019,0.238095
4,1.0,0.604651,0.51087,0.520556,0.14881


In [22]:
# Confirm the min-max scaling: every column should report min 0 and max 1.
minMax_df.describe()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration
count,398.0,398.0,392.0,398.0,398.0
mean,0.490955,0.324098,0.317768,0.384867,0.450482
std,0.340201,0.269431,0.209191,0.240103,0.164148
min,0.0,0.0,0.0,0.0,0.0
25%,0.2,0.093669,0.157609,0.173164,0.346726
50%,0.2,0.20801,0.258152,0.337539,0.446429
75%,1.0,0.501292,0.434783,0.565637,0.546131
max,1.0,1.0,1.0,1.0,1.0


## Categorical Variables