#### MIN-MAX SCALING  
`Xscaled=(Xi-Xmin) / (Xmax-Xmin)`  
Data will range between 0 and 1  


In [42]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, Normalizer, StandardScaler, RobustScaler

In [43]:
# Read the USA housing dataset into a DataFrame.
csv_path = "ago/USA_Housing.csv"
df = pd.read_csv(csv_path)

# Plain-text preview of the first rows.
print(df.head())

   Avg. Area Income  Avg. Area House Age  Avg. Area Number of Rooms  \
0      79545.458574             5.682861                   7.009188   
1      79248.642455             6.002900                   6.730821   
2      61287.067179             5.865890                   8.512727   
3      63345.240046             7.188236                   5.586729   
4      59982.197226             5.040555                   7.839388   

   Avg. Area Number of Bedrooms  Area Population         Price  \
0                          4.09     23086.800503  1.059034e+06   
1                          3.09     40173.072174  1.505891e+06   
2                          5.13     36882.159400  1.058988e+06   
3                          3.26     34310.242831  1.260617e+06   
4                          4.23     26354.109472  6.309435e+05   

                                             Address  
0  208 Michael Ferry Apt. 674\nLaurabury, NE 3701...  
1  188 Johnson Views Suite 079\nLake Kathleen, CA...  
2  9127 Eli

In [44]:
# Split the columns by dtype. select_dtypes is used instead of comparing each
# dtype with 'object' because text columns are not always stored as object
# (e.g. pandas' dedicated string dtype); such a column would otherwise be
# classified as numeric and passed on to the scalers.

# Categorical columns: every column that is not numeric
cat_col = df.select_dtypes(exclude="number").columns.tolist()
print('Categorical columns :',cat_col)
# Numerical columns
num_col = df.select_dtypes(include="number").columns.tolist()
print('Numerical columns :',num_col)

# Independent frame holding only the numeric columns
num_col_df = df[num_col].copy()
num_col_df.head()

Categorical columns : ['Address']
Numerical columns : ['Avg. Area Income', 'Avg. Area House Age', 'Avg. Area Number of Rooms', 'Avg. Area Number of Bedrooms', 'Area Population', 'Price']


Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price
0,79545.458574,5.682861,7.009188,4.09,23086.800503,1059034.0
1,79248.642455,6.0029,6.730821,3.09,40173.072174,1505891.0
2,61287.067179,5.86589,8.512727,5.13,36882.1594,1058988.0
3,63345.240046,7.188236,5.586729,3.26,34310.242831,1260617.0
4,59982.197226,5.040555,7.839388,4.23,26354.109472,630943.5


In [45]:
# Min-max scaler that maps every column onto the [0, 1] interval.
scaler = MinMaxScaler(feature_range=(0, 1))

# Fit on the numeric columns (per-column minimum and maximum), then rescale
# those same columns with the fitted scaler.
scaled_data = scaler.fit(num_col_df).transform(num_col_df)

# Wrap the scaled values in a DataFrame that reuses the original column labels.
scaled_df = pd.DataFrame(data=scaled_data, columns=num_col_df.columns)
scaled_df.head()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price
0,0.686822,0.441986,0.501502,0.464444,0.329942,0.42521
1,0.683521,0.488538,0.464501,0.242222,0.575968,0.607369
2,0.483737,0.468609,0.70135,0.695556,0.528582,0.425192
3,0.50663,0.660956,0.31243,0.28,0.491549,0.507384
4,0.469223,0.348556,0.611851,0.495556,0.376988,0.250702


#### NORMALIZATION   
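`Xscaled = Xi / ||X||`, where `||X||` is the L2 norm of the row (sample) that `Xi` belongs to  
sklearn's `Normalizer` rescales each row to unit length rather than scaling each column, so every value lies between -1 and 1 (between 0 and 1 when all inputs are non-negative)  
This differs from mean normalization, `Xscaled=(Xi-Xmean) / (Xmax-Xmin)`, which works per column and ranges between -1 and 1  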


In [46]:
# Normalizer with its default settings: each row (sample) is rescaled to unit
# L2 norm. It works per row, not per column.
scaler = Normalizer()

# Normalizer is stateless, so fitting learns nothing from the data; the
# fit/transform pair is kept only to mirror the usual scaler workflow.
scaled_data2 = scaler.fit(num_col_df).transform(num_col_df)

# Wrap the normalized values in a DataFrame that reuses the original column labels.
scaled_df2 = pd.DataFrame(data=scaled_data2, columns=num_col_df.columns)
scaled_df2.head()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price
0,0.074883,5e-06,7e-06,4e-06,0.021734,0.996955
1,0.052534,4e-06,4e-06,2e-06,0.026631,0.998264
2,0.057742,6e-06,8e-06,5e-06,0.034749,0.997727
3,0.050168,6e-06,4e-06,3e-06,0.027173,0.998371
4,0.094559,8e-06,1.2e-05,7e-06,0.041546,0.994652


#### STANDARDIZATION   
`Xscaled=(Xi-Xmean) / (standard deviation)`  
Data is not bounded to the range 0 to 1; each column ends up with mean 0 and standard deviation 1  


In [47]:
# Standard scaler: centers each column on its mean and divides by its
# standard deviation.
scaler = StandardScaler()

# Fit on the numeric columns (per-column mean and standard deviation), then
# standardize those same columns with the fitted scaler.
scaled_data3 = scaler.fit(num_col_df).transform(num_col_df)

# Wrap the standardized values in a DataFrame that reuses the original column labels.
scaled_d3 = pd.DataFrame(data=scaled_data3, columns=num_col_df.columns)
scaled_d3.head()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price
0,1.02866,-0.296927,0.021274,0.088062,-1.317599,-0.490081
1,1.000808,0.025902,-0.255506,-0.722301,0.403999,0.775508
2,-0.684629,-0.112303,1.516243,0.93084,0.07241,-0.490211
3,-0.491499,1.221572,-1.393077,-0.58454,-0.186734,0.080843
4,-0.807073,-0.944834,0.846742,0.201513,-0.988387,-1.702518


#### ROBUST SCALING  
`Xscaled=(Xi-Xmedian) / (interquartile range IQR)`  
Data is not bounded to the range 0 to 1; each column ends up with median 0 and an interquartile range of 1  


In [48]:
# Robust scaler: centers each column on its median and divides by its
# interquartile range, which limits the influence of outliers.
scaler = RobustScaler()

# Fit on the numeric columns (per-column median and IQR), then rescale those
# same columns with the fitted scaler.
scaled_data4 = scaler.fit(num_col_df).transform(num_col_df)

# Wrap the scaled values in a DataFrame that reuses the original column labels.
scaled_d4 = pd.DataFrame(data=scaled_data4, columns=num_col_df.columns)
scaled_d4.head()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price
0,0.750985,-0.216456,0.0046,0.02963,-0.974382,-0.366604
1,0.730233,0.024441,-0.19909,-0.711111,0.295278,0.576863
2,-0.525578,-0.078688,1.104787,0.8,0.050735,-0.3667
3,-0.381677,0.916661,-1.036259,-0.585185,-0.140381,0.059006
4,-0.61681,-0.69993,0.612083,0.133333,-0.731592,-1.270447
