In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler, Normalizer, StandardScaler, RobustScaler

In [2]:
# Read the Time/Height measurements from the CSV file into a DataFrame
# and show it as the cell output.
df = pd.read_csv(filepath_or_buffer='data.csv')
df

Unnamed: 0,Time,Height
0,0.0,103.05
1,0.1,101.43
2,0.2,99.0
3,0.3,98.51
4,0.4,97.56
5,0.5,94.43
6,0.6,97.47
7,0.7,96.68
8,0.8,98.32
9,0.9,95.47


<h3> MinMaxScaler()</h3>
This transformer rescales the values of each column independently, <br>
such that the maximum value is mapped to 1.0 and the minimum to 0.0.

In [3]:
# Fit the scaler on df, then rescale every column to the [0, 1] range.
# Note: the result is a NumPy array, not a DataFrame, despite the name.
df_MinMax = MinMaxScaler().fit(df).transform(df)
df_MinMax

array([[0.        , 1.        ],
       [0.02222222, 0.98427948],
       [0.04444444, 0.96069869],
       [0.06666667, 0.95594372],
       [0.08888889, 0.94672489],
       [0.11111111, 0.91635129],
       [0.13333333, 0.94585153],
       [0.15555556, 0.93818535],
       [0.17777778, 0.95409995],
       [0.2       , 0.92644347],
       [0.22222222, 0.92197962],
       [0.24444444, 0.92498787],
       [0.26666667, 0.91237263],
       [0.28888889, 0.87103348],
       [0.31111111, 0.87442989],
       [0.33333333, 0.87074236],
       [0.35555556, 0.8792819 ],
       [0.37777778, 0.85104318],
       [0.4       , 0.8363901 ],
       [0.42222222, 0.80912179],
       [0.44444444, 0.76108685],
       [0.46666667, 0.7869966 ],
       [0.48888889, 0.74138768],
       [0.51111111, 0.72954876],
       [0.53333333, 0.66220281],
       [0.55555556, 0.65657448],
       [0.57777778, 0.64599709],
       [0.6       , 0.5930131 ],
       [0.62222222, 0.59223678],
       [0.64444444, 0.57962154],
       [0.

<h3> Normalizer() </h3>
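This transformer rescales each row (sample) to unit norm (L2 by default) by dividing every value in the row by the norm of that row vector. <br>
If the data is transposed first (`.T`), the norm is computed over each original column instead; transpose the result back to recover the original layout.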

In [4]:
# Normalizer works row-wise, so transpose first to make each original column
# a row; the second transpose restores the original (rows x columns) layout
# for display.
df_Normalizer = Normalizer().fit_transform(df.transpose())
df_Normalizer.transpose()

array([[0.        , 0.20769037],
       [0.00564378, 0.20442536],
       [0.01128755, 0.19952786],
       [0.01693133, 0.1985403 ],
       [0.0225751 , 0.19662564],
       [0.02821888, 0.19031733],
       [0.03386266, 0.19644425],
       [0.03950643, 0.19485206],
       [0.04515021, 0.19815737],
       [0.05079398, 0.19241338],
       [0.05643776, 0.19148628],
       [0.06208153, 0.19211107],
       [0.06772531, 0.18949101],
       [0.07336909, 0.18090526],
       [0.07901286, 0.18161066],
       [0.08465664, 0.1808448 ],
       [0.09030041, 0.18261838],
       [0.09594419, 0.17675347],
       [0.10158797, 0.17371017],
       [0.10723174, 0.1680468 ],
       [0.11287552, 0.15807041],
       [0.11851929, 0.16345161],
       [0.12416307, 0.15397908],
       [0.12980684, 0.15152025],
       [0.13545062, 0.13753314],
       [0.1410944 , 0.13636419],
       [0.14673817, 0.13416737],
       [0.15238195, 0.12316311],
       [0.15802572, 0.12300187],
       [0.1636695 , 0.12038181],
       [0.

<h3> StandardScaler() </h3>
This transformer sets the mean and standard deviation of each column to 0 and 1, respectively: <br>
scaled_feature = (feature - mean)/std, where std is the population standard deviation (ddof = 0).


In [5]:
# Standardize every column, then wrap the resulting array in a DataFrame that
# reuses the original column labels.
array_standarized = StandardScaler().fit_transform(df)
df_standarized = pd.DataFrame(array_standarized, columns=df.columns)

print(df_standarized.Time.mean(),   df_standarized.Time.std())
print(df_standarized.Height.mean(), df_standarized.Height.std())

# The means differ from 0 only by floating-point round-off. The printed std,
# however, is NOT a round-off artifact: pandas' .std() defaults to the sample
# estimator (ddof=1), whereas StandardScaler divides by the population
# standard deviation (ddof=0). The printed value is therefore
# sqrt(n / (n - 1)) rather than 1. Computed with ddof=0 the std is 1, which
# the silent checks below confirm.
np.testing.assert_allclose(df_standarized.mean(), 0.0, atol=1e-12)
np.testing.assert_allclose(df_standarized.std(ddof=0), 1.0)

-1.9308226515220113e-17 1.0110500592068734
1.8342815189459108e-16 1.0110500592068734


<h3> RobustScaler() </h3>
This transformer performs the following for each feature/column independently: <br>
scaled_feature = (feature - median)/IQR, <br>
where IQR is the interquartile range (the 75th percentile minus the 25th percentile).

In [6]:
# Center each column on its median and divide by its interquartile range,
# then restore the column labels so the result displays as a DataFrame.
array_Robusted = RobustScaler().fit(df).transform(df)
df_Robusted = pd.DataFrame(data=array_Robusted, columns=df.columns)
df_Robusted

Unnamed: 0,Time,Height
0,-1.0,0.521922
1,-0.955556,0.490906
2,-0.911111,0.444381
3,-0.866667,0.434999
4,-0.822222,0.41681
5,-0.777778,0.356883
6,-0.733333,0.415087
7,-0.688889,0.399962
8,-0.644444,0.431361
9,-0.6,0.376795


In [7]:
# Reproduce RobustScaler by hand for the Height column:
# (value - median) / IQR, with IQR = 75th percentile - 25th percentile.
# The result is stored next to the RobustScaler output for comparison.
quantile = df['Height'].quantile([0.25, 0.75])
IQR = quantile.loc[0.75] - quantile.loc[0.25]
median = df['Height'].median()
df_Robusted['manual_scaled_Height'] = (df['Height'] - median) / IQR
df_Robusted

Unnamed: 0,Time,Height,manual_scaled_Height
0,-1.0,0.521922,0.521922
1,-0.955556,0.490906,0.490906
2,-0.911111,0.444381,0.444381
3,-0.866667,0.434999,0.434999
4,-0.822222,0.41681,0.41681
5,-0.777778,0.356883,0.356883
6,-0.733333,0.415087,0.415087
7,-0.688889,0.399962,0.399962
8,-0.644444,0.431361,0.431361
9,-0.6,0.376795,0.376795


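The main advantage of RobustScaler() is that its scaling factor does not depend on the outliers, unlike the previous ones: the median and IQR are barely affected by a few extreme values, whereas the min/max, norm, mean and standard deviation are.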