In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
df = pd.DataFrame({
    'Income': [15000, 1800, 120000, 10000],
    'Age': [25, 18, 42, 51],
    'Department': ['HR','Legal','Marketing','Management']
})
df.head()

Unnamed: 0,Income,Age,Department
0,15000,25,HR
1,1800,18,Legal
2,120000,42,Marketing
3,10000,51,Management


In [4]:
df_scaled = df.copy()
col_names = ['Income', 'Age']
features = df_scaled[col_names]
features.head()

Unnamed: 0,Income,Age
0,15000,25
1,1800,18
2,120000,42
3,10000,51


In [5]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

In [6]:
df_scaled[col_names] = scaler.fit_transform(features.values)

In [7]:
df_scaled

Unnamed: 0,Income,Age,Department
0,0.111675,0.212121,HR
1,0.0,0.0,Legal
2,1.0,0.727273,Marketing
3,0.069374,1.0,Management


In [8]:
scaler = MinMaxScaler(feature_range=(5,10))

In [9]:
df_scaled[col_names] = scaler.fit_transform(features.values)
df_scaled

Unnamed: 0,Income,Age,Department
0,5.558376,6.060606,HR
1,5.0,5.0,Legal
2,10.0,8.636364,Marketing
3,5.34687,10.0,Management


In [11]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [12]:
df_scaled[col_names] = scaler.fit_transform(features.values)
df_scaled

Unnamed: 0,Income,Age,Department
0,-0.449056,-0.685248,HR
1,-0.722214,-1.218219,Legal
2,1.723796,0.60911,Marketing
3,-0.552525,1.294358,Management


In [13]:
df_scaled.describe()

Unnamed: 0,Income,Age
count,4.0,4.0
mean,0.0,-5.5511150000000004e-17
std,1.154701,1.154701
min,-0.722214,-1.218219
25%,-0.594947,-0.818491
50%,-0.500791,-0.03806935
75%,0.094157,0.7804217
max,1.723796,1.294358


## MaxAbsScaler

In [14]:
df["Balance"] = [100.0, -263.0, 2000.0, -5.0]
from sklearn.preprocessing import MaxAbsScaler
scaler = MaxAbsScaler()

In [23]:
df_scaled =df.copy()
col_names = ['Income', 'Age','Balance']
features = df_scaled[col_names]

In [24]:
df_scaled[col_names] = scaler.fit_transform(features.values)

In [25]:
df_scaled

Unnamed: 0,Income,Age,Department,Balance
0,0.125,0.490196,HR,0.05
1,0.015,0.352941,Legal,-0.1315
2,1.0,0.823529,Marketing,1.0
3,0.083333,1.0,Management,-0.0025


In [26]:
df_scaled.describe()

Unnamed: 0,Income,Age,Balance
count,4.0,4.0,4.0
mean,0.305833,0.666667,0.229
std,0.464994,0.297368,0.519626
min,0.015,0.352941,-0.1315
25%,0.06625,0.455882,-0.03475
50%,0.104167,0.656863,0.02375
75%,0.34375,0.867647,0.2875
max,1.0,1.0,1.0


In [27]:
df.describe()

Unnamed: 0,Income,Age,Balance
count,4.0,4.0,4.0
mean,36700.0,34.0,458.0
std,55799.28315,15.165751,1039.252616
min,1800.0,18.0,-263.0
25%,7950.0,23.25,-69.5
50%,12500.0,33.5,47.5
75%,41250.0,44.25,575.0
max,120000.0,51.0,2000.0


# Robust Scaler

## x_scaled = (x – Q1)/(Q3 – Q1)

In [28]:
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()

In [29]:
df_scaled[col_names] = scaler.fit_transform(features.values)
df_scaled

Unnamed: 0,Income,Age,Department,Balance
0,0.075075,-0.404762,HR,0.081458
1,-0.321321,-0.738095,Legal,-0.481769
2,3.228228,0.404762,Marketing,3.02948
3,-0.075075,0.833333,Management,-0.081458


# Quantile Transformer Scaler
'''One of the most interesting feature transformation 
techniques is the Quantile Transformer Scaler converts the variable distribution
to a normal distribution. and scales it accordingly.
Since it makes the variable normally distributed, it also deals with the outliers. 
Here are a few important points regarding the 
Quantile Transformer Scaler
1. It computes the cumulative distribution function of the variable

2. It uses this cdf to map the values to a normal distribution

3. Maps the obtained values to the desired output distribution using the associated quantile function

A caveat to keep in mind though: Since this scaler changes the very distribution of the variables, 
inear relationships among variables may be destroyed by using this scaler. Thus, 
it is best to use this for non-linear data.
Here is the code for using the Quantile Transformer:'''

In [30]:
from sklearn.preprocessing import QuantileTransformer
scaler = QuantileTransformer()

In [32]:
df_scaled[col_names] = scaler.fit_transform(features.values)
df_scaled

  % (self.n_quantiles, n_samples))


Unnamed: 0,Income,Age,Department,Balance
0,0.666667,0.333333,HR,0.666667
1,0.0,0.0,Legal,0.0
2,1.0,0.666667,Marketing,1.0
3,0.333333,1.0,Management,0.333333


# Power Transformer Scaler

In [35]:
from sklearn.preprocessing import PowerTransformer
scaler = PowerTransformer(method = 'box-cox')

'''
parameters:
method = 'box-cox' or 'yeo-johnson'
'''

df_scaled[col_names] = scaler.fit_transform(features.values)
df_scaled

ValueError: The Box-Cox transformation can only be applied to strictly positive data