# Feature Transformation

- Ref: 1031

In [63]:
import pandas as pd

In [64]:
# Read the supershop dataset from disk and preview its first rows
csv_path = 'supershop.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [65]:
# Keep six independent deep copies of the freshly loaded frame so that the
# later scaling sections can each start from unscaled data without having to
# reload the CSV again.
df1, df2, df3, df4, df5, df6 = (df.copy() for _ in range(6))

# Feature Engineering

- Normalization
- Standardization
- Max Scaler
- Max Absolute Scaler
- Robust Scaler

# Normalization

In [66]:
# Import the scaler class and create one scaler instance
from sklearn.preprocessing import MinMaxScaler     # Min-max scaler class from scikit-learn
min_max = MinMaxScaler()                           # Scaler object with default settings; press Shift+Tab to inspect the options (e.g. the default feature_range can be changed when needed)

In [67]:
# min_max = MinMaxScaler(feature_range=(0, 5))   # Example of customization: scale into [0, 5]; feature_range is (min, max) and min must be smaller than max

In [68]:
# Fit only: the scaler learns from the Marketing Spend column here;
# df itself stays unchanged until transform is called.
marketing_spend = df[['Marketing Spend']]
scaled_1st = min_max.fit(marketing_spend)
scaled_1st   # the fitted scaler object, not scaled data - transform comes next

MinMaxScaler()

In [69]:
df.head()                                          # Still unscaled: nothing changes in df until transform is applied

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [70]:
# Transform - 1st transformation: apply the already-fitted scaler to
# Marketing Spend and overwrite the column with the scaled values
marketing_frame = df[['Marketing Spend']]
df['Marketing Spend'] = min_max.transform(marketing_frame)
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,0.692617,136897.8,471784.1,Dhaka,192261.83
1,0.983359,151377.59,443898.53,Ctg,191792.06
2,0.927985,101145.55,407934.54,Rangpur,191050.39
3,0.873136,118671.85,383199.62,Dhaka,182901.99
4,0.859438,91391.77,366168.42,Rangpur,166187.94


In [71]:
# Transform - Administration: fit and transform in a single call.
# The scaler is fitted on "Administration" and the result replaces "Administration".
administration_frame = df[['Administration']]
df['Administration'] = min_max.fit_transform(administration_frame)
df.head()            # transform always depends on a preceding fit

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,0.692617,0.651744,471784.1,Dhaka,192261.83
1,0.983359,0.761972,443898.53,Ctg,191792.06
2,0.927985,0.379579,407934.54,Rangpur,191050.39
3,0.873136,0.512998,383199.62,Dhaka,182901.99
4,0.859438,0.305328,366168.42,Rangpur,166187.94


In [72]:
# Transform - Transport: fit and transform in a single call, then overwrite the column
transport_frame = df[['Transport']]
df['Transport'] = min_max.fit_transform(transport_frame)
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,0.692617,0.651744,1.0,Dhaka,192261.83
1,0.983359,0.761972,0.940893,Ctg,191792.06
2,0.927985,0.379579,0.864664,Rangpur,191050.39
3,0.873136,0.512998,0.812235,Dhaka,182901.99
4,0.859438,0.305328,0.776136,Rangpur,166187.94


# Standardization

In [73]:
# Original data before scaling (df1 is a copy of df taken before any scaler was applied)
df1.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


#### Scratch Method

In [74]:
# Raw / scratch / manual standardization: z = (x - mean) / std.
# Note: pandas' .std() uses the sample standard deviation (ddof=1) by default.
marketing = df2['Marketing Spend']
df2['New Markeing Spend Col'] = (marketing - marketing.mean()) / marketing.std()
df2.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit,New Markeing Spend Col
0,114523.61,136897.8,471784.1,Dhaka,192261.83,0.888889
1,162597.7,151377.59,443898.53,Ctg,191792.06,1.936203
2,153441.51,101145.55,407934.54,Rangpur,191050.39,1.736731
3,144372.41,118671.85,383199.62,Dhaka,182901.99,1.539157
4,142107.34,91391.77,366168.42,Rangpur,166187.94,1.489812


- The scaled values are kept in a separate column, "New Markeing Spend Col", instead of replacing the original "Marketing Spend" column - this is optional
- Scratch implementation, applied directly from the formula z = (x - mean) / std
- After scaling (standardization), the mean will be 0 and the standard deviation will be 1; this is verified below

In [75]:
df2['New Markeing Spend Col'].std()  # standard deviation of the manually standardized column (expected: 1)

1.0

In [76]:
df2['New Markeing Spend Col'].mean()  # mean is effectively 0 (ZERO): the result is on the order of 1e-16, i.e. about 0.000000000000000279, returned in scientific notation (e-16); the tiny residue is presumably floating-point rounding

2.7977620220553945e-16

#### Import package

In [77]:
# StandardScaler standardizes a feature by removing the mean and scaling to unit variance
from sklearn.preprocessing import StandardScaler
std_scaler = StandardScaler()

In [78]:
# Transform - 1st: standardize Marketing Spend with the library scaler and overwrite the column
marketing_frame = df2[['Marketing Spend']]
df2['Marketing Spend'] = std_scaler.fit_transform(marketing_frame)
df2.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit,New Markeing Spend Col
0,0.897913,136897.8,471784.1,Dhaka,192261.83,0.888889
1,1.95586,151377.59,443898.53,Ctg,191792.06,1.936203
2,1.754364,101145.55,407934.54,Rangpur,191050.39,1.736731
3,1.554784,118671.85,383199.62,Dhaka,182901.99,1.539157
4,1.504937,91391.77,366168.42,Rangpur,166187.94,1.489812


In [79]:
# Transform - 2nd: standardize Administration with the library scaler and overwrite the column
administration_frame = df2[['Administration']]
df2['Administration'] = std_scaler.fit_transform(administration_frame)
df2.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit,New Markeing Spend Col
0,0.897913,0.560753,471784.1,Dhaka,192261.83,0.888889
1,1.95586,1.082807,443898.53,Ctg,191792.06,1.936203
2,1.754364,-0.728257,407934.54,Rangpur,191050.39,1.736731
3,1.554784,-0.096365,383199.62,Dhaka,182901.99,1.539157
4,1.504937,-1.079919,366168.42,Rangpur,166187.94,1.489812


- "New Markeing Spend Col" was computed manually (from scratch), while "Marketing Spend" was scaled by the library; the two sets of values are almost the same
- The small gap between "Marketing Spend" and "New Markeing Spend Col" comes from how the standard deviation is computed: pandas `.std()` divides by n - 1 by default (sample standard deviation, ddof=1), whereas `StandardScaler` divides by n (population standard deviation, ddof=0)
- Using the library is recommended

In [80]:
# Transform - 3rd: standardize Transport with the library scaler and overwrite the column
transport_frame = df2[['Transport']]
df2['Transport'] = std_scaler.fit_transform(transport_frame)
df2.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit,New Markeing Spend Col
0,0.897913,0.560753,2.165287,Dhaka,192261.83,0.888889
1,1.95586,1.082807,1.929843,Ctg,191792.06,1.936203
2,1.754364,-0.728257,1.626191,Rangpur,191050.39,1.736731
3,1.554784,-0.096365,1.417348,Dhaka,182901.99,1.539157
4,1.504937,-1.079919,1.27355,Rangpur,166187.94,1.489812


# Max Scaling

In [81]:
# Original (unscaled) data in df3 before max scaling
df3.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [82]:
df3['Marketing Spend'].max()  # largest Marketing Spend value; the next cell divides the column by it

165349.2

In [83]:
# Transform 1st - Marketing Spend: divide every value by the column maximum
marketing_max = df3['Marketing Spend'].max()
df3['Marketing Spend'] = df3['Marketing Spend'] / marketing_max
df3.head()     # dividing by the maximum is itself a simple kind of scaling

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,0.692617,136897.8,471784.1,Dhaka,192261.83
1,0.983359,151377.59,443898.53,Ctg,191792.06
2,0.927985,101145.55,407934.54,Rangpur,191050.39
3,0.873136,118671.85,383199.62,Dhaka,182901.99
4,0.859438,91391.77,366168.42,Rangpur,166187.94


# Max Absolute Scaling

In [84]:
# Original data in df4 before feature transformation / scaling
df4.head()  

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [85]:
# MaxAbsScaler scales each feature by its maximum absolute value
from sklearn.preprocessing import MaxAbsScaler
mxabs_scaler = MaxAbsScaler()

In [86]:
# Transform 1st - Marketing Spend: max-abs scale the column and overwrite it
marketing_frame = df4[['Marketing Spend']]
df4['Marketing Spend'] = mxabs_scaler.fit_transform(marketing_frame)
df4.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,0.692617,136897.8,471784.1,Dhaka,192261.83
1,0.983359,151377.59,443898.53,Ctg,191792.06
2,0.927985,101145.55,407934.54,Rangpur,191050.39
3,0.873136,118671.85,383199.62,Dhaka,182901.99
4,0.859438,91391.77,366168.42,Rangpur,166187.94


In [88]:
# Transform 2nd - Administration: max-abs scale the column and overwrite it
administration_frame = df4[['Administration']]
df4['Administration'] = mxabs_scaler.fit_transform(administration_frame)
df4.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,0.692617,0.749527,471784.1,Dhaka,192261.83
1,0.983359,0.828805,443898.53,Ctg,191792.06
2,0.927985,0.553781,407934.54,Rangpur,191050.39
3,0.873136,0.649738,383199.62,Dhaka,182901.99
4,0.859438,0.500378,366168.42,Rangpur,166187.94


In [89]:
# Transform 3rd - Transport: max-abs scale the column and overwrite it
transport_frame = df4[['Transport']]
df4['Transport'] = mxabs_scaler.fit_transform(transport_frame)
df4.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,0.692617,0.749527,1.0,Dhaka,192261.83
1,0.983359,0.828805,0.940893,Ctg,191792.06
2,0.927985,0.553781,0.864664,Rangpur,191050.39
3,0.873136,0.649738,0.812235,Dhaka,182901.99
4,0.859438,0.500378,0.776136,Rangpur,166187.94


# Robust Scaler

In [91]:
# Original (unscaled) data in df5 before robust scaling
df5.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [90]:
# RobustScaler removes the median and scales by a quantile range (the interquartile range by default)
from sklearn.preprocessing import RobustScaler
RoSc = RobustScaler()

In [92]:
# Transform 1 - Marketing Spend: robust-scale the column and overwrite it
marketing_frame = df5[['Marketing Spend']]
df5['Marketing Spend'] = RoSc.fit_transform(marketing_frame)
df5.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,0.67253,136897.8,471784.1,Dhaka,192261.83
1,1.452113,151377.59,443898.53,Ctg,191792.06
2,1.303634,101145.55,407934.54,Rangpur,191050.39
3,1.156567,118671.85,383199.62,Dhaka,182901.99
4,1.119836,91391.77,366168.42,Rangpur,166187.94


### Assumptions

- fit is the pre-condition for transformation
- fit is just the training process, while transform actually replaces the values with the scaled values
- a scaler object is essential for fitting or transformation

### Ref:

- https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
- https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html