#### Feature scaling
: feature의 스케일만 조정하므로 output은 바뀌지 않으며, 처리속도가 좀 더 빨라진다.
- feature(변수) 간의 min-max(값의 범위)를 맞춰준다.
    - Min-Max Normalization
- Standardization(표준화, Z-score Normalization): 평균 0, 표준편차 1이 되도록 변환한다. (분포 모양 자체가 정규분포로 바뀌는 것은 아니다.)

In [1]:
import pandas as pd
import numpy as np

# Small example frame: two numeric columns (A, B) and one string column (C).
raw_values = {
    'A': [14.00, 90.20, 90.95, 96.27, 91.21],
    'B': [103.02, 107.26, 110.35, 114.23, 114.68],
    'C': ['big', 'small', 'big', 'small', 'small'],
}
df = pd.DataFrame(raw_values)
df

Unnamed: 0,A,B,C
0,14.0,103.02,big
1,90.2,107.26,small
2,90.95,110.35,big
3,96.27,114.23,small
4,91.21,114.68,small


##### Min-Max Normalization

In [4]:
# Linearly rescale column A so that its smallest value maps to 1 and its
# largest value maps to 5.
target_min, target_max = 1, 5
a_min = df['A'].min()
a_max = df['A'].max()
df['A'] = (df['A'] - a_min) / (a_max - a_min) * (target_max - target_min) + target_min
df

Unnamed: 0,A,B,C
0,1.0,103.02,big
1,4.704874,107.26,small
2,4.741339,110.35,big
3,5.0,114.23,small
4,4.753981,114.68,small


##### Z-Score Normalization

In [5]:
# Z-score column B: subtract the column mean, then divide by the column
# standard deviation.
# NOTE: pandas' Series.std() defaults to the sample standard deviation
# (ddof=1), so the values differ slightly from a population-std z-score.
b_mean = df['B'].mean()
b_std = df['B'].std()
df['B'] = (df['B'] - b_mean) / b_std
df

Unnamed: 0,A,B,C
0,1.0,-1.40525,big
1,4.704874,-0.54023,small
2,4.741339,0.090174,big
3,5.0,0.881749,small
4,4.753981,0.973556,small


##### Feature scaling with sklearn

In [6]:
# Load the first three columns of the wine dataset. The file is read with
# header=None (no header row is consumed), so column names are assigned
# explicitly afterwards.
# NOTE: this rebinds `df`, replacing the small example frame used in the
# cells above.
# Uses the public `pd.read_csv` entry point rather than reaching through the
# internal `pd.io.parsers` module path.
df = pd.read_csv(
    'https://raw.githubusercontent.com/rasbt/pattern_classification/master/data/wine_data.csv',
    header=None,
    usecols=[0, 1, 2],
)

df.columns = ['Class label', 'Alcohol', 'Malic acid']

df.head()

Unnamed: 0,Class label,Alcohol,Malic acid
0,1,14.23,1.71
1,1,13.2,1.78
2,1,13.16,2.36
3,1,14.37,1.95
4,1,13.24,2.59


In [7]:
from sklearn import preprocessing

In [18]:
# Both scalers are used in two explicit steps: fit() on the selected feature
# columns, then transform() on the same columns.
feature_cols = ['Alcohol', 'Malic acid']
features = df[feature_cols]

# Standardization with StandardScaler.
std_scale = preprocessing.StandardScaler()
std_scale.fit(features)
df_std = std_scale.transform(features)

# Min-max scaling with the output range set to (1, 5).
minmax_scale = preprocessing.MinMaxScaler(feature_range=(1, 5))
minmax_scale.fit(features)
df_minmax = minmax_scale.transform(features)

`fit()` → `transform()` 순서로 사용한다: `LabelEncoder`와 동일한 사용 방식이다.

In [19]:
# Display the standardized features returned by std_scale.transform()
# (one row per sample; columns are 'Alcohol' and 'Malic acid', in that order).
df_std

array([[ 1.51861254, -0.5622498 ],
       [ 0.24628963, -0.49941338],
       [ 0.19687903,  0.02123125],
       [ 1.69154964, -0.34681064],
       [ 0.29570023,  0.22769377],
       [ 1.48155459, -0.51736664],
       [ 1.71625494, -0.4186237 ],
       [ 1.3086175 , -0.16727801],
       [ 2.25977152, -0.62508622],
       [ 1.0615645 , -0.88540853],
       [ 1.3580281 , -0.15830138],
       [ 1.38273339, -0.76871232],
       [ 0.92568536, -0.54429654],
       [ 2.16095032, -0.54429654],
       [ 1.70390229, -0.4186237 ],
       [ 0.77745356, -0.47248348],
       [ 1.60508109, -0.37374054],
       [ 1.02450655, -0.68792264],
       [ 1.46920194, -0.66996938],
       [ 0.78980621,  0.68550197],
       [ 1.3086175 , -0.63406285],
       [-0.08723191,  1.31386618],
       [ 0.87627476, -0.42760033],
       [-0.18605311, -0.66099274],
       [ 0.61686912, -0.47248348],
       [ 0.06099988, -0.25704433],
       [ 0.48098997, -0.50839001],
       [ 0.36981612, -0.55327317],
       [ 1.07391715,

In [20]:
# Display the min-max scaled features returned by minmax_scale.transform()
# (scaler configured with feature_range=(1, 5); columns are 'Alcohol' and
# 'Malic acid', in that order).
df_minmax

array([[ 4.36842105,  1.76679842],
       [ 3.28421053,  1.82213439],
       [ 3.24210526,  2.28063241],
       [ 4.51578947,  1.95652174],
       [ 3.32631579,  2.46245059],
       [ 4.33684211,  1.80632411],
       [ 4.53684211,  1.89328063],
       [ 4.18947368,  2.11462451],
       [ 5.        ,  1.71146245],
       [ 3.97894737,  1.48221344],
       [ 4.23157895,  2.12252964],
       [ 4.25263158,  1.58498024],
       [ 3.86315789,  1.7826087 ],
       [ 4.91578947,  1.7826087 ],
       [ 4.52631579,  1.89328063],
       [ 3.73684211,  1.8458498 ],
       [ 4.44210526,  1.93280632],
       [ 3.94736842,  1.65612648],
       [ 4.32631579,  1.67193676],
       [ 3.74736842,  2.86561265],
       [ 4.18947368,  1.70355731],
       [ 3.        ,  3.41897233],
       [ 3.82105263,  1.88537549],
       [ 2.91578947,  1.6798419 ],
       [ 3.6       ,  1.8458498 ],
       [ 3.12631579,  2.03557312],
       [ 3.48421053,  1.81422925],
       [ 3.38947368,  1.77470356],
       [ 3.98947368,