In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

In [2]:
cancer = load_breast_cancer()
cancer.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [3]:
X_train, X_test, y_train, y_test = train_test_split(cancer['data'], cancer.target, random_state=0)

### 1. standardscaler
- 각 데이터의 feature들의 평균을 0, 분산을 1로 변경
- 모든 feature들이 같은 스케일을 가지게 됨

In [4]:
from sklearn.preprocessing import StandardScaler

In [6]:
scaler1 = StandardScaler()
scaler1.fit(X_train)

StandardScaler()

In [7]:
X_scaled = scaler1.transform(X_train)

In [8]:
print(X_train.min(axis=0))
print(X_train.max(axis=0))
print(X_scaled.min(axis=0))
print(X_scaled.max(axis=0))

[6.981e+00 9.710e+00 4.379e+01 1.435e+02 5.263e-02 1.938e-02 0.000e+00
 0.000e+00 1.060e-01 4.996e-02 1.115e-01 3.628e-01 7.570e-01 7.228e+00
 1.713e-03 2.252e-03 0.000e+00 0.000e+00 7.882e-03 8.948e-04 7.930e+00
 1.202e+01 5.041e+01 1.852e+02 7.117e-02 2.729e-02 0.000e+00 0.000e+00
 1.565e-01 5.504e-02]
[2.811e+01 3.381e+01 1.885e+02 2.501e+03 1.447e-01 3.114e-01 4.268e-01
 2.012e-01 3.040e-01 9.744e-02 2.873e+00 4.885e+00 2.198e+01 5.422e+02
 2.333e-02 1.064e-01 3.960e-01 5.279e-02 6.146e-02 2.984e-02 3.604e+01
 4.954e+01 2.512e+02 4.254e+03 2.226e-01 1.058e+00 1.252e+00 2.903e-01
 6.638e-01 2.075e-01]
[-2.02304051 -2.31265646 -1.98102078 -1.43031247 -3.16043159 -1.66502629
 -1.13037758 -1.26748337 -2.69245547 -1.84928844 -1.02253911 -1.55688818
 -1.01199829 -0.69064562 -1.86789242 -1.34651102 -1.00915683 -1.98136529
 -1.53238992 -1.07259928 -1.71542529 -2.2481009  -1.68161632 -1.19940347
 -2.67791268 -1.48658803 -1.36015587 -1.75887319 -2.10621594 -1.60344965]
[ 3.93179406  3.540015

### 2. RobustScaler
- 모든 feature가 같은 크기를 같는다는 점에서 standardscaler와 비슷하지만, 평균이나 분산 대신에 median과 quartile(4분위수)을 사용
- 이상치의 영향을 받지 않는다

In [9]:
from sklearn.preprocessing import RobustScaler

scaler2 = RobustScaler()
X_scaled2 = scaler2.fit_transform(X_train)

In [10]:
print(X_train.min(axis=0))
print(X_train.max(axis=0))
print(X_scaled2.min(axis=0))
print(X_scaled2.max(axis=0))

[6.981e+00 9.710e+00 4.379e+01 1.435e+02 5.263e-02 1.938e-02 0.000e+00
 0.000e+00 1.060e-01 4.996e-02 1.115e-01 3.628e-01 7.570e-01 7.228e+00
 1.713e-03 2.252e-03 0.000e+00 0.000e+00 7.882e-03 8.948e-04 7.930e+00
 1.202e+01 5.041e+01 1.852e+02 7.117e-02 2.729e-02 0.000e+00 0.000e+00
 1.565e-01 5.504e-02]
[2.811e+01 3.381e+01 1.885e+02 2.501e+03 1.447e-01 3.114e-01 4.268e-01
 2.012e-01 3.040e-01 9.744e-02 2.873e+00 4.885e+00 2.198e+01 5.422e+02
 2.333e-02 1.064e-01 3.960e-01 5.279e-02 6.146e-02 2.984e-02 3.604e+01
 4.954e+01 2.512e+02 4.254e+03 2.226e-01 1.058e+00 1.252e+00 2.903e-01
 6.638e-01 2.075e-01]
[-1.58561686 -1.67896679 -1.5013689  -1.16660726 -2.30881961 -1.19057491
 -0.63597464 -0.62177215 -2.15384615 -1.44313968 -0.8600647  -1.18226678
 -0.87312212 -0.61824912 -1.55668617 -0.97626846 -1.05423348 -1.56999564
 -1.36098175 -0.97051424 -1.18334735 -1.63681438 -1.13928636 -0.87578375
 -2.036      -0.98594566 -0.8783887  -1.02631852 -1.7636235  -1.22475089]
[ 3.65406076  2.767527

### 3. MinMaxscaler
- 모든 feature가 0과 1사이에 위치하게 만듬
- 데이터가 2차원 set일 경우, 모든 데이터는 x축의 0과 1 사이에 y축의 0과 1 사이에 위치

In [11]:
from sklearn.preprocessing import MinMaxScaler

In [12]:
scaler3 = MinMaxScaler()
X_scaled3 = scaler3.fit_transform(X_train)

In [13]:
print(X_train.min(axis=0))
print(X_train.max(axis=0))
print(X_scaled3.min(axis=0))
print(X_scaled3.max(axis=0))

[6.981e+00 9.710e+00 4.379e+01 1.435e+02 5.263e-02 1.938e-02 0.000e+00
 0.000e+00 1.060e-01 4.996e-02 1.115e-01 3.628e-01 7.570e-01 7.228e+00
 1.713e-03 2.252e-03 0.000e+00 0.000e+00 7.882e-03 8.948e-04 7.930e+00
 1.202e+01 5.041e+01 1.852e+02 7.117e-02 2.729e-02 0.000e+00 0.000e+00
 1.565e-01 5.504e-02]
[2.811e+01 3.381e+01 1.885e+02 2.501e+03 1.447e-01 3.114e-01 4.268e-01
 2.012e-01 3.040e-01 9.744e-02 2.873e+00 4.885e+00 2.198e+01 5.422e+02
 2.333e-02 1.064e-01 3.960e-01 5.279e-02 6.146e-02 2.984e-02 3.604e+01
 4.954e+01 2.512e+02 4.254e+03 2.226e-01 1.058e+00 1.252e+00 2.903e-01
 6.638e-01 2.075e-01]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0.]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1.]


### 4. normalizer
- standardscaler, robustscaler, minmaxscaler : 각 컬럼별(feature별)로 스케일을 조절하는게 아니라 행(row)별로 정규화
- normalizer : 빠르게 학습, 과대적합 확률을 낮출 수 있음(모든 scaler 장점)

In [14]:
from sklearn.preprocessing import Normalizer

In [16]:
scaler4 = Normalizer()
scaler4.fit(X_train)
X_scaled4 = scaler4.transform(X_train)

In [17]:
print(X_train.min(axis=0))
print(X_train.max(axis=0))
print(X_scaled4.min(axis=0))
print(X_scaled4.max(axis=0))

[6.981e+00 9.710e+00 4.379e+01 1.435e+02 5.263e-02 1.938e-02 0.000e+00
 0.000e+00 1.060e-01 4.996e-02 1.115e-01 3.628e-01 7.570e-01 7.228e+00
 1.713e-03 2.252e-03 0.000e+00 0.000e+00 7.882e-03 8.948e-04 7.930e+00
 1.202e+01 5.041e+01 1.852e+02 7.117e-02 2.729e-02 0.000e+00 0.000e+00
 1.565e-01 5.504e-02]
[2.811e+01 3.381e+01 1.885e+02 2.501e+03 1.447e-01 3.114e-01 4.268e-01
 2.012e-01 3.040e-01 9.744e-02 2.873e+00 4.885e+00 2.198e+01 5.422e+02
 2.333e-02 1.064e-01 3.960e-01 5.279e-02 6.146e-02 2.984e-02 3.604e+01
 4.954e+01 2.512e+02 4.254e+03 2.226e-01 1.058e+00 1.252e+00 2.903e-01
 6.638e-01 2.075e-01]
[5.51189319e-03 4.56825580e-03 3.63962422e-02 3.76233264e-01
 2.17902707e-05 2.37805701e-05 0.00000000e+00 0.00000000e+00
 4.14296567e-05 1.13032004e-05 1.38230416e-04 2.22630519e-04
 8.95949678e-04 1.14342671e-02 1.17600717e-06 2.76335624e-06
 0.00000000e+00 0.00000000e+00 3.33747542e-06 5.12412074e-07
 7.24466195e-03 5.15445862e-03 5.04955350e-02 6.96047105e-01
 2.72780418e-05 4.2112

모델에 스케일링된 데이터 적용

In [18]:
from sklearn.svm import SVC

In [19]:
svc = SVC()
svc.fit(X_train, y_train)

SVC()

In [20]:
print(svc.score(X_test, y_test))
print(svc.score(X_train, y_train))

0.9370629370629371
0.903755868544601


In [21]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [22]:
svc1 = SVC()
svc1.fit(X_train_scaled, y_train)

SVC()

In [24]:
print(svc1.score(X_test_scaled, y_test))
print(svc1.score(X_train_scaled, y_train))

0.965034965034965
0.9859154929577465


In [25]:
scaler1 = RobustScaler()
scaler1.fit(X_train)
X_train_scaled1 = scaler1.transform(X_train)
X_test_scaled1 = scaler1.transform(X_test)

In [26]:
svc2 = SVC()
svc2.fit(X_train_scaled1, y_train)

SVC()

In [27]:
print(svc1.score(X_test_scaled1, y_test))
print(svc1.score(X_train_scaled1, y_train))

0.9230769230769231
0.9647887323943662


In [32]:
scaler2 = MinMaxScaler()
X_train_scaled2 = scaler2.fit_transform(X_train)
X_test_scaled2 = scaler2.transform(X_test)

In [33]:
svc3 = SVC()
svc3.fit(X_train_scaled2, y_train)
print(svc3.score(X_test_scaled2, y_test))
print(svc3.score(X_train_scaled2, y_train))

0.972027972027972
0.9835680751173709


In [34]:
scaler3 = Normalizer()
X_train_scaled3 = scaler3.fit_transform(X_train)
X_test_scaled3 = scaler3.transform(X_test)

In [35]:
svc4 = SVC()
svc4.fit(X_train_scaled3, y_train)
print(svc4.score(X_test_scaled3, y_test))
print(svc4.score(X_train_scaled3, y_train))

0.8811188811188811
0.8943661971830986
