### 最大值最小值归一化

In [1]:
import numpy as np

# Two features on very different scales: X_1 in [1, 10), X_2 in [100, 300).
# (randint's upper bound is exclusive.)
X_1 = np.random.randint(low=1, high=10, size=10)
X_2 = np.random.randint(low=100, high=300, size=10)

# Place the two 1-D feature vectors side by side as the columns of a (10, 2) matrix.
X = np.column_stack((X_1, X_2))
X

array([[  3, 229],
       [  2, 228],
       [  9, 237],
       [  1, 180],
       [  8, 242],
       [  6, 162],
       [  2, 294],
       [  6, 218],
       [  5, 248],
       [  4, 102]])

In [2]:
np.min(X, axis=0)  # minimum of each column (axis=0 reduces over the rows)

array([  1, 102])

In [3]:
# Min-max normalization by hand: (x - min) / (max - min), computed per column.
# np.ptp ("peak to peak") is exactly max - min along the given axis.
(X - X.min(axis=0)) / np.ptp(X, axis=0)

array([[0.25      , 0.66145833],
       [0.125     , 0.65625   ],
       [1.        , 0.703125  ],
       [0.        , 0.40625   ],
       [0.875     , 0.72916667],
       [0.625     , 0.3125    ],
       [0.125     , 1.        ],
       [0.625     , 0.60416667],
       [0.5       , 0.76041667],
       [0.375     , 0.        ]])

In [4]:
from sklearn.preprocessing import MinMaxScaler
# Same scaling via scikit-learn; (0, 1) is the default target range, spelled out here.
# The result should match the manual formula.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit_transform(X)

array([[0.25      , 0.66145833],
       [0.125     , 0.65625   ],
       [1.        , 0.703125  ],
       [0.        , 0.40625   ],
       [0.875     , 0.72916667],
       [0.625     , 0.3125    ],
       [0.125     , 1.        ],
       [0.625     , 0.60416667],
       [0.5       , 0.76041667],
       [0.375     , 0.        ]])

In [5]:
# Blow up the last row's second feature by a factor of 1000 to simulate an outlier.
# NOTE: X is modified in place, so re-running this cell multiplies the value again.
X[-1, -1] = X[-1, -1] * 1000
X

array([[     3,    229],
       [     2,    228],
       [     9,    237],
       [     1,    180],
       [     8,    242],
       [     6,    162],
       [     2,    294],
       [     6,    218],
       [     5,    248],
       [     4, 102000]])

In [6]:
# Refit on the modified data: the single large value becomes 1.0 and
# squeezes every other entry of the second column close to 0.
scaler.fit_transform(X)

array([[2.50000000e-01, 6.57907657e-04],
       [1.25000000e-01, 6.48088140e-04],
       [1.00000000e+00, 7.36463795e-04],
       [0.00000000e+00, 1.76751311e-04],
       [8.75000000e-01, 7.85561382e-04],
       [6.25000000e-01, 0.00000000e+00],
       [1.25000000e-01, 1.29617628e-03],
       [6.25000000e-01, 5.49892967e-04],
       [5.00000000e-01, 8.44478485e-04],
       [3.75000000e-01, 1.00000000e+00]])

#### 演示离群点

In [7]:
# Overwrite one entry of the second column with an extreme outlier (modifies X in place).
X[6, 1] = 123_456_789
X

array([[        3,       229],
       [        2,       228],
       [        9,       237],
       [        1,       180],
       [        8,       242],
       [        6,       162],
       [        2, 123456789],
       [        6,       218],
       [        5,       248],
       [        4,    102000]])

In [9]:
# Manual min-max normalization on the data that now contains the outlier.
col_min = X.min(axis=0)
col_range = X.max(axis=0) - col_min
X_norm = (X - col_min) / col_range
X_norm

array([[2.50000000e-01, 5.42700717e-07],
       [1.25000000e-01, 5.34600706e-07],
       [1.00000000e+00, 6.07500803e-07],
       [0.00000000e+00, 1.45800193e-07],
       [8.75000000e-01, 6.48000856e-07],
       [6.25000000e-01, 0.00000000e+00],
       [1.25000000e-01, 1.00000000e+00],
       [6.25000000e-01, 4.53600599e-07],
       [5.00000000e-01, 6.96600920e-07],
       [3.75000000e-01, 8.24888890e-04]])

In [10]:
# At two decimals every non-outlier value in the second column collapses to 0:
# a single outlier dominates min-max scaling.
np.round(X_norm, 2)

array([[0.25, 0.  ],
       [0.12, 0.  ],
       [1.  , 0.  ],
       [0.  , 0.  ],
       [0.88, 0.  ],
       [0.62, 0.  ],
       [0.12, 1.  ],
       [0.62, 0.  ],
       [0.5 , 0.  ],
       [0.38, 0.  ]])

### Z-score标准化【正态分布】

In [5]:
# Fresh random sample for the Z-score section; this rebinds X_1, X_2 and X.
X_1 = np.random.randint(low=1, high=10, size=10)
X_2 = np.random.randint(low=100, high=300, size=10)

# Stack the two feature vectors as columns of a (10, 2) matrix.
X = np.column_stack((X_1, X_2))
X

array([[  9, 216],
       [  9, 116],
       [  7, 121],
       [  8, 286],
       [  9, 288],
       [  9, 239],
       [  5, 210],
       [  4, 255],
       [  4, 142],
       [  7, 260]])

In [6]:
# Z-score standardization by hand: subtract the column mean, divide by the column std.
# ddof=0 (population standard deviation) is NumPy's default, written out explicitly.
col_mean = X.mean(axis=0)
col_std = X.std(axis=0, ddof=0)
X2 = (X - col_mean) / col_std
X2

array([[ 0.96333824,  0.04349518],
       [ 0.96333824, -1.56743749],
       [-0.05070201, -1.48689086],
       [ 0.45631811,  1.17114805],
       [ 0.96333824,  1.20336671],
       [ 0.96333824,  0.4140097 ],
       [-1.06474227, -0.05316078],
       [-1.57176239,  0.67175892],
       [-1.57176239, -1.148595  ],
       [-0.05070201,  0.75230556]])

In [7]:
# Sanity check: each standardized column should have mean 0 (up to floating-point error).
np.mean(X2, axis=0)

array([ 1.90819582e-16, -1.88737914e-16])

In [8]:
# Sanity check: each standardized column should have standard deviation 1.
np.std(X2, axis=0)

array([1., 1.])

In [9]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Follows the same fit-then-apply pattern used by scikit-learn models.
scaler.fit(X) # "training" step: learns the per-column mean and standard deviation from X

scaler.transform(X) # standardizes X with the learned mean/std (transform plays the role predict has for a model)

array([[ 0.96333824,  0.04349518],
       [ 0.96333824, -1.56743749],
       [-0.05070201, -1.48689086],
       [ 0.45631811,  1.17114805],
       [ 0.96333824,  1.20336671],
       [ 0.96333824,  0.4140097 ],
       [-1.06474227, -0.05316078],
       [-1.57176239,  0.67175892],
       [-1.57176239, -1.148595  ],
       [-0.05070201,  0.75230556]])

In [10]:
# fit and transform in a single call; gives the same result as calling fit(X) then transform(X).
scaler.fit_transform(X)

array([[ 0.96333824,  0.04349518],
       [ 0.96333824, -1.56743749],
       [-0.05070201, -1.48689086],
       [ 0.45631811,  1.17114805],
       [ 0.96333824,  1.20336671],
       [ 0.96333824,  0.4140097 ],
       [-1.06474227, -0.05316078],
       [-1.57176239,  0.67175892],
       [-1.57176239, -1.148595  ],
       [-0.05070201,  0.75230556]])

当数据既有训练数据 X_train 又有测试数据 X_test 时，只用训练数据拟合（fit）scaler，再用同一个 scaler 分别转换两份数据，避免测试集的信息泄漏到训练过程中：

scaler.fit(X_train) # 基准，先进行训练

数据转换：  
scaler.transform(X_train)  
scaler.transform(X_test)