In [5]:
from sklearn import preprocessing 
import numpy as np

#原始数据
a = np.array([[10,2.7,3.6],[-100,5,-2],[120,20,40]],dtype=np.float64)
print(a)

[[  10.     2.7    3.6]
 [-100.     5.    -2. ]
 [ 120.    20.    40. ]]


In [6]:
#对数据进行标准化
a_scale = preprocessing.scale(a)
print(a_scale)

[[ 0.         -0.85170713 -0.55138018]
 [-1.22474487 -0.55187146 -0.852133  ]
 [ 1.22474487  1.40357859  1.40351318]]


In [18]:
#数据处理实例

from sklearn import datasets,preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

#读取数据
iris = datasets.load_iris()

In [19]:


#使用原始数据，不进行标准化处理
x = iris.data
y = iris.target

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.13,random_state=13)

#建立模型,训练模型
knn = KNeighborsClassifier().fit(x_train,y_train)

#正确率
print(knn.score(x_test,y_test))

0.85


In [20]:
#正太标准化：将每一个特征的数据分布化为标准正太分布

x = preprocessing.StandardScaler().fit_transform(x)
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.13,random_state=13)

#建立模型,训练模型
knn = KNeighborsClassifier().fit(x_train,y_train)

#正确率
print(knn.score(x_test,y_test))

0.9


In [21]:
#归一化：将数据缩放到某一个范围之内。默认是0到1之间

#方法1

x = preprocessing.MinMaxScaler().fit_transform(x)
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.13,random_state=13)

#建立模型,训练模型
knn = KNeighborsClassifier().fit(x_train,y_train)

#正确率
print(knn.score(x_test,y_test))



0.95


In [22]:
#方法2
x = preprocessing.minmax_scale(x)
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.13,random_state=13)

#建立模型,训练模型
knn = KNeighborsClassifier().fit(x_train,y_train)

#正确率
print(knn.score(x_test,y_test))

0.95


In [23]:
#异常值的处理
#异常值对于模型的训练会产生不好的影响
#另外，归一化方法对异常数据十分敏感，在进行归一化之前往往也需要去除异常数据

x = preprocessing.RobustScaler().fit_transform(x)

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.13,random_state=13)

#建立模型,训练模型
knn = KNeighborsClassifier().fit(x_train,y_train)

#正确率
print(knn.score(x_test,y_test))


0.85


In [24]:
#除最大数的绝对值，将数据放缩到[-1,1]之间
#擅长于处理稀疏矩阵

x = preprocessing.maxabs_scale(x)

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.13,random_state=13)

#建立模型,训练模型
knn = KNeighborsClassifier().fit(x_train,y_train)

#正确率
print(knn.score(x_test,y_test))



0.95


In [25]:
#处理缺失值

from sklearn.impute import SimpleImputer

x = SimpleImputer().fit_transform(x)

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.13,random_state=13)

#建立模型,训练模型
knn = KNeighborsClassifier().fit(x_train,y_train)

#正确率
print(knn.score(x_test,y_test))




0.95


In [27]:
#常规标准化
#数据分布的均值为0，方差为1.但不要求数据分布为正太分布
x = preprocessing.scale(x)

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.13,random_state=13)

#建立模型,训练模型
knn = KNeighborsClassifier().fit(x_train,y_train)

#正确率
print(knn.score(x_test,y_test))

0.9
