# 数据预处理
## 数据无量纲化
无量纲化是指将不同规格的数据转换到统一规格，或者将不同分布的数据转换到某个分布的需求
在以梯度和矩阵为核心的算法中，譬如逻辑回归，支持向量机和神经网络，无量纲化可以加快求解速度；而在距离类模型，譬如K近邻，K-Means聚类中，无量纲化可以提升模型精度，避免某一个取值范围特别大的特征对距离计算造成影响。
一个特例是决策树和树的集成算法不需要无量纲化，决策树可以把任意数据都处理得很好。

线性的无量纲化包括中心化处理和缩放处理，中心化的本质是让所有记录减去一个固定值，即让数据样本数据平移到某个位置。缩放的本质是通过除以一个固定值，将数据固定在某个范围之中，取对数也算是一种缩放处理。

* preprocessing.MinMaxScaler
$$
x^*=\frac{x-\min (x)}{\max (x)-\min (x)}
$$

归一化后的数据服从正态分布

In [1]:
from sklearn.preprocessing import MinMaxScaler

data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]

#实现归一化
scaler = MinMaxScaler() #实例化
scaler = scaler.fit(data) #fit，这里本质是生成min(x)和max(x)
result = scaler.transform(data) #通过接口导出结果
result


array([[0.  , 0.  ],
       [0.25, 0.25],
       [0.5 , 0.5 ],
       [1.  , 1.  ]])

In [2]:
result_ = scaler.fit_transform(data) #训练和导出一步达成
scaler.inverse_transform(result_) #将归一化后的结果逆转



array([[-1. ,  2. ],
       [-0.5,  6. ],
       [ 0. , 10. ],
       [ 1. , 18. ]])

In [3]:
#使用MinMaxScaler的参数feature_range实现将数据归一化到[0,1]以外的范围中
data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
scaler = MinMaxScaler(feature_range=[5, 10]) #依然实例化
result = scaler.fit_transform(data)
result

array([[ 5.  ,  5.  ],
       [ 6.25,  6.25],
       [ 7.5 ,  7.5 ],
       [10.  , 10.  ]])

 当X中的特征数量非常多的时候，fit会报错并表示，数据量太大无法计算
 此时使用partial_fit作为训练接口
 `scaler = scaler.partial_fit(data)`

In [4]:
#使用numpy来实现归一化
import numpy as np
X = np.array([[-1, 2], [-0.5, 6], [0, 10], [1, 18]])

#归一化
X_nor = (X - X.min(axis = 0)) / (X.max(axis = 0) - X.min(axis = 0))

#逆转归一化
X_returned = X_nor * (X.max(axis = 0) - X.min(axis = 0)) + X.min(axis = 0)
X_returned

array([[-1. ,  2. ],
       [-0.5,  6. ],
       [ 0. , 10. ],
       [ 1. , 18. ]])

* preprocessing.StandardScaler
当数据按均值中心化后，再按标准差缩放，数据就会服从均值为0，方差为1的正态分布。
$$ x^* = \frac{x-\mu}{\sigma} $$

In [5]:
from sklearn.preprocessing import StandardScaler
data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]

scaler = StandardScaler()
scaler.fit(data)

scaler.mean_
scaler.var_

x_std = scaler.transform(data)

x_std.mean()
x_std.std()

scaler.fit_transform(data)

scaler.inverse_transform(x_std)


array([[-1. ,  2. ],
       [-0.5,  6. ],
       [ 0. , 10. ],
       [ 1. , 18. ]])

In [6]:
import pandas as pd
data = pd.read_csv(r"./result.csv"
#                   ,index_col = 0
                  )
data.head()

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,No
1,38.0,female,C,Yes
2,26.0,female,S,Yes
3,35.0,female,S,Yes
4,35.0,male,S,No


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       714 non-null    float64
 1   Sex       891 non-null    object 
 2   Embarked  889 non-null    object 
 3   Survived  891 non-null    object 
dtypes: float64(1), object(3)
memory usage: 28.0+ KB


In [8]:
Age = data.loc[:,"Age"].values.reshape(-1,1) #sklearn当中特征矩阵必须是二维
Age[:20]

array([[22.],
       [38.],
       [26.],
       [35.],
       [35.],
       [nan],
       [54.],
       [ 2.],
       [27.],
       [14.],
       [ 4.],
       [58.],
       [20.],
       [39.],
       [14.],
       [55.],
       [ 2.],
       [nan],
       [31.],
       [nan]])

In [9]:
from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer() #实例化，默认均值填补
imp_median = SimpleImputer(strategy="median") #用中位数填补
imp_0 = SimpleImputer(strategy="constant", fill_value=0) #用0填补

In [10]:
imp_mean = imp_mean.fit_transform(Age)
imp_mean[:20]

array([[22.        ],
       [38.        ],
       [26.        ],
       [35.        ],
       [35.        ],
       [29.69911765],
       [54.        ],
       [ 2.        ],
       [27.        ],
       [14.        ],
       [ 4.        ],
       [58.        ],
       [20.        ],
       [39.        ],
       [14.        ],
       [55.        ],
       [ 2.        ],
       [29.69911765],
       [31.        ],
       [29.69911765]])

In [11]:
imp_median = imp_median.fit_transform(Age)
imp_median[:20]

array([[22.],
       [38.],
       [26.],
       [35.],
       [35.],
       [28.],
       [54.],
       [ 2.],
       [27.],
       [14.],
       [ 4.],
       [58.],
       [20.],
       [39.],
       [14.],
       [55.],
       [ 2.],
       [28.],
       [31.],
       [28.]])

In [12]:
imp_0 = imp_0.fit_transform(Age)
imp_0[:20]

array([[22.],
       [38.],
       [26.],
       [35.],
       [35.],
       [ 0.],
       [54.],
       [ 2.],
       [27.],
       [14.],
       [ 4.],
       [58.],
       [20.],
       [39.],
       [14.],
       [55.],
       [ 2.],
       [ 0.],
       [31.],
       [ 0.]])

In [13]:
data.loc[:,"Age"] = imp_median

In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       891 non-null    float64
 1   Sex       891 non-null    object 
 2   Embarked  889 non-null    object 
 3   Survived  891 non-null    object 
dtypes: float64(1), object(3)
memory usage: 28.0+ KB


In [15]:
#使用众数填充Embarked
Embarked = data.loc[:,"Embarked"].values.reshape(-1,1)
imp_mode = SimpleImputer(strategy="most_frequent")
data.loc[:,"Embarked"] = imp_mode.fit_transform(Embarked)

In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       891 non-null    float64
 1   Sex       891 non-null    object 
 2   Embarked  891 non-null    object 
 3   Survived  891 non-null    object 
dtypes: float64(1), object(3)
memory usage: 28.0+ KB


In [17]:
import pandas as pd
data = pd.read_csv(r"./result.csv"
#                   ,index_col = 0
                  )

data.loc[:,"Age"] = data.loc[:,"Age"].fillna(data.loc[:,"Age"].median())
# .fillna在DataFrame里面直接进行填补


data.dropna(axis=0, inplace = True)
#.dropna(axis=O)删除所有有缺失值的行，.dropna(axis=1)删除所有有缺失值的列
#参数inplace，为True表示在原数据集上进行修改，为False表示生成一个复制对象，不修改原数据，默认False


In [18]:
data

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,No
1,38.0,female,C,Yes
2,26.0,female,S,Yes
3,35.0,female,S,Yes
4,35.0,male,S,No
...,...,...,...,...
886,27.0,male,S,No
887,19.0,female,S,Yes
888,28.0,female,S,No
889,26.0,male,C,Yes


In [19]:
#对数据进行编码
from sklearn.preprocessing import LabelEncoder
y = data.iloc[:,-1] #要输入的是标签，不是特征矩阵，所以允许一维

In [20]:
le = LabelEncoder() #实例化
le = le.fit(y) #导入数据
label = le.transform(y)

In [21]:
label

array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,

In [22]:
le.classes_ #查看标签中有多少类别

array(['No', 'Yes'], dtype=object)

In [23]:
data.iloc[:,-1] = label #让标签等于运行出来的结果
data.head(20)

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,0
1,38.0,female,C,1
2,26.0,female,S,1
3,35.0,female,S,1
4,35.0,male,S,0
5,28.0,male,Q,0
6,54.0,male,S,0
7,2.0,male,S,0
8,27.0,female,S,1
9,14.0,female,C,1


In [24]:
from sklearn.preprocessing import LabelEncoder
data.iloc[:,-1] = LabelEncoder().fit_transform(data.iloc[:,-1])

In [25]:
data

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,0
1,38.0,female,C,1
2,26.0,female,S,1
3,35.0,female,S,1
4,35.0,male,S,0
...,...,...,...,...
886,27.0,male,S,0
887,19.0,female,S,1
888,28.0,female,S,0
889,26.0,male,C,1


In [26]:
from sklearn.preprocessing import OrdinalEncoder
data_ = data.copy()
OrdinalEncoder().fit(data_.iloc[:,1:-1]).categories_

[array(['female', 'male'], dtype=object), array(['C', 'Q', 'S'], dtype=object)]

In [27]:
data_.iloc[:,1:-1] = OrdinalEncoder().fit_transform(data_.iloc[:,1:-1])
data_.head()

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,1.0,2.0,0
1,38.0,0.0,0.0,1
2,26.0,0.0,2.0,1
3,35.0,0.0,2.0,1
4,35.0,1.0,2.0,0


In [28]:
#对于舱门这类数据不能通过单一数字来进行编码
#要使用独热编码
from sklearn.preprocessing import OneHotEncoder
X = data.iloc[:,1:-1]

In [29]:
enc = OneHotEncoder(categories='auto').fit(X)
result = enc.transform(X).toarray()

In [30]:
result

array([[0., 1., 0., 0., 1.],
       [1., 0., 1., 0., 0.],
       [1., 0., 0., 0., 1.],
       ...,
       [1., 0., 0., 0., 1.],
       [0., 1., 1., 0., 0.],
       [0., 1., 0., 1., 0.]])

In [31]:
enc.get_feature_names()

array(['x0_female', 'x0_male', 'x1_C', 'x1_Q', 'x1_S'], dtype=object)

In [32]:
result.shape

(889, 5)

In [33]:
newdata = pd.concat([data, pd.DataFrame(result)], axis = 1)
newdata.head()

Unnamed: 0,Age,Sex,Embarked,Survived,0,1,2,3,4
0,22.0,male,S,0.0,0.0,1.0,0.0,0.0,1.0
1,38.0,female,C,1.0,1.0,0.0,1.0,0.0,0.0
2,26.0,female,S,1.0,1.0,0.0,0.0,0.0,1.0
3,35.0,female,S,1.0,1.0,0.0,0.0,0.0,1.0
4,35.0,male,S,0.0,0.0,1.0,0.0,0.0,1.0


In [34]:
newdata.drop(["Sex","Embarked"], axis = 1, inplace = True)
newdata.columns = ["Age", "Survived", "Female", "Male", "Embarked_C", "Embarked_Q", "Embarked_S"]

In [35]:
newdata.head()

Unnamed: 0,Age,Survived,Female,Male,Embarked_C,Embarked_Q,Embarked_S
0,22.0,0.0,0.0,1.0,0.0,0.0,1.0
1,38.0,1.0,1.0,0.0,1.0,0.0,0.0
2,26.0,1.0,1.0,0.0,0.0,0.0,1.0
3,35.0,1.0,1.0,0.0,0.0,0.0,1.0
4,35.0,0.0,0.0,1.0,0.0,0.0,1.0


## 处理连续型特征：二值化与分段
* sklearn.preprocessing.Binarizer
根据阈值将数据二值化，用于处理连续型变量，大于阈值的值映射为1，小于阈值的值映射为0。二值化可以决定仅考虑某种现象的存在与否。

In [36]:
data_2 = data.copy()

In [37]:
from sklearn.preprocessing import Binarizer
X = data_2.iloc[:,0].values.reshape(-1,1)

In [39]:
transformer = Binarizer(threshold=30).fit_transform(X)

In [40]:
from sklearn.preprocessing import KBinsDiscretizer

x = data.iloc[:,0].values.reshape(-1, 1)
est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
est.fit_transform(X)

#查看转换后分的箱，变成了一列中的三箱
set(est.fit_transform(X).ravel())

est = KBinsDiscretizer(n_bins=3, encode = 'onehot', strategy='uniform')

#查看转换后分的箱，变成了哑变量
est.fit_transform(X).toarray()


array([[1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       ...,
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.]])