# 数据挖掘 - 特征工程
> 数据和特征决定了机器学习的上限，而模型和算法只是逼近这个上限而已。  

- 流程：
<img style="float: center;" src="./figures/特征工程.png" width="50%">
<center>Fig. 特征工程</center>


## 数据清洗
- 数据样本抽样
    - 样本要具有代表性
    - 样本比例要平衡、样本不平衡时如何处理
    - 考虑全量数据

- 异常值（空值）处理
    - 识别异常值和重复值
        - Pandas: isnull()/duplicated()
    - 直接丢弃（包括重复数据）
        - Pandas: drop()/dropna()/drop_duplicates()
    - 将是否有异常当作一个新的属性，替代原值
        - Pandas: fillna()
    - 集中值指代
        - Pandas: fillna()
    - 边界值指代
        - Pandas: fillna()
    - 插值
        - Pandas: interpolate()---Series


In [1]:
import numpy as np # 数值计算
import pandas as pd # 数据分析

In [2]:
df = pd.DataFrame({
    "A": ["a0", "a1", "a1", "a2", "a3", "a4"],
    "B": ["b0", "b1", "b2", "b2", "b3", None],
    "C": [1, 2, None, 3, 4, 5],
    "D": [0.1, 10.2, 11.4, 8.9, 9.1, 12],
    "E": [10, 19, 32, 25, 8, None],
    "F": ["f0", "f1", "g2", "f3", "f4", "f5"]
})
df

Unnamed: 0,A,B,C,D,E,F
0,a0,b0,1.0,0.1,10.0,f0
1,a1,b1,2.0,10.2,19.0,f1
2,a1,b2,,11.4,32.0,g2
3,a2,b2,3.0,8.9,25.0,f3
4,a3,b3,4.0,9.1,8.0,f4
5,a4,,5.0,12.0,,f5


In [3]:
df.isnull()  # Boolean mask shaped like the frame: True marks a null cell

Unnamed: 0,A,B,C,D,E,F
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,True,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,False
5,False,True,False,False,True,False


In [4]:
df.dropna() # 丢弃空值

Unnamed: 0,A,B,C,D,E,F
0,a0,b0,1.0,0.1,10.0,f0
1,a1,b1,2.0,10.2,19.0,f1
3,a2,b2,3.0,8.9,25.0,f3
4,a3,b3,4.0,9.1,8.0,f4


In [5]:
df.dropna(subset=["B"]) # 丢弃"B"中的空值

Unnamed: 0,A,B,C,D,E,F
0,a0,b0,1.0,0.1,10.0,f0
1,a1,b1,2.0,10.2,19.0,f1
2,a1,b2,,11.4,32.0,g2
3,a2,b2,3.0,8.9,25.0,f3
4,a3,b3,4.0,9.1,8.0,f4


In [6]:
df.duplicated(["A"])  # Boolean Series: True where the value in "A" already appeared in an earlier row

0    False
1    False
2     True
3    False
4    False
5    False
dtype: bool

In [7]:
df.drop_duplicates(["A"]) # 丢弃"A"中的重复值

Unnamed: 0,A,B,C,D,E,F
0,a0,b0,1.0,0.1,10.0,f0
1,a1,b1,2.0,10.2,19.0,f1
3,a2,b2,3.0,8.9,25.0,f3
4,a3,b3,4.0,9.1,8.0,f4
5,a4,,5.0,12.0,,f5


In [8]:
# Fills every null cell in the frame -- not only those in "E" -- with the mean
# of column "E"; the string column "B" receives that number as well.
# NOTE(review): df.fillna({"E": df["E"].mean()}) would restrict the fill to "E".
df.fillna(df["E"].mean())

Unnamed: 0,A,B,C,D,E,F
0,a0,b0,1.0,0.1,10.0,f0
1,a1,b1,2.0,10.2,19.0,f1
2,a1,b2,18.8,11.4,32.0,g2
3,a2,b2,3.0,8.9,25.0,f3
4,a3,b3,4.0,9.1,8.0,f4
5,a4,18.8,5.0,12.0,18.8,f5


In [9]:
df["E"].interpolate()  # Fill nulls in the Series "E" by interpolation (default method); the trailing null at index 5 comes out as 8.0, the last valid value

0    10.0
1    19.0
2    32.0
3    25.0
4     8.0
5     8.0
Name: E, dtype: float64

In [10]:
pd.Series([1,None,4,5,20]).interpolate()

0     1.0
1     2.5
2     4.0
3     5.0
4    20.0
dtype: float64

In [11]:
df["E"].interpolate(method="spline", order=3)  # Order-3 spline interpolation; here the trailing null is extrapolated to about -20.14

0    10.000000
1    19.000000
2    32.000000
3    25.000000
4     8.000000
5   -20.143603
Name: E, dtype: float64

In [12]:
# Remove the outlier 0.1 from column "D" with the interquartile-range rule:
# keep only values strictly inside (Q1 - k*IQR, Q3 + k*IQR).
upper_q = df["D"].quantile(0.75)  # upper quartile (Q3)
lower_q = df["D"].quantile(0.25)  # lower quartile (Q1)
q_int = upper_q - lower_q  # interquartile range
k = 1.5
# Combine both bounds into ONE mask with `&`. Chaining two selections,
# df[m1][m2], applies a full-length mask to an already-filtered frame, which
# forces pandas to reindex the mask and emit a UserWarning.
df[(df["D"] > lower_q - k * q_int) & (df["D"] < upper_q + k * q_int)]

  df[df["D"]>lower_q-k*q_int][df["D"]<upper_q+k*q_int]


Unnamed: 0,A,B,C,D,E,F
1,a1,b1,2.0,10.2,19.0,f1
2,a1,b2,,11.4,32.0,g2
3,a2,b2,3.0,8.9,25.0,f3
4,a3,b3,4.0,9.1,8.0,f4
5,a4,,5.0,12.0,,f5


In [13]:
# Keep the rows whose "F" value starts with "f", which drops the outlier "g2".
# The vectorized string accessor replaces the hand-written list of booleans;
# na=False treats a null entry as "no match" instead of failing on it.
df[df["F"].str.startswith("f", na=False)]

Unnamed: 0,A,B,C,D,E,F
0,a0,b0,1.0,0.1,10.0,f0
1,a1,b1,2.0,10.2,19.0,f1
3,a2,b2,3.0,8.9,25.0,f3
4,a3,b3,4.0,9.1,8.0,f4
5,a4,,5.0,12.0,,f5


## 特征选择
> 特征选择中使用一些样本进行评估，在正式建模中使用全量数据  

- 剔除与标注不相关或者冗余的特征
<img style="float: center;" src="./figures/数据归约.png" width="50%">
<center>数据归约</center>
- 数据归约的思路之一（另一个思路为抽样）
    - 过滤思想
    - 包裹思想
    - 嵌入思想
    <img style="float: center;" src="./figures/包裹思想.png" width="50%">
    <center>Fig. 包裹思想</center>

    <img style="float: center;" src="./figures/RFE算法.png" width="50%">
    <center>Fig. RFE算法</center>

    <img style="float: center;" src="./figures/嵌入思想.png" width="50%">
    <center>Fig. 嵌入思想</center>



In [14]:
## 导包
import numpy as np # 数值计算
import pandas as pd # 数据分析
import scipy.stats as ss # 统计分析

In [15]:
df = pd.DataFrame({
    "A": ss.norm.rvs(size=10),
    "B": ss.norm.rvs(size=10),
    "C": ss.norm.rvs(size=10),
    "D": np.random.randint(low=0, high=2, size=10)
})
df

Unnamed: 0,A,B,C,D
0,0.543411,1.853274,0.208446,0
1,0.561628,0.592413,0.083393,1
2,-0.042901,1.678708,0.769801,0
3,0.355007,0.110546,-1.15488,0
4,-1.421442,0.227605,0.264522,0
5,0.131963,0.554369,-0.293429,1
6,-0.912573,0.519707,-0.494365,1
7,0.700048,-0.860755,0.19806,0
8,-0.180716,-0.59245,-2.266915,1
9,0.168361,-0.426018,0.71856,0


In [16]:
from sklearn.svm import SVR # SVR 回归器
from sklearn.tree import DecisionTreeRegressor # 决策树回归器

In [17]:
X = df.loc[:, ["A", "B", "C"]] # 特征
Y = df.loc[:, "D"] # 标注

In [18]:
from sklearn.feature_selection import SelectKBest,RFE,SelectFromModel # 特征选择：过滤、包裹、嵌入

In [19]:
skb = SelectKBest(k=2) # 
skb.fit(X, Y) # 拟合

SelectKBest(k=2)

In [20]:
skb.transform(X) # 变换

array([[ 0.54341107,  0.20844591],
       [ 0.56162796,  0.08339278],
       [-0.04290139,  0.76980055],
       [ 0.3550067 , -1.15488044],
       [-1.42144151,  0.26452209],
       [ 0.13196307, -0.29342912],
       [-0.91257253, -0.4943652 ],
       [ 0.7000484 ,  0.19805953],
       [-0.18071642, -2.2669152 ],
       [ 0.16836105,  0.71855991]])

In [21]:
df

Unnamed: 0,A,B,C,D
0,0.543411,1.853274,0.208446,0
1,0.561628,0.592413,0.083393,1
2,-0.042901,1.678708,0.769801,0
3,0.355007,0.110546,-1.15488,0
4,-1.421442,0.227605,0.264522,0
5,0.131963,0.554369,-0.293429,1
6,-0.912573,0.519707,-0.494365,1
7,0.700048,-0.860755,0.19806,0
8,-0.180716,-0.59245,-2.266915,1
9,0.168361,-0.426018,0.71856,0


In [22]:
rfe = RFE(estimator=SVR(kernel="linear"), n_features_to_select=2, step=1) # 
rfe.fit_transform(X, Y) # 拟合、变换

array([[ 0.54341107,  0.20844591],
       [ 0.56162796,  0.08339278],
       [-0.04290139,  0.76980055],
       [ 0.3550067 , -1.15488044],
       [-1.42144151,  0.26452209],
       [ 0.13196307, -0.29342912],
       [-0.91257253, -0.4943652 ],
       [ 0.7000484 ,  0.19805953],
       [-0.18071642, -2.2669152 ],
       [ 0.16836105,  0.71855991]])

In [23]:
sfm = SelectFromModel(estimator=DecisionTreeRegressor(), threshold=0.1) # 
sfm.fit_transform(X, Y) # 拟合、变换

array([[ 0.54341107,  1.85327409,  0.20844591],
       [ 0.56162796,  0.59241342,  0.08339278],
       [-0.04290139,  1.67870797,  0.76980055],
       [ 0.3550067 ,  0.11054614, -1.15488044],
       [-1.42144151,  0.22760544,  0.26452209],
       [ 0.13196307,  0.55436934, -0.29342912],
       [-0.91257253,  0.51970675, -0.4943652 ],
       [ 0.7000484 , -0.86075509,  0.19805953],
       [-0.18071642, -0.5924496 , -2.2669152 ],
       [ 0.16836105, -0.42601845,  0.71855991]])

## 特征变换
> 对指化、离散化、数据平滑、归一化（标准化）、数值化、正规化  

### 对指化
- 指数化（$e^x$）：numpy.exp
- 对数化（$\ln x$）：numpy.log

### 离散化（分箱）
- 将连续变量分成几段（bins）
- 原因：克服数据缺陷、某些算法要求、非线性数据映射
- 方法：
    - 分箱：等频（等深）、等距（等宽）
    - 自因变量优化


In [30]:
lst = [6, 8, 10, 15, 16, 24, 25, 40, 67]
pd.qcut(lst, 3) # 等深分箱

[(5.999, 13.333], (5.999, 13.333], (5.999, 13.333], (13.333, 24.333], (13.333, 24.333], (13.333, 24.333], (24.333, 67.0], (24.333, 67.0], (24.333, 67.0]]
Categories (3, interval[float64]): [(5.999, 13.333] < (13.333, 24.333] < (24.333, 67.0]]

In [31]:
pd.qcut(lst, q=3, labels=["low", "medium", "high"]) # 等深分箱

['low', 'low', 'low', 'medium', 'medium', 'medium', 'high', 'high', 'high']
Categories (3, object): ['low' < 'medium' < 'high']

In [32]:
pd.cut(lst, bins=3) # 等宽分箱

[(5.939, 26.333], (5.939, 26.333], (5.939, 26.333], (5.939, 26.333], (5.939, 26.333], (5.939, 26.333], (5.939, 26.333], (26.333, 46.667], (46.667, 67.0]]
Categories (3, interval[float64]): [(5.939, 26.333] < (26.333, 46.667] < (46.667, 67.0]]

In [33]:
pd.cut(lst, bins=3, labels=["low", "medium", "high"]) # 等宽分箱

['low', 'low', 'low', 'low', 'low', 'low', 'low', 'medium', 'high']
Categories (3, object): ['low' < 'medium' < 'high']

### 归一化（Min-Max）
- $$x^{\prime} = \frac{x-x_{min}}{x_{max}-x_{min}}$$

### 标准化（Z-score）
- $$x^{\prime} = \frac{x-\bar{x}}{\sigma}$$


In [34]:
## 导包
from sklearn.preprocessing import MinMaxScaler,StandardScaler # 归一化、标准化

In [35]:
MinMaxScaler().fit_transform(np.array([1,4,10,15,21]).reshape(-1,1)) # 归一化

array([[0.  ],
       [0.15],
       [0.45],
       [0.7 ],
       [1.  ]])

In [36]:
StandardScaler().fit_transform(np.array([1,1,1,1,0,0,0,0]).reshape(-1,1)) # 标准化

array([[ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [-1.],
       [-1.],
       [-1.],
       [-1.]])

### 数值化
>  定类、定序、定距、定比  

- 标签化：定序、定类
- 归一化：定距
- 独热（One-Hot Encode）：定类、定序


In [37]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder # 标签化、独热

In [38]:
# Label encoding. LabelEncoder works on a 1-D array of labels, so the array is
# passed as-is; reshaping it to a column vector only triggers a conversion warning.
LabelEncoder().fit_transform(np.array(["Down", "Up", "Up", "Down"]))

  return f(*args, **kwargs)


array([0, 1, 1, 0], dtype=int64)

In [39]:
# Label encoding on a 1-D label array (no column-vector reshape needed).
# Codes follow the sorted order of the labels ("High" < "Low" < "Medium"),
# not the semantic Low < Medium < High order.
LabelEncoder().fit_transform(np.array(["Low", "Medium", "High", "Medium", "Low"]))

array([1, 2, 0, 2, 1], dtype=int64)

In [40]:
# One-hot encoding in two stages: strings -> integer codes -> indicator columns.
lb_encoder = LabelEncoder()
lb_tran_f = lb_encoder.fit_transform(np.array(["Red", "Yellow", "Blue", "Green"]))

# The one-hot encoder is fitted on the integer codes as a single-column matrix.
oht_encoder = OneHotEncoder()
oht_encoder.fit(lb_tran_f.reshape(-1, 1))

# The result is displayed as a sparse matrix; call .toarray() to see the values.
oht_encoder.transform(
    lb_encoder.transform(
        np.array(["Yellow", "Blue", "Green", "Green", "Red"])
    ).reshape(-1, 1)
)

<5x4 sparse matrix of type '<class 'numpy.float64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [41]:
oht_encoder.transform(lb_encoder.transform(np.array(["Yellow","Blue","Green","Green","Red"])).reshape(-1,1)).toarray()

array([[0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.]])

### 正规化（规范化）
- 公式：
    - L1: $$x_i^{\prime} = \frac{x_i}{\sum^{n}_{j=1}|x_j|}$$
    - L2: $$x_i^{\prime} = \frac{x_i}{\sqrt{\sum^{n}_{j=1}|x_j|^2}}$$
- 应用：
    - 直接用在特征上
    - 用在每个对象的各个特征的表示（特征矩阵的行）
    - 模型的参数上（回归模型使用较多）
    

In [42]:
from sklearn.preprocessing import Normalizer # 正规化

In [43]:
Normalizer(norm="l1").fit_transform(np.array([1,1,3,-1,2]).reshape(-1,1)) # 正规化（针对行）

array([[ 1.],
       [ 1.],
       [ 1.],
       [-1.],
       [ 1.]])

In [44]:
Normalizer(norm="l1").fit_transform(np.array([[1,1,3,-1,2]])) # 正规化（针对行）

array([[ 0.125,  0.125,  0.375, -0.125,  0.25 ]])

In [45]:
Normalizer(norm="l2").fit_transform(np.array([[1,1,3,-1,2]])) # 正规化（针对行）

array([[ 0.25,  0.25,  0.75, -0.25,  0.5 ]])

In [46]:
print(np.array([1,1,3,-1,2]).reshape(-1,1))
print(np.array([1,1,3,-1,2]))
print(np.array([[1,1,3,-1,2]]))

[[ 1]
 [ 1]
 [ 3]
 [-1]
 [ 2]]
[ 1  1  3 -1  2]
[[ 1  1  3 -1  2]]


## 特征降维
> PCA、LDA  

### LDA降维
- LDA降维（线性判别式分析，Linear Discriminant Analysis）：投影变换后同一标注内距离尽可能小；不同标注间距离尽可能大
    - 均值：$$\mu_0 = \frac{1}{n_0}\sum_{x\in X_0}x, \quad \mu_1 = \frac{1}{n_1}\sum_{x\in X_1}x$$
    - 最大化：
        $$J(\omega) = \frac{\|\omega^{T}\mu_0 - \omega^{T}\mu_1\|^2}{\omega^{T}\Sigma_0\omega + \omega^{T}\Sigma_1\omega} = \frac{\omega^{T}(\mu_0-\mu_1)(\mu_0-\mu_1)^{T}\omega}{\omega^{T}(\Sigma_0+\Sigma_1)\omega}, \quad \Sigma_i = \sum_{x\in X_i}(x-\mu_i)(x-\mu_i)^{T}$$
        $$\text{令 } S_b = (\mu_0-\mu_1)(\mu_0-\mu_1)^{T}, \quad S_w = \Sigma_0+\Sigma_1$$
        $$\Rightarrow \max J(\omega) = \frac{\omega^{T} S_b\,\omega}{\omega^{T} S_w\,\omega}$$


In [48]:
## 导包
import numpy as np # 数值计算
import pandas as pd # 数据分析

In [49]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis # LDA降维

In [50]:
X = np.array([[-1,-1],[-2,-1],[-3,-2],[1,1],[2,1],[3,2]])
Y = np.array([1,1,1,2,2,2])
LinearDiscriminantAnalysis(n_components=1).fit_transform(X, Y) # LDA 降维

array([[-1.73205081],
       [-1.73205081],
       [-3.46410162],
       [ 1.73205081],
       [ 1.73205081],
       [ 3.46410162]])

In [51]:
## Fisher 判别器（fisher classifier）
clf = LinearDiscriminantAnalysis(n_components=1).fit(X, Y) # 拟合
clf.predict([[0.8,1]]) # 判别器

array([2])

## 特征衍生
- 从用户角度衍生：用户购买习惯？购买次数？...
- 从商品角度衍生：商品特征？是否快消品？是否有季节影响？...
- 从关系角度衍生：某商品是否是用户常买的？用户一般会如何搭配？...


In [52]:
## 导包
import numpy as np # 数值计算
import pandas as pd # 数据分析

In [53]:
from sklearn.preprocessing import MinMaxScaler,StandardScaler # 归一化、标准化
from sklearn.preprocessing import LabelEncoder,OneHotEncoder # 标签化、独热编码
from sklearn.preprocessing import Normalizer #  
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis # LDA 降维
from sklearn.decomposition import PCA # PCA 降维

In [54]:
## 查看数据集
df = pd.read_csv(".//datasets//HR_comma_sep.csv")
df.head(3)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium


In [55]:
def hr_preprocessing(sl=False, le=False, npr=False, amh=False, tsc=False, wa=False, pl5=False, dp=False, slr=False, lower_d=False, ld_n=1):
    """Load the HR dataset and turn it into scaled / encoded features plus labels.

    Numeric-column flags (False -> MinMaxScaler, True -> StandardScaler):
        sl:  satisfaction_level
        le:  last_evaluation
        npr: number_project
        amh: average_monthly_hours
        tsc: time_spend_company
        wa:  Work_accident
        pl5: promotion_last_5years

    Categorical-column flags (False -> integer codes rescaled with
    MinMaxScaler, True -> one-hot columns via pd.get_dummies):
        dp:  Department (codes come from LabelEncoder)
        slr: salary (codes come from map_salary, which must be defined
             before this function is called)

    Dimensionality reduction:
        lower_d: when truthy, project the features with PCA. Default: False.
        ld_n:    number of components kept by PCA. Default: 1.

    Returns:
        (features, label). ``label`` is the "left" column. ``features`` is a
        DataFrame, or the array returned by PCA when ``lower_d`` is truthy.
    """
    ## 0. Load the data
    df = pd.read_csv(".//datasets//HR_comma_sep.csv")

    ## 1. Clean the data (drop nulls and abnormal rows; no sampling step here)
    df = df.dropna(subset=["satisfaction_level", "last_evaluation"])
    # Both conditions go into one mask: chaining df[m1][m2] would apply a
    # full-length mask to an already-filtered frame and force a reindex.
    valid_rows = (df["satisfaction_level"] <= 1) & (df["salary"] != "nme")
    df = df[valid_rows]

    ## 2. Split off the label
    label = df["left"]
    df = df.drop("left", axis=1)

    ## 3. Feature selection (intentionally left empty)

    ## 4. Feature transformation
    # Numeric columns: min-max scaling or standardization, chosen per column.
    numeric_flags = {
        "satisfaction_level": sl,
        "last_evaluation": le,
        "number_project": npr,
        "average_monthly_hours": amh,
        "time_spend_company": tsc,
        "Work_accident": wa,
        "promotion_last_5years": pl5,
    }
    for column, standardize in numeric_flags.items():
        scaler = StandardScaler() if standardize else MinMaxScaler()
        # Scalers expect a 2-D input; ravel() flattens the result back to 1-D.
        df[column] = scaler.fit_transform(df[column].values.reshape(-1, 1)).ravel()

    # Categorical columns: integer codes rescaled to [0, 1], or one-hot columns.
    categorical_flags = {"Department": dp, "salary": slr}
    for column, one_hot in categorical_flags.items():
        if one_hot:
            df = pd.get_dummies(df, columns=[column])
            continue
        if column == "salary":
            # salary has a natural order, so it uses the explicit mapping
            # instead of LabelEncoder's sorted-label codes.
            df[column] = [map_salary(s) for s in df["salary"].values]
        else:
            df[column] = LabelEncoder().fit_transform(df[column])
        df[column] = MinMaxScaler().fit_transform(df[column].values.reshape(-1, 1)).ravel()

    ## 5. Optional dimensionality reduction
    if lower_d:
        return PCA(n_components=ld_n).fit_transform(df.values), label

    return df, label



In [56]:
##
# Ordinal codes for the salary levels.
d = {"low": 0, "medium": 1, "high": 2}


def map_salary(s):
    """Return the ordinal code of salary level *s*; unknown levels map to 0."""
    if s in d:
        return d[s]
    return 0


In [57]:
## 
def main():
    """Run the preprocessing once and print the (features, label) result."""
    result = hr_preprocessing(sl=True, le=True, lower_d=False, ld_n=3)
    print(result)


if __name__ == "__main__":
    main()
    
    

(       satisfaction_level  last_evaluation  number_project  \
0               -0.936495        -1.087275             0.0   
1                0.752814         0.840707             0.6   
2               -2.022479         0.957554             1.0   
3                0.431041         0.899131             0.6   
4               -0.976716        -1.145699             0.0   
...                   ...              ...             ...   
14994           -0.856051        -0.853580             0.0   
14995           -0.976716        -1.379394             0.0   
14996           -0.976716        -1.087275             0.0   
14997           -2.022479         1.424944             0.8   
14998           -0.976716        -1.145699             0.0   

       average_monthly_hours  time_spend_company  Work_accident  \
0                   0.285047               0.125            0.0   
1                   0.775701               0.500            0.0   
2                   0.822430               0.250     

## 小结

<img style="float: center;" src="./figures/特征工程小结.png" width="50%">
<center>Fig. 特征工程小结</center>
