In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt  # pyplot is the plotting API conventionally aliased as plt
import seaborn as sns

# Load the data.
# Digit-recognizer training set: one `label` column followed by 784 pixel columns.
# NOTE(review): absolute, machine-specific path; adjust to your local copy of train.csv.
path = r'D:\DataSet\digit_recognizer\train.csv'
data = pd.read_csv(path)

In [5]:
# Preview the first rows to check that the columns loaded as expected.
data.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Summarize the frame: number of rows and columns, dtypes and memory footprint.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42000 entries, 0 to 41999
Columns: 785 entries, label to pixel783
dtypes: int64(785)
memory usage: 251.5 MB


In [7]:
# Separate the target from the predictors:
# column 0 is the digit label, every remaining column is a pixel feature.
y = data.iloc[:, 0]
X = data.iloc[:, 1:]

In [9]:
# Confirm the split: feature-matrix shape on the first line, label-vector shape on the second.
print(X.shape, y.shape, sep='\n')

(42000, 784)
(42000,)


In [13]:
# Count the samples per digit class, ordered by digit rather than by frequency.
y.value_counts().sort_index()

0    4132
1    4684
2    4177
3    4351
4    4072
5    3795
6    4137
7    4401
8    4063
9    4188
Name: label, dtype: int64

也就是说，每个像素点作为一个特征，共有784个特征

## 1 特征选择-过滤型

### 1.1 方差过滤

#### 数组求方差

In [17]:
# Population variance of a plain Python list, computed with numpy.
a = [5, 6, 16, 9]
np.asarray(a).var()

18.5

In [19]:
# Variance of a matrix (2-D list) computed with numpy.
b = [[4, 5], [6, 7]]

# One printed line per axis setting:
#   axis=None -> variance over all elements of the matrix
#   axis=0    -> variance of each column
#   axis=1    -> variance of each row
print(*(np.var(b, axis=ax) for ax in (None, 0, 1)), sep='\n')

1.25
[1. 1.]
[0.25 0.25]


#### table数据求方差

In [15]:
# Variance of a single DataFrame column, computed with numpy.
np.var(X['pixel0'])

0.0

In [16]:
# numpy's variance applied to the whole feature table; the displayed result
# holds one value per pixel column.
# NOTE(review): numpy defaults to the population variance (ddof=0) whereas
# pandas' .var() defaults to ddof=1, so the two results can differ slightly.
np.var(X)

pixel0      0.000000
pixel1      0.000000
pixel2      0.000000
pixel3      0.000000
pixel4      0.000000
              ...   
pixel779    0.171611
pixel780    0.000000
pixel781    0.000000
pixel782    0.000000
pixel783    0.000000
Length: 784, dtype: float64

In [20]:
# Per-column variance computed with pandas.
X.var()

pixel0      0.000000
pixel1      0.000000
pixel2      0.000000
pixel3      0.000000
pixel4      0.000000
              ...   
pixel779    0.171615
pixel780    0.000000
pixel781    0.000000
pixel782    0.000000
pixel783    0.000000
Length: 784, dtype: float64

In [21]:
# Rank the features by their variance, smallest first.
X.var().sort_values()

pixel0          0.000000
pixel448        0.000000
pixel421        0.000000
pixel420        0.000000
pixel392        0.000000
                ...     
pixel434    12712.507782
pixel461    12750.287623
pixel627    12768.248426
pixel378    12930.525263
pixel406    12961.855023
Length: 784, dtype: float64

#### 方差过滤的编程实现

In [24]:
from sklearn.feature_selection import VarianceThreshold

# Instantiate the filter; with no argument the threshold defaults to 0.
var_selector = VarianceThreshold()
# Fit the filter on the features, then apply it: zero-variance features are removed.
var_selector.fit(X)
X_var = var_selector.transform(X)
print("方差过滤后数据的形状：", X_var.shape)

方差过滤后数据的形状： (42000, 708)


In [26]:
# What if we want to drop the lower-variance half of the features?
# Use the median of the per-feature variances as the cut-off.
# VarianceThreshold compares against the population variance (ddof=0), so the
# median is taken over population variances as well (pandas defaults to ddof=1).
med = X.var(ddof=0).median()
X_var_1 = VarianceThreshold(threshold=med).fit_transform(X)
print(X_var_1.shape)

(42000, 392)


### 1.2 相关性过滤
衡量相关性的指标主要有：  
1.相关系数：pearson相关系数、spearman相关系数、kendall相关系数    
2.假设检验：  
卡方检验(专门针对离散型标签的相关性指标，并且卡方过滤要求特征取值非负，不能计算负数)  
F检验(即ANOVA方差分析，一些教程中也称为方差齐性检验)    
3.互信息值：  
互信息法返回的是每个特征与标签之间的互信息量的估计，这个估计量是非负的(sklearn的实现以nat为单位，并没有被归一化到[0,1])  
取值为0表示两个变量独立，取值越大则表示两个变量的相关性越强



#### 使用相关系数进行过滤

In [30]:
# Correlation coefficients for the whole table; only the `label` column of the matrix is shown.
data.corr()['label'] 
# With no argument the method defaults to the Pearson correlation coefficient;
# pass 'spearman' or 'kendall' to use one of the other coefficients.

label       1.000000
pixel0           NaN
pixel1           NaN
pixel2           NaN
pixel3           NaN
              ...   
pixel779    0.006075
pixel780         NaN
pixel781         NaN
pixel782         NaN
pixel783         NaN
Name: label, Length: 785, dtype: float64

In [37]:
# Correlation of every column with the label.
# corrwith computes only the column-vs-label correlations instead of building
# the full column-by-column matrix just to read a single column from it.
# Constant pixel columns have no defined correlation (NaN); errstate silences
# any floating-point warning numpy may raise for them.
with np.errstate(divide='ignore', invalid='ignore'):
    cor = data.corrwith(data['label']).rename('label')

In [43]:
# Discard the NaN correlations by rebinding the name instead of mutating the
# Series in place, which keeps the cell safe to re-run.
cor = cor.dropna()

In [48]:
# Number of correlation entries currently held in `cor`.
cor.shape

(709,)

In [47]:
# Sort the correlations from the most positive to the most negative.
cor.sort_values(ascending=False)

label       1.000000
pixel381    0.431506
pixel409    0.423767
pixel436    0.420922
pixel408    0.413855
              ...   
pixel596   -0.314162
pixel539   -0.314998
pixel567   -0.315471
pixel538   -0.319009
pixel510   -0.321565
Name: label, Length: 709, dtype: float64

In [49]:
# Absolute value of the correlation coefficients: the magnitude, regardless of
# sign, is what indicates how strongly a pixel is linearly related to the label.
cor1 = np.abs(cor)
# An assignment alone displays nothing; end the cell with the name to show it.
cor1

label       1.000000
pixel12     0.002242
pixel13     0.000703
pixel14     0.004151
pixel15     0.004151
              ...   
pixel775    0.025050
pixel776    0.019558
pixel777    0.014490
pixel778    0.009790
pixel779    0.006075
Name: label, Length: 709, dtype: float64

In [54]:
# Done this way, feature selection can also be completed without the SelectKBest function:
# it is simply a matter of picking out the features whose correlation is above some threshold.
# NOTE(review): as written, this indexes `cor` by its own index, so every
# entry is returned and no threshold is applied yet.
cor[cor.index]

label       1.000000
pixel12     0.002242
pixel13    -0.000703
pixel14    -0.004151
pixel15    -0.004151
              ...   
pixel775    0.025050
pixel776    0.019558
pixel777    0.014490
pixel778    0.009790
pixel779    0.006075
Name: label, Length: 709, dtype: float64

In [36]:
# Pearson correlation coefficient between two individual series
# (here one pixel column and the label).
X['pixel779'].corr(y,method="pearson")

0.006074568784353702


#### 使用假设检验方法进行过滤

In [61]:
from sklearn.feature_selection import SelectKBest, chi2

# Score every feature against the label with the chi-squared test and keep the
# k=2 best-scoring ones.
X_new = SelectKBest(score_func=chi2, k=2).fit_transform(X, y)

In [70]:
# chi2 returns two arrays: the chi-squared statistics and the p-values,
# the latter expressing how significant the corresponding association is.
chi_value, chi_pvalue = chi2(X,y)

In [72]:
# Check the length of the chi-squared result (the recorded output shows one value per feature).
chi_value.shape

(784,)

In [76]:
# Label each chi-squared statistic with its feature (pixel column) name.
pd.DataFrame(chi_value, index = X.columns)

Unnamed: 0,0
pixel0,
pixel1,
pixel2,
pixel3,
pixel4,
...,...
pixel779,1025.194274
pixel780,
pixel781,
pixel782,


In [78]:
# F-test
from sklearn.feature_selection import f_classif
# Unpack the two results into the F statistics and their p-values.
# NOTE(review): the recorded output warns that some features are constant;
# presumably those receive no valid F value — confirm before using the scores.
f_value, f_pvalue = f_classif(X,y)

  22  23  24  25  26  27  28  29  30  31  52  53  54  55  56  57  82  83
  84  85 111 112 139 140 141 168 196 392 420 421 448 476 532 560 644 645
 671 672 673 699 700 701 727 728 729 730 731 754 755 756 757 758 759 760
 780 781 782 783] are constant.
  f = msb / msw


In [80]:
# Label each F statistic with its feature (pixel column) name.
pd.DataFrame(f_value,index=X.columns)

Unnamed: 0,0
pixel0,
pixel1,
pixel2,
pixel3,
pixel4,
...,...
pixel779,1.89681
pixel780,
pixel781,
pixel782,


In [81]:
# Filtering with mutual information
from sklearn.feature_selection import mutual_info_classif as MIC

# The estimator is randomized (it perturbs continuous features with small
# noise), so random_state is fixed to obtain the same scores on every run.
result = MIC(X, y, random_state=0)
print(result)
# Is mutual information really this expensive to compute? It seems this
# method is not a good fit when the number of features is very large.

[0.00000000e+00 3.35551923e-03 1.35921610e-03 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 3.31049997e-04 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 2.11492089e-04 4.08309595e-03 0.00000000e+00
 7.17782818e-03 4.56499162e-05 0.00000000e+00 5.24613572e-03
 1.36835355e-03 4.52210752e-03 3.07867609e-03 1.53228386e-03
 4.31285146e-03 4.67701925e-03 5.04897388e-03 1.28839296e-03
 1.36992594e-03 4.82317942e-03 9.56814828e-04 1.82430423e-03
 7.41957917e-05 5.69107927e-03 2.02512495e-03 3.02085078e-03
 9.70087190e-03 1.92870031e-04 7.03695278e-03 2.77482186e-03
 5.44308663e-03 0.00000000e+00 1.07778362e-03 1.67286280e-03
 0.00000000e+00 2.84075242e-04 0.00000000e+00 0.00000000e+00
 1.48455785e-03 3.40813985e-03 9.14637018e-04 9.47030886e-04
 0.00000000e+00 7.60787332e-04 0.00000000e+00 2.13285234e-03
 1.07464358e-03 0.00000000e+00 5.05729065e-03 0.00000000e+00
 2.60757947e-03 1.010584

除这两种方法以外，还可以使用机器学习模型来帮我们评估每个特征的重要性  
但是用于评估的模型必须要天然拥有特征系数或者特征重要性这些东西  
嵌入法和包装法也是以此为基础进行计算的  