### 内容

特征选择的常用方法，包括Filter、Wrapper和Embedded方法。

Filter方法包括方差选择法、相关系数法、卡方检验、F检验和互信息法。

Wrapper主要是递归特征消除法。

Embedded方法主要包括基于树模型的特征选择法和基于正则化的特征选择法。

In [1]:
import numpy as np
import pandas as pd
import pymysql
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from scipy.stats import pearsonr
from collections import Counter
from sklearn.feature_selection import chi2   # 卡方检验
from sklearn.feature_selection import SelectKBest   # 根据 k个最高分选择功能。
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import f_classif   # F检验
from sklearn.feature_selection import RFE, RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SelectFromModel
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import LinearSVC

import warnings
warnings.filterwarnings("ignore")

### 1、获取数据

In [2]:
# Settings for the MySQL database that holds the researcher feature table.
# NOTE(review): host and credentials are hard-coded here; consider reading
# them from configuration before sharing this notebook.
db_settings = {
    "host": "localhost",
    "port": 3306,
    "user": "root",
    "passwd": "root",
    "charset": "utf8",
    "db": "project_researchers",
}
connection = pymysql.Connect(**db_settings)

In [3]:
def getData(connection):
    """
    Load the features and labels used by the feature-selection experiments.

    Runs one SELECT against ``classifier_isTeacher_label``, keeping rows whose
    ``label`` is 0 or 1 and whose ``category`` is not null. The query derives
    ``a_paper``/``b_paper``/``c_paper`` as conference + journal counts and
    ``diff_year`` as ``last_year - first_year``.

    :param connection: open database connection handed to ``pd.read_sql_query``
    :return: DataFrame restricted to the 21 columns listed in
        ``wanted_columns`` (``project_num`` is queried but not returned)
    """
    query = """
    SELECT bys_cn, hindex_cn,a_conf+a_journal as a_paper, b_conf + b_journal as b_paper,c_conf + c_journal as c_paper,papernum2017, papernum2016, papernum2015, papernum2014, papernum2013,num_journal,num_conference, project_num, degree, pagerank,degree_centrality,last_year - first_year as diff_year , coauthors_top10000, coauthors_top20000, coauthors_top30000, category, label 
    FROM classifier_isTeacher_label WHERE (label =1 or label = 0) and category is not null
    """
    frame = pd.read_sql_query(query, connection)
    # Column order here fixes the column order of the returned frame.
    wanted_columns = [
        'bys_cn', 'hindex_cn',
        'a_paper', 'b_paper', 'c_paper',
        'papernum2017', 'papernum2016', 'papernum2015', 'papernum2014', 'papernum2013',
        'num_journal', 'num_conference',
        'degree', 'pagerank', 'degree_centrality', 'diff_year',
        'coauthors_top10000', 'coauthors_top20000', 'coauthors_top30000',
        'category', 'label',
    ]
    return frame[wanted_columns]

# Load the raw feature/label table and show its size and per-column summary.
data = getData(connection)
print("shape of data:", data.shape)
# DataFrame.info() writes its report to stdout and returns None, so it is
# called on its own rather than passed to print() (which would print "None").
print("data.info():")
data.info()

shape of data: (18694, 21)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18694 entries, 0 to 18693
Data columns (total 21 columns):
bys_cn                18442 non-null float64
hindex_cn             18557 non-null float64
a_paper               18694 non-null int64
b_paper               18694 non-null int64
c_paper               18694 non-null int64
papernum2017          18694 non-null int64
papernum2016          18694 non-null int64
papernum2015          18694 non-null int64
papernum2014          18694 non-null int64
papernum2013          18694 non-null int64
num_journal           18694 non-null int64
num_conference        18694 non-null int64
degree                18623 non-null float64
pagerank              18623 non-null float64
degree_centrality     18623 non-null float64
diff_year             18623 non-null float64
coauthors_top10000    18694 non-null int64
coauthors_top20000    18694 non-null int64
coauthors_top30000    18694 non-null int64
category              18694 non-nul

### 2、数据处理

In [4]:
# Handle missing values.
# Method 1: drop every row that contains at least one missing field.
data = data.dropna()
print("shape of data::", data.shape)
# DataFrame.info() prints its report itself and returns None, so call it
# directly instead of passing its return value to print().
print("data.info()::")
data.info()

shape of data:: (18372, 21)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 18372 entries, 0 to 18692
Data columns (total 21 columns):
bys_cn                18372 non-null float64
hindex_cn             18372 non-null float64
a_paper               18372 non-null int64
b_paper               18372 non-null int64
c_paper               18372 non-null int64
papernum2017          18372 non-null int64
papernum2016          18372 non-null int64
papernum2015          18372 non-null int64
papernum2014          18372 non-null int64
papernum2013          18372 non-null int64
num_journal           18372 non-null int64
num_conference        18372 non-null int64
degree                18372 non-null float64
pagerank              18372 non-null float64
degree_centrality     18372 non-null float64
diff_year             18372 non-null float64
coauthors_top10000    18372 non-null int64
coauthors_top20000    18372 non-null int64
coauthors_top30000    18372 non-null int64
category              18372 non-nu

In [5]:
# Separate the continuous features, the discrete features and the target y.
continuous_features = ['bys_cn', 'hindex_cn', 'a_paper', 'b_paper', 'c_paper', 'papernum2017', 'papernum2016', 'papernum2015', 'papernum2014', 'papernum2013', 'num_journal', 'num_conference',  'degree', 'pagerank', 'degree_centrality', 'diff_year', 'coauthors_top10000', 'coauthors_top20000', 'coauthors_top30000']
discrete_features = ['category']
X_continous = data[continuous_features]
X_discrete = data[discrete_features]
y = data['label']
# DataFrame.info() prints its report itself and returns None, so each call
# stands alone instead of being passed to print() (which would print "None").
print("info of X_continous::")
X_continous.info()
print("info of X_discrete::")
X_discrete.info()
# Class balance of the target.
print("y::", Counter(y))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18372 entries, 0 to 18692
Data columns (total 19 columns):
bys_cn                18372 non-null float64
hindex_cn             18372 non-null float64
a_paper               18372 non-null int64
b_paper               18372 non-null int64
c_paper               18372 non-null int64
papernum2017          18372 non-null int64
papernum2016          18372 non-null int64
papernum2015          18372 non-null int64
papernum2014          18372 non-null int64
papernum2013          18372 non-null int64
num_journal           18372 non-null int64
num_conference        18372 non-null int64
degree                18372 non-null float64
pagerank              18372 non-null float64
degree_centrality     18372 non-null float64
diff_year             18372 non-null float64
coauthors_top10000    18372 non-null int64
coauthors_top20000    18372 non-null int64
coauthors_top30000    18372 non-null int64
dtypes: float64(6), int64(13)
memory usage: 2.8 MB
info of X_c

In [6]:
# Plan: standardize the continuous features and one-hot encode the discrete ones.
# Standardization is skipped for now, because the later steps look for
# high-variance features and scaling would hide the raw variances.
# ss = StandardScaler()
# X_continous = ss.fit_transform(X_continous)
# print("type of X_continous::", type(X_continous))

# The encoder's default output is a sparse matrix; .toarray() makes it dense.
# This avoids the `sparse=False` keyword, which was renamed to `sparse_output`
# and later removed from scikit-learn, so the cell runs on old and new versions.
X_discrete_oneHot = OneHotEncoder().fit_transform(X_discrete).toarray()
print(X_discrete_oneHot)

# Continuous columns first, followed by the one-hot category columns.
X_all = np.hstack((X_continous, X_discrete_oneHot))
print("shape of X_all::", X_all.shape)

[[0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 ...
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]]
shape of X_all:: (18372, 22)


### 3、Filter方法
过滤法，按照发散性或者相关性对各个特征进行评分，设定阈值或者待选择特征的个数，选择特征。

包括方差选择法、相关系数法、卡方检验、F检验和互信息法。

#### （1）方差选择法
方差较大的特征说明其取值发散，使用方差法，要先计算各个特征的方差，然后根据阈值，选择方差大于阈值的特征。
#### （2）相关系数法
皮尔逊系数只能衡量线性相关性，先要计算各个特征对目标值的相关系数以及相关系数的P值
#### （3）卡方检验
卡方检验只能用于分类问题（二分类和多分类均可），并且要求特征取值非负。
#### （4）F检验
F检验和卡方检验都是检验的方法，f_classif用于分类模型，f_regression用于回归模型。
#### （5）互信息法
互信息系数反映相关性，互信息越大，说明越相关。

In [7]:
# Variance threshold filter: fit on all features, then keep only the columns
# that pass the variance threshold of 2.
vt = VarianceThreshold(threshold=2)
vt.fit(X_all, y)
X_new = vt.transform(X_all)
print(vt.variances_)
print(X_new)
print("shape of X_new::", X_new.shape)

[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00 2.44977625e-01
 1.84609727e-01 2.19926867e-01]
[[ 41.   4.   0. ...   1.   3.   3.]
 [232.   8.   0. ...   1.   4.   4.]
 [103.   7.   0. ...   2.   4.   6.]
 ...
 [ 49.   2.   0. ...   0.   0.   0.]
 [  2.   2.   0. ...   1.   1.   2.]
 [ 21.   4.   0. ...   1.   2.   2.]]
shape of X_new:: (18372, 19)


In [16]:
# Correlation coefficient method: Pearson correlation between every pair of
# columns, including the label.
columns = [
    'bys_cn', 'hindex_cn', 'a_paper', 'b_paper', 'c_paper',
    'papernum2017', 'papernum2016', 'papernum2015', 'papernum2014', 'papernum2013',
    'num_journal', 'num_conference', 'degree', 'pagerank', 'degree_centrality',
    'diff_year', 'coauthors_top10000', 'coauthors_top20000', 'coauthors_top30000',
    'category1', 'category2', 'category3',
]
# X_all is a plain array whose rows follow the order of `y`. After dropna(),
# `y` keeps its original (gappy) index, so the frame must reuse that index.
# Otherwise pd.concat(axis=1) aligns by index label, pairs labels with the
# wrong feature rows and introduces NaNs.
X_all_df = pd.DataFrame(X_all, columns=columns, index=y.index)
y_df = y.to_frame(name='label')
X_y_all_df = pd.concat([X_all_df, y_df], axis=1)
print(X_y_all_df.corr())

                      bys_cn  hindex_cn   a_paper   b_paper   c_paper  \
bys_cn              1.000000   0.644273  0.087185  0.120401  0.133408   
hindex_cn           0.644273   1.000000  0.133651  0.181690  0.201967   
a_paper             0.087185   0.133651  1.000000  0.742962  0.609541   
b_paper             0.120401   0.181690  0.742962  1.000000  0.726083   
c_paper             0.133408   0.201967  0.609541  0.726083  1.000000   
papernum2017        0.232951   0.374031  0.524128  0.557366  0.603177   
papernum2016        0.258163   0.409731  0.508840  0.559757  0.612475   
papernum2015        0.275359   0.436976  0.500560  0.560156  0.616260   
papernum2014        0.291261   0.466954  0.469717  0.548254  0.618209   
papernum2013        0.316917   0.497253  0.453531  0.527905  0.592297   
num_journal         0.462671   0.717891  0.274124  0.344442  0.383720   
num_conference      0.224820   0.355659  0.681037  0.783751  0.865838   
degree              0.358866   0.563126  0.444302  

In [9]:
# Chi-squared test: score every feature against the label and keep the 15
# highest-scoring columns.
k_chi = SelectKBest(chi2, k=15)
k_chi.fit(X_all, data['label'])
X_chi = k_chi.transform(X_all)
print(X_chi)
print(k_chi.scores_)
print(k_chi.pvalues_)

[[ 41.   4.   0. ...   4.   8.   3.]
 [232.   8.   0. ...   4.  16.   4.]
 [103.   7.   4. ...   6.  17.   6.]
 ...
 [ 49.   2.   0. ...   0.   0.   0.]
 [  2.   2.   0. ...   0.   0.   2.]
 [ 21.   4.   0. ...   3.  10.   2.]]
[2.89499631e+05 6.27809793e+03 1.63989737e+03 2.07044726e+03
 4.55888006e+03 5.47715196e+03 5.60851137e+03 5.55383244e+03
 5.39025576e+03 5.16561683e+03 6.24744338e+04 1.91444981e+04
 1.15097539e+05 1.36631954e+04 1.03057973e+04 1.83392585e+04
 7.77844366e+02 1.48823845e+03 2.26304244e+03 4.62240511e+02
 1.48370439e+01 4.54277219e+02]
[0.00000000e+000 0.00000000e+000 0.00000000e+000 0.00000000e+000
 0.00000000e+000 0.00000000e+000 0.00000000e+000 0.00000000e+000
 0.00000000e+000 0.00000000e+000 0.00000000e+000 0.00000000e+000
 0.00000000e+000 0.00000000e+000 0.00000000e+000 0.00000000e+000
 3.54143396e-171 0.00000000e+000 0.00000000e+000 1.56429966e-102
 1.17210465e-004 8.45832075e-101]


In [10]:
# F-test: score every feature with f_classif and keep the 15 highest-scoring
# columns.
k_f = SelectKBest(f_classif, k=15)
k_f.fit(X_all, data['label'])
X_f = k_f.transform(X_all)
print(X_f)
print(k_f.scores_)
print(k_f.pvalues_)

[[4. 3. 3. ... 3. 0. 1.]
 [8. 2. 2. ... 4. 0. 1.]
 [7. 4. 4. ... 6. 0. 1.]
 ...
 [2. 0. 0. ... 0. 1. 0.]
 [2. 0. 0. ... 2. 1. 0.]
 [4. 0. 0. ... 2. 1. 0.]]
[ 134.84869814 1629.49073283   60.17944086  110.59771289  159.88717488
  553.57810775  601.44707216  613.1734877   636.56291361  660.83315752
  872.58167833  358.35741816  849.04675701  534.16085324  869.44003403
 7024.53190704  462.316657    772.59561546 1059.26463256  846.95410797
   19.65197876  700.22374718]
[4.57473344e-031 0.00000000e+000 9.11046861e-015 8.58106003e-026
 1.69979600e-036 1.26968235e-120 1.01976728e-130 3.46406053e-133
 4.16754562e-138 3.32352335e-143 2.11323083e-187 3.63684764e-079
 1.63075689e-182 1.60826642e-116 9.48440744e-187 0.00000000e+000
 2.66441851e-101 1.35321364e-166 5.91695246e-226 4.43876376e-182
 9.34453381e-006 1.82601260e-151]


In [11]:
# Mutual information: score every feature against the label and keep the 15
# highest-scoring columns.
# NOTE(review): no random_state is passed to mutual_info_classif, so the scores
# presumably vary slightly between runs; confirm whether that matters here.
k_info = SelectKBest(mutual_info_classif, k=15)
k_info.fit(X_all, data['label'])
X_info = k_info.transform(X_all)
print(X_info)
print(k_info.scores_)

[[ 41.   4.   0. ...   4.   8.   3.]
 [232.   8.   0. ...   4.  16.   4.]
 [103.   7.   4. ...   6.  17.   6.]
 ...
 [ 49.   2.   0. ...   0.   0.   0.]
 [  2.   2.   0. ...   0.   0.   2.]
 [ 21.   4.   0. ...   3.  10.   2.]]
[0.10633509 0.11476907 0.01292145 0.02123134 0.03895578 0.06859509
 0.07017215 0.07567139 0.07424737 0.06980705 0.16301644 0.07059706
 0.15322675 0.13242794 0.14107067 0.17478264 0.01831638 0.03570873
 0.04688432 0.02213511 0.         0.02230453]


### 4、Wrapper方法
包装法，根据目标函数（通常是预测效果评分），每次选择若干特征，或者排除若干特征。包裹式特征选择直接把最终将要使用的模型的性能作为特征子集的评价标准，也就是说，包裹式特征选择的目的就是为给定的模型选择最有利于其性能的特征子集。

从模型的性能来看，包裹式特征选择比过滤式特征选择更好，但需要多次训练模型，因此计算开销较大。

包括递归特征消除法等。

#### （1）递归特征消除法
递归特征消除法使用一个基模型来进行多轮训练，每轮训练后，消除若干权值系数较小的特征，再基于新的特征集进行下一轮训练。

In [12]:
# Recursive feature elimination (RFE) with logistic regression as the base
# model, stopping once 15 features remain.
base_lr = LogisticRegression()
model_lg = RFE(estimator=base_lr, n_features_to_select=15)
model_lg.fit(X_all, data['label'])
X_lg = model_lg.transform(X_all)
print(X_lg)
print(model_lg.n_features_)   # number of selected features
print(model_lg.support_)      # boolean mask of the selected columns
print(model_lg.ranking_)      # rank 1 marks a selected feature

[[4. 0. 0. ... 0. 0. 1.]
 [8. 0. 0. ... 0. 0. 1.]
 [7. 0. 4. ... 0. 0. 1.]
 ...
 [2. 0. 0. ... 1. 0. 0.]
 [2. 0. 0. ... 1. 0. 0.]
 [4. 0. 0. ... 1. 0. 0.]]
15
[False  True False  True  True  True False  True  True  True False False
  True  True False  True False  True  True  True  True  True]
[8 1 4 1 1 1 5 1 1 1 3 2 1 1 6 1 7 1 1 1 1 1]


In [13]:
# RFECV: recursive feature elimination where the number of kept features is
# chosen by 3-fold stratified cross-validation on accuracy, removing one
# feature per step.
cv_splitter = StratifiedKFold(n_splits=3)
model_lg_cv = RFECV(
    estimator=LogisticRegression(),
    step=1,
    cv=cv_splitter,
    scoring="accuracy",
)
model_lg_cv.fit(X_all, data['label'])
X_lg_cv = model_lg_cv.transform(X_all)
print(X_lg_cv)
print(model_lg_cv.n_features_)
print(model_lg_cv.support_)
print(model_lg_cv.ranking_)

[[4. 0. 0. ... 0. 0. 1.]
 [8. 0. 0. ... 0. 0. 1.]
 [7. 0. 0. ... 0. 0. 1.]
 ...
 [2. 0. 0. ... 1. 0. 0.]
 [2. 0. 0. ... 1. 0. 0.]
 [4. 0. 0. ... 1. 0. 0.]]
20
[False  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True False  True  True  True  True  True]
[3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1]


### 5、Embedded方法
嵌入法，先使用某些机器学习的算法和模型进行训练，得到各个特征的权值系数，根据系数从大到小选择特征。类似于Filter方法，但是是通过训练来确定特征的优劣。

包括基于树模型的特征选择法、正则化方法等。

#### （1）基于树模型的特征选择法
树模型中GBDT也可用来作为基模型进行特征选择。

#### （2）基于L1的特征选择法
使用L1范数作为惩罚项的线性模型会得到稀疏解，可以起到特征选择的作用。

In [14]:
# Tree-model-based selection: fit a gradient boosting classifier and let
# SelectFromModel pick the features from the fitted model.
# NOTE(review): the classifier has no random_state, so the selected set may
# differ between runs; confirm whether reproducibility is needed.
gbdt = GradientBoostingClassifier()
model_gdbc = SelectFromModel(gbdt)
model_gdbc.fit(X_all, data['label'])
X_gdbc = model_gdbc.transform(X_all)
print(X_gdbc)
print("shape of X_gdbc::", X_gdbc.shape)

[[48.  8.  1.]
 [45. 16.  1.]
 [73. 17.  1.]
 ...
 [ 2.  0.  0.]
 [ 5.  0.  0.]
 [34. 10.  0.]]
shape of X_gdbc:: (18372, 3)


In [15]:
# L1 regularization: fit a linear SVM with an L1 penalty and let
# SelectFromModel pick the features from the fitted model.
l1_svc = LinearSVC(C=0.01, penalty="l1", dual=False)
model_lsvc = SelectFromModel(l1_svc)
model_lsvc.fit(X_all, data['label'])
X_lsvc = model_lsvc.transform(X_all)
print(X_lsvc)
print("shape of X_lsvc::", X_lsvc.shape)

[[ 41.   4.   3. ...   0.   0.   1.]
 [232.   8.   2. ...   0.   0.   1.]
 [103.   7.   4. ...   0.   0.   1.]
 ...
 [ 49.   2.   0. ...   1.   0.   0.]
 [  2.   2.   0. ...   1.   0.   0.]
 [ 21.   4.   0. ...   1.   0.   0.]]
shape of X_lsvc:: (18372, 15)
