## 功能简介

本py文件主要用于特征选择方法的确定和特征选择参数的确定，具体如下：

### 一、使用方差选择（Filter方法）

1、特征选择

2、使用网格搜索确定特征选择参数

3、使用xgboost训练模型

### 二、使用递归特征消除法（Wrapper方法）

1、使用RFE和RFECV进行特征选择

2、网格搜索确定保留特征数

3、使用xgboost训练模型

### 三、使用正则化（Embedded方法）

1、使用L1正则化

2、网格搜索确定正则项系数

3、使用LinearSVC训练模型


In [230]:
import pymysql
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold, RFE, RFECV
from sklearn.feature_extraction import DictVectorizer  
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from collections import Counter
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, KFold
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, precision_recall_fscore_support
import pickle
import json
from copy import deepcopy

import warnings
warnings.filterwarnings("ignore")

### 1、获取数据

In [231]:
connection = pymysql.Connect(
    host="localhost",
    port=3306,
    user="root",
    passwd="root",
    charset="utf8",
    db="project_researchers"
)

In [232]:
def get_train_data(connection):
    """
    查询训练数据，包括特征和标签
    :param connection:
    :return:
    """
    sql_select = """
     SELECT bys_cn, hindex_cn,a_conf+a_journal as a_paper, b_conf + b_journal as b_paper,c_conf + c_journal as c_paper,papernum2017, papernum2016, papernum2015, papernum2014, papernum2013,num_journal,num_conference, project_num, degree, pagerank,degree_centrality,last_year - first_year as diff_year , coauthors_top10000, coauthors_top20000, coauthors_top30000, category, label 
     FROM classifier_isTeacher_xgbc WHERE label = 1 and teac_id > 174 and category is not null
     UNION ALL
     SELECT bys_cn, hindex_cn,a_conf+a_journal as a_paper, b_conf + b_journal as b_paper,c_conf + c_journal as c_paper,papernum2017, papernum2016, papernum2015, papernum2014, papernum2013,num_journal,num_conference, project_num, degree, pagerank,degree_centrality,last_year - first_year as diff_year , coauthors_top10000, coauthors_top20000, coauthors_top30000, category, label 
     FROM classifier_isTeacher_xgbc WHERE label = 0 and teac_id > 64438 and category is not null
    """
    df = pd.read_sql_query(sql_select, connection)
    all_features = ['bys_cn', 'hindex_cn', 'a_paper', 'b_paper', 'c_paper', 'papernum2017', 'papernum2016', 'papernum2015', 'papernum2014', 'papernum2013', 'num_journal', 'num_conference',  'degree', 'pagerank', 'degree_centrality', 'diff_year', 'coauthors_top10000', 'coauthors_top20000', 'coauthors_top30000', 'category', 'label']
    data = df[all_features]
    return data

train_data = get_train_data(connection)
print("shape of train_data:", train_data.shape)
print("train_data.info():", train_data.info())

shape of train_data: (17844, 21)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17844 entries, 0 to 17843
Data columns (total 21 columns):
bys_cn                17608 non-null float64
hindex_cn             17707 non-null float64
a_paper               17844 non-null int64
b_paper               17844 non-null int64
c_paper               17844 non-null int64
papernum2017          17844 non-null int64
papernum2016          17844 non-null int64
papernum2015          17844 non-null int64
papernum2014          17844 non-null int64
papernum2013          17844 non-null int64
num_journal           17844 non-null int64
num_conference        17844 non-null int64
degree                17774 non-null float64
pagerank              17774 non-null float64
degree_centrality     17774 non-null float64
diff_year             17774 non-null float64
coauthors_top10000    17844 non-null int64
coauthors_top20000    17844 non-null int64
coauthors_top30000    17844 non-null int64
category              17844 n

In [233]:
def get_test_data(connection):
    """
    查询测试数据，包括特征和标签
    :param connection:
    :return:
    """
    sql_select = """
     SELECT bys_cn, hindex_cn,a_conf+a_journal as a_paper, b_conf + b_journal as b_paper,c_conf + c_journal as c_paper,papernum2017, papernum2016, papernum2015, papernum2014, papernum2013,num_journal,num_conference, project_num, degree, pagerank,degree_centrality,last_year - first_year as diff_year , coauthors_top10000, coauthors_top20000, coauthors_top30000, category, label 
     FROM classifier_isTeacher_xgbc WHERE label = 1 and teac_id <= 174 and category is not null
     UNION ALL
     SELECT bys_cn, hindex_cn,a_conf+a_journal as a_paper, b_conf + b_journal as b_paper,c_conf + c_journal as c_paper,papernum2017, papernum2016, papernum2015, papernum2014, papernum2013,num_journal,num_conference, project_num, degree, pagerank,degree_centrality,last_year - first_year as diff_year , coauthors_top10000, coauthors_top20000, coauthors_top30000, category, label 
     FROM classifier_isTeacher_xgbc WHERE label = 0 and teac_id <= 64438 and category is not null
    """
    df = pd.read_sql_query(sql_select, connection)
    all_features = ['bys_cn', 'hindex_cn', 'a_paper', 'b_paper', 'c_paper', 'papernum2017', 'papernum2016', 'papernum2015', 'papernum2014', 'papernum2013', 'num_journal', 'num_conference',  'degree', 'pagerank', 'degree_centrality', 'diff_year', 'coauthors_top10000', 'coauthors_top20000', 'coauthors_top30000', 'category', 'label']
    data = df[all_features]
    return data

test_data = get_test_data(connection)
print("shape of test_data:", test_data.shape)
print("test_data.info():", test_data.info())

shape of test_data: (850, 21)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 850 entries, 0 to 849
Data columns (total 21 columns):
bys_cn                834 non-null float64
hindex_cn             850 non-null int64
a_paper               850 non-null int64
b_paper               850 non-null int64
c_paper               850 non-null int64
papernum2017          850 non-null int64
papernum2016          850 non-null int64
papernum2015          850 non-null int64
papernum2014          850 non-null int64
papernum2013          850 non-null int64
num_journal           850 non-null int64
num_conference        850 non-null int64
degree                849 non-null float64
pagerank              849 non-null float64
degree_centrality     849 non-null float64
diff_year             849 non-null float64
coauthors_top10000    850 non-null int64
coauthors_top20000    850 non-null int64
coauthors_top30000    850 non-null int64
category              850 non-null int64
label                 850 non-null 

### 2、处理数据

In [234]:
# 对缺失值进行处理
# Method1：直接将含有缺失字段的值去掉
train_data = train_data.dropna()
print("shape of train_data::", train_data.shape)
print("train_data.info()::", train_data.info())
test_data = test_data.dropna()
print("shape of test_data::", test_data.shape)
print("test_data.info()::", test_data.info())
print("y_test_label::", Counter(test_data["label"]))

shape of train_data:: (17539, 21)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 17539 entries, 0 to 17843
Data columns (total 21 columns):
bys_cn                17539 non-null float64
hindex_cn             17539 non-null float64
a_paper               17539 non-null int64
b_paper               17539 non-null int64
c_paper               17539 non-null int64
papernum2017          17539 non-null int64
papernum2016          17539 non-null int64
papernum2015          17539 non-null int64
papernum2014          17539 non-null int64
papernum2013          17539 non-null int64
num_journal           17539 non-null int64
num_conference        17539 non-null int64
degree                17539 non-null float64
pagerank              17539 non-null float64
degree_centrality     17539 non-null float64
diff_year             17539 non-null float64
coauthors_top10000    17539 non-null int64
coauthors_top20000    17539 non-null int64
coauthors_top30000    17539 non-null int64
category              17539 

In [235]:
# 将category变成离散值（object），并将train_data变成向量（离散值变成one-hot）

# training data
train_data[['category']] = train_data[['category']].astype(object)
y_train = train_data['label']
X_train = train_data.drop(columns=['label'])
print("****", X_train.shape)

vec = DictVectorizer()   

# 因为fDictVectorizer().fit_transform()需要的参数时list of dict，所以这里将其转化
headers_train = list(X_train.columns)
value_df_train = X_train.values
feature_list_train = []
for value_train in value_df_train:
    feature_dict = {}
    for i in range(0, len(headers_train)):
        if headers_train[i]=='category':
            feature_dict[headers_train[i]] = str(value_train[i])
        else:
            feature_dict[headers_train[i]] = value_train[i]
    feature_list_train.append(feature_dict)
print(len(feature_list_train), ' ', len(feature_list_train[0]))

X_train = vec.fit_transform(feature_list_train)
print("info of X_train:", X_train.shape)
print("y_train::", Counter(y_train))

# testing data
test_data[['category']] = test_data[['category']].astype(object)
y_test = test_data['label']
X_test = test_data.drop(columns=['label'])

headers_test = list(X_test.columns)
value_df_test = X_test.values
# print(headers)
# print(value_df.shape)
feature_list_test = []
for value_test in value_df_test:
    feature_dict = {}
    for i in range(0, len(headers_test)):
        if headers_test[i]=='category':
            feature_dict[headers_test[i]] = str(value_test[i])
        else:
            feature_dict[headers_test[i]] = value_test[i]
    feature_list_test.append(feature_dict)
print(len(feature_list_test), ' ', len(feature_list_test[0]))

X_test = vec.transform(feature_list_test)
print("info of X_test:", X_test.shape)
print("y_test::", Counter(y_test))

**** (17539, 20)
17539   20
info of X_train: (17539, 22)
y_train:: Counter({1: 16551, 0: 988})
833   20
info of X_test: (833, 22)
y_test:: Counter({0: 683, 1: 150})


## 3、获取需要预测的数据

In [236]:
def get_predict_data(connection):
    """
    获取需要预测的数据，包括训练集中的特征
    :param connection:
    :return:
    """
    sql_select = """
    SELECT bys_cn, hindex_cn,a_conf+a_journal as a_paper, b_conf + b_journal as b_paper,c_conf + c_journal as c_paper,papernum2017, papernum2016, papernum2015, papernum2014, papernum2013,num_journal,num_conference, project_num, degree, pagerank,degree_centrality,last_year - first_year as diff_year , coauthors_top10000, coauthors_top20000, coauthors_top30000, category 
    FROM classifier_isTeacher_xgbc WHERE label is null and category is not null
    """
    df = pd.read_sql_query(sql_select, connection)
    all_features = ['bys_cn', 'hindex_cn', 'a_paper', 'b_paper', 'c_paper', 'papernum2017', 'papernum2016', 'papernum2015', 'papernum2014', 'papernum2013', 'num_journal', 'num_conference',  'degree', 'pagerank', 'degree_centrality', 'diff_year', 'coauthors_top10000', 'coauthors_top20000', 'coauthors_top30000', 'category']
    data = df[all_features]
    return data

data_test = get_predict_data(connection)
print("shape of data_test:", data_test.shape)
print("data_test.info():", data_test.info())

shape of data_test: (181057, 20)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181057 entries, 0 to 181056
Data columns (total 20 columns):
bys_cn                176811 non-null float64
hindex_cn             180624 non-null float64
a_paper               181057 non-null int64
b_paper               181057 non-null int64
c_paper               181057 non-null int64
papernum2017          181057 non-null int64
papernum2016          181057 non-null int64
papernum2015          181057 non-null int64
papernum2014          181057 non-null int64
papernum2013          181057 non-null int64
num_journal           181057 non-null int64
num_conference        181057 non-null int64
degree                180847 non-null float64
pagerank              180847 non-null float64
degree_centrality     180847 non-null float64
diff_year             180847 non-null float64
coauthors_top10000    181057 non-null int64
coauthors_top20000    181057 non-null int64
coauthors_top30000    181057 non-null int64
category

## 4、处理需要预测的数据

In [237]:
# 使用0进行填充
data_test_fill = data_test.fillna(0)
print("info of data_test_fill::", data_test_fill.info())

# 因为fDictVectorizer().fit_transform()需要的参数时list of dict，所以这里将其转化
headers_test_all = list(data_test_fill.columns)
value_df_train_all = data_test_fill.values
feature_list_train_all = []
for value_test_all in value_df_train_all:
    feature_dict = {}
    for i in range(0, len(headers_test_all)):
        if headers_test_all[i]=='category':
            feature_dict[headers_test_all[i]] = str(value_test_all[i])
        else:
            feature_dict[headers_test_all[i]] = value_test_all[i]
    feature_list_train_all.append(feature_dict)
print(len(feature_list_train_all), ' ', len(feature_list_train_all[0]))

X_test_all = vec.transform(feature_list_train_all)

print("shape of X_test_all::", X_test_all.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181057 entries, 0 to 181056
Data columns (total 20 columns):
bys_cn                181057 non-null float64
hindex_cn             181057 non-null float64
a_paper               181057 non-null int64
b_paper               181057 non-null int64
c_paper               181057 non-null int64
papernum2017          181057 non-null int64
papernum2016          181057 non-null int64
papernum2015          181057 non-null int64
papernum2014          181057 non-null int64
papernum2013          181057 non-null int64
num_journal           181057 non-null int64
num_conference        181057 non-null int64
degree                181057 non-null float64
pagerank              181057 non-null float64
degree_centrality     181057 non-null float64
diff_year             181057 non-null float64
coauthors_top10000    181057 non-null int64
coauthors_top20000    181057 non-null int64
coauthors_top30000    181057 non-null int64
category              181057 non-null int

### 5、Filter方法

（1）特征选择使用方差分析

（2）分类器使用XGboost

In [238]:
def train_XGBC_var_select(X_train, y_train, X_test, y_test, var_threshold):
    
    print("接收到的threshold是：%d" % var_threshold)
    
    print("X_train.shape", X_train.shape)
    print("X_test.shape", X_test.shape)
    
    # 方差选择，只选择连续特征，离散特征不做选择
    vt = VarianceThreshold(threshold=var_threshold)
    X_continuous_train_new = vt.fit_transform(X_train[:, 0:-3].A)
    print(vt.variances_)
    print("shape of X_continuous_train_new::", X_continuous_train_new.shape)

    # 归一化
    ss = StandardScaler(with_mean=False)
    X_continuous_train_new = ss.fit_transform(X_continuous_train_new)
    print("type of X_continuous_train_new::", type(X_continuous_train_new))
    print("shape of X_continuous_train_new::", X_continuous_train_new.shape)

    # 将连续值和离散值拼接
    X_train = np.hstack((X_continuous_train_new, X_train[:, -3:].A))   # .A转化为.numpy.ndarray
    print("shape of X_train::", X_train.shape)
    
     # 方差选择，只选择连续特征，离散特征不做选择
    X_continuous_test_new = vt.transform(X_test[:, 0:-3])
    print(vt.variances_)
    print("shape of X_continuous_test_new::", X_continuous_test_new.shape)

    # 归一化
    X_continuous_test_new = ss.transform(X_continuous_test_new)
    print("type of X_continuous_test_new::", type(X_continuous_test_new))
    print("shape of X_continuous_test_new::", X_continuous_test_new.shape)

    # 将连续值和离散值拼接
    X_test = np.hstack((X_continuous_test_new.A, X_test[:, -3:].A))
    print("shape of X_test::", X_test.shape)

    # 划分训练集和测试集
    print("shape of X_train::", X_train.shape)
    print("shape of X_test::", X_test.shape)
    print("shape of y_train::", y_train.shape)
    print("Counter of y_train::", Counter(y_train))
    print("shape of y_test::", y_test.shape)
    print("Counter of y_test::", Counter(y_test))
    
    
    xgbc = XGBClassifier()
    xgbc.fit(X_train, y_train)
    y_test_predict = xgbc.predict(X_test)
    print(classification_report(y_test_predict, y_test, target_names=['0', '1'], digits=5))
    

# 网格搜索 
for var_threshold in range(0, 10, 2):
    X_train_copy = deepcopy(X_train)    
    X_test_copy = deepcopy(X_test)
    y_train_copy = deepcopy(y_train)
    y_test_copy = deepcopy(y_test)
    print()
    print("**************threshold is::%f******************" % var_threshold)
    train_XGBC_var_select(X_train_copy, y_train_copy, X_test_copy, y_test_copy, var_threshold)  


**************threshold is::0.000000******************
接收到的threshold是：0
X_train.shape (17539, 22)
X_test.shape (833, 22)
[2.56106580e+01 2.25313360e+01 4.17984507e+05 7.58498276e+01
 2.42543605e-01 1.86447944e-01 2.23900224e-01 2.43022507e+00
 5.05146466e+00 7.85393791e+00 1.08043618e+04 8.21636764e+01
 4.72188995e+01 2.53833987e+01 6.17049224e+02 2.90248006e+03
 2.16394958e+02 2.54135433e+01 2.82871362e+01]
shape of X_continuous_train_new:: (17539, 19)
type of X_continuous_train_new:: <class 'numpy.ndarray'>
shape of X_continuous_train_new:: (17539, 19)
shape of X_train:: (17539, 22)
[2.56106580e+01 2.25313360e+01 4.17984507e+05 7.58498276e+01
 2.42543605e-01 1.86447944e-01 2.23900224e-01 2.43022507e+00
 5.05146466e+00 7.85393791e+00 1.08043618e+04 8.21636764e+01
 4.72188995e+01 2.53833987e+01 6.17049224e+02 2.90248006e+03
 2.16394958e+02 2.54135433e+01 2.82871362e+01]
shape of X_continuous_test_new:: (833, 19)
type of X_continuous_test_new:: <class 'scipy.sparse.csr.csr_matrix'>
sha

In [240]:
# 若是特征非常多，则可以考虑去除对性能影响最小的特征。这里特征不是很多，不在进行去除，只是为了再熟悉下流程，这里写了一遍方差分析。
def train_and_test_XGBC_var_select(X_train, y_train, X_test, y_test, X_test_all, var_threshold = 0):
    
    print("接收到的threshold是：%d" % var_threshold)
    
    print("X_train.shape", X_train.shape)
    print("X_test.shape", X_test.shape)
    
    # 方差选择，只选择连续特征，离散特征不做选择
    vt = VarianceThreshold(threshold=var_threshold)
    X_continuous_train_new = vt.fit_transform(X_train[:, 0:-3].A)
    print(vt.variances_)
    print("shape of X_continuous_train_new::", X_continuous_train_new.shape)

    # 归一化
    ss = StandardScaler(with_mean=False)
    X_continuous_train_new = ss.fit_transform(X_continuous_train_new)
    print("type of X_continuous_train_new::", type(X_continuous_train_new))
    print("shape of X_continuous_train_new::", X_continuous_train_new.shape)

    # 将连续值和离散值拼接
    X_train = np.hstack((X_continuous_train_new, X_train[:, -3:].A))   # .A转化为.numpy.ndarray
    print("shape of X_train::", X_train.shape)
    
     # 方差选择，只选择连续特征，离散特征不做选择
    X_continuous_test_new = vt.transform(X_test[:, 0:-3])
    print(vt.variances_)
    print("shape of X_continuous_test_new::", X_continuous_test_new.shape)

    # 归一化
    X_continuous_test_new = ss.transform(X_continuous_test_new)
    print("type of X_continuous_test_new::", type(X_continuous_test_new))
    print("shape of X_continuous_test_new::", X_continuous_test_new.shape)

    # 将连续值和离散值拼接
    X_test = np.hstack((X_continuous_test_new.A, X_test[:, -3:].A))
    print("shape of X_test::", X_test.shape)

    # 划分训练集和测试集
    print("shape of X_train::", X_train.shape)
    print("shape of X_test::", X_test.shape)
    print("shape of y_train::", y_train.shape)
    print("Counter of y_train::", Counter(y_train))
    print("shape of y_test::", y_test.shape)
    print("Counter of y_test::", Counter(y_test))
    
    
    xgbc = XGBClassifier()
    xgbc.fit(X_train, y_train)
    y_test_predict = xgbc.predict(X_test)
    print(classification_report(y_test_predict, y_test, target_names=['0', '1'], digits=5))
    
    
    # 测试集上，方差选择，只选择连续特征，离散特征不做选择
    X_continuous_test_all = vt.transform(X_test_all[:, 0:-3])
    print(vt.variances_)
    print("shape of X_continuous_test_all::", X_continuous_test_all.shape)

    # 归一化
    X_continuous_test_all = ss.transform(X_continuous_test_all)
    print("type of X_continuous_test_all::", type(X_continuous_test_all))
    print("shape of X_continuous_test_all::", X_continuous_test_all.shape)
    
    # 将连续值和离散值拼接
    print("shape:::", X_test_all[:, -3:].shape)
    X_test_all = np.hstack((X_continuous_test_all.A, X_test_all[:, -3:].A))
    print("shape of X_test::", X_test_all.shape)
    
    y_predict = xgbc.predict(X_test_all)
    print("y_predict::", Counter(y_predict))
    
# 调用预测函数
X_train_copy = deepcopy(X_train)    
X_test_copy = deepcopy(X_test)
y_train_copy = deepcopy(y_train)
y_test_copy = deepcopy(y_test)
train_and_test_XGBC_var_select(X_train_copy, y_train_copy, X_test_copy, y_test_copy, X_test_all)

接收到的threshold是：0
X_train.shape (17539, 22)
X_test.shape (833, 22)
[2.56106580e+01 2.25313360e+01 4.17984507e+05 7.58498276e+01
 2.42543605e-01 1.86447944e-01 2.23900224e-01 2.43022507e+00
 5.05146466e+00 7.85393791e+00 1.08043618e+04 8.21636764e+01
 4.72188995e+01 2.53833987e+01 6.17049224e+02 2.90248006e+03
 2.16394958e+02 2.54135433e+01 2.82871362e+01]
shape of X_continuous_train_new:: (17539, 19)
type of X_continuous_train_new:: <class 'numpy.ndarray'>
shape of X_continuous_train_new:: (17539, 19)
shape of X_train:: (17539, 22)
[2.56106580e+01 2.25313360e+01 4.17984507e+05 7.58498276e+01
 2.42543605e-01 1.86447944e-01 2.23900224e-01 2.43022507e+00
 5.05146466e+00 7.85393791e+00 1.08043618e+04 8.21636764e+01
 4.72188995e+01 2.53833987e+01 6.17049224e+02 2.90248006e+03
 2.16394958e+02 2.54135433e+01 2.82871362e+01]
shape of X_continuous_test_new:: (833, 19)
type of X_continuous_test_new:: <class 'scipy.sparse.csr.csr_matrix'>
shape of X_continuous_test_new:: (833, 19)
shape of X_test:

### 6、Wrapper方法

#### （1）RFE

RFE和方差分析一样，去掉特征必然会带来性能的降低。

In [241]:
def train_XGBC_rfe_select(X_train, y_train, X_test, y_test, n_features_to_select):
    
    print("接收到的n_features_to_select是：%d" % n_features_to_select)
    
    print("X_train.shape", X_train.shape)
    print("X_test.shape", X_test.shape)
    
    # RFE特征选择
    estimator = XGBClassifier()
    selector = RFE(estimator=estimator, n_features_to_select = n_features_to_select)
    X_train_rfe = selector.fit_transform(X_train, y_train) 
    selected_idx = np.where(pd.Series(selector.support_)==True)[0]   # n_features_to_select个选择出来的特征，每一个特征为True
    print("selector.support_::", selector.support_)
    
    print(selected_idx)
    
    # 因为Wrapper离散特征和连续特征需要一起训练搜索特征子集，但是因为离散特征不需要标准化，所以这里需要将其分开
    discrete_idx = list(set([19, 20, 21]) - set(selected_idx))   # 最后3列为离散值
    X_continuous_train_tmp = X_train.A[:, list(set(selected_idx) - set([19, 20, 21]))]
    X_discreate_train_tmp = X_train.A[:, discrete_idx]
    
    X_continuous_test_tmp = X_test.A[:, list(set(selected_idx) - set([19, 20, 21]))]
    X_discreate_test_tmp = X_test.A[:, discrete_idx]
    
    # 归一化
    ss = StandardScaler()
    X_continuous_train_tmp = ss.fit_transform(X_continuous_train_tmp)
    print("type of X_continuous_train_tmp::", type(X_continuous_train_tmp))
    print("shape of X_continuous_train_tmp::", X_continuous_train_tmp.shape)

    # 将连续值和离散值拼接
    X_train_new = np.hstack((X_continuous_train_tmp, X_discreate_train_tmp))
    print("shape of X_train_new::", X_train_new.shape)
    
    # testing data 归一化
    X_continuous_test_tmp = ss.fit_transform(X_continuous_test_tmp)
    print("type of X_continuous_test_tmp::", type(X_continuous_test_tmp))
    print("shape of X_continuous_test_tmp::", X_continuous_test_tmp.shape)

    # 将连续值和离散值拼接
    X_test_new = np.hstack((X_continuous_test_tmp, X_discreate_test_tmp))
    print("shape of X_test_new::", X_test_new.shape)

    # 划分训练集和测试集
    print("shape of X_train_new::", X_train_new.shape)
    print("shape of X_test_new::", X_test_new.shape)
    print("shape of y_train::", y_train.shape)
    print("Counter of y_train::", Counter(y_train))
    print("shape of y_test::", y_test.shape)
    print("Counter of y_test::", Counter(y_test))
    
    xgbc = XGBClassifier()
    xgbc.fit(X_train_new, y_train)
    y_test_predict = xgbc.predict(X_test_new)
    print(classification_report(y_test_predict, y_test, target_names=['0', '1'], digits=5))
    
for n_features_to_select in range(20, 10, -1):    # 连续特征共有19个
    X_train_copy = deepcopy(X_train)    
    X_test_copy = deepcopy(X_test)
    y_train_copy = deepcopy(y_train)
    y_test_copy = deepcopy(y_test)
    print()
    print("**************n_features_to_select is::%f******************" % n_features_to_select)
    train_XGBC_rfe_select(X_train_copy, y_train_copy, X_test_copy, y_test_copy, n_features_to_select)  


**************n_features_to_select is::20.000000******************
接收到的n_features_to_select是：20
X_train.shape (17539, 22)
X_test.shape (833, 22)
selector.support_:: [ True  True  True  True  True False  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True False]
[ 0  1  2  3  4  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20]
type of X_continuous_train_tmp:: <class 'numpy.ndarray'>
shape of X_continuous_train_tmp:: (17539, 18)
shape of X_train_new:: (17539, 19)
type of X_continuous_test_tmp:: <class 'numpy.ndarray'>
shape of X_continuous_test_tmp:: (833, 18)
shape of X_test_new:: (833, 19)
shape of X_train_new:: (17539, 19)
shape of X_test_new:: (833, 19)
shape of y_train:: (17539,)
Counter of y_train:: Counter({1: 16551, 0: 988})
shape of y_test:: (833,)
Counter of y_test:: Counter({0: 683, 1: 150})
              precision    recall  f1-score   support

           0    0.00000   0.00000   0.00000         0
           1    1.00000   0.18007   0.30519 

selector.support_:: [ True False  True  True  True False  True  True  True  True  True False
  True  True False  True False False  True False False False]
[ 0  2  3  4  6  7  8  9 10 12 13 15 18]
type of X_continuous_train_tmp:: <class 'numpy.ndarray'>
shape of X_continuous_train_tmp:: (17539, 13)
shape of X_train_new:: (17539, 16)
type of X_continuous_test_tmp:: <class 'numpy.ndarray'>
shape of X_continuous_test_tmp:: (833, 13)
shape of X_test_new:: (833, 16)
shape of X_train_new:: (17539, 16)
shape of X_test_new:: (833, 16)
shape of y_train:: (17539,)
Counter of y_train:: Counter({1: 16551, 0: 988})
shape of y_test:: (833,)
Counter of y_test:: Counter({0: 683, 1: 150})
              precision    recall  f1-score   support

           0    0.00000   0.00000   0.00000         0
           1    1.00000   0.18007   0.30519       833

   micro avg    0.18007   0.18007   0.18007       833
   macro avg    0.50000   0.09004   0.15259       833
weighted avg    1.00000   0.18007   0.30519     

In [242]:
# 从上面的分析过程可以知道，RFE没什么作用，这里取15
def train_and_test_XGBC_rfe_select(X_train, y_train, X_test, y_test, X_test_all, n_features_to_select=15):
    
    print("接收到的n_features_to_select是：%d" % n_features_to_select)
    
    print("X_train.shape", X_train.shape)
    print("X_test.shape", X_test.shape)
    
    # RFE特征选择
    estimator = XGBClassifier()
    selector = RFE(estimator=estimator, n_features_to_select = n_features_to_select)
    X_train_rfe = selector.fit_transform(X_train, y_train) 
    selected_idx = np.where(pd.Series(selector.support_)==True)[0]   # n_features_to_select个选择出来的特征，每一个特征为True
    print("selector.support_::", selector.support_)
    
    print(selected_idx)
    
    # 因为Wrapper离散特征和连续特征需要一起训练搜索特征子集，但是因为离散特征不需要标准化，所以这里需要将其分开
    discrete_idx = list(set([19, 20, 21]) - set(selected_idx))   # 最后3列为离散值
    X_continuous_train_tmp = X_train.A[:, list(set(selected_idx) - set([19, 20, 21]))]
    X_discreate_train_tmp = X_train.A[:, discrete_idx]
    
    X_continuous_test_tmp = X_test.A[:, list(set(selected_idx) - set([19, 20, 21]))]
    X_discreate_test_tmp = X_test.A[:, discrete_idx]
    
    X_continuous_test_all_tmp = X_test_all.A[:, list(set(selected_idx) - set([19, 20, 21]))]
    X_discreate_test_all_tmp = X_test_all.A[:, discrete_idx]
    
    # 归一化
    ss = StandardScaler()
    X_continuous_train_tmp = ss.fit_transform(X_continuous_train_tmp)
    print("type of X_continuous_train_tmp::", type(X_continuous_train_tmp))
    print("shape of X_continuous_train_tmp::", X_continuous_train_tmp.shape)

    # 将连续值和离散值拼接
    X_train_new = np.hstack((X_continuous_train_tmp, X_discreate_train_tmp))
    print("shape of X_train_new::", X_train_new.shape)
    
    # testing data 归一化
    X_continuous_test_tmp = ss.fit_transform(X_continuous_test_tmp)
    print("type of X_continuous_test_tmp::", type(X_continuous_test_tmp))
    print("shape of X_continuous_test_tmp::", X_continuous_test_tmp.shape)

    # 将连续值和离散值拼接
    X_test_new = np.hstack((X_continuous_test_tmp, X_discreate_test_tmp))
    print("shape of X_test_new::", X_test_new.shape)
    
    # new data 归一化
    X_continuous_test_all_tmp = ss.fit_transform(X_continuous_test_all_tmp)
    print("type of X_continuous_test_all_tmp::", type(X_continuous_test_all_tmp))
    print("shape of X_continuous_test_all_tmp::", X_continuous_test_all_tmp.shape)

    # 将连续值和离散值拼接
    X_test_new_all = np.hstack((X_continuous_test_all_tmp, X_discreate_test_all_tmp))
    print("shape of X_test_new_all::", X_test_new_all.shape)

    # 划分训练集和测试集
    print("shape of X_train_new::", X_train_new.shape)
    print("shape of X_test_new::", X_test_new.shape)
    print("shape of y_train::", y_train.shape)
    print("Counter of y_train::", Counter(y_train))
    print("shape of y_test::", y_test.shape)
    print("Counter of y_test::", Counter(y_test))
    
    xgbc = XGBClassifier()
    xgbc.fit(X_train_new, y_train)
    y_test_predict = xgbc.predict(X_test_new)
    print(classification_report(y_test_predict, y_test, target_names=['0', '1'], digits=5))
    
    y_predict = xgbc.predict(X_test_new_all)
    print("y_predict::", Counter(y_predict))

X_train_copy = deepcopy(X_train)    
X_test_copy = deepcopy(X_test)
y_train_copy = deepcopy(y_train)
y_test_copy = deepcopy(y_test)
X_test_all = deepcopy(X_test_all)
print()
print("**************n_features_to_select is::%f******************" % n_features_to_select)
train_and_test_XGBC_rfe_select(X_train_copy, y_train_copy, X_test_copy, y_test_copy, X_test_all, n_features_to_select=15)  


**************n_features_to_select is::11.000000******************
接收到的n_features_to_select是：15
X_train.shape (17539, 22)
X_test.shape (833, 22)
selector.support_:: [ True  True  True  True  True False  True  True  True  True  True  True
  True  True False  True False False  True False False False]
[ 0  1  2  3  4  6  7  8  9 10 11 12 13 15 18]
type of X_continuous_train_tmp:: <class 'numpy.ndarray'>
shape of X_continuous_train_tmp:: (17539, 15)
shape of X_train_new:: (17539, 18)
type of X_continuous_test_tmp:: <class 'numpy.ndarray'>
shape of X_continuous_test_tmp:: (833, 15)
shape of X_test_new:: (833, 18)
type of X_continuous_test_all_tmp:: <class 'numpy.ndarray'>
shape of X_continuous_test_all_tmp:: (181057, 15)
shape of X_test_new_all:: (181057, 18)
shape of X_train_new:: (17539, 18)
shape of X_test_new:: (833, 18)
shape of y_train:: (17539,)
Counter of y_train:: Counter({1: 16551, 0: 988})
shape of y_test:: (833,)
Counter of y_test:: Counter({0: 683, 1: 150})
              preci

### 6、Wrapper方法

#### （2）RFECV

使用交叉验证来保留最佳性能的特征。不过这里的交叉验证的数据集切割对象不再是行数据（样本），而是列数据（特征），同时学习器本身不变，最终得到不同特征对于score的重要程度，然后保留最佳的特征组合。其分割方式类似于随机森林中的列上子采样。

In [243]:
def train_XGBC_rfecv_select(X_train, y_train, X_test, y_test, step=1, n_splits=3):
    
    # RFECV
    estimator = XGBClassifier()
    selector = RFECV(estimator=estimator, step = step, cv=StratifiedKFold(n_splits=n_splits), scoring="accuracy")
    X_train_rfecv = selector.fit_transform(X_train, y_train) 
    print("Optimal number of features::%d" % selector.n_features_)
    print("Ranking of features:: %s" % selector.ranking_)
    selected_idx = np.where(pd.Series(selector.support_)==True)[0]   # n_features_to_select个选择出来的特征，每一个特征为True
    print("selector.support_::", selector.support_)
    
    # 因为Wrapper离散特征和连续特征需要一起训练搜索特征子集，但是因为离散特征不需要标准化，所以这里需要将其分开
    discrete_idx = list(set([19, 20, 21]) - set(selected_idx))   # 最后3列为离散值
    X_continuous_train_tmp = X_train.A[:, list(set(selected_idx) - set([19, 20, 21]))]
    X_discreate_train_tmp = X_train.A[:, discrete_idx]
    
    X_continuous_test_tmp = X_test.A[:, list(set(selected_idx) - set([19, 20, 21]))]
    X_discreate_test_tmp = X_test.A[:, discrete_idx]
    
    # 归一化
    ss = StandardScaler()
    X_continuous_train_tmp = ss.fit_transform(X_continuous_train_tmp)
    print("type of X_continuous_train_tmp::", type(X_continuous_train_tmp))
    print("shape of X_continuous_train_tmp::", X_continuous_train_tmp.shape)

    # 将连续值和离散值拼接
    X_train_new = np.hstack((X_continuous_train_tmp, X_discreate_train_tmp))
    print("shape of X_train_new::", X_train_new.shape)
    
    # testing data 归一化
    X_continuous_test_tmp = ss.fit_transform(X_continuous_test_tmp)
    print("type of X_continuous_test_tmp::", type(X_continuous_test_tmp))
    print("shape of X_continuous_test_tmp::", X_continuous_test_tmp.shape)

    # 将连续值和离散值拼接
    X_test_new = np.hstack((X_continuous_test_tmp, X_discreate_test_tmp))
    print("shape of X_test_new::", X_test_new.shape)

    # 划分训练集和测试集
    print("shape of X_train_new::", X_train_new.shape)
    print("shape of X_test_new::", X_test_new.shape)
    print("shape of y_train::", y_train.shape)
    print("Counter of y_train::", Counter(y_train))
    print("shape of y_test::", y_test.shape)
    print("Counter of y_test::", Counter(y_test))
    
    xgbc = XGBClassifier()
    xgbc.fit(X_train_new, y_train)
    y_test_predict = xgbc.predict(X_test_new)
    print(classification_report(y_test_predict, y_test, target_names=['0', '1'], digits=5))
    
for step in range(1, 5, 2):    # 连续特征共有19个
    for n_splits in range(2, 4, 2):
        X_train_copy = deepcopy(X_train)    
        X_test_copy = deepcopy(X_test)
        y_train_copy = deepcopy(y_train)
        y_test_copy = deepcopy(y_test)
        print()
        print("**************step is::%d, n_splits::%d *****************" % (step, n_splits))
        train_XGBC_rfecv_select(X_train_copy, y_train_copy, X_test_copy, y_test_copy, step, n_splits)  


**************step is::1, n_splits::2 *****************
Optimal number of features::12
Ranking of features:: [ 1  4  2  1  1 11  1  1  1  1  1  3  1  1  7  1  8  9  1  5  6 10]
selector.support_:: [ True False False  True  True False  True  True  True  True  True False
  True  True False  True False False  True False False False]
type of X_continuous_train_tmp:: <class 'numpy.ndarray'>
shape of X_continuous_train_tmp:: (17539, 12)
shape of X_train_new:: (17539, 15)
type of X_continuous_test_tmp:: <class 'numpy.ndarray'>
shape of X_continuous_test_tmp:: (833, 12)
shape of X_test_new:: (833, 15)
shape of X_train_new:: (17539, 15)
shape of X_test_new:: (833, 15)
shape of y_train:: (17539,)
Counter of y_train:: Counter({1: 16551, 0: 988})
shape of y_test:: (833,)
Counter of y_test:: Counter({0: 683, 1: 150})
              precision    recall  f1-score   support

           0    0.00000   0.00000   0.00000         0
           1    1.00000   0.18007   0.30519       833

   micro avg    0.1

In [244]:
# 通过观察可以发现，rfecv的效果也不是很好
def train_and_test_XGBC_rfecv_select(X_train, y_train, X_test, y_test, X_test_all, step=1, n_splits=3):
    
    # RFECV
    estimator = XGBClassifier()
    selector = RFECV(estimator=estimator, step = step, cv=StratifiedKFold(n_splits=n_splits), scoring="accuracy")
    X_train_rfecv = selector.fit_transform(X_train, y_train) 
    print("Optimal number of features::%d" % selector.n_features_)
    print("Ranking of features:: %s" % selector.ranking_)
    selected_idx = np.where(pd.Series(selector.support_)==True)[0]   # n_features_to_select个选择出来的特征，每一个特征为True
    print("selector.support_::", selector.support_)
    
    # 因为Wrapper离散特征和连续特征需要一起训练搜索特征子集，但是因为离散特征不需要标准化，所以这里需要将其分开
    discrete_idx = list(set([19, 20, 21]) - set(selected_idx))   # 最后3列为离散值
    X_continuous_train_tmp = X_train.A[:, list(set(selected_idx) - set([19, 20, 21]))]
    X_discreate_train_tmp = X_train.A[:, discrete_idx]
    
    X_continuous_test_tmp = X_test.A[:, list(set(selected_idx) - set([19, 20, 21]))]
    X_discreate_test_tmp = X_test.A[:, discrete_idx]
    
    X_continuous_test_all_tmp = X_test_all.A[:, list(set(selected_idx) - set([19, 20, 21]))]
    X_discreate_test_all_tmp = X_test_all.A[:, discrete_idx]
    
    # 归一化
    ss = StandardScaler()
    X_continuous_train_tmp = ss.fit_transform(X_continuous_train_tmp)
    print("type of X_continuous_train_tmp::", type(X_continuous_train_tmp))
    print("shape of X_continuous_train_tmp::", X_continuous_train_tmp.shape)

    # 将连续值和离散值拼接
    X_train_new = np.hstack((X_continuous_train_tmp, X_discreate_train_tmp))
    print("shape of X_train_new::", X_train_new.shape)
    
    # testing data 归一化
    X_continuous_test_tmp = ss.fit_transform(X_continuous_test_tmp)
    print("type of X_continuous_test_tmp::", type(X_continuous_test_tmp))
    print("shape of X_continuous_test_tmp::", X_continuous_test_tmp.shape)

    # 将连续值和离散值拼接
    X_test_new = np.hstack((X_continuous_test_tmp, X_discreate_test_tmp))
    print("shape of X_test_new::", X_test_new.shape)

    # 划分训练集和测试集
    print("shape of X_train_new::", X_train_new.shape)
    print("shape of X_test_new::", X_test_new.shape)
    print("shape of y_train::", y_train.shape)
    print("Counter of y_train::", Counter(y_train))
    print("shape of y_test::", y_test.shape)
    print("Counter of y_test::", Counter(y_test))
    
    xgbc = XGBClassifier()
    xgbc.fit(X_train_new, y_train)
    y_test_predict = xgbc.predict(X_test_new)
    print(classification_report(y_test_predict, y_test, target_names=['0', '1'], digits=5))
    
    # new data 归一化
    X_continuous_test_all_tmp = ss.fit_transform(X_continuous_test_all_tmp)
    print("type of X_continuous_test_all_tmp::", type(X_continuous_test_all_tmp))
    print("shape of X_continuous_test_all_tmp::", X_continuous_test_all_tmp.shape)

    # 将连续值和离散值拼接
    X_test_new_all = np.hstack((X_continuous_test_all_tmp, X_discreate_test_all_tmp))
    print("shape of X_test_new_all::", X_test_new_all.shape)
    
    y_predict = xgbc.predict(X_test_new_all)
    print("y_predict::", Counter(y_predict))
    

X_train_copy = deepcopy(X_train)    
X_test_copy = deepcopy(X_test)
y_train_copy = deepcopy(y_train)
y_test_copy = deepcopy(y_test)
X_test_all_copy = deepcopy(X_test_all)
train_and_test_XGBC_rfecv_select(X_train_copy, y_train_copy, X_test_copy, y_test_copy, X_test_all_copy)  

Optimal number of features::9
Ranking of features:: [ 1  7  5  2  1 14  1  4  1  1  1  6  1  1 10  1 11 12  3  8  9 13]
selector.support_:: [ True False False False  True False  True False  True  True  True False
  True  True False  True False False False False False False]
type of X_continuous_train_tmp:: <class 'numpy.ndarray'>
shape of X_continuous_train_tmp:: (17539, 9)
shape of X_train_new:: (17539, 12)
type of X_continuous_test_tmp:: <class 'numpy.ndarray'>
shape of X_continuous_test_tmp:: (833, 9)
shape of X_test_new:: (833, 12)
shape of X_train_new:: (17539, 12)
shape of X_test_new:: (833, 12)
shape of y_train:: (17539,)
Counter of y_train:: Counter({1: 16551, 0: 988})
shape of y_test:: (833,)
Counter of y_test:: Counter({0: 683, 1: 150})
              precision    recall  f1-score   support

           0    0.00000   0.00000   0.00000         0
           1    1.00000   0.18007   0.30519       833

   micro avg    0.18007   0.18007   0.18007       833
   macro avg    0.50000  

### 7、Embedded方法

使用L1正则项实现特征选择

In [245]:
def train_SVC_l1(X_train, y_train, X_test, y_test,  C=0.01):
    
    # 归一化
    ss = StandardScaler(with_mean=False)
    X_continuous_train_new = ss.fit_transform(X_train[:, 0: -3])
    print("type of X_continuous_train_new::", type(X_continuous_train_new))
    print("shape of X_continuous_train_new::", X_continuous_train_new.shape)

    # 将连续值和离散值拼接
    X_train = np.hstack((X_continuous_train_new.A, X_train[:, -3:].A))   # .A转化为.numpy.ndarray
    print("shape of X_train::", X_train.shape)

    # 归一化
    X_continuous_test_new = ss.transform(X_test[:, 0: -3])
    print("type of X_continuous_test_new::", type(X_continuous_test_new))
    print("shape of X_continuous_test_new::", X_continuous_test_new.shape)

    # 将连续值和离散值拼接
    X_test = np.hstack((X_continuous_test_new.A, X_test[:, -3:].A))
    print("shape of X_test::", X_test.shape)

    # 划分训练集和测试集
    print("shape of X_train::", X_train.shape)
    print("shape of X_test::", X_test.shape)
    print("shape of y_train::", y_train.shape)
    print("Counter of y_train::", Counter(y_train))
    print("shape of y_test::", y_test.shape)
    print("Counter of y_test::", Counter(y_test))
    
    xgbc = LinearSVC(C=C, penalty="l1", dual=False)
    xgbc.fit(X_train, y_train)
    y_test_predict = xgbc.predict(X_test)
    print(classification_report(y_test_predict, y_test, target_names=['0', '1'], digits=10))

# 网格搜索
"""
连续特征共有19个，
第一次网格搜索：range(0, 300, 30)，然后定位到：[1, 1.4];
第二次网格搜索：range(100, 140, 10)，然后定位到：[1, 1.2];
第三次网格搜索：range(100, 120, 5)，然后定位到：[1.1];
"""
for C in range(100, 120, 5):    
    print()
    param = C/100
    X_train_copy = deepcopy(X_train)    
    X_test_copy = deepcopy(X_test)
    y_train_copy = deepcopy(y_train)
    y_test_copy = deepcopy(y_test)
    print("**************C is::%f*****************" % param)
    train_SVC_l1(X_train_copy, y_train_copy, X_test_copy, y_test_copy, param)  


**************C is::1.000000*****************
type of X_continuous_train_new:: <class 'scipy.sparse.csr.csr_matrix'>
shape of X_continuous_train_new:: (17539, 19)
shape of X_train:: (17539, 22)
type of X_continuous_test_new:: <class 'scipy.sparse.csr.csr_matrix'>
shape of X_continuous_test_new:: (833, 19)
shape of X_test:: (833, 22)
shape of X_train:: (17539, 22)
shape of X_test:: (833, 22)
shape of y_train:: (17539,)
Counter of y_train:: Counter({1: 16551, 0: 988})
shape of y_test:: (833,)
Counter of y_test:: Counter({0: 683, 1: 150})
              precision    recall  f1-score   support

           0  0.6251830161 0.9976635514 0.7686768677       428
           1  0.9933333333 0.3679012346 0.5369369369       405

   micro avg  0.6914765906 0.6914765906 0.6914765906       833
   macro avg  0.8092581747 0.6827823930 0.6528069023       833
weighted avg  0.8041756673 0.6914765906 0.6560061931       833


**************C is::1.050000*****************
type of X_continuous_train_new:: <clas

In [246]:
# 通过上述网络搜索的结果，可以看出，到最后f1值的开始循环震荡，说明结果已经趋于稳定，我们只需要将C设置为1.55即可
def train_and_test_SVC_L1_select(X_train, y_train, X_test, y_test,  X_test_all, C=1.1):
    
    # 归一化
    ss = StandardScaler(with_mean=False)
    X_continuous_train_new = ss.fit_transform(X_train[:, 0: -3])
    print("type of X_continuous_train_new::", type(X_continuous_train_new))
    print("shape of X_continuous_train_new::", X_continuous_train_new.shape)

    # 将连续值和离散值拼接
    X_train = np.hstack((X_continuous_train_new.A, X_train[:, -3:].A))   # .A转化为.numpy.ndarray
    print("shape of X_train::", X_train.shape)

    # 归一化
    X_continuous_test_new = ss.transform(X_test[:, 0: -3])
    print("type of X_continuous_test_new::", type(X_continuous_test_new))
    print("shape of X_continuous_test_new::", X_continuous_test_new.shape)

    # 将连续值和离散值拼接
    X_test = np.hstack((X_continuous_test_new.A, X_test[:, -3:].A))
    print("shape of X_test::", X_test.shape)

    # 划分训练集和测试集
    print("shape of X_train::", X_train.shape)
    print("shape of X_test::", X_test.shape)
    print("shape of y_train::", y_train.shape)
    print("Counter of y_train::", Counter(y_train))
    print("shape of y_test::", y_test.shape)
    print("Counter of y_test::", Counter(y_test))
    
    xgbc = LinearSVC(C=C, penalty="l1", dual=False)
    xgbc.fit(X_train, y_train)
    y_test_predict = xgbc.predict(X_test)
    print(classification_report(y_test_predict, y_test, target_names=['0', '1'], digits=10))
    
    # 归一化
    X_continuous_test_all_new = ss.transform(X_test_all[:, 0: -3])
    print("type of X_continuous_test_all_new::", type(X_continuous_test_all_new))
    print("shape of X_continuous_test_all_new::", X_continuous_test_all_new.shape)

    # 将连续值和离散值拼接
    X_test_all = np.hstack((X_continuous_test_all_new.A, X_test_all[:, -3:].A))
    print("shape of X_test_all::", X_test_all.shape)
    
    y_predict = xgbc.predict(X_test_all)
    print("y_predict::", Counter(y_predict))

X_train_copy = deepcopy(X_train)    
X_test_copy = deepcopy(X_test)
y_train_copy = deepcopy(y_train)
y_test_copy = deepcopy(y_test)
X_test_all = deepcopy(X_test_all)
print("**************C is::%f*****************" % param)
train_and_test_SVC_L1_select(X_train_copy, y_train_copy, X_test_copy, y_test_copy, X_test_all)  

**************C is::1.150000*****************
type of X_continuous_train_new:: <class 'scipy.sparse.csr.csr_matrix'>
shape of X_continuous_train_new:: (17539, 19)
shape of X_train:: (17539, 22)
type of X_continuous_test_new:: <class 'scipy.sparse.csr.csr_matrix'>
shape of X_continuous_test_new:: (833, 19)
shape of X_test:: (833, 22)
shape of X_train:: (17539, 22)
shape of X_test:: (833, 22)
shape of y_train:: (17539,)
Counter of y_train:: Counter({1: 16551, 0: 988})
shape of y_test:: (833,)
Counter of y_test:: Counter({0: 683, 1: 150})
              precision    recall  f1-score   support

           0  0.6251830161 0.9976635514 0.7686768677       428
           1  0.9933333333 0.3679012346 0.5369369369       405

   micro avg  0.6914765906 0.6914765906 0.6914765906       833
   macro avg  0.8092581747 0.6827823930 0.6528069023       833
weighted avg  0.8041756673 0.6914765906 0.6560061931       833

type of X_continuous_test_all_new:: <class 'scipy.sparse.csr.csr_matrix'>
shape of X_c

### 分析：

本代码说明，在所有的特征选择方法中，针对此数据集，效果最好的是使用方差分析的情况，但是仔细观察可以发现，使用方差分析的时候是保留全部特征的，这说明，对于比较小的数据集而言，可以不进行特征选择。但是即使是最好的情况，效果也不是很好，所以仍然需要采用其他的办法提高性能。