## 功能

本py文件主要用于特征选择方法的确定和特征选择参数的确定，具体如下：

### 一、使用方差选择（Filter方法）

1、特征选择

2、使用网格搜索确定特征选择参数

3、使用xgboost训练模型

### 二、使用递归特征消除法（Wrapper方法）

1、使用RFE和RFECV进行特征选择

2、网格搜索确定保留特征数

3、使用xgboost训练模型

### 三、使用正则化（Embedded方法）

1、使用L1正则化

2、网格搜索确定正则项系数

3、使用LinearSVC训练模型


In [170]:
import pymysql
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold, RFE, RFECV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from collections import Counter
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, KFold
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, precision_recall_fscore_support
import pickle
import json

### 1、获取数据

In [171]:
connection = pymysql.Connect(
    host="10.11.6.117",
    port=3306,
    user="itoffice",
    passwd="Dase115_",
    charset="utf8",
    db="project_researchers"
)

In [172]:
def getData(connection):
    """
    查询数据，包括特征和标签
    :param connection:
    :return:
    """
    sql_select = """
    SELECT bys_cn, hindex_cn,a_conf+a_journal as a_paper, b_conf + b_journal as b_paper,c_conf + c_journal as c_paper,papernum2017, papernum2016, papernum2015, papernum2014, papernum2013,num_journal,num_conference, project_num, degree, pagerank,degree_centrality,last_year - first_year as diff_year , coauthors_top10000, coauthors_top20000, coauthors_top30000, category, label 
    FROM classifier_isTeacher_xgbc WHERE (label =1 or label = 0) and category is not null
    """
    df = pd.read_sql_query(sql_select, connection)
    all_features = ['bys_cn', 'hindex_cn', 'a_paper', 'b_paper', 'c_paper', 'papernum2017', 'papernum2016', 'papernum2015', 'papernum2014', 'papernum2013', 'num_journal', 'num_conference',  'degree', 'pagerank', 'degree_centrality', 'diff_year', 'coauthors_top10000', 'coauthors_top20000', 'coauthors_top30000', 'category', 'label']
    data = df[all_features]
    return data

data = getData(connection)
print("shape of data:", data.shape)
print("data.info():", data.info())

shape of data: (18694, 21)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18694 entries, 0 to 18693
Data columns (total 21 columns):
bys_cn                18442 non-null float64
hindex_cn             18557 non-null float64
a_paper               18694 non-null int64
b_paper               18694 non-null int64
c_paper               18694 non-null int64
papernum2017          18694 non-null int64
papernum2016          18694 non-null int64
papernum2015          18694 non-null int64
papernum2014          18694 non-null int64
papernum2013          18694 non-null int64
num_journal           18694 non-null int64
num_conference        18694 non-null int64
degree                18623 non-null float64
pagerank              18623 non-null float64
degree_centrality     18623 non-null float64
diff_year             18623 non-null float64
coauthors_top10000    18694 non-null int64
coauthors_top20000    18694 non-null int64
coauthors_top30000    18694 non-null int64
category              18694 non-nul

### 2、处理数据

In [173]:
# 对缺失值进行处理
# Method1：直接将含有缺失字段的值去掉
data = data.dropna()
print("shape of data::", data.shape)
print("data.info()::", data.info())

shape of data:: (18372, 21)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 18372 entries, 0 to 18692
Data columns (total 21 columns):
bys_cn                18372 non-null float64
hindex_cn             18372 non-null float64
a_paper               18372 non-null int64
b_paper               18372 non-null int64
c_paper               18372 non-null int64
papernum2017          18372 non-null int64
papernum2016          18372 non-null int64
papernum2015          18372 non-null int64
papernum2014          18372 non-null int64
papernum2013          18372 non-null int64
num_journal           18372 non-null int64
num_conference        18372 non-null int64
degree                18372 non-null float64
pagerank              18372 non-null float64
degree_centrality     18372 non-null float64
diff_year             18372 non-null float64
coauthors_top10000    18372 non-null int64
coauthors_top20000    18372 non-null int64
coauthors_top30000    18372 non-null int64
category              18372 non-nu

In [174]:
# 将连续值和离散值以及y分开
continuous_features = ['bys_cn', 'hindex_cn', 'a_paper', 'b_paper', 'c_paper', 'papernum2017', 'papernum2016', 'papernum2015', 'papernum2014', 'papernum2013', 'num_journal', 'num_conference',  'degree', 'pagerank', 'degree_centrality', 'diff_year', 'coauthors_top10000', 'coauthors_top20000', 'coauthors_top30000']
discrete_features = ['category']
X_continous = data[continuous_features]
X_discrete = data[discrete_features]
y = data['label']
print("info of X_continuous::", X_continous.info())
print("info of X_discrete::", X_discrete.info())
print("y::", Counter(y))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18372 entries, 0 to 18692
Data columns (total 19 columns):
bys_cn                18372 non-null float64
hindex_cn             18372 non-null float64
a_paper               18372 non-null int64
b_paper               18372 non-null int64
c_paper               18372 non-null int64
papernum2017          18372 non-null int64
papernum2016          18372 non-null int64
papernum2015          18372 non-null int64
papernum2014          18372 non-null int64
papernum2013          18372 non-null int64
num_journal           18372 non-null int64
num_conference        18372 non-null int64
degree                18372 non-null float64
pagerank              18372 non-null float64
degree_centrality     18372 non-null float64
diff_year             18372 non-null float64
coauthors_top10000    18372 non-null int64
coauthors_top20000    18372 non-null int64
coauthors_top30000    18372 non-null int64
dtypes: float64(6), int64(13)
memory usage: 2.8 MB
info of X_c

In [175]:
# 将离散值变成one-hot编码
X_discrete_oneHot = OneHotEncoder(sparse=False).fit_transform(X_discrete)
print(X_discrete_oneHot)

X_all = np.hstack((X_continous, X_discrete_oneHot))
print("shape of X_all::", X_all.shape)

[[0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 ...
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]]
shape of X_all:: (18372, 22)


## 3、获取需要预测的数据

In [176]:
def getPredictData(connection):
    """
    获取需要预测的数据，包括训练集中的特征
    :param connection:
    :return:
    """
    sql_select = """
    SELECT bys_cn, hindex_cn,a_conf+a_journal as a_paper, b_conf + b_journal as b_paper,c_conf + c_journal as c_paper,papernum2017, papernum2016, papernum2015, papernum2014, papernum2013,num_journal,num_conference, project_num, degree, pagerank,degree_centrality,last_year - first_year as diff_year , coauthors_top10000, coauthors_top20000, coauthors_top30000, category 
    FROM classifier_isTeacher_xgbc WHERE label is null and category is not null
    """
    df = pd.read_sql_query(sql_select, connection)
    all_features = ['bys_cn', 'hindex_cn', 'a_paper', 'b_paper', 'c_paper', 'papernum2017', 'papernum2016', 'papernum2015', 'papernum2014', 'papernum2013', 'num_journal', 'num_conference',  'degree', 'pagerank', 'degree_centrality', 'diff_year', 'coauthors_top10000', 'coauthors_top20000', 'coauthors_top30000', 'category']
    data = df[all_features]
    return data

data_test = getPredictData(connection)
print("shape of data_test:", data_test.shape)
print("data_test.info():", data_test.info())

shape of data_test: (181057, 20)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181057 entries, 0 to 181056
Data columns (total 20 columns):
bys_cn                176811 non-null float64
hindex_cn             180624 non-null float64
a_paper               181057 non-null int64
b_paper               181057 non-null int64
c_paper               181057 non-null int64
papernum2017          181057 non-null int64
papernum2016          181057 non-null int64
papernum2015          181057 non-null int64
papernum2014          181057 non-null int64
papernum2013          181057 non-null int64
num_journal           181057 non-null int64
num_conference        181057 non-null int64
degree                180847 non-null float64
pagerank              180847 non-null float64
degree_centrality     180847 non-null float64
diff_year             180847 non-null float64
coauthors_top10000    181057 non-null int64
coauthors_top20000    181057 non-null int64
coauthors_top30000    181057 non-null int64
category

## 4、处理需要预测的数据

In [177]:
# 使用0进行填充
data_test_fill = data_test.fillna(0)
print("info of data_test_fill::", data_test_fill.info())

# 将连续特征和离散特征区分开
X_test_continous = data_test_fill[continuous_features]
X_test_discrete = data_test_fill[discrete_features]

# 离散特征使用one-hot编码
X_test_discrete_oneHot = OneHotEncoder(sparse=False).fit_transform(X_test_discrete)

X_test_all = np.hstack((X_test_continous, X_test_discrete_oneHot))

print(X_test_discrete_oneHot)
print("shape of X_test_all::", X_test_all.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181057 entries, 0 to 181056
Data columns (total 20 columns):
bys_cn                181057 non-null float64
hindex_cn             181057 non-null float64
a_paper               181057 non-null int64
b_paper               181057 non-null int64
c_paper               181057 non-null int64
papernum2017          181057 non-null int64
papernum2016          181057 non-null int64
papernum2015          181057 non-null int64
papernum2014          181057 non-null int64
papernum2013          181057 non-null int64
num_journal           181057 non-null int64
num_conference        181057 non-null int64
degree                181057 non-null float64
pagerank              181057 non-null float64
degree_centrality     181057 non-null float64
diff_year             181057 non-null float64
coauthors_top10000    181057 non-null int64
coauthors_top20000    181057 non-null int64
coauthors_top30000    181057 non-null int64
category              181057 non-null int

### 5、Filter方法

（1）特征选择使用方差分析

（2）分类器使用XGboost

In [178]:
def trainXGBC_varSelect(X_continous, X_discrete_oneHot, y, var_threshold):
    
    print("接收到的threshold是：%d" % var_threshold)
    
    # 方差选择，只选择连续特征，离散特征不做选择
    vt = VarianceThreshold(threshold=var_threshold)
    X_continuous_new = vt.fit_transform(X_continous)
    print(vt.variances_)
    print("shape of X_continuous_new::", X_continuous_new.shape)

    # 归一化
    ss = StandardScaler()
    X_continuous_new = ss.fit_transform(X_continuous_new)
    print("type of X_continuous_new::", type(X_continuous_new))
    print("shape of X_continuous_new::", X_continuous_new.shape)

    # 将连续值和离散值拼接
    X_all = np.hstack((X_continuous_new, X_discrete_oneHot))
    print("shape of X_all::", X_all.shape)

    # 划分训练集和测试集
    X_train, X_test, y_train, y_test = train_test_split(X_all, y, test_size=0.2, random_state=33)
    print("shape of X_train::", X_train.shape)
    print("shape of X_test::", X_test.shape)
    print("shape of y_train::", y_train.shape)
    print("Counter of y_train::", Counter(y_train))
    print("shape of y_test::", y_test.shape)
    print("Counter of y_test::", Counter(y_test))
    
    
    xgbc = XGBClassifier()
    xgbc.fit(X_train, y_train)
    y_test_predict = xgbc.predict(X_test)
    print(classification_report(y_test_predict, y_test, target_names=['1', '0'], digits=5))
    

# 网格搜索
X_continous_copy = X_continous.copy()
X_discrete_oneHot_copy = X_discrete_oneHot.copy()
y_copy = y.copy()
for var_threshold in range(0, 100, 2):
    print()
    print("**************threshold is::%f******************" % var_threshold)
    trainXGBC_varSelect(X_continous_copy, X_discrete_oneHot_copy, y_copy, var_threshold)  


**************threshold is::0.000000******************
接收到的threshold是：0
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 19)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 19)
shape of X_all:: (18372, 22)
shape of X_train:: (14697, 22)
shape of X_test:: (3675, 22)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.77008   0.80347   0.78642       346
          0    0.97948   0.97507   0.97727      3329

avg / total    0.95977   0.95891   0.95930      3675


**************threshold is::2.000000******************
接收到的threshold是：2
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 19)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 19)
shape of X_all:: (18372, 22)
shape of X_train:: (14697, 22)
shape of X_test:: (3675, 22)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.77008   0.80347   0.78642       346
          0    0.97948   0.97507   0.97727      3329

avg / total    0.95977   0.95891   0.95930      3675


**************threshold is::4.000000******************
接收到的threshold是：4
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 18)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 18)
shape of X_all:: (18372, 21)
shape of X_train:: (14697, 21)
shape of X_test:: (3675, 21)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.77285   0.79943   0.78592       349
          0    0.97888   0.97535   0.97711      3326

avg / total    0.95931   0.95864   0.95895      3675


**************threshold is::6.000000******************
接收到的threshold是：6
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 17)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 17)
shape of X_all:: (18372, 20)
shape of X_train:: (14697, 20)
shape of X_test:: (3675, 20)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.77008   0.78754   0.77871       353
          0    0.97737   0.97502   0.97619      3322

avg / total    0.95746   0.95701   0.95722      3675


**************threshold is::8.000000******************
接收到的threshold是：8
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 16)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 16)
shape of X_all:: (18372, 19)
shape of X_train:: (14697, 19)
shape of X_test:: (3675, 19)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.79501   0.76738   0.78095       374
          0    0.97375   0.97758   0.97566      3301

avg / total    0.95556   0.95619   0.95585      3675


**************threshold is::10.000000******************
接收到的threshold是：10
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 16)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 16)
shape of X_all:: (18372, 19)
shape of X_train:: (14697, 19)
shape of X_test:: (3675, 19)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.79501   0.76738   0.78095       374
          0    0.97375   0.97758   0.97566      3301

avg / total    0.95556   0.95619   0.95585      3675


**************threshold is::12.000000******************
接收到的threshold是：12
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 16)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 16)
shape of X_all:: (18372, 19)
shape of X_train:: (14697, 19)
shape of X_test:: (3675, 19)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.79501   0.76738   0.78095       374
          0    0.97375   0.97758   0.97566      3301

avg / total    0.95556   0.95619   0.95585      3675


**************threshold is::14.000000******************
接收到的threshold是：14
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 16)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 16)
shape of X_all:: (18372, 19)
shape of X_train:: (14697, 19)
shape of X_test:: (3675, 19)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.79501   0.76738   0.78095       374
          0    0.97375   0.97758   0.97566      3301

avg / total    0.95556   0.95619   0.95585      3675


**************threshold is::16.000000******************
接收到的threshold是：16
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 16)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 16)
shape of X_all:: (18372, 19)
shape of X_train:: (14697, 19)
shape of X_test:: (3675, 19)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.79501   0.76738   0.78095       374
          0    0.97375   0.97758   0.97566      3301

avg / total    0.95556   0.95619   0.95585      3675


**************threshold is::18.000000******************
接收到的threshold是：18
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 16)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 16)
shape of X_all:: (18372, 19)
shape of X_train:: (14697, 19)
shape of X_test:: (3675, 19)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.79501   0.76738   0.78095       374
          0    0.97375   0.97758   0.97566      3301

avg / total    0.95556   0.95619   0.95585      3675


**************threshold is::20.000000******************
接收到的threshold是：20
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 16)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 16)
shape of X_all:: (18372, 19)
shape of X_train:: (14697, 19)
shape of X_test:: (3675, 19)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.79501   0.76738   0.78095       374
          0    0.97375   0.97758   0.97566      3301

avg / total    0.95556   0.95619   0.95585      3675


**************threshold is::22.000000******************
接收到的threshold是：22
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 16)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 16)
shape of X_all:: (18372, 19)
shape of X_train:: (14697, 19)
shape of X_test:: (3675, 19)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.79501   0.76738   0.78095       374
          0    0.97375   0.97758   0.97566      3301

avg / total    0.95556   0.95619   0.95585      3675


**************threshold is::24.000000******************
接收到的threshold是：24
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 15)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 15)
shape of X_all:: (18372, 18)
shape of X_train:: (14697, 18)
shape of X_test:: (3675, 18)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.78670   0.76344   0.77490       372
          0    0.97345   0.97669   0.97506      3303

avg / total    0.95454   0.95510   0.95480      3675


**************threshold is::26.000000******************
接收到的threshold是：26
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 12)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 12)
shape of X_all:: (18372, 15)
shape of X_train:: (14697, 15)
shape of X_test:: (3675, 15)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.79778   0.76800   0.78261       375
          0    0.97375   0.97788   0.97581      3300

avg / total    0.95579   0.95646   0.95609      3675


**************threshold is::28.000000******************
接收到的threshold是：28
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 11)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 11)
shape of X_all:: (18372, 14)
shape of X_train:: (14697, 14)
shape of X_test:: (3675, 14)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.79224   0.77297   0.78249       370
          0    0.97465   0.97731   0.97598      3305

avg / total    0.95629   0.95673   0.95650      3675


**************threshold is::30.000000******************
接收到的threshold是：30
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 11)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 11)
shape of X_all:: (18372, 14)
shape of X_train:: (14697, 14)
shape of X_test:: (3675, 14)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.79224   0.77297   0.78249       370
          0    0.97465   0.97731   0.97598      3305

avg / total    0.95629   0.95673   0.95650      3675


**************threshold is::32.000000******************
接收到的threshold是：32
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 9)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 9)
shape of X_all:: (18372, 12)
shape of X_train:: (14697, 12)
shape of X_test:: (3675, 12)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.79778   0.76596   0.78155       376
          0    0.97345   0.97787   0.97565      3299

avg / total    0.95547   0.95619   0.95579      3675


**************threshold is::34.000000******************
接收到的threshold是：34
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 8)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 8)
shape of X_all:: (18372, 11)
shape of X_train:: (14697, 11)
shape of X_test:: (3675, 11)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.79224   0.76882   0.78035       372
          0    0.97405   0.97729   0.97567      3303

avg / total    0.95565   0.95619   0.95590      3675


**************threshold is::36.000000******************
接收到的threshold是：36
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 8)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 8)
shape of X_all:: (18372, 11)
shape of X_train:: (14697, 11)
shape of X_test:: (3675, 11)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.79224   0.76882   0.78035       372
          0    0.97405   0.97729   0.97567      3303

avg / total    0.95565   0.95619   0.95590      3675


**************threshold is::38.000000******************
接收到的threshold是：38
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 8)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 8)
shape of X_all:: (18372, 11)
shape of X_train:: (14697, 11)
shape of X_test:: (3675, 11)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.79224   0.76882   0.78035       372
          0    0.97405   0.97729   0.97567      3303

avg / total    0.95565   0.95619   0.95590      3675


**************threshold is::40.000000******************
接收到的threshold是：40
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 8)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 8)
shape of X_all:: (18372, 11)
shape of X_train:: (14697, 11)
shape of X_test:: (3675, 11)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.79224   0.76882   0.78035       372
          0    0.97405   0.97729   0.97567      3303

avg / total    0.95565   0.95619   0.95590      3675


**************threshold is::42.000000******************
接收到的threshold是：42
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 8)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 8)
shape of X_all:: (18372, 11)
shape of X_train:: (14697, 11)
shape of X_test:: (3675, 11)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.79224   0.76882   0.78035       372
          0    0.97405   0.97729   0.97567      3303

avg / total    0.95565   0.95619   0.95590      3675


**************threshold is::44.000000******************
接收到的threshold是：44
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 8)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 8)
shape of X_all:: (18372, 11)
shape of X_train:: (14697, 11)
shape of X_test:: (3675, 11)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.79224   0.76882   0.78035       372
          0    0.97405   0.97729   0.97567      3303

avg / total    0.95565   0.95619   0.95590      3675


**************threshold is::46.000000******************
接收到的threshold是：46
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 8)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 8)
shape of X_all:: (18372, 11)
shape of X_train:: (14697, 11)
shape of X_test:: (3675, 11)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.79224   0.76882   0.78035       372
          0    0.97405   0.97729   0.97567      3303

avg / total    0.95565   0.95619   0.95590      3675


**************threshold is::48.000000******************
接收到的threshold是：48
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 8)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 8)
shape of X_all:: (18372, 11)
shape of X_train:: (14697, 11)
shape of X_test:: (3675, 11)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.79224   0.76882   0.78035       372
          0    0.97405   0.97729   0.97567      3303

avg / total    0.95565   0.95619   0.95590      3675


**************threshold is::50.000000******************
接收到的threshold是：50
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 8)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 8)
shape of X_all:: (18372, 11)
shape of X_train:: (14697, 11)
shape of X_test:: (3675, 11)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.79224   0.76882   0.78035       372
          0    0.97405   0.97729   0.97567      3303

avg / total    0.95565   0.95619   0.95590      3675


**************threshold is::52.000000******************
接收到的threshold是：52
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 7)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 7)
shape of X_all:: (18372, 10)
shape of X_train:: (14697, 10)
shape of X_test:: (3675, 10)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.72576   0.74011   0.73287       354
          0    0.97224   0.97019   0.97121      3321

avg / total    0.94850   0.94803   0.94825      3675


**************threshold is::54.000000******************
接收到的threshold是：54
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 7)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 7)
shape of X_all:: (18372, 10)
shape of X_train:: (14697, 10)
shape of X_test:: (3675, 10)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.72576   0.74011   0.73287       354
          0    0.97224   0.97019   0.97121      3321

avg / total    0.94850   0.94803   0.94825      3675


**************threshold is::56.000000******************
接收到的threshold是：56
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 7)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 7)
shape of X_all:: (18372, 10)
shape of X_train:: (14697, 10)
shape of X_test:: (3675, 10)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.72576   0.74011   0.73287       354
          0    0.97224   0.97019   0.97121      3321

avg / total    0.94850   0.94803   0.94825      3675


**************threshold is::58.000000******************
接收到的threshold是：58
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 7)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 7)
shape of X_all:: (18372, 10)
shape of X_train:: (14697, 10)
shape of X_test:: (3675, 10)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.72576   0.74011   0.73287       354
          0    0.97224   0.97019   0.97121      3321

avg / total    0.94850   0.94803   0.94825      3675


**************threshold is::60.000000******************
接收到的threshold是：60
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 7)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 7)
shape of X_all:: (18372, 10)
shape of X_train:: (14697, 10)
shape of X_test:: (3675, 10)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.72576   0.74011   0.73287       354
          0    0.97224   0.97019   0.97121      3321

avg / total    0.94850   0.94803   0.94825      3675


**************threshold is::62.000000******************
接收到的threshold是：62
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 7)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 7)
shape of X_all:: (18372, 10)
shape of X_train:: (14697, 10)
shape of X_test:: (3675, 10)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.72576   0.74011   0.73287       354
          0    0.97224   0.97019   0.97121      3321

avg / total    0.94850   0.94803   0.94825      3675


**************threshold is::64.000000******************
接收到的threshold是：64
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 7)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 7)
shape of X_all:: (18372, 10)
shape of X_train:: (14697, 10)
shape of X_test:: (3675, 10)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.72576   0.74011   0.73287       354
          0    0.97224   0.97019   0.97121      3321

avg / total    0.94850   0.94803   0.94825      3675


**************threshold is::66.000000******************
接收到的threshold是：66
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 7)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 7)
shape of X_all:: (18372, 10)
shape of X_train:: (14697, 10)
shape of X_test:: (3675, 10)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.72576   0.74011   0.73287       354
          0    0.97224   0.97019   0.97121      3321

avg / total    0.94850   0.94803   0.94825      3675


**************threshold is::68.000000******************
接收到的threshold是：68
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 7)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 7)
shape of X_all:: (18372, 10)
shape of X_train:: (14697, 10)
shape of X_test:: (3675, 10)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.72576   0.74011   0.73287       354
          0    0.97224   0.97019   0.97121      3321

avg / total    0.94850   0.94803   0.94825      3675


**************threshold is::70.000000******************
接收到的threshold是：70
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 7)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 7)
shape of X_all:: (18372, 10)
shape of X_train:: (14697, 10)
shape of X_test:: (3675, 10)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.72576   0.74011   0.73287       354
          0    0.97224   0.97019   0.97121      3321

avg / total    0.94850   0.94803   0.94825      3675


**************threshold is::72.000000******************
接收到的threshold是：72
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 7)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 7)
shape of X_all:: (18372, 10)
shape of X_train:: (14697, 10)
shape of X_test:: (3675, 10)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.72576   0.74011   0.73287       354
          0    0.97224   0.97019   0.97121      3321

avg / total    0.94850   0.94803   0.94825      3675


**************threshold is::74.000000******************
接收到的threshold是：74
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 7)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 7)
shape of X_all:: (18372, 10)
shape of X_train:: (14697, 10)
shape of X_test:: (3675, 10)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.72576   0.74011   0.73287       354
          0    0.97224   0.97019   0.97121      3321

avg / total    0.94850   0.94803   0.94825      3675


**************threshold is::76.000000******************
接收到的threshold是：76
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 6)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 6)
shape of X_all:: (18372, 9)
shape of X_train:: (14697, 9)
shape of X_test:: (3675, 9)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.72022   0.73864   0.72931       352
          0    0.97224   0.96961   0.97092      3323

avg / total    0.94810   0.94748   0.94778      3675


**************threshold is::78.000000******************
接收到的threshold是：78
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 6)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 6)
shape of X_all:: (18372, 9)
shape of X_train:: (14697, 9)
shape of X_test:: (3675, 9)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.72022   0.73864   0.72931       352
          0    0.97224   0.96961   0.97092      3323

avg / total    0.94810   0.94748   0.94778      3675


**************threshold is::80.000000******************
接收到的threshold是：80
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 6)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 6)
shape of X_all:: (18372, 9)
shape of X_train:: (14697, 9)
shape of X_test:: (3675, 9)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.72022   0.73864   0.72931       352
          0    0.97224   0.96961   0.97092      3323

avg / total    0.94810   0.94748   0.94778      3675


**************threshold is::82.000000******************
接收到的threshold是：82
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 5)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 5)
shape of X_all:: (18372, 8)
shape of X_train:: (14697, 8)
shape of X_test:: (3675, 8)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.72022   0.73864   0.72931       352
          0    0.97224   0.96961   0.97092      3323

avg / total    0.94810   0.94748   0.94778      3675


**************threshold is::84.000000******************
接收到的threshold是：84
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 5)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 5)
shape of X_all:: (18372, 8)
shape of X_train:: (14697, 8)
shape of X_test:: (3675, 8)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.72022   0.73864   0.72931       352
          0    0.97224   0.96961   0.97092      3323

avg / total    0.94810   0.94748   0.94778      3675


**************threshold is::86.000000******************
接收到的threshold是：86
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 5)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 5)
shape of X_all:: (18372, 8)
shape of X_train:: (14697, 8)
shape of X_test:: (3675, 8)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.72022   0.73864   0.72931       352
          0    0.97224   0.96961   0.97092      3323

avg / total    0.94810   0.94748   0.94778      3675


**************threshold is::88.000000******************
接收到的threshold是：88
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 5)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 5)
shape of X_all:: (18372, 8)
shape of X_train:: (14697, 8)
shape of X_test:: (3675, 8)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.72022   0.73864   0.72931       352
          0    0.97224   0.96961   0.97092      3323

avg / total    0.94810   0.94748   0.94778      3675


**************threshold is::90.000000******************
接收到的threshold是：90
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 5)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 5)
shape of X_all:: (18372, 8)
shape of X_train:: (14697, 8)
shape of X_test:: (3675, 8)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.72022   0.73864   0.72931       352
          0    0.97224   0.96961   0.97092      3323

avg / total    0.94810   0.94748   0.94778      3675


**************threshold is::92.000000******************
接收到的threshold是：92
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 5)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 5)
shape of X_all:: (18372, 8)
shape of X_train:: (14697, 8)
shape of X_test:: (3675, 8)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.72022   0.73864   0.72931       352
          0    0.97224   0.96961   0.97092      3323

avg / total    0.94810   0.94748   0.94778      3675


**************threshold is::94.000000******************
接收到的threshold是：94
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 5)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 5)
shape of X_all:: (18372, 8)
shape of X_train:: (14697, 8)
shape of X_test:: (3675, 8)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.72022   0.73864   0.72931       352
          0    0.97224   0.96961   0.97092      3323

avg / total    0.94810   0.94748   0.94778      3675


**************threshold is::96.000000******************
接收到的threshold是：96
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 5)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 5)
shape of X_all:: (18372, 8)
shape of X_train:: (14697, 8)
shape of X_test:: (3675, 8)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.72022   0.73864   0.72931       352
          0    0.97224   0.96961   0.97092      3323

avg / total    0.94810   0.94748   0.94778      3675


**************threshold is::98.000000******************
接收到的threshold是：98
[4.00879970e+05 2.53632050e+01 2.53916840e+01 2.21470374e+01
 7.53296860e+01 3.22170223e+01 3.18429028e+01 3.05619039e+01
 2.77860064e+01 2.49705935e+01 2.84932472e+03 6.11521208e+02
 1.07285265e+04 2.15172126e+02 8.16238752e+01 5.12690800e+01
 2.40560804e+00 5.00732140e+00 7.80610567e+00]
shape of X_continuous_new:: (18372, 5)
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 5)
shape of X_all:: (18372, 8)
shape of X_train:: (14697, 8)
shape of X_test:: (3675, 8)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})
             precision    recall  f1-score   support


  if diff:


In [187]:
# 方差选择必然会损失性能，所以这里我们使用所有的特征进行模型的预测，为了方便保留结果，这里将训练部分函数冗余再写一遍。
# 若是特征非常多，则可以考虑去除对性能影响最小的特征。这里特征不是很多，不在进行去除，只是为了再熟悉下流程，这里写了一遍方差分析。
def trainAndTestXGBCvarSelect(X_continous, X_discrete_oneHot, y, X_test_continous, X_test_discrete_oneHot):
    # 归一化
    ss = StandardScaler()
    X_continuous_new = ss.fit_transform(X_continuous)
    print("type of X_continuous_new::", type(X_continuous_new))
    print("shape of X_continuous_new::", X_continuous_new.shape)
    X_test_continous_new = ss.transform(X_test_continous)
    print("type of X_test_continous_new::", type(X_test_continous_new))
    print("shape of X_test_continous_new::", X_test_continous_new.shape)

    # 将连续值和离散值拼接
    X_all = np.hstack((X_continuous_new, X_discrete_oneHot))
    print("shape of X_all::", X_all.shape)
    X_test_all = np.hstack((X_test_continous_new, X_test_discrete_oneHot))
    print("shape of X_test_all::", X_test_all.shape)

    # 划分训练集和测试集
    X_train, X_test, y_train, y_test = train_test_split(X_all, y, test_size=0.2, random_state=33)
    print("shape of X_train::", X_train.shape)
    print("shape of X_test::", X_test.shape)
    print("shape of y_train::", y_train.shape)
    print("Counter of y_train::", Counter(y_train))
    print("shape of y_test::", y_test.shape)
    print("Counter of y_test::", Counter(y_test))
    
    
    xgbc = XGBClassifier()
    xgbc.fit(X_train, y_train)
    y_test_predict = xgbc.predict(X_test)
    print(classification_report(y_test_predict, y_test, target_names=['1', '0'], digits=5))
    
    y_predict = xgbc.predict(X_test_all)
    print("y_predict::", Counter(y_predict))

# 调用预测函数
X_continous_copy = X_continous.copy()
X_discrete_oneHot_copy = X_discrete_oneHot.copy()
y_copy = y.copy()
X_test_continous_copy = X_test_continous.copy()
X_test_discrete_oneHot_copy = X_test_discrete_oneHot.copy()
trainAndTestXGBCvarSelect(X_continous_copy, X_discrete_oneHot_copy, y_copy, X_test_continous_copy, X_test_discrete_oneHot_copy)

type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 19)
type of X_test_continous_new:: <class 'numpy.ndarray'>
shape of X_test_continous_new:: (181057, 19)
shape of X_all:: (18372, 22)
shape of X_test_all:: (181057, 22)
shape of X_train:: (14697, 22)
shape of X_test:: (3675, 22)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.77008   0.80347   0.78642       346
          0    0.97948   0.97507   0.97727      3329

avg / total    0.95977   0.95891   0.95930      3675

y_predict:: Counter({1: 118210, 0: 62847})


  if diff:


### 6、Wrapper方法

#### （1）RFE

RFE和方差分析一样，去掉特征必然会带来性能的降低。

In [181]:
def trainXGBC_rfeSelect(X_all, y, n_features_to_select):
    
    # RFE特征选择
    estimator = XGBClassifier()
    selector = RFE(estimator=estimator, n_features_to_select = n_features_to_select)
    X_all_rfe = selector.fit_transform(X_all, y) 
    selected_idx = np.where(pd.Series(selector.support_)==True)[0]   # n_features_to_select个选择出来的特征，每一个特征为True
    print("selector.support_::", selector.support_)
    
    # 因为Wrapper离散特征和连续特征需要一起训练搜索特征子集，但是因为离散特征不需要标准化，所以这里需要将其分开
    discrete_idx = list(set([19, 20, 21]) - set(selected_idx))   # 最后3列为离散值
    X_continuous_tmp = pd.DataFrame(X_all_rfe)[list(range(0, len(selected_idx)-len(discrete_idx)))]
    X_discreate_tmp = pd.DataFrame(X_all_rfe)[list(range(len(selected_idx)-len(discrete_idx), len(selected_idx)))]
    
    # 归一化
    ss = StandardScaler()
    X_continuous_new = ss.fit_transform(X_continuous_tmp)
    print("type of X_continuous_new::", type(X_continuous_new))
    print("shape of X_continuous_new::", X_continuous_new.shape)

    # 将连续值和离散值拼接
    X_all_new = np.hstack((X_continuous_new, X_discreate_tmp))
    print("shape of X_all_new::", X_all_new.shape)

    # 划分训练集和测试集
    X_train, X_test, y_train, y_test = train_test_split(X_all_new, y, test_size=0.2, random_state=33)
    print("shape of X_train::", X_train.shape)
    print("shape of X_test::", X_test.shape)
    print("shape of y_train::", y_train.shape)
    print("Counter of y_train::", Counter(y_train))
    print("shape of y_test::", y_test.shape)
    print("Counter of y_test::", Counter(y_test))
    
    
    xgbc = XGBClassifier()
    xgbc.fit(X_train, y_train)
    y_test_predict = xgbc.predict(X_test)
    print(classification_report(y_test_predict, y_test, target_names=['1', '0'], digits=10))
    

# 网格搜索
X_all_copy = X_all.copy()
y_copy = y.copy()
for n_features_to_select in range(20, 10, -1):    # 连续特征共有19个
    print()
    print("**************n_features_to_select is::%f******************" % n_features_to_select)
    trainXGBC_rfeSelect(X_all_copy, y_copy, n_features_to_select)  


**************n_features_to_select is::20.000000******************
selector.support_:: [ True  True  True  True  True  True  True  True  True  True  True  True
  True  True False  True  True  True  True  True False  True]
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 19)
shape of X_all_new:: (18372, 20)
shape of X_train:: (14697, 20)
shape of X_test:: (3675, 20)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1  0.7700831025 0.8034682081 0.7864214993       346
          0  0.9794809897 0.9750675879 0.9772693060      3329

avg / total  0.9597662499 0.9589115646 0.9593010499      3675


**************n_features_to_select is::19.000000******************
selector.support_:: [ True  True  True  True  True  True  True  True  True False  True  True
  True  True False  True  True  True  True  True False  True]
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 18)
shape of X_all_new:: (18372, 19)
shape of X_train:: (14697, 19)
shape of X_test:: (3675, 19)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1  0.7700831025 0.7942857143 0.7819971871       350
          0  0.9782739891 0.9750375940 0.9766531104      3325

avg / total  0.9584462856 0.9578231293 0.9581144510      3675


**************n_features_to_select is::18.000000******************
selector.support_:: [ True  True  True  True  True  True  True  True  True False  True  True
  True  True False  True False  True  True  True False  True]
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 17)
shape of X_all_new:: (18372, 18)
shape of X_train:: (14697, 18)
shape of X_test:: (3675, 18)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1  0.7728531856 0.7971428571 0.7848101266       350
          0  0.9785757393 0.9753383459 0.9769543606      3325

avg / total  0.9589831151 0.9583673469 0.9586549097      3675


**************n_features_to_select is::17.000000******************
selector.support_:: [ True  True  True False  True  True  True  True  True False  True  True
  True  True False  True False  True  True  True False  True]
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 16)
shape of X_all_new:: (18372, 17)
shape of X_train:: (14697, 17)
shape of X_test:: (3675, 17)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1  0.7728531856 0.7994269341 0.7859154930       349
          0  0.9788774894 0.9753457607 0.9771084337      3326

avg / total  0.9593121882 0.9586394558 0.9589516075      3675


**************n_features_to_select is::16.000000******************
selector.support_:: [ True  True False False  True  True  True  True  True False  True  True
  True  True False  True False  True  True  True False  True]
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 15)
shape of X_all_new:: (18372, 16)
shape of X_train:: (14697, 16)
shape of X_test:: (3675, 16)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1  0.7700831025 0.8034682081 0.7864214993       346
          0  0.9794809897 0.9750675879 0.9772693060      3329

avg / total  0.9597662499 0.9589115646 0.9593010499      3675


**************n_features_to_select is::15.000000******************
selector.support_:: [ True  True False False  True  True  True False  True False  True  True
  True  True False  True False  True  True  True False  True]
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 14)
shape of X_all_new:: (18372, 15)
shape of X_train:: (14697, 15)
shape of X_test:: (3675, 15)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1  0.7756232687 0.8092485549 0.7920792079       346
          0  0.9800844900 0.9756683689 0.9778714436      3329

avg / total  0.9608345356 0.9600000000 0.9603791678      3675


**************n_features_to_select is::14.000000******************
selector.support_:: [ True  True False False False  True  True False  True False  True  True
  True  True False  True False  True  True  True False  True]
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 13)
shape of X_all_new:: (18372, 14)
shape of X_train:: (14697, 14)
shape of X_test:: (3675, 14)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1  0.7700831025 0.8104956268 0.7897727273       343
          0  0.9803862402 0.9750900360 0.9777309660      3332

avg / total  0.9607579473 0.9597278912 0.9601881970      3675


**************n_features_to_select is::13.000000******************
selector.support_:: [ True  True False False False  True  True False False False  True  True
  True  True False  True False  True  True  True False  True]
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 12)
shape of X_all_new:: (18372, 13)
shape of X_train:: (14697, 13)
shape of X_test:: (3675, 13)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1  0.7673130194 0.8075801749 0.7869318182       343
          0  0.9800844900 0.9747899160 0.9774300331      3332

avg / total  0.9602258194 0.9591836735 0.9596501997      3675


**************n_features_to_select is::12.000000******************
selector.support_:: [ True  True False False False  True  True False False False  True  True
  True False False  True False  True  True  True False  True]
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 11)
shape of X_all_new:: (18372, 12)
shape of X_train:: (14697, 12)
shape of X_test:: (3675, 12)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1  0.7700831025 0.7965616046 0.7830985915       349
          0  0.9785757393 0.9750450992 0.9768072289      3326

avg / total  0.9587760304 0.9580952381 0.9584114971      3675


**************n_features_to_select is::11.000000******************
selector.support_:: [ True  True False False False  True  True False False False  True False
  True False False  True False  True  True  True False  True]
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 10)
shape of X_all_new:: (18372, 11)
shape of X_train:: (14697, 11)
shape of X_test:: (3675, 11)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})
             precision    recall  f1-score   support

          1  0.7673130194 0.8052325581 0.7858156028       344
          0  0.9797827399 0.9747823476 0.9772761475      3331

avg / total  0.959894

  if diff:


In [188]:
# 从上面的分析过程可以知道，当连续特征取15的时候，f1最大
def trainAndTestXGBCrfeSelect(X_all, y, X_test_all, n_features_to_select=15):
    
     # RFE特征选择
    estimator = XGBClassifier()
    selector = RFE(estimator=estimator, n_features_to_select = n_features_to_select)
    X_all_rfe = selector.fit_transform(X_all, y) 
    print("N_features %s" % selector.n_features_)   # 保留的特征数
    print("Support is %s" % selector.support_)    # 是否保留
    print("Ranking %s" % selector.ranking_)    # 重要程序排名
    X_test_all_rfe = selector.transform(X_test_all)
    selected_idx = np.where(pd.Series(selector.support_)==True)[0]   # n_features_to_select个选择出来的特征，每一个特征为True
    print("selector.support_::", selector.support_)
    
    # 因为Wrapper离散特征和连续特征需要一起训练搜索特征子集，但是因为离散特征不需要标准化，所以这里需要将其分开
    discrete_idx = list(set([19, 20, 21]) - set(selected_idx))   # 最后3列为离散值
    X_continuous_tmp = pd.DataFrame(X_all_rfe)[list(range(0, len(selected_idx)-len(discrete_idx)))]
    X_discreate_tmp = pd.DataFrame(X_all_rfe)[list(range(len(selected_idx)-len(discrete_idx), len(selected_idx)))]
    X_test_continuous_tmp = pd.DataFrame(X_test_all_rfe)[list(range(0, len(selected_idx)-len(discrete_idx)))]
    X_test_discreate_tmp = pd.DataFrame(X_test_all_rfe)[list(range(len(selected_idx)-len(discrete_idx), len(selected_idx)))]
    
    # 归一化
    ss = StandardScaler()
    X_continuous_new = ss.fit_transform(X_continuous_tmp)
    print("type of X_continuous_new::", type(X_continuous_new))
    print("shape of X_continuous_new::", X_continuous_new.shape)
    X_test_continous_new = ss.transform(X_test_continuous_tmp)
    print("type of X_test_continous_new::", type(X_test_continous_new))
    print("shape of X_test_continous_new::", X_test_continous_new.shape)

    # 将连续值和离散值拼接
    X_all_new = np.hstack((X_continuous_new, X_discreate_tmp))
    print("shape of X_all::", X_all.shape)
    X_test_all_new = np.hstack((X_test_continous_new, X_test_discreate_tmp))
    print("shape of X_test_all::", X_test_all.shape)

    # 划分训练集和测试集
    X_train, X_test, y_train, y_test = train_test_split(X_all_new, y, test_size=0.2, random_state=33)
    print("shape of X_train::", X_train.shape)
    print("shape of X_test::", X_test.shape)
    print("shape of y_train::", y_train.shape)
    print("Counter of y_train::", Counter(y_train))
    print("shape of y_test::", y_test.shape)
    print("Counter of y_test::", Counter(y_test))
    
    
    xgbc = XGBClassifier()
    xgbc.fit(X_train, y_train)
    y_test_predict = xgbc.predict(X_test)
    print(classification_report(y_test_predict, y_test, target_names=['1', '0'], digits=5))
    
    y_predict = xgbc.predict(X_test_all_new)
    print("y_predict::", Counter(y_predict))

# 调用预测函数
X_all_copy = X_all.copy()
y_copy = y.copy()
X_test_all_copy = X_test_all.copy()
trainAndTestXGBCrfeSelect(X_all_copy, y_copy, X_test_all_copy)

N_features 15
Support is [ True  True False False  True  True  True False  True False  True  True
  True  True False  True False  True  True  True False  True]
Ranking [1 1 3 4 1 1 1 2 1 6 1 1 1 1 7 1 5 1 1 1 8 1]
selector.support_:: [ True  True False False  True  True  True False  True False  True  True
  True  True False  True False  True  True  True False  True]
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 14)
type of X_test_continous_new:: <class 'numpy.ndarray'>
shape of X_test_continous_new:: (181057, 14)
shape of X_all:: (18372, 22)
shape of X_test_all:: (181057, 22)
shape of X_train:: (14697, 15)
shape of X_test:: (3675, 15)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.77562   0.80925   0.79208       346
          0    0.98008   0.97567   0.97787      3329

avg / total    0.96083   0.96000   0.96038      3675

y_predict:: Counter({1: 117409, 0: 63648})


  if diff:


### 6、Wrapper方法

#### （2）RFECV

使用交叉验证来保留最佳性能的特征。不过这里的交叉验证的数据集切割对象不再是行数据（样本），而是列数据（特征），同时学习器本身不变，最终得到不同特征对于score的重要程度，然后保留最佳的特征组合。其分割方式类似于随机森林中的列上子采样。

In [183]:
def trainXGBC_rfecvSelect(X_all, y, step=1, n_splits=3):
    
    # RFECV
    estimator = XGBClassifier()
    selector = RFECV(estimator=estimator, step = step, cv=StratifiedKFold(n_splits=n_splits), scoring="accuracy")
    X_all_rfecv = selector.fit_transform(X_all, y) 
    print("Optimal number of features::%d" % selector.n_features_)
    print("Ranking of features:: %s" % selector.ranking_)
    selected_idx = np.where(pd.Series(selector.support_)==True)[0]   # n_features_to_select个选择出来的特征，每一个特征为True
    print("selector.support_::", selector.support_)
    
    # 因为Wrapper离散特征和连续特征需要一起训练搜索特征子集，但是因为离散特征不需要标准化，所以这里需要将其分开
    discrete_idx = list(set([19, 20, 21]) - set(selected_idx))   # 最后3列为离散值
    X_continuous_tmp = pd.DataFrame(X_all_rfecv)[list(range(0, len(selected_idx)-len(discrete_idx)))]
    X_discreate_tmp = pd.DataFrame(X_all_rfecv)[list(range(len(selected_idx)-len(discrete_idx), len(selected_idx)))]
    
    # 归一化
    ss = StandardScaler()
    X_continuous_new = ss.fit_transform(X_continuous_tmp)
    print("type of X_continuous_new::", type(X_continuous_new))
    print("shape of X_continuous_new::", X_continuous_new.shape)

    # 将连续值和离散值拼接
    X_all_new = np.hstack((X_continuous_new, X_discreate_tmp))
    print("shape of X_all::", X_all_new.shape)

    # 划分训练集和测试集
    X_train, X_test, y_train, y_test = train_test_split(X_all_new, y, test_size=0.2, random_state=33)
    print("shape of X_train::", X_train.shape)
    print("shape of X_test::", X_test.shape)
    print("shape of y_train::", y_train.shape)
    print("Counter of y_train::", Counter(y_train))
    print("shape of y_test::", y_test.shape)
    print("Counter of y_test::", Counter(y_test))
    
    
    xgbc = XGBClassifier()
    xgbc.fit(X_train, y_train)
    y_test_predict = xgbc.predict(X_test)
    print(classification_report(y_test_predict, y_test, target_names=['1', '0'], digits=10))
    

# 网格搜索
X_all_copy = X_all.copy()
y_copy = y.copy()
for step in range(1, 5, 1):    # 连续特征共有19个
    for n_splits in range(2, 10, 1):
        print()
        print("**************step is::%d, n_splits::%d *****************" % (step, n_splits))
        trainXGBC_rfecvSelect(X_all_copy, y_copy, step, n_splits)  


**************step is::1, n_splits::2 *****************


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


Optimal number of features::10
Ranking of features:: [ 1  1  8  9  6  1  2  7  5 11  1  3  1  4 12  1 10  1  1  1 13  1]
selector.support_:: [ True  True False False False  True False False False False  True False
  True False False  True False  True  True  True False  True]
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 9)
shape of X_all:: (18372, 10)
shape of X_train:: (14697, 10)
shape of X_test:: (3675, 10)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1  0.7673130194 0.8052325581 0.7858156028       344
          0  0.9797827399 0.9747823476 0.9772761475      3331

avg / total  0.9598944178 0.9589115646 0.9593543985      3675


**************step is::1, n_splits::3 *****************


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


Optimal number of features::13
Ranking of features:: [ 1  1  5  6  3  1  1  4  2  8  1  1  1  1  9  1  7  1  1  1 10  1]
selector.support_:: [ True  True False False False  True  True False False False  True  True
  True  True False  True False  True  True  True False  True]
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 12)
shape of X_all:: (18372, 13)
shape of X_train:: (14697, 13)
shape of X_test:: (3675, 13)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1  0.7673130194 0.8075801749 0.7869318182       343
          0  0.9800844900 0.9747899160 0.9774300331      3332

avg / total  0.9602258194 0.9591836735 0.9596501997      3675


**************step is::1, n_splits::4 *****************


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:


Optimal number of features::12
Ranking of features:: [ 1  1  6  7  4  1  1  5  3  9  1  1  1  2 10  1  8  1  1  1 11  1]
selector.support_:: [ True  True False False False  True  True False False False  True  True
  True False False  True False  True  True  True False  True]
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 11)
shape of X_all:: (18372, 12)
shape of X_train:: (14697, 12)
shape of X_test:: (3675, 12)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1  0.7700831025 0.7965616046 0.7830985915       349
          0  0.9785757393 0.9750450992 0.9768072289      3326

avg / total  0.9587760304 0.9580952381 0.9584114971      3675


**************step is::1, n_splits::5 *****************


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


Optimal number of features::13
Ranking of features:: [ 1  1  5  6  3  1  1  4  2  8  1  1  1  1  9  1  7  1  1  1 10  1]
selector.support_:: [ True  True False False False  True  True False False False  True  True
  True  True False  True False  True  True  True False  True]
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 12)
shape of X_all:: (18372, 13)
shape of X_train:: (14697, 13)
shape of X_test:: (3675, 13)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1  0.7673130194 0.8075801749 0.7869318182       343
          0  0.9800844900 0.9747899160 0.9774300331      3332

avg / total  0.9602258194 0.9591836735 0.9596501997      3675


**************step is::1, n_splits::6 *****************


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


Optimal number of features::19
Ranking of features:: [1 1 1 1 1 1 1 1 1 2 1 1 1 1 3 1 1 1 1 1 4 1]
selector.support_:: [ True  True  True  True  True  True  True  True  True False  True  True
  True  True False  True  True  True  True  True False  True]
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 18)
shape of X_all:: (18372, 19)
shape of X_train:: (14697, 19)
shape of X_test:: (3675, 19)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1  0.7700831025 0.7942857143 0.7819971871       350
          0  0.9782739891 0.9750375940 0.9766531104      3325

avg / total  0.9584462856 0.9578231293 0.9581144510      3675


**************step is::1, n_splits::7 *****************


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


Optimal number of features::11
Ranking of features:: [ 1  1  7  8  5  1  1  6  4 10  1  2  1  3 11  1  9  1  1  1 12  1]
selector.support_:: [ True  True False False False  True  True False False False  True False
  True False False  True False  True  True  True False  True]
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 10)
shape of X_all:: (18372, 11)
shape of X_train:: (14697, 11)
shape of X_test:: (3675, 11)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1  0.7673130194 0.8052325581 0.7858156028       344
          0  0.9797827399 0.9747823476 0.9772761475      3331

avg / total  0.9598944178 0.9589115646 0.9593543985      3675


**************step is::1, n_splits::8 *****************


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:


Optimal number of features::16
Ranking of features:: [1 1 2 3 1 1 1 1 1 5 1 1 1 1 6 1 4 1 1 1 7 1]
selector.support_:: [ True  True False False  True  True  True  True  True False  True  True
  True  True False  True False  True  True  True False  True]
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 15)
shape of X_all:: (18372, 16)
shape of X_train:: (14697, 16)
shape of X_test:: (3675, 16)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1  0.7700831025 0.8034682081 0.7864214993       346
          0  0.9794809897 0.9750675879 0.9772693060      3329

avg / total  0.9597662499 0.9589115646 0.9593010499      3675


**************step is::1, n_splits::9 *****************


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


Optimal number of features::22
Ranking of features:: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
selector.support_:: [ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True]
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 22)
shape of X_all:: (18372, 22)
shape of X_train:: (14697, 22)
shape of X_test:: (3675, 22)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1  0.7700831025 0.8034682081 0.7864214993       346
          0  0.9794809897 0.9750675879 0.9772693060      3329

avg / total  0.9597662499 0.9589115646 0.9593010499      3675


**************step is::2, n_splits::2 *****************


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


Optimal number of features::22
Ranking of features:: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
selector.support_:: [ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True]
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 22)
shape of X_all:: (18372, 22)
shape of X_train:: (14697, 22)
shape of X_test:: (3675, 22)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1  0.7700831025 0.8034682081 0.7864214993       346
          0  0.9794809897 0.9750675879 0.9772693060      3329

avg / total  0.9597662499 0.9589115646 0.9593010499      3675


**************step is::2, n_splits::3 *****************


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


Optimal number of features::12
Ranking of features:: [1 1 3 4 3 1 1 4 2 5 1 1 1 2 6 1 5 1 1 1 6 1]
selector.support_:: [ True  True False False False  True  True False False False  True  True
  True False False  True False  True  True  True False  True]
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 11)
shape of X_all:: (18372, 12)
shape of X_train:: (14697, 12)
shape of X_test:: (3675, 12)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1  0.7700831025 0.7965616046 0.7830985915       349
          0  0.9785757393 0.9750450992 0.9768072289      3326

avg / total  0.9587760304 0.9580952381 0.9584114971      3675


**************step is::2, n_splits::4 *****************


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


Optimal number of features::12
Ranking of features:: [1 1 3 4 3 1 1 4 2 5 1 1 1 2 6 1 5 1 1 1 6 1]
selector.support_:: [ True  True False False False  True  True False False False  True  True
  True False False  True False  True  True  True False  True]
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 11)
shape of X_all:: (18372, 12)
shape of X_train:: (14697, 12)
shape of X_test:: (3675, 12)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1  0.7700831025 0.7965616046 0.7830985915       349
          0  0.9785757393 0.9750450992 0.9768072289      3326

avg / total  0.9587760304 0.9580952381 0.9584114971      3675


**************step is::2, n_splits::5 *****************


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:


Optimal number of features::22
Ranking of features:: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
selector.support_:: [ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True]
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 22)
shape of X_all:: (18372, 22)
shape of X_train:: (14697, 22)
shape of X_test:: (3675, 22)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1  0.7700831025 0.8034682081 0.7864214993       346
          0  0.9794809897 0.9750675879 0.9772693060      3329

avg / total  0.9597662499 0.9589115646 0.9593010499      3675


**************step is::2, n_splits::6 *****************


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


Optimal number of features::10
Ranking of features:: [1 1 4 5 4 1 2 5 3 6 1 2 1 3 7 1 6 1 1 1 7 1]
selector.support_:: [ True  True False False False  True False False False False  True False
  True False False  True False  True  True  True False  True]
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 9)
shape of X_all:: (18372, 10)
shape of X_train:: (14697, 10)
shape of X_test:: (3675, 10)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1  0.7673130194 0.8052325581 0.7858156028       344
          0  0.9797827399 0.9747823476 0.9772761475      3331

avg / total  0.9598944178 0.9589115646 0.9593543985      3675


**************step is::2, n_splits::7 *****************


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


Optimal number of features::18
Ranking of features:: [1 1 1 1 1 1 1 1 1 2 1 1 1 1 3 1 2 1 1 1 3 1]
selector.support_:: [ True  True  True  True  True  True  True  True  True False  True  True
  True  True False  True False  True  True  True False  True]
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 17)
shape of X_all:: (18372, 18)
shape of X_train:: (14697, 18)
shape of X_test:: (3675, 18)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1  0.7728531856 0.7971428571 0.7848101266       350
          0  0.9785757393 0.9753383459 0.9769543606      3325

avg / total  0.9589831151 0.9583673469 0.9586549097      3675


**************step is::2, n_splits::8 *****************


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


Optimal number of features::18
Ranking of features:: [1 1 1 1 1 1 1 1 1 2 1 1 1 1 3 1 2 1 1 1 3 1]
selector.support_:: [ True  True  True  True  True  True  True  True  True False  True  True
  True  True False  True False  True  True  True False  True]
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 17)
shape of X_all:: (18372, 18)
shape of X_train:: (14697, 18)
shape of X_test:: (3675, 18)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1  0.7728531856 0.7971428571 0.7848101266       350
          0  0.9785757393 0.9753383459 0.9769543606      3325

avg / total  0.9589831151 0.9583673469 0.9586549097      3675


**************step is::2, n_splits::9 *****************


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


Optimal number of features::10
Ranking of features:: [1 1 4 5 4 1 2 5 3 6 1 2 1 3 7 1 6 1 1 1 7 1]
selector.support_:: [ True  True False False False  True False False False False  True False
  True False False  True False  True  True  True False  True]
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 9)
shape of X_all:: (18372, 10)
shape of X_train:: (14697, 10)
shape of X_test:: (3675, 10)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1  0.7673130194 0.8052325581 0.7858156028       344
          0  0.9797827399 0.9747823476 0.9772761475      3331

avg / total  0.9598944178 0.9589115646 0.9593543985      3675


**************step is::3, n_splits::2 *****************


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


Optimal number of features::22
Ranking of features:: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
selector.support_:: [ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True]
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 22)
shape of X_all:: (18372, 22)
shape of X_train:: (14697, 22)
shape of X_test:: (3675, 22)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1  0.7700831025 0.8034682081 0.7864214993       346
          0  0.9794809897 0.9750675879 0.9772693060      3329

avg / total  0.9597662499 0.9589115646 0.9593010499      3675


**************step is::3, n_splits::3 *****************


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


Optimal number of features::19
Ranking of features:: [1 1 1 1 1 1 1 1 1 2 1 1 1 1 2 1 1 1 1 1 2 1]
selector.support_:: [ True  True  True  True  True  True  True  True  True False  True  True
  True  True False  True  True  True  True  True False  True]
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 18)
shape of X_all:: (18372, 19)
shape of X_train:: (14697, 19)
shape of X_test:: (3675, 19)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1  0.7700831025 0.7942857143 0.7819971871       350
          0  0.9782739891 0.9750375940 0.9766531104      3325

avg / total  0.9584462856 0.9578231293 0.9581144510      3675


**************step is::3, n_splits::4 *****************


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:


Optimal number of features::19
Ranking of features:: [1 1 1 1 1 1 1 1 1 2 1 1 1 1 2 1 1 1 1 1 2 1]
selector.support_:: [ True  True  True  True  True  True  True  True  True False  True  True
  True  True False  True  True  True  True  True False  True]
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 18)
shape of X_all:: (18372, 19)
shape of X_train:: (14697, 19)
shape of X_test:: (3675, 19)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1  0.7700831025 0.7942857143 0.7819971871       350
          0  0.9782739891 0.9750375940 0.9766531104      3325

avg / total  0.9584462856 0.9578231293 0.9581144510      3675


**************step is::3, n_splits::5 *****************


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


Optimal number of features::19
Ranking of features:: [1 1 1 1 1 1 1 1 1 2 1 1 1 1 2 1 1 1 1 1 2 1]
selector.support_:: [ True  True  True  True  True  True  True  True  True False  True  True
  True  True False  True  True  True  True  True False  True]
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 18)
shape of X_all:: (18372, 19)
shape of X_train:: (14697, 19)
shape of X_test:: (3675, 19)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1  0.7700831025 0.7942857143 0.7819971871       350
          0  0.9782739891 0.9750375940 0.9766531104      3325

avg / total  0.9584462856 0.9578231293 0.9581144510      3675


**************step is::3, n_splits::6 *****************


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


Optimal number of features::19
Ranking of features:: [1 1 1 1 1 1 1 1 1 2 1 1 1 1 2 1 1 1 1 1 2 1]
selector.support_:: [ True  True  True  True  True  True  True  True  True False  True  True
  True  True False  True  True  True  True  True False  True]
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 18)
shape of X_all:: (18372, 19)
shape of X_train:: (14697, 19)
shape of X_test:: (3675, 19)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1  0.7700831025 0.7942857143 0.7819971871       350
          0  0.9782739891 0.9750375940 0.9766531104      3325

avg / total  0.9584462856 0.9578231293 0.9581144510      3675


**************step is::3, n_splits::7 *****************


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


Optimal number of features::19
Ranking of features:: [1 1 1 1 1 1 1 1 1 2 1 1 1 1 2 1 1 1 1 1 2 1]
selector.support_:: [ True  True  True  True  True  True  True  True  True False  True  True
  True  True False  True  True  True  True  True False  True]
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 18)
shape of X_all:: (18372, 19)
shape of X_train:: (14697, 19)
shape of X_test:: (3675, 19)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1  0.7700831025 0.7942857143 0.7819971871       350
          0  0.9782739891 0.9750375940 0.9766531104      3325

avg / total  0.9584462856 0.9578231293 0.9581144510      3675


**************step is::3, n_splits::8 *****************


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


Optimal number of features::22
Ranking of features:: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
selector.support_:: [ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True]
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 22)
shape of X_all:: (18372, 22)
shape of X_train:: (14697, 22)
shape of X_test:: (3675, 22)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1  0.7700831025 0.8034682081 0.7864214993       346
          0  0.9794809897 0.9750675879 0.9772693060      3329

avg / total  0.9597662499 0.9589115646 0.9593010499      3675


**************step is::3, n_splits::9 *****************


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


Optimal number of features::22
Ranking of features:: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
selector.support_:: [ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True]
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 22)
shape of X_all:: (18372, 22)
shape of X_train:: (14697, 22)
shape of X_test:: (3675, 22)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1  0.7700831025 0.8034682081 0.7864214993       346
          0  0.9794809897 0.9750675879 0.9772693060      3329

avg / total  0.9597662499 0.9589115646 0.9593010499      3675


**************step is::4, n_splits::2 *****************


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


Optimal number of features::22
Ranking of features:: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
selector.support_:: [ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True]
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 22)
shape of X_all:: (18372, 22)
shape of X_train:: (14697, 22)
shape of X_test:: (3675, 22)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1  0.7700831025 0.8034682081 0.7864214993       346
          0  0.9794809897 0.9750675879 0.9772693060      3329

avg / total  0.9597662499 0.9589115646 0.9593010499      3675


**************step is::4, n_splits::3 *****************


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


Optimal number of features::22
Ranking of features:: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
selector.support_:: [ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True]
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 22)
shape of X_all:: (18372, 22)
shape of X_train:: (14697, 22)
shape of X_test:: (3675, 22)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1  0.7700831025 0.8034682081 0.7864214993       346
          0  0.9794809897 0.9750675879 0.9772693060      3329

avg / total  0.9597662499 0.9589115646 0.9593010499      3675


**************step is::4, n_splits::4 *****************


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


Optimal number of features::22
Ranking of features:: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
selector.support_:: [ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True]
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 22)
shape of X_all:: (18372, 22)
shape of X_train:: (14697, 22)
shape of X_test:: (3675, 22)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1  0.7700831025 0.8034682081 0.7864214993       346
          0  0.9794809897 0.9750675879 0.9772693060      3329

avg / total  0.9597662499 0.9589115646 0.9593010499      3675


**************step is::4, n_splits::5 *****************


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


Optimal number of features::14
Ranking of features:: [1 1 2 2 1 1 2 2 1 3 1 1 1 1 3 1 3 1 1 1 3 1]
selector.support_:: [ True  True False False  True  True False False  True False  True  True
  True  True False  True False  True  True  True False  True]
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 13)
shape of X_all:: (18372, 14)
shape of X_train:: (14697, 14)
shape of X_test:: (3675, 14)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1  0.7728531856 0.8040345821 0.7881355932       347
          0  0.9794809897 0.9753605769 0.9774164408      3328

avg / total  0.9599708270 0.9591836735 0.9595442084      3675


**************step is::4, n_splits::6 *****************


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


Optimal number of features::10
Ranking of features:: [1 1 3 3 2 1 3 3 2 4 1 2 1 2 4 1 4 1 1 1 4 1]
selector.support_:: [ True  True False False False  True False False False False  True False
  True False False  True False  True  True  True False  True]
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 9)
shape of X_all:: (18372, 10)
shape of X_train:: (14697, 10)
shape of X_test:: (3675, 10)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1  0.7673130194 0.8052325581 0.7858156028       344
          0  0.9797827399 0.9747823476 0.9772761475      3331

avg / total  0.9598944178 0.9589115646 0.9593543985      3675


**************step is::4, n_splits::7 *****************


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


Optimal number of features::18
Ranking of features:: [1 1 1 1 1 1 1 1 1 2 1 1 1 1 2 1 2 1 1 1 2 1]
selector.support_:: [ True  True  True  True  True  True  True  True  True False  True  True
  True  True False  True False  True  True  True False  True]
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 17)
shape of X_all:: (18372, 18)
shape of X_train:: (14697, 18)
shape of X_test:: (3675, 18)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1  0.7728531856 0.7971428571 0.7848101266       350
          0  0.9785757393 0.9753383459 0.9769543606      3325

avg / total  0.9589831151 0.9583673469 0.9586549097      3675


**************step is::4, n_splits::8 *****************


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


Optimal number of features::18
Ranking of features:: [1 1 1 1 1 1 1 1 1 2 1 1 1 1 2 1 2 1 1 1 2 1]
selector.support_:: [ True  True  True  True  True  True  True  True  True False  True  True
  True  True False  True False  True  True  True False  True]
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 17)
shape of X_all:: (18372, 18)
shape of X_train:: (14697, 18)
shape of X_test:: (3675, 18)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1  0.7728531856 0.7971428571 0.7848101266       350
          0  0.9785757393 0.9753383459 0.9769543606      3325

avg / total  0.9589831151 0.9583673469 0.9586549097      3675


**************step is::4, n_splits::9 *****************


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


Optimal number of features::14
Ranking of features:: [1 1 2 2 1 1 2 2 1 3 1 1 1 1 3 1 3 1 1 1 3 1]
selector.support_:: [ True  True False False  True  True False False  True False  True  True
  True  True False  True False  True  True  True False  True]
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 13)
shape of X_all:: (18372, 14)
shape of X_train:: (14697, 14)
shape of X_test:: (3675, 14)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})
             precision    recall  f1-score   support

          1  0.7728531856 0.8040345821 0.7881355932       347
          0  0.9794809897 0.9753605769 0.9774164408      3328

avg / total  0.9599708270 0.9591836735 0.9595442084      3675



  if diff:


In [189]:
# 通过观察可以发现，当step=1，n_splits=3时，f1值可以达到最大值：0.9596501997，这也是方差分析，rfe和rfecv中最好的效果。
def trainAndTestXGBCrfecvSelect(X_all, y, X_test_all, step=1, n_splits=3):
    
     # RFECV
    estimator = XGBClassifier()
    selector = RFECV(estimator=estimator, step = step, cv=StratifiedKFold(n_splits=n_splits), scoring="accuracy")
    X_all_rfecv = selector.fit_transform(X_all, y) 
    print("Optimal number of features::%d" % selector.n_features_)
    print("Ranking of features:: %s" % selector.ranking_)
    X_test_all_rfecv = selector.transform(X_test_all)
    selected_idx = np.where(pd.Series(selector.support_)==True)[0]   # n_features_to_select个选择出来的特征，每一个特征为True
    print("selector.support_::", selector.support_)
    
    # 因为Wrapper离散特征和连续特征需要一起训练搜索特征子集，但是因为离散特征不需要标准化，所以这里需要将其分开
    discrete_idx = list(set([19, 20, 21]) - set(selected_idx))   # 最后3列为离散值
    X_continuous_tmp = pd.DataFrame(X_all_rfecv)[list(range(0, len(selected_idx)-len(discrete_idx)))]
    X_discreate_tmp = pd.DataFrame(X_all_rfecv)[list(range(len(selected_idx)-len(discrete_idx), len(selected_idx)))]
    X_test_continuous_tmp = pd.DataFrame(X_test_all_rfecv)[list(range(0, len(selected_idx)-len(discrete_idx)))]
    X_test_discreate_tmp = pd.DataFrame(X_test_all_rfecv)[list(range(len(selected_idx)-len(discrete_idx), len(selected_idx)))]
    
    
    # 归一化
    ss = StandardScaler()
    X_continuous_new = ss.fit_transform(X_continuous_tmp)
    print("type of X_continuous_new::", type(X_continuous_new))
    print("shape of X_continuous_new::", X_continuous_new.shape)
    X_test_continous_new = ss.transform(X_test_continuous_tmp)
    print("type of X_test_continous_new::", type(X_test_continous_new))
    print("shape of X_test_continous_new::", X_test_continous_new.shape)

    # 将连续值和离散值拼接
    X_all_new = np.hstack((X_continuous_new, X_discreate_tmp))
    print("shape of X_all::", X_all.shape)
    X_test_all_new = np.hstack((X_test_continous_new, X_test_discreate_tmp))
    print("shape of X_test_all::", X_test_all.shape)

    # 划分训练集和测试集
    X_train, X_test, y_train, y_test = train_test_split(X_all_new, y, test_size=0.2, random_state=33)
    print("shape of X_train::", X_train.shape)
    print("shape of X_test::", X_test.shape)
    print("shape of y_train::", y_train.shape)
    print("Counter of y_train::", Counter(y_train))
    print("shape of y_test::", y_test.shape)
    print("Counter of y_test::", Counter(y_test))
    
    
    xgbc = XGBClassifier()
    xgbc.fit(X_train, y_train)
    y_test_predict = xgbc.predict(X_test)
    print(classification_report(y_test_predict, y_test, target_names=['1', '0'], digits=5))
    
    y_predict = xgbc.predict(X_test_all_new)
    print("y_predict::", Counter(y_predict))

# 调用预测函数
X_all_copy = X_all.copy()
y_copy = y.copy()
X_test_all_copy = X_test_all.copy()
trainAndTestXGBCrfecvSelect(X_all_copy, y_copy, X_test_all_copy)

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


Optimal number of features::13
Ranking of features:: [ 1  1  5  6  3  1  1  4  2  8  1  1  1  1  9  1  7  1  1  1 10  1]
selector.support_:: [ True  True False False False  True  True False False False  True  True
  True  True False  True False  True  True  True False  True]
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 12)
type of X_test_continous_new:: <class 'numpy.ndarray'>
shape of X_test_continous_new:: (181057, 12)
shape of X_all:: (18372, 22)
shape of X_test_all:: (181057, 22)
shape of X_train:: (14697, 13)
shape of X_test:: (3675, 13)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})


  if diff:


             precision    recall  f1-score   support

          1    0.76731   0.80758   0.78693       343
          0    0.98008   0.97479   0.97743      3332

avg / total    0.96023   0.95918   0.95965      3675

y_predict:: Counter({1: 118030, 0: 63027})


  if diff:


### 7、Embedded方法

使用L1正则项实现特征选择

In [185]:
def trainSVC_l1(X_continous, X_discrete_oneHot, y, C=0.01):
    
    # 归一化
    ss = StandardScaler()
    X_continuous_new = ss.fit_transform(X_continous)
    print("type of X_continuous_new::", type(X_continuous_new))
    print("shape of X_continuous_new::", X_continuous_new.shape)

    # 将连续值和离散值拼接
    X_all = np.hstack((X_continuous_new, X_discrete_oneHot))
    print("shape of X_all::", X_all.shape)

    # 划分训练集和测试集
    X_train, X_test, y_train, y_test = train_test_split(X_all, y, test_size=0.2, random_state=33)
    print("shape of X_train::", X_train.shape)
    print("shape of X_test::", X_test.shape)
    print("shape of y_train::", y_train.shape)
    print("Counter of y_train::", Counter(y_train))
    print("shape of y_test::", y_test.shape)
    print("Counter of y_test::", Counter(y_test))
    
    
    xgbc = LinearSVC(C=C, penalty="l1", dual=False)
    xgbc.fit(X_train, y_train)
    y_test_predict = xgbc.predict(X_test)
    print(classification_report(y_test_predict, y_test, target_names=['1', '0'], digits=10))
    

# 网格搜索
X_continous_copy = X_continous.copy()
X_discrete_oneHot_copy = X_discrete_oneHot.copy()
y_copy = y.copy()
"""
连续特征共有19个，
第一次网格搜索：range(1, 30, 2)，然后定位到：[1.5, 3];
第二次网格搜索：range(15, 30, 1)，然后定位到：[1.5, 2];
第三次网格搜索：range(150, 300, 5)，然后定位到：[1.55, 1.65];
第四次网格搜索：range(155, 166, 1)，然后定位到：[1.55, 1.65];
"""
for C in range(155, 166, 1):    
    print()
    param = C/100
    print("**************C is::%f*****************" % param)
    trainSVC_l1(X_continous_copy, X_discrete_oneHot_copy, y_copy, param)  


**************C is::1.550000*****************
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 19)
shape of X_all:: (18372, 22)
shape of X_train:: (14697, 22)
shape of X_test:: (3675, 22)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})
             precision    recall  f1-score   support

          1  0.7839335180 0.7486772487 0.7658998647       378
          0  0.9713337357 0.9763421292 0.9738314930      3297

avg / total  0.9520582847 0.9529251701 0.9524442398      3675


**************C is::1.560000*****************
type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 19)
shape of X_all:: (18372, 22)
shape of X_train:: (14697, 22)
shape of X_test:: (3675, 22)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})
  

In [186]:
# 通过上述网络搜索的结果，可以看出，到最后f1值的开始循环震荡，说明结果已经趋于稳定，我们只需要将C设置为1.55即可
def trainAndTestSVCL1Select(X_continous, X_discrete_oneHot, y, X_test_continous, X_test_discrete_oneHot, C=1.55):
    
    # 归一化
    ss = StandardScaler()
    X_continuous_new = ss.fit_transform(X_continous)
    print("type of X_continuous_new::", type(X_continuous_new))
    print("shape of X_continuous_new::", X_continuous_new.shape)
    X_test_continous_new = ss.transform(X_test_continous)
    print("type of X_test_continous_new::", type(X_test_continous_new))
    print("shape of X_test_continous_new::", X_test_continous_new.shape)

    # 将连续值和离散值拼接
    X_all = np.hstack((X_continuous_new, X_discrete_oneHot))
    print("shape of X_all::", X_all.shape)
    X_test_all = np.hstack((X_test_continous_new, X_test_discrete_oneHot))
    print("shape of X_test_all::", X_test_all.shape)

    # 划分训练集和测试集
    X_train, X_test, y_train, y_test = train_test_split(X_all, y, test_size=0.2, random_state=33)
    print("shape of X_train::", X_train.shape)
    print("shape of X_test::", X_test.shape)
    print("shape of y_train::", y_train.shape)
    print("Counter of y_train::", Counter(y_train))
    print("shape of y_test::", y_test.shape)
    print("Counter of y_test::", Counter(y_test))
    
    
    xgbc = LinearSVC(C=C, penalty="l1", dual=False)
    xgbc.fit(X_train, y_train)
    y_test_predict = xgbc.predict(X_test)
    print(classification_report(y_test_predict, y_test, target_names=['1', '0'], digits=5))
    
    y_predict = xgbc.predict(X_test_all)
    print("y_predict::", Counter(y_predict))

# 调用预测函数
X_continous_copy = X_continous.copy()
X_discrete_oneHot_copy = X_discrete_oneHot.copy()
y_copy = y.copy()
X_test_continous_copy = X_test_continous.copy()
X_test_discrete_oneHot_copy = X_test_discrete_oneHot.copy()
trainAndTestSVCL1Select(X_continous_copy, X_discrete_oneHot_copy, y_copy, X_test_continous_copy, X_test_discrete_oneHot_copy)

type of X_continuous_new:: <class 'numpy.ndarray'>
shape of X_continuous_new:: (18372, 19)
type of X_test_continous_new:: <class 'numpy.ndarray'>
shape of X_test_continous_new:: (181057, 19)
shape of X_all:: (18372, 22)
shape of X_test_all:: (181057, 22)
shape of X_train:: (14697, 22)
shape of X_test:: (3675, 22)
shape of y_train:: (14697,)
Counter of y_train:: Counter({1: 13387, 0: 1310})
shape of y_test:: (3675,)
Counter of y_test:: Counter({1: 3314, 0: 361})
             precision    recall  f1-score   support

          1    0.78393   0.74868   0.76590       378
          0    0.97133   0.97634   0.97383      3297

avg / total    0.95206   0.95293   0.95244      3675

y_predict:: Counter({1: 120281, 0: 60776})


### 分析：

本代码说明，在所有的特征选择方法中，针对此数据集，最优的方法是使用RFE，参数为：n_features_to_select=15