## 特征选择工具包

### 1、综述

#### 1.1 特征选择：

- 业务层特征选择
- 技术层特征选择

#### 1.2 特征选择的流程

- 数据质量分析（盲选）
- 特征质量分析（粗选、精挑细选）

### 2、过滤法

In [87]:
import toad
import numpy as np
import pandas as pd
from minepy import MINE
from scipy.stats import pearsonr
from collections import defaultdict
import statsmodels.api as sm

from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import f_classif, f_regression

from sklearn.svm import LinearSVC
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

In [8]:
def list_diff(list1, list2):
    """Return the elements of ``list1`` that do not appear in ``list2``.

    The difference is computed with ``numpy.setdiff1d``, so the result is
    sorted and de-duplicated, and its items are NumPy scalars.

    Empty operands are handled explicitly: an empty ``list1`` gives ``[]`` and
    an empty ``list2`` gives every (unique) element of ``list1``. Callers that
    extend a list with the result therefore never receive ``None``.

    Parameters
    ----------
    list1, list2 : sequence
        Any sized sequences accepted by ``numpy.asarray`` (list, range,
        ndarray, ...).

    Returns
    -------
    list
        Sorted unique values of ``list1`` that are absent from ``list2``.
    """
    if len(list1) == 0:
        # Nothing to subtract from; avoid returning a float-typed empty array.
        return []
    return list(np.setdiff1d(list1, list2))

In [21]:
def count_unique(df):
    """Return, per column of ``df``, the number of distinct non-NA values."""
    def _n_distinct(column):
        # Series.nunique() ignores NA values by default.
        return column.nunique()

    return df.apply(_n_distinct, axis=0)

def near_zero_var(df, freq_cut=0.95, unique_cut=10):
    """Flag columns whose values are constant or dominated by one value.

    A column is flagged when it has zero or one distinct non-NA value, or when
    both of the following hold:

    * the most frequent value accounts for at least ``freq_cut`` of the
      non-NA observations, and
    * the number of distinct values is at most ``unique_cut`` percent of the
      number of rows.

    Parameters
    ----------
    df : pandas.DataFrame
    freq_cut : float
        Minimum share (0-1) of the most frequent value.
    unique_cut : float
        Maximum percentage (0-100) of distinct values relative to row count.

    Returns
    -------
    pandas.Series of bool, indexed by column name.
    """
    distinct_counts = count_unique(df)
    row_total, _ = df.shape
    pct_distinct = 100 * distinct_counts / row_total

    def _top_share(column):
        # Share of the non-NA observations taken by the most frequent value.
        n_distinct = distinct_counts[column.name]
        if n_distinct == 0:
            return 0.0
        if n_distinct == 1:
            return 1.0
        tallies = column.value_counts()
        return float(tallies.iloc[0]) / tallies.sum()

    dominant_share = df.apply(_top_share)
    # Columns that are entirely NA or hold a single constant value.
    is_constant = (distinct_counts == 0) | (distinct_counts == 1)
    is_dominated = (dominant_share >= freq_cut) & (pct_distinct <= unique_cut)
    return is_dominated | is_constant

In [18]:
# Distinct non-NA value count for every column of X.
# NOTE(review): X is created in the In [12] cell, which is listed below this one;
# the cells appear out of execution order.
count_unique(X)

sepal length (cm)    35
sepal width (cm)     23
petal length (cm)    43
petal width (cm)     22
dtype: int64

In [12]:
# Load the iris dataset bundled with scikit-learn.
from sklearn.datasets import load_iris
data = load_iris()

# Convert the feature matrix to a DataFrame with named columns
X = pd.DataFrame.from_records(data=data.data, columns=data.feature_names)

In [22]:
near_zero_var(X)

sepal length (cm)    0.071429
sepal width (cm)     0.209677
petal length (cm)    0.094891
petal width (cm)     0.239669
dtype: float64


sepal length (cm)    False
sepal width (cm)     False
petal length (cm)    False
petal width (cm)     False
dtype: bool

In [73]:
class FeatureFilter():
    """Feature-selection helpers: filter, wrapper (RFE) and embedded methods.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. Column names are kept in ``self.cols``; the values are
        stored as a NumPy array in ``self.X``.
    y : pandas.Series or numpy.ndarray
        Target variable.
    n_features_to_select : int, optional
        Number of features to keep. Defaults to ``ceil(2/3 * n_columns)``.
    only_get_index : bool
        If true, the ``by_*`` methods return the indices of the kept columns;
        otherwise they return the reduced feature array.
    """
    def __init__(self, X, y, n_features_to_select=None, only_get_index=True):
        self.cols = X.columns.tolist()
        self.X = np.array(X)
        self.y = np.array(y)
        self.x_index = range(self.X.shape[1])
        self.only_get_index = only_get_index
        self.n_features_to_select = n_features_to_select
        if n_features_to_select is None:
            # Default: keep two thirds of the columns, rounded up.
            self.n_features_to_select = int(np.ceil(2 / 3 * self.X.shape[1]))
            print('self.n_features_to_select:', self.n_features_to_select)
        # Accumulates the indices dropped by every selection call made so far.
        self.removed = []

    def _log(self, index, method):
        """Print the kept / removed column indices and record the removed ones."""
        print('***{}:'.format(method))
        print('  remain feature index:\n  {}'.format(index))
        # Computed here (not via list_diff) so that an empty selection yields
        # "every column removed" instead of None, which would break ``+=``.
        rmvd = np.setdiff1d(list(self.x_index), index).tolist()
        self.removed += rmvd
        print('  removed feature index:\n  {}\n'.format(rmvd))

    def _return(self, ret, method):
        """Log and return the result of a fitted scikit-learn selector."""
        # get_support(indices=True) gives the positions of the selected columns.
        index = ret.get_support(indices=True)
        self._log(index, method)

        if self.only_get_index:
            return index
        # Otherwise return X restricted to the selected columns.
        return ret.transform(self.X)

    def _return_index(self, index, method):
        """Log and return a selection given directly as column indices."""
        self._log(index, method)

        if self.only_get_index:
            return index
        # Otherwise return X restricted to the selected columns.
        return self.X[:, index]

    def _by_kbest(self, func, method):
        """Filter method: keep the ``n_features_to_select`` best-scoring columns."""
        ret = SelectKBest(func, k=self.n_features_to_select).fit(self.X, self.y)
        return self._return(ret, method)

    def by_var(self, threshold=0.16):
        """Select columns with ``VarianceThreshold(threshold)``."""
        ret = VarianceThreshold(threshold=threshold).fit(self.X)
        return self._return(ret, 'by_var')

    def count_unique(self, df):
        """Return, per column of ``df``, the number of distinct non-NA values."""
        return df.apply(lambda x: x.nunique(), axis=0)

    def near_zero_var(self, df, freq_cut=0.95, unique_cut=10):
        """Flag columns that are constant or dominated by a single value.

        A column is flagged when it has zero or one distinct non-NA value, or
        when its most frequent value covers at least ``freq_cut`` of the
        non-NA observations while the distinct-value count is at most
        ``unique_cut`` percent of the row count.

        Returns a boolean Series indexed by column name.
        """
        nb_unique_values = self.count_unique(df)
        n_rows, _ = df.shape
        percent_unique = 100 * nb_unique_values / n_rows

        def helper_freq(x):
            if nb_unique_values[x.name] == 0:
                return 0.0
            elif nb_unique_values[x.name] == 1:
                return 1.0
            else:
                t = x.value_counts()
                # Share of the most frequent value among all non-NA values.
                return float(t.iloc[0]) / t.sum()

        freq_ratio = df.apply(helper_freq)
        # Entirely-NA or single-valued columns.
        zerovar = (nb_unique_values == 0) | (nb_unique_values == 1)
        near_zero = ((freq_ratio >= freq_cut) & (percent_unique <= unique_cut)) | (zerovar)
        return near_zero

    def by_chi2(self):
        """Select the top columns by chi-squared statistic."""
        return self._by_kbest(chi2, 'by_chi2')

    def by_pearson(self):
        """Select the top columns by absolute Pearson correlation with ``y``.

        The absolute value is used so that a strong negative correlation ranks
        as high as an equally strong positive one.
        """
        def _abs_pearson(X, Y):
            return np.array([abs(pearsonr(col, Y)[0]) for col in X.T])

        return self._by_kbest(_abs_pearson, 'by_pearson')

    def by_mi(self):
        """Select the top columns by mutual information with ``y``."""
        return self._by_kbest(mutual_info_classif, 'by_mi')

    def by_max_info(self):
        """Select the top columns by maximal information coefficient (MIC)."""
        def _mic_scores(X, Y):
            return np.array([self.cal_mic(col, Y) for col in X.T])

        return self._by_kbest(_mic_scores, 'by_max_info')

    def by_f_regression(self):
        """Select the top columns using ``f_regression`` as the score function.

        Returns the selected indices (or the reduced array), like every other
        ``by_*`` method; it does not return the F values or p-values.
        """
        return self._by_kbest(f_regression, 'f_regression')

    def by_f_classif(self):
        """Select the top columns by ANOVA F-value (``f_classif``)."""
        return self._by_kbest(f_classif, 'f_classif')

    def by_iv(self, threshold=0.02, return_iv=False):
        """Keep columns whose information value (IV) exceeds ``threshold``.

        Parameters
        ----------
        threshold : float
            Columns with ``IV > threshold`` are kept.
        return_iv : bool
            If true, also return a ``{column index: IV rounded to 4 places}``
            dict as a second value.
        """
        iv = np.array([toad.stats.IV(col, self.y) for col in self.X.T])
        index = np.argwhere(iv > threshold).flatten()
        iv_info = dict(zip(self.x_index, np.round(iv, 4)))
        if return_iv:
            return self._return_index(index, 'by_iv'), iv_info
        else:
            return self._return_index(index, 'by_iv')

    def filter_bycorr_with_orderly_cols(self, df, orderly_cols, columns=None, threshold=0.8, gap=0.1):
        """Drop correlated features, giving priority to earlier (higher-IV) ones.

        Procedure:
        1. ``orderly_cols`` lists the features in priority order (the caller is
           expected to sort them by IV, descending).
        2. Walking that order, each kept feature is correlated with every
           remaining lower-priority feature; a partner whose absolute
           correlation exceeds the threshold is removed.

        ``gap`` raises the threshold for higher-priority partners: the earlier
        a partner sits in the order, the larger the correlation needed to drop
        it. ``gap=0`` disables this and applies ``threshold`` uniformly.

        Parameters
        ----------
        df : pandas.DataFrame
            Data holding the columns to compare.
        orderly_cols : list
            Column names in priority order.
        columns : list, optional
            Restrict the analysis to these columns (default: all of ``df``).
        threshold : float
            Base absolute-correlation cut-off.
        gap : float
            Total extra threshold spread over the remaining candidates.

        Returns
        -------
        list
            Names of the columns that were kept, in priority order.
        """
        def _get_diff_list(a_column, a_list, removed=None):
            """Return the items of a_list that are neither a_column nor in removed."""
            excluded = set(removed) if removed else set()
            return [aa for aa in a_list if aa != a_column and aa not in excluded]

        if columns is None:
            columns = df.columns.to_list()

        removed = []
        to_cal_columns = [cc for cc in orderly_cols if cc in columns]
        cal_ed = []

        for cc in to_cal_columns:
            cal_ed.append(cc)
            if cc not in removed:
                # Candidates: not yet processed and not already removed.
                tmp_cols = _get_diff_list(cc, to_cal_columns, removed=removed + cal_ed)
                thred_diff = gap * 1.0 / (len(tmp_cols) + 1)
                count = len(tmp_cols)
                for tt in tmp_cols:
                    count -= 1
                    relation = df[cc].corr(df[tt])
                    # Earlier candidates (larger ``count``) face a higher bar.
                    if abs(relation) > threshold + thred_diff * count:
                        removed.append(tt)

        result = [cc for cc in to_cal_columns if cc not in removed]
        print('After filter，remains:{}\n'.format(len(result)))
        return result

    def cal_mic(self, x, y):
        """Return the maximal information coefficient between ``x`` and ``y``."""
        m = MINE()
        m.compute_score(x, y)
        return m.mic()

    def cal_mics(self, dfx, y):
        """Return the MIC of every column of DataFrame ``dfx`` against ``y``."""
        return dfx.apply(lambda x: self.cal_mic(x, y))

    def mrmr(self, dfx, y, n):
        """Minimum-redundancy maximum-relevance (mRMR) selection using MIC.

        1. Relevance: MIC between each feature and the target.
        2. Redundancy: mean MIC between a candidate and the features already
           selected.
        3. At each step the candidate maximising ``relevance - redundancy`` is
           added.

        Parameters
        ----------
        dfx : pandas.DataFrame
            Candidate features.
        y : pandas.Series
            Target variable.
        n : int
            Number of features to select; capped at the number of columns.

        Returns
        -------
        list
            Selected column names in selection order.
        """
        relevances = self.cal_mics(dfx, y)
        last_sel = relevances.idxmax()
        selected = [last_sel]
        relevances = relevances.to_dict()
        print('选中：{}'.format(last_sel))
        # Running sum of MIC between each candidate and the selected features.
        redundances = defaultdict(float)

        # Bound by the number of columns (features), not rows.
        n_candidates = dfx.shape[1]
        while len(selected) < n_candidates and len(selected) < n:
            mr = -np.inf
            new_sel = None
            for cc in dfx.columns:
                if cc not in selected:
                    redundances[cc] += self.cal_mic(dfx[cc], dfx[last_sel])
                    # Trade relevance off against mean redundancy.
                    _mrmr = relevances[cc] - (redundances[cc] / len(selected))
                    if _mrmr > mr:
                        mr = _mrmr
                        new_sel = cc
            if new_sel is None:
                # No comparable candidate left (e.g. all scores are NaN).
                break
            print('选中：{}'.format(new_sel))
            selected.append(new_sel)
            last_sel = new_sel
        return selected

    def _by_RFE(self, mm, method, step=1):
        """Wrapper method: recursive feature elimination with estimator ``mm``."""
        ret = RFE(estimator=mm,
                  n_features_to_select=self.n_features_to_select,
                  step=step).fit(self.X, self.y)
        return self._return(ret, method)

    def by_RFE_lr(self, args=None):
        """RFE with a default ``LogisticRegression``; ``args`` is unused."""
        return self._by_RFE(LogisticRegression(), 'by_REF_lr')

    def by_RFE_svm(self, args=None):
        """RFE with a default ``LinearSVC``; ``args`` is unused."""
        return self._by_RFE(LinearSVC(), 'by_REF_svm')

    def _by_model(self, mm, method):
        """Embedded method: ``SelectFromModel`` fitted with estimator ``mm``."""
        ret = SelectFromModel(mm).fit(self.X, self.y)
        return self._return(ret, method)

    def by_gbdt(self):
        """Embedded selection with ``GradientBoostingClassifier``."""
        return self._by_model(GradientBoostingClassifier(), 'by_gbdt')

    def by_rf(self):
        """Embedded selection with ``RandomForestClassifier``."""
        return self._by_model(RandomForestClassifier(), 'by_rf')

    def by_et(self):
        """Embedded selection with ``ExtraTreesClassifier``."""
        return self._by_model(ExtraTreesClassifier(), 'by_et')

    def by_lr(self, C=0.1):
        """Embedded selection with L1-penalised logistic regression."""
        return self._by_model(LogisticRegression(penalty='l1', C=C, solver='liblinear'), 'by_lr')

    def by_svm(self, C=0.01):
        """Embedded selection with L1-penalised ``LinearSVC``."""
        return self._by_model(LinearSVC(penalty='l1', C=C, dual=False), 'by_svm')

    def example_10_methods(self):
        """Run ten selection methods and rank features by number of votes.

        Returns
        -------
        selected : list
            The ``n_features_to_select`` column names with the most votes
            (ties broken by column position).
        stat_f : pandas.DataFrame
            One row per feature with ``col_idx``, ``count``, ``feature`` and a
            0/1 column per method. Features chosen by no method are included
            with ``count == 0``.
        """
        # Column order of the per-method 0/1 flags in the result.
        name = [
            'by_var', 'by_max_info', 'by_pearson', 'by_RFE_svm', 'by_RFE_lr',
            'by_svm', 'by_lr', 'by_et', 'by_rf', 'by_gbdt'
        ]
        # Order in which the methods are executed (and logged).
        run_order = [
            'by_var', 'by_pearson', 'by_max_info', 'by_RFE_svm', 'by_RFE_lr',
            'by_svm', 'by_lr', 'by_et', 'by_rf', 'by_gbdt'
        ]

        # Voting needs indices, so force index output for the duration.
        saved_flag = self.only_get_index
        self.only_get_index = True
        try:
            method_dict = {}
            for nm in run_order:
                method_dict[nm] = getattr(self, nm)()
        finally:
            self.only_get_index = saved_flag

        n_cols = len(self.cols)

        # 0/1 flag per feature and method: was the feature kept by the method?
        dicts01 = {}
        for nm in name:
            chosen = set(int(i) for i in method_dict[nm])
            dicts01[nm] = [1 if i in chosen else 0 for i in range(n_cols)]

        # One row per feature, so the flag columns always align even when a
        # feature received no vote at all.
        counts = [sum(dicts01[nm][i] for nm in name) for i in range(n_cols)]
        stat_f = pd.DataFrame({
            'col_idx': list(range(n_cols)),
            'count': counts,
            'feature': self.cols,
        })
        for nm in name:
            stat_f[nm] = dicts01[nm]

        # Most votes first; ties resolved by ascending column index.
        stat_f.sort_values(by=['count', 'col_idx'],
                           ascending=[False, True],
                           inplace=True)
        stat_f.reset_index(drop=True, inplace=True)

        selected = stat_f['feature'].iloc[:self.n_features_to_select].tolist()
        print('*' * 10 + 'remains columns:\n{}'.format(selected))

        return selected, stat_f

In [74]:
# Rebuild the iris DataFrame and attach the target column.
from sklearn.datasets import load_iris
data = load_iris()

# Convert the feature matrix to a DataFrame with named columns
X = pd.DataFrame.from_records(data=data.data, columns=data.feature_names)
# df is an alias of X (not a copy), so the 'target' column added below is
# also visible through X.
df = X
df['target'] = data.target
# Keep only the first 100 rows, as an independent copy.
# NOTE(review): presumably this restricts iris to two classes (binary target) — confirm.
df = df[:100].copy()
# df = df.copy()
df.shape

(100, 5)

In [75]:
# Feature columns: every column of df except the target.
x_col = [cc for cc in df.columns if cc != 'target']

In [76]:
FF = FeatureFilter(df[x_col], df['target'])

self.n_features_to_select: 3


In [271]:
FF.by_var()

***by_var:
  remain feature index:
  [0 1 2 3]
  removed feature index:
  []



array([0, 1, 2, 3], dtype=int64)

In [272]:
FF.by_chi2()

***by_chi2:
  remain feature index:
  [0 2 3]
  removed feature index:
  [1]



array([0, 2, 3], dtype=int64)

In [273]:
FF.by_pearson()

***by_pearson:
  remain feature index:
  [0 2 3]
  removed feature index:
  [1]



array([0, 2, 3], dtype=int64)

In [274]:
FF.by_mi()

***by_mi:
  remain feature index:
  [0 2 3]
  removed feature index:
  [1]



array([0, 2, 3], dtype=int64)

In [258]:
FF.by_max_info()

***by_max_info:
  remain feature index:
  [0 2 3]
  removed feature index:
  [1]



array([0, 2, 3], dtype=int64)

In [259]:
FF.by_f_regression()

***f_regression:
  remain feature index:
  [0 2 3]
  removed feature index:
  [1]



array([0, 2, 3], dtype=int64)

In [260]:
FF.by_f_classif()

***f_classif:
  remain feature index:
  [0 2 3]
  removed feature index:
  [1]



array([0, 2, 3], dtype=int64)

In [264]:
FF.by_iv(threshold=4, return_iv=True)

***by_iv:
  remain feature index:
  [2 3]
  removed feature index:
  [0, 1]



(array([2, 3], dtype=int64), {0: 3.456, 1: 3.2406, 2: 7.6676, 3: 7.6676})

In [78]:
# Use the row order of toad.quality's report as the feature priority order.
# NOTE(review): presumably toad.quality sorts features by IV, descending — confirm.
data_iv = toad.quality(df, target='target')
orderly_cols = list(data_iv.index.values)
# Drop correlated features among x_col, preferring earlier entries of orderly_cols.
FF.filter_bycorr_with_orderly_cols(df, orderly_cols, columns=x_col)

After filter，remains:3



['petal length (cm)', 'sepal length (cm)', 'sepal width (cm)']

In [79]:
FF.mrmr(df[x_col], df['target'], 3)

选中：petal length (cm)
选中：sepal width (cm)
选中：petal width (cm)


['petal length (cm)', 'sepal width (cm)', 'petal width (cm)']

### 3、包裹法

In [19]:
class FeatureWrapper():
    """Wrapper-style feature selection via recursive feature elimination (RFE).

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. Column names are kept in ``self.cols``; the values are
        stored as a NumPy array in ``self.X``.
    y : pandas.Series or numpy.ndarray
        Target variable.
    n_features_to_select : int, optional
        Number of features to keep. Defaults to ``ceil(2/3 * n_columns)``.
    only_get_index : bool
        If true, selection methods return the indices of the kept columns;
        otherwise they return the reduced feature array.
    """
    def __init__(self, X, y, n_features_to_select=None, only_get_index=True):
        self.cols = X.columns.tolist()
        self.X = np.array(X)
        self.y = np.array(y)
        self.x_index = range(self.X.shape[1])
        self.only_get_index = only_get_index
        self.n_features_to_select = n_features_to_select
        if n_features_to_select is None:
            # Default: keep two thirds of the columns, rounded up.
            self.n_features_to_select = int(np.ceil(2 / 3 * self.X.shape[1]))
            print('self.n_features_to_select:', self.n_features_to_select)
        # Accumulates the indices dropped by every selection call made so far.
        self.removed = []

    def _log(self, index, method):
        """Print the kept / removed column indices and record the removed ones."""
        print('***{}:'.format(method))
        print('  remain feature index:\n  {}'.format(index))
        # Computed here (not via list_diff) so that an empty selection yields
        # "every column removed" instead of None, which would break ``+=``.
        rmvd = np.setdiff1d(list(self.x_index), index).tolist()
        self.removed += rmvd
        print('  removed feature index:\n  {}\n'.format(rmvd))

    def _return(self, ret, method):
        """Log and return the result of a fitted scikit-learn selector."""
        # get_support(indices=True) gives the positions of the selected columns.
        index = ret.get_support(indices=True)
        self._log(index, method)

        if self.only_get_index:
            return index
        # Otherwise return X restricted to the selected columns.
        return ret.transform(self.X)

    def _return_index(self, index, method):
        """Log and return a selection given directly as column indices."""
        self._log(index, method)

        if self.only_get_index:
            return index
        # Otherwise return X restricted to the selected columns.
        return self.X[:, index]

    def _by_RFE(self, mm, method, step=1):
        """Wrapper method: recursive feature elimination with estimator ``mm``."""
        ret = RFE(estimator=mm,
                  n_features_to_select=self.n_features_to_select,
                  step=step).fit(self.X, self.y)
        return self._return(ret, method)

    def by_RFE_lr(self, args=None):
        """RFE with a default ``LogisticRegression``; ``args`` is unused."""
        return self._by_RFE(LogisticRegression(), 'by_REF_lr')

    def by_RFE_svm(self, args=None):
        """RFE with a default ``LinearSVC``; ``args`` is unused."""
        return self._by_RFE(LinearSVC(), 'by_REF_svm')

In [25]:
FW = FeatureWrapper(df[x_col], df['target'])

self.n_features_to_select: 3


In [26]:
FW.by_RFE_lr()

***by_REF_lr:
  remain feature index:
  [1 2 3]
  removed feature index:
  [0]



array([1, 2, 3], dtype=int64)

In [27]:
FW.by_RFE_svm()

***by_REF_svm:
  remain feature index:
  [1 2 3]
  removed feature index:
  [0]



array([1, 2, 3], dtype=int64)

### 4、嵌入法

In [47]:
class FeatureEmbedded():
    """Embedded feature selection via ``SelectFromModel`` on several estimators.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. Column names are kept in ``self.cols``; the values are
        stored as a NumPy array in ``self.X``.
    y : pandas.Series or numpy.ndarray
        Target variable.
    n_features_to_select : int, optional
        Number of features reported by ``example_10_methods``. Defaults to
        ``ceil(2/3 * n_columns)``.
    only_get_index : bool
        If true, the ``by_*`` methods return the indices of the kept columns;
        otherwise they return the reduced feature array.
    """
    def __init__(self, X, y, n_features_to_select=None, only_get_index=True):
        self.cols = X.columns.tolist()
        self.X = np.array(X)
        self.y = np.array(y)
        self.x_index = range(self.X.shape[1])
        self.only_get_index = only_get_index
        self.n_features_to_select = n_features_to_select
        if n_features_to_select is None:
            # Default: keep two thirds of the columns, rounded up.
            self.n_features_to_select = int(np.ceil(2 / 3 * self.X.shape[1]))
            print('self.n_features_to_select:', self.n_features_to_select)
        # Accumulates the indices dropped by every selection call made so far.
        self.removed = []

    def _log(self, index, method):
        """Print the kept / removed column indices and record the removed ones."""
        print('***{}:'.format(method))
        print('  remain feature index:\n  {}'.format(index))
        # Computed here (not via list_diff) so that an empty selection (possible
        # with a strongly regularised L1 model) yields "every column removed"
        # instead of None, which would break ``+=``.
        rmvd = np.setdiff1d(list(self.x_index), index).tolist()
        self.removed += rmvd
        print('  removed feature index:\n  {}\n'.format(rmvd))

    def _return(self, ret, method):
        """Log and return the result of a fitted scikit-learn selector."""
        # get_support(indices=True) gives the positions of the selected columns.
        index = ret.get_support(indices=True)
        self._log(index, method)

        if self.only_get_index:
            return index
        # Otherwise return X restricted to the selected columns.
        return ret.transform(self.X)

    def _return_index(self, index, method):
        """Log and return a selection given directly as column indices."""
        self._log(index, method)

        if self.only_get_index:
            return index
        # Otherwise return X restricted to the selected columns.
        return self.X[:, index]

    def _by_model(self, mm, method):
        """Embedded method: ``SelectFromModel`` fitted with estimator ``mm``."""
        ret = SelectFromModel(mm).fit(self.X, self.y)
        return self._return(ret, method)

    def by_gbdt(self):
        """Embedded selection with ``GradientBoostingClassifier``."""
        return self._by_model(GradientBoostingClassifier(), 'by_gbdt')

    def by_rf(self):
        """Embedded selection with ``RandomForestClassifier``."""
        return self._by_model(RandomForestClassifier(), 'by_rf')

    def by_et(self):
        """Embedded selection with ``ExtraTreesClassifier``."""
        return self._by_model(ExtraTreesClassifier(), 'by_et')

    def by_lr(self, C=0.1):
        """Embedded selection with L1-penalised logistic regression."""
        return self._by_model(LogisticRegression(penalty='l1', C=C, solver='liblinear'), 'by_lr')

    def by_svm(self, C=0.01):
        """Embedded selection with L1-penalised ``LinearSVC``."""
        return self._by_model(LinearSVC(penalty='l1', C=C, dual=False), 'by_svm')

    def example_10_methods(self):
        """Run the five embedded methods and rank features by number of votes.

        The name is kept for compatibility with ``FeatureFilter``; only the
        five embedded methods of this class are executed.

        Returns
        -------
        selected : list
            The ``n_features_to_select`` column names with the most votes
            (ties broken by column position).
        stat_f : pandas.DataFrame
            One row per feature with ``col_idx``, ``count``, ``feature`` and a
            0/1 column per method. Features chosen by no method are included
            with ``count == 0``.
        """
        name = [
            'by_svm', 'by_lr', 'by_et', 'by_rf', 'by_gbdt'
        ]

        # Voting needs indices, so force index output for the duration.
        saved_flag = self.only_get_index
        self.only_get_index = True
        try:
            method_dict = {}
            for nm in name:
                method_dict[nm] = getattr(self, nm)()
        finally:
            self.only_get_index = saved_flag

        n_cols = len(self.cols)

        # 0/1 flag per feature and method: was the feature kept by the method?
        dicts01 = {}
        for nm in name:
            chosen = set(int(i) for i in method_dict[nm])
            dicts01[nm] = [1 if i in chosen else 0 for i in range(n_cols)]

        # One row per feature, so the flag columns always align even when a
        # feature received no vote at all.
        counts = [sum(dicts01[nm][i] for nm in name) for i in range(n_cols)]
        stat_f = pd.DataFrame({
            'col_idx': list(range(n_cols)),
            'count': counts,
            'feature': self.cols,
        })
        for nm in name:
            stat_f[nm] = dicts01[nm]

        # Most votes first; ties resolved by ascending column index.
        stat_f.sort_values(by=['count', 'col_idx'],
                           ascending=[False, True],
                           inplace=True)
        stat_f.reset_index(drop=True, inplace=True)

        selected = stat_f['feature'].iloc[:self.n_features_to_select].tolist()
        print('*' * 10 + 'remains columns:\n{}'.format(selected))

        return selected, stat_f

In [48]:
FE = FeatureEmbedded(df[x_col], df['target'])

self.n_features_to_select: 3


In [41]:
FE.by_gbdt()

***by_gbdt:
  remain feature index:
  [2 3]
  removed feature index:
  [0, 1]



array([2, 3], dtype=int64)

In [42]:
FE.by_rf()

***by_rf:
  remain feature index:
  [2 3]
  removed feature index:
  [0, 1]



array([2, 3], dtype=int64)

In [43]:
FE.by_et()

***by_et:
  remain feature index:
  [2 3]
  removed feature index:
  [0, 1]



array([2, 3], dtype=int64)

In [44]:
FE.by_lr()

***by_lr:
  remain feature index:
  [1 2]
  removed feature index:
  [0, 3]



array([1, 2], dtype=int64)

In [45]:
FE.by_svm()

***by_svm:
  remain feature index:
  [1 2]
  removed feature index:
  [0, 3]



array([1, 2], dtype=int64)

In [52]:
# Re-create the filter object and run all ten selection methods in one go;
# returns the chosen feature names plus the per-method vote table.
FF = FeatureFilter(df[x_col], df['target'])
FF.example_10_methods()

self.n_features_to_select: 3
***by_var:
  remain feature index:
  [0 1 2 3]
  removed feature index:
  []

***by_pearson:
  remain feature index:
  [0 2 3]
  removed feature index:
  [1]

***by_max_info:
  remain feature index:
  [0 2 3]
  removed feature index:
  [1]

***by_REF_svm:
  remain feature index:
  [1 2 3]
  removed feature index:
  [0]

***by_REF_lr:
  remain feature index:
  [1 2 3]
  removed feature index:
  [0]

***by_svm:
  remain feature index:
  [1 2]
  removed feature index:
  [0, 3]

***by_lr:
  remain feature index:
  [1 2]
  removed feature index:
  [0, 3]

***by_et:
  remain feature index:
  [2 3]
  removed feature index:
  [0, 1]

***by_rf:
  remain feature index:
  [2 3]
  removed feature index:
  [0, 1]

***by_gbdt:
  remain feature index:
  [2 3]
  removed feature index:
  [0, 1]

**********remains columns:
['petal length (cm)', 'petal width (cm)', 'sepal width (cm)']


(['petal length (cm)', 'petal width (cm)', 'sepal width (cm)'],
    col_idx  count            feature  by_var  by_max_info  by_pearson  \
 0        2     10  petal length (cm)       1            1           1   
 1        3      8   petal width (cm)       1            1           1   
 2        1      5   sepal width (cm)       1            0           0   
 3        0      3  sepal length (cm)       1            1           1   
 
    by_RFE_svm  by_RFE_lr  by_svm  by_lr  by_et  by_rf  by_gbdt  
 0           1          1       1      1      1      1        1  
 1           1          1       0      0      1      1        1  
 2           1          1       1      1      0      0        0  
 3           0          0       0      0      0      0        0  )

### 5、逐步回归

In [93]:
# NOTE(review): StepWise is neither defined nor imported in the visible part of
# this notebook — confirm where it comes from before running this cell.
# X['target'] exists because df was created as an alias of X before the
# target column was added.
S = StepWise(X[x_col], X['target'])
S.bf()

In [96]:
# Stepwise selection with toad; return_drop=True also returns the dropped columns.
df2, drop_vars = toad.selection.stepwise(df, target='target', return_drop=True)

In [102]:
drop_vars

['sepal length (cm)']