# 4) Recursive feature elimination (RFE)

This notebook shows the use of RFE to identify informative features for classification.

See also the documentation of the scikit-learn library (https://scikit-learn.org/).

In [None]:
# import libraries

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import RFE

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sb

# default font for all matplotlib figures in this notebook
font = dict(family='DejaVu Sans', weight='regular', size=18)
mpl.rc('font', **font)

## Generate samples to classify

We first generate synthetic data with 2 classes to separate (`s0` and `s1` samples, respectively). The input dimensionality corresponds to `m` features.

In [None]:
# create synthetic dataset where 2 classes of s0+s1 samples of m-dimensional inputs with controlled contrast
def gen_inputs(m,        # input dimensionality
               s0,       # number of samples for class 0
               s1,       # number of samples for class 1
               scaling): # scaling factor to separate classes
    """Generate a two-class dataset of noisy inputs with a feature-dependent shift.

    The first `s0` rows belong to class 0 and the following `s1` rows to
    class 1. Every entry is standard Gaussian noise (drawn from numpy's
    global random generator) plus a class-dependent shift of
    `-scaling*j/m` (class 0) or `+scaling*j/m` (class 1) for feature index
    `j`, so features with a larger index separate the classes more strongly.

    Returns
    -------
    X : float array of shape (s0+s1, m) with the inputs
    lbl : int array of shape (s0+s1,) with the labels (0 or 1)
    """
    n_samples = s0 + s1

    # labels: rows 0..s0-1 are class 0, the remaining s1 rows are class 1
    in_class1 = np.arange(n_samples) >= s0
    lbl = in_class1.astype(int)

    # negative/positive shift amplitude for class 0/class 1
    amplitude = np.where(in_class1, scaling, -scaling)

    # the shift across classes linearly depends on the feature index j
    shift = amplitude[:, np.newaxis] * np.arange(m) / m

    # inputs are random noise plus the shift; the noise is drawn row by row,
    # i.e. in the same order as a nested loop over samples then features
    X = shift + np.random.randn(n_samples, m)

    return X, lbl

In [None]:
# fix the seed of numpy's global random generator so that re-running the
# notebook from a fresh kernel reproduces the same samples (and the same
# results in every later step that draws from np.random)
rng_seed = 0
np.random.seed(rng_seed)

# generate inputs and labels
m = 50 # input dimensionality
s0 = 100 # number of samples for class 0
s1 = 100 # number of samples for class 1
X, y = gen_inputs(m, s0, s1, scaling=0.5) # try 0.2

## Parameterization of classifier

We then build a pipeline for the classifier. The outer cross-validation corresponds to the train-test splitting as before.

The inner cross-validation corresponds to the optimization of the hyperparameter `C` of the classifier (logistic regression in the pipeline).

In [None]:
# number of repetitions of the outer train-test split
n_rep = 10

# outer cross-validation scheme: n_rep stratified random splits,
# each keeping 20% of the samples for testing
cvs = StratifiedShuffleSplit(test_size=0.2, n_splits=n_rep)

# inner cross-validation scheme: 3 stratified folds
cv_nest = StratifiedKFold(n_splits=3)

# candidate values of the regularization hyperparameter C
Cs = [0.01, 0.1, 1.0, 10.0, 100.0]

# classifier pipeline: standardization of the inputs ('scl')
# followed by logistic regression ('mlr')
clf = Pipeline(steps=[
    ('scl', StandardScaler()),
    ('mlr', LogisticRegression()),
])

In [None]:
# list all parameters of the pipeline, including those of its steps;
# the keys are the names to use in the grid search (e.g. 'mlr__C')
print(clf.get_params(deep=True))

# helper that reads the coefficients of the 'mlr' step of a pipeline
def get_coef(clf_pipeline):
    """Return the `coef_` attribute of the step named 'mlr' in `clf_pipeline`."""
    mlr_step = clf_pipeline['mlr']
    return mlr_step.coef_

## Optimization involving the tuning of hyperparameter

We use `GridSearchCV` to optimize the hyperparameter, then use the best classifier pipeline on the test set and perform recursive feature elimination (RFE) to identify informative features that contribute to the correct classification. The latter gives a ranking where low ranks correspond to informative features.

In [None]:
# grid search for hyperparameter C (inner cross-validation)
gscv = GridSearchCV(clf,
                    {'mlr__C': Cs},
                    cv=cv_nest)

# one record (dict) per fitted classifier; the DataFrame is built once
# after the loop instead of being grown row by row with pd.concat
records = []

# repeat classification over the outer train-test splits
for train_ind, test_ind in cvs.split(X, y):

    X_train, X_test = X[train_ind,:], X[test_ind,:]
    y_train, y_test = y[train_ind], y[test_ind]

    # optimize hyperparameter using the training set only
    gscv.fit(X_train, y_train)
    clf_best = gscv.best_estimator_
    best_C = gscv.best_params_['mlr__C']

    # wrap classifier to be fitted to data to calculate the ranking:
    # features are eliminated one at a time until a single one is left,
    # which yields a complete ranking of the m features
    feature_select = RFE(clf_best, n_features_to_select=1, step=1,
                         importance_getter=get_coef)

    # surrogate labels: same training samples, labels shuffled across them
    train_ind_rand = np.random.permutation(train_ind)

    # 'test' uses the real training labels, 'shuf' the shuffled ones;
    # both are evaluated on the untouched test set and reuse the value of C
    # that was optimized with the real labels
    for kind, y_fit in (('test', y_train), ('shuf', y[train_ind_rand])):
        # train and test classifier
        clf_best.fit(X_train, y_fit)
        score = clf_best.score(X_test, y_test)
        # perform RFE
        feature_select.fit(X_train, y_fit)
        ranking = feature_select.ranking_
        # store results
        records.append({'type': kind,
                        'log C': np.log10(best_C),
                        'score': score,
                        'ranking': ranking})

# results table: one row per (split, type), indexed 0..2*n_rep-1
acc = pd.DataFrame(records, columns=['type', 'log C', 'score', 'ranking'])

Plot the results for accuracy and best parameters.

In [None]:
chance_level = 0.5  # expected accuracy for 2 classes of equal size

# accuracy on the test set for real ('test') and shuffled ('shuf') training labels
fig, ax = plt.subplots()
sb.violinplot(data=acc, y='score', x='type',
              palette=['brown','orange'], scale='width', ax=ax)
ax.plot([-1,2], [chance_level]*2, '--k')
ax.set_yticks([0,1])
ax.set_ylabel('accuracy')
ax.set_title('train-test accuracies')
# adjust the layout last so that the title is taken into account
fig.tight_layout()


# best hyperparameter, only for the classifiers trained with real labels
acc2 = acc[acc['type']=='test']

fig, ax = plt.subplots()
sb.violinplot(data=acc2, y='log C', x='type',
              palette=['orange'], scale='width', ax=ax)
ax.set_title('best C (log$_{10}$ scale)')
fig.tight_layout()

plt.show()

Plot the ranking obtained from RFE.

Recall that the inputs are generated such that "the shift across classes linearly depends on the feature index". This means that inputs with large index should have low (informative) ranking.


In [None]:
# average the RFE rankings across CV splits separately for real ('test')
# and shuffled ('shuf') labels, so that the rankings obtained with
# shuffled labels serve as a baseline instead of diluting the real ones
fig, ax = plt.subplots()
for kind, color in (('test', 'brown'), ('shuf', 'orange')):
    rankings = acc.loc[acc['type'] == kind, 'ranking']
    if len(rankings) == 0:
        continue  # nothing stored for this type
    # stack the per-split rankings as rows -> array (n_splits, n_features)
    mean_ranking = np.mean(np.vstack(rankings.to_list()), axis=0)
    ax.plot(mean_ranking, color=color, label=kind)
ax.set_xlabel('input index')
ax.set_ylabel('mean ranking across CV splits')
ax.set_title('ranking of informative features')
ax.legend()
fig.tight_layout()

plt.show()

Modify the scaling for the input generation to see how the ranking changes.