# Step Backward Feature Selection by `Mr. Harshit Dawar!`

* This algorithm falls under the category of Wrapper Methods, that guarantees to select the best subset of features for a particular Machine Learning algorithm!

* This algorithm starts by training the model using all the available features in the dataset, then keeps on eliminating one feature & selecting the best subset of the features that provides the best performance. Likewise, this algorithm keeps on decreasing the features, unless & until a stopping condition is met.

* Stopping Condition can be a predefined number of features or the model performance threshold.

#### Now, that being said, let's proceed towards the practical.

In [1]:
# Importing the required libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import roc_auc_score, r2_score
from sklearn.datasets import load_boston
from mlxtend.feature_selection import SequentialFeatureSelector

### Classification Use-Case

In [3]:
# Loading the Dataset!

data = pd.read_csv("../ds/Titanic.csv")
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,-0.590495,1,0,-0.50024,S
1,1,1,female,0.643971,1,0,0.788947,C
2,1,3,female,-0.281878,0,0,-0.48665,S
3,1,1,female,0.412509,1,0,0.422861,S
4,0,3,male,0.412509,0,0,-0.484133,S


In [4]:
# Dividing the Dataset into Target & Features!
X = data.drop("Survived", axis = 1)
y = data.Survived

In [5]:
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,-0.590495,1,0,-0.50024,S
1,1,female,0.643971,1,0,0.788947,C
2,3,female,-0.281878,0,0,-0.48665,S
3,1,female,0.412509,1,0,0.422861,S
4,3,male,0.412509,0,0,-0.484133,S


In [6]:
y

0      0
1      1
2      1
3      1
4      0
      ..
884    0
885    1
886    0
887    1
888    0
Name: Survived, Length: 889, dtype: int64

In [8]:
# Creating the Feature Selector!

Feature_Selector = SequentialFeatureSelector(
                        RandomForestClassifier(n_estimators = 15, n_jobs = 2),
                        scoring = "roc_auc",
                        cv = 3,
                        floating = False,
                        forward = False,
                        k_features = 3,
                        verbose = 2
                        )

Feature_Selector.fit(X, y)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    3.3s finished

[2021-03-14 12:15:51] Features: 6/3 -- score: 0.8456803791964557[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    2.8s finished

[2021-03-14 12:15:54] Features: 5/3 -- score: 0.8414808480168455[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    2.4s finished

[2021-03-14 12:15:56] Features: 4/3 -- score: 0.8498912222445631[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 

SequentialFeatureSelector(cv=3,
                          estimator=RandomForestClassifier(n_estimators=15,
                                                           n_jobs=2),
                          forward=False, k_features=3, scoring='roc_auc',
                          verbose=2)

In [9]:
# Printing the Names of the Selected Features

Feature_Selector.k_feature_names_

('Pclass', 'Sex', 'Age')

In [10]:
# Printing the Final Metrics Score!
Feature_Selector.k_score_

0.826462080213025

In [11]:
# Transforming the Dataset to selected Features
X = Feature_Selector.transform(X)

In [12]:
X

array([[ 3.        ,  1.        , -0.59049493],
       [ 1.        ,  0.        ,  0.64397101],
       [ 3.        ,  0.        , -0.28187844],
       ...,
       [ 3.        ,  0.        ,  0.00352373],
       [ 1.        ,  1.        , -0.28187844],
       [ 3.        ,  1.        ,  0.18104628]])

### Regression Use-Case

In [13]:
# Loading the Boston Housing Dataset!

reg_X, reg_Y = load_boston(return_X_y = True)

In [14]:
reg_X.shape, reg_Y.shape

((506, 13), (506,))

In [15]:
# Creating the Feature Selector!

Feature_Selector = SequentialFeatureSelector(
                        RandomForestRegressor(n_estimators = 15, n_jobs = 2),
                        scoring = "r2",
                        cv = 3,
                        floating = False,
                        forward = False,
                        k_features = 7,
                        verbose = 2,
                        n_jobs = 2
                        )

Feature_Selector.fit(reg_X, reg_Y)

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  13 out of  13 | elapsed:    4.8s finished

[2021-03-14 12:18:48] Features: 12/7 -- score: 0.5778182783952241[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  12 out of  12 | elapsed:    4.1s finished

[2021-03-14 12:18:52] Features: 11/7 -- score: 0.5809124541795844[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  11 out of  11 | elapsed:    4.0s finished

[2021-03-14 12:18:56] Features: 10/7 -- score: 0.5854088801066885[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed:    3.3s finished

[2021-03-14 12:18:59] Features: 9/7 -- score: 0.5913056261166206[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   9 out of   9 | elapsed:    3.4s finished

[2021-03-14

SequentialFeatureSelector(cv=3,
                          estimator=RandomForestRegressor(n_estimators=15,
                                                          n_jobs=2),
                          forward=False, k_features=7, n_jobs=2, scoring='r2',
                          verbose=2)

In [16]:
# Printing the Names of the Selected Features

Feature_Selector.k_feature_names_

('0', '2', '7', '9', '10', '11', '12')

In [17]:
# Transforming the Dataset!

reg_X = Feature_Selector.transform(reg_X)

In [18]:
reg_X

array([[6.3200e-03, 2.3100e+00, 4.0900e+00, ..., 1.5300e+01, 3.9690e+02,
        4.9800e+00],
       [2.7310e-02, 7.0700e+00, 4.9671e+00, ..., 1.7800e+01, 3.9690e+02,
        9.1400e+00],
       [2.7290e-02, 7.0700e+00, 4.9671e+00, ..., 1.7800e+01, 3.9283e+02,
        4.0300e+00],
       ...,
       [6.0760e-02, 1.1930e+01, 2.1675e+00, ..., 2.1000e+01, 3.9690e+02,
        5.6400e+00],
       [1.0959e-01, 1.1930e+01, 2.3889e+00, ..., 2.1000e+01, 3.9345e+02,
        6.4800e+00],
       [4.7410e-02, 1.1930e+01, 2.5050e+00, ..., 2.1000e+01, 3.9690e+02,
        7.8800e+00]])

# Congratulations, you have learned a new way to select the features!