# Sequential Feature Selection

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_validate
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import make_scorer
import time

In [2]:
# Load the pump feature table from disk.
T = pd.read_csv("./data/pumpFeatures.csv")

# Predictors are every column except the last one; the label is faultCode.
X = T.iloc[:, :-1]
y = T["faultCode"]

T.head()

Unnamed: 0,wMotor_mean,wMotor_std,wMotor_fftPeakIdx,wMotor_skewness,wMotor_kurtosis,wMotor_peak2peak,wMotor_peak2rms,wMotor_rms,wMotor_mad,wMotor_csRange,...,pOut_peak2peak,pOut_peak2rms,pOut_rms,pOut_mad,pOut_csRange,pOut_pLow,pOut_pMid,pOut_pHigh,pOut_pKur,faultCode
0,876.235677,6.204772,762,-0.164601,2.881123,39.070726,1.018288,876.257629,5.032113,1225859.0,...,0.1255,1.007635,7.226002,0.019002,10109.108199,9.1e-05,0.007472,0.001348,0.076296,0
1,876.238247,6.077463,762,-0.147077,2.673731,34.570767,1.016957,876.259308,4.982643,1225862.0,...,0.120924,1.007091,7.225622,0.01899,10108.578309,0.000118,0.006954,0.001434,0.076296,0
2,876.031159,6.250861,762,-0.123501,2.551445,35.49146,1.017266,876.053444,5.165914,1225575.0,...,0.131161,1.007494,7.225906,0.019032,10108.963684,6e-05,0.00657,0.001309,0.076296,0
3,876.196665,6.247495,762,-0.134596,2.728913,41.529486,1.018873,876.218921,5.093603,1225799.0,...,0.123254,1.00777,7.225914,0.018717,10108.973834,6.6e-05,0.00719,0.001364,0.076296,0
4,876.199734,6.095278,762,-0.139257,2.705275,36.096733,1.018066,876.220919,4.962708,1225812.0,...,0.123759,1.006735,7.22526,0.019078,10108.065292,5.1e-05,0.007814,0.0016,0.071808,0


This code fits a 5-fold cross-validated tree model to the original data and calculates the accuracy.

In [3]:
# Baseline: 5-fold cross-validated decision tree on the full feature set.
# Fixed seeds make the shuffled fold assignment and the tree's tie-breaking
# repeatable, so the printed accuracies survive Restart & Run All.
classifier = DecisionTreeClassifier(random_state=0)
kf = KFold(n_splits=5, shuffle=True, random_state=0)

# perf_counter is a monotonic, high-resolution clock meant for short intervals.
start_time = time.perf_counter()
cv_results = cross_validate(classifier, X, y, cv=kf)
elapsed_time = time.perf_counter() - start_time

print("Accuracies:", cv_results['test_score'])
print("Mean accuracy:", cv_results['test_score'].mean())
print(f"Cross validation elapsed time: {elapsed_time:.4f} seconds")

Accuracies: [0.86  0.88  0.845 0.89  0.89 ]
Mean accuracy: 0.873
Cross validation elapsed time: 0.2084 seconds


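Create a function that calculates the number of misclassifications given y_true and y_pred. Because a lower count is better, wrap it with `make_scorer(..., greater_is_better=False)` so that scikit-learn negates the value and can still maximize the score.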

In [4]:
def score_func(y_true, y_pred):
    """Count the samples whose predicted label differs from the true label.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, with the same shape as ``y_true``.

    Returns
    -------
    int
        Number of positions at which the two inputs disagree.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    # Compare by position on plain arrays so that lists, tuples, ndarrays and
    # Series (whatever their index) are all handled the same way.
    true_labels = np.asarray(y_true)
    predicted_labels = np.asarray(y_pred)
    if true_labels.shape != predicted_labels.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got "
            f"{true_labels.shape} and {predicted_labels.shape}"
        )
    return int(np.count_nonzero(true_labels != predicted_labels))

# Wrap the misclassification count as a scorer object; greater_is_better=False
# marks it as a loss, i.e. smaller values are better.
scorer = make_scorer(
    score_func,
    greater_is_better=False,
)

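Perform Sequential Feature Selection. The selector uses its default forward direction: it starts with no features and greedily adds the one that gives the best cross-validated score at each step. With `n_features_to_select='auto'` and no `tol`, it stops once half of the features have been selected.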

In [5]:
# Sequential feature selection scored by the misclassification-count scorer.
# The selector re-runs cross-validation for every candidate feature; a fixed
# KFold seed makes each of those runs use the same folds, so candidates are
# compared on equal terms and the selected subset is repeatable. The tree seed
# makes its tie-breaking between equally good splits repeatable as well.
classifier = DecisionTreeClassifier(random_state=0)
kf = KFold(n_splits=5, shuffle=True, random_state=0)

sfs = SequentialFeatureSelector(
    classifier,
    n_features_to_select='auto',
    cv=kf,
    scoring=scorer,
)
sfs.fit(X, y)

In [6]:
# Keep only the columns the fitted selector retained, picked by position.
X_selected = X.iloc[:, sfs.get_support(indices=True)]
X_selected.head()

Unnamed: 0,wMotor_mean,wMotor_std,wMotor_fftPeakIdx,wMotor_peak2peak,wMotor_rms,wMotor_csRange,iMotor_fftPeakIdx,iMotor_peak2rms,iMotor_rms,iMotor_csRange,...,qOut_skewness,qOut_kurtosis,qOut_rms,qOut_mad,qOut_csRange,qOut_pLow,qOut_pHigh,qOut_pKur,pOut_mean,pOut_std
0,876.235677,6.204772,762,39.070726,876.257629,1225859.0,762,1.014779,38.23304,53487.133438,...,-0.729159,2.766482,35.637498,2.237138,49707.083608,1.30695,17.30246,2.670354,7.225965,0.023251
1,876.238247,6.077463,762,34.570767,876.259308,1225862.0,762,1.015027,38.226531,53478.048014,...,-0.686943,2.6888,35.665423,2.250044,49745.621125,0.637924,15.85815,2.733186,7.225584,0.023339
2,876.031159,6.250861,762,35.49146,876.053444,1225575.0,762,1.016187,38.231761,53485.122233,...,-0.692149,2.722829,35.664385,2.219092,49746.636093,0.629772,16.690655,0.06732,7.225869,0.023258
3,876.196665,6.247495,762,41.529486,876.218921,1225799.0,762,1.015062,38.231827,53485.537791,...,-0.702165,2.731413,35.60214,2.258929,49655.78185,0.561285,18.747575,0.363527,7.225877,0.023018
4,876.199734,6.095278,762,36.096733,876.220919,1225812.0,762,1.015704,38.229879,53482.758023,...,-0.751425,2.855754,35.651921,2.220493,49726.524042,0.696649,19.024521,2.917193,7.225222,0.023405


Fit and evaluate a 5-fold cross-validated tree model with the selected variables.

In [7]:
# 5-fold cross-validated decision tree on the selected features only.
# Fixed seeds make the shuffled fold assignment and the tree's tie-breaking
# repeatable, so the printed accuracies survive Restart & Run All.
classifier = DecisionTreeClassifier(random_state=0)
kf = KFold(n_splits=5, shuffle=True, random_state=0)

# perf_counter is a monotonic, high-resolution clock meant for short intervals.
start_time = time.perf_counter()
cv_results = cross_validate(classifier, X_selected, y, cv=kf)
elapsed_time = time.perf_counter() - start_time

print("Accuracies:", cv_results['test_score'])
print("Mean accuracy:", cv_results['test_score'].mean())
print(f"Cross validation elapsed time: {elapsed_time:.4f} seconds")

Accuracies: [0.86  0.87  0.845 0.91  0.87 ]
Mean accuracy: 0.8710000000000001
Cross validation elapsed time: 0.1164 seconds
