In [39]:
import pandas as pd

In [2]:
df_train = pd.read_csv("../data/features/all/combined_train.csv")
df_test = pd.read_csv("../data/features/all/combined_test.csv")


df_train = df_train.drop("dataFile", axis=1)
df_test = df_test.drop("dataFile", axis=1)
df_train.shape

(1140, 1933)

Feature Selection Pipeline:

-   Sort out any features with NaN values
-   Sort out features by VarianceThreshold
-   Sort out features with low correlation to the target variable


In [3]:
from sklearn.preprocessing import LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin


class DropNaNFeatures(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        if isinstance(X, pd.DataFrame):
            self.nan_features_ = X.columns[X.isnull().any()].tolist()
        else:
            X_df = pd.DataFrame(X)
            self.nan_features_ = X_df.columns[X_df.isnull().any()].tolist()
        return self

    def transform(self, X):
        if not hasattr(self, "nan_features_"):
            raise ValueError("The transformer has not been fitted yet.")

        if isinstance(X, pd.DataFrame):
            # Drop features saved during fit
            return X.drop(columns=self.nan_features_, errors="ignore")
        else:
            # For numpy array, convert to DataFrame, drop columns, and return array
            X_df = pd.DataFrame(X)
            return X_df.drop(columns=self.nan_features_, errors="ignore").values


class CorrelationFilter(BaseEstimator, TransformerMixin):

    def __init__(self, quantile_range=(0.25, 0.75)):
        self.quantile_range = quantile_range

    def fit(self, X, y):
        if isinstance(X, pd.DataFrame):
            # Compute correlations with the target variable
            if not pd.api.types.is_numeric_dtype(y):
                self.label_encoder_ = LabelEncoder()
                y = self.label_encoder_.fit_transform(y)

            correlations = pd.concat([X, pd.Series(y, name="target")], axis=1).corr()[
                "target"
            ][:-1]
            sorted_correlations = correlations.sort_values(ascending=False)
            lower_bound = sorted_correlations.quantile(self.quantile_range[0])
            upper_bound = sorted_correlations.quantile(self.quantile_range[1])

            # Retain features outside the specified quantile range
            self.selected_features_ = sorted_correlations[
                (sorted_correlations < lower_bound)
                | (sorted_correlations > upper_bound)
            ].index.tolist()
        else:

            raise ValueError("Input data must be a pandas DataFrame.")
        return self

    def transform(self, X):

        if isinstance(X, pd.DataFrame):
            return X[self.selected_features_]
        else:
            raise ValueError("Input data must be a pandas DataFrame.")

In [53]:
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

selection_pipe = Pipeline(
    [
        ("nan_filter", DropNaNFeatures()),
        ("variance_threshold", VarianceThreshold().set_output(transform="pandas")),
        ("scaler", StandardScaler().set_output(transform="pandas")),
        ("correlation_filter", CorrelationFilter()),
        ("k_best", SelectKBest().set_output(transform="pandas")),
        ("SVC", SVC()),
    ]
)

X = df_train.drop("material", axis=1)
y = df_train["material"]

selection_pipe

In [54]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import f_classif, mutual_info_classif, chi2

param_grid = {
    "k_best__k": [10, 20, 30, 40, 50, 75],
    "variance_threshold__threshold": [0, 0.01, 0.02, 0.05],
    "correlation_filter__quantile_range": [
        (0.25, 0.65),
        (0.30, 0.70),
        (0.25, 0.75),
        (0.24, 0.76),
        (0.22, 0.78),
        (0.2, 0.8),
        ((0.15, 0.85)),
    ],
    "k_best__score_func": [mutual_info_classif, f_classif, chi2],
}
gsCV = GridSearchCV(selection_pipe, param_grid=param_grid, cv=StratifiedShuffleSplit(5))

In [55]:
gsCV.fit(X, y)

840 fits failed out of a total of 2520.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
840 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\mike2\.conda\envs\drill\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\mike2\.conda\envs\drill\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\mike2\.conda\envs\drill\Lib\site-packages\sklearn\pipeline.py", line 469, in fit
    Xt = self._fit(X, y, routed_params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\mike2\.con

In [60]:
best_pipeline = gsCV.best_estimator_
best_pipeline

Best Parameter set:

In [82]:
gsCV.best_params_

{'correlation_filter__quantile_range': (0.3, 0.7),
 'k_best__k': 40,
 'k_best__score_func': <function sklearn.feature_selection._mutual_info.mutual_info_classif(X, y, *, discrete_features='auto', n_neighbors=3, copy=True, random_state=None, n_jobs=None)>,
 'variance_threshold__threshold': 0}

Saving the test and train dataset with only the selected features


In [61]:
sel_feature_names = best_pipeline[-1].feature_names_in_
sel_feature_names

array(['Current__fft_coefficient__attr_"abs"__coeff_2',
       'Voltage__count_below_mean', '# Audio__cid_ce__normalize_False',
       'Current__count_above_mean', '# Audio__number_crossing_m__m_1',
       '# Audio__absolute_sum_of_changes', 'Current__quantile__q_0.4',
       'Current__index_mass_quantile__q_0.7',
       '# Audio__binned_entropy__max_bins_10', '# Audio__quantile__q_0.7',
       'Current__fft_coefficient__attr_"abs"__coeff_0',
       'Current__fft_coefficient__attr_"real"__coeff_0',
       'Current__sum_values', 'Current__sum_of_reoccurring_data_points',
       '# Audio__number_peaks__n_5',
       '# Audio__range_count__max_1__min_-1', '# Audio__mean_abs_change',
       '# Audio__abs_energy',
       'Current__fft_coefficient__attr_"real"__coeff_1',
       '# Audio__quantile__q_0.8', '# Audio__ratio_beyond_r_sigma__r_1',
       'Current__abs_energy', '# Audio__variation_coefficient',
       'Current__ratio_beyond_r_sigma__r_1', 'Current__median',
       'Voltage__ratio_b

In [93]:
df_train_selected = pd.concat([df_train[sel_feature_names], df_train["material"]],axis=1)
df_test_selected = pd.concat([df_test[sel_feature_names], df_test["material"]],axis=1)

df_train_selected.to_csv("../data/features/selection/selected_features_train.csv", index=None)
df_test_selected.to_csv("../data/features/selection/selected_features_test.csv", index=None)