# Feature selection

This notebook performs the feature importance assessment. To start working with it, download the [IEEE dataset](https://ieee-dataport.org/open-access/experimental-database-detecting-and-diagnosing-rotor-broken-bar-three-phase-induction), unpack it, and place it in a folder named **IEEE** one level above the current folder (the one containing this notebook). The libraries listed in *requirements.txt* must also be installed in the virtual environment, as described in the README file. Next, run the first two stages, Features and Data, to create the training and testing datasets with the required FFT window parameters in the **Data** folder.

The notebook consists of three main parts:

- [Model training](#model-training)
- [Permutation importance](#permutation-importance)
- [SHAP](#shap)

First, the models are trained with the same set of hyperparameters as in the ML pipelines. Next, permutation importance and SHAP values are calculated for the existing features.

Libraries imported

In [None]:
#import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import shap
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import eli5
from eli5.sklearn import PermutationImportance
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.ensemble import GradientBoostingClassifier
from tensorflow.keras import regularizers
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.inspection import permutation_importance

Use DVC to get data with required dropped loading levels, then load the .csv files here.

In [None]:
# Train/test splits created by the Data stage, plus the raw feature table
# that the correlation overview below is computed from.
X_train = pd.read_csv('Data/processed/X_train.csv')
y_train = pd.read_csv('Data/processed/y_train.csv')
X_test = pd.read_csv('Data/processed/X_test.csv')
y_test = pd.read_csv('Data/processed/y_test.csv')
dataset = pd.read_csv('Data/raw/dataset.csv')

### Feature correlation

In [None]:
# Pearson correlation between the feature columns; 'Loading' and 'Label'
# are dropped first so only features are compared.
corr = dataset.drop(columns=['Loading', 'Label']).corr("pearson")

fig, ax = plt.subplots()
sns.heatmap(
    corr,
    mask=np.zeros_like(corr, dtype=bool),  # all-False mask: no cell is hidden
    cmap=sns.color_palette("coolwarm", as_cmap=True),
    square=True,
    ax=ax,
)
ax.set_title('Feature correlation, Pearson')

## Model training

### SVC Model

In [None]:
# Grid over the SVC regularisation strength C and kernel coefficient gamma,
# nine decades each (1e-5 ... 1e3).
param_grid = {
    "SVC__C": [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
    "SVC__gamma": [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
}
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    # probability=True makes SVC fit an internal, randomised cross-validated
    # calibration for predict_proba. random_state pins that randomness so the
    # neg_log_loss scores (and the selected hyperparameters) are repeatable
    # between runs, as is already done for the GBC model via its random_state.
    ('SVC', SVC(probability=True, decision_function_shape='ovr', random_state=0)),
])
# 5-fold grid search, selecting the pipeline with the best (lowest) log-loss.
clf = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=5,
    scoring='neg_log_loss',
    return_train_score=True,
    verbose=1,
    n_jobs=4,
)
# y_train is read from CSV as a one-column frame; ravel flattens it to 1-D labels.
clf.fit(X_train, np.ravel(y_train))

Best parameters

In [None]:
# Keep the best pipeline found by the grid search for the metric,
# permutation-importance and SHAP cells, then show its hyperparameters.
svc_model = clf.best_estimator_
clf.best_params_

Metrics

In [None]:
# Test-set quality of the tuned SVC pipeline: macro-averaged F1 on the hard
# predictions and log-loss on the predicted class probabilities.
svc_f1 = f1_score(
    y_test,
    svc_model.predict(X_test),
    average='macro',
)
svc_log_loss = log_loss(
    y_test,
    svc_model.predict_proba(X_test),
)
print(f"SVC model F1 = {svc_f1}\nSVC model log-loss = {svc_log_loss}")

### GBC Model

In [None]:
# Search space for the gradient boosting classifier. Single-valued entries are
# fixed settings passed through the grid; only n_estimators, max_depth and tol vary.
gbc_param_grid = {
    "GBC__n_estimators": [200, 500],
    "GBC__max_depth": [6, 9],
    "GBC__learning_rate": [0.1],
    "GBC__subsample": [0.3],
    "GBC__validation_fraction": [0.2],
    "GBC__tol": [0.01, 0.1],
    "GBC__n_iter_no_change": [50],
    "GBC__random_state": [0],
}
gbc_pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('GBC', GradientBoostingClassifier()),
])
# Same search protocol as for the SVC: 5-fold CV scored by negative log-loss.
clf_gbc = GridSearchCV(
    estimator=gbc_pipe,
    param_grid=gbc_param_grid,
    cv=5,
    scoring='neg_log_loss',
    return_train_score=True,
    verbose=1,
    n_jobs=4,
)
clf_gbc.fit(X_train, np.ravel(y_train))

In [None]:
# Keep the best GBC pipeline for the later cells and show its hyperparameters.
gbc_model = clf_gbc.best_estimator_
clf_gbc.best_params_

Metrics

In [None]:
# Test-set quality of the tuned GBC pipeline: macro-averaged F1 on the hard
# predictions and log-loss on the predicted class probabilities.
gbc_f1 = f1_score(
    y_test,
    gbc_model.predict(X_test),
    average='macro',
)
gbc_log_loss = log_loss(
    y_test,
    gbc_model.predict_proba(X_test),
)
print(f"GBC model F1 = {gbc_f1}\nGBC model log-loss = {gbc_log_loss}")

### MLP Model

In [None]:
# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation, dropout
# and batch shuffling are repeatable, in line with the fixed random_state
# values used elsewhere in this notebook.
keras.utils.set_random_seed(0)

# Fully connected classifier: input batch-norm, three 128-unit hidden layers
# with L2 weight regularisation, and a 5-way softmax output.
mlp_model = keras.Sequential([
    layers.BatchNormalization(name='Layer_1', input_shape=[np.shape(X_train)[1]]),
    layers.Dense(name='Layer_2', units=128, activation='swish',
                 kernel_regularizer=regularizers.L2(0.01)),
    layers.Dense(name='Layer_3', units=128, activation='selu',
                 kernel_regularizer=regularizers.L2(0.001)),
    layers.BatchNormalization(name='Layer_4'),
    layers.Dropout(0.3, name='Layer_5'),
    layers.Dense(name='Layer_6', units=128, activation='swish',
                 kernel_regularizer=regularizers.L2(0.001)),
    layers.Dense(name='Output_layer', units=5, activation='softmax'),
])
mlp_model.compile(
    optimizer=keras.optimizers.Adam(1e-3),
    loss='SparseCategoricalCrossentropy',
    metrics=['SparseCategoricalAccuracy',
             'SparseCategoricalCrossentropy'],
    jit_compile=True,
)

# Early stopping: ignored for the first 25 epochs, then stops after 10 epochs
# without an improvement of at least 0.001 and restores the best weights.
early_stoppings = keras.callbacks.EarlyStopping(
    patience=10,
    min_delta=0.001,
    restore_best_weights=True,
    start_from_epoch=25,
)

# NOTE(review): the test split is used as validation data, so early stopping
# (and the restored "best" weights) are selected on the same data the metrics
# below are reported on; those metrics are therefore optimistic. Consider
# holding out part of X_train for validation instead.
#
# use_multiprocessing is intentionally not passed: it only applies to
# generator / Sequence inputs (the data here is in memory) and fit() no longer
# accepts it in Keras 3.
history = mlp_model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    batch_size=16,
    epochs=128,
    callbacks=[early_stoppings],
    verbose=1,
)

Metrics

In [None]:
# f1 score
mlp_y_hat=mlp_model.predict(X_test)
dl_y_hat=np.argmax(mlp_y_hat,axis=1)
dl_f1=f1_score(y_test,dl_y_hat,average='macro')
dl_log_loss=log_loss(y_test,mlp_model(X_test))
print(f"MLP model F1 = {dl_f1}\nMLP model log-loss = {dl_log_loss}")

## Permutation importance

### SVC

In [None]:
# eli5 permutation importance of the fitted SVC pipeline, evaluated on the test set.
svc_perm = PermutationImportance(svc_model, random_state=0)
svc_perm.fit(X_test, np.ravel(y_test))
eli5.show_weights(
    svc_perm,
    feature_names=X_test.columns.tolist(),
)

### GBC

In [None]:
# eli5 permutation importance of the fitted GBC pipeline, evaluated on the test set.
gbc_perm = PermutationImportance(gbc_model, random_state=0)
gbc_perm.fit(X_test, np.ravel(y_test))
eli5.show_weights(
    gbc_perm,
    feature_names=X_test.columns.tolist(),
)

### MLP

In [None]:
def scorer(estimator, x, y):
    """Return the macro-averaged F1 score of a Keras classifier on ``(x, y)``.

    The signature follows the ``scorer(estimator, X, y)`` form, so the function
    can be passed as ``scoring=`` to ``permutation_importance``.

    Parameters
    ----------
    estimator : model whose ``predict`` returns one score per class for each
        sample and accepts a ``verbose`` keyword (as the Keras MLP does).
    x : feature matrix to predict on.
    y : true class labels for ``x``.

    Returns
    -------
    float
        Macro-averaged F1 of the arg-max class predictions against ``y``.
    """
    # verbose=0: permutation importance calls this scorer once per feature and
    # per repeat; without it every call prints a Keras progress bar and floods
    # the notebook output.
    class_scores = estimator.predict(x, verbose=0)
    predicted_labels = np.argmax(class_scores, axis=1)
    return f1_score(y, predicted_labels, average='macro')

In [None]:
# scikit-learn permutation importance for the MLP, scored with the macro-F1
# scorer defined above; each feature is shuffled 30 times.
r_multi = permutation_importance(
    mlp_model,
    X_test,
    y_test,
    scoring=scorer,
    n_repeats=30,
    random_state=0,
)

In [None]:
# Summary table with one row per quantity (feature name, mean importance,
# standard deviation) and one column per feature.
mlp_perm = pd.DataFrame(
    [
        X_test.columns,
        r_multi['importances_mean'],
        r_multi['importances_std'],
    ],
    index=['Feature', 'Permutation Importance', 'Std of importance'],
)
mlp_perm

## SHAP

### SVC



In [None]:
# Load the JavaScript needed to render interactive SHAP plots in the notebook.
shap.initjs()

# Kernel SHAP on the SVC class probabilities, using a 5-cluster k-means
# summary of the training data as the background set.
svc_explainer = shap.KernelExplainer(
    svc_model.predict_proba,
    shap.kmeans(X_train, 5),
    seed=0,
)
svc_shap_values = svc_explainer.shap_values(X_test)

# Index 0 is treated as the 0BRB class (see the title of the summary plot below).
# NOTE(review): this assumes shap_values returns one array per class; confirm
# against the installed SHAP version.
shap.force_plot(
    svc_explainer.expected_value[0],
    svc_shap_values[0],
    X_test,
)

In [None]:
# Summary plot of the SHAP values at index 0 for the SVC model.
# show=False defers rendering so the title can be added before plt.show().
shap.summary_plot(svc_shap_values[0], X_test, show=False)
# Title spelling corrected ("Inlfuence" -> "Influence").
plt.title('Influence of features on 0BRB prediction of SVM model')
plt.show()

In [None]:
# Interactive force plot of the index-0 SHAP values over the whole test set
# (same call as the one that closes the explainer cell above).
shap.force_plot(
    svc_explainer.expected_value[0],
    svc_shap_values[0],
    X_test,
)

### GBC

In [None]:
shap.initjs()
gbc_explainer = shap.KernelExplainer(gbc_model.predict_proba,shap.kmeans(X_train,20),seed=0)
gbc_shap_values = gbc_explainer.shap_values(X_test)
shap.force_plot(gbc_explainer.expected_value[0], gbc_shap_values[0], X_test)

In [None]:
# Summary plot of the SHAP values at index 3 for the GBC model.
# show=False defers rendering so the title can be added before plt.show().
shap.summary_plot(gbc_shap_values[3], X_test, show=False)
# Title spelling corrected ("Inlfuence" -> "Influence").
plt.title('Influence of features on 3BRB prediction of GBM model')
plt.show()

### MLP

In [None]:
# Load the JavaScript needed to render interactive SHAP plots in the notebook.
shap.initjs()

# Kernel SHAP on the MLP itself (the model object is passed as the prediction
# callable), using a 20-cluster k-means summary of the training data as the
# background set.
mlp_explainer = shap.KernelExplainer(
    mlp_model,
    shap.kmeans(X_train, 20),
    seed=0,
)
mlp_shap_values = mlp_explainer.shap_values(X_test)

# NOTE(review): indexing with [0] assumes shap_values returns one array per
# class; confirm against the installed SHAP version.
shap.force_plot(
    mlp_explainer.expected_value[0],
    mlp_shap_values[0],
    X_test,
)

In [None]:
# Summary plot of the SHAP values at index 0 for the MLP model.
# show=False defers rendering so the title can be added before plt.show().
shap.summary_plot(mlp_shap_values[0], X_test, show=False)
# Title spelling corrected ("Inlfuence" -> "Influence").
plt.title('Influence of features on 0BRB prediction of MLP model')
plt.show()