In [9]:
# Standard libraries

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Handeling .wav files

import librosa
from librosa import feature

# Machine Learning

from sklearn.pipeline import Pipeline

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import GradientBoostingClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from sklearn.preprocessing import LabelEncoder, StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, RepeatedStratifiedKFold

from sklearn.feature_selection import SelectFromModel

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from functools import reduce

# data vizualisation

import matplotlib 
import matplotlib.pyplot as plt
import seaborn as sns

In [10]:
def merge_pd(machine):
    
    # Opening the .csv files
    
    df_6dB=pd.read_csv(f'Librosa_features_{machine}_6dB.csv')
    df_0dB=pd.read_csv(f'Librosa_features_{machine}_0dB.csv')
    df_min6dB=pd.read_csv(f'Librosa_features_{machine}_-6dB.csv')
    
    # Dropping the 'Unnamed: 0' column
    
    df_6dB.drop(columns = ['Unnamed: 0'], axis=1, inplace=True)
    df_0dB.drop(columns = ['Unnamed: 0'], axis=1, inplace=True)
    df_min6dB.drop(columns = ['Unnamed: 0'], axis=1, inplace=True)
    
    # Add the number of dB to each column name for each .csv file

    df_6dB.columns = [str(col) + '_6dB' for col in df_6dB.columns]
    df_0dB.columns = [str(col) + '_0dB' for col in df_0dB.columns]
    df_min6dB.columns = [str(col) + '_-6dB' for col in df_min6dB.columns]
    
    # Merging the .csv files into one DataFrame
    
    data_frames = [df_6dB, df_0dB, df_min6dB]
    df_merged = reduce(lambda  left,right: pd.merge(left,right,left_index=True, right_index=True,how='outer'), data_frames)
    
    df_merged.head()
    
    return df_merged

In [11]:
valve = merge_pd('valve')

In [12]:
valve.drop(columns=['normal(0)/abnormal(1)_6dB','normal(0)/abnormal(1)_0dB'],axis=1,inplace=True)
valve.rename(columns={'normal(0)/abnormal(1)_-6dB': 'normal(0)/abnormal(1)'}, inplace=True)

In [13]:
pipeline = Pipeline([('scaler', StandardScaler()), ('clf', SVC())])

In [14]:
# 20% of the overal data will seperated for later validation of the model

X = valve.drop(columns = ['normal(0)/abnormal(1)'], axis=1)
y = valve['normal(0)/abnormal(1)']

X_model, X_valid, y_model, y_valid = train_test_split(X,y,test_size=0.2,random_state = 42, stratify=y)


# 60% of the overal data (75% of X_model, y_model) will be used to create a training set for the model
# 20% of the overal data (25% of X_model, y_model) will be used to create a testing set for the model

X_train, X_test, y_train, y_test = train_test_split(X_model,
                                                   y_model,
                                                   test_size = 0.25,
                                                   random_state = 10, stratify=y_model)


print(X_train.shape, y_train.shape,X_test.shape, y_test.shape,X_valid.shape, y_valid.shape)
print(y_train.value_counts())

(2502, 60) (2502,) (834, 60) (834,) (834, 60) (834,)
0    2215
1     287
Name: normal(0)/abnormal(1), dtype: int64


In [15]:
from IPython.core.display import display, HTML
display(HTML("<style>div.output_scroll { height: 56em; }</style>"))

In [18]:
from sklearn.svm import SVC
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import RepeatedStratifiedKFold
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import RepeatedStratifiedKFold
from imblearn.ensemble import BalancedRandomForestClassifier

y_pred = []
clfs = []

#clfs.append(EasyEnsembleClassifier(feature_importances_))
clfs.append(DecisionTreeClassifier())
clfs.append(RandomForestClassifier())
clfs.append(RandomForestClassifier(class_weight='balanced'))
clfs.append(BalancedRandomForestClassifier(n_estimators = 100, random_state = 0))


#feature_list = [6,13,21]

#for index in feature_list:
for index in range(4,22):
    
    for classifier in clfs:
        
        print("----------------------------------------------")
        print("----------------------------------------------")
        print(classifier)
        print("----------------------------------------------")
        print("----------------------------------------------")
        
        sel = RFE(classifier, n_features_to_select = index)
        sel.fit(X_train, y_train)
        features = X_train.columns[sel.get_support()]
        X_train_rfe = sel.transform(X_train)
        X_test_rfe = sel.transform(X_test)
        print('Selected Feature', index)
        print(features)
    
        classifier.fit(X_train_rfe, y_train)
        y_pred= classifier.predict(X_test_rfe)
        scores = cross_val_score(pipeline, X_train_rfe, y_train, cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1))



        print("----------------------------------------------")
        print("TRAIN-TEST")
        print("----------------------------------------------")


        print('confusion matrix', classifier)
        print(confusion_matrix(y_test, y_pred))
        print('classification report')
        print(classification_report(y_test, y_pred))
        print('accuracy score')
        print(accuracy_score(y_test, y_pred))
        
        X_valid_rfe = sel.transform(X_valid)
        y_pred = classifier.predict(X_valid_rfe)

        print("----------------------------------------------")
        print("TRAIN-VALIDATION")
        print("----------------------------------------------")

        print('confusion matrix', classifier)
        print(confusion_matrix(y_valid, y_pred))
        print('classification report')
        print(classification_report(y_valid, y_pred))
        print('accuracy score')
        print(accuracy_score(y_valid, y_pred))

----------------------------------------------
----------------------------------------------
DecisionTreeClassifier()
----------------------------------------------
----------------------------------------------
Selected Feature 4
Index(['melspectrogram_max_6dB', 'melspectrogram_std_6dB', 'max perc_6dB',
       'max perc_-6dB'],
      dtype='object')
----------------------------------------------
TRAIN-TEST
----------------------------------------------
confusion matrix DecisionTreeClassifier()
[[710  28]
 [ 33  63]]
classification report
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       738
           1       0.69      0.66      0.67        96

    accuracy                           0.93       834
   macro avg       0.82      0.81      0.82       834
weighted avg       0.93      0.93      0.93       834

accuracy score
0.9268585131894485
----------------------------------------------
TRAIN-VALIDATION
-----------------------------

Selected Feature 5
Index(['melspectrogram_max_6dB', 'max perc_6dB', 'min perc_6dB',
       'max perc_0dB', 'max perc_-6dB'],
      dtype='object')
----------------------------------------------
TRAIN-TEST
----------------------------------------------
confusion matrix RandomForestClassifier(class_weight='balanced')
[[729   9]
 [ 38  58]]
classification report
              precision    recall  f1-score   support

           0       0.95      0.99      0.97       738
           1       0.87      0.60      0.71        96

    accuracy                           0.94       834
   macro avg       0.91      0.80      0.84       834
weighted avg       0.94      0.94      0.94       834

accuracy score
0.9436450839328537
----------------------------------------------
TRAIN-VALIDATION
----------------------------------------------
confusion matrix RandomForestClassifier(class_weight='balanced')
[[732   6]
 [ 29  67]]
classification report
              precision    recall  f1-score   support

 

Selected Feature 7
Index(['melspectrogram_max_6dB', 'melspectrogram_std_6dB', 'rms_6dB',
       'max perc_6dB', 'melspectrogram_sum_0dB', 'melspectrogram_corr_0dB',
       'max perc_-6dB'],
      dtype='object')
----------------------------------------------
TRAIN-TEST
----------------------------------------------
confusion matrix DecisionTreeClassifier()
[[712  26]
 [ 30  66]]
classification report
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       738
           1       0.72      0.69      0.70        96

    accuracy                           0.93       834
   macro avg       0.84      0.83      0.83       834
weighted avg       0.93      0.93      0.93       834

accuracy score
0.9328537170263789
----------------------------------------------
TRAIN-VALIDATION
----------------------------------------------
confusion matrix DecisionTreeClassifier()
[[711  27]
 [ 25  71]]
classification report
              precision    recall  f1

Selected Feature 8
Index(['melspectrogram_max_6dB', 'melspectrogram_corr_6dB', 'max perc_6dB',
       'min perc_6dB', 'max perc_0dB', 'min perc_0dB', 'max perc_-6dB',
       'min perc_-6dB'],
      dtype='object')
----------------------------------------------
TRAIN-TEST
----------------------------------------------
confusion matrix RandomForestClassifier(class_weight='balanced')
[[731   7]
 [ 36  60]]
classification report
              precision    recall  f1-score   support

           0       0.95      0.99      0.97       738
           1       0.90      0.62      0.74        96

    accuracy                           0.95       834
   macro avg       0.92      0.81      0.85       834
weighted avg       0.95      0.95      0.94       834

accuracy score
0.9484412470023981
----------------------------------------------
TRAIN-VALIDATION
----------------------------------------------
confusion matrix RandomForestClassifier(class_weight='balanced')
[[735   3]
 [ 34  62]]
classificat

----------------------------------------------
TRAIN-TEST
----------------------------------------------
confusion matrix BalancedRandomForestClassifier(random_state=0)
[[696  42]
 [ 11  85]]
classification report
              precision    recall  f1-score   support

           0       0.98      0.94      0.96       738
           1       0.67      0.89      0.76        96

    accuracy                           0.94       834
   macro avg       0.83      0.91      0.86       834
weighted avg       0.95      0.94      0.94       834

accuracy score
0.9364508393285371
----------------------------------------------
TRAIN-VALIDATION
----------------------------------------------
confusion matrix BalancedRandomForestClassifier(random_state=0)
[[699  39]
 [  7  89]]
classification report
              precision    recall  f1-score   support

           0       0.99      0.95      0.97       738
           1       0.70      0.93      0.79        96

    accuracy                           0.

Selected Feature 11
Index(['melspectrogram_max_6dB', 'melspectrogram_corr_6dB',
       'melspectrogram_std_6dB', 'max perc_6dB', 'min perc_6dB',
       'melspectrogram_corr_0dB', 'max perc_0dB', 'min perc_0dB',
       'melspectrogram_corr_-6dB', 'max perc_-6dB', 'min perc_-6dB'],
      dtype='object')
----------------------------------------------
TRAIN-TEST
----------------------------------------------
confusion matrix RandomForestClassifier()
[[735   3]
 [ 33  63]]
classification report
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       738
           1       0.95      0.66      0.78        96

    accuracy                           0.96       834
   macro avg       0.96      0.83      0.88       834
weighted avg       0.96      0.96      0.95       834

accuracy score
0.9568345323741008
----------------------------------------------
TRAIN-VALIDATION
----------------------------------------------
confusion matrix RandomForestClas

Selected Feature 12
Index(['melspectrogram_max_6dB', 'melspectrogram_corr_6dB',
       'melspectrogram_std_6dB', 'max perc_6dB', 'min perc_6dB',
       'melspectrogram_max_0dB', 'melspectrogram_corr_0dB', 'max perc_0dB',
       'min perc_0dB', 'melspectrogram_corr_-6dB', 'max perc_-6dB',
       'min perc_-6dB'],
      dtype='object')
----------------------------------------------
TRAIN-TEST
----------------------------------------------
confusion matrix RandomForestClassifier(class_weight='balanced')
[[734   4]
 [ 34  62]]
classification report
              precision    recall  f1-score   support

           0       0.96      0.99      0.97       738
           1       0.94      0.65      0.77        96

    accuracy                           0.95       834
   macro avg       0.95      0.82      0.87       834
weighted avg       0.95      0.95      0.95       834

accuracy score
0.9544364508393285
----------------------------------------------
TRAIN-VALIDATION
------------------------

Selected Feature 13
Index(['melspectrogram_6dB', 'melspectrogram_max_6dB',
       'melspectrogram_corr_6dB', 'melspectrogram_std_6dB', 'max perc_6dB',
       'min perc_6dB', 'melspectrogram_max_0dB', 'melspectrogram_corr_0dB',
       'max perc_0dB', 'min perc_0dB', 'melspectrogram_corr_-6dB',
       'max perc_-6dB', 'min perc_-6dB'],
      dtype='object')
----------------------------------------------
TRAIN-TEST
----------------------------------------------
confusion matrix BalancedRandomForestClassifier(random_state=0)
[[695  43]
 [  9  87]]
classification report
              precision    recall  f1-score   support

           0       0.99      0.94      0.96       738
           1       0.67      0.91      0.77        96

    accuracy                           0.94       834
   macro avg       0.83      0.92      0.87       834
weighted avg       0.95      0.94      0.94       834

accuracy score
0.9376498800959233
----------------------------------------------
TRAIN-VALIDATION
---

Selected Feature 15
Index(['melspectrogram_max_6dB', 'melspectrogram_std_6dB',
       'spectral_bandwidth_6dB', 'spectral_contrast_6dB', 'max harm_6dB',
       'max perc_6dB', 'min perc_6dB', 'melspectrogram_0dB',
       'melspectrogram_corr_0dB', 'rms_0dB', 'spectral_flatness _0dB',
       'max perc_0dB', 'rms_-6dB', 'max perc_-6dB', 'min perc_-6dB'],
      dtype='object')
----------------------------------------------
TRAIN-TEST
----------------------------------------------
confusion matrix DecisionTreeClassifier()
[[709  29]
 [ 35  61]]
classification report
              precision    recall  f1-score   support

           0       0.95      0.96      0.96       738
           1       0.68      0.64      0.66        96

    accuracy                           0.92       834
   macro avg       0.82      0.80      0.81       834
weighted avg       0.92      0.92      0.92       834

accuracy score
0.9232613908872902
----------------------------------------------
TRAIN-VALIDATION
------

Selected Feature 16
Index(['melspectrogram_6dB', 'melspectrogram_max_6dB',
       'melspectrogram_sum_6dB', 'melspectrogram_corr_6dB',
       'melspectrogram_std_6dB', 'rms_6dB', 'max perc_6dB', 'min harm_6dB',
       'min perc_6dB', 'melspectrogram_max_0dB', 'melspectrogram_corr_0dB',
       'max perc_0dB', 'min perc_0dB', 'melspectrogram_corr_-6dB',
       'max perc_-6dB', 'min perc_-6dB'],
      dtype='object')
----------------------------------------------
TRAIN-TEST
----------------------------------------------
confusion matrix RandomForestClassifier()
[[734   4]
 [ 38  58]]
classification report
              precision    recall  f1-score   support

           0       0.95      0.99      0.97       738
           1       0.94      0.60      0.73        96

    accuracy                           0.95       834
   macro avg       0.94      0.80      0.85       834
weighted avg       0.95      0.95      0.94       834

accuracy score
0.9496402877697842
-----------------------------

Selected Feature 17
Index(['melspectrogram_6dB', 'melspectrogram_max_6dB',
       'melspectrogram_sum_6dB', 'melspectrogram_corr_6dB',
       'melspectrogram_std_6dB', 'mfcc_6dB', 'rms_6dB', 'max perc_6dB',
       'min harm_6dB', 'min perc_6dB', 'melspectrogram_max_0dB',
       'melspectrogram_corr_0dB', 'max perc_0dB', 'min perc_0dB',
       'melspectrogram_corr_-6dB', 'max perc_-6dB', 'min perc_-6dB'],
      dtype='object')
----------------------------------------------
TRAIN-TEST
----------------------------------------------
confusion matrix RandomForestClassifier(class_weight='balanced')
[[733   5]
 [ 39  57]]
classification report
              precision    recall  f1-score   support

           0       0.95      0.99      0.97       738
           1       0.92      0.59      0.72        96

    accuracy                           0.95       834
   macro avg       0.93      0.79      0.85       834
weighted avg       0.95      0.95      0.94       834

accuracy score
0.94724220623

Selected Feature 18
Index(['melspectrogram_6dB', 'melspectrogram_max_6dB',
       'melspectrogram_sum_6dB', 'melspectrogram_corr_6dB',
       'melspectrogram_std_6dB', 'mfcc_6dB', 'rms_6dB',
       'spectral_bandwidth_6dB', 'max perc_6dB', 'min harm_6dB',
       'min perc_6dB', 'melspectrogram_max_0dB', 'melspectrogram_corr_0dB',
       'max perc_0dB', 'min perc_0dB', 'melspectrogram_corr_-6dB',
       'max perc_-6dB', 'min perc_-6dB'],
      dtype='object')
----------------------------------------------
TRAIN-TEST
----------------------------------------------
confusion matrix BalancedRandomForestClassifier(random_state=0)
[[696  42]
 [ 11  85]]
classification report
              precision    recall  f1-score   support

           0       0.98      0.94      0.96       738
           1       0.67      0.89      0.76        96

    accuracy                           0.94       834
   macro avg       0.83      0.91      0.86       834
weighted avg       0.95      0.94      0.94       8

Selected Feature 20
Index(['melspectrogram_max_6dB', 'melspectrogram_std_6dB', 'mfcc_6dB',
       'spectral_bandwidth_6dB', 'spectral_contrast_6dB', 'max harm_6dB',
       'max perc_6dB', 'min perc_6dB', 'melspectrogram_0dB',
       'melspectrogram_max_0dB', 'melspectrogram_corr_0dB', 'rms_0dB',
       'spectral_flatness _0dB', 'mean harm_0dB', 'max perc_0dB',
       'min perc_0dB', 'rms_-6dB', 'max perc_-6dB', 'min harm_-6dB',
       'min perc_-6dB'],
      dtype='object')
----------------------------------------------
TRAIN-TEST
----------------------------------------------
confusion matrix DecisionTreeClassifier()
[[704  34]
 [ 34  62]]
classification report
              precision    recall  f1-score   support

           0       0.95      0.95      0.95       738
           1       0.65      0.65      0.65        96

    accuracy                           0.92       834
   macro avg       0.80      0.80      0.80       834
weighted avg       0.92      0.92      0.92       834

ac

Selected Feature 21
Index(['melspectrogram_6dB', 'melspectrogram_max_6dB',
       'melspectrogram_sum_6dB', 'melspectrogram_corr_6dB',
       'melspectrogram_std_6dB', 'rms_6dB', 'spectral_bandwidth_6dB',
       'max perc_6dB', 'min harm_6dB', 'min perc_6dB', 'melspectrogram_0dB',
       'melspectrogram_max_0dB', 'melspectrogram_corr_0dB',
       'melspectrogram_std_0dB', 'rms_0dB', 'zero_crossing_rate_0dB',
       'max perc_0dB', 'min perc_0dB', 'melspectrogram_corr_-6dB',
       'max perc_-6dB', 'min perc_-6dB'],
      dtype='object')
----------------------------------------------
TRAIN-TEST
----------------------------------------------
confusion matrix RandomForestClassifier()
[[734   4]
 [ 39  57]]
classification report
              precision    recall  f1-score   support

           0       0.95      0.99      0.97       738
           1       0.93      0.59      0.73        96

    accuracy                           0.95       834
   macro avg       0.94      0.79      0.85    