In [14]:
# Standard libraries

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Handeling .wav files

import librosa
from librosa import feature

# Machine Learning

from sklearn.pipeline import Pipeline

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import GradientBoostingClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from sklearn.preprocessing import LabelEncoder, StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, RepeatedStratifiedKFold

from sklearn.feature_selection import SelectFromModel

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from functools import reduce

# data vizualisation

import matplotlib 
import matplotlib.pyplot as plt
import seaborn as sns

In [15]:
def merge_pd(machine):
    
    # Opening the .csv files
    
    df_6dB=pd.read_csv(f'Librosa_features_{machine}_6dB.csv')
    df_0dB=pd.read_csv(f'Librosa_features_{machine}_0dB.csv')
    df_min6dB=pd.read_csv(f'Librosa_features_{machine}_-6dB.csv')
    
    # Dropping the 'Unnamed: 0' column
    
    df_6dB.drop(columns = ['Unnamed: 0'], axis=1, inplace=True)
    df_0dB.drop(columns = ['Unnamed: 0'], axis=1, inplace=True)
    df_min6dB.drop(columns = ['Unnamed: 0'], axis=1, inplace=True)
    

    
    # Merging the .csv files into one DataFrame
    
    data_frames = [df_6dB, df_0dB, df_min6dB]
    df_merged = pd.concat(data_frames)
    
    
    return df_merged

In [16]:
pump = merge_pd('pump')

In [17]:
minority_class_len = len(pump[pump['normal(0)/abnormal(1)'] == 1])
majority_class_indices = pump[pump['normal(0)/abnormal(1)'] == 0].index
random_majority_indices = np.random.choice(majority_class_indices, minority_class_len , replace= False)
minority_class_indices  = pump[pump['normal(0)/abnormal(1)'] == 1].index

under_sample_indices = np.concatenate( [minority_class_indices , random_majority_indices])
under_sample = pump.loc[under_sample_indices]
X = under_sample.drop(columns = ['normal(0)/abnormal(1)'])
y = under_sample['normal(0)/abnormal(1)']

# 20% of the overal data will seperated for later validation of the model

X_model, X_valid, y_model, y_valid = train_test_split(X, y, test_size=0.2, random_state = 42, stratify = y)

# 60% of the overal data (75% of X_model, y_model) will be used to create a training set for the model
# 20% of the overal data (25% of X_model, y_model) will be used to create a testing set for the model

X_train, X_test, y_train, y_test = train_test_split(X_model,
                                                   y_model,
                                                   test_size = 0.25,
                                                   random_state = 10, stratify=y_model)

In [18]:
pipeline = Pipeline([('scaler', StandardScaler()), ('clf', SVC())])

In [19]:
from IPython.core.display import display, HTML
display(HTML("<style>div.output_scroll { height: 56em; }</style>"))

In [20]:
from sklearn.svm import SVC
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import RepeatedStratifiedKFold
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.feature_selection import RFE

y_pred = []
clfs = []

clfs.append(DecisionTreeClassifier())
clfs.append(RandomForestClassifier())
clfs.append(RandomForestClassifier(class_weight='balanced'))
#clfs.append(EasyEnsembleClassifier(importance_getter=index))

#feature_list = [6,13,21]

#for index in feature_list:
for index in range(4,22):
    
    for classifier in clfs:
        
        print("----------------------------------------------")
        print("----------------------------------------------")
        print(classifier)
        print("----------------------------------------------")
        print("----------------------------------------------")
        
        sel = RFE(classifier, n_features_to_select = index)
        sel.fit(X_train, y_train)
        features = X_train.columns[sel.get_support()]
        X_train_rfe = sel.transform(X_train)
        X_test_rfe = sel.transform(X_test)
        print('Selected Feature', index)
        print(features)
    
        classifier.fit(X_train_rfe, y_train)
        y_pred= classifier.predict(X_test_rfe)
        scores = cross_val_score(pipeline, X_train_rfe, y_train, cv=5)


        print("----------------------------------------------")
        print("TRAIN-TEST")
        print("----------------------------------------------")


        print('confusion matrix', classifier)
        print(confusion_matrix(y_test, y_pred))
        print('classification report')
        print(classification_report(y_test, y_pred))
        print('accuracy score')
        print(accuracy_score(y_test, y_pred))
        
        X_valid_rfe = sel.transform(X_valid)
        y_pred = classifier.predict(X_valid_rfe)

        print("----------------------------------------------")
        print("TRAIN-VALIDATION")
        print("----------------------------------------------")

        print('confusion matrix', classifier)
        print(confusion_matrix(y_valid, y_pred))
        print('classification report')
        print(classification_report(y_valid, y_pred))
        print('accuracy score')
        print(accuracy_score(y_valid, y_pred))

----------------------------------------------
----------------------------------------------
DecisionTreeClassifier()
----------------------------------------------
----------------------------------------------
Selected Feature 4
Index(['rms', 'spectral_bandwidth', 'zero_crossing_rate', 'max perc'], dtype='object')
----------------------------------------------
TRAIN-TEST
----------------------------------------------
confusion matrix DecisionTreeClassifier()
[[725  96]
 [ 33 788]]
classification report
              precision    recall  f1-score   support

           0       0.96      0.88      0.92       821
           1       0.89      0.96      0.92       821

    accuracy                           0.92      1642
   macro avg       0.92      0.92      0.92      1642
weighted avg       0.92      0.92      0.92      1642

accuracy score
0.9214372716199757
----------------------------------------------
TRAIN-VALIDATION
----------------------------------------------
confusion matrix 

Selected Feature 6
Index(['melspectrogram', 'melspectrogram_corr', 'spectral_bandwidth',
       'spectral_contrast', 'zero_crossing_rate', 'max perc'],
      dtype='object')
----------------------------------------------
TRAIN-TEST
----------------------------------------------
confusion matrix DecisionTreeClassifier()
[[707 114]
 [ 33 788]]
classification report
              precision    recall  f1-score   support

           0       0.96      0.86      0.91       821
           1       0.87      0.96      0.91       821

    accuracy                           0.91      1642
   macro avg       0.91      0.91      0.91      1642
weighted avg       0.91      0.91      0.91      1642

accuracy score
0.9104750304506699
----------------------------------------------
TRAIN-VALIDATION
----------------------------------------------
confusion matrix DecisionTreeClassifier()
[[697 124]
 [ 30 791]]
classification report
              precision    recall  f1-score   support

           0       0

Selected Feature 8
Index(['melspectrogram', 'melspectrogram_corr', 'spectral_centroid',
       'spectral_bandwidth', 'spectral_contrast', 'spectral_rolloff',
       'zero_crossing_rate', 'max perc'],
      dtype='object')
----------------------------------------------
TRAIN-TEST
----------------------------------------------
confusion matrix DecisionTreeClassifier()
[[720 101]
 [ 38 783]]
classification report
              precision    recall  f1-score   support

           0       0.95      0.88      0.91       821
           1       0.89      0.95      0.92       821

    accuracy                           0.92      1642
   macro avg       0.92      0.92      0.92      1642
weighted avg       0.92      0.92      0.92      1642

accuracy score
0.915347137637028
----------------------------------------------
TRAIN-VALIDATION
----------------------------------------------
confusion matrix DecisionTreeClassifier()
[[708 113]
 [ 34 787]]
classification report
              precision    r

Selected Feature 10
Index(['melspectrogram_max', 'melspectrogram_sum', 'melspectrogram_corr',
       'rms', 'spectral_centroid', 'spectral_bandwidth', 'spectral_contrast',
       'spectral_rolloff', 'zero_crossing_rate', 'max perc'],
      dtype='object')
----------------------------------------------
TRAIN-TEST
----------------------------------------------
confusion matrix DecisionTreeClassifier()
[[723  98]
 [ 36 785]]
classification report
              precision    recall  f1-score   support

           0       0.95      0.88      0.92       821
           1       0.89      0.96      0.92       821

    accuracy                           0.92      1642
   macro avg       0.92      0.92      0.92      1642
weighted avg       0.92      0.92      0.92      1642

accuracy score
0.9183922046285018
----------------------------------------------
TRAIN-VALIDATION
----------------------------------------------
confusion matrix DecisionTreeClassifier()
[[718 103]
 [ 33 788]]
classification 

----------------------------------------------
TRAIN-TEST
----------------------------------------------
confusion matrix RandomForestClassifier(class_weight='balanced')
[[769  52]
 [ 30 791]]
classification report
              precision    recall  f1-score   support

           0       0.96      0.94      0.95       821
           1       0.94      0.96      0.95       821

    accuracy                           0.95      1642
   macro avg       0.95      0.95      0.95      1642
weighted avg       0.95      0.95      0.95      1642

accuracy score
0.9500609013398295
----------------------------------------------
TRAIN-VALIDATION
----------------------------------------------
confusion matrix RandomForestClassifier(class_weight='balanced')
[[771  50]
 [ 30 791]]
classification report
              precision    recall  f1-score   support

           0       0.96      0.94      0.95       821
           1       0.94      0.96      0.95       821

    accuracy                           

----------------------------------------------
TRAIN-TEST
----------------------------------------------
confusion matrix RandomForestClassifier()
[[774  47]
 [ 27 794]]
classification report
              precision    recall  f1-score   support

           0       0.97      0.94      0.95       821
           1       0.94      0.97      0.96       821

    accuracy                           0.95      1642
   macro avg       0.96      0.95      0.95      1642
weighted avg       0.96      0.95      0.95      1642

accuracy score
0.9549330085261876
----------------------------------------------
TRAIN-VALIDATION
----------------------------------------------
confusion matrix RandomForestClassifier()
[[770  51]
 [ 30 791]]
classification report
              precision    recall  f1-score   support

           0       0.96      0.94      0.95       821
           1       0.94      0.96      0.95       821

    accuracy                           0.95      1642
   macro avg       0.95      0.

----------------------------------------------
TRAIN-TEST
----------------------------------------------
confusion matrix DecisionTreeClassifier()
[[713 108]
 [ 39 782]]
classification report
              precision    recall  f1-score   support

           0       0.95      0.87      0.91       821
           1       0.88      0.95      0.91       821

    accuracy                           0.91      1642
   macro avg       0.91      0.91      0.91      1642
weighted avg       0.91      0.91      0.91      1642

accuracy score
0.9104750304506699
----------------------------------------------
TRAIN-VALIDATION
----------------------------------------------
confusion matrix DecisionTreeClassifier()
[[716 105]
 [ 27 794]]
classification report
              precision    recall  f1-score   support

           0       0.96      0.87      0.92       821
           1       0.88      0.97      0.92       821

    accuracy                           0.92      1642
   macro avg       0.92      0.

----------------------------------------------
TRAIN-TEST
----------------------------------------------
confusion matrix RandomForestClassifier(class_weight='balanced')
[[776  45]
 [ 32 789]]
classification report
              precision    recall  f1-score   support

           0       0.96      0.95      0.95       821
           1       0.95      0.96      0.95       821

    accuracy                           0.95      1642
   macro avg       0.95      0.95      0.95      1642
weighted avg       0.95      0.95      0.95      1642

accuracy score
0.9531059683313033
----------------------------------------------
TRAIN-VALIDATION
----------------------------------------------
confusion matrix RandomForestClassifier(class_weight='balanced')
[[767  54]
 [ 31 790]]
classification report
              precision    recall  f1-score   support

           0       0.96      0.93      0.95       821
           1       0.94      0.96      0.95       821

    accuracy                           

Selected Feature 18
Index(['melspectrogram', 'melspectrogram_max', 'melspectrogram_sum',
       'melspectrogram_corr', 'melspectrogram_std', 'mfcc', 'rms',
       'spectral_centroid', 'spectral_bandwidth', 'spectral_contrast',
       'spectral_flatness ', 'spectral_rolloff', 'zero_crossing_rate',
       'mean perc', 'max harm', 'max perc', 'min harm', 'min perc'],
      dtype='object')
----------------------------------------------
TRAIN-TEST
----------------------------------------------
confusion matrix RandomForestClassifier()
[[785  36]
 [ 33 788]]
classification report
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       821
           1       0.96      0.96      0.96       821

    accuracy                           0.96      1642
   macro avg       0.96      0.96      0.96      1642
weighted avg       0.96      0.96      0.96      1642

accuracy score
0.9579780755176613
----------------------------------------------
TRAIN-VALID

----------------------------------------------
TRAIN-TEST
----------------------------------------------
confusion matrix DecisionTreeClassifier()
[[714 107]
 [ 35 786]]
classification report
              precision    recall  f1-score   support

           0       0.95      0.87      0.91       821
           1       0.88      0.96      0.92       821

    accuracy                           0.91      1642
   macro avg       0.92      0.91      0.91      1642
weighted avg       0.92      0.91      0.91      1642

accuracy score
0.9135200974421437
----------------------------------------------
TRAIN-VALIDATION
----------------------------------------------
confusion matrix DecisionTreeClassifier()
[[717 104]
 [ 34 787]]
classification report
              precision    recall  f1-score   support

           0       0.95      0.87      0.91       821
           1       0.88      0.96      0.92       821

    accuracy                           0.92      1642
   macro avg       0.92      0.

Selected Feature 21
Index(['melspectrogram', 'melspectrogram_min', 'melspectrogram_max',
       'melspectrogram_sum', 'melspectrogram_corr', 'melspectrogram_std',
       'mfcc', 'rms', 'spectral_centroid', 'spectral_bandwidth',
       'spectral_contrast', 'spectral_flatness ', 'spectral_rolloff',
       'zero_crossing_rate', 'mean harm', 'mean perc', 'max harm', 'max perc',
       'min harm', 'min perc'],
      dtype='object')
----------------------------------------------
TRAIN-TEST
----------------------------------------------
confusion matrix RandomForestClassifier(class_weight='balanced')
[[779  42]
 [ 33 788]]
classification report
              precision    recall  f1-score   support

           0       0.96      0.95      0.95       821
           1       0.95      0.96      0.95       821

    accuracy                           0.95      1642
   macro avg       0.95      0.95      0.95      1642
weighted avg       0.95      0.95      0.95      1642

accuracy score
0.9543239951