## Import packages

In [213]:
# Standard libraries

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Handeling .wav files

import librosa
from librosa import feature

# Machine Learning

from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, RepeatedStratifiedKFold
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.feature_selection import RFE

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from functools import reduce

# data vizualisation

import matplotlib 
import matplotlib.pyplot as plt
import seaborn as sns

#saving & loading ML method
import pickle

### Increase size of scrollable output

In [193]:
from IPython.core.display import display, HTML
display(HTML("<style>div.output_scroll { height: 56em; }</style>"))

## Function that concatenates all dB levels for a certain machine

In [194]:
def concatenate_pd(machine):
    
    # Opening the .csv files
    
    df_6dB=pd.read_csv(f'Librosa_features_{machine}_6dB.csv')
    df_0dB=pd.read_csv(f'Librosa_features_{machine}_0dB.csv')
    df_min6dB=pd.read_csv(f'Librosa_features_{machine}_-6dB.csv')
    
    # Dropping the 'Unnamed: 0' column
    
    df_6dB.drop(columns = ['Unnamed: 0'], axis=1, inplace=True)
    df_0dB.drop(columns = ['Unnamed: 0'], axis=1, inplace=True)
    df_min6dB.drop(columns = ['Unnamed: 0'], axis=1, inplace=True)
    

    
    # Merging the .csv files into one DataFrame
    
    data_frames = [df_6dB, df_0dB, df_min6dB]
    df_merged = pd.concat(data_frames)
    
    
    return df_merged

## Undersampling

In [195]:
def undersampling(machine):  
    minority_class_len = len(machine[machine['normal(0)/abnormal(1)'] == 1])
    majority_class_indices = machine[machine['normal(0)/abnormal(1)'] == 0].index
    random_majority_indices = np.random.choice(majority_class_indices, minority_class_len , replace= False)
    minority_class_indices  = machine[machine['normal(0)/abnormal(1)'] == 1].index

    under_sample_indices = np.concatenate( [minority_class_indices , random_majority_indices])
    under_sample = machine.loc[under_sample_indices]
    return under_sample

## Splitting data in train, test and validation set

In [196]:
def split_data(samples):  
    X = under_sample.drop(columns = ['normal(0)/abnormal(1)'])
    y = under_sample['normal(0)/abnormal(1)']

    # 20% of the overal data will seperated for later validation of the model

    X_model, X_valid, y_model, y_valid = train_test_split(X, y, test_size=0.3, random_state = 42, stratify = y)

    # 60% of the overal data (75% of X_model, y_model) will be used to create a training set for the model
    # 20% of the overal data (25% of X_model, y_model) will be used to create a testing set for the model

    X_train, X_test, y_train, y_test = train_test_split(X_model,
                                                       y_model,
                                                       test_size = 1/3,
                                                       random_state = 10, stratify=y_model)
    return X_train, X_test, X_valid, y_train, y_test, y_valid

## Create pipeline

In [197]:
pipeline = Pipeline([('scaler', StandardScaler()), ('clf', SVC())])

## ML feature & model selection

In [198]:
def model_selection(X_train, X_test, X_valid, y_train, y_test, y_valid): 

    y_pred = []
    clfs = []
    # append classifiers to pipeline
    clfs.append(DecisionTreeClassifier())
    clfs.append(RandomForestClassifier())

    for index in range(1,len(X_train.columns)):

        for classifier in clfs:

            print("----------------------------------------------")
            print("----------------------------------------------")
            print(classifier)
            print("----------------------------------------------")
            print("----------------------------------------------")

            sel = RFE(classifier, n_features_to_select = index)
            sel.fit(X_train, y_train)
            features = X_train.columns[sel.get_support()]
            X_train_rfe = sel.transform(X_train)
            X_test_rfe = sel.transform(X_test)
            print('Selected Feature', index)
            print(features)

            classifier.fit(X_train_rfe, y_train)
            y_pred= classifier.predict(X_test_rfe)
            scores = cross_val_score(pipeline, X_train_rfe, y_train, cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1))
            
            print("----------------------------------------------")
            print("TRAIN-TEST")
            print("----------------------------------------------")


            print('confusion matrix', classifier)
            print(confusion_matrix(y_test, y_pred))
            print('classification report')
            print(classification_report(y_test, y_pred))
            print('accuracy score')
            print(accuracy_score(y_test, y_pred))
            

            X_valid_rfe = sel.transform(X_valid)
            y_pred = classifier.predict(X_valid_rfe)

            print("----------------------------------------------")
            print("TRAIN-VALIDATION")
            print("----------------------------------------------")

            print('confusion matrix', classifier)
            print(confusion_matrix(y_valid, y_pred))
            print('classification report')
            print(classification_report(y_valid, y_pred))
            print('accuracy score')
            print(accuracy_score(y_valid, y_pred))

## ML on best model

In [217]:
def best_model_selection(X_train, X_test, X_valid, y_train, y_test, y_valid, index, best_classifier): 
    
    pipeline = Pipeline([('scaler', StandardScaler()), ('clf', best_classifier)])
    
    y_pred = []
    clfs = []
    
    # append classifiers to pipeline
    clfs.append(best_classifier)
    
    print("----------------------------------------------")
    print("----------------------------------------------")
    print(best_classifier)
    print("----------------------------------------------")
    print("----------------------------------------------")

    sel = RFE(best_classifier, n_features_to_select = index)
    sel.fit(X_train, y_train)
    features = X_train.columns[sel.get_support()]
    X_train_rfe = sel.transform(X_train)
    X_test_rfe = sel.transform(X_test)
    print('Selected Feature', index)
    print(features)

    best_classifier.fit(X_train_rfe, y_train)
    # save the model to disk
    filename = 'finalized_model.sav'
    pickle.dump(best_classifier, open(filename, 'wb'))
    
    y_pred= best_classifier.predict(X_test_rfe)
    scores = cross_val_score(pipeline, X_train_rfe, y_train, cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1))
            
    print("----------------------------------------------")
    print("TRAIN-TEST")
    print("----------------------------------------------")


    print('confusion matrix', best_classifier)
    print(confusion_matrix(y_test, y_pred))
    print('classification report')
    print(classification_report(y_test, y_pred))
    print('accuracy score')
    print(accuracy_score(y_test, y_pred))
            

    X_valid_rfe = sel.transform(X_valid)
    y_pred = best_classifier.predict(X_valid_rfe)

    print("----------------------------------------------")
    print("TRAIN-VALIDATION")
    print("----------------------------------------------")

    print('confusion matrix', best_classifier)
    print(confusion_matrix(y_valid, y_pred))
    print('classification report')
    print(classification_report(y_valid, y_pred))
    print('accuracy score')
    print(accuracy_score(y_valid, y_pred))
    
    return filename

## Run ML on particular machine

In [199]:
machine=input('Which machine? ')

Which machine? fan


In [200]:
machine=concatenate_pd(machine)

In [201]:
samples=undersampling(machine)

In [202]:
X_train, X_test, X_valid, y_train, y_test, y_valid=split_data(samples)

In [203]:
model_selection(X_train, X_test, X_valid, y_train, y_test, y_valid)

----------------------------------------------
----------------------------------------------
DecisionTreeClassifier()
----------------------------------------------
----------------------------------------------
Selected Feature 1
Index(['melspectrogram_sum'], dtype='object')
----------------------------------------------
TRAIN-TEST
----------------------------------------------
confusion matrix DecisionTreeClassifier()
[[2342  755]
 [ 424 2674]]
classification report
              precision    recall  f1-score   support

           0       0.85      0.76      0.80      3097
           1       0.78      0.86      0.82      3098

    accuracy                           0.81      6195
   macro avg       0.81      0.81      0.81      6195
weighted avg       0.81      0.81      0.81      6195

accuracy score
0.8096852300242131
----------------------------------------------
TRAIN-VALIDATION
----------------------------------------------
confusion matrix DecisionTreeClassifier()
[[3007  976]

Selected Feature 4
Index(['melspectrogram', 'mfcc', 'rms', 'spectral_flatness '], dtype='object')
----------------------------------------------
TRAIN-TEST
----------------------------------------------
confusion matrix DecisionTreeClassifier()
[[2818  279]
 [ 162 2936]]
classification report
              precision    recall  f1-score   support

           0       0.95      0.91      0.93      3097
           1       0.91      0.95      0.93      3098

    accuracy                           0.93      6195
   macro avg       0.93      0.93      0.93      6195
weighted avg       0.93      0.93      0.93      6195

accuracy score
0.9288135593220339
----------------------------------------------
TRAIN-VALIDATION
----------------------------------------------
confusion matrix DecisionTreeClassifier()
[[3595  388]
 [ 204 3778]]
classification report
              precision    recall  f1-score   support

           0       0.95      0.90      0.92      3983
           1       0.91      0.95 

Selected Feature 7
Index(['melspectrogram', 'melspectrogram_std', 'mfcc', 'rms',
       'spectral_contrast', 'spectral_flatness ', 'spectral_rolloff'],
      dtype='object')
----------------------------------------------
TRAIN-TEST
----------------------------------------------
confusion matrix DecisionTreeClassifier()
[[2854  243]
 [ 146 2952]]
classification report
              precision    recall  f1-score   support

           0       0.95      0.92      0.94      3097
           1       0.92      0.95      0.94      3098

    accuracy                           0.94      6195
   macro avg       0.94      0.94      0.94      6195
weighted avg       0.94      0.94      0.94      6195

accuracy score
0.9372074253430186
----------------------------------------------
TRAIN-VALIDATION
----------------------------------------------
confusion matrix DecisionTreeClassifier()
[[3672  311]
 [ 157 3825]]
classification report
              precision    recall  f1-score   support

           0

Selected Feature 10
Index(['melspectrogram', 'melspectrogram_sum', 'melspectrogram_std', 'mfcc',
       'rms', 'spectral_centroid', 'spectral_contrast', 'spectral_flatness ',
       'spectral_rolloff', 'min harm'],
      dtype='object')
----------------------------------------------
TRAIN-TEST
----------------------------------------------
confusion matrix DecisionTreeClassifier()
[[2844  253]
 [ 154 2944]]
classification report
              precision    recall  f1-score   support

           0       0.95      0.92      0.93      3097
           1       0.92      0.95      0.94      3098

    accuracy                           0.93      6195
   macro avg       0.93      0.93      0.93      6195
weighted avg       0.93      0.93      0.93      6195

accuracy score
0.9343018563357547
----------------------------------------------
TRAIN-VALIDATION
----------------------------------------------
confusion matrix DecisionTreeClassifier()
[[3650  333]
 [ 182 3800]]
classification report
    

----------------------------------------------
TRAIN-TEST
----------------------------------------------
confusion matrix RandomForestClassifier()
[[2962  135]
 [ 105 2993]]
classification report
              precision    recall  f1-score   support

           0       0.97      0.96      0.96      3097
           1       0.96      0.97      0.96      3098

    accuracy                           0.96      6195
   macro avg       0.96      0.96      0.96      6195
weighted avg       0.96      0.96      0.96      6195

accuracy score
0.9612590799031477
----------------------------------------------
TRAIN-VALIDATION
----------------------------------------------
confusion matrix RandomForestClassifier()
[[3779  204]
 [ 117 3865]]
classification report
              precision    recall  f1-score   support

           0       0.97      0.95      0.96      3983
           1       0.95      0.97      0.96      3982

    accuracy                           0.96      7965
   macro avg       0.96

----------------------------------------------
TRAIN-TEST
----------------------------------------------
confusion matrix DecisionTreeClassifier()
[[2853  244]
 [ 137 2961]]
classification report
              precision    recall  f1-score   support

           0       0.95      0.92      0.94      3097
           1       0.92      0.96      0.94      3098

    accuracy                           0.94      6195
   macro avg       0.94      0.94      0.94      6195
weighted avg       0.94      0.94      0.94      6195

accuracy score
0.938498789346247
----------------------------------------------
TRAIN-VALIDATION
----------------------------------------------
confusion matrix DecisionTreeClassifier()
[[3643  340]
 [ 166 3816]]
classification report
              precision    recall  f1-score   support

           0       0.96      0.91      0.94      3983
           1       0.92      0.96      0.94      3982

    accuracy                           0.94      7965
   macro avg       0.94 

----------------------------------------------
TRAIN-TEST
----------------------------------------------
confusion matrix RandomForestClassifier()
[[2992  105]
 [ 111 2987]]
classification report
              precision    recall  f1-score   support

           0       0.96      0.97      0.97      3097
           1       0.97      0.96      0.97      3098

    accuracy                           0.97      6195
   macro avg       0.97      0.97      0.97      6195
weighted avg       0.97      0.97      0.97      6195

accuracy score
0.9651331719128329
----------------------------------------------
TRAIN-VALIDATION
----------------------------------------------
confusion matrix RandomForestClassifier()
[[3805  178]
 [ 138 3844]]
classification report
              precision    recall  f1-score   support

           0       0.97      0.96      0.96      3983
           1       0.96      0.97      0.96      3982

    accuracy                           0.96      7965
   macro avg       0.96

## Select best model based on metrics

In [None]:
thisdict = {
  "brand": "Ford",
  "model": "Mustang",
  "year": 1964
}

In [234]:
index = 7

- DecisionTreeClassifier()
- RandomForestClassifier()

In [235]:
best_classifier = RandomForestClassifier()

In [236]:
filename=best_model_selection(X_train, X_test, X_valid, y_train, y_test, y_valid, index, best_classifier)

----------------------------------------------
----------------------------------------------
RandomForestClassifier()
----------------------------------------------
----------------------------------------------
Selected Feature 7
Index(['melspectrogram', 'melspectrogram_sum', 'melspectrogram_std', 'mfcc',
       'rms', 'spectral_centroid', 'spectral_rolloff'],
      dtype='object')
----------------------------------------------
TRAIN-TEST
----------------------------------------------
confusion matrix RandomForestClassifier()
[[2958  139]
 [ 118 2980]]
classification report
              precision    recall  f1-score   support

           0       0.96      0.96      0.96      3097
           1       0.96      0.96      0.96      3098

    accuracy                           0.96      6195
   macro avg       0.96      0.96      0.96      6195
weighted avg       0.96      0.96      0.96      6195

accuracy score
0.9585149313962873
----------------------------------------------
TRAIN-VAL

## Loading Model

In [237]:
def extract_features(file):

    # check if normal or abnormal
    if 'abnormal' in file:
        operation=1
    else:
        operation=0
    # loading the file, getting y and sr (sample rate)

    y, sr = librosa.load(file)

    # Getting S and phase

    S, phase = librosa.magphase(librosa.stft(y=y))

    # Features for the DataFrame
# chroma features have pitch which is probably not usefull
    chroma_stft = np.mean(librosa.feature.chroma_stft(y=y, sr=sr))

    chroma_cqt = np.mean(librosa.feature.chroma_cqt(y=y, sr=sr))

    chroma_cens = np.mean(librosa.feature.chroma_cens(y=y, sr=sr))
# to do
    melspectrogram = np.mean(librosa.feature.melspectrogram(y=y, sr=sr, S=S))
    melspectrogram_min = np.min(librosa.feature.melspectrogram(y=y, sr=sr, S=S))
    melspectrogram_max = np.max(librosa.feature.melspectrogram(y=y, sr=sr, S=S))
    melspectrogram_sum = librosa.feature.melspectrogram(y=y, sr=sr, S=S).sum()
    melspectrogram_corr= np.mean(np.corrcoef(librosa.feature.melspectrogram(y=y, sr=sr, S=S)))
    melspectrogram_std= np.std(librosa.feature.melspectrogram(y=y, sr=sr, S=S))
# to check This feature is one of the most important method to extract a feature of an audio signal and is 
#used majorly whenever working on audio signals. The mel frequency cepstral coefficients (MFCCs) of a signal 
#are a small set of features (usually about 10–20) which concisely describe the overall shape of a spectral 
#envelope.
    
    mfcc = np.mean(librosa.feature.mfcc(y=y, sr=sr))
#rms when plotted similarities with melspectorgram
#normal rms mean higher than abnormal?
    rms = np.mean(librosa.feature.rms(y=y, S=S))
#spectral centroid computes weighted mean of the frequencies in the sound
#plot is line on melspectogram
    spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr, S=S))
#Bandwidth(blue zone) is the difference between the upper and lower frequencies in a continuous band of frequencies
    spectral_bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr, S=S))
#needs further investigation, but makes continous data discrete in bins
    spectral_contrast = np.mean(librosa.feature.spectral_contrast(y=y, sr=sr, S=S))
#Spectral flatness (or tonality coefficient) is a measure to quantify how much noise-like a sound is, as opposed to
#being tone-like 1. A high spectral flatness (closer to 1.0) indicates the spectrum is similar to white noise. 
#It is often converted to decibel.
    spectral_flatness = np.mean(librosa.feature.spectral_flatness(y=y, S=S))
#The roll-off frequency is defined for each frame as the center frequency for a spectrogram bin such that at 
#least roll_percent (0.85 by default) of the energy of the spectrum in this frame is contained in this bin and 
#he bins below. This can be used to, e.g., approximate the maximum (or minimum) frequency by setting roll_percent 
#to a value close to 1 (or 0).
#rolloff with rolloff coefficient 0.01 seems to be the same for (ab)normal
    spectral_rolloff = np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr, S=S))
#Get coefficients of fitting an nth-order polynomial to the columns of a spectrogram
    poly_features = np.mean(librosa.feature.poly_features(y=y, sr=sr, S=S))
#probably chroma
    tonnetz = np.mean(librosa.feature.tonnetz(y=y, sr=sr))
#normal has higher zero crossing rate?
    zero_crossing_rate = np.mean(librosa.feature.zero_crossing_rate(y=y))
#divide in frequency bands could give more information bout difference normal and abnormal (BPM)
    #tempo = librosa.beat.tempo(onset_envelope=oenv, sr=sr,
                       #    hop_length=hop_length)[0]
    # putting them into a dataframe
    y_harmonic, y_percussive = librosa.effects.hpss(y)
    return pd.DataFrame({ 'melspectrogram' : [melspectrogram],'melspectrogram_min':[melspectrogram_min],
                             'melspectrogram_max':[melspectrogram_max], 'melspectrogram_sum':[melspectrogram_sum],
                             'melspectrogram_corr':[melspectrogram_corr] ,'melspectrogram_std':[melspectrogram_std] ,
                             'mfcc' : [mfcc], 'rms' : [rms],
                            'spectral_centroid' : [spectral_centroid], 'spectral_bandwidth' : [spectral_bandwidth],
                            'spectral_contrast' : [spectral_contrast], 'spectral_flatness ' : [spectral_flatness],
                            'spectral_rolloff' : [spectral_rolloff], 
                            'zero_crossing_rate' : [zero_crossing_rate],"mean harm": np.mean(y_harmonic),
                            "mean perc": [np.mean(y_percussive)],"max harm":[np.max(y_harmonic)],"max perc": [np.max(y_percussive)],
                             "min harm":[np.min(y_harmonic)], "min perc":[np.min(y_percussive)], 'normal(0)/abnormal(1)':[operation]})

In [238]:
loaded_model = pickle.load(open(filename, 'rb'))
file='/home/regis/Desktop/Sound Project/files/fan/0_dB_fan/fan/id_02/abnormal/00000336.wav'
df=extract_features(file)

X = df.drop(columns = ['normal(0)/abnormal(1)'])
X_reduced=X[['melspectrogram', 'melspectrogram_sum', 'melspectrogram_std', 'mfcc',
       'rms', 'spectral_flatness ', 'spectral_rolloff']]
y = df['normal(0)/abnormal(1)']

In [239]:
X

Unnamed: 0,melspectrogram,melspectrogram_min,melspectrogram_max,melspectrogram_sum,melspectrogram_corr,melspectrogram_std,mfcc,rms,spectral_centroid,spectral_bandwidth,spectral_contrast,spectral_flatness,spectral_rolloff,zero_crossing_rate,mean harm,mean perc,max harm,max perc,min harm,min perc
0,0.016932,2.008702e-09,0.173215,934.114929,0.044742,0.019552,-11.308414,0.006176,1705.02095,1744.876804,21.038303,0.000733,3617.103495,0.094367,-3e-06,1e-06,0.014587,0.013274,-0.014474,-0.015754


In [240]:
y

0    1
Name: normal(0)/abnormal(1), dtype: int64

In [242]:
y_pred= loaded_model.predict(X_reduced)
y_pred

array([1])