In [57]:
import os
import librosa
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, Conv2D, MaxPooling2D, GlobalAveragePooling2D
from keras.optimizers import Adam
import matplotlib.pyplot as plt
import soundfile as sf


## Prepare helping functions

In [58]:
# features extraction function
def extract_mfcc(audio, sample_rate, n_mfcc=40):
    mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)
    #mfccs_scaled = np.mean(mfccs.T, axis=0)
    return mfccs

def extract_mel_spectrogram(audio, sample_rate, n_mels=32):
    mel_spectrogram = librosa.feature.melspectrogram(audio, sr=sample_rate, n_mels=n_mels)
    return librosa.power_to_db(mel_spectrogram)

def extract_chroma_features(audio, sample_rate):
    chroma_features = librosa.feature.chroma_stft(audio, sr=sample_rate)
    return chroma_features

def extract_spectral_contrast(audio, sample_rate):
    spectral_contrast = librosa.feature.spectral_contrast(audio, sr=sample_rate)
    return spectral_contrast

def extract_tonnetz(audio, sample_rate):
    tonnetz = librosa.feature.tonnetz(audio, sr=sample_rate)
    return tonnetz

def extract_zero_crossing_rate(audio):
    zero_crossing_rate = librosa.feature.zero_crossing_rate(audio)
    return zero_crossing_rate

In [74]:
# functions for data augmentation
def time_stretch(audio, rate):
    return librosa.effects.time_stretch(audio, rate)

def pitch_shift(audio, sample_rate, n_steps):
    return librosa.effects.pitch_shift(audio, sample_rate, n_steps)

def add_noise(audio, noise_factor):
    noise = np.random.randn(len(audio))
    return audio + noise_factor * noise

def augment_audio(audio, sample_rate):
    # Time stretching
    stretch_rate = np.random.uniform(0.8, 1.2)
    stretched_audio = time_stretch(audio, stretch_rate)

    # Pitch shifting
    n_steps = np.random.randint(-2, 3)
    shifted_audio = pitch_shift(audio, sample_rate, n_steps)

    # Adding noise
    noise_factor = np.random.uniform(0.005, 0.05)
    noisy_audio = add_noise(audio, noise_factor)

    return [stretched_audio, shifted_audio, noisy_audio]


def augment_and_save(file_path, output_dir):
    audio, sample_rate = librosa.load(file_path)
    augmented_audios = augment_audio(audio, sample_rate)

    file_name, ext = os.path.splitext(os.path.basename(file_path))
    
    for i, augmented_audio in enumerate(augmented_audios):
        output_file_path = os.path.join(output_dir, f"{file_name}_augmented_{i + 1}{ext}")
        sf.write(output_file_path, augmented_audio, sample_rate)


# # Augmenting minor classes

# data_path = 'dataset/set_b'
# output_dir = 'dataset/augmented_data'
# for file in os.listdir(data_path):
#     if file.endswith('.wav'):
#         file_path = os.path.join(data_path, file)
#         if file.startswith('murmur') and 'augmented' not in file_path:
#             augment_and_save(file_path, output_dir)
#         elif file.startswith('extrastole') and 'augmented' not in file_path:
#             augment_and_save(file_path, output_dir)


  return librosa.effects.time_stretch(audio, rate)
  return librosa.effects.pitch_shift(audio, sample_rate, n_steps)
  return librosa.effects.time_stretch(audio, rate)
  return librosa.effects.pitch_shift(audio, sample_rate, n_steps)
  return librosa.effects.time_stretch(audio, rate)
  return librosa.effects.pitch_shift(audio, sample_rate, n_steps)
  return librosa.effects.time_stretch(audio, rate)
  return librosa.effects.pitch_shift(audio, sample_rate, n_steps)
  return librosa.effects.time_stretch(audio, rate)
  return librosa.effects.time_stretch(audio, rate)
  return librosa.effects.time_stretch(audio, rate)
  return librosa.effects.time_stretch(audio, rate)
  return librosa.effects.pitch_shift(audio, sample_rate, n_steps)
  return librosa.effects.time_stretch(audio, rate)
  return librosa.effects.time_stretch(audio, rate)
  return librosa.effects.time_stretch(audio, rate)
  return librosa.effects.time_stretch(audio, rate)
  return librosa.effects.time_stretch(audio, rate)
  retur

In [79]:
# extract features from audio files
def extract_features(file_name, fixed_length=5):
    audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast')
    target_length = int(fixed_length * sample_rate)

    if len(audio) < target_length:
        # Pad the audio signal with zeros
        audio = np.pad(audio, (0, target_length - len(audio)), mode='constant')
    elif len(audio) > target_length:
        # Truncate the audio signal
        audio = audio[:target_length]

    mfccs = extract_mfcc(audio, sample_rate) # No. of features = 40 
    mel_spectrogram = extract_mel_spectrogram(audio, sample_rate) # No. of features = 32
    chroma_features = extract_chroma_features(audio, sample_rate) # No. of features = 12
    spectral_contrast = extract_spectral_contrast(audio, sample_rate) # No. of features = 7
    tonnetz = extract_tonnetz(audio, sample_rate) # No. of features = 6 
    zero_crossing_rate = extract_zero_crossing_rate(audio) # No. of features = 1

    combined_features = np.concatenate([
        mfccs,
        mel_spectrogram,
        chroma_features,
        spectral_contrast,
        tonnetz,
        zero_crossing_rate
    ])
    
    return combined_features

# load data helping function
def load_data(data_path, augment):
    features = []
    labels = []
    
    if not augment:
        for file in os.listdir(data_path):
            if file.startswith('normal'):
                label = 'normal'
            elif file.startswith('murmur'):
                label = 'murmur'
            elif file.startswith('extrastole'):
                label = 'extrastole'
            else:
                # Ignore files that do not belong to the three classes
                continue

            file_path = os.path.join(data_path, file)
            feature = extract_features(file_path)
            features.append(feature)
            labels.append(label)

    elif augment:
        augmented_dir = 'dataset/augmented_data'
        for file in os.listdir(augmented_dir):
            if file.startswith('normal'):
                label = 'normal'
            elif file.startswith('murmur'):
                label = 'murmur'
            elif file.startswith('extrastole'):
                label = 'extrastole'
            else:
                # Ignore files that do not belong to the three classes
                continue

            file_path = os.path.join(augmented_dir, file)
            feature = extract_features(file_path)
            features.append(feature)
            labels.append(label)
    
    return np.array(features), np.array(labels)



In [80]:
# feature selection function
def select_feature(data, feature='all'):
    if feature == 'all':
        return data
    elif feature == 'mfccs': 
        return data[:, 0:40]
    elif feature == 'mel_spectrogram': 
        return data[:, 40:72]
    elif feature == 'chroma_features': 
        return data[:, 72:84]
    elif feature == 'spectral_contrast': 
        return data[:, 84:91]
    elif feature == 'tonnetz': 
        return data[:, 91:97]
    elif feature == 'zero_crossing_rate': 
        return data[:, 97:]

## Load data

In [81]:
# Load data and split into train and test sets before added augmented data to train set
data_path = 'dataset/set_b'  # Replace with the path to your dataset

X, y = load_data(data_path, augment=False)

le = LabelEncoder()
y_encoded = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)
X_train_augmented, y_train_augmented = load_data(data_path, augment=True)

  mel_spectrogram = librosa.feature.melspectrogram(audio, sr=sample_rate, n_mels=n_mels)
  chroma_features = librosa.feature.chroma_stft(audio, sr=sample_rate)
  spectral_contrast = librosa.feature.spectral_contrast(audio, sr=sample_rate)
  tonnetz = librosa.feature.tonnetz(audio, sr=sample_rate)
 -0.00597504] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel_spectrogram = librosa.feature.melspectrogram(audio, sr=sample_rate, n_mels=n_mels)
 -0.00597504] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  chroma_features = librosa.feature.chroma_stft(audio, sr=sample_rate)
 -0.00597504] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  spectral_contrast = librosa.feature.spectral_contrast(audio, sr=sample_rate)
 -0.00597504] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  tonnetz = librosa

In [82]:
X_train_augmented.shape

(423, 98, 216)

In [83]:
X_train_extra = np.append(X_train, X_train_augmented, axis=0)
y_train_extra = np.append(y_train, le.fit_transform(y_train_augmented))
y_categorical_train = to_categorical(y_train_extra)
y_categorical_test = to_categorical(y_test)


In [84]:
print(X_train_extra.shape)
print(y_train_extra.shape)
print(X_test.shape)
print(y_test.shape)

(791, 98, 216)
(791,)
(93, 98, 216)
(93,)


## Construct InceptionTime model using tensorflow

In [50]:
from keras.layers import Conv1D, MaxPooling1D, GlobalAveragePooling1D, Concatenate, Input
from keras.models import Model

def inception_module(input_tensor, n_filters=32):
    # 1x1 convolution
    conv1 = Conv1D(n_filters, 1, activation='relu', padding='same')(input_tensor)

    # 3x3 convolution followed by 1x1 convolution
    conv3 = Conv1D(n_filters, 3, activation='relu', padding='same')(input_tensor)
    conv3 = Conv1D(n_filters, 1, activation='relu', padding='same')(conv3)

    # 5x5 convolution followed by 1x1 convolution
    conv5 = Conv1D(n_filters, 5, activation='relu', padding='same')(input_tensor)
    conv5 = Conv1D(n_filters, 1, activation='relu', padding='same')(conv5)

    # 3x3 max pooling followed by 1x1 convolution
    pool = MaxPooling1D(3, strides=1, padding='same')(input_tensor)
    pool = Conv1D(n_filters, 1, activation='relu', padding='same')(pool)

    # Concatenate the outputs of the four branches
    output = Concatenate(axis=-1)([conv1, conv3, conv5, pool])
    return output

def create_inceptiontime_model(input_shape, num_classes, n_filters=32, n_modules=6):
    input_tensor = Input(shape=input_shape)

    x = input_tensor
    for _ in range(n_modules):
        x = inception_module(x, n_filters)

    x = GlobalAveragePooling1D()(x)
    x = Dense(num_classes, activation='softmax')(x)

    model = Model(input_tensor, x)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

input_shape = (X_train_extra.shape[1], X_train_extra.shape[2])
num_classes = len(np.unique(y_encoded))

inceptiontime_model = create_inceptiontime_model(input_shape, num_classes)
inceptiontime_model.summary()


Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 98, 216)]    0           []                               
                                                                                                  
 conv1d_73 (Conv1D)             (None, 98, 32)       20768       ['input_3[0][0]']                
                                                                                                  
 conv1d_75 (Conv1D)             (None, 98, 32)       34592       ['input_3[0][0]']                
                                                                                                  
 max_pooling1d_12 (MaxPooling1D  (None, 98, 216)     0           ['input_3[0][0]']                
 )                                                                                          

## Run and Collect results

In [55]:
from sklearn.utils import class_weight
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

class_weights = class_weight.compute_class_weight(class_weight = 'balanced', classes = np.unique(y_train_extra), y = y_train_extra)
class_weight_dict = dict(enumerate(class_weights))
epochs = 50
batch_size = 32
#callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=10)


for feature in ['mfccs','mel_spectrogram','chroma_features',
            'spectral_contrast','tonnetz','zero_crossing_rate', 'all']:
    
    X_train_feature = select_feature(X_train_extra, feature)
    X_test_feature = select_feature(X_test, feature)

    input_shape = (X_train_feature.shape[1], X_train_feature.shape[2])
    num_classes = len(np.unique(y_train_extra))
    inceptiontime_model = create_inceptiontime_model(input_shape, num_classes)
    model_history = inceptiontime_model.fit(X_train_feature, y_categorical_train, batch_size=batch_size, epochs=epochs, 
                                            validation_split=0.1, class_weight=class_weight_dict, verbose = 0)
    # model_test_loss, model_test_accuracy = inceptiontime_model.evaluate(X_test, y_categorical_test)
    # print("Accuracy: ", model_test_accuracy)

    y_pred = [np.argmax(y, axis=None, out=None) for y in inceptiontime_model.predict(X_test_feature, verbose = 0)]
    target_names = ['extrasystole', 'murmur', 'normal']
    print(feature)
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred, target_names=target_names))
    print("--------------------------------------------------------------")



mfccs
[[ 5  1  5]
 [ 1 14  4]
 [ 5 13 45]]
              precision    recall  f1-score   support

extrasystole       0.45      0.45      0.45        11
      murmur       0.50      0.74      0.60        19
      normal       0.83      0.71      0.77        63

    accuracy                           0.69        93
   macro avg       0.60      0.64      0.61        93
weighted avg       0.72      0.69      0.70        93

--------------------------------------------------------------
mel_spectrogram
[[10  1  0]
 [ 0 19  0]
 [ 4 12 47]]
              precision    recall  f1-score   support

extrasystole       0.71      0.91      0.80        11
      murmur       0.59      1.00      0.75        19
      normal       1.00      0.75      0.85        63

    accuracy                           0.82        93
   macro avg       0.77      0.89      0.80        93
weighted avg       0.88      0.82      0.83        93

--------------------------------------------------------------
chroma_features
