LSTM

In [1]:
import os
import librosa
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Masking

# Step 1: Extract all audio file paths
root_dir = 'C:/Users/Natasha/Desktop/research_module/Musical_Instrument_Data'

# Initialize an empty list to store audio file paths and corresponding labels
audio_file_paths = []
labels = []

# Walk through all subdirectories to gather file paths and instrument labels
for instrument_folder in os.listdir(root_dir):
    instrument_folder_path = os.path.join(root_dir, instrument_folder)
    if os.path.isdir(instrument_folder_path):  # Check if it's a directory
        for filename in os.listdir(instrument_folder_path):
            if filename.endswith(('.wav', '.mp3', '.flac')):  # Add more extensions if needed
                file_path = os.path.join(instrument_folder_path, filename)
                audio_file_paths.append(file_path)
                labels.append(instrument_folder)  # Use folder name as the label

# Step 2: Define a function to extract features from an audio file
def extract_features(file_path):
    y, sr = librosa.load(file_path, sr=None)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    delta_mfccs = librosa.feature.delta(mfccs)
    combined_features = np.hstack((mfccs.T, delta_mfccs.T))
    return combined_features

# Step 3: Extract features for all files
X = [extract_features(path) for path in audio_file_paths]
max_timesteps = max([x.shape[0] for x in X])  # Maximum time steps
num_features = X[0].shape[1]  # Number of features per timestep

# Pad sequences to the same length
X_padded = pad_sequences(X, maxlen=max_timesteps, dtype='float32', padding='post', truncating='post')

# Encode labels as integers
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(labels)

# Step 4: Build an LSTM model
model = Sequential([
    Masking(mask_value=0.0, input_shape=(max_timesteps, num_features)),
    LSTM(128, return_sequences=False),
    Dense(64, activation='relu'),
    Dense(len(label_encoder.classes_), activation='softmax')
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Step 5: Train the model
model.fit(X_padded, y_encoded, epochs=20, batch_size=32)

# Save the model if needed
model.save('musical_instrument_lstm_model.h5')

# Step 6: Print label encoding map
print("Label encoding:", dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))


  super().__init__(**kwargs)


Epoch 1/20
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 6s/step - accuracy: 0.0749 - loss: 2.3443
Epoch 2/20
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 6s/step - accuracy: 0.1969 - loss: 2.1377
Epoch 3/20
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 6s/step - accuracy: 0.3644 - loss: 1.9613
Epoch 4/20
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 6s/step - accuracy: 0.4661 - loss: 1.8257
Epoch 5/20
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m645s[0m 71s/step - accuracy: 0.5502 - loss: 1.6861
Epoch 6/20
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 7s/step - accuracy: 0.6262 - loss: 1.5180
Epoch 7/20
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 7s/step - accuracy: 0.6692 - loss: 1.3957
Epoch 8/20
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 7s/step - accuracy: 0.7210 - loss: 1.2211
Epoch 9/20
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m



Label encoding: {'Acoustic_guitar': 0, 'Bass_drum': 1, 'Cello': 2, 'Clarinet': 3, 'Double_bass': 4, 'Flute': 5, 'Hi_hat': 6, 'Saxophone': 7, 'Snare_drum': 8, 'Violin': 9}


In [2]:
model.save('musical_instrument_lstm_model.keras')