a hybrid LSTM-SVM model for musical instrument classification, which includes feature extraction, an LSTM for learning temporal features, and an SVM for final classification. 

In [1]:
import os
import librosa
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Input
from tensorflow.keras.optimizers import Adam

def extract_features(file_path, n_mfcc=13):
    y, sr = librosa.load(file_path, sr=None)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    delta_mfccs = librosa.feature.delta(mfccs)
    delta2_mfccs = librosa.feature.delta(mfccs, order=2)
    combined_features = np.concatenate((mfccs, delta_mfccs, delta2_mfccs), axis=0)
    return combined_features.T  # Transpose for (time steps, features)

def load_data(data_directory):
    features, labels = [], []
    label_encoder = LabelEncoder()
    
    for instrument in os.listdir(data_directory):
        instrument_path = os.path.join(data_directory, instrument)
        if os.path.isdir(instrument_path):
            for file in os.listdir(instrument_path):
                if file.endswith('.wav'):
                    file_path = os.path.join(instrument_path, file)
                    feature = extract_features(file_path)
                    features.append(feature)
                    labels.append(instrument)
    
    # Flatten the features for use in SVM after LSTM
    labels_encoded = label_encoder.fit_transform(labels)
    print(f"Label encoding: {dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))}")
    return features, labels_encoded, label_encoder

def create_lstm_model(input_shape):
    model = Sequential([
        Input(shape=input_shape),
        LSTM(128, return_sequences=False),  # Use the final LSTM output as features
        Dense(64, activation='relu')
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')  # Loss irrelevant for feature extraction
    return model

import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences

def train_lstm(features):
    # Define a fixed number of time steps (choose based on average or max length of feature sequences)
    time_steps = 614  # Adjust this value if needed based on your data
    
    # Pad or truncate feature sequences to ensure uniform shape
    padded_features = pad_sequences(features, maxlen=time_steps, dtype='float32', padding='post', truncating='post')
    
    # Define LSTM input shape (time_steps, feature_dimension)
    lstm_input_shape = (padded_features.shape[1], padded_features.shape[2])
    
    # LSTM model
    lstm_model = Sequential([
        Input(shape=lstm_input_shape),
        LSTM(128, return_sequences=False),  # Use the final LSTM output
        Dense(64, activation='relu')
    ])
    lstm_model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
    
    dummy_y = np.zeros((len(padded_features), 64))  # Dummy targets
    lstm_model.fit(padded_features, dummy_y, epochs=5, batch_size=32, verbose=1)
    return lstm_model

from tensorflow.keras.preprocessing.sequence import pad_sequences

def extract_lstm_features(lstm_model, features):
    # Determine the maximum time steps (optional: use fixed length like 100 for consistency)
    max_time_steps = max(len(seq) for seq in features)
    
    # Pad sequences to the same length with zeros
    padded_features = pad_sequences(features, maxlen=max_time_steps, padding='post', dtype='float32')
    
    # Get predictions (LSTM output)
    lstm_output = lstm_model.predict(padded_features, verbose=0)
    
    # Since LSTM output is 2D (samples, output_dim), return it directly
    return lstm_output

from sklearn.svm import SVC

def train_svm(X_train, y_train):
    svm = SVC(kernel='rbf', C=1.0, gamma='scale')  # Experiment with kernel, C, and gamma
    svm.fit(X_train, y_train)
    return svm


from sklearn.metrics import classification_report, accuracy_score

def evaluate_svm(svm, X_test, y_test, label_encoder):
    y_pred = svm.predict(X_test)
    print("Classification Report:")
    # Ensure labels are explicitly defined to cover all possible classes
    labels = list(range(len(label_encoder.classes_)))
    print(classification_report(y_test, y_pred, labels=labels, target_names=label_encoder.classes_,zero_division=0))
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")

data_directory = 'C:/Users/Natasha/Desktop/research_module/Musical_Instrument_Data'
features, labels, label_encoder = load_data(data_directory)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Use stratify to split data with balanced classes
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3, stratify=labels, random_state=42)

# Check the distribution of classes after stratified split
print("Classes in y_train:", np.unique(y_train))
print("Classes in y_test:", np.unique(y_test))

# Train LSTM model
lstm_model = train_lstm(X_train)

# Extract LSTM-based features for both training and testing sets
X_train_lstm = extract_lstm_features(lstm_model, X_train)  # Shape: (num_samples, timesteps, num_features)
X_test_lstm = extract_lstm_features(lstm_model, X_test)    # Shape: (num_samples, timesteps, num_features)

# Apply mean pooling along the time axis (axis=1)
X_train_lstm_mean = np.mean(X_train_lstm, axis=1)  # Shape: (num_samples, num_features)
X_test_lstm_mean = np.mean(X_test_lstm, axis=1)    # Shape: (num_samples, num_features)

# Normalize mean-pooled LSTM-based features
scaler = StandardScaler()
X_train_lstm_scaled = scaler.fit_transform(X_train_lstm_mean)
X_test_lstm_scaled = scaler.transform(X_test_lstm_mean)

# Train and evaluate SVM with mean-pooled LSTM-based features
svm = train_svm(X_train_lstm_scaled, y_train)
evaluate_svm(svm, X_test_lstm_scaled, y_test, label_encoder)

# Check unique classes in training and testing sets
print("Classes in y_train:", np.unique(y_train))
print("Classes in y_test:", np.unique(y_test))


Label encoding: {'Acoustic_guitar': 0, 'Bass_drum': 1, 'Cello': 2, 'Clarinet': 3, 'Double_bass': 4, 'Flute': 5, 'Hi_hat': 6, 'Saxophone': 7, 'Snare_drum': 8, 'Violin': 9}
Classes in y_train: [0 1 2 3 4 5 6 7 8 9]
Classes in y_test: [0 1 2 3 4 5 6 7 8 9]
Epoch 1/5
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 562ms/step - loss: 0.0148
Epoch 2/5
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 552ms/step - loss: 0.0056
Epoch 3/5
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 576ms/step - loss: 0.0016
Epoch 4/5
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 561ms/step - loss: 4.7261e-04
Epoch 5/5
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 505ms/step - loss: 1.7835e-04


ValueError: Expected 2D array, got 1D array instead:
array=[0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.00692893 0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.        ].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.