# Kapre instrumental classification

In [26]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))


Num GPUs Available:  0


In [3]:
import os
import json
import librosa
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm

In [4]:
# Let's start by setting the data root and loading OpenMic-2018

# Set the following parameter before running your analysis
DATA_ROOT = Path('../data/raw/openmic-2018/')

# Load openmic
OPENMIC = np.load(DATA_ROOT.joinpath('openmic-2018.npz'), allow_pickle=True)
X, Y_true, Y_mask, sample_key = OPENMIC['X'], OPENMIC['Y_true'], OPENMIC['Y_mask'], OPENMIC['sample_key']

In [None]:
# Replace X by loading the ogg files as waveforms. This may take ~15 minutes.
wav = []

for i in tqdm(range(len(sample_key))):
    file = sample_key[i]
    dir = file[0:3]

    audio, sr = librosa.load(DATA_ROOT.joinpath('audio', dir, f"{file}.ogg"), sr=22050, mono=True)
    wav.append(audio)

X = np.array(wav)

    

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 67/20000 [00:07<37:05,  8.95it/s]  


KeyboardInterrupt: 

In [5]:
# Parameters for MFCC extraction, following Blazke & Bostek
n_fft = 1024  # Hamming window length (number of samples)
hop_length = 512  # Step size (number of samples)
n_mfcc = 40  # Number of MFCC bins

# Replace X by loading the ogg files as MFCCs
mfcc_features = []

for i in tqdm(range(len(sample_key))):
    file = sample_key[i]
    dir = file[0:3]
    
    # Load audio file as waveform
    audio, sr = librosa.load(DATA_ROOT.joinpath('audio', dir, f"{file}.ogg"), sr=22050, mono=True)
    
    # Compute MFCCs
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length)
    
    # Transpose to match time steps on the first axis
    mfcc_features.append(mfcc.T)

# Convert the list of arrays into a numpy array
X = np.array(mfcc_features)


100%|██████████| 20000/20000 [14:28<00:00, 23.02it/s]


In [10]:
print(f"Each input audio file now has the following shape: {X[0].shape}")
np.save("../data/processed/mfcc.npy", X)


Each input audio file now has the following shape: (431, 40)


In [11]:
# Load the class map and the train / test split
with open(os.path.join(DATA_ROOT, 'class-map.json'), 'r') as f:
    class_map = json.load(f)

# Let's split the data into the training and test set
# We use squeeze=True here to return a single array for each, rather than a full DataFrame

split_train = pd.read_csv(DATA_ROOT.joinpath('partitions/split01_train.csv'), 
                          header=None, squeeze=True)
split_test = pd.read_csv(DATA_ROOT.joinpath('partitions/split01_test.csv'), 
                         header=None, squeeze=True)

train_set = set(split_train)
test_set = set(split_test)



  split_train = pd.read_csv(DATA_ROOT.joinpath('partitions/split01_train.csv'),


  split_test = pd.read_csv(DATA_ROOT.joinpath('partitions/split01_test.csv'),


In [None]:
# These loops go through all sample keys, and save their row numbers
# to either idx_train or idx_test
#
# This will be useful in the next step for slicing the array data
idx_train, idx_test = [], []

for idx, n in enumerate(sample_key):
    if n in train_set:
        idx_train.append(idx)
    elif n in test_set:
        idx_test.append(idx)
    else:
        # This should never happen, but better safe than sorry.
        raise RuntimeError('Unknown sample key={}! Abort!'.format(sample_key[n]))
        
# Finally, cast the idx_* arrays to numpy structures
idx_train = np.asarray(idx_train)
idx_test = np.asarray(idx_test)

# Finally, we use the split indices to partition the features, labels, and masks
X_train = X[idx_train]
X_test = X[idx_test]

Y_true_train = Y_true[idx_train]
Y_true_test = Y_true[idx_test]

Y_mask_train = Y_mask[idx_train]
Y_mask_test = Y_mask[idx_test]

# Prepare binary labels for multi-label classification
Y_train_binary = (Y_true_train > 0.5).astype(int)
Y_test_binary = (Y_true_test > 0.5).astype(int)

In [None]:
# import numpy as np
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, Dropout
# from tensorflow.keras.optimizers import Adam
# from sklearn.metrics import classification_report

# # # Step 1: Preprocess data
# # # Average over time to create a fixed-size feature vector for each audio sample
# # X_train_avg = np.mean(X_train, axis=1)
# # X_test_avg = np.mean(X_test, axis=1)

# # Prepare binary labels for multi-label classification
# Y_train_binary = (Y_true_train > 0.5).astype(int)
# Y_test_binary = (Y_true_test > 0.5).astype(int)

# # Step 2: Define the model
# num_classes = 20  # Total number of instrument classes

# model = Sequential([
#     Dense(512, activation='relu', input_shape=(X_train.shape[1],)),
#     Dropout(0.3),
#     Dense(256, activation='relu'),
#     Dropout(0.3),
#     Dense(128, activation='relu'),
#     Dropout(0.2),
#     Dense(num_classes, activation='sigmoid')
# ])

# # Step 3: Compile the model
# model.compile(optimizer=Adam(learning_rate=0.001), 
#               loss='binary_crossentropy',
#               metrics=['Precision', 'Recall'])

# # Step 4: Train the model
# model.fit(X_train, Y_train_binary, epochs=50, batch_size=32, validation_split=0.2)

# # Step 5: Evaluate the model
# # Overall evaluation on test set
# test_loss, test_precision, test_recall = model.evaluate(X_test, Y_test_binary)
# print(f'Test Precision: {test_precision:.4f}, Test Recall: {test_recall:.4f}')

# # Predictions and per-class evaluation
# Y_pred_test = model.predict(X_test) > 0.5  # Convert predictions to binary

# # Print classification report per instrument
# print("Classification Report per Instrument:")
# print(classification_report(Y_test_binary, Y_pred_test, target_names=class_map.keys()))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m373/373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - Precision: 0.0597 - Recall: 0.0574 - loss: 0.5033 - val_Precision: 0.6250 - val_Recall: 0.0020 - val_loss: 0.1703
Epoch 2/50
[1m373/373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - Precision: 0.1927 - Recall: 0.0092 - loss: 0.1991 - val_Precision: 0.3452 - val_Recall: 0.0118 - val_loss: 0.1618
Epoch 3/50
[1m373/373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - Precision: 0.3456 - Recall: 0.0133 - loss: 0.1827 - val_Precision: 0.6136 - val_Recall: 0.0110 - val_loss: 0.1524
Epoch 4/50
[1m373/373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - Precision: 0.4312 - Recall: 0.0195 - loss: 0.1700 - val_Precision: 0.6203 - val_Recall: 0.0200 - val_loss: 0.1515
Epoch 5/50
[1m373/373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - Precision: 0.4415 - Recall: 0.0199 - loss: 0.1668 - val_Precision: 0.5753 - val_Recall: 0.0436 - val_los

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
# Replicated model from Blazke & Kostek
# Changes: 1 stack for multi-instrument classification
# Dropped one dense layer, to go from 64 > 32 > 20

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, BatchNormalization, ReLU, MaxPooling2D, GlobalAveragePooling2D, Dense, Input, Flatten
from tensorflow.keras.optimizers import Adam

# Set the input shape based on MFCC output dimensions
input_shape = (431, 40, 1)

# Define the model
model = Sequential()
model.add(Input(shape=input_shape))

# # First Conv2D layer with 128 filters
# model.add(Conv2D(128, (3, 3), activation='relu', padding='same'))
# model.add(BatchNormalization())
# model.add(MaxPooling2D((2, 2)))

# Second Conv2D layer with 64 filters
model.add(Conv2D(64, (3, 3), activation='relu', padding='same'))
model.add(BatchNormalization())
model.add(MaxPooling2D((2, 2)))

# Third Conv2D layer with 32 filters
model.add(Conv2D(32, (3, 3), activation='relu', padding='same'))
model.add(BatchNormalization())
model.add(MaxPooling2D((2, 2)))

# Flatten the output from convolutional layers
model.add(Flatten())

# Dense layers
# model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(20, activation='sigmoid'))  # 20 instruments for multi-label classification

# Compile the model with custom metrics
model.compile(optimizer=Adam(learning_rate=0.001), 
              loss='binary_crossentropy',
              metrics=['Precision', 'Recall'])

# Fit the model
model.fit(X_train, Y_train_binary, epochs=20, batch_size=248, validation_split=0.2)


Epoch 1/20
[1m23/49[0m [32m━━━━━━━━━[0m[37m━━━━━━━━━━━[0m [1m2:06[0m 5s/step - Precision: 0.0510 - Recall: 0.1595 - loss: 0.7242

KeyboardInterrupt: 

In [16]:
model.evaluate(X, Y_mask)

[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 10ms/step - Precision: 0.5960 - Recall: 0.0146 - loss: 0.3610


[0.35638561844825745, 0.6097777485847473, 0.01662304997444153]

In [None]:
# Select a single sample from X, for example, the first one
sample = X[0]
label = Y_mask[0]
# If necessary, reshape it to include a batch dimension
sample = tf.expand_dims(sample, axis=0)  # Shape becomes (1, 10, 128)

# Run the prediction
prediction = model.predict(sample)

# Display the output
print("Prediction:", prediction)
print("Label:", label)

# Apply a threshold to the prediction to get binary labels (e.g., threshold=0.5)
predicted_labels = (prediction > 0.5).astype(int)

print("Predicted labels:", predicted_labels)



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
Prediction: [[0.05726662 0.10583296 0.06827987 0.07933081 0.10219281 0.07009462
  0.05188092 0.10060718 0.09751446 0.09184553 0.14765938 0.08527464
  0.10122564 0.0799638  0.10158505 0.12778349 0.2181904  0.14919664
  0.10578983 0.03989973]]
Label: [False False False False  True False False  True False False False False
 False False False False  True False False False]


array([175,  13, 171,  85, 170,  33, 140, 167, 165, 201, 151,  49,  82,
       116,  77,  89, 152, 104, 147, 186,  25, 255, 152,  34, 128, 209,
       121, 111, 214, 110, 221,  50, 175,  97,  91, 205, 183,  35, 116,
       144, 250, 184, 127, 232, 108, 171, 194, 112,   0, 156, 131,  49,
        15,  32, 212,  22, 143,  67, 172, 104, 141, 241,  96, 193, 165,
       163,  85,   7, 187, 224,  88, 180, 182, 161, 137, 104,  75, 255,
       163, 255, 140, 143, 162,  77,  55,  81, 131,  98, 139, 131, 195,
       155, 180,  77,  16, 182,  12,  14, 112,  96, 255, 255,   0,  89,
         0,  20,  21, 197, 173, 255, 233, 199,  85,  27, 231, 184, 100,
       159,   0, 255, 143,   0,  43, 192, 139,  55,  82, 255], dtype=int64)

In [None]:
# np.save("../data/processed/waveforms.npy")

In [16]:
# Paths to the dataset
base_path = Path('../data/raw/openmic-2018/')
audio_path = base_path.joinpath('audio')
partition_path = base_path.joinpath('partitions')
label_path = base_path.joinpath('openmic-2018-aggregated-labels.csv')
class_map_path = base_path.joinpath('class-map.json')

In [17]:
# Load data

# Load the class map
with open(class_map_path, 'r') as f:
    class_map = json.load(f)

# Convert class map to {instrument_name: index}
instrument_to_index = {instrument: idx for instrument, idx in class_map.items()}

# Load train and test partitions
train_files = pd.read_csv(partition_path.joinpath('split01_train.csv'), header=None).iloc[:, 0].tolist()
test_files = pd.read_csv(partition_path.joinpath('split01_test.csv'), header=None).iloc[:, 0].tolist()

# Load the labels
labels_df = pd.read_csv(label_path)

# Helper function to get labels as a multi-hot encoded vector
def get_labels(file):
    instruments = labels_df[labels_df['sample_key'] == file]['instrument'].values
    label_vector = [0] * len(instrument_to_index)  # Initialize vector for multi-label
    for instrument in instruments:
        idx = instrument_to_index.get(instrument)
        if idx is not None:
            label_vector[idx] = 1  # Set the index to 1 for present instruments
    return label_vector

# Generate labels for train and test sets
train_labels = [get_labels(file) for file in train_files]
test_labels = [get_labels(file) for file in test_files]

# Check the output
print("Train labels:", train_labels[:5])
print("Test labels:", test_labels[:5])

# Load audio and preprocess function
def load_audio_and_preprocess(file_path, label):
    # Load the audio file directly
    audio, _ = librosa.load(file_path, sr=44100, mono=True)
    return audio, label  # Return audio and label

# Create dataset function
def create_dataset(file_list, labels_df, audio_path):
    # Convert file paths to strings
    audio_file_paths = [str(audio_path.joinpath(file)) for file in file_list]
    labels = [get_labels(file) for file in file_list]
    
    # Create a TensorFlow Dataset with file paths and labels
    dataset = tf.data.Dataset.from_tensor_slices((audio_file_paths, labels))
    dataset = dataset.map(load_audio_and_preprocess, num_parallel_calls=tf.data.AUTOTUNE)
    return dataset

def f1_score(y_true, y_pred):
    precision = tf.keras.metrics.Precision()
    recall = tf.keras.metrics.Recall()
    
    precision.update_state(y_true, y_pred)
    recall.update_state(y_true, y_pred)
    
    p = precision.result()
    r = recall.result()
    
    f1 = 2 * (p * r) / (p + r + tf.keras.backend.epsilon())
    return f1


Train labels: [[0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]]
Test labels: [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]]


In [None]:
# Build the model

model = Sequential([
    get_melspectrogram_layer(input_shape=(44100, 1), sample_rate=44100, n_fft=2048, hop_length=1024, n_mels=128),  # Kapre layer
    BatchNormalization(),
    Conv2D(32, (3, 3), activation='relu'),
    GlobalAveragePooling2D(),
    Dense(20, activation='sigmoid')  # Multi-label for 20 instruments
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['Precision', 'Recall', f1_score])

# Print the model summary
model.summary()


In [22]:
from sklearn.metrics import classification_report
import numpy as np

# Train the model
train_dataset = create_dataset(train_files, labels_df, audio_path).batch(32)
test_dataset = create_dataset(test_files, labels_df, audio_path).batch(32)

# Fit the model
history = model.fit(train_dataset, validation_data=test_dataset, epochs=10)

# Evaluate the model on the test set
y_true, y_pred = [], []

for audio, labels in test_dataset:
    predictions = model.predict(audio)
    y_true.extend(labels.numpy())
    y_pred.extend((predictions > 0.5).astype(int))  # Threshold to get binary predictions

# Convert to NumPy arrays for classification report
y_true = np.array(y_true)
y_pred = np.array(y_pred)

# Print overall performance metrics
print("Overall Performance Metrics:")
print(classification_report(y_true, y_pred, target_names=[f'Instrument {i+1}' for i in range(15)], zero_division=0))

# Example of the output:
# precision, recall, f1-score for each instrument and averages across all.


TypeError: in user code:

    File "C:\Users\jeffr\AppData\Local\Temp\ipykernel_2228\71523560.py", line 38, in load_audio_and_preprocess  *
        audio, _ = librosa.load(file_path, sr=44100, mono=True)
    File "C:\Users\jeffr\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\librosa\core\audio.py", line 176, in load  *
        y, sr_native = __soundfile_load(path, offset, duration, dtype)
    File "C:\Users\jeffr\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\librosa\core\audio.py", line 209, in __soundfile_load  *
        context = sf.SoundFile(path)
    File "C:\Users\jeffr\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\soundfile.py", line 658, in __init__  **
        self._file = self._open(file, mode_int, closefd)
    File "C:\Users\jeffr\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\soundfile.py", line 1212, in _open
        raise TypeError("Invalid file: {0!r}".format(self.name))

    TypeError: Invalid file: <tf.Tensor 'args_0:0' shape=() dtype=string>
