In [51]:
import os
from pathlib import Path
import re
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np



# Define instruments (IRMAS has 11 categories)
INSTRUMENTS = ['cello', 'clarinere', 'flauta', 'guitara acustica', 'guitara electrica', 'órgano', 'piano', 'saxofon', 'trompeta', 'violin', 'voz', 'tambores']

In [4]:
import tensorflow as tf

def load_dataset(data_dir):
    spectrograms = []
    labels = []
    for file in os.listdir(data_dir):
        if file.endswith('.png'):
            spectrograms.append(os.path.join(data_dir, file))
            label_file = os.path.join(data_dir, f"{Path(file).stem}.npy")
            labels.append(np.load(label_file))
    return spectrograms, labels


# Convert to TensorFlow dataset
def data_generator(spectrogram_paths, labels):
    for path, label in zip(spectrogram_paths, labels):
        # Load image and label
        image = tf.image.decode_png(tf.io.read_file(path), channels=3)
        image = tf.image.resize(image, (128, 128)) / 255.0
        yield image, tf.convert_to_tensor(label, dtype=tf.float32)

spectrograms, labels = load_dataset("procesado_train")  

train_ds = tf.data.Dataset.from_generator(
    lambda: data_generator(spectrograms, labels),
    output_signature=(
        tf.TensorSpec(shape=(128, 128, 3), dtype=tf.float32),
        tf.TensorSpec(shape=(len(INSTRUMENTS) + 1,), dtype=tf.float32)  # +1 for drum label
    )
).batch(32).repeat().prefetch(tf.data.AUTOTUNE)

spectrograms, labels = load_dataset("procesado_valid")  

valid_ds = tf.data.Dataset.from_generator(
    lambda: data_generator(spectrograms, labels),
    output_signature=(
        tf.TensorSpec(shape=(128, 128, 3), dtype=tf.float32),
        tf.TensorSpec(shape=(len(INSTRUMENTS) + 1,), dtype=tf.float32)  # +1 for drum label
    )
).batch(32).prefetch(tf.data.AUTOTUNE)


In [5]:
@tf.function
def f1_score(y_true, y_pred):
    # Convert probabilities to binary predictions
    y_pred_binary = tf.cast(tf.greater(y_pred, 0.5), tf.float32)
   
    # Compute precision and recall manually to avoid state-related issues
    true_positives = tf.reduce_sum(y_true * y_pred_binary)
    predicted_positives = tf.reduce_sum(y_pred_binary)
    actual_positives = tf.reduce_sum(y_true)
    
    # Calculate precision
    precision = true_positives / (predicted_positives + tf.keras.backend.epsilon())
    
    # Calculate recall
    recall = true_positives / (actual_positives + tf.keras.backend.epsilon())
    
    # Compute F1 Score
    f1_score = 2 * precision * recall / (precision + recall + tf.keras.backend.epsilon())
   
    return f1_score


In [6]:
model = tf.keras.Sequential([
    tf.keras.layers.Conv2D(32, (3, 3), activation='relu', input_shape=(128, 128, 3)),
    tf.keras.layers.MaxPooling2D((2, 2)),
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D((2, 2)),
    tf.keras.layers.Conv2D(128, (3, 3), activation='relu'),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(len(INSTRUMENTS) + 1, activation='sigmoid')  # +1 for drum label
])

model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(),  # Multi-label loss
              metrics=[f1_score])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [9]:
print(len(train_ds))

TypeError: The dataset is infinite.

In [11]:
from math import ceil

spe = ceil(6705/32)
print(spe)
history = model.fit(train_ds, validation_data=valid_ds,steps_per_epoch=spe, epochs=10)

210
Epoch 1/10
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 434ms/step - f1_score: 0.0985 - loss: 0.3763 - val_f1_score: 0.3122 - val_loss: 0.2910
Epoch 2/10




[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m88s[0m 421ms/step - f1_score: 0.3425 - loss: 0.3018 - val_f1_score: 0.5527 - val_loss: 0.2474
Epoch 3/10
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 425ms/step - f1_score: 0.5163 - loss: 0.2479 - val_f1_score: 0.6260 - val_loss: 0.2006
Epoch 4/10
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 443ms/step - f1_score: 0.6625 - loss: 0.1924 - val_f1_score: 0.7926 - val_loss: 0.1224
Epoch 5/10
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 416ms/step - f1_score: 0.8176 - loss: 0.1146 - val_f1_score: 0.9068 - val_loss: 0.0414
Epoch 6/10
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 416ms/step - f1_score: 0.9558 - loss: 0.0371 - val_f1_score: 0.9512 - val_loss: 0.0113
Epoch 7/10
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 414ms/step - f1_score: 0.9827 - loss: 0.0152 - val_f1_score: 0.9471 - val_loss: 0.0136
Epoch 8/10
[1m210/21

In [13]:
print("GPUs available: ", tf.config.list_physical_devices('GPU'))

GPUs available:  []


In [12]:
spectrograms, labels = load_dataset("procesado_test")  

test_ds = tf.data.Dataset.from_generator(
    lambda: data_generator(spectrograms, labels),
    output_signature=(
        tf.TensorSpec(shape=(128, 128, 3), dtype=tf.float32),
        tf.TensorSpec(shape=(len(INSTRUMENTS) + 1,), dtype=tf.float32)  # +1 for drum label
    )
).batch(32).prefetch(tf.data.AUTOTUNE)

In [68]:
test_loss, test_accuracy = model.evaluate(test_ds)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy*100}%")

[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 127ms/step - f1_score: 0.4056 - loss: 1.1587
Test Loss: 1.3777152299880981
Test Accuracy: 35.06467342376709%


In [16]:
model.save("filepath=Modelo1.keras")

In [47]:
# Convert to TensorFlow dataset
def single_wav(wav_path, sr=22050, n_fft=2048, hop_length=512):
    
    # Load audio file
    y, sr = librosa.load(wav_path, sr=sr)
    
    # Compute Mel spectrogram
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=n_fft, hop_length=hop_length)
    S_dB = librosa.power_to_db(S, ref=np.max)
    
    # Save as image
    save_path = "tests/spectograms/" + os.path.splitext(wav_path[11:])[0] + ".png"
    plt.figure(figsize=(4, 4))
    librosa.display.specshow(S_dB, sr=sr, hop_length=hop_length, x_axis='time', y_axis='mel')
    plt.axis('off')
    plt.savefig(save_path, bbox_inches='tight', pad_inches=0)
    plt.close()
    
    image = tf.image.decode_png(tf.io.read_file(save_path), channels=3)
    image = tf.image.resize(image, (128, 128)) / 255.0
    image = tf.expand_dims(image, axis = 0)
    return image

def single_spectogram(spectrogram_path):
    image = tf.image.decode_png(tf.io.read_file(spectrogram_path), channels=3)
    image = tf.image.resize(image, (128, 128)) / 255.0
    image = tf.expand_dims(image, axis = 0)
    return image

In [101]:
# Path to a test WAV file
file_path = "procesado_test/00 - gold fronts-10.png"

# Preprocess the file
input_data = single_spectogram(file_path)

# Get predictions
predictions = model.predict(input_data)

# Output predictions
print(predictions)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[[3.8157744e-08 8.6779464e-09 3.5542928e-07 7.0441933e-03 9.9747986e-01
  6.3722888e-05 3.1407146e-09 6.7306075e-08 1.9849665e-08 6.9293094e-07
  7.6669294e-01 9.9907682e-17]]


In [108]:
predicted_class = np.argmax(predictions)

predicted_label = INSTRUMENTS[predicted_class]
print(f"Instrumento de mayor predominancia: {predicted_label}")

threshold = 0.8
multi_labels = [INSTRUMENTS[i] for i, prob in enumerate(predictions[0]) if prob > threshold]
print(f"Instrumentos detectados: {multi_labels}")

Instrumento de mayor predominancia: guitara electrica
Instrumentos detectados: ['guitara electrica']
