In [1]:
import os
import numpy as np
import pandas as pd
import librosa
import librosa.display
import matplotlib.pyplot as plt
from tqdm import tqdm

In [6]:
import os
import numpy as np
import pandas as pd
import librosa
from tqdm import tqdm

# --- PATCH for librosa compatibility ---
np.complex = complex

# === CONFIGURATION ===
metadata_path = "data/metadata.csv"
sample_rate = 44100
duration = 2.0
max_len = int(sample_rate * duration)
n_mels = 128
output_file = "idmt_mel_dataset.npz"

# === EFFECT MAPPING (10 effects) ===
effect_ids = {
    "21": 0, "22": 1, "23": 2,
    "31": 3, "32": 4, "33": 5, "34": 6, "35": 7,
    "41": 8, "42": 9
}

# === LOAD METADATA ===
metadata = pd.read_csv(metadata_path)

# === COLLECT DATA ===
X = []
y = []

for _, row in tqdm(metadata.iterrows(), total=len(metadata), desc="Processing audio"):
    file_path = row["filename"]
    effect_id = str(row["effect_id"]).zfill(2)
    effect_setting = int(row["effect_setting"])

    # Skip unknown effects (i.e. not in list and not 11 or 12)
    if effect_id not in effect_ids and effect_id not in ["11", "12"]:
        continue

    if not os.path.exists(file_path):
        print(f"Missing file: {file_path}")
        continue

    try:
        # --- LOAD AUDIO ---
        audio, _ = librosa.load(file_path, sr=sample_rate, mono=True)

        # Pad or trim to fixed length
        if len(audio) < max_len:
            audio = np.pad(audio, (0, max_len - len(audio)))
        else:
            audio = audio[:max_len]

        # --- MEL-SPECTROGRAM ---
        mel = librosa.feature.melspectrogram(y=audio, sr=sample_rate, n_mels=n_mels)
        mel_db = librosa.power_to_db(mel, ref=np.max)

        # Normalize to [0, 1] (Disregard the effect of volume)
        mel_db_norm = (mel_db - mel_db.min()) / (mel_db.max() - mel_db.min() + 1e-6)
        mel_db_norm = mel_db_norm[..., np.newaxis]  # Add channel dim

        # --- EFFECT STRENGTH VECTOR ---
        strength_vector = np.zeros(10, dtype=np.float32)

        if effect_id in effect_ids:
            normalized_strength = (effect_setting - 1) / 3.0  # 1 → 0.33, 2 → 0.66, 3 → 1.0
            strength_vector[effect_ids[effect_id]] = normalized_strength
        # else: leave vector as all zeros for effect_id "11" and "12"

        X.append(mel_db_norm)
        y.append(strength_vector)

    except Exception as e:
        print(f"Error processing {file_path}: {e}")

# === CONVERT AND SAVE ===
X = np.array(X)
y = np.array(y)
np.savez_compressed(output_file, X=X, y=y)

print("✅ Dataset saved to:", output_file)
print("X shape:", X.shape)
print("y shape:", y.shape)


Processing audio:   0%|          | 12/34452 [00:00<04:52, 117.58it/s]

Processing audio: 100%|██████████| 34452/34452 [04:42<00:00, 122.16it/s]


✅ Dataset saved to: idmt_mel_dataset.npz
X shape: (34452, 128, 173, 1)
y shape: (34452, 10)


In [2]:
import numpy as np

# Load the file (1.1GB file, takes about 13 secs)
data = np.load("idmt_mel_dataset.npz")

# Extract features and labels
X = data['X']  # Mel-spectrograms
y = data['y']  # 10-dimensional effect strength vectors

# Check shapes
print("X shape:", X.shape)  # e.g. (N, 128, T, 1)
print("y shape:", y.shape)  # e.g. (N, 10)


X shape: (34452, 128, 173, 1)
y shape: (34452, 10)


In [3]:
# Define keras model
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Dropout

input_shape = X.shape[1:]  # (128, T, 1)
num_outputs = y.shape[1]   # 10-dimensional effect strength vector

inputs = Input(shape=input_shape)

x = Conv2D(32, (3, 3), activation='relu', padding='same')(inputs)
x = MaxPooling2D((2, 2))(x)
x = Conv2D(64, (3, 3), activation='relu', padding='same')(x)
x = MaxPooling2D((2, 2))(x)
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
x = Dropout(0.3)(x)

# Output layer for regression
outputs = Dense(num_outputs, activation='sigmoid')(x)

model = Model(inputs, outputs)

model.compile(optimizer='adam', loss='mse', metrics=['mae'])
model.summary()


2025-03-23 06:17:09.579664: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [4]:
from sklearn.model_selection import train_test_split

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=30,
    batch_size=32
)

Epoch 1/30


2025-03-23 06:19:15.492281: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 2441243136 exceeds 10% of free system memory.


[1m862/862[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 185ms/step - loss: 0.0190 - mae: 0.0354

2025-03-23 06:21:57.814025: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 610377216 exceeds 10% of free system memory.


[1m862/862[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m171s[0m 198ms/step - loss: 0.0190 - mae: 0.0354 - val_loss: 0.0168 - val_mae: 0.0303
Epoch 2/30
[1m862/862[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m168s[0m 195ms/step - loss: 0.0169 - mae: 0.0304 - val_loss: 0.0168 - val_mae: 0.0303
Epoch 3/30
[1m862/862[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m176s[0m 204ms/step - loss: 0.0167 - mae: 0.0301 - val_loss: 0.0168 - val_mae: 0.0303
Epoch 4/30
[1m183/862[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m2:19[0m 205ms/step - loss: 0.0173 - mae: 0.0310

KeyboardInterrupt: 