In [1]:
# Load the saved dataset
import pandas as pd
import librosa
import numpy as np

# Load metadata CSV
metadata_file = "data/generated/processed_audio_metadata.csv"
metadata_df = pd.read_csv(metadata_file)
print(metadata_df)

                                       clean_file  \
0       data/guitarset/03_Rock2-85-F_comp_mix.wav   
1       data/guitarset/03_Rock2-85-F_comp_mix.wav   
2       data/guitarset/03_Rock2-85-F_comp_mix.wav   
3       data/guitarset/03_Rock2-85-F_comp_mix.wav   
4       data/guitarset/03_Rock2-85-F_comp_mix.wav   
...                                           ...   
2155  data/guitarset/04_Funk1-114-Ab_solo_mix.wav   
2156  data/guitarset/04_Funk1-114-Ab_solo_mix.wav   
2157  data/guitarset/04_Funk1-114-Ab_solo_mix.wav   
2158  data/guitarset/04_Funk1-114-Ab_solo_mix.wav   
2159  data/guitarset/04_Funk1-114-Ab_solo_mix.wav   

                                         processed_file             effect  
0     data/generated/03_Rock2-85-F_comp_mix_distorti...         distortion  
1      data/generated/03_Rock2-85-F_comp_mix_reverb.wav             reverb  
2       data/generated/03_Rock2-85-F_comp_mix_delay.wav              delay  
3      data/generated/03_Rock2-85-F_comp_mix_chorus.wav

In [9]:
# Define the fixed number of time frames for consistency
# For guitarset dataset, it is around 1000 frames
max_frames = 256

# Padding to match the shape
def fix_spectrogram_shape(S_db, max_frames):
    """Pads or truncates spectrogram to ensure fixed shape."""
    num_frames = S_db.shape[1]
    
    if num_frames < max_frames:
        # Pad with zeros to match max_frames
        pad_width = max_frames - num_frames
        S_db = np.pad(S_db, ((0, 0), (0, pad_width)), mode='constant')
    else:
        # Truncate to max_frames
        S_db = S_db[:, :max_frames]
    
    return S_db

In [10]:
from tqdm import tqdm

total_iterations = metadata_df.shape[0]
progress_bar = tqdm(total=total_iterations, desc="Processing Audio", unit="file")

X_clean_specs = []
X_proc_specs = []
Y_labels = []

# Mel spectrogram parameters (should be same as used during saving)
sample_rate = 22050
n_mels = 128
frame_length = 2048
hop_length = 512

# Define effect labels
effects_list = ["distortion", "reverb", "delay", "chorus"]
num_effects = len(effects_list)

# Iterate through the file to generate spectrograms
for index, row in metadata_df.iterrows():
    clean_file = row["clean_file"]
    processed_file = row["processed_file"]
    effect_label = row["effect"]
    
    # Load clean audio
    y_clean, sr = librosa.load(clean_file, sr=sample_rate)
    
    # Load processed audio
    y_proc, sr = librosa.load(processed_file, sr=sample_rate)
    
    # Compute mel spectrograms
    S_clean = librosa.feature.melspectrogram(y=y_clean, sr=sr, 
                                             n_mels=n_mels, n_fft=frame_length, hop_length=hop_length)
    S_clean_db = librosa.power_to_db(S_clean, ref=np.max)
    S_clean_db = (S_clean_db - S_clean_db.min()) / (S_clean_db.max() - S_clean_db.min() + 1e-6)
    S_clean_db = fix_spectrogram_shape(S_clean_db, max_frames)
    S_clean_db = S_clean_db[..., np.newaxis]  # Add channel dimension

    S_proc = librosa.feature.melspectrogram(y=y_proc, sr=sr, 
                                            n_mels=n_mels, n_fft=frame_length, hop_length=hop_length)
    S_proc_db = librosa.power_to_db(S_proc, ref=np.max)
    S_proc_db = (S_proc_db - S_proc_db.min()) / (S_proc_db.max() - S_proc_db.min() + 1e-6)
    S_proc_db = fix_spectrogram_shape(S_proc_db, max_frames)
    S_proc_db = S_proc_db[..., np.newaxis]  # Add channel dimension

    # Append to dataset
    X_clean_specs.append(S_clean_db)
    X_proc_specs.append(S_proc_db)

    # Create multi-label effect vector
    label_vec = np.zeros(num_effects, dtype=np.float32)
    for effect_name in effect_label.split("_"):  # Handles combo effects
        if effect_name in effects_list:
            idx = effects_list.index(effect_name)
            label_vec[idx] = 1.0
    
    Y_labels.append(label_vec)

    progress_bar.update(1)

progress_bar.close()

print(X_clean_specs)
print(X_proc_specs)
print(Y_labels)

Processing Audio:   9%|▊         | 184/2160 [01:11<12:50,  2.56file/s]
Processing Audio: 100%|██████████| 2160/2160 [01:16<00:00, 28.29file/s]


[array([[[0.59716696],
        [0.62160623],
        [0.66681844],
        ...,
        [0.17804737],
        [0.20327544],
        [0.22279568]],

       [[0.57165   ],
        [0.64157236],
        [0.67333823],
        ...,
        [0.21529508],
        [0.19291535],
        [0.15200338]],

       [[0.5776198 ],
        [0.57497823],
        [0.6303071 ],
        ...,
        [0.3765923 ],
        [0.37634453],
        [0.34541473]],

       ...,

       [[0.0983635 ],
        [0.13843842],
        [0.13701573],
        ...,
        [0.        ],
        [0.        ],
        [0.        ]],

       [[0.01354637],
        [0.04766903],
        [0.03524341],
        ...,
        [0.        ],
        [0.        ],
        [0.        ]],

       [[0.        ],
        [0.        ],
        [0.        ],
        ...,
        [0.        ],
        [0.        ],
        [0.        ]]], dtype=float32), array([[[0.59716696],
        [0.62160623],
        [0.66681844],
        ...,
        [

In [11]:
X_clean_specs = np.array(X_clean_specs)
X_proc_specs = np.array(X_proc_specs)
Y_labels = np.array(Y_labels)

print(f"Loaded {X_clean_specs.shape[0]} samples.")
print(f"X_clean_specs shape: {X_clean_specs.shape}")
print(f"X_proc_specs shape: {X_proc_specs.shape}")
print(f"Y_labels shape: {Y_labels.shape}")

Loaded 2160 samples.
X_clean_specs shape: (2160, 128, 256, 1)
X_proc_specs shape: (2160, 128, 256, 1)
Y_labels shape: (2160, 4)


In [None]:
# Split into train/val/test
from sklearn.model_selection import train_test_split
Xc_train, Xc_val, Xp_train, Xp_val, y_train, y_val = train_test_split(
    X_clean_specs, X_proc_specs, Y_labels, test_size=0.2, random_state=42)

# Train the model
history = model.fit(
    {"clean_input": Xc_train, "processed_input": Xp_train}, y_train,
    validation_data=({"clean_input": Xc_val, "processed_input": Xp_val}, y_val),
    epochs=20, batch_size=16
)

# After training, evaluate on a test set (not shown: you'd similarly create Xc_test, Xp_test, y_test)
test_loss, test_acc = model.evaluate({"clean_input": Xc_test, "processed_input": Xp_test}, y_test)
print("Test accuracy:", test_acc)