In [None]:
import os
import pandas as pd
import librosa
import numpy as np
from tqdm import tqdm

# Read the per-file metadata table that drives dataset construction
metadata_file = "data/generated/processed_audio_metadata.csv"
metadata_df = pd.read_csv(metadata_file)

# Accumulators for the dataset: clean spectrograms, processed spectrograms,
# and the label vector belonging to each pair
X_clean_specs, X_proc_specs, Y_labels = [], [], []

# Mel spectrogram settings (should match model input requirements)
max_frames = 256     # fixed length of the time axis after padding/truncation
hop_length = 512     # hop between successive analysis frames, in samples
frame_length = 2048  # FFT window size (passed to librosa as n_fft)
n_mels = 128         # number of mel bands
sample_rate = 22050  # rate every file is resampled to on load

# Function to ensure all spectrograms have a fixed size
def fix_spectrogram_shape(S_db, max_frames, pad_value=0.0):
    """Pad or truncate a spectrogram along axis 1 to exactly ``max_frames``.

    Args:
        S_db: Array-like with at least two dimensions; axis 1 is treated as
            the time (frame) axis. Any further axes are left untouched.
        max_frames: Target number of frames; must be non-negative.
        pad_value: Constant written into the frames appended to short inputs.
            Defaults to 0.0, which matches the historical behaviour. For dB
            spectrograms scaled so that 0 dB is the peak, a floor value such
            as ``S_db.min()`` represents silence more faithfully.

    Returns:
        An array whose axis 1 has length ``max_frames``. Short inputs are
        right-padded with ``pad_value``; long inputs are truncated (the
        result is then a slice of the input array, not a copy).

    Raises:
        ValueError: If ``S_db`` has fewer than two dimensions or
            ``max_frames`` is negative.
    """
    S_db = np.asarray(S_db)
    if S_db.ndim < 2:
        raise ValueError(
            f"S_db must have at least 2 dimensions, got {S_db.ndim}"
        )
    # A negative target would otherwise slice frames off the end silently.
    if max_frames < 0:
        raise ValueError(f"max_frames must be non-negative, got {max_frames}")

    num_frames = S_db.shape[1]
    if num_frames < max_frames:
        # Pad only the end of the time axis; leave every other axis alone so
        # inputs with extra trailing axes are handled like the truncate path.
        pad_width = [(0, 0)] * S_db.ndim
        pad_width[1] = (0, max_frames - num_frames)
        return np.pad(S_db, pad_width, mode='constant',
                      constant_values=pad_value)
    return S_db[:, :max_frames]

# Build the dataset: for every metadata row, load the clean/processed audio
# pair, convert both to fixed-size log-mel spectrograms and record the effect
# levels as the label vector.
#
# The progress bar is used as a context manager so it is closed even if the
# loop is interrupted, and it advances once per row whether or not the row
# was usable, so it always reaches its total.
#
# NOTE(review): fix_spectrogram_shape pads short clips with 0 by default.
# After power_to_db(ref=np.max) a value of 0 dB is the peak level, not
# silence -- confirm this padding is what the model expects.
with tqdm(total=len(metadata_df), desc="Loading Audio Dataset", unit="file") as progress_bar:
    for index, row in metadata_df.iterrows():
        clean_file = row["clean_file"]
        processed_file = row["processed_file"]

        try:
            # Load clean and processed audio, resampled to a common rate
            y_clean, sr = librosa.load(clean_file, sr=sample_rate)
            y_proc, sr = librosa.load(processed_file, sr=sample_rate)

            # Compute mel spectrograms in dB, fix the time dimension and
            # add a trailing channel dimension
            S_clean = librosa.feature.melspectrogram(y=y_clean, sr=sr,
                                                     n_mels=n_mels, n_fft=frame_length, hop_length=hop_length)
            S_clean_db = librosa.power_to_db(S_clean, ref=np.max)
            S_clean_db = fix_spectrogram_shape(S_clean_db, max_frames)[..., np.newaxis]

            S_proc = librosa.feature.melspectrogram(y=y_proc, sr=sr,
                                                    n_mels=n_mels, n_fft=frame_length, hop_length=hop_length)
            S_proc_db = librosa.power_to_db(S_proc, ref=np.max)
            S_proc_db = fix_spectrogram_shape(S_proc_db, max_frames)[..., np.newaxis]

            # Effect levels used as labels; read them before anything is
            # stored so a failure here cannot leave a half-recorded sample
            effect_vector = [
                row["distortion_level"],
                row["reverb_level"],
                row["chorus_level"],
                row["echo_level"]
            ]
        except Exception as e:
            # Best-effort loading: report the failure and move on to the next
            # row. tqdm.write keeps the message from garbling the bar.
            tqdm.write(f"Skipping file {processed_file}: {e}")
        else:
            # Store inputs and labels together so the three lists always stay
            # index-aligned
            X_clean_specs.append(S_clean_db)
            X_proc_specs.append(S_proc_db)
            Y_labels.append(effect_vector)
        finally:
            # Count skipped rows as processed too
            progress_bar.update(1)

# Convert lists to NumPy arrays for model training
X_clean_specs = np.array(X_clean_specs)
X_proc_specs = np.array(X_proc_specs)
Y_labels = np.array(Y_labels)  # Now contains effect levels as a continuous vector

print(f"Dataset loaded: {X_clean_specs.shape[0]} samples.")
print(f"X_clean_specs shape: {X_clean_specs.shape}")
print(f"X_proc_specs shape: {X_proc_specs.shape}")
print(f"Y_labels shape: {Y_labels.shape}")


Loading Audio Dataset:   0%|          | 0/5760 [00:00<?, ?file/s]