In [7]:
from pathlib import Path
from torch.utils.data import Dataset
import torchaudio
import librosa 
import numpy as np
import wave

In [8]:
# 

In [None]:
def preprocess_audio_librosa(
    wav_path: str,
    target_sr: int = 16000,
    clip_seconds: float = 4.0,
    n_fft: int = 512,
    hop_length: int = 160,
    win_length: int | None = 400,
    center: bool = True,
    eps: float = 1e-8,
):
    """Convert the start of an audio file into a normalized log-STFT "image".

    The file is read with ``librosa.load`` (mono, ``sr=target_sr``), limited to
    the first ``clip_seconds`` seconds, then zero-padded or trimmed to a fixed
    sample count so every call with the same settings yields the same shape.

    Args:
        wav_path: Path of the audio file to read.
        target_sr: Sample rate passed to ``librosa.load``.
        clip_seconds: Length of the clip taken from the start of the file;
            shorter recordings are right-padded with zeros.
        n_fft: FFT size handed to ``librosa.stft``.
        hop_length: Hop between successive STFT frames, in samples.
        win_length: Analysis window length handed to ``librosa.stft``.
        center: Forwarded to ``librosa.stft`` as its ``center`` flag.
        eps: Added to the magnitude before the logarithm to avoid ``log(0)``.

    Returns:
        A ``float32`` array of shape ``(1, F, T)`` with ``F = n_fft // 2 + 1``,
        holding the log-magnitude spectrogram standardized with its own mean
        and standard deviation.
    """
    # Read at most `clip_seconds` of audio from the beginning of the file.
    waveform, rate = librosa.load(
        wav_path,
        sr=target_sr,
        mono=True,
        offset=0.0,
        duration=clip_seconds,
        dtype=np.float32,
    )

    # librosa does not pad short files, so force the fixed length here.
    n_samples = int(rate * clip_seconds)
    shortfall = n_samples - waveform.shape[0]
    if shortfall > 0:
        waveform = np.pad(waveform, (0, shortfall), mode="constant")
    elif shortfall < 0:
        # Not expected given `duration=clip_seconds`; kept as a safety net.
        waveform = waveform[:n_samples]

    # Complex STFT -> magnitude -> natural log.
    spectrum = librosa.stft(
        waveform,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        window="hann",
        center=center,
    )
    log_spec = np.log(np.abs(spectrum) + eps)

    # Standardize each sample by its own statistics; the small constant keeps
    # the division finite when the spectrogram is constant.
    offset = log_spec.mean()
    scale = log_spec.std() + 1e-6
    standardized = (log_spec - offset) / scale

    # Prepend a singleton channel axis for 2D CNN input.
    return standardized[np.newaxis, :, :].astype(np.float32)