In [None]:
import torchaudio
import librosa
import numpy as np
import matplotlib.pyplot as plt
from datasets import load_dataset


# Clean-speech corpus: LibriSpeech, "clean" configuration, "train.100" split.
librispeech = load_dataset("librispeech_asr", "clean", split="train.100")
# Read the first row once and take both fields from it. Indexing the dataset
# again for the second field would rebuild the row (and presumably decode the
# audio a second time — confirm for the installed `datasets` version).
_speech_audio = librispeech[0]["audio"]
sample_speech = _speech_audio["array"]
sr_speech = _speech_audio["sampling_rate"]


# Noise corpus: UrbanSound8K, "train" split.
urbansound = load_dataset("danavery/urban-sound-8K", split="train")
_noise_audio = urbansound[0]["audio"]
sample_noise = _noise_audio["array"]
sr_noise = _noise_audio["sampling_rate"]

def plot_audio(audio, sr, title):
    """Show the waveform and a log-frequency dB spectrogram of an audio signal.

    Two separate matplotlib figures are created and displayed with
    ``plt.show()``; nothing is returned.

    Args:
        audio: Audio samples, passed to ``librosa.display.waveshow`` and
            ``librosa.stft``. Presumably a 1-D float NumPy array — confirm
            against callers.
        sr: Sampling rate of ``audio``, forwarded to the librosa display calls.
        title: Title of the waveform figure. The spectrogram figure is titled
            ``"Spectrogram: <title>"``.
    """
    # `import librosa` alone does not guarantee that the `display` submodule is
    # loaded (librosa releases before 0.10 require importing it explicitly),
    # so load it here to avoid an AttributeError on `librosa.display`.
    import librosa.display

    # Figure 1: time-domain waveform.
    plt.figure(figsize=(10, 3))
    plt.title(title)
    librosa.display.waveshow(audio, sr=sr)
    plt.show()

    # Figure 2: STFT magnitude converted to dB, using the largest magnitude as
    # the reference level (ref=np.max).
    stft = librosa.stft(audio)
    spec_db = librosa.amplitude_to_db(np.abs(stft), ref=np.max)
    plt.figure(figsize=(10, 3))
    librosa.display.specshow(spec_db, sr=sr, x_axis='time', y_axis='log')
    plt.colorbar(format='%+2.0f dB')
    plt.title(f"Spectrogram: {title}")
    plt.show()

# Visualise one example from each corpus: the clean utterance, then the noise clip.
plot_audio(audio=sample_speech, sr=sr_speech, title="Clean Speech (LibriSpeech)")
plot_audio(audio=sample_noise, sr=sr_noise, title="Noise (UrbanSound8K)")

Downloading data:   0%|          | 0.00/6.39G [00:00<?, ?B/s]

---
# <center>Preprocess & Mix Signals</center>


In [None]:
def mix_noise(clean, noise, sr_clean, sr_noise, target_snr_db):
    """Add noise to a clean signal at a requested signal-to-noise ratio.

    The noise is resampled to the clean signal's rate when the rates differ,
    looped or truncated to the clean signal's length, scaled so that
    ``10 * log10(P_clean / P_noise) == target_snr_db`` (``P`` being the mean
    squared amplitude), and added sample-wise to the clean signal.

    Args:
        clean: Clean signal as a NumPy array.
        noise: Noise signal as a NumPy array; must not be empty.
        sr_clean: Sampling rate of ``clean``.
        sr_noise: Sampling rate of ``noise``.
        target_snr_db: Desired SNR of the mixture in decibels.

    Returns:
        tuple: ``(noisy, clean)`` where ``noisy`` is the mixture and ``clean``
        is the input clean signal, returned unchanged.

    Raises:
        ValueError: If ``noise`` contains no samples.

    Notes:
        When the length-matched noise segment is completely silent it cannot
        be scaled to any finite SNR. In that case a ``RuntimeWarning`` is
        issued and a copy of the clean signal is returned as ``noisy``,
        instead of an array of NaNs.
    """
    import warnings

    # An empty noise clip would otherwise fail with a ZeroDivisionError in the
    # tiling arithmetic below; fail early with a clear message instead.
    if len(noise) == 0:
        raise ValueError("noise must contain at least one sample")

    # Resample noise to match the clean audio's sample rate.
    if sr_noise != sr_clean:
        noise = librosa.resample(noise, orig_sr=sr_noise, target_sr=sr_clean)

    # Loop the noise when it is shorter than the clean audio, then trim it to
    # exactly the clean audio's length.
    if len(noise) < len(clean):
        noise = np.tile(noise, (len(clean) // len(noise)) + 1)
    noise = noise[:len(clean)]

    # Mean power of both signals.
    power_clean = np.mean(clean ** 2)
    power_noise = np.mean(noise ** 2)

    # A silent noise segment has zero power; dividing by it would yield
    # inf/NaN samples in the mixture.
    if power_noise == 0:
        warnings.warn(
            "noise segment is silent; returning the clean signal unmixed",
            RuntimeWarning,
            stacklevel=2,
        )
        return np.array(clean, copy=True), clean

    # Noise power needed for the target SNR, and the amplitude gain to reach it.
    target_power_noise = power_clean / (10 ** (target_snr_db / 10))
    scaled_noise = np.sqrt(target_power_noise / power_noise) * noise

    # Mix clean speech with the scaled noise.
    noisy = clean + scaled_noise
    return noisy, clean

# Demo mix: blend the sample utterance with the sample noise clip at an SNR of
# 5 dB (a high-noise condition), then visualise the mixture.
noisy_audio, clean_audio = mix_noise(
    sample_speech, sample_noise, sr_speech, sr_noise, target_snr_db=5
)

plot_audio(audio=noisy_audio, sr=sr_speech, title="Noisy Speech (SNR=5dB)")

---
# <center>Generate Synthetic Dataset</center>

In [None]:
import soundfile as sf
import os

def generate_dataset(librispeech, urbansound, output_dir, num_samples=100, snr_levels=(0, 5, 10), seed=None):
    """Write paired clean/noisy WAV files for a range of SNR levels.

    For each of the first ``num_samples`` utterances in ``librispeech`` one
    noise clip is drawn at random from ``urbansound`` and mixed with the
    utterance at every level in ``snr_levels`` via ``mix_noise``. Each mixture
    is saved as ``noisy_snr{snr}_sample{i}.wav`` next to its reference
    ``clean_snr{snr}_sample{i}.wav`` inside ``output_dir``.

    Args:
        librispeech: Indexable speech dataset whose rows expose
            ``row["audio"]["array"]`` and ``row["audio"]["sampling_rate"]``.
        urbansound: Indexable noise dataset with the same row layout.
        output_dir: Directory for the WAV files; created if missing.
        num_samples: Maximum number of utterances to process. Capped at
            ``len(librispeech)`` so a small dataset does not raise IndexError.
        snr_levels: Iterable of target SNRs in dB.
        seed: Optional integer seed for the noise selection. ``None`` (the
            default) draws from NumPy's global random state, as before.
    """
    os.makedirs(output_dir, exist_ok=True)

    # A dedicated generator makes the noise choice reproducible when a seed is
    # given; without one, keep using the global NumPy random state.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Never index past the end of the speech dataset.
    num_samples = min(num_samples, len(librispeech))

    for i in range(num_samples):
        # Clean speech: look the row up once and read both fields from it.
        speech_audio = librispeech[i]["audio"]
        speech = speech_audio["array"]
        sr_speech = speech_audio["sampling_rate"]

        # Randomly select one noise clip, shared by all SNR levels of this sample.
        noise_idx = rng.randint(0, len(urbansound))
        noise_audio = urbansound[noise_idx]["audio"]
        noise = noise_audio["array"]
        sr_noise = noise_audio["sampling_rate"]

        for snr in snr_levels:
            # Mix signals at the requested SNR.
            noisy, clean = mix_noise(speech, noise, sr_speech, sr_noise, snr)

            # Save the pair at the speech sampling rate.
            # NOTE(review): mixtures may exceed [-1, 1]; depending on the WAV
            # subtype soundfile picks, such samples may be clipped — confirm.
            sf.write(os.path.join(output_dir, f"noisy_snr{snr}_sample{i}.wav"), noisy, sr_speech)
            sf.write(os.path.join(output_dir, f"clean_snr{snr}_sample{i}.wav"), clean, sr_speech)

# Build the synthetic corpus: 100 utterances, each mixed at 0, 5 and 10 dB SNR.
generate_dataset(
    librispeech,
    urbansound,
    output_dir="noisy_speech_dataset",
    num_samples=100,
    snr_levels=[0, 5, 10],
)

---
# <center>Evaluate with Whisper (WER Calculation)</center>

In [None]:
import whisper
from jiwer import wer


# Load the "base" Whisper checkpoint once; the evaluation helper below reuses it.
model = whisper.load_model(name="base")

def transcribe_and_evaluate(clean_path, noisy_path):
    """Transcribe a clean/noisy audio pair and score the noisy transcript.

    Both files are transcribed with the module-level ``model``. The clean
    transcript is passed to ``wer`` as the first argument and the noisy
    transcript as the second, so the score measures how far the noisy
    transcription drifts from the clean one.

    Args:
        clean_path: Path to the clean audio file.
        noisy_path: Path to the noisy audio file.

    Returns:
        tuple: ``(wer_value, clean_transcript, noisy_transcript)``.
    """
    # The generator is consumed in order: clean file first, then the noisy one.
    reference, hypothesis = (
        model.transcribe(path)["text"] for path in (clean_path, noisy_path)
    )
    score = wer(reference, hypothesis)
    return score, reference, hypothesis

# Evaluate the first generated sample at 5 dB SNR against its clean reference.
clean_path, noisy_path = (
    "noisy_speech_dataset/clean_snr5_sample0.wav",
    "noisy_speech_dataset/noisy_snr5_sample0.wav",
)
wer_score, clean_txt, noisy_txt = transcribe_and_evaluate(
    clean_path=clean_path, noisy_path=noisy_path
)

# One line per item, in the order: clean transcript, noisy transcript, WER.
print(
    f"Clean Transcription: {clean_txt}",
    f"Noisy Transcription: {noisy_txt}",
    f"WER: {wer_score:.2f}",
    sep="\n",
)