In [1]:
import json
import librosa
import soundfile as sf
import numpy as np

from pathlib import Path

def load_manifest(manifest_file):
    """Read a JSON-lines manifest into a list.

    Args:
        manifest_file: Path of a text file holding one JSON document per line.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.
        Lines that are empty after stripping whitespace are skipped.
    """
    with open(manifest_file) as handle:
        # Strip first so whitespace-only lines are dropped as well.
        stripped_lines = (raw.strip() for raw in handle)
        return [json.loads(entry) for entry in stripped_lines if entry]

def save_manifest(manifest, manifest_file):
    """Write manifest entries to disk as JSON lines.

    Args:
        manifest: Iterable of JSON-serialisable entries.
        manifest_file: Destination path; it is opened in 'w' mode, so any
            existing content is replaced.
    """
    with open(manifest_file, 'w') as out:
        # One serialised entry per line, each terminated by a newline.
        out.writelines(json.dumps(record) + '\n' for record in manifest)

def add_noise_to_audio(audio_file, noise_file, min_snr=0, max_snr=10):
    """Mix a noise recording into an audio recording at a random SNR.

    The audio is loaded at its native sample rate and the noise is loaded
    at that same rate. The noise is then fitted to the audio length:
    a shorter (or equally long) noise clip is placed at a random offset
    inside a zero buffer, a longer one is cropped at a random offset.

    Args:
        audio_file: Path of the clean recording.
        noise_file: Path of the noise recording.
        min_snr: Lower bound (inclusive) of the integer SNR in dB.
        max_snr: Upper bound of the integer SNR in dB. It is exclusive
            unless it equals ``min_snr``, in which case that value is used.

    Returns:
        Tuple ``(audio_with_noise, audio, noise_add, snr)`` where
        ``noise_add`` is the length-matched, gain-scaled noise and ``snr`` is
        the SNR that was drawn. The SNR is measured on RMS values over the
        full length of the audio, so zero padding counts towards the noise RMS.

    Raises:
        ValueError: From ``np.random.randint`` when ``min_snr > max_snr``.
    """
    audio, sr = librosa.load(audio_file, sr=None)
    # Load the noise at the audio's rate so both share one time base.
    noise, _ = librosa.load(noise_file, sr=sr)

    length_gap = len(audio) - len(noise)
    if length_gap >= 0:
        # Noise fits inside the audio: drop it at a random offset in a silent
        # buffer. randint's upper bound is exclusive, hence the +1; this also
        # covers equal lengths (offset 0), which previously left noise_add
        # undefined.
        start = np.random.randint(0, length_gap + 1)
        noise_add = np.zeros(len(audio))
        noise_add[start:start+len(noise)] = noise
    else:
        # Noise is longer: crop a random window of the audio's length.
        start = np.random.randint(0, -length_gap + 1)
        noise_add = noise[start:start+len(audio)]

    if min_snr == max_snr:
        # randint rejects an empty range, so a fixed SNR is used directly.
        snr = min_snr
    else:
        snr = np.random.randint(min_snr, max_snr)

    audio_rms = np.sqrt(np.mean(audio**2))
    noise_rms = np.sqrt(np.mean(noise_add**2))
    if noise_rms > 0:
        # Amplitude-domain dB: SNR = 20 * log10(audio_rms / noise_rms).
        noise_rms_target = audio_rms / (10**(snr/20))
        noise_add = noise_add * noise_rms_target / noise_rms
    # else: the noise is silent (or empty); scaling would divide by zero and
    # turn the whole mixture into NaN, so the noise is left as it is.

    audio_with_noise = audio + noise_add
    return audio_with_noise, audio, noise_add, snr

def save_noisy_audio(noisy_audio, audio_file, output_dir, snr, sample_rate=16000):
    """Write a noisy waveform under ``output_dir`` with the SNR in its name.

    The output mirrors the relative layout of ``audio_file`` and appends
    ``_snr<value>`` to the file stem, e.g. ``spk/utt.wav`` -> ``spk/utt_snr5.wav``.

    Args:
        noisy_audio: Samples to write.
        audio_file: Path of the source recording, expected to be relative;
            an absolute path would replace ``output_dir`` when joined.
        output_dir: Root directory for the generated files.
        snr: SNR used for the mixture. Integral values are formatted as
            integers, other values with the ``g`` format.
        sample_rate: Sample rate stored in the file header. Defaults to 16000,
            the value that was previously hard-coded; pass the real rate when
            the audio was loaded at a different one.

    Returns:
        The written file's path relative to ``output_dir``, as a string.
    """
    audio_file = Path(audio_file)
    output_dir = Path(output_dir)
    output_audio_file = output_dir / audio_file
    output_audio_file.parent.mkdir(parents=True, exist_ok=True)
    # Integral SNRs keep the old "<int>" tag; a float SNR would make the
    # ':d' format raise, so it is rendered with 'g' instead.
    snr_tag = f'{int(snr):d}' if float(snr).is_integer() else f'{snr:g}'
    # modify the filename to include snr
    output_audio_file = output_audio_file.with_name(f'{output_audio_file.stem}_snr{snr_tag}{output_audio_file.suffix}')
    # NOTE: PCM_16 is fixed-point; samples outside [-1, 1] cannot be represented.
    sf.write(output_audio_file, noisy_audio, sample_rate, subtype='PCM_16')
    return str(output_audio_file.relative_to(output_dir))


In [7]:
# Speaker-verification corpora and their validation manifests.
data_dir = Path("/media/data2/datasets/speaker_datasets")
vox1_dir = data_dir.joinpath("voxceleb1")
vox2_dir = data_dir.joinpath("voxceleb2")
vox1_val_manifest = vox1_dir.joinpath("vox1_train_manifest_val_chunk30s.json")
vox2_val_manifest = vox2_dir.joinpath("vox2_all_manifest_val_chunk30s.json")

# MUSAN noise corpus and the manifest listing the noise recordings to mix in.
musan_dir = Path("/media/data3/datasets/noise_data/musan")
musan_noise_manifest = musan_dir.joinpath("musan_nonspeech_manifest.json")

In [6]:
# Load the three manifests in the same order as before: vox2, vox1, MUSAN noise.
vox2_val_data, vox1_val_data, musan_noise_data = map(
    load_manifest,
    (vox2_val_manifest, vox1_val_manifest, musan_noise_manifest),
)

In [12]:
# Draw one random vox2 utterance and one random noise clip, then mix them
# as a quick listening check.
vox2_entry = np.random.choice(vox2_val_data)
sample_audio = vox2_dir / vox2_entry["audio_filepath"]
noise_entry = np.random.choice(musan_noise_data)
sample_noise = musan_dir / noise_entry["audio_filepath"]

noisy_audio, clean_audio, noise, snr = add_noise_to_audio(
    sample_audio,
    sample_noise,
    min_snr=0,
    max_snr=15,
)


In [13]:
# Report the drawn SNR and render an inline player for the mixture.
import IPython
print(f"SNR: {snr}")
# The player is the cell's last expression, so the notebook displays it.
IPython.display.Audio(
    data=noisy_audio,
    rate=16000,
)

SNR: 8.35668787849069


In [20]:
def process_sample(item, input_dir, output_dir, noise_data, noise_dir, min_snr=0, max_snr=15):
    """Create the noisy counterpart of one manifest entry.

    A random noise entry is picked from ``noise_data``, mixed into the
    entry's audio via ``add_noise_to_audio`` and written with
    ``save_noisy_audio``.

    Args:
        item: Manifest entry (dict) with an "audio_filepath" key.
        input_dir: Directory that ``item["audio_filepath"]`` is joined onto.
        output_dir: Root directory for the generated noisy audio.
        noise_data: Sequence of noise manifest entries, each with an
            "audio_filepath" key.
        noise_dir: Directory that the noise "audio_filepath" is joined onto.
        min_snr: Forwarded to ``add_noise_to_audio``.
        max_snr: Forwarded to ``add_noise_to_audio``.

    Returns:
        A shallow copy of ``item`` whose "audio_filepath" is the path returned
        by ``save_noisy_audio``. The input ``item`` is not modified.

    Raises:
        ValueError: If ``noise_data`` is empty.
    """
    if len(noise_data) == 0:
        raise ValueError("noise_data is empty; cannot pick a noise recording")

    audio_file = input_dir / item["audio_filepath"]
    # Index the list directly: np.random.choice(noise_data) would first copy
    # the whole list of dicts into an ndarray on every single call.
    noise_entry = noise_data[np.random.randint(len(noise_data))]
    noise_file = noise_dir / noise_entry["audio_filepath"]

    # Only the mixture and the drawn SNR are needed here.
    noisy_audio, _, _, snr = add_noise_to_audio(audio_file, noise_file, min_snr=min_snr, max_snr=max_snr)
    output_audio_file = save_noisy_audio(noisy_audio, item["audio_filepath"], output_dir, snr)

    # Copy so the caller's manifest entry keeps pointing at the clean audio.
    item = item.copy()
    item["audio_filepath"] = output_audio_file
    return item

In [24]:
from tqdm import tqdm
import multiprocessing as mp

# Output location for the noisy VoxCeleb2 validation set and its manifest.
vox2_output_dir = data_dir / "vox2_noisy_val"
noisy_vox2_manifest = vox2_output_dir / "vox2_noisy_val_manifest.json"
vox2_output_dir.mkdir(parents=True, exist_ok=True)

def process_vox2(item):
    """Add MUSAN noise to one vox2 manifest entry (single-argument wrapper for Pool.imap)."""
    return process_sample(item, vox2_dir, vox2_output_dir, musan_noise_data, musan_dir)

# Forked workers start with a copy of the parent's global NumPy RNG state, so
# without reseeding every worker would draw the same sequence of noise files,
# offsets and SNRs. np.random.seed() with no argument reseeds from OS entropy.
with mp.Pool(16, initializer=np.random.seed) as pool:
    # imap preserves input order, so the output manifest lines up with vox2_val_data.
    noisy_vox2_data = list(tqdm(pool.imap(process_vox2, vox2_val_data), total=len(vox2_val_data)))

save_manifest(noisy_vox2_data, noisy_vox2_manifest)
print(f"Saved noisy vox2 manifest to {noisy_vox2_manifest}")


  0%|          | 0/29846 [00:00<?, ?it/s]

100%|██████████| 29846/29846 [01:36<00:00, 308.37it/s]


Saved noisy vox2 manifest to /media/data2/datasets/speaker_datasets/vox2_noisy_val/vox2_noisy_val_manifest.json


In [25]:
# Output location for the noisy VoxCeleb1 validation set and its manifest.
vox1_output_dir = data_dir / "vox1_musan_val"
noisy_vox1_manifest = vox1_output_dir / "vox1_noisy_val_manifest.json"
vox1_output_dir.mkdir(parents=True, exist_ok=True)

def process_vox1(item):
    """Add MUSAN noise to one vox1 manifest entry (single-argument wrapper for Pool.imap)."""
    return process_sample(item, vox1_dir, vox1_output_dir, musan_noise_data, musan_dir)

# Forked workers start with a copy of the parent's global NumPy RNG state, so
# without reseeding every worker would draw the same sequence of noise files,
# offsets and SNRs. np.random.seed() with no argument reseeds from OS entropy.
with mp.Pool(16, initializer=np.random.seed) as pool:
    # imap preserves input order, so the output manifest lines up with vox1_val_data.
    noisy_vox1_data = list(tqdm(pool.imap(process_vox1, vox1_val_data), total=len(vox1_val_data)))

save_manifest(noisy_vox1_data, noisy_vox1_manifest)
print(f"Saved noisy manifest to {noisy_vox1_manifest}")

100%|██████████| 12954/12954 [00:25<00:00, 513.45it/s]

Saved noisy vox2 manifest to /media/data2/datasets/speaker_datasets/vox1_musan_val/vox1_noisy_val_manifest.json





In [3]:
# SLURP manifests: two training sources plus the devel and test splits.
slurp_root = Path("/media/data/datasets/SLURP")
train_manifest_file1 = slurp_root / "train_synthetic_slu.json"
train_manifest_file2 = slurp_root / "train_slu.json"
val_manifest_file = slurp_root / "devel_slu.json"
test_manifest_file = slurp_root / "test_slu.json"

# Prompt attached to every entry as its "question" field.
question = "Listen to the audio, detect the scenario and action, then perform slot filling, and generate the result as a python dictionary object."

# Synthetic training data first, then the real recordings.
train_manifest = [
    *load_manifest(train_manifest_file1),
    *load_manifest(train_manifest_file2),
]

for entry in train_manifest:
    entry["question"] = question
    # The existing "text" field is reused as the expected answer.
    entry["answer"] = entry["text"]
    # Training paths are rewritten to be rooted at slurp_root.
    entry["audio_filepath"] = str(slurp_root / entry["audio_filepath"])

save_manifest(train_manifest, slurp_root / "train_slu_all_salm.json")




In [6]:
# Attach the prompt and answer to the devel split. Unlike the training cell,
# "audio_filepath" is left exactly as it appears in the source manifest.
dev_manifest = load_manifest(val_manifest_file)
for entry in dev_manifest:
    entry["question"] = question
    # The existing "text" field is reused as the expected answer.
    entry["answer"] = entry["text"]
save_manifest(dev_manifest, slurp_root / "devel_slu_salm.json")

In [7]:
# Attach the prompt and answer to the test split. Unlike the training cell,
# "audio_filepath" is left exactly as it appears in the source manifest.
test_manifest = load_manifest(test_manifest_file)
for entry in test_manifest:
    entry["question"] = question
    # The existing "text" field is reused as the expected answer.
    entry["answer"] = entry["text"]
save_manifest(test_manifest, slurp_root / "test_slu_salm.json")