In [1]:
import librosa
import numpy as np
import soundfile as sf
import os
from pathlib import Path
from scipy.signal import resample
from concurrent.futures import ProcessPoolExecutor


In [6]:
import numpy as np
import os
from pathlib import Path
import soundfile as sf
import librosa

def normalize_audio_by_energy(audio):
    """Rescale ``audio`` so that its root-mean-square level is 1.

    Args:
        audio: Array-like of samples.

    Returns:
        ``audio`` divided by its RMS value. When the RMS is not strictly
        positive (e.g. an all-zero signal) the input is handed back untouched.
    """
    mean_square = np.mean(np.square(audio))
    level = np.sqrt(mean_square)
    if not level > 0:
        # Nothing to divide by: return the input as-is.
        return audio
    return audio / level

def scale_audio_for_snr(signal_audio, noise_audio, desired_snr_db):
    """Scale ``noise_audio`` so the signal-to-noise power ratio hits a target.

    Power is measured as the mean of the squared samples of each input.

    Args:
        signal_audio: Reference signal samples.
        noise_audio: Samples to be rescaled.
        desired_snr_db: Target ratio of signal power to noise power, in dB.

    Returns:
        A rescaled copy of ``noise_audio`` whose mean power equals
        ``signal_power / 10 ** (desired_snr_db / 10)``. If ``noise_audio`` has
        no power (all zeros, or empty) an unscaled copy is returned, because no
        finite gain can reach the target and dividing by zero would fill the
        result with NaN.
    """
    signal_power = np.mean(np.square(signal_audio))
    noise_power = np.mean(np.square(noise_audio))
    if not noise_power > 0:
        # Silent noise: avoid the 0 * inf = NaN that the plain formula produces.
        return np.array(noise_audio, copy=True)
    desired_noise_power = signal_power / (10 ** (desired_snr_db / 10))
    scaling_factor = np.sqrt(desired_noise_power / noise_power)
    return noise_audio * scaling_factor


def process_and_mix_multi_speakers(wsj0root, output_dir, task_file_path, fs=8000, auto_scale_snr=True, desired_snr_db=0):
    """Create the multi-speaker mixtures listed in a task file and write them to disk.

    Every non-blank line of ``task_file_path`` is split on whitespace and read
    as alternating ``<audio path> <SNR in dB>`` tokens. For each line:

    1. Each path is joined onto ``wsj0root``, loaded with ``librosa.load`` as
       mono at ``fs`` Hz, and normalised to unit RMS.
    2. If ``auto_scale_snr`` is true, every source after the first is rescaled
       with ``scale_audio_for_snr`` against the first source, using the SNR
       token that follows its path. The first source's own SNR token is not
       used.
    3. Sources are zero-padded at the end to the longest length, summed, and the
       sum is normalised to unit RMS.
    4. The mixture is written to ``<output_dir>/<stem1>_<stem2>_.../mixed.wav``
       and each padded source to ``<stem>_<index><suffix>`` in the same
       directory (index starts at 1).

    Args:
        wsj0root: Prefix joined onto every audio path from the task file.
        output_dir: Directory under which one sub-directory per mixture is created.
        task_file_path: Path of the text file describing the mixtures.
        fs: Sample rate used for loading and writing.
        auto_scale_snr: Whether to apply the per-source SNR scaling of step 2.
        desired_snr_db: SNR (dB) used for a source whose SNR token is missing
            from the task line.

    Raises:
        ValueError: If an SNR token cannot be parsed as a float.
    """
    # sf.write is called without a subtype; for WAV that means 16-bit PCM, which
    # cannot hold samples outside [-1, 1). A unit-RMS signal always peaks at or
    # above 1, so everything is brought under this limit with one shared gain.
    peak_limit = 0.9

    with open(task_file_path, 'r') as f:
        task_lines = [raw.split() for raw in f]

    for parts in task_lines:
        if not parts:
            # Blank line (e.g. trailing newline at end of file): nothing to mix.
            continue

        file_paths = [os.path.join(wsj0root, rel) for rel in parts[0::2]]
        snr_tokens = parts[1::2]

        # Load and normalize all audio files first
        audio_files = []
        for file_path in file_paths:
            audio, _ = librosa.load(file_path, sr=fs, mono=True)
            audio_files.append(normalize_audio_by_energy(audio))
        max_len = max(len(audio) for audio in audio_files)

        # Scale (relative to the first source) and zero-pad to a common length
        scaled_audios = []
        for i, audio in enumerate(audio_files):
            if i > 0 and auto_scale_snr:
                snr = float(snr_tokens[i]) if i < len(snr_tokens) else desired_snr_db
                audio = scale_audio_for_snr(audio_files[0], audio, snr)
            scaled_audios.append(np.pad(audio, (0, max_len - len(audio)), 'constant', constant_values=0))

        mixed_audio = normalize_audio_by_energy(np.sum(scaled_audios, axis=0))

        # One gain for mixture and sources keeps their relative levels intact.
        peak = max(np.max(np.abs(sig), initial=0.0) for sig in [mixed_audio, *scaled_audios])
        gain = float(peak_limit / peak) if peak > peak_limit else 1.0

        # NOTE(review): the directory name uses file stems only, so two task lines
        # with the same stems would write into the same directory - confirm.
        directory_name = "_".join(Path(fp).stem for fp in file_paths)
        mix_dir = Path(output_dir) / directory_name
        mix_dir.mkdir(parents=True, exist_ok=True)
        sf.write(str(mix_dir / "mixed.wav"), mixed_audio * gain, fs)

        # Save each source under its original stem plus a 1-based index so that
        # repeated stems within one mixture do not overwrite each other.
        for idx, (fp, audio) in enumerate(zip(file_paths, scaled_audios), start=1):
            source_path = Path(fp)
            tagged_filename = f"{source_path.stem}_{idx}{source_path.suffix}"
            sf.write(str(mix_dir / tagged_filename), audio * gain, fs)

        print(f"Processed mix saved in: {mix_dir}")

In [8]:
# Corpus root. The trailing separator is deliberate: on Windows,
# os.path.join('G:', 'x') gives the drive-relative path 'G:x' (resolved against
# the current directory of drive G), whereas 'G:/' anchors at the drive root.
wsj0root = 'G:/'
output_dir = '4Speakers5K'
task_file_path = '4Speakers5K.txt'

process_and_mix_multi_speakers(wsj0root, output_dir, task_file_path, fs=8000, auto_scale_snr=True, desired_snr_db=5)

Processed mix saved in: 4Speakers5K\SA2_SA1_SA2_SA2
Processed mix saved in: 4Speakers5K\SI1788_SX419_SX256_SX402
Processed mix saved in: 4Speakers5K\SX451_SX434_SX251_SX38
Processed mix saved in: 4Speakers5K\SI2006_SX432_SX231_SX41
Processed mix saved in: 4Speakers5K\SX66_SX447_SI1683_SI1773
Processed mix saved in: 4Speakers5K\SX60_SX65_SI1999_SI1385
Processed mix saved in: 4Speakers5K\SX358_SX181_SX308_SA2
Processed mix saved in: 4Speakers5K\SI937_SX162_SI1761_SX415
Processed mix saved in: 4Speakers5K\SA1_SX66_SI654_SI505
Processed mix saved in: 4Speakers5K\SI968_SA2_SX305_SX74
Processed mix saved in: 4Speakers5K\SX129_SI1504_SA1_SX437
Processed mix saved in: 4Speakers5K\SI2040_SX334_SI1887_SX171
Processed mix saved in: 4Speakers5K\SX442_SX210_SX147_SI1440
Processed mix saved in: 4Speakers5K\SI1806_SX440_SA2_SX245
Processed mix saved in: 4Speakers5K\SI1315_SA1_SI2233_SX352
Processed mix saved in: 4Speakers5K\SX288_SX147_SI1142_SX59
Processed mix saved in: 4Speakers5K\SX333_SA2_SI1383_

In [15]:
import numpy as np
import os
from pathlib import Path
import soundfile as sf
import librosa

def normalize_audio_by_energy(audio):
    """Bring ``audio`` to unit RMS level.

    Args:
        audio: Array-like of samples.

    Returns:
        The samples divided by their root-mean-square value, or the input
        itself when that value is not greater than zero.
    """
    mean_square = np.mean(np.square(audio))
    rms_level = np.sqrt(mean_square)
    if rms_level > 0:
        return audio / rms_level
    return audio

def scale_audio_for_snr(signal_audio, noise_audio, desired_snr_db):
    """Scale ``noise_audio`` so the signal-to-noise power ratio hits a target.

    Power is measured as the mean of the squared samples of each input.

    Args:
        signal_audio: Reference signal samples.
        noise_audio: Samples to be rescaled.
        desired_snr_db: Target ratio of signal power to noise power, in dB.

    Returns:
        A rescaled copy of ``noise_audio`` whose mean power equals
        ``signal_power / 10 ** (desired_snr_db / 10)``. If ``noise_audio`` has
        no power (all zeros, or empty) an unscaled copy is returned, because no
        finite gain can reach the target and dividing by zero would fill the
        result with NaN.
    """
    signal_power = np.mean(np.square(signal_audio))
    noise_power = np.mean(np.square(noise_audio))
    if not noise_power > 0:
        # Silent noise: avoid the 0 * inf = NaN that the plain formula produces.
        return np.array(noise_audio, copy=True)
    desired_noise_power = signal_power / (10 ** (desired_snr_db / 10))
    scaling_factor = np.sqrt(desired_noise_power / noise_power)
    return noise_audio * scaling_factor

def extract_first_samples(audio, num_samples):
    """Return the leading ``num_samples`` entries of ``audio``.

    Args:
        audio: Sequence of samples supporting ``len`` and slicing.
        num_samples: Number of leading samples to keep.

    Returns:
        ``audio[:num_samples]``, or ``audio`` itself when it holds fewer than
        ``num_samples`` entries.
    """
    if num_samples > len(audio):
        # Shorter than requested: hand back the input object unchanged.
        return audio
    return audio[:num_samples]

def process_and_mix_multi_speakers(wsj0root, output_dir, task_file_path, fs=8000, auto_scale_snr=True, desired_snr_db=0,
                                   num_first_samples=10000):
    """Create the mixtures listed in a task file and write their leading samples to disk.

    Every non-blank line of ``task_file_path`` is split on whitespace and read
    as alternating ``<audio path> <SNR in dB>`` tokens. For each line:

    1. Each path is joined onto ``wsj0root``, loaded with ``librosa.load`` as
       mono at ``fs`` Hz, and normalised to unit RMS.
    2. If ``auto_scale_snr`` is true, every source after the first is rescaled
       with ``scale_audio_for_snr`` against the first source, using the SNR
       token that follows its path. The first source's own SNR token is not
       used.
    3. Sources are zero-padded at the end to the longest length, summed, and the
       sum is normalised to unit RMS (computed over the full length).
    4. Mixture and sources are cut to their first ``num_first_samples`` samples.
    5. The mixture is written to ``<output_dir>/<stem1>_<stem2>_.../mixed.wav``
       and each source to ``<stem>_<index><suffix>`` in the same directory
       (index starts at 1).

    Args:
        wsj0root: Prefix joined onto every audio path from the task file.
        output_dir: Directory under which one sub-directory per mixture is created.
        task_file_path: Path of the text file describing the mixtures.
        fs: Sample rate used for loading and writing.
        auto_scale_snr: Whether to apply the per-source SNR scaling of step 2.
        desired_snr_db: SNR (dB) used for a source whose SNR token is missing
            from the task line.
        num_first_samples: Number of leading samples kept in every written
            file; ``None`` keeps the full length.

    Raises:
        ValueError: If an SNR token cannot be parsed as a float.
    """
    # sf.write is called without a subtype; for WAV that means 16-bit PCM, which
    # cannot hold samples outside [-1, 1). Everything written is therefore
    # brought under this limit with one shared gain.
    peak_limit = 0.9

    with open(task_file_path, 'r') as f:
        task_lines = [raw.split() for raw in f]

    for parts in task_lines:
        if not parts:
            # Blank line (e.g. trailing newline at end of file): nothing to mix.
            continue

        file_paths = [os.path.join(wsj0root, rel) for rel in parts[0::2]]
        snr_tokens = parts[1::2]

        # Load and normalize all audio files first
        audio_files = []
        for file_path in file_paths:
            audio, _ = librosa.load(file_path, sr=fs, mono=True)
            audio_files.append(normalize_audio_by_energy(audio))
        max_len = max(len(audio) for audio in audio_files)

        # Scale (relative to the first source) and zero-pad to a common length
        scaled_audios = []
        for i, audio in enumerate(audio_files):
            if i > 0 and auto_scale_snr:
                snr = float(snr_tokens[i]) if i < len(snr_tokens) else desired_snr_db
                audio = scale_audio_for_snr(audio_files[0], audio, snr)
            scaled_audios.append(np.pad(audio, (0, max_len - len(audio)), 'constant', constant_values=0))

        mixed_audio = normalize_audio_by_energy(np.sum(scaled_audios, axis=0))

        # Truncate after normalisation so the level is set by the whole mixture.
        if num_first_samples is not None:
            mixed_audio = extract_first_samples(mixed_audio, num_first_samples)
            scaled_audios = [extract_first_samples(audio, num_first_samples) for audio in scaled_audios]

        # One gain for mixture and sources keeps their relative levels intact.
        peak = max(np.max(np.abs(sig), initial=0.0) for sig in [mixed_audio, *scaled_audios])
        gain = float(peak_limit / peak) if peak > peak_limit else 1.0

        # NOTE(review): the directory name uses file stems only, so two task lines
        # with the same stems would write into the same directory - confirm.
        directory_name = "_".join(Path(fp).stem for fp in file_paths)
        mix_dir = Path(output_dir) / directory_name
        mix_dir.mkdir(parents=True, exist_ok=True)
        sf.write(str(mix_dir / "mixed.wav"), mixed_audio * gain, fs)

        # Save each source under its original stem plus a 1-based index so that
        # repeated stems within one mixture do not overwrite each other.
        for idx, (fp, audio) in enumerate(zip(file_paths, scaled_audios), start=1):
            source_path = Path(fp)
            tagged_filename = f"{source_path.stem}_{idx}{source_path.suffix}"
            sf.write(str(mix_dir / tagged_filename), audio * gain, fs)

        print(f"Processed mix saved in: {mix_dir}")


In [16]:
# Corpus root, the same for every run. The trailing separator is deliberate: on
# Windows, os.path.join('G:', 'x') gives the drive-relative path 'G:x' (resolved
# against the current directory of drive G), whereas 'G:/' anchors at the root.
wsj0root = 'G:/'

# Build the 2-, 3- and 4-speaker sets from their respective task files.
for NUM_SOURCES in range(2, 5):
    output_dir = f'{NUM_SOURCES}Speakers5K10KSamples'
    task_file_path = f'{NUM_SOURCES}Speakers5K.txt'

    process_and_mix_multi_speakers(wsj0root, output_dir, task_file_path, fs=8000, auto_scale_snr=True, desired_snr_db=5)

Processed mix saved in: 2Speakers5K10KSamples\SX31_SX257
Processed mix saved in: 2Speakers5K10KSamples\SI2137_SX39
Processed mix saved in: 2Speakers5K10KSamples\SI888_SX130
Processed mix saved in: 2Speakers5K10KSamples\SI652_SX301
Processed mix saved in: 2Speakers5K10KSamples\SX177_SI2264
Processed mix saved in: 2Speakers5K10KSamples\SX57_SA1
Processed mix saved in: 2Speakers5K10KSamples\SA1_SX441
Processed mix saved in: 2Speakers5K10KSamples\SX451_SX250
Processed mix saved in: 2Speakers5K10KSamples\SI1239_SA1
Processed mix saved in: 2Speakers5K10KSamples\SX35_SA1
Processed mix saved in: 2Speakers5K10KSamples\SI1443_SX267
Processed mix saved in: 2Speakers5K10KSamples\SA2_SX322
Processed mix saved in: 2Speakers5K10KSamples\SA2_SX338
Processed mix saved in: 2Speakers5K10KSamples\SX45_SX291
Processed mix saved in: 2Speakers5K10KSamples\SI1464_SI453
Processed mix saved in: 2Speakers5K10KSamples\SI1782_SI702
Processed mix saved in: 2Speakers5K10KSamples\SX247_SX276
Processed mix saved in: 2