In [60]:
import os
import math
from random import shuffle, sample, choice
import numpy as np
from scipy.signal import convolve
from scipy.io import loadmat
from pathlib import Path
from tqdm.notebook import tqdm
from librosa import load, resample, get_samplerate, get_duration
import soundfile as sf

In [61]:
# Input locations: clean speech, noise recordings and room impulse responses (.mat files).
# NOTE(review): these are absolute paths on one machine; change them before running elsewhere.
speech_dir = Path('/home/guillaume/GitHub/MS-SNSD/clean_train')
noise_dir = Path('/home/guillaume/GitHub/MS-SNSD/noise_train')
RIR_dir = Path('/home/guillaume/GitHub/RIRs')

# Candidate speech-to-noise ratios in dB; one is drawn at random for every generated example.
SNRs = [-10, -5, 0, 5, 10, 20, 30, 40]

# Share of the speech files (and of max_size) used for validation; the rest is training.
validation_frac = 0.1
training_frac = 1 - validation_frac

# Total number of examples to generate, split between training and validation.
max_size = 20

# Root of the generated dataset; 'training' and 'validation' sub-directories are created below it.
output_dir = Path('/home/guillaume/GitHub/EHNet/WAVs/dataset')

In [62]:
def power(x):
    """Return the sum of the squared elements of `x` (the signal's energy)."""
    squared = np.square(x)
    return np.sum(squared)

def SNR(s, n):
    """Return the ratio of the energy of `s` to the energy of `n`, in dB.

    Both energies are computed with `power`, i.e. as sums of squares, so the
    two inputs do not need to have the same length.
    """
    energy_ratio = power(s) / power(n)
    return 10 * math.log10(energy_ratio)

In [63]:
def resample_convolve(speech_wav, RIR_path, SNRs):
    """Add reverberation to a 16 kHz speech waveform using a stored impulse response.

    The speech is upsampled to 48 kHz, convolved with the impulse response,
    truncated to the upsampled speech length and brought back to 16 kHz. The
    reverberant signal is then scaled and added to the dry speech.

    Parameters
    ----------
    speech_wav : 1-D array
        Dry speech, treated as sampled at 16 kHz.
    RIR_path : path-like
        MATLAB .mat file whose 'h_air' variable holds the impulse response in
        its first row.
    SNRs : sequence of numbers
        Candidate ratios in dB between the energy of the dry speech and the
        energy of the added reverberant signal; one is drawn at random.

    Returns
    -------
    1-D array
        `speech_wav` plus the scaled reverberant signal.
    """
    RIR = loadmat(RIR_path)['h_air'][0]

    # The convolution is done at 48 kHz -- presumably the impulse response's
    # sample rate; TODO confirm against the RIR files.
    # orig_sr / target_sr are passed by keyword: they are keyword-only in
    # librosa >= 0.10, and the keyword form is also accepted by older releases.
    speech_wav_upsampled = resample(speech_wav, orig_sr=16000, target_sr=48000, res_type='kaiser_fast')
    convolved = convolve(speech_wav_upsampled, RIR)
    # Drop the convolution tail so that the result spans exactly the upsampled speech.
    convolved = convolved[:3*len(speech_wav)]
    convolved_downsampled = resample(convolved, orig_sr=48000, target_sr=16000, res_type='kaiser_fast')

    # Scale the reverberant signal so that the dry/reverberant energy ratio
    # moves from its current value to the randomly drawn target.
    current_SNR = SNR(speech_wav, convolved_downsampled)
    target_SNR = choice(SNRs)
    alpha = 1/math.sqrt(math.pow(10, (target_SNR - current_SNR)/10))
    return speech_wav + alpha * convolved_downsampled

In [64]:
def snr_mixer(clean, noise, snr):
    """Mix `noise` into `clean` at a signal-to-noise ratio of `snr` dB.

    Both inputs are first normalised to an RMS level of -25 dBFS. The noise is
    then scaled so that the RMS ratio clean/noise equals `snr` dB, and the sum
    is returned.

    Parameters
    ----------
    clean : numpy array
        Speech signal.
    noise : numpy array
        Noise signal that can be added element-wise to `clean`.
    snr : float
        Desired signal-to-noise ratio in dB.

    Returns
    -------
    numpy array
        The normalised clean signal plus the rescaled noise.

    Notes
    -----
    An all-zero input has zero RMS, so its normalisation divides by zero and
    the returned mixture is not finite. Callers should check for that.
    """
    target_rms = 10 ** (-25 / 20)  # -25 dBFS as a linear RMS amplitude

    # Normalizing to -25 dB FS
    clean = clean * (target_rms / np.sqrt(np.mean(clean ** 2)))
    noise = noise * (target_rms / np.sqrt(np.mean(noise ** 2)))
    rmsclean = np.sqrt(np.mean(clean ** 2))
    rmsnoise = np.sqrt(np.mean(noise ** 2))

    # Set the noise level for a given SNR.
    # SNR_dB = 20*log10(rmsclean / (gain * rmsnoise)), hence the amplitude gain
    # below. The previous form took the square root of the RMS ratio, which is
    # only right when both RMS values happen to be equal.
    noisescalar = rmsclean / rmsnoise / (10 ** (snr / 20))
    return clean + noise * noisescalar

In [73]:
def _load_mono_16k(path):
    """Load an audio file as a mono waveform at 16 kHz, resampling only if needed."""
    if get_samplerate(path) == 16000:
        wav, _ = load(path, sr=None, mono=True)
    else:
        wav, _ = load(path, sr=16000, mono=True, res_type='kaiser_fast')
    return wav


def _random_crop(wav, length):
    """Return a randomly placed window of `length` samples from `wav`.

    `wav` is returned unchanged when it is not longer than `length`.
    """
    surplus = len(wav) - length
    if surplus <= 0:
        return wav
    # randint's upper bound is exclusive; +1 makes the last window reachable.
    start = np.random.randint(0, surplus + 1)
    return wav[start:start + length]


def generate_wavs(speech_dir, noise_dir, RIR_dir, speech_filenames, noise_filenames, SNRs, output_path, max_size):
    """Generate `max_size` pairs of clean / noisy-reverberant 8 s WAV files.

    For every example:
      1. a random speech file is loaded, and following files are appended
         (separated by 0.2 s of silence) until at least 128000 samples are
         available; a random 128000-sample window is kept;
      2. the window is reverberated with a randomly chosen .mat impulse
         response from `RIR_dir` (via `resample_convolve`, target ratio 0 dB);
      3. the reverberant speech is scaled to -25 dBFS RMS and the dry speech
         is scaled by the same factor;
      4. a random noise file is loaded, normalised, cropped to the speech
         length and mixed in with `snr_mixer` at an SNR drawn from `SNRs`.

    Candidates are discarded (and another one is drawn) when the noise file is
    shorter than the speech, when the mixture exceeds full scale, or when the
    mixture contains non-finite values.

    Parameters
    ----------
    speech_dir, noise_dir, RIR_dir : pathlib.Path
        Directories holding the speech files, noise files and .mat impulse responses.
    speech_filenames, noise_filenames : sequence of str
        File names (relative to their directory) to draw from.
    SNRs : sequence of numbers
        Candidate speech-to-noise ratios in dB.
    output_path : pathlib.Path
        Destination directory; 'clean' and 'noisy' sub-directories are created.
    max_size : int
        Number of examples to write.

    Output files
    ------------
    clean/<k>.wav             dry speech, scaled as described in step 3
    noisy/<k>+SNR<snr>dB.wav  reverberant speech plus noise
    Both are written at 16 kHz, with k counting from 1.
    """
    sample_rate = 16000
    target_len = 128000  # 8 s at 16 kHz
    level = 10 ** (-25 / 20)  # -25 dBFS as a linear RMS amplitude
    gap = np.zeros(int(sample_rate * 0.2))  # 0.2 seconds of silence between utterances

    clean_output_path = output_path.joinpath('clean')
    noisy_output_path = output_path.joinpath('noisy')
    clean_output_path.mkdir(parents=True, exist_ok=True)
    noisy_output_path.mkdir(parents=True, exist_ok=True)

    # The list of impulse responses does not change, so it is built once.
    RIR_filenames = [filename for filename in os.listdir(RIR_dir) if filename.endswith('.mat')]

    n_speech = len(speech_filenames)
    n_noise = len(noise_filenames)

    # The context manager closes the progress bar even if an iteration raises.
    with tqdm(total=max_size) as pbar:
        num_samples = 0
        while num_samples < max_size:
            ##------- Getting speech -------##
            idx_s = np.random.randint(0, n_speech)
            speech_wav = _load_mono_16k(speech_dir.joinpath(speech_filenames[idx_s]))

            # make sure all WAVs are 128000 samples (8s) long
            while len(speech_wav) < target_len:
                idx_s = idx_s + 1
                # Jump to a random file only once the index has run past the
                # last file, so that the last file can be used as well.
                if idx_s >= n_speech:
                    idx_s = np.random.randint(0, n_speech)
                new_speech_wav = _load_mono_16k(speech_dir.joinpath(speech_filenames[idx_s]))
                speech_wav = np.concatenate((speech_wav, gap, new_speech_wav))

            speech_wav = _random_crop(speech_wav, target_len)
            assert len(speech_wav) == 128000, 'Speech waveform has ' + str(len(speech_wav)) + ' samples instead of 128000 samples!'

            RIR_path = RIR_dir.joinpath(choice(RIR_filenames))
            speech_convolved_wav = resample_convolve(speech_wav, RIR_path, [0])

            # normalize speech part to -25dBFS; the dry speech gets the same
            # gain so that it stays aligned in level with the reverberant one
            rms = (speech_convolved_wav ** 2).mean() ** 0.5
            scalar = level / rms
            speech_convolved_wav = speech_convolved_wav * scalar
            speech_wav = speech_wav * scalar

            ##------- Adding noise -------##
            idx_n = np.random.randint(0, n_noise)
            noise_wav = _load_mono_16k(noise_dir.joinpath(noise_filenames[idx_n]))

            # normalize noise part to -25dBFS
            rms = (noise_wav ** 2).mean() ** 0.5
            noise_wav = noise_wav * (level / rms)

            if len(noise_wav) < len(speech_wav):
                print("Noise was shorter than 8s!")
                continue
            noise_wav = _random_crop(noise_wav, len(speech_wav))

            desired_SNR = choice(SNRs)
            noisy_speech_wav = snr_mixer(speech_convolved_wav, noise_wav, desired_SNR)

            # A silent speech or noise segment has zero RMS, which makes the
            # normalisation produce inf/NaN. NaN compares False against 1, so
            # without this check such a mixture would be written to disk.
            if not np.all(np.isfinite(noisy_speech_wav)):
                continue

            if np.max(np.abs(noisy_speech_wav)) > 1: # skip if we are clipping
                continue

            file_index = num_samples + 1

            clean_speech_filename = clean_output_path.joinpath(str(file_index) + '.wav')
            sf.write(clean_speech_filename, speech_wav, 16000)

            noisy_speech_filename = noisy_output_path.joinpath(str(file_index) + '+SNR' + str(desired_SNR) + 'dB' + '.wav')
            sf.write(noisy_speech_filename, noisy_speech_wav, 16000)

            num_samples += 1
            pbar.update(1)

In [74]:
def split_generate_wavs(speech_dir, noise_dir, RIR_dir, SNRs, validation_frac, training_frac, max_size, output_dir):
    """Split the speech files into training / validation and generate both sets.

    The .wav files of `speech_dir` are shuffled and divided at
    `training_frac`; the .wav files of `noise_dir` lasting at least 8 s are
    shuffled and used, unsplit, for both sets. `generate_wavs` is then run
    once per set, writing to `output_dir/training` and `output_dir/validation`
    with `int(max_size * training_frac)` and `int(max_size * validation_frac)`
    examples respectively.
    """
    def is_wav(name):
        return name.endswith('.wav')

    speech_filenames = list(filter(is_wav, os.listdir(speech_dir)))
    shuffle(speech_filenames)

    # Keep only noise recordings long enough to cover a full 8 s example.
    noise_filenames = []
    for name in os.listdir(noise_dir):
        if not is_wav(name):
            continue
        if get_duration(filename=noise_dir.joinpath(name)) >= 8:
            noise_filenames.append(name)
    shuffle(noise_filenames)

    print('Using', len(speech_filenames), 'speech files.')
    print('Using', len(noise_filenames), 'noise files.')

    split_at = int(training_frac*len(speech_filenames))
    speech_train, speech_val = np.split(speech_filenames, [split_at])

    # (sub-directory, speech files, share of max_size), training set first
    subsets = (
        ('training', speech_train, training_frac),
        ('validation', speech_val, validation_frac),
    )
    for subdir, subset_files, frac in subsets:
        generate_wavs(speech_dir, noise_dir, RIR_dir, subset_files, noise_filenames, SNRs,
                      output_dir.joinpath(subdir), int(max_size * frac))

In [75]:
# Build the training and validation sets from the configuration defined above.
split_generate_wavs(
    speech_dir=speech_dir,
    noise_dir=noise_dir,
    RIR_dir=RIR_dir,
    SNRs=SNRs,
    validation_frac=validation_frac,
    training_frac=training_frac,
    max_size=max_size,
    output_dir=output_dir,
)

Using 23075 speech files.
Using 128 noise files.


HBox(children=(FloatProgress(value=0.0, max=18.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))


