In [2]:
import os
import math
from random import shuffle, sample, choice
import numpy as np
from scipy.signal import convolve, butter, sosfilt
from pathlib import Path
from tqdm.notebook import tqdm
from librosa import load, resample, get_samplerate, get_duration
import soundfile as sf

In [11]:
speech_dir = Path('/home/guillaume/Github/MS-SNSD/clean_test')
seen_noise_dir = Path('/home/guillaume/Github/MS-SNSD/noise_train')
unseen_noise_dir = Path('/home/guillaume/Github/MS-SNSD/noise_test')

SNRs = [-10, -5, 0, 5]

max_size = 2000

output_dir = Path('/home/guillaume/Github/EHNet/WAVs/MS-SNSD-test')

In [12]:
def power(x):
    return np.sum(np.square(x))

def SNR(s, n):
    return 10*math.log10(power(s)/power(n))

In [13]:
def resample_convolve(speech_wav, RIR, SNRs):
    speech_wav_upsampled = resample(speech_wav, 16000, 48000, res_type='kaiser_fast')
    convolved = convolve(speech_wav_upsampled, RIR)
    convolved = convolved[:3*len(speech_wav)]
    convolved_downsampled = resample(convolved, 48000, 16000, res_type='kaiser_fast')
    current_SNR = SNR(speech_wav, convolved_downsampled)
    alpha = 1/math.sqrt(math.pow(10, (choice(SNRs) - current_SNR)/10))
    return speech_wav + alpha * convolved_downsampled

In [14]:
def snr_mixer(clean, noise, snr):
    # Normalizing to -25 dB FS
    rmsclean = (clean**2).mean()**0.5
    scalarclean = 10 ** (-25 / 20) / rmsclean
    clean = clean * scalarclean
    rmsclean = (clean**2).mean()**0.5

    rmsnoise = (noise**2).mean()**0.5
    scalarnoise = 10 ** (-25 / 20) /rmsnoise
    noise = noise * scalarnoise
    rmsnoise = (noise**2).mean()**0.5
    
    # Set the noise level for a given SNR
    noisescalar = np.sqrt(rmsclean / (10**(snr/20)) / rmsnoise)
    noisenewlevel = noise * noisescalar
    noisyspeech = clean + noisenewlevel
    return noisyspeech

In [15]:
def filter(waveform):
    sos = butter(10, 100, 'hp', fs=16000, output='sos') # 100Hz high pass filter
    return sosfilt(sos, waveform)

In [21]:
def generate_wavs(speech_dir, noise_dir, RIR_dir, speech_filenames, noise_filenames, SNRs, output_path, max_size):
    clean_output_path = output_path.joinpath('clean')
    noisy_output_path = output_path.joinpath('noisy')
    if not Path.exists(clean_output_path):
        os.makedirs(clean_output_path)
    if not Path.exists(noisy_output_path):
        os.makedirs(noisy_output_path)
        
    RIR_filenames = os.listdir(RIR_dir)
    
    for filename_speech in tqdm(speech_filenames):
        path_speech = speech_dir.joinpath(filename_speech)
        if get_samplerate(path_speech) == 16000:
            speech_wav, _ = load(path_speech, sr=None, mono=True)
        else:
            speech_wav, _ = load(path_speech, sr=16000, mono=True, res_type='kaiser_fast')
        
        # make sure all WAVs are 128000 samples (8s) long
        length_difference = 128000 - len(speech_wav)
        if length_difference == 0:
            pass
        if length_difference > 0:
            start = np.random.randint(0, length_difference)
            speech_wav = np.pad(speech_wav, (start, length_difference - start))
        if length_difference < 0:
            start = np.random.randint(0, -length_difference)
            speech_wav = speech_wav[start:start+128000]
            
        assert len(speech_wav) == 128000, 'Speech waveform has ' + str(len(speech_wav)) + ' samples instead of 128000 samples!'
        
#         RIR = choice(np.load(RIR_dir.joinpath(choice(RIR_filenames))).transpose())
#         speech_convolved_wav = resample_convolve(speech_wav, RIR, [10, 20, 30, 40])
        speech_convolved_wav = speech_wav
        
#         # HPF the speech to 100Hz
#         speech_convolved_wav = filter(speech_convolved_wav)
        
        # normalize speech part to -25dBFS
        rms = (speech_convolved_wav ** 2).mean() ** 0.5
        scalar = 10 ** (-25 / 20) / (rms)
        speech_convolved_wav = speech_convolved_wav * scalar

        clean_speech_filename = clean_output_path.joinpath(filename_speech.rsplit( ".", 1 )[0] + '.wav')
        sf.write(clean_speech_filename, speech_convolved_wav, 16000)
        
        if len(speech_filenames) * len(noise_filenames) > max_size:
            noise_filenames_sampled = sample(list(noise_filenames), max_size // len(speech_filenames))
        else:
            noise_filenames_sampled = noise_filenames
        
        for filename_noise in noise_filenames_sampled:
            path_noise = noise_dir.joinpath(filename_noise)
            if get_samplerate(path_noise) == 16000:
                noise_wav, _ = load(path_noise, sr=None, mono=True)
            else:
                noise_wav, _ = load(path_noise, sr=16000, mono=True, res_type='kaiser_fast')
                
            # HPF the noise to 100Hz
            noise_wav = filter(noise_wav)
                
            # normalize noise part to -25dBFS
            rms = (noise_wav ** 2).mean() ** 0.5
            scalar = 10 ** (-25 / 20) / (rms)
            noise_wav = noise_wav * scalar

            length_difference = len(speech_wav) - len(noise_wav)
            if length_difference == 0:
                pass
            if length_difference > 0:
                print("Noise was shorter than 8s!")
                start = np.random.randint(0, length_difference)
                noise_wav = np.pad(noise_wav, (start, length_difference - start))
            if length_difference < 0:
                start = np.random.randint(0, -length_difference)
                noise_wav = noise_wav[start:start+len(speech_wav)]

            desired_SNR = choice(SNRs)
            noisy_speech_wav = snr_mixer(speech_convolved_wav, noise_wav, desired_SNR)
            
            if np.max(np.abs(noisy_speech_wav)) > 1: # skip if we are clipping
                print('Noisy speech was clipping!')
                continue
            
            
            noisy_speech_filename = noisy_output_path.joinpath(filename_speech.rsplit( ".", 1 )[0] + '+' + filename_noise.rsplit( ".", 1 )[0] + '+SNR' + str(desired_SNR) + 'dB' + '.wav')
            sf.write(noisy_speech_filename, noisy_speech_wav, 16000)

In [22]:
def split_generate_wavs(speech_dir, seen_noise_dir, unseen_noise_dir, SNRs, max_size, output_dir):
    RIR_dir = Path()
    
    speech_filenames = os.listdir(speech_dir)
    speech_filenames = [filename for filename in speech_filenames if filename.endswith('.wav') and get_duration(filename=speech_dir.joinpath(filename)) >= 5]
    shuffle(speech_filenames)
    
    seen_noise_filenames = os.listdir(seen_noise_dir)
    seen_noise_filenames = [filename for filename in seen_noise_filenames if filename.endswith('.wav') and get_duration(filename=seen_noise_dir.joinpath(filename)) >= 8]
    shuffle(seen_noise_filenames)
    
    unseen_noise_filenames = os.listdir(unseen_noise_dir)
    unseen_noise_filenames = [filename for filename in unseen_noise_filenames if filename.endswith('.wav') and get_duration(filename=unseen_noise_dir.joinpath(filename)) >= 8]
    shuffle(unseen_noise_filenames)
    
    print('Using', len(speech_filenames), 'speech files.')

    test_seen_noise_output_path = output_dir.joinpath('testing_seen_noise')
    test_unseen_noise_output_path = output_dir.joinpath('testing_unseen_noise')
    
    # generate testing set on seen noise
    generate_wavs(speech_dir, seen_noise_dir, RIR_dir, speech_filenames, seen_noise_filenames, SNRs, test_seen_noise_output_path, int(max_size))
    
    # generate testing set on unseen noise
    generate_wavs(speech_dir, noise_dir, RIR_dir, speech_filenames, unseen_noise_filenames, SNRs, test_unseen_noise_output_path, int(max_size))

In [23]:
split_generate_wavs(speech_dir, seen_noise_dir, unseen_noise_dir, SNRs, max_size, output_dir)

Using 1100 speech files.


HBox(children=(FloatProgress(value=0.0, max=1100.0), HTML(value='')))

Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
N

Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!
Noisy speech was clipping!



NameError: name 'noise_dir' is not defined