In [31]:
import os
import math
from random import randrange, shuffle, sample
import numpy as np
from scipy.io import wavfile
from pathlib import Path
from tqdm.notebook import tqdm

In [32]:
# Directory holding the dry speech WAVs. The default is the original local
# "Parsed TIMIT" location; set the SPEECH_DIR environment variable
# (e.g. SPEECH_DIR=./WAVs/speech) to run the notebook on another machine.
speech_dir = Path(os.environ.get(
    'SPEECH_DIR',
    '/home/guillaume/Nextcloud/Thesis_Guillaume/Datasets/Dry Speech/Parsed TIMIT/parsed_output',
))
# Directory holding the noise WAVs.
noise_dir = Path('./WAVs/noise')

# Signal-to-noise ratios (in dB) at which every speech/noise pair is mixed.
SNRs = [-5, 0, 5, 10, 25]

# Fractions of the file lists assigned to each split; training gets the remainder.
testing_frac = 0.1
validation_frac = 0.1
training_frac = 1 - testing_frac - validation_frac

# Cap on (speech files x noise files x SNRs); a falsy value disables the cap.
max_size = 10000

# Root directory under which the generated dataset is written.
output_dir = Path('./WAVs/dataset')

In [33]:
def power(x):
    """Return the sum of the squared samples of `x`.

    Integer input (for example int16 PCM samples) is promoted to float64
    before squaring; squaring in a narrow integer dtype would wrap around
    and give a wrong, possibly negative, result.

    Args:
        x: array-like of samples.

    Returns:
        The sum of squares as a NumPy scalar.
    """
    samples = np.asarray(x)
    if np.issubdtype(samples.dtype, np.integer):
        samples = samples.astype(np.float64)
    return np.sum(np.square(samples))

def SNR(s, n):
    """Return, in decibels, the ratio of `power(s)` to `power(n)`.

    Args:
        s: signal samples.
        n: noise samples.

    Returns:
        10 * log10(power(s) / power(n)) as a float.
    """
    signal_power = power(s)
    noise_power = power(n)
    return 10 * math.log10(signal_power / noise_power)

In [34]:
def _clean_stem(filename):
    """Return `filename` without its last extension and with underscores removed."""
    return filename.rsplit('.', 1)[0].replace('_', '')


def _fit_to_length(wav, target_length):
    """Zero-pad or crop `wav` at a random offset so it has `target_length` samples."""
    length_difference = target_length - len(wav)
    if length_difference > 0:
        start = randrange(0, length_difference)
        return np.pad(wav, (start, length_difference - start))
    if length_difference < 0:
        start = randrange(0, -length_difference)
        return wav[start:start + target_length]
    return wav


def generate_wavs(speech_dir, noise_dir, speech_filenames, noise_filenames, SNRs, output_path):
    """Write clean speech and noisy mixtures for every speech/noise/SNR combination.

    Every speech file is padded or cropped to 64000 samples and written to
    `output_path/clean`. It is then mixed with every noise file at every
    requested SNR, and each mixture is written to `output_path/noisy` as
    `<speech>_<noise>_SNR<value>dB.wav`.

    The noise gain is derived from the noise segment that is actually mixed
    (after padding/cropping to the speech length), so the mixture has the
    requested SNR even when the noise file is longer than the speech.
    Mixtures are clipped to the int16 range before being written so that
    loud samples saturate instead of wrapping around.

    A silent (all-zero) speech or noise segment makes the SNR undefined and
    is not handled here.

    Args:
        speech_dir: Path of the directory containing the speech WAVs.
        noise_dir: Path of the directory containing the noise WAVs.
        speech_filenames: iterable of speech file names inside `speech_dir`.
        noise_filenames: iterable of noise file names inside `noise_dir`.
        SNRs: iterable of target signal-to-noise ratios in dB.
        output_path: Path of the directory receiving `clean/` and `noisy/`.

    Raises:
        AssertionError: if a file is not sampled at 16 kHz.
    """
    full_scale = 2**15  # assumes 16-bit PCM input -- TODO confirm all WAVs are int16

    clean_output_path = output_path.joinpath('clean')
    noisy_output_path = output_path.joinpath('noisy')
    clean_output_path.mkdir(parents=True, exist_ok=True)
    noisy_output_path.mkdir(parents=True, exist_ok=True)

    for filename_speech in tqdm(speech_filenames):
        sample_rate, speech_wav = wavfile.read(speech_dir.joinpath(filename_speech))
        assert sample_rate == 16000, 'Sample rate is not 16kHz!'

        # make sure all WAVs are 64000 samples (4s) long
        speech_wav = _fit_to_length(speech_wav, 64000)
        assert len(speech_wav) == 64000, 'Speech waveform has ' + str(len(speech_wav)) + ' samples instead of 64000 samples!'

        speech_stem = _clean_stem(filename_speech)
        wavfile.write(clean_output_path.joinpath(speech_stem + '.wav'), 16000, speech_wav.astype(np.int16))
        speech_wav = speech_wav / full_scale

        for filename_noise in noise_filenames:
            sample_rate, noise_wav = wavfile.read(noise_dir.joinpath(filename_noise))
            assert sample_rate == 16000, 'Sample rate is not 16kHz!'
            noise_wav = noise_wav / full_scale
            noise_stem = _clean_stem(filename_noise)

            for desired_SNR in SNRs:
                # Pick the noise segment first: the gain must be computed on the
                # samples that are really added, not on the whole noise file.
                noise_segment = _fit_to_length(noise_wav, len(speech_wav))
                current_SNR = SNR(speech_wav, noise_segment)
                # alpha so the SNR matches the wanted SNR
                alpha = 1 / math.sqrt(math.pow(10, (desired_SNR - current_SNR) / 10))
                noisy_speech_wav = speech_wav + alpha * noise_segment

                # Saturate instead of letting the int16 cast wrap around.
                noisy_speech_wav = np.clip(noisy_speech_wav * full_scale, -full_scale, full_scale - 1)
                noisy_speech_wav = noisy_speech_wav.astype(np.int16)
                noisy_speech_filename = noisy_output_path.joinpath(
                    speech_stem + '_' + noise_stem + '_SNR' + str(desired_SNR) + 'dB' + '.wav')
                wavfile.write(noisy_speech_filename, 16000, noisy_speech_wav)

In [35]:
def split_generate_wavs(speech_dir, noise_dir, SNRs, testing_frac, validation_frac, training_frac, max_size, output_dir):
    """Split the speech and noise files and generate the four dataset parts.

    The entries of `speech_dir` and `noise_dir` are listed and shuffled.
    Speech files are split into training / validation / testing parts
    according to the given fractions. Noise files are split into "seen" and
    "unseen" parts at `training_frac`. `generate_wavs` is then called for:

    * `output_dir/training`             -- training speech, seen noise
    * `output_dir/validation`           -- validation speech, seen noise
    * `output_dir/testing_seen_noise`   -- testing speech, seen noise
    * `output_dir/testing_unseen_noise` -- testing speech, unseen noise

    Args:
        speech_dir: Path of the directory containing the speech WAVs.
        noise_dir: Path of the directory containing the noise WAVs.
        SNRs: list of target signal-to-noise ratios in dB.
        testing_frac: fraction of the speech files used for testing.
        validation_frac: fraction of the speech files used for validation.
        training_frac: fraction of the speech files used for training; also
            the fraction of the noise files treated as "seen".
        max_size: cap on (speech files x noise files x SNRs); when exceeded,
            the speech files are randomly subsampled. A falsy value disables
            the cap.
        output_dir: Path of the directory receiving the four sub-directories.

    Raises:
        AssertionError: if the three fractions do not sum to 1.
    """
    # Compare with a tolerance: an exact float comparison rejects valid
    # splits such as 0.7 / 0.15 / 0.15 because of rounding.
    assert math.isclose(testing_frac + validation_frac + training_frac, 1.0), 'Split fractions do not sum to 1!'

    speech_filenames = os.listdir(speech_dir)
    shuffle(speech_filenames)

    noise_filenames = os.listdir(noise_dir)
    shuffle(noise_filenames)

    if max_size:
        if len(speech_filenames) * len(noise_filenames) * len(SNRs) > max_size:
            reduce_factor = max_size / (len(SNRs) * len(speech_filenames) * len(noise_filenames))
            speech_filenames = sample(speech_filenames, int(len(speech_filenames) * reduce_factor))

    # This product is an upper bound on the number of noisy files: training and
    # validation speech is only mixed with the seen noise. Clean files are not counted.
    print('Will output', len(speech_filenames) * len(noise_filenames) * len(SNRs), 'WAV files.')

    speech_train, speech_val, speech_test = np.split(
        speech_filenames,
        [int(training_frac*len(speech_filenames)), int((training_frac + validation_frac)*len(speech_filenames))])

    noise_seen, noise_unseen = np.split(noise_filenames,
                                        [int(training_frac*len(noise_filenames))])

    training_output_path = output_dir.joinpath('training')
    val_output_path = output_dir.joinpath('validation')
    test_seen_noise_output_path = output_dir.joinpath('testing_seen_noise')
    test_unseen_noise_output_path = output_dir.joinpath('testing_unseen_noise')

    # generate training set
    generate_wavs(speech_dir, noise_dir, speech_train, noise_seen, SNRs, training_output_path)

    # generate validation set
    generate_wavs(speech_dir, noise_dir, speech_val, noise_seen, SNRs, val_output_path)

    # generate testing set on seen noise
    generate_wavs(speech_dir, noise_dir, speech_test, noise_seen, SNRs, test_seen_noise_output_path)

    # generate testing set on unseen noise
    generate_wavs(speech_dir, noise_dir, speech_test, noise_unseen, SNRs, test_unseen_noise_output_path)

In [36]:
# Generate the training, validation and both testing sets using the settings from the configuration cell.
split_generate_wavs(speech_dir, noise_dir, SNRs, testing_frac, validation_frac, training_frac, max_size, output_dir)

Will output 9975 WAV files.


HBox(children=(FloatProgress(value=0.0, max=76.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


