diff --git a/datasets/mailabs.py b/datasets/mailabs.py index 5eefa53..7c810cc 100644 --- a/datasets/mailabs.py +++ b/datasets/mailabs.py @@ -1,3 +1,8 @@ +"""mailabs dataset is sampled at 16 kHz with 0.5 seconds of silence + at the start and end of the audio data. Make sure to change the + sample_rate hparam to match this. +""" + from concurrent.futures import ProcessPoolExecutor from functools import partial import numpy as np @@ -5,11 +10,11 @@ from util import audio -def build_from_path(in_dir, out_dir, books, hparams, num_workers=1, tqdm=lambda x: x): +def build_from_path(in_dir, out_dir, books, num_workers=1, tqdm=lambda x: x): '''Preprocesses the mailabs Speech dataset from a given input path into a given output directory. Args: - in_dir: The directory where you have downloaded the LJ Speech dataset + in_dir: The directory where you have downloaded the mailabs Speech dataset out_dir: The directory to write the output into num_workers: Optional number of worker processes to parallelize across tqdm: You can optionally pass tqdm to get a nice progress bar @@ -35,12 +40,12 @@ def build_from_path(in_dir, out_dir, books, hparams, num_workers=1, tqdm=lambda text = parts[2] futures.append( executor.submit(partial( - _process_utterance, out_dir, name, wav_path, text, hparams) + _process_utterance, out_dir, name, wav_path, text) )) return [future.result() for future in tqdm(futures)] -def _process_utterance(out_dir, name, wav_path, text, hparams): +def _process_utterance(out_dir, name, wav_path, text): '''Preprocesses a single utterance audio/text pair. 
This writes the mel and linear scale spectrograms to disk and returns a tuple to write @@ -57,17 +62,17 @@ def _process_utterance(out_dir, name, wav_path, text, hparams): ''' # Load the audio to a numpy array: - wav = audio.load_wav(wav_path, hparams) + wav = audio.load_wav(wav_path) # trim silences here - wav = audio.trim_silence(wav, hparams) + wav = audio.trim_silence(wav) # Compute the linear-scale spectrogram from the wav: - spectrogram = audio.spectrogram(wav, hparams).astype(np.float32) + spectrogram = audio.spectrogram(wav).astype(np.float32) n_frames = spectrogram.shape[1] # Compute a mel-scale spectrogram from the wav: - mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32) + mel_spectrogram = audio.melspectrogram(wav).astype(np.float32) # Write the spectrograms to disk: spectrogram_filename = 'mailabs-spec-{}.npy'.format(name) diff --git a/preprocess.py b/preprocess.py index 56fe0c1..01acd88 100644 --- a/preprocess.py +++ b/preprocess.py @@ -3,7 +3,7 @@ from multiprocessing import cpu_count from tqdm import tqdm from datasets import amy, blizzard, ljspeech, kusal, mailabs -from hparams import hparams +from hparams import hparams, hparams_debug_string def preprocess_blizzard(args): @@ -47,11 +47,11 @@ def preprocess_mailabs(args): os.makedirs(out_dir, exist_ok=True) books = args.books metadata = mailabs.build_from_path( - in_dir, out_dir, books, args.hparams, args.num_workers, tqdm) - write_metadata(metadata, out_dir, args.hparams) + in_dir, out_dir, books, args.num_workers, tqdm) + write_metadata(metadata, out_dir) -def write_metadata(metadata, out_dir, hparams=hparams): +def write_metadata(metadata, out_dir): with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f: for m in metadata: f.write('|'.join([str(x) for x in m]) + '\n') @@ -76,9 +76,6 @@ def main(): '--books', help='comma-seperated and no space name of books i.e hunter_space,pink_fairy_book,etc.', ) - parser.add_argument( - '--hparams', default='', - 
help='Hyperparameter overrides as a comma-separated list of name=value pairs') parser.add_argument('--num_workers', type=int, default=cpu_count()) args = parser.parse_args() @@ -89,7 +86,7 @@ def main(): parser.error( "--mailabs_books_dir required if mailabs is chosen for dataset.") - args.hparams = hparams.parse(args.hparams) + print(hparams_debug_string()) if args.dataset == 'amy': preprocess_amy(args) diff --git a/util/audio.py b/util/audio.py index 89b38e3..1f79458 100644 --- a/util/audio.py +++ b/util/audio.py @@ -6,30 +6,29 @@ from scipy import signal from hparams import hparams - -def load_wav(path, hparams=hparams): +def load_wav(path): return librosa.core.load(path, sr=hparams.sample_rate)[0] -def save_wav(wav, path, hparams=hparams): +def save_wav(wav, path): wav *= 32767 / max(0.01, np.max(np.abs(wav))) librosa.output.write_wav(path, wav.astype(np.int16), hparams.sample_rate) -def trim_silence(wav, hparams=hparams): +def trim_silence(wav): return librosa.effects.trim( wav, top_db=hparams.trim_top_db, frame_length=hparams.trim_fft_size, hop_length=hparams.trim_hop_size)[0] -def spectrogram(y, hparams=hparams): +def spectrogram(y): D = _stft(y) S = _amp_to_db(np.abs(D)) - hparams.ref_level_db return _normalize(S) -def inv_spectrogram(spectrogram, hparams=hparams): +def inv_spectrogram(spectrogram): '''Converts spectrogram to waveform using librosa''' S = _db_to_amp(_denormalize(spectrogram) + hparams.ref_level_db) # Convert back to linear @@ -37,20 +36,20 @@ def inv_spectrogram(spectrogram, hparams=hparams): return _griffin_lim(S ** hparams.power) -def inv_spectrogram_tensorflow(spectrogram, hparams=hparams): +def inv_spectrogram_tensorflow(spectrogram): '''Builds computational graph to convert spectrogram to waveform using TensorFlow.''' S = _db_to_amp_tensorflow(_denormalize_tensorflow( spectrogram) + hparams.ref_level_db) return _griffin_lim_tensorflow(tf.pow(S, hparams.power)) -def melspectrogram(y, hparams=hparams): +def melspectrogram(y): D = 
_stft(y) S = _amp_to_db(_linear_to_mel(np.abs(D))) - hparams.ref_level_db return _normalize(S) -def find_endpoint(wav, threshold_db=-40, min_silence_sec=0.8, hparams=hparams): +def find_endpoint(wav, threshold_db=-40, min_silence_sec=0.8): window_length = int(hparams.sample_rate * min_silence_sec) hop_length = int(window_length / 4) threshold = _db_to_amp(threshold_db) @@ -60,7 +59,7 @@ def find_endpoint(wav, threshold_db=-40, min_silence_sec=0.8, hparams=hparams): return len(wav) -def _griffin_lim(S, hparams=hparams): +def _griffin_lim(S): '''librosa implementation of Griffin-Lim Based on https://github.com/librosa/librosa/issues/434 ''' @@ -73,7 +72,7 @@ def _griffin_lim(S, hparams=hparams): return y -def _griffin_lim_tensorflow(S, hparams=hparams): +def _griffin_lim_tensorflow(S): '''TensorFlow implementation of Griffin-Lim Based on https://github.com/Kyubyong/tensorflow-exercises/blob/master/Audio_Processing.ipynb ''' @@ -109,7 +108,7 @@ def _istft_tensorflow(stfts): return tf.contrib.signal.inverse_stft(stfts, win_length, hop_length, n_fft) -def _stft_parameters(hparams=hparams): +def _stft_parameters(): n_fft = (hparams.num_freq - 1) * 2 hop_length = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate) win_length = int(hparams.frame_length_ms / 1000 * hparams.sample_rate) @@ -128,7 +127,7 @@ def _linear_to_mel(spectrogram): return np.dot(_mel_basis, spectrogram) -def _build_mel_basis(hparams=hparams): +def _build_mel_basis(): n_fft = (hparams.num_freq - 1) * 2 return librosa.filters.mel(hparams.sample_rate, n_fft, n_mels=hparams.num_mels, fmin=hparams.min_mel_freq, fmax=hparams.max_mel_freq) @@ -146,13 +145,13 @@ def _db_to_amp_tensorflow(x): return tf.pow(tf.ones(tf.shape(x)) * 10.0, x * 0.05) -def _normalize(S, hparams=hparams): +def _normalize(S): return np.clip((S - hparams.min_level_db) / -hparams.min_level_db, 0, 1) -def _denormalize(S, hparams=hparams): +def _denormalize(S): return (np.clip(S, 0, 1) * -hparams.min_level_db) + hparams.min_level_db 
-def _denormalize_tensorflow(S, hparams=hparams): +def _denormalize_tensorflow(S): return (tf.clip_by_value(S, 0, 1) * -hparams.min_level_db) + hparams.min_level_db