In [1]:
import logging
import os
import shutil
import time
from argparse import ArgumentParser

from audio_reader import AudioReader
from constants import c
from utils import InputsGenerator

In [31]:
%load_ext autoreload
%autoreload

In [2]:
def arg_parse():
    """Build the command-line parser for the cache / training-inputs script.

    Returns:
        ArgumentParser: the configured parser. Nothing is parsed here; the
        caller is expected to invoke ``parse_args()`` on the result.
    """
    parser = ArgumentParser()

    # Mandatory locations.
    for required_option in ('--audio_dir', '--cache_output_dir'):
        parser.add_argument(required_option, required=True)

    # Boolean switches: False unless present on the command line.
    for switch in ('--regenerate_full_cache',
                   '--update_cache',
                   '--generate_training_inputs',
                   '--multi_threading'):
        parser.add_argument(switch, action='store_true')

    # Optional string values (None when omitted).
    parser.add_argument('--unseen_speakers')  # p225,p226 example.
    parser.add_argument('--get_embeddings')  # p225 example.

    return parser


def regenerate_full_cache(audio_reader, args):
    """Delete the cache directory, recreate it empty and rebuild the cache.

    Args:
        audio_reader: object exposing ``build_cache()``; it is called once the
            cache directory has been recreated.
        args: object with a ``cache_output_dir`` attribute; a leading ``~`` is
            expanded.

    Raises:
        OSError: if an existing cache directory cannot be removed, or the
            directory cannot be recreated.
    """
    cache_output_dir = os.path.expanduser(args.cache_output_dir)
    print('The directory containing the cache is {}.'.format(cache_output_dir))
    print('Going to wipe out and regenerate the cache in 5 seconds. Ctrl+C to kill this script.')
    # Grace period so the user can abort before anything is deleted.
    time.sleep(5)
    try:
        shutil.rmtree(cache_output_dir)
    except FileNotFoundError:
        # No previous cache to wipe. Any other failure (permissions, Ctrl+C
        # during the delete, ...) is allowed to propagate instead of being
        # silently swallowed and resurfacing as a confusing makedirs() error.
        pass
    os.makedirs(cache_output_dir)
    audio_reader.build_cache()


def generate_cache_from_training_inputs(audio_reader, args):
    """Configure an ``InputsGenerator`` over the cache directory and start it.

    Args:
        audio_reader: forwarded unchanged as the generator's ``audio_reader``.
        args: object providing ``cache_output_dir`` (a leading ``~`` is
            expanded) and ``multi_threading``.

    The generator is always created with ``max_count_per_class=1000`` and
    ``speakers_sub_list=None``.
    """
    generator_options = dict(
        cache_dir=os.path.expanduser(args.cache_output_dir),
        audio_reader=audio_reader,
        max_count_per_class=1000,
        speakers_sub_list=None,
        multi_threading=args.multi_threading,
    )
    InputsGenerator(**generator_options).start_generation()

# Main()

Parse arguments

In [9]:
# Read the command-line arguments: the multi-threading switch, the cache output directory, and the audio directory.
# args = arg_parse().parse_args()


Define working directories

In [10]:
# Root of the dataset tree. Set the DEEP_SPEAKER_DATA_DIR environment variable
# to point the notebook at another machine's data without editing this cell;
# the fallback is the original hard-coded location.
DS_DIR = os.environ.get('DEEP_SPEAKER_DATA_DIR', '/Users/j/deep-speaker-data')
# Trailing slashes are kept so the derived paths are unchanged for the default root.
AUDIO_DIR = DS_DIR + '/VCTK-Corpus/'
CACHE_DIR = DS_DIR + '/cache/'

AUDIO_READER: Parameters

In [26]:
# Passed to AudioReader as `sample_rate`; presumably in Hz (the caching note
# further down mentions 8KHz) — confirm against AudioReader.
config_sample_rate = 8000
# Shared switch passed as `multi_threading` to both AudioReader and InputsGenerator.
multi_threading = True

In [27]:
# Create the audio reader: input audio comes from AUDIO_DIR, its cache output
# directory is CACHE_DIR, and it is given the sample rate and threading switch
# configured above.
# NOTE(review): the original comment says the reader provides a cache (dictionary)
# and metadata (dictionary) — not verifiable from this notebook; confirm in audio_reader.py.
audio_reader = AudioReader(input_audio_dir=AUDIO_DIR,
                           output_cache_dir=CACHE_DIR,
                           sample_rate=config_sample_rate,
                           multi_threading=multi_threading)

regenerate_full_cache(audio_reader, args)

In [28]:
# Generate the cache for the audio files. Caching usually involves sampling the WAV files at 8 kHz and trimming the silences.
# regenerate_full_cache(audio_reader, args)

In [35]:
# Wipe the cache directory and recreate it empty before the cache is rebuilt.
cache_output_dir = os.path.expanduser(CACHE_DIR)
print('The directory containing the cache is {}.'.format(cache_output_dir))
print('Going to wipe out and regenerate the cache in 5 seconds. Ctrl+C to kill this script.')
# Grace period so the run can be interrupted before anything is deleted.
time.sleep(5)
try:
    shutil.rmtree(cache_output_dir)
except FileNotFoundError:
    # No previous cache to remove. Other failures (permissions, an interrupt
    # during deletion, ...) now surface instead of being silently swallowed.
    pass
os.makedirs(cache_output_dir)

The directory containing the cache is /Users/j/deep-speaker-data/cache/.
Going to wipe out and regenerate the cache in 5 seconds. Ctrl+C to kill this script.


In [36]:
# Rebuild the audio cache through the reader.
# NOTE(review): per this notebook's own description, caching samples the WAV
# files and trims silences — confirm in AudioReader.build_cache.
audio_reader.build_cache()

generate_cache_from_training_inputs(audio_reader, args):

In [37]:
# Generate the inputs used in the softmax training: MFCC windows randomly sampled from the cached audio files and put in a unified pickle file.
# (Description kept from the original author; the generator's internals are not visible in this notebook.)

# Expand a leading '~' so the InputsGenerator receives a usable filesystem path.
cache_dir = os.path.expanduser(CACHE_DIR)

# Echo the resolved path as a sanity check before generation starts.
print(cache_dir)

/Users/j/deep-speaker-data/cache/


In [39]:
# Same configuration as generate_cache_from_training_inputs(), except that
# multi_threading comes from the notebook variable instead of parsed args.
# max_count_per_class=1000 presumably caps the samples kept per speaker — TODO confirm in utils.InputsGenerator.
# speakers_sub_list=None presumably selects all speakers — TODO confirm.
inputs_generator = InputsGenerator(cache_dir=cache_dir,
                                   audio_reader=audio_reader,
                                   max_count_per_class=1000,
                                   speakers_sub_list=None,
                                   multi_threading=multi_threading)

In [40]:
# Start input generation with the generator configured in the previous cell.
inputs_generator.start_generation()