In [327]:
import glob
import multiprocessing
import os
import re
import sys

import librosa
import numpy as np
import soundfile as sf
from IPython.display import display, Audio
from joblib import Parallel, delayed

from bwfsoundfile import BwfSoundFile

In [328]:
# Unique conditioning strings collected across all processed files.
# process_file() adds one normalized string per file to this set.
conditioning_text = set()

In [329]:
def process_file(file_path, root_dir, out_dir='Preprocessed/text_only/', drop_sample_length=16384):
    '''
    Build the conditioning text for one audio file and add it to the global
    `conditioning_text` set.

    The text is made of the file's path below `root_dir` (extension removed) followed by the
    `description` field of the BWF `bext` header when that field is non-empty. It is then
    normalized: digits are removed, the text is split on any run of non-word characters or
    underscores, the tokens are re-joined with single spaces, and the result is lower-cased.

    file_path: Path of the audio file to process.
    root_dir: Only directory structure below the given directory will be used to extract meta data.
    out_dir: Not used by this function; kept so existing calls keep working.
    drop_sample_length: Not used by this function; kept so existing calls keep working.
        (Originally documented as the maximum clip length, in samples, before a clip is dropped.)

    Returns the normalized conditioning text that was added to `conditioning_text`.
    '''
    with BwfSoundFile(file_path) as bwf_file:
        # Extract meta data from file path.
        # Strip off root directory. The directory name is escaped so that characters such as
        # '.', '+' or '\\' in it are matched literally instead of being read as regex syntax.
        root_pattern = re.escape(root_dir.strip('/').strip('\\'))
        cond_text = re.split(root_pattern, file_path, 1)[-1].strip('/').strip('\\')
        # Strip off extension. splitext only looks at the final path component, so a path
        # without an extension is kept whole instead of being reduced to an empty string.
        cond_text = os.path.splitext(cond_text)[0]

        # Extract extra meta data from BWF headers if available
        bwf_file.get_bext()
        meta_description = bwf_file.bext_info['description']
        if meta_description:
            cond_text += ' {}'.format(meta_description)

        # Remove numbers. No count/flags argument is passed: the previous code passed re.M as
        # the positional `count`, which capped the number of replacements at 8.
        cond_text = re.sub(r'\d+', '', cond_text)
        # Tokenize (splits on and removes any whitespace, dashes, underscores and any other
        # special characters). Newlines are handled here as separators, so the words on either
        # side of a line break stay separate tokens rather than being fused together.
        tokens = re.split(r'[\W_]+', cond_text)
        # Rejoin tokens with whitespace for easy feeding to embedding algorithm, strip
        # whitespace from beginning and end, and make lowercase.
        cond_text = ' '.join(tokens).strip().lower()

        conditioning_text.add(cond_text)

    return cond_text

In [330]:
# Output directory for the preprocessed text data.
out_dir = 'Preprocessed/text_only/'
# makedirs also creates the missing parent ('Preprocessed'), which os.mkdir cannot do,
# and exist_ok=True makes the cell safe to re-run.
os.makedirs(out_dir, exist_ok=True)

In [331]:
# Directory that is searched recursively for audio files.
root_dir = 'Source'
# Every path below root_dir that matches '*.wav', at any depth.
wav_pattern = os.path.join(root_dir, '**/*.wav')
file_list = glob.glob(wav_pattern, recursive=True)

In [332]:
# Collect the conditioning text of every discovered file into `conditioning_text`.
for filename in file_list:
    process_file(file_path=filename, root_dir=root_dir, out_dir=out_dir)

In [333]:
# Hand-written conditioning strings added on top of the ones derived from the files.
extra_items = [
    'banana',
    'shotgun cock',
    'polarity shotgun blast metallic explosion crunch punchy massive',
    'explosion',
    'fireball whoosh',
    'fireball explosion',
    'footsteps cartoon',
    'footsteps fart',
]
conditioning_text.update(extra_items)

In [334]:
# Freeze the collected strings into a list so they can be sliced into batches.
# Sorting gives a stable order: plain list(set) order depends on string hash randomization
# and therefore changes between kernel runs, which would reshuffle the vocab/embedding rows.
conditioning_text = sorted(conditioning_text)
print(len(conditioning_text))

2303


In [335]:
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.contrib.tensorboard.plugins import projector

In [336]:
# Directory that receives the vocab file, projector config and embedding checkpoint.
LOGDIR = 'embed_visualization/'
os.makedirs(LOGDIR, exist_ok=True)

In [337]:
# Write vocab to file: one conditioning string per row, no header row.
import csv
# newline='' stops the platform from rewriting the '\n' line terminator (e.g. to '\r\n' on
# Windows), as recommended by the csv module docs; utf-8 is set explicitly so that non-ASCII
# tokens do not depend on the platform's default encoding.
with open(os.path.join(LOGDIR, 'vocab.tsv'), 'w', newline='', encoding='utf-8') as vocab_file:
    wr = csv.writer(vocab_file, delimiter='\t', lineterminator='\n')
    wr.writerows([line] for line in conditioning_text)

In [338]:
# Embed every conditioning string with a TF-Hub text module and export the result
# (variable checkpoint + projector config) to LOGDIR for the TensorBoard embedding projector.
tf.reset_default_graph()

with tf.Session() as sess:
    embed_op = hub.Module("https://tfhub.dev/google/nnlm-en-dim128-with-normalization/1", trainable=False, name='embed')

    cond_text_ph = tf.placeholder(tf.string, shape=[None], name='cond_text_ph')
    embeddings_op = embed_op(cond_text_ph)

    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())

    # Run the same embedding op once per batch and collect the numpy results.
    # Building tf.concat inside the loop would add a new op (and a constant holding all
    # previous results) to the graph on every iteration.
    batch_size = 128
    batch_embeddings = []
    for batch_start in range(0, len(conditioning_text), batch_size):
        batch_end = batch_start + batch_size
        cond_text_batch = conditioning_text[batch_start:batch_end]
        batch_embeddings.append(sess.run(embeddings_op, {cond_text_ph: cond_text_batch}))

    if batch_embeddings:
        embeddings = np.concatenate(batch_embeddings, axis=0)
    else:
        # No conditioning text: keep a well-formed, empty (0, 128) array.
        embeddings = np.zeros((0, 128), dtype=np.float32)
    print(embeddings.shape)

    # Convert to variable for visualization in tensorboard
    embeddings_var = tf.get_variable('embeddings_var', initializer=embeddings)
    sess.run(embeddings_var.initializer)

    # Get projector config and summary writer
    config = projector.ProjectorConfig()
    summary_writer = tf.summary.FileWriter(LOGDIR)

    # add embeddings to config
    embeddings_config = config.embeddings.add()
    embeddings_config.tensor_name = embeddings_var.name

    # Link the embeddings to their metadata file. The path is given relative to LOGDIR;
    # the file is expected to hold one conditioning string per row, in the same order as
    # the rows of the embedding matrix.
    embeddings_config.metadata_path = 'vocab.tsv'

    # save a configuration file that TensorBoard will read during startup
    projector.visualize_embeddings(summary_writer, config)

    # save our embedding
    saver_embed = tf.train.Saver([embeddings_var])
    saver_embed.save(sess, os.path.join(LOGDIR, 'embed.ckpt'), 1)

tf.reset_default_graph()

INFO:tensorflow:Saver not created because there are no variables in the graph to restore
(2303, 128)
