In [151]:
from bwfsoundfile import BwfSoundFile
import soundfile as sf
import librosa
from IPython.display import display, Audio
import glob
import os
from joblib import Parallel, delayed
import multiprocessing
import sys
import re

In [152]:
conditioning_text = set()

In [153]:
class Filter:
    def __init__(self, pattern, flags=0, repl=r''):
        self.pattern = pattern
        self.flags = flags
        self.repl = repl

In [154]:
def process_file(file_path, root_dir, out_dir='Preprocessed/text_only/', drop_sample_length=16384):
    '''
    drop_sample_length: Maximum length of the audio clip (in samples) before the audio will be dropped. 
    This ensures we do learn to generate clipped audio.
    root_dir: Only directory structure below the given directory will be used to extract meta data
    '''
    with BwfSoundFile(file_path) as bwf_file:      
        # Extract meta data from file path
        # Strip off root directory
        cond_text = re.split(root_dir.strip('/').strip('\\'), file_path, 1)[-1].strip('/').strip('\\')
        # Strip off extension
        cond_text = cond_text.split('.')[:-1]
        cond_text = '.'.join(cond_text)
        
        # Split directories by directory seperator
        cond_text = re.split(r'\\|/', cond_text)
        cond_text = ', '.join(cond_text)

        # Extract extra meta data from BWF headers if available
        bwf_file.get_bext()
        meta_description = bwf_file.bext_info['description']
        if meta_description:
            cond_text += ', {}'.format(meta_description)
        
        # Dataset specific text filters
        filters = []
        filters.append(Filter(r'^Animal HyperRealism$'))
        filters.append(Filter(r'^Eclectic Whooshes$'))
        filters.append(Filter(r'^Gamemaster Audio - Pro Sound Collection$'))
        filters.append(Filter(r'^Polarity$'))
        filters.append(Filter(r'^Swordfighter$'))
        filters.append(Filter(r'^The Borax Experiement$'))
        filters.append(Filter(r'^Lethal Energies$'))
        filters.append(Filter(r'Borax'))
        filters.append(Filter(r'^Borax Impacts$'))
        filters.append(Filter(r'Cheats Section'))
        filters.append(Filter(r'Designed Section'))
        filters.append(Filter(r'^Construction Kit$'))
        filters.append(Filter(r'^Designed Weapons$'))
        filters.append(Filter(r'^Raw$'))
        filters.append(Filter(r'^Designed$'))
        filters.append(Filter(r'^Articulated--Magic Elements$'))
        filters.append(Filter(r'^Sound Design$'))
        filters.append(Filter(r'^CK'))
        filters.append(Filter(r'^DK'))
        
        # Non-Dataset specific filers
        filters.append(Filter(r'Various', re.IGNORECASE))
        filters.append(Filter(r'Miscellaneous', re.IGNORECASE))
        filters.append(Filter(r'\d\s*meters?', re.IGNORECASE))
        filters.append(Filter(r'\d'))
        filters.append(Filter(r'(\s|^)(\w{1,2})(?=\s|$)')) # 1-2 character words
        filters.append(Filter(r'(\s|^)(\.+)(?=\s|$)')) # Remove random floating periods
        filters.append(Filter(r'-(\s*-)+', repl='-')) # Collapse multiple hyphens
        filters.append(Filter(r'\W+$|^\W+')) # Strip leading and trailing non-word characters
        
            
        # Split on commas to further pre-process each description phrase before passing off to nltk
        cond_text = cond_text.split(',')
        for i, _ in enumerate(cond_text):
            # Strip whitespace from beginning and end
            cond_text[i] = cond_text[i].strip()
            # Convert underscores to spaces
            cond_text[i] = re.sub(r'_', r' ', cond_text[i])
            # Remove dash between letters and numbers, also add spaces between directly attached numbers and letters
            cond_text[i] = re.sub(r'([a-zA-Z])-?(?=\d)', r'\1 ', cond_text[i])
            cond_text[i] = re.sub(r'(\d)-?(?=[a-zA-Z])', r'\1 ', cond_text[i])
            # Run pattern filters
            for filter_ in filters:
                cond_text[i] = re.sub(filter_.pattern, filter_.repl, cond_text[i], flags=filter_.flags)
            # Strip whitespace from beginning and end
            cond_text[i] = cond_text[i].strip()
        
        # Remove Empty descrition phrases
        cond_text = list(filter(None, cond_text))
        
        # Rejoin everything with commas for better nltk parsing
        cond_text = ', '.join(cond_text)
        
        # Remove Numbers and newlines
#         cond_text = re.sub(r'[\d\r\n]+', '', cond_text, re.M)
        # Tokenize (splits on and removes any whitespace, dashes, underscores and any other special characters)
#         cond_text = re.split(r'[\W_]+', cond_text)
        # Rejoin tokens with whitespace for easy feeding to embedding algorithm
#         cond_text = ' '.join(cond_text)

        # Make Lowercase
#         cond_text = cond_text.lower()

        conditioning_text.add(cond_text)

In [155]:
out_dir = 'Preprocessed/text_only/'
if not os.path.isdir(out_dir):
    os.mkdir(out_dir)

In [156]:
file_list = []
root_dir = 'Source'
for filename in glob.iglob(os.path.join(root_dir, '**/*.wav'), recursive=True):
    file_list.append(filename)

In [157]:
for filename in file_list:
    process_file(filename, root_dir, out_dir)

In [158]:
conditioning_text = list(conditioning_text)
print(len(conditioning_text))
print(conditioning_text)

2269




In [159]:
import spacy
from spacy import displacy
import random
from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, compile_infix_regex, compile_suffix_regex
from spacy.lang.char_classes import LIST_ELLIPSES
from spacy.lang.char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA

In [160]:
# Create custom tokenizer that treats hyphenated words as single words
def custom_tokenizer(nlp):
    HYPHENS = r'--|---|——|~'
    infixes = (LIST_ELLIPSES + LIST_ICONS +
               [r'(?<=[0-9])[+\-\*^](?=[0-9-])',
                r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
                r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
                r'(?<=[{a}])[?";:=,.]*(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS),
                r'(?<=[{a}"])[:<>=/](?=[{a}])'.format(a=ALPHA)])
    infix_re = compile_infix_regex(infixes)
    prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
    suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)

    return Tokenizer(nlp.vocab, prefix_search=prefix_re.search,
                                suffix_search=suffix_re.search,
                                infix_finditer=infix_re.finditer,
                                token_match=None)

In [161]:
nlp = spacy.load('en')
nlp.tokenizer = custom_tokenizer(nlp)

In [162]:
final_cond_text = set()
for text in conditioning_text:
    doc = nlp(text)
    
    useful_pos = ('VERB', 'NOUN', 'PROPN', 'ADJ', 'ADV')
    candidate_conditioning_text = []
    for i, token in enumerate(doc):
        if token.pos_ in useful_pos:
            sentence = [token]
            for left in reversed(list(token.lefts)):
                if left.pos_ in useful_pos:
                    sentence.insert(0, left)
            for right in token.rights:
                if right.pos_ in useful_pos:
                    sentence.append(right)
            candidate_conditioning_text.append((token.pos_, token.text, sentence))

    if candidate_conditioning_text:
        for text in candidate_conditioning_text:
            new_cond_text = ' '.join([token.text for token in text[2]]).lower()
            final_cond_text.add(new_cond_text)

In [163]:
extra_items = ['small-clawed otter', 'magic', 'Ghostly Resonance', 'banana', 'shotgun cock', 'polarity shotgun blast metallic explosion crunch punchy massive', 'explosion', 'fireball whoosh', 'fireball explosion', 'footsteps cartoon', 'footsteps fart']
for item in extra_items:
    final_cond_text.add(item.lower())

In [164]:
conditioning_text = list(final_cond_text)
print(len(conditioning_text))
print(conditioning_text.index('small-clawed otter asia rodent squeak overlaps'))
print(conditioning_text[:5000])

7565
4689




In [165]:
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.contrib.tensorboard.plugins import projector

In [166]:
LOGDIR = 'embed_visualization/'
if not os.path.isdir(LOGDIR):
    os.mkdir(LOGDIR)

In [167]:
# Write vocab to file
import csv
with open(os.path.join(LOGDIR, 'vocab.tsv'), 'w') as vocab_file:
    wr = csv.writer(vocab_file, delimiter='\t', lineterminator='\n')
    for line in conditioning_text:
        wr.writerow([line])

In [168]:
tf.reset_default_graph()

with tf.Session() as sess:
    embed_op = hub.Module("https://tfhub.dev/google/elmo/2", trainable=False, name='embed')
    
    cond_text_ph = tf.placeholder(tf.string, shape=[None], name='cond_text_ph')
    embeddings_op = embed_op(cond_text_ph)

    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    
    embeddings = tf.zeros([0, 1024])
    batch_size = 128
    for batch_start in range(0, len(conditioning_text), batch_size):
        batch_end = batch_start + batch_size
        cond_text_batch = conditioning_text[batch_start:batch_end]
        embeddings = sess.run(tf.concat([embeddings, embeddings_op], 0), {cond_text_ph: cond_text_batch})
    print(embeddings.shape)
    
    # Convert to variable for visualization in tensorboard
    embeddings_var = tf.get_variable('embeddings_var', initializer=embeddings)
    sess.run(embeddings_var.initializer)
    
    # Get projector config and summary writer
    config = projector.ProjectorConfig()
    summary_writer = tf.summary.FileWriter(LOGDIR)
    
    # add embeddings to config
    embeddings_config = config.embeddings.add()
    embeddings_config.tensor_name = embeddings_var.name
    
    # link the embeddings to their metadata file. In this case, the file that contains
    # the 500 most popular words in our vocabulary
    embeddings_config.metadata_path = 'vocab.tsv'
    
    # save a configuration file that TensorBoard will read during startup
    projector.visualize_embeddings(summary_writer, config)

    # save our embedding
    saver_embed = tf.train.Saver([embeddings_var])
    saver_embed.save(sess, LOGDIR + '/embed.ckpt', 1)
    
tf.reset_default_graph()

INFO:tensorflow:Saver not created because there are no variables in the graph to restore
(7565, 1024)
