In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import librosa

In [2]:
# Root directory of the corpus. File names are appended to it by plain string
# concatenation, so the trailing separator is required.
# Backslashes are escaped explicitly: "\c" is not a valid escape sequence and
# only produced a literal backslash by accident (newer Python versions warn).
DATA_PATH = "D:\\cv_corpus_v1\\"

In [3]:
def load_CSV():
    """Read the dev, train and test CSV manifests located under DATA_PATH.

    Returns:
        A tuple ``(eval_data, train_data, test_data)`` of DataFrames, read
        from "cv-valid-dev.csv", "cv-valid-train.csv" and "cv-valid-test.csv"
        respectively.
    """
    manifest_names = ("cv-valid-dev.csv", "cv-valid-train.csv", "cv-valid-test.csv")
    eval_data, train_data, test_data = [
        pd.read_csv(DATA_PATH + manifest) for manifest in manifest_names
    ]

    return eval_data, train_data, test_data

In [4]:
def load_wave(information, absolute=False):
    """Load one audio clip as a mono waveform resampled to 16 kHz.

    Args:
        information: Two-element pair describing the clip. Either
            ``(index, row)`` as produced by ``DataFrame.iterrows()`` or
            ``(row, params)`` as queued by ``input_fn``. Whichever element
            exposes a ``filename`` attribute is used as the row (the second
            element is preferred, matching the previous behaviour).
        absolute: When True, ``row.filename`` is used as-is; otherwise it is
            appended to ``DATA_PATH``.

    Returns:
        A tuple ``(audio, row)``: the decoded samples and the row they were
        loaded for.
    """
    first, second = information[0], information[1]
    # `input_fn` passes (row, params); a params dict has no `filename`
    # attribute, so fall back to the first element in that case instead of
    # failing with AttributeError.
    row = second if hasattr(second, 'filename') else first

    path = str(row.filename)
    if not absolute:
        path = DATA_PATH + path

    audio, _ = librosa.load(path, mono=True, sr=16000)

    return audio, row

In [5]:
# Read the three CSV manifests once; `experiment` later reads `train_data`
# and `eval_data` as module-level globals.
eval_data, train_data, test_data = load_CSV()

'abcdefghijklmnopqrstuvwxyz

In [6]:
def sequenceLength(original_lengths, params):
    """Convert waveform lengths (in samples) into frame counts.

    Computes ``floor((length - n_fft) / frame_step) + 1`` element-wise.

    Args:
        original_lengths: Tensor of sample counts.
        params: Dict providing 'n_fft' and 'frame_step'.

    Returns:
        An int32 tensor of frame counts.
    """
    samples = tf.cast(original_lengths, dtype=tf.float32)
    full_steps = tf.floor((samples - params['n_fft']) / params['frame_step'])

    return tf.cast(full_steps + 1, tf.int32)

In [7]:
def encode_labels(labels, params):
    """Translate label strings into per-character integer ids.

    Each string is split into single characters and every character is looked
    up by its position in ``params['alphabet']``; characters that are not in
    the alphabet map to -1.

    Args:
        labels: String tensor of transcripts.
        params: Dict providing 'alphabet'.

    Returns:
        The result of looking up the split characters in the char -> id table.
    """
    alphabet = list(params['alphabet'])
    ids = list(range(len(alphabet)))

    initializer = tf.contrib.lookup.KeyValueTensorInitializer(alphabet, ids)
    char_to_id = tf.contrib.lookup.HashTable(initializer, -1)

    split_chars = tf.string_split(labels, delimiter='')

    return char_to_id.lookup(split_chars)

In [8]:
def decode_codes(codes, params):
    """Translate integer ids back into characters (inverse of encode_labels).

    Ids are positions in ``params['alphabet']``; ids without an entry map to
    the empty string.

    Args:
        codes: Tensor of integer ids.
        params: Dict providing 'alphabet'.

    Returns:
        The result of looking up `codes` in the id -> char table.
    """
    alphabet = list(params['alphabet'])
    ids = list(range(len(alphabet)))

    initializer = tf.contrib.lookup.KeyValueTensorInitializer(ids, alphabet)
    id_to_char = tf.contrib.lookup.HashTable(initializer, '')

    return id_to_char.lookup(codes)

In [9]:
def decode_logits(logits, lengths, params):
    """Decode network logits into text with CTC beam search.

    Args:
        logits: Batch-major logits; they are transposed to time-major
            ([max_time x batch_size x num_classes]) before decoding.
        lengths: Number of valid frames per example; may be a scalar (single
            example) or any tensor holding one value per example.
        params: Dict providing 'alphabet' (used by `decode_codes`).

    Returns:
        A tuple ``(text, codes)``: the decoded characters and the decoded
        integer ids (int32) of the top beam.
    """
    # The decoder needs a rank-1 vector with one length per example.
    # Flattening handles the scalar, [batch] and [batch, 1] cases alike.
    # (The previous check inspected the rank of `tf.shape(lengths)`, which is
    # always 1, and therefore always reshaped to [1] -- failing for any batch
    # with more than one example.)
    lengths = tf.reshape(lengths, [-1])

    # input size [max_time x batch_size x num_classes]
    # CTC Word Beam Search Decoding Algorithm for predict words instead letters need here
    # Word Beam Search: A Connectionist Temporal Classification Decoding Algorithm
    # NOTE(review): merge_repeated=True may collapse genuinely doubled letters
    # in the beam output -- confirm this is intended.
    decoded, _ = tf.nn.ctc_beam_search_decoder(tf.transpose(logits, (1, 0, 2)), lengths, merge_repeated=True)

    # decoded[j] containing the decoded outputs, j=0 in our case
    codes = tf.cast(decoded[0], tf.int32)

    text = decode_codes(codes, params)

    return text, codes

In [10]:
class MelSpectrogram(tf.layers.Layer):
    """Layer turning a raw waveform into a log mel power spectrogram.

    Args:
        sr: Sampling rate handed to the mel filter bank.
        n_fft: STFT frame length and FFT size.
        frame_step: Hop between consecutive STFT frames.
        n_mel: Number of mel bands.
        fmin: Lowest frequency of the filter bank.
        fmax: Highest frequency of the filter bank.
        **kwargs: Forwarded to the base layer.
    """

    def __init__(self, sr, n_fft, frame_step, n_mel, fmin, fmax, **kwargs):
        super(MelSpectrogram, self).__init__(**kwargs)

        self.sr = sr
        self.n_fft = n_fft
        self.frame_step = frame_step
        self.n_mel = n_mel
        self.fmin = fmin
        self.fmax = fmax

    def call(self, signal):
        """Return ``log(mel_power + 1e-6)`` for `signal`."""
        # Short-time Fourier Transform (no padding of the last frame)
        stft = tf.contrib.signal.stft(signal, self.n_fft, self.frame_step, self.n_fft, pad_end=False)
        # magn_spectr = tf.abs(stft) maybe
        pow_spectr = tf.real(stft * tf.conj(stft)) # necessary tf.real()? TODO: check this

        # Keyword arguments keep this call valid on librosa releases where
        # the parameters of `filters.mel` are keyword-only.
        # NOTE(review): n_fft + 1 is passed on purpose as in the original; for
        # an even n_fft this should give the same number of frequency bins as
        # the STFT above -- confirm if odd sizes are ever used.
        mel_basis = librosa.filters.mel(sr=self.sr, n_fft=self.n_fft + 1, n_mels=self.n_mel,
                                        fmin=self.fmin, fmax=self.fmax)
        mel = tf.constant(np.transpose(mel_basis), dtype=tf.float32)

        mel_spectr = tf.tensordot(pow_spectr, mel, 1)
        mel_spectr.set_shape(pow_spectr.shape[:-1].concatenate(mel.shape[-1:]))

        # This helps to balance the importance of
        # detail in low and high energy regions of the spectrum, which more closely
        # matches human auditory sensitivity.
        mel_spectr = tf.log(mel_spectr + 1e-6)

        return mel_spectr

In [11]:
class BiLSTM(tf.layers.Layer):
    """Dilated 1-D convolution with optional left-side padding.

    Despite its name, this layer currently wraps a `tf.layers.Conv1D` with
    'valid' padding (a bidirectional ConvLSTM variant was tried and disabled).
    When `pad_valid` is set, the time axis is padded on the left by
    ``(kernel_size - 1) * dilation_rate`` steps before the convolution.
    """

    def __init__(self, filters, kernel_size, 
                 dilation_rate, use_bias=True, 
                 kernel_initializer=tf.glorot_normal_initializer(), 
                 pad_valid=True, **kwargs):
        super(BiLSTM, self).__init__(**kwargs)

        # Hyper-parameters kept for `call` and for inspection.
        self.filters = filters
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.kernel_initializer = kernel_initializer
        self.pad_valid = pad_valid

        self.lstm = tf.layers.Conv1D(
            filters,
            kernel_size,
            dilation_rate=dilation_rate,
            padding='valid',
            use_bias=use_bias,
            kernel_initializer=kernel_initializer,
        )

    def call(self, inputs):
        """Apply the (optionally left-padded) convolution to `inputs`."""
        if not self.pad_valid:
            return self.lstm(inputs)

        left_pad = (self.kernel_size - 1) * self.dilation_rate
        # Pad only the start of axis 1.
        paddings = tf.constant([(0, 0,), (1, 0), (0, 0)]) * left_pad

        return self.lstm(tf.pad(inputs, paddings))

In [12]:
class Residual(tf.layers.Layer):
    """Gated residual block.

    The batch-normalised input goes through two parallel `BiLSTM` layers: one
    branch is squashed with tanh, the other with sigmoid, and their product is
    projected by a kernel-size-1 convolution followed by tanh.
    """

    def __init__(self, filters, kernel_size, 
                 dilation_rate, pad_valid=True, **kwargs):
        super(Residual, self).__init__(**kwargs)

        # Two parallel branches with identical hyper-parameters.
        self.lstm1 = BiLSTM(filters, kernel_size, dilation_rate, pad_valid=pad_valid)
        self.lstm2 = BiLSTM(filters, kernel_size, dilation_rate, pad_valid=pad_valid)
        # Point-wise projection applied to the gated signal.
        self.out = tf.layers.Conv1D(filters=filters, kernel_size=1)

    def call(self, inputs, training=True):
        """Return ``(residual_output, skip_output)`` for `inputs`.

        `residual_output` is the block output added to `inputs`;
        `skip_output` is the block output on its own.
        """
        normalized = tf.layers.batch_normalization(inputs, training=training)

        signal_branch = self.lstm1(normalized)
        gate_branch = self.lstm2(normalized)

        signal = tf.nn.tanh(signal_branch)
        gate = tf.nn.sigmoid(gate_branch)

        projected = tf.nn.tanh(self.out(signal * gate))

        return projected + inputs, projected

In [13]:
class ResStack(tf.layers.Layer):
    """Chain of `Residual` blocks, one per dilation rate.

    Blocks are applied sequentially (each block receives the residual output
    of the previous one) and the skip outputs of all blocks are summed.
    """

    def __init__(self, filters, kernel_size, dilation_rates, pad_valid=True, **kwargs):
        super(ResStack, self).__init__(**kwargs)

        blocks = []
        for rate in dilation_rates:
            blocks.append(Residual(filters, kernel_size, rate, pad_valid))
        self.blocks = blocks

    def call(self, inputs, training=True):
        """Return the sum of the skip outputs of every block."""
        carried = inputs
        skip_total = 0

        for residual_block in self.blocks:
            carried, block_skip = residual_block(carried, training=training)
            skip_total = skip_total + block_skip

        return skip_total

In [14]:
class SpeechNet(tf.layers.Layer):
    """Acoustic model: mel spectrogram -> 1x1 conv -> residual stacks -> logits.

    `params` supplies 'sampling_rate', 'n_fft', 'frame_step', 'n_mel', 'fmin',
    'fmax', 'stack_filters', 'stack_kernel_size', 'stack_dilation_rates',
    'stacks' and 'alphabet'. The output layer has ``len(alphabet) + 1``
    channels (one extra class, presumably the CTC blank).
    """

    def __init__(self, params, **kwargs):
        super(SpeechNet, self).__init__(**kwargs)

        stack_filters = params['stack_filters']
        n_outputs = len(params['alphabet']) + 1

        self.mel = MelSpectrogram(
            params['sampling_rate'],
            params['n_fft'],
            params['frame_step'],
            params['n_mel'],
            params['fmin'],
            params['fmax'],
        )

        # Expands the mel features to the width used by the stacks.
        self.exp = tf.layers.Conv1D(filters=stack_filters, kernel_size=1, padding='same')

        stacks = []
        for _ in range(params['stacks']):
            stacks.append(
                ResStack(
                    filters=stack_filters,
                    kernel_size=params['stack_kernel_size'],
                    dilation_rates=params['stack_dilation_rates'],
                )
            )
        self.stacks = stacks

        # Per-frame class scores.
        self.out = tf.layers.Conv1D(filters=n_outputs, kernel_size=1, padding='same')

    def call(self, inputs, training=True):
        """Return the per-frame logits for the waveform(s) in `inputs`."""
        with tf.device("/gpu:0"):
            hidden = self.mel(inputs)
            hidden = tf.layers.batch_normalization(hidden, training=training)

            # A rank-2 result means a single un-batched example: add the
            # leading batch axis expected by the convolutions.
            if len(hidden.shape) == 2:
                hidden = tf.expand_dims(hidden, 0)

            hidden = self.exp(hidden)

            for res_stack in self.stacks:
                hidden = res_stack(hidden, training=training)

            hidden = tf.layers.batch_normalization(hidden, training=training)

        return self.out(hidden) + 1e-8

In [15]:
from multiprocessing import Pool

def input_fn(inp_dataset, params, load_wave_fn=load_wave):
    """Build an Estimator input function that streams padded audio batches.

    Args:
        inp_dataset: DataFrame whose rows describe the clips; each row must
            expose a `text` attribute (the transcript).
        params: Dict providing 'epochs', 'batch_size' and 'parallelize'.
        load_wave_fn: Callable receiving ``(row, params)`` and returning
            ``(audio, row)``; `audio` may be None to skip the clip.

    Returns:
        A zero-argument callable returning a `tf.data.Dataset` of
        ``((audio, length), text)`` elements, padded and batched to
        ``params['batch_size']``.
    """

    def _load_examples(pool, pending):
        # Load the queued rows (through the pool when given) and yield the usable ones.
        if pool is not None:
            loaded = pool.map(load_wave_fn, pending)
        else:
            loaded = map(load_wave_fn, pending)

        for audio, row in loaded:
            if audio is None:
                continue
            if np.isnan(audio).any():
                print('SKIP')
            else:
                yield (audio, len(audio)), row.text.encode()

    def generator_fn():
        # Only spawn worker processes when they are actually going to be used.
        # NOTE(review): on Windows, functions defined inside a notebook may
        # not be importable by Pool workers -- if loading hangs, verify this
        # and run with parallelize=False.
        pool = Pool() if params['parallelize'] else None
        buffer = []
        seen = 0

        try:
            for epoch in range(params['epochs']):
                for _, row in inp_dataset.iterrows():
                    seen += 1
                    # Progress trace: every 10th row for the first 1000 rows,
                    # then every 1000th row.
                    if (seen < 1000 and seen % 10 == 0) or seen % 1000 == 0:
                        print(row)
                        print(row.text)

                    buffer.append((row, params))

                    if len(buffer) >= params['batch_size']:
                        yield from _load_examples(pool, buffer)
                        buffer = []

            # Flush the rows left over when the data does not divide evenly
            # into load batches; previously they were silently dropped.
            if buffer:
                yield from _load_examples(pool, buffer)
        finally:
            # Release the worker processes even if the consumer stops early.
            if pool is not None:
                pool.close()
                pool.join()

    def _input_fn():
        return tf.data.Dataset.from_generator(
                generator_fn,
                output_types=((tf.float32, tf.int32), (tf.string)),
                output_shapes=((None,()), (()))
            ) \
            .padded_batch(
                batch_size=params['batch_size'],
                padded_shapes=(
                    (tf.TensorShape([None]), tf.TensorShape(())),
                    tf.TensorShape(())
                )
            )

    return _input_fn

In [16]:
def model_fn(features, labels, mode, params):
    """Estimator model function: CTC-trained speech recognizer.

    Args:
        features: Either a dict with 'audio' and 'length' entries (the
            serving input built in `experiment`) or an ``(audio, length)``
            pair (the training/evaluation input).
        labels: String tensor of transcripts, or None when predicting.
        mode: One of the `tf.estimator.ModeKeys` values.
        params: Experiment parameters; this function reads 'lr', 'optimizer'
            and 'clip_gradients' and forwards the dict to the helpers that
            build the network and the label tables.

    Returns:
        A `tf.estimator.EstimatorSpec` for the requested mode.
    """
    with tf.device("/gpu:0"):
        if isinstance(features, dict):
            # Previously only the length was read here, leaving `audio`
            # undefined for dict-style (serving) inputs.
            audio = features['audio']
            original_lengths = features['length']
        else: 
            audio, original_lengths = features

        # Lengths in frames, as needed by the CTC loss and decoder.
        lengths = sequenceLength(original_lengths, params)

        if labels is not None:
            codes = encode_labels(labels, params)

        network = SpeechNet(params)
        is_training = mode == tf.estimator.ModeKeys.TRAIN

        print('Is training? {}'.format(is_training))

        logits = network(audio, training=is_training)
        text, predicted_codes = decode_logits(logits, lengths, params)

        if mode == tf.estimator.ModeKeys.PREDICT:
            predictions = {'logits': logits, 'text': tf.sparse_tensor_to_dense(text, '')}
            export_outputs = {'predictions': tf.estimator.export.PredictOutput(predictions)}

            return tf.estimator.EstimatorSpec(mode, predictions=predictions, 
                                              export_outputs=export_outputs)
        else:
            loss = tf.reduce_mean(tf.nn.ctc_loss(labels=codes, inputs=logits,
                                                sequence_length=lengths, time_major=False,
                                                ignore_longer_outputs_than_inputs=True))
            mean_edit_distance = tf.reduce_mean(tf.edit_distance(tf.cast(predicted_codes, tf.int32),
                                                                 codes))
            distance_metric = tf.metrics.mean(mean_edit_distance)

            if mode == tf.estimator.ModeKeys.EVAL:
                return tf.estimator.EstimatorSpec(mode, loss=loss, 
                                                  eval_metric_ops={'edit_distance': distance_metric})
            elif mode == tf.estimator.ModeKeys.TRAIN:
                global_step = tf.train.get_or_create_global_step()

                tf.summary.text('train_predicted_text', tf.sparse_tensor_to_dense(text, ''))
                tf.summary.scalar('train_edit_distance', mean_edit_distance)

                # Run the collected update ops (e.g. batch-norm statistics)
                # together with the optimisation step.
                update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                with tf.control_dependencies(update_ops):
                    train_op = tf.contrib.layers.optimize_loss(loss=loss, global_step=global_step,
                                                              learning_rate=params['lr'],
                                                              optimizer=(params['optimizer']),
                                                              update_ops=update_ops,
                                                              clip_gradients=params['clip_gradients'],
                                                              summaries=["learning_rate", "loss",
                                                                        "global_gradient_norm"])

    # Only the TRAIN branch reaches this point; PREDICT and EVAL return above.
    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

In [17]:
def dataset_params(batch_size=32,
                   epochs=50000,
                   parallelize=True,
                   max_text_length=None,
                   min_text_length=None,
                   max_wave_length=80000,
                   shuffle=True,
                   random_shift_min=-4000,
                   random_shift_max= 4000,
                   random_stretch_min=0.7,
                   random_stretch_max= 1.3,
                   random_noise=0.75,
                   random_noise_factor_min=0.2,
                   random_noise_factor_max=0.5,
                   augment=False):
    """Collect the data-pipeline settings into a plain dict.

    Every argument is stored unchanged under a key of the same name.

    Returns:
        A new dict with one entry per argument.
    """
    return dict(
        parallelize=parallelize,
        shuffle=shuffle,
        max_text_length=max_text_length,
        min_text_length=min_text_length,
        max_wave_length=max_wave_length,
        random_shift_min=random_shift_min,
        random_shift_max=random_shift_max,
        random_stretch_min=random_stretch_min,
        random_stretch_max=random_stretch_max,
        random_noise=random_noise,
        random_noise_factor_min=random_noise_factor_min,
        random_noise_factor_max=random_noise_factor_max,
        epochs=epochs,
        batch_size=batch_size,
        augment=augment,
    )

In [18]:
def experiment_params(data,
                      optimizer='Adam',
                      lr=1e-4,
                      alphabet=" 'abcdefghijklmnopqrstuvwxyz",
                      pad_conv=True,
                      stack_dilation_rates= [1, 3, 9, 27, 81],
                      stacks=2,
                      stack_kernel_size= 3,
                      stack_filters= 32,
                      sampling_rate=16000,
                      n_fft=160*4,
                      frame_step=160,
                      fmin=0,
                      fmax=8000,
                      n_mel=160,
                      clip_gradients=None,
                      codename='regular',
                      **kwargs):
    """Assemble the experiment hyper-parameters into a single dict.

    Args:
        data: Data-pipeline settings (e.g. from `dataset_params`), stored
            under the 'data' key.
        optimizer, lr, alphabet, pad_conv, stack_dilation_rates, stacks,
        stack_kernel_size, stack_filters, sampling_rate, n_fft, frame_step,
        fmin, fmax, n_mel, clip_gradients, codename: Stored under keys of
            the same name.
        **kwargs: Any additional settings; they are added to the result
            after the named ones.

    Returns:
        A new dict holding every setting.
    """
    import copy

    params = {
        'optimizer': optimizer,
        'lr': lr,
        'data': data,
        'alphabet': alphabet,
        'pad_conv': pad_conv,
        # Shallow copy so the result never aliases the shared default list
        # (or the caller's sequence): mutating the returned params must not
        # change the defaults seen by later calls.
        'stack_dilation_rates': copy.copy(stack_dilation_rates),
        'stacks': stacks,
        'stack_kernel_size': stack_kernel_size,
        'stack_filters': stack_filters,
        'sampling_rate': sampling_rate,
        'n_fft': n_fft,
        'frame_step': frame_step,
        'fmin': fmin,
        'fmax': fmax,
        'n_mel': n_mel,
        'clip_gradients': clip_gradients,
        'codename': codename
    }

    # `data` is a named parameter, so it can never show up inside `kwargs`;
    # the extra settings are simply layered on top of the named ones.
    params.update(kwargs)

    return params

In [19]:
def experiment_name(params, excluded_keys=['alphabet', 'data', 'lr', 'clip_gradients']):
    """Derive a path-like name from the experiment parameters.

    Keys are visited in sorted order, keys listed in `excluded_keys` are
    skipped, and each remaining entry becomes a ``key_value`` segment (list
    values are joined with underscores). Segments are joined with '/'.

    Args:
        params: Dict of experiment parameters.
        excluded_keys: Keys that must not contribute to the name.

    Returns:
        The assembled name as a string.
    """
    segments = []

    for key in sorted(params.keys()):
        if key in excluded_keys:
            continue

        value = params[key]
        if isinstance(value, list):
            rendered = '_'.join(str(item) for item in value)
        else:
            rendered = value

        segments.append('{}_{}'.format(key, rendered))

    return '/'.join(segments)

In [20]:
import copy

def experiment(data_params=dataset_params(), model_dir='/content/', **kwargs):
    """Train and periodically evaluate the speech model.

    Reads the module-level `train_data` and `eval_data` frames.

    Args:
        data_params: Data-pipeline settings (see `dataset_params`).
        model_dir: Directory handed to the Estimator as `model_dir`.
            Defaults to the previously hard-coded '/content/'.
        **kwargs: Overrides forwarded to `experiment_params`.

    Returns:
        None.
    """
    with tf.device("/gpu:0"):
        params = experiment_params(
            data_params,
            **kwargs
        )

        print(params)

        estimator = tf.estimator.Estimator(
            model_fn=model_fn,
            model_dir=model_dir,
            params=params
        )

        train_spec = tf.estimator.TrainSpec(
            input_fn=input_fn(
                train_data,
                params['data']
            )
        )

        # Serving signature: one raw waveform plus its length in samples.
        features = {
            "audio": tf.placeholder(dtype=tf.float32, shape=[None]),
            "length": tf.placeholder(dtype=tf.int32, shape=[])
        }

        serving_input_receiver_fn = tf.estimator.export.build_raw_serving_input_receiver_fn(
            features
        )

        best_exporter = tf.estimator.BestExporter(
          name="best_exporter",
          serving_input_receiver_fn=serving_input_receiver_fn,
          exports_to_keep=5
        )

        # Evaluation uses its own copy of the data settings with
        # augmentation switched off, leaving the training settings untouched.
        eval_params = copy.deepcopy(params['data'])
        eval_params['augment'] = False

        eval_spec = tf.estimator.EvalSpec(
            input_fn=input_fn(
                eval_data,
                eval_params
            ),
            throttle_secs=60*30,
            exporters=best_exporter
        )

        tf.estimator.train_and_evaluate(
            estimator,
            train_spec,
            eval_spec
        )

In [None]:
# Session with device-placement logging, kept available for interactive use.
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
pred_t = True

# `experiment` drives training and evaluation itself and returns None, so it
# is called directly: wrapping it in `sess.run(...)` would raise once training
# finished, because None is not a fetchable value.
experiment(
    dataset_params(
        batch_size=18,
        epochs=1,
        max_wave_length=320000,
        augment=False,
        random_noise=0.75,
        random_noise_factor_min=0.1,
        random_noise_factor_max=0.15,
        random_stretch_min=0.8,
        random_stretch_max=1.2
    ),
    codename='deep_max_20_seconds',
    alphabet=' !"&\',-.01234:;\\abcdefghijklmnopqrstuvwxyz', # !"&',-.01234:;\abcdefghijklmnopqrstuvwxyz
    causal_convolutions=False,
    stack_dilation_rates=[1, 3],
    stacks=6,
    stack_kernel_size=7,
    stack_filters=3*128,
    n_fft=160*8,
    frame_step=160*4,
    n_mel=160,
    optimizer='Momentum',
    lr=0.001,
    clip_gradients=20.0
)

{'optimizer': 'Momentum', 'lr': 0.001, 'data': {'parallelize': True, 'shuffle': True, 'max_text_length': None, 'min_text_length': None, 'max_wave_length': 320000, 'random_shift_min': -4000, 'random_shift_max': 4000, 'random_stretch_min': 0.8, 'random_stretch_max': 1.2, 'random_noise': 0.75, 'random_noise_factor_min': 0.1, 'random_noise_factor_max': 0.15, 'epochs': 1, 'batch_size': 18, 'augment': False}, 'alphabet': ' !"&\',-.01234:;\\abcdefghijklmnopqrstuvwxyz', 'pad_conv': True, 'stack_dilation_rates': [1, 3], 'stacks': 6, 'stack_kernel_size': 7, 'stack_filters': 384, 'sampling_rate': 16000, 'n_fft': 1280, 'frame_step': 640, 'fmin': 0, 'fmax': 8000, 'n_mel': 160, 'clip_gradients': 20.0, 'codename': 'deep_max_20_seconds', 'causal_convolutions': False}
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/content/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_