In [1]:
import librosa
import librosa.display
import scipy.io.wavfile
import sklearn
import tensorflow as tf
import numpy as np
from config import SEQ_LENGTH, FRAMERATE, CHUNK, FFT_SIZE
import matplotlib.pyplot as plt
import generate_wav_samples as gen
import os
import keras
import itertools
from config import MORSE_CHR
from tqdm import tqdm

from keras import backend as K
from keras.layers.convolutional import Conv2D, MaxPooling2D, Conv1D, MaxPooling1D
from keras.layers import Input, Dense, Activation,TimeDistributed
from keras.layers import Reshape, Lambda
from keras.layers.merge import add, concatenate
from keras.models import Model
from keras.layers.recurrent import GRU, SimpleRNN,LSTM
from keras.optimizers import SGD, Adam
from keras.utils.data_utils import get_file
from keras.preprocessing import image
import keras.callbacks


Using TensorFlow backend.


In [2]:
# Root directory under which VizCallback stores per-run weight checkpoints.
OUTPUT_DIR = 'rnn_output'

In [3]:
class VizCallback(keras.callbacks.Callback):
    """Keras callback that checkpoints weights and prints a decoding sample.

    After every epoch the model weights are written to
    ``OUTPUT_DIR/<run_name>/weightsNN.h5`` and the first sample of ``X`` is
    decoded with ``test_func`` so training progress can be eyeballed.

    # Arguments
        run_name: sub-directory name (below ``OUTPUT_DIR``) for checkpoints.
        test_func: callable taking ``[input_batch]`` and returning a list
            whose first element is the network output for that batch.
        X: indexable where ``X[0]`` holds the network inputs and ``X[1]``
            the matching label rows.
    """

    def __init__(self, run_name, test_func, X):
        # Let the Keras base class set up its own attributes first.
        super().__init__()
        self.test_func = test_func
        self.output_dir = os.path.join(OUTPUT_DIR, run_name)
        # Make sure the checkpoint directory exists before the first
        # save_weights() call in on_epoch_end.
        os.makedirs(self.output_dir, exist_ok=True)
        self.X = X

    def show_edit_distance(self, num):
        """Placeholder for an edit-distance report over ``num`` samples.

        No edit distance is computed yet: the method only echoes ``num``.
        TODO: decode ``num`` validation samples and report the mean and the
        length-normalised edit distance against the reference labels.
        """
        print('edit distance: ', num)

    def on_epoch_end(self, epoch, logs=None):
        """Save a checkpoint and print the decoded first sample of ``X``."""
        checkpoint = os.path.join(self.output_dir, 'weights%02d.h5' % (epoch))
        self.model.save_weights(checkpoint)

        self.show_edit_distance(256)

        # Only the first sample is decoded to keep the per-epoch log short.
        word_batch = self.X[0][:1]
        res = decode_batch(self.test_func, word_batch)
        labels = self.X[1][:1]
        print('labels: ', labels_to_text([int(e) for e in labels[0]]))
        print('result lens: ', len(res))
        for e in res[:3]:
            print(e[:15])
            

def labels_to_text(i):
    """Translate an iterable of class indices into their MORSE_CHR entries."""
    symbols = []
    for index in i:
        symbols.append(MORSE_CHR[index])
    return symbols

def decode_batch(test_func, word_batch):
    """Best-path decode every sample of a batch.

    Runs ``test_func`` on the batch, skips the first two time steps of each
    sample, takes the arg-max class per remaining step, collapses runs of
    identical classes and maps the result through ``labels_to_text``.

    Returns a list with one decoded symbol list per sample.
    """
    predictions = test_func([word_batch])[0]
    decoded = []
    for sample in predictions:
        # Most likely class at each time step, ignoring the first two steps.
        best_path = list(np.argmax(sample[2:], 1))
        # Keep a single representative for each run of repeated classes.
        collapsed = [cls for cls, _run in itertools.groupby(best_path)]
        decoded.append(labels_to_text(collapsed))
    return decoded


In [4]:
def ctc_batch_cost(y_true, y_pred, input_length, label_length):
    """Compute the CTC loss for every element of a batch.

    Unlike a plain call to the CTC op, this passes
    ``ignore_longer_outputs_than_inputs=True``.

    NOTE(review): ``ctc_lambda_func`` in this notebook calls
    ``K.ctc_batch_cost`` rather than this function, so this definition is
    currently unused by the visible cells.

    # Arguments
        y_true: tensor `(samples, max_string_length)` with the truth labels.
        y_pred: tensor `(samples, time_steps, num_categories)` with the
            softmax output.
        input_length: tensor `(samples, 1)` with the sequence length of each
            batch item in `y_pred`.
        label_length: tensor `(samples, 1)` with the sequence length of each
            batch item in `y_true`.

    # Returns
        Tensor of shape `(samples, 1)` holding the CTC loss per element.
    """
    # Drop the trailing singleton axis so the lengths become 1-D int32.
    label_lens = tf.to_int32(tf.squeeze(label_length, axis=-1))
    input_lens = tf.to_int32(tf.squeeze(input_length, axis=-1))
    sparse_targets = tf.to_int32(
        K.ctc_label_dense_to_sparse(y_true, label_lens))

    # Reorder to (time, samples, categories) and take the log of the
    # probabilities; epsilon keeps log() away from zero.
    time_major = tf.transpose(y_pred, perm=[1, 0, 2])
    log_probs = tf.log(time_major + K.epsilon())

    per_sample_loss = K.ctc.ctc_loss(
        inputs=log_probs,
        labels=sparse_targets,
        sequence_length=input_lens,
        ignore_longer_outputs_than_inputs=True)
    return tf.expand_dims(per_sample_loss, 1)

def ctc_lambda_func(args):
    """Body of the 'ctc' Lambda layer: CTC loss of the network output.

    # Arguments
        args: sequence ``(y_pred, labels, input_length, label_length)`` of
            tensors, in that order.

    # Returns
        The tensor produced by ``K.ctc_batch_cost`` for the batch.
    """
    y_pred, labels, input_length, label_length = args
    # the 2 is critical here since the first couple outputs of the RNN
    # tend to be garbage:
    y_pred = y_pred[:, 2:, :]
    return K.ctc_batch_cost(labels, y_pred, input_length, label_length)


In [14]:
# Data-set and feature dimensions shared by the cells below.
samples_count = 500        # number of samples requested for the training set
# NOTE(review): sample_len was first set to SEQ_LENGTH and then immediately
# overwritten with 80000; only the final value is kept. It is not referenced
# by the other visible cells -- confirm which value is intended.
sample_len = 80000
sr = 8000                  # sample rate handed to the mel-spectrogram
dict_len = len(MORSE_CHR)  # number of output classes of the network
max_seq_len = 4            # width of the zero-padded label matrix
mel_count = 1              # n_mels of the mel-spectrogram
mel_len = 160              # spectrogram frames per sample (time axis)

In [15]:
# Display the sequence length imported from config.
SEQ_LENGTH

39936

In [16]:
# Sample generator consumed by read_data, which unpacks every item as
# (wave, label_indexes, labels, c). The same generator object is shared by
# all read_data calls, so each call continues where the previous one stopped.
# NOTE(review): the meaning of the third argument (1) is defined in
# generate_wav_samples -- TODO confirm.
g = gen.seq_generator(SEQ_LENGTH, FRAMERATE, 1)

In [17]:
def read_data(set_len):
    """Draw up to ``set_len`` samples from the generator ``g`` as model inputs.

    Each waveform is normalised, converted to a mel-spectrogram, reshaped to
    ``(mel_len, mel_count)`` and scaled by its maximum. Labels are written
    into a zero-padded matrix of width ``max_seq_len``.

    # Arguments
        set_len: number of samples to read. The returned arrays have exactly
            this many rows, or fewer if the generator runs dry first.

    # Returns
        Tuple ``([X, labels, input_length, label_length], labels)``: the list
        of the four model inputs, and the label matrix again for use as the
        (ignored) training target.
    """
    X = np.zeros([set_len, mel_len, mel_count])
    label_matrix = np.zeros([set_len, max_seq_len])
    input_length = np.zeros([set_len, 1])
    label_length = np.zeros([set_len, 1])

    count = 0
    # islice takes exactly set_len items from the shared generator, so the
    # next call continues with fresh samples.
    batch = itertools.islice(g, set_len)
    for i, (wave, label_indexes, labels, c) in enumerate(tqdm(batch, total=set_len)):
        wave = librosa.util.normalize(wave.reshape(SEQ_LENGTH))
        mel = librosa.feature.melspectrogram(
            y=wave, sr=sr, n_fft=500, n_mels=mel_count, hop_length=250)
        mel = mel.reshape(mel_len, mel_count)

        # Scale to [0, 1]; skip the division for an all-zero spectrogram,
        # which would otherwise turn the whole sample into NaNs.
        peak = np.max(mel)
        if peak > 0:
            mel = mel / peak
        X[i, :, :] = mel

        # Drop blank separators; the rest of the row stays zero-padded.
        labels = [lab for lab in labels if lab != ' ']
        label_matrix[i, :len(labels)] = labels

        # NOTE(review): Keras CTC normally expects input_length to be the
        # number of y_pred time steps and label_length the number of labels.
        # Here input_length receives len(labels) and label_length receives
        # the generator's `c` -- confirm this assignment is intended.
        input_length[i, :1] = np.array([float(len(labels))])
        label_length[i, :1] = np.array([c])

        count = i + 1

    if count < set_len:
        # Generator ran dry: drop the rows that were never filled.
        X = X[:count]
        label_matrix = label_matrix[:count]
        input_length = input_length[:count]
        label_length = label_length[:count]

    return [X, label_matrix, input_length, label_length], label_matrix

In [18]:
# Training set: X is the list of the four model inputs, l the label matrix.
X, l = read_data(samples_count)

494it [00:04, 122.92it/s]


In [19]:
# Small sample set handed to VizCallback for the per-epoch decoding check.
X_val, l_val = read_data(10)

0it [00:00, ?it/s]


In [20]:
# X_val[1] is the label matrix returned by read_data; show the row of sample 3.
X_val[1][3]

array([2., 0., 0., 0.])

In [21]:
# Label row of sample 5 from the validation label matrix.
print(list(l_val[5]))

[2.0, 0.0, 0.0, 0.0]


In [22]:
# Shape of the spectrogram array (first of the four model inputs).
X[0].shape

(500, 160, 1)

In [23]:
# Number of entries in MORSE_CHR, used as the width of the output layer.
dict_len

4

In [24]:
# Network hyper-parameters.
conv_filters = 32      # number of Conv1D filters
kernel_size = 2        # Conv1D kernel width
pool_size = 2          # MaxPooling1D window
# NOTE(review): the three values below are not referenced by the visible
# cells -- the GRU is built with a literal 256 units and model.fit uses
# batch_size=10. Confirm whether they should be wired in or removed.
time_dense_size = 32
rnn_size = 512
minibatch_size = 32

In [25]:
# Each sample is a (mel_len, mel_count) sequence of spectrogram frames.
input_shape = (mel_len, mel_count)

act = 'relu'
input_data = Input(name='the_input', shape=input_shape, dtype='float32')

# Convolutional front-end over the time axis, followed by max-pooling.
inner = Conv1D(conv_filters, kernel_size, padding='same', 
               activation=act, kernel_initializer='he_normal',
               name='conv1')(input_data)
mp = MaxPooling1D(pool_size=pool_size, name='max1')(inner)

# Recurrent layer that emits one output vector per remaining time step.
gru = GRU(256, return_sequences=True, kernel_initializer='he_normal', name='gru1')(mp)

# Per-time-step projection to dict_len classes, turned into probabilities.
# NOTE(review): the TensorFlow CTC loss reserves the largest class index
# (num_classes - 1) for the blank label, so with dict_len outputs the last
# MORSE_CHR entry shares its index with the blank. TODO confirm whether
# dict_len + 1 outputs were intended (decode_batch would then have to skip
# the blank index).
dense1 = Dense(dict_len, kernel_initializer='he_normal', name='dense1')(gru)
y_pred = Activation('softmax', name='softmax')(dense1)

# Summary of the prediction sub-network only (without the CTC inputs).
Model(inputs=input_data, outputs=y_pred).summary()

# Extra inputs that are only needed to evaluate the CTC loss.
labels = Input(name='the_labels', shape=[max_seq_len], dtype='float32')
input_length = Input(name='input_length', shape=[1], dtype='int64')
label_length = Input(name='label_length', shape=[1], dtype='int64')

# The CTC loss is computed inside the graph by a Lambda layer.
loss_out = Lambda(
    ctc_lambda_func, output_shape=(1,),
    name='ctc')([y_pred, labels, input_length, label_length])

# Training model: takes all four inputs and outputs the CTC loss itself.
model = Model(inputs=[input_data, labels, input_length, label_length],
              outputs=loss_out)

# `adam` is created but not used below; the model is compiled with `sgd`.
adam = Adam(lr=0.05)
# clipnorm seems to speed up convergence
sgd = SGD(lr=0.005, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=5)
# the loss calc occurs elsewhere, so use a dummy lambda func for the loss
model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=sgd)

# Backend function mapping a batch of inputs to the softmax output; used by
# decode_batch through VizCallback.
test_func = K.function([input_data], [y_pred])
viz_cb = VizCallback('test', test_func, X_val)

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
the_input (InputLayer)       (None, 160, 1)            0         
_________________________________________________________________
conv1 (Conv1D)               (None, 160, 32)           160       
_________________________________________________________________
max1 (MaxPooling1D)          (None, 40, 32)            0         
_________________________________________________________________
gru1 (GRU)                   (None, 40, 256)           221952    
_________________________________________________________________
dense1 (Dense)               (None, 40, 4)             1028      
_________________________________________________________________
softmax (Activation)         (None, 40, 4)             0         
Total params: 223,140
Trainable params: 223,140
Non-trainable params: 

In [26]:
# Train on the four-input list X. The target `l` is ignored by the dummy
# loss, which simply returns the model output (the in-graph CTC loss).
model.fit(X, l, validation_split=0.1, batch_size=10, callbacks=[viz_cb], epochs=30)

Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 450 samples, validate on 50 samples
Epoch 1/30
edit distance:  256
labels:  ['T', ' ', ' ', ' ']
result lens:  1
[' ', 'T', ' ']
Epoch 2/30
edit distance:  256
labels:  ['T', ' ', ' ', ' ']
result lens:  1
[' ']
Epoch 3/30
edit distance:  256
labels:  ['T', ' ', ' ', ' ']
result lens:  1
['T', ' ']
Epoch 4/30
edit distance:  256
labels:  ['T', ' ', ' ', ' ']
result lens:  1
['T', ' ']
Epoch 5/30
edit distance:  256
labels:  ['T', ' ', ' ', ' ']
result lens:  1
['T', ' ']
Epoch 6/30
edit distance:  256
labels:  ['T', ' ', ' ', ' ']
result lens:  1
['T', ' ']
Epoch 7/30
edit distance:  256
labels:  ['T', ' ', ' ', ' ']
result lens:  1
['T', ' ']
Epoch 8/30
edit distance:  256
labels:  ['T', ' ', ' ', ' ']
result lens:  1
['T', ' ']
Epoch 9/30
edit distance:  256
labels:  ['T', ' ', ' ', ' ']
result lens:  1
['T', ' ']
Epoch 10/30
edit distance:  256
labels:  ['T', ' ', ' ', ' ']
result lens:  1
['T', '

<keras.callbacks.History at 0x7f5aba685908>

In [None]:
# List every row of the validation label matrix.
[e for e in l_val]

In [None]:
# Shape of the validation label matrix.
l_val.shape

In [None]:
# NOTE: `model` outputs the 'ctc' loss layer, so this returns per-sample CTC
# loss values rather than decoded text; use decode_batch(test_func, X[0])
# to obtain predicted symbols.
model.predict(X)