In [1]:
import librosa
import librosa.display
import scipy.io.wavfile
import sklearn
import tensorflow as tf
import numpy as np
from config import SEQ_LENGTH, FRAMERATE, CHUNK, FFT_SIZE
import matplotlib.pyplot as plt
import generate_wav_samples as gen
import os
import keras
import itertools
from config import MORSE_CHR
from tqdm import tqdm

from keras import backend as K
from keras.layers.convolutional import Conv2D, MaxPooling2D, Conv1D, MaxPooling1D
from keras.layers import Input, Dense, Activation,TimeDistributed
from keras.layers import Reshape, Lambda
from keras.layers.merge import add, concatenate
from keras.models import Model
from keras.layers.recurrent import GRU, SimpleRNN,LSTM
from keras.optimizers import SGD, Adam
from keras.utils.data_utils import get_file
from keras.preprocessing import image
import keras.callbacks


Using TensorFlow backend.


In [2]:
# Root directory for training artifacts; VizCallback joins its run name onto this path.
OUTPUT_DIR = 'rnn_output'

In [3]:
class VizCallback(keras.callbacks.Callback):
    """Checkpoint the model weights and print a few sample decodes after each epoch.

    Args:
        run_name: Sub-directory of ``OUTPUT_DIR`` that receives the weight files.
        test_func: Callable handed to ``decode_batch``; it is invoked there as
            ``test_func([batch])`` and its first output is arg-maxed.
        X: Indexable container where ``X[0]`` holds the network inputs and
            ``X[1]`` the matching rows of label indices.
    """

    def __init__(self, run_name, test_func, X):
        # Let the Keras base class set up the attributes it manages itself.
        super().__init__()
        self.test_func = test_func
        self.output_dir = os.path.join(OUTPUT_DIR, run_name)
        # Create the checkpoint directory up front so the first weight save
        # does not fail on a missing path.
        os.makedirs(self.output_dir, exist_ok=True)
        self.X = X

    def show_edit_distance(self, num):
        """Print ``num`` under an 'edit distance' heading.

        This is a placeholder: no edit distance is actually computed here.
        TODO: implement a real mean / normalized edit-distance report.
        """
        print('edit distance: ', num)

    def on_epoch_end(self, epoch, logs=None):
        """Save the weights for ``epoch`` and print up to five labels and decodes.

        Args:
            epoch: Epoch index supplied by Keras; used in the weight file name.
            logs: Metrics dict supplied by Keras (unused).
        """
        weights_path = os.path.join(self.output_dir, 'weights%02d.h5' % (epoch))
        self.model.save_weights(weights_path)

        self.show_edit_distance(256)

        # Reference labels for the first (up to) five stored examples.
        for label_row in self.X[1][:5]:
            print('labels: ', labels_to_text([int(e) for e in label_row]))

        # Model decodes for the same examples.
        word_batch = self.X[0][:5]
        res = decode_batch(self.test_func, word_batch)
        print('result lens: ', len(res))
        for e in res[:5]:
            print(e)
            

def labels_to_text(i):
    """Look up every label index of ``i`` in ``MORSE_CHR`` and return the symbols as a list."""
    symbols = []
    for index in i:
        symbols.append(MORSE_CHR[index])
    return symbols

def decode_batch2(test_func, word_batch):
    """Return (and print) the raw per-frame arg-max label indices for a batch.

    Unlike ``decode_batch`` no repeat-collapsing, blank removal or text
    conversion is applied.

    Args:
        test_func: Callable invoked as ``test_func([word_batch])``; the first
            element of its result is used as the score array.
        word_batch: Batch of inputs forwarded to ``test_func``.

    Returns:
        Array of arg-max indices taken over the last axis of the scores.
    """
    out = test_func([word_batch])[0]
    # Compute the arg-max once and reuse it for both the debug print and the result.
    best_path = np.argmax(out, axis=-1)
    print(best_path)
    return best_path


def decode_batch(test_func, word_batch):
    """Greedy best-path decode of a batch into lists of ``MORSE_CHR`` symbols.

    For each example the per-frame arg-max is taken, consecutive duplicate
    labels are collapsed, and the label with index ``len(MORSE_CHR) - 1``
    (treated as the blank) is dropped before mapping indices to symbols.

    Args:
        test_func: Callable invoked as ``test_func([word_batch])``; the first
            element of its result is used as the score array.
        word_batch: Batch of inputs forwarded to ``test_func``.

    Returns:
        One list of symbols per example in the batch.
    """
    scores = test_func([word_batch])[0]
    best_path = np.argmax(scores, axis=-1)
    blank = len(MORSE_CHR) - 1

    decoded = []
    for row in best_path:
        kept = []
        for pos, label in enumerate(row):
            # A frame repeats when it carries the same label as the frame before it.
            is_repeat = pos > 0 and label == row[pos - 1]
            if is_repeat or label == blank:
                continue
            kept.append(label)
        decoded.append(labels_to_text(kept))
    return decoded



In [4]:
def ctc_lambda_func(args):
    """Compute the batch CTC cost; intended as the body of a Keras ``Lambda`` layer.

    Args:
        args: Four-item sequence ``(y_pred, labels, input_length, label_length)``.

    Returns:
        The result of ``K.ctc_batch_cost`` for those tensors.
    """
    predictions, targets, prediction_lengths, target_lengths = args
    # The predictions are used in full: no leading time steps are sliced off.
    return K.ctc_batch_cost(targets, predictions, prediction_lengths, target_lengths)


In [5]:
# Dataset / feature configuration shared by the cells below.
samples_count = 5000       # number of training examples requested from read_data()
sample_len = 80000         # NOTE(review): not referenced in the visible cells - confirm it is still needed
sr = 8000                  # sample rate handed to the mel-spectrogram
dict_len = len(MORSE_CHR)  # number of classes produced by the final Dense layer
max_seq_len = 2            # label slots reserved per example
mel_count = 1              # mel bands per frame (n_mels)
mel_len = 160              # frames per example (time axis of the network input)

In [6]:
# Module-level sample source that read_data() iterates over.
# NOTE(review): presumably a one-shot generator - if so, every read_data() call
# continues where the previous call stopped, and an exhausted generator yields nothing.
g = gen.seq_generator(SEQ_LENGTH, FRAMERATE, 1)

In [7]:
def read_data(set_len):
    """Draw up to ``set_len`` examples from the global generator ``g`` and featurize them.

    Each waveform is reshaped to ``SEQ_LENGTH`` samples, normalised with
    ``librosa.util.normalize``, converted to a mel spectrogram
    (``n_fft=500``, ``hop_length=250``, ``n_mels=mel_count``), transposed to
    time-major order and scaled by its maximum.

    Reads the module-level names ``g``, ``SEQ_LENGTH``, ``sr``, ``mel_len``,
    ``mel_count`` and ``max_seq_len``.

    Args:
        set_len: Maximum number of examples to read.

    Returns:
        ``([X, labels, input_length, label_length], labels)`` where every array
        has one row per example actually read. If ``g`` runs out early the
        arrays are trimmed, so no all-zero padding rows are returned.
    """
    # Size the buffers by the requested count, not by the global training-set size.
    label_rows = np.zeros([set_len, max_seq_len], dtype=np.int32)
    X = np.zeros([set_len, mel_len, mel_count])
    input_length = np.zeros([set_len, 1], dtype=np.int32)
    label_length = np.zeros([set_len, 1], dtype=np.int32)

    count = 0
    # islice stops after set_len items without pulling an extra one from g.
    samples = itertools.islice(g, set_len)
    for i, (wave, _label_indexes, labels, c) in enumerate(tqdm(samples, total=set_len)):
        wave = wave.reshape(SEQ_LENGTH)
        wave = librosa.util.normalize(wave)
        # Pass the audio by keyword: recent librosa releases no longer accept it positionally.
        mel = librosa.feature.melspectrogram(
            y=wave, sr=sr, n_fft=500, n_mels=mel_count, hop_length=250)

        # (n_mels, frames) -> (frames, n_mels) so time is the leading axis.
        mel = mel.T

        # Scale to a maximum of 1; skip silent clips to avoid a 0/0 division.
        peak = np.max(mel)
        if peak > 0:
            mel = mel / peak

        X[i, :, :] = mel

        # Drop separator entries before storing the labels.
        labels = [lab for lab in labels if lab != ' ']
        label_rows[i, :len(labels)] = labels
        input_length[i, :] = mel.shape[0]
        label_length[i, :1] = c

        count = i + 1

    if count < set_len:
        # The generator ran short: drop the unfilled rows, whose zero lengths
        # would otherwise be passed to the CTC loss.
        X = X[:count]
        label_rows = label_rows[:count]
        input_length = input_length[:count]
        label_length = label_length[:count]

    return [X, label_rows, input_length, label_length], label_rows

In [8]:
# Build the training set: X is the 4-item model input list
# [features, labels, input_length, label_length]; l is the same label matrix as X[1].
X, l = read_data(samples_count)

4993it [00:45, 101.66it/s]

In [9]:
# Small extra batch; in the visible cells it is only passed to VizCallback for
# the per-epoch sample decodes. It is read from the same global source `g` as
# the training call above.
X_val, l_val = read_data(10)


0it [00:00, ?it/s][A

In [10]:
# Display the class count (len(MORSE_CHR)) that sizes the final Dense layer.
dict_len

4

In [11]:
# Model hyper-parameters.
conv_filters = 32       # number of Conv1D filters
kernel_size = 30        # Conv1D kernel width
pool_size = 30          # MaxPooling1D window
# NOTE(review): the three values below are not referenced in the visible cells
# (the GRU uses 128 units and fit() uses batch_size=300) - confirm they are still needed.
time_dense_size = 32
rnn_size = 512
minibatch_size = 32

In [12]:
# Bare expression: displays the function object, confirming it is defined in this kernel.
ctc_lambda_func

<function __main__.ctc_lambda_func(args)>

In [13]:
# Network input: mel_len frames with mel_count features each.
input_shape = (mel_len, mel_count)

act = 'relu'
input_data = Input(name='the_input', shape=input_shape, dtype='float32')

# 1-D convolution over the time axis.
inner = Conv1D(conv_filters, kernel_size, padding='same', 
               activation=act, kernel_initializer='he_normal',
               name='conv1')(input_data)

# Pooling with strides=1 and padding='same': the summary output below shows the
# frame count staying at 160 through conv1 and max1.
mp = MaxPooling1D(pool_size=pool_size, name='max1', strides=1, padding='same')(inner)
gru = GRU(128, return_sequences=True, kernel_initializer='he_normal', name='gru1')(mp)
# Alternative recurrent layers that were tried:
#srnn = SimpleRNN(100, return_sequences=True)(mp)
#lstm = LSTM(50, return_sequences=True)(mp)

# Per-frame projections; no activation argument is passed to either Dense layer.
dense2 = Dense(128, kernel_initializer='he_normal', name='dense2')(gru)
dense1 = Dense(dict_len, kernel_initializer='he_normal', name='dense1')(dense2)

# Per-frame class distribution over dict_len labels.
y_pred = Activation('softmax', name='softmax')(dense1)

# Summary of the prediction-only network (spectrogram in, softmax out).
Model(inputs=input_data, outputs=y_pred).summary()

# Extra inputs consumed only by the CTC loss layer.
labels = Input(name='the_labels', shape=[max_seq_len], dtype='float32')
input_length = Input(name='input_length', shape=[1], dtype='int64')
label_length = Input(name='label_length', shape=[1], dtype='int64')

# Debug print of the symbolic tensors wired into the loss.
print(y_pred, labels, input_length, label_length)

# The CTC loss is computed inside the graph by this Lambda layer.
loss_out = Lambda(
    ctc_lambda_func, output_shape=(1,),
    name='ctc')([y_pred, labels, input_length, label_length])

# Training model: takes all four inputs and outputs the CTC loss itself.
model = Model(inputs=[input_data, labels, input_length, label_length], outputs=loss_out)

# the loss calc occurs elsewhere, so use a dummy lambda func for the loss
# clipnorm seems to speed up convergence
sgd = SGD(lr=0.05, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=5)
model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=sgd)

# Backend function from the spectrogram input to the softmax output; VizCallback
# passes it to decode_batch to print sample decodes.
test_func = K.function([input_data], [y_pred])
viz_cb = VizCallback('test', test_func, X_val)

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
the_input (InputLayer)       (None, 160, 1)            0         
_________________________________________________________________
conv1 (Conv1D)               (None, 160, 32)           992       
_________________________________________________________________
max1 (MaxPooling1D)          (None, 160, 32)           0         
_________________________________________________________________
gru1 (GRU)                   (None, 160, 128)          61824     
_________________________________________________________________
dense2 (Dense)               (None, 160, 128)          16512     
_________________________________________________________________
dense1 (Dense)               (None, 160, 4)            516       
_________________________________________________________________
soft

In [14]:
# X is the 4-item input list and l the label matrix; the compiled loss returns
# y_pred (the in-graph CTC loss) and ignores its y_true argument.
# NOTE(review): batch_size is hard-coded to 300 although minibatch_size = 32 is
# defined above - confirm which value is intended.
model.fit(X, l, validation_split=0.1, batch_size=300, callbacks=[viz_cb], epochs=30)

Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 4500 samples, validate on 500 samples
Epoch 1/30
edit distance:  256
labels:  ['E', 'T']
labels:  ['E', 'T']
labels:  ['T', 'T']
labels:  ['T', 'E']
labels:  ['T', 'T']
result lens:  5
[]
[]
[]
[]
[]
Epoch 2/30

4993it [00:59, 101.66it/s]

edit distance:  256
labels:  ['E', 'T']
labels:  ['E', 'T']
labels:  ['T', 'T']
labels:  ['T', 'E']
labels:  ['T', 'T']
result lens:  5
[]
[]
[]
[]
[]
Epoch 3/30
edit distance:  256
labels:  ['E', 'T']
labels:  ['E', 'T']
labels:  ['T', 'T']
labels:  ['T', 'E']
labels:  ['T', 'T']
result lens:  5
[]
[]
[]
[]
[]
Epoch 4/30
edit distance:  256
labels:  ['E', 'T']
labels:  ['E', 'T']
labels:  ['T', 'T']
labels:  ['T', 'E']
labels:  ['T', 'T']
result lens:  5
[]
['E']
[]
[]
['E']
Epoch 5/30
edit distance:  256
labels:  ['E', 'T']
labels:  ['E', 'T']
labels:  ['T', 'T']
labels:  ['T', 'E']
labels:  ['T', 'T']
result lens:  5
['E']
['E']
['T']
[]
['E']
Epoch 6/30
edit distance:  256
labels:  ['E', 'T']
labels:  ['E', 'T']
labels:  ['T', 'T']
labels:  ['T', 'E']
labels:  ['T', 'T']
result lens:  5
['T']
['T']
['T']
['T']
['T']
Epoch 7/30
edit distance:  256
labels:  ['E', 'T']
labels:  ['E', 'T']
labels:  ['T', 'T']
labels:  ['T', 'E']
labels:  ['T', 'T']
result lens:  5
['E']
['E']
['T']
['T

<keras.callbacks.History at 0x7ff82c1b3828>

In [18]:
# `model` ends in the 'ctc' Lambda layer, so predict() returns one CTC loss
# value per example, not decoded text (decoding goes through test_func / decode_batch).
# NOTE(review): this displays the whole result array; consider slicing it before display.
model.predict(X)

array([[6.7317122e-01],
       [1.3549232e+01],
       [3.2861420e+01],
       [4.5195565e+00],
       [1.3926944e-01],
       [4.0953670e+00],
       [2.6735004e+01],
       [2.1312921e+00],
       [5.5188427e+00],
       [1.7765812e+01],
       [2.3649918e+01],
       [1.8207308e+01],
       [2.3434446e+01],
       [1.8066843e+01],
       [2.3815529e-01],
       [1.2227363e+01],
       [9.6283823e-01],
       [2.1750107e+00],
       [1.2765317e+01],
       [1.2348211e+01],
       [3.1387949e+00],
       [1.0682131e+01],
       [2.1838283e+02],
       [1.7335533e+01],
       [5.8277493e+00],
       [4.8725769e+01],
       [5.2742397e+01],
       [8.1748665e+01],
       [9.5771475e+00],
       [7.4476204e+00],
       [1.4510803e+01],
       [5.3244972e+00],
       [1.3796667e+01],
       [7.6745539e+00],
       [4.8530125e+01],
       [1.6400570e+01],
       [2.5027014e+01],
       [1.6611097e+00],
       [2.6627939e+01],
       [8.6402740e+00],
       [2.9030168e-01],
       [1.990933