In [None]:
import warnings
# Silence every warning for the rest of the session (this also hides
# deprecation notices) -- comment this out while debugging.
warnings.filterwarnings("ignore")

In [None]:
# My Code
# Local project layout. Point `proj_home` at your checkout of deepspeech-2-ctc;
# the other two paths are derived from it, so only this one line needs editing.
proj_home = '/home/rharshal/myProjects/thesis/deepspeech-2-ctc'
proj_dir = f'{proj_home}/src/VoyceWorks_Walmart'
path_cnnrnn = f'{proj_dir}/CNN_RNN'

In [None]:
import sys,os
# Put the CNN_RNN directory on the module search path so that modules stored
# there can be imported from this notebook.
sys.path.append(path_cnnrnn)

In [None]:
from final_model import finalModel
from data_generator import AudioGenerator
from keras import backend as K
from keras.models import Model
from keras.layers import (BatchNormalization, Conv1D, Dense, Input, 
    TimeDistributed, Activation, Bidirectional, SimpleRNN, GRU, LSTM)
import numpy as np
from utils import int_sequence_to_text
import nltk,pickle

In [None]:
# Locations of the pickled terms/words files and of the model weights file,
# all relative to the CNN_RNN directory.
termsPath = f'{path_cnnrnn}/termsWalmart.pickle'
wordsPath = f'{path_cnnrnn}/wordsWalmart.pickle'
#trainJsonPath='./CNN_RNN/json/nisheeth_wal_train.json'
modelPath = f'{path_cnnrnn}/model/walmart_hindi.h5'

In [None]:
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    # NOTE: pickle.load can execute arbitrary code -- only use trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


terms = _load_pickle(termsPath)
# print("Available terms \n",terms)
words = _load_pickle(wordsPath)
# print("Available words \n",words)
# Normalisation statistics, applied to the audio features before prediction.
std = _load_pickle(f'{path_cnnrnn}/mean_std/wal_hindi_train_data_std.pickle')
mean = _load_pickle(f'{path_cnnrnn}/mean_std/wal_hindi_train_data_mean.pickle')

In [None]:
def after_edit_terms_sorting(after_edit_pred,terms):
    """Rank `terms` by word overlap with a prediction and return the best five.

    Params:
        after_edit_pred (str): Predicted text; split on whitespace into a set
            of distinct words.
        terms (iterable of str): Candidate terms to rank.
    Returns:
        list: At most five terms, ordered by the number of distinct words they
        share with the prediction (descending), ties broken by the term's
        length in characters (descending).
    """
    predicted_words = set(after_edit_pred.split())

    def rank_key(term):
        shared_words = predicted_words.intersection(set(term.split()))
        return (len(shared_words), len(term))

    ranked_terms = sorted(terms, key=rank_key, reverse=True)
    return ranked_terms[:5]

def match_strings(trans,pred):
    """Count word matches between a transcript and a prediction.

    Params:
        trans (str): Reference transcript, split on whitespace.
        pred (str): Predicted text, split on whitespace.
    Returns:
        tuple: (number of words in `trans`, number of equal
        (transcript word, predicted word) pairs). Every equal pair is counted,
        so repeated words contribute more than once.
    """
    reference_words = trans.split()
    predicted_words = pred.split()
    pair_matches = sum(predicted_words.count(word) for word in reference_words)
    return len(reference_words), pair_matches

def cnn_output_length(input_length, filter_size, border_mode, stride,
                       dilation=1,conv_layers=1):
    """ Compute the length of the output sequence after 1D convolution along
        time. Note that this function is in line with the function used in
        Convolution1D class from Keras.
    Params:
        input_length (int): Length of the input sequence.
        filter_size (int): Width of the convolution kernel.
        border_mode (str): Only support `same` or `valid`.
        stride (int): Stride size used in 1D convolution.
        dilation (int): Dilation rate of the kernel.
        conv_layers (int): Number of identical convolution layers applied one
            after another; the length formula is applied once per layer.
    Returns:
        The output length after `conv_layers` convolutions, `None` when
        `input_length` is None, or `input_length` itself when
        `conv_layers` is 0.
    """
    if input_length is None:
        return None
    assert border_mode in {'same', 'valid'}
    dilated_filter_size = filter_size + (filter_size - 1) * (dilation - 1)
    # Start from the input length so that zero layers is a no-op instead of
    # leaving the result unbound.
    output_length = input_length
    for _ in range(conv_layers):
        if border_mode == 'valid':
            # 'valid' padding drops the positions the kernel cannot fully cover.
            output_length = output_length - dilated_filter_size + 1
        # Ceil-divide by the stride (applies to both padding modes).
        output_length = (output_length + stride - 1) // stride
    return output_length

In [None]:
def final_model(input_dim, filters, kernel_size, conv_stride,
    conv_border_mode, units, output_dim=29, dropout_rate=0.5, number_of_layers=2,
    cell=GRU, activation='tanh',conv_layers=1):
    """ Build a deep network for speech: two Conv1D + BatchNormalization
        stages, a stack of recurrent layers (each followed by
        BatchNormalization) and a time-distributed softmax.
    Params:
        input_dim (int): Size of the feature vector at each time step.
        filters (int): Number of filters in each Conv1D layer.
        kernel_size (int): Width of the convolution kernel.
        conv_stride (int): Stride of each Conv1D layer.
        conv_border_mode (str): Padding mode passed to Conv1D.
        units (int): Number of units in each recurrent layer.
        output_dim (int): Size of the softmax output at each time step.
        dropout_rate (float): Dropout for every recurrent layer except
            'final_layer_of_rnn'.
        number_of_layers (int): 1 builds a single recurrent layer; any other
            value builds 'rnn_1', then `number_of_layers - 2` intermediate
            layers, then 'final_layer_of_rnn'.
        cell: Recurrent layer class. It is always called with
            `implementation=2` and `reset_after=False`.
        activation (str): Activation of the recurrent layers.
        conv_layers (int): Only used by `model.output_length`; the network
            itself always contains two Conv1D layers.
    Returns:
        A Keras Model with an extra `output_length` callable attribute.
    """
    # Main acoustic input
    input_data = Input(name='the_input', shape=(None, input_dim))

    # Convolutional front end: two Conv1D layers with the same settings,
    # each followed by batch normalisation.
    conv_kwargs = dict(strides=conv_stride,
                       padding=conv_border_mode,
                       activation='relu',
                       dilation_rate=1)
    conv_1d = Conv1D(filters, kernel_size, name='layer_1_conv',
                     **conv_kwargs)(input_data)
    conv_bn = BatchNormalization(name='conv_batch_norm')(conv_1d)
    conv_1d_2 = Conv1D(filters, kernel_size, name='layer_2_conv',
                       **conv_kwargs)(conv_bn)
    conv_bn_2 = BatchNormalization(name='conv_batch_norm_2')(conv_1d_2)

    # Settings shared by every recurrent layer.
    rnn_kwargs = dict(activation=activation, return_sequences=True,
                      implementation=2, reset_after=False)

    # The first recurrent layer is built whatever number_of_layers is.
    layer = cell(units, name='rnn_1', dropout=dropout_rate,
                 **rnn_kwargs)(conv_bn_2)
    layer = BatchNormalization(name='bt_rnn_1')(layer)

    if number_of_layers != 1:
        # Intermediate layers are named rnn_2 ... rnn_{number_of_layers - 1}.
        for depth in range(2, number_of_layers):
            layer = cell(units, name='rnn_{}'.format(depth),
                         dropout=dropout_rate, **rnn_kwargs)(layer)
            layer = BatchNormalization(name='bt_rnn_{}'.format(depth))(layer)

        # The last recurrent layer is created without dropout.
        layer = cell(units, name='final_layer_of_rnn', **rnn_kwargs)(layer)
        layer = BatchNormalization(name='bt_rnn_final')(layer)

    # Per-time-step projection followed by a softmax.
    time_dense = TimeDistributed(Dense(output_dim))(layer)
    y_pred = Activation('softmax', name='softmax')(time_dense)

    # Specify the model
    model = Model(inputs=input_data, outputs=y_pred)

    def output_length(x):
        # Output sequence length for an input of length x.
        return cnn_output_length(
            x, kernel_size, conv_border_mode, conv_stride,conv_layers=conv_layers)

    model.output_length = output_length
    # print(model.summary())
    return model

def normalize_test(feature,mean,std,eps=1e-14):
    """Standardise `feature` as (feature - mean) / (std + eps).

    `eps` is added to `std` so the denominator is non-zero when `std` is 0.
    """
    centered = feature - mean
    scale = std + eps
    return centered / scale

# Specify the model. conv_layers=2 matches the two Conv1D layers that
# final_model builds, so model_end.output_length accounts for both of them.
# NOTE(review): these hyperparameters presumably have to match the ones used
# when the weights stored at modelPath were trained -- confirm.
model_end = final_model(input_dim=161,
                        filters=200,
                        kernel_size=11, 
                        conv_stride=2,
                        conv_border_mode='valid',
                        units=250,
                        activation='relu',
                        cell=GRU,
                        dropout_rate=0.9,
                        number_of_layers=2,conv_layers=2)


In [None]:
# Feature extractor used below via data_gen.featurize().
# NOTE(review): spectrogram=True presumably selects the 161-dim spectrogram
# features that model_end expects (input_dim=161) -- confirm in data_generator.
data_gen = AudioGenerator(spectrogram=True)

In [None]:
# Load the weights stored at modelPath into the freshly built network.
model_end.load_weights(modelPath)

In [None]:
# My Code
# os.listdir(f'{proj_dir}/data/someWavFiles')
# audio_path=sys.argv[1]

In [None]:
# My code
# Recording to transcribe; the .wav file is looked up under {proj_dir}/data.
audio_filename = 'milgaya_1574415169408'
audio_path = f'{proj_dir}/data/{audio_filename}.wav'

In [None]:

#data_gen.load_train_data( desc_file=trainJsonPath)
# audio_path='uploads_Walmart_Lab/574414856883.wav'
# Featurize the audio file, then standardise it with the loaded mean/std.
data_point = normalize_test(data_gen.featurize(audio_path),mean,std)
data_point.shape

In [None]:
# Add a leading batch axis of size 1 before running the network.
pred = model_end.predict(np.expand_dims(data_point, axis=0))
pred.shape

In [None]:
# Network output for the single item in the batch.
pred[0]

In [None]:
# Sequence length after the conv layers, computed from the first dimension of
# data_point and wrapped in a list (one entry for the single batch item).
output_length = [model_end.output_length(data_point.shape[0])] 
output_length

In [None]:
# Decode the softmax output with the Keras backend CTC decoder.
tmp = K.ctc_decode(pred, output_length)

In [None]:
# Inspect the raw return value of K.ctc_decode.
tmp

In [None]:
# Evaluate the first decoded sequence, shift every index by +1 and flatten it
# into a plain Python list.
# NOTE(review): the +1 presumably converts to the indexing that
# int_sequence_to_text expects -- confirm against char_map.py.
pred_ints = (K.eval(K.ctc_decode(pred, output_length)[0][0])+1).flatten().tolist()
pred_ints

In [None]:
# Convert the integer sequence with int_sequence_to_text and join the pieces
# into a single string.
pred_l=int_sequence_to_text(pred_ints)
# print(pred_l)
pred_text=''.join(pred_l)
pred_text

## Try using Awni's CTC decoder, found [here](https://distill.pub/2017/ctc/#inference)

In [None]:
# Directory (under proj_home) that holds the alternative CTC decoder code.
dir_awni_ctc = f'{proj_home}/try-ctc-decode-awni'

In [None]:
import sys
# Make the decoder directory importable (used by the `src.ctc_decoder` import).
sys.path.append(dir_awni_ctc)

In [None]:
import src.ctc_decoder as awni_ctc_decoder

In [None]:
# Decode the single utterance with the alternative decoder. blank=28 is the
# last index of the 29 output classes (final_model's default output_dim).
awni_ctc_decoder.decode(pred[0], blank=28)

## My attempt to demystify how `ctc_decode` works
+ Select the most probable class at each time-step
+ Map each selected index to a character using the mapping defined in *char_map.py*

This seemed to work fine, but after reading [this](https://distill.pub/2017/ctc/) my understanding has changed.

In [None]:
# `throw` is not Python (the original line was a SyntaxError); `raise` is the
# valid way to stop here.
# NOTE(review): presumably a deliberate barrier so that "Run All" halts before
# the exploratory cells below -- delete this cell if that is not intended.
raise ValueError("Intentional stop: the cells below are exploratory")

In [None]:
# Output for the single utterance (drops the batch axis).
# Note: this rebinds `tmp`, which earlier held the K.ctc_decode result.
tmp = pred[0]
tmp.shape

In [None]:
# Index -> character lookup: '`' at 0, space at 1, 'a'-'z' at 2-27, '$' at 28.
chr_map = dict(enumerate('` abcdefghijklmnopqrstuvwxyz$'))
print(chr_map)

In [None]:
# Index of the largest value in every row of tmp (argmax over axis 1).
tmp_argmax = np.argmax(tmp, axis=1)
print(tmp_argmax.shape)
tmp_argmax

In [None]:
# Map every per-row index straight to its character and concatenate; repeated
# characters and the '`' / '$' symbols are kept as-is, nothing is collapsed
# or removed.
tmp_pred = [ chr_map[idx] for idx in tmp_argmax ]
''.join(tmp_pred)