# Machine translation pseudo code

In [2]:
import os
import sys
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from tensorflow.keras.layers import Input, Dense, Concatenate, LSTM, Activation, Dot, Embedding, Lambda, Bidirectional, RepeatVector
from tensorflow.keras.models import Model

  from ._conv import register_converters as _register_converters


In [3]:
def model_test(*x):
    """Build a throwaway ``Model`` from the given arguments and print its summary.

    All positional arguments are forwarded unchanged to ``Model``; the cells
    below pass an input tensor followed by an output tensor to sanity-check
    the shapes and parameter counts of a partial graph.

    Returns whatever ``Model.summary()`` returns.
    """
    return Model(*x).summary()

In [4]:
# Sequence lengths: the encoder input holds Tx tokens, the decoder
# (teacher-forcing) input holds Ty tokens.
Tx, Ty = 100, 80
# Embedding widths of the encoder / decoder Embedding layers.
Dx, Dy = 100, 50
# Vocabulary sizes of the encoder / decoder Embedding layers
# (Vy is also the width of the decoder's softmax output).
Vx, Vy = 10000, 15000
# LSTM unit counts: Mx per direction of the bidirectional encoder,
# My for the decoder LSTM.
Mx, My = 25, 20

In [5]:
# Encoder: token ids (batch, Tx) -> embeddings (batch, Tx, Dx)
#          -> bidirectional LSTM states (batch, Tx, 2*Mx).
encoder_input = Input(shape=(Tx,), name='en_seq_in')

encoder_embedding = Embedding(Vx, Dx, input_length=Tx, name='en_seq_embd')
x = encoder_embedding(encoder_input)

# return_sequences=True keeps one hidden vector per source timestep; the
# attention step later receives this whole sequence.
encoder_lstm = Bidirectional(LSTM(Mx, return_sequences=True), name='en_lstm')
encoder_output = encoder_lstm(x)

# Sanity-check shapes and parameter counts of the encoder sub-graph.
model_test(encoder_input, encoder_output)

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
en_seq_in (InputLayer)       [(None, 100)]             0         
_________________________________________________________________
en_seq_embd (Embedding)      (None, 100, 100)          1000000   
_________________________________________________________________
en_lstm (Bidirectional)      (None, 100, 50)           25200     
Total params: 1,025,200
Trainable params: 1,025,200
Non-trainable params: 0
_________________________________________________________________


In [6]:
# Decoder teacher-forcing branch:
# target token ids (batch, Ty) -> embeddings (batch, Ty, Dy).
decoder_tf_input = Input(shape=(Ty,))

decoder_embedding_layer = Embedding(Vy, Dy, input_length=Ty)
decoder_tfembd_input = decoder_embedding_layer(decoder_tf_input)

# Sanity-check the embedding output shape.
model_test(decoder_tf_input, decoder_tfembd_input)

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 80)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 80, 50)            750000    
Total params: 750,000
Trainable params: 750,000
Non-trainable params: 0
_________________________________________________________________


### Shape reminder

In [7]:
def remind():
    """Print the shapes produced by a bidirectional LSTM that returns sequences and states.

    Builds a small Embedding + Bidirectional(LSTM) model, prints its summary,
    runs it on a random integer batch of shape (3, 10) and prints the shape of
    each of the five arrays it returns.
    """
    token_ids = Input(shape=(10,))
    embedded = Embedding(100, 20, input_length=10)(token_ids)

    bilstm = Bidirectional(LSTM(30, return_sequences=True, return_state=True))
    # NOTE(review): the labels h1/h2/c1/c2 are kept from the original cell;
    # confirm against the Keras docs which of the four returned states are
    # hidden vs. cell states (all four have the same shape here).
    o, h1, h2, c1, c2 = bilstm(embedded)

    test_model = Model(token_ids, [o, h1, h2, c1, c2])
    test_model.summary()

    batch = np.random.randint(1, 100, size=(3, 10))
    o, h1, h2, c1, c2 = test_model.predict(batch)

    report = (
        ('h1, h2 has shapes: {}, {}', (h1.shape, h2.shape)),
        ('c1, c2 has shapes: {}, {}', (c1.shape, c2.shape)),
        ('o has a shape: {}', (o.shape,)),
    )
    for template, shapes in report:
        print(template.format(*shapes))


remind()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 10)]              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 10, 20)            2000      
_________________________________________________________________
bidirectional (Bidirectional [(None, 10, 60), (None, 3 12240     
Total params: 14,240
Trainable params: 14,240
Non-trainable params: 0
_________________________________________________________________
h1, h2 has shapes: (3, 30), (3, 30)
c1, c2 has shapes: (3, 30), (3, 30)
o has a shape: (3, 10, 60)


In [8]:
import keras.backend as K

def softmax_over_time(x):
    """Softmax normalised along the time axis (axis 1) of a (batch, time, ...) tensor.

    This is used as the activation of the final attention ``Dense(1)`` layer,
    whose output has shape (batch, Tx, 1). Reducing over the last axis there
    would divide every score by itself and always produce 1.0, so the
    reduction runs over axis 1: for each sample the weights sum to 1 across
    the timesteps.

    Args:
        x: tensor with at least 3 dimensions, laid out as (batch, time, hidden).

    Returns:
        Tensor of the same shape as ``x`` holding the normalised weights.
    """
    # (batch, time, hidden), at least 3 dimensions
    assert K.ndim(x) > 2

    # Subtract the per-sequence max before exponentiating for numerical stability.
    e = K.exp(x - K.max(x, axis=1, keepdims=True))
    s = K.sum(e, axis=1, keepdims=True)
    return e / s


# Attention layers are instantiated once at module level so that every call to
# one_step_attention reuses the same layer objects.
# Repeats the decoder state Tx times so it lines up with the encoder sequence.
attn_repeat_layer = RepeatVector(Tx)
# Joins the repeated decoder state with the encoder states on the feature axis.
attn_concat_layer = Concatenate(axis = -1)
# Hidden scoring layer applied to each timestep.
attn_dense1 = Dense(10, activation = 'tanh')
# Produces one attention score per timestep, normalised by softmax_over_time.
attn_dense2 = Dense(1, activation = softmax_over_time)
# Contracts the Tx axis of the weights with the Tx axis of the encoder states.
attn_dot = Dot(axes = 1)


def one_step_attention(h, st_1):
    '''
        Compute the attention context for a single decoder step.

        h          size  (batch, Tx, 2Mx)       encoder hidden states
        st_1       size  (batch, My)            previous decoder hidden state
        repeated   size  (batch, Tx, My)        st_1 copied along the time axis
        joined     size  (batch, Tx, My + 2Mx)
        scores     size  (batch, Tx, 10) after attn_dense1,
                         (batch, Tx, 1)  after attn_dense2
        context    size  (batch, 1,  2Mx) - something like 1 step of h
    '''
    repeated = attn_repeat_layer(st_1)
    joined = attn_concat_layer([repeated, h])
    scores = attn_dense2(attn_dense1(joined))
    return attn_dot([scores, h])

Using TensorFlow backend.


In [9]:
# Decoder layers, created once and reused for each of the Ty decoding steps.
# return_state = True makes the LSTM call return (output, hidden state, cell state).
decoder_lstm = LSTM(My,return_state = True)
# Maps each step's LSTM output to a softmax distribution of width Vy.
decoder_dense = Dense(Vy, activation = 'softmax')

### Building blocks


1. Encoder input token sequences -> Encoder output sequence of hidden vectors
2. Decoder teacher-forcing input -> Decoder embedded teacher-forcing sequences
3. One-step attention:
    takes as inputs the encoder hidden vectors for all time steps (done)
    and the previous decoder hidden state (initialized with 0 for the first step, then passed on);
    outputs a single context vector for the current time step (done).
    
4. Pass decoder s & c as the hidden and cell states;
    decoder input = concat(decoder teacher forcing[1 step], context vector[cur_step])


# When you do tensor slicing, batch dimension is included

*Uncomment & run the following code to test*

In [10]:
#tips tensor slicing
# test_in = Input(shape = (Ty,Dy))
# test_out = Lambda(lambda x:x[:,4:5,:])(test_in)
# test_m = Model(test_in, test_out)
# test_m.summary()

In [11]:
# Initial decoder hidden / cell states are supplied as model inputs.
initial_s = Input(shape=(My,))
initial_c = Input(shape=(My,))

s = initial_s
c = initial_c

# decoder tfembd (batch, Ty, Dy)
# context (batch, 1, 2Mx)
concat_tfemd_ctx = Concatenate(axis=-1)


def slice_time_step(x, step):
    """Return timestep `step` of a (batch, time, features) tensor as (batch, 1, features)."""
    return x[:, step:step + 1, :]


outputs = []
for t in range(Ty):

    # Attention over all encoder states, conditioned on the previous decoder state.
    context = one_step_attention(encoder_output, s)

    # Convert teacher forcing to (batch, 1, Dy).
    # The step index is handed to the layer through `arguments` instead of
    # being read from the loop variable inside a lambda: a closure over `t`
    # is late-bound, so any later re-invocation or reload of the layer would
    # slice the last timestep for every step.
    tfembd_cur_step = Lambda(slice_time_step, arguments={'step': t})(decoder_tfembd_input)

    # Concat context and decoder input along the feature axis.
    decoder_final_input = concat_tfemd_ctx([tfembd_cur_step, context])

    # One decoder step; s and c are carried over to the next iteration.
    o, s, c = decoder_lstm(decoder_final_input, initial_state=[s, c])

    output = decoder_dense(o)

    outputs.append(output)
    

Next question: what does one output look like? It is a vector of size Vy per sample, i.e. a tensor of shape (batch, Vy).

In [12]:
# test_dc_in = Input(shape = (Ty,2*Mx + Dy,))
# test_out,h,c = decoder_lstm(test_dc_in)
# test_out = decoder_dense(test_out)

# Model(test_dc_in,test_out).summary()

# So what is this code actually doing?

All steps are vectorized (batched) operations.
However, the code loops over the time dimension and collects the per-step results in the `outputs` list.
Therefore, the list `outputs` has shape (Ty, batch, Vy).

In [13]:
def stack_n_permute(x):
    '''
        Convert a list of tensors into a single tensor.

        K.stack places the list index on axis 0; the first two axes are then
        swapped so the list index ends up on axis 1.
    '''
    stacked = K.stack(x)
    return K.permute_dimensions(stacked, pattern=(1, 0, 2))

# Wrap the helper in a Lambda layer so the list of per-step decoder outputs is
# merged into one tensor inside the Keras graph. Note that `outputs` is rebound
# from a list to a tensor here, so this cell is not meant to be run twice.
stacker = Lambda(stack_n_permute)
outputs = stacker(outputs)

In [14]:
# Full training graph: encoder tokens, teacher-forcing decoder tokens and the
# initial decoder hidden/cell states go in; the stacked per-step outputs come out.
NMT_model = Model(inputs = [encoder_input, decoder_tf_input, initial_s, initial_c], outputs = outputs)

In [16]:
# Persist the whole model to 'NMT.h5'.
# NOTE(review): the graph contains Lambda layers and the custom activation
# softmax_over_time; reloading presumably needs those definitions available
# (e.g. via custom_objects) - confirm before relying on this file.
NMT_model.save('NMT.h5')

In [17]:
# Also write just the weights to a separate file, 'NMT_weights.h5'.
NMT_model.save_weights('NMT_weights.h5')