In [1]:
import numpy as np
import pandas as pd
import string, os, io
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import contractions
from tensorflow.keras.utils import Sequence, to_categorical
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Activation
from tensorflow.keras.layers import Concatenate, Dropout, Bidirectional, Dot
from tensorflow.keras.callbacks import CSVLogger, ModelCheckpoint, LearningRateScheduler
from tensorflow.keras.optimizers import RMSprop, Adam
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.initializers import Constant
from tensorflow.keras import backend as K
from tensorflow.keras.models import load_model

In [2]:
corpus_df = pd.read_csv('en-fr.txt', sep='\t', header=None, names=['eng','french'], nrows=20000)
corpus_df.head()

Unnamed: 0,eng,french
0,Go.,Va !
1,Run!,Cours !
2,Run!,Courez !
3,Wow!,Ça alors !
4,Fire!,Au feu !


In [3]:
eng = []
for sentence in corpus_df['eng']:
    x = sentence.strip()
    x = contractions.fix(x).lower()
    eng.append(x)

french = []
for sentence in corpus_df['french']:
    x = sentence.strip().lower()
    x = "<BOS> " + x + " <EOS>"
    french.append(x)

In [4]:
en_train, en_test, fr_train, fr_test = train_test_split(eng, french, test_size=.1, random_state=67)

In [5]:
def get_vocab_idx(data):
    tokenizer = Tokenizer(filters='')
    tokenizer.fit_on_texts(data)
    max_len = max(map(lambda x: len(x.split()), data))
    vocab_size = len(tokenizer.word_index) + 1 # for zero padding.
    return tokenizer, vocab_size, max_len

en_tokenizer, input_vocab, max_len_en = get_vocab_idx(en_train)
fr_tokenizer, target_vocab, max_len_fr = get_vocab_idx(fr_train)

print(f'Maximum sentence length(i.e n words) English: {max_len_en}')
print(f'Maximum sentence length(i.e n words) French: {max_len_fr}')
print(f'Size of the vocabulary (English) : {input_vocab}')
print(f'Size of the vocabulary (French) : {target_vocab}')

Maximum sentence length(i.e n words) English: 6
Maximum sentence length(i.e n words) French: 14
Size of the vocabulary (English) : 4729
Size of the vocabulary (French) : 9051


In [6]:
class DataGenerator(Sequence): 
    def __init__(self, en_train, fr_train, batch_size, shuffle):        
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.en_train = en_train
        self.fr_train = fr_train
        self.on_epoch_end()
        
    def __len__(self):
        return len(self.en_train) // self.batch_size
        
    def __getitem__(self, index):
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        en_batch = [self.en_train[i] for i in indexes]
        fr_batch = [self.fr_train[i] for i in indexes]
        input_data, decoder_target = self.__data_generation(en_batch, fr_batch)
        return input_data, decoder_target
    
    def on_epoch_end(self):
        self.indexes = np.arange(len(self.en_train))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)
               
    def __data_generation(self, en_batch, fr_batch):
        encoder_input = en_tokenizer.texts_to_sequences(en_batch)
        decoder_input = fr_tokenizer.texts_to_sequences(fr_batch)
        # zero padding
        encoder_input = pad_sequences(encoder_input, max_len_en, padding='post')
        decoder_input = pad_sequences(decoder_input, max_len_fr, padding='post')
        # decoder target
        decoder_target = np.zeros((len(fr_batch), max_len_fr, target_vocab))
        for x, seq in enumerate(decoder_input):
            for y, s in enumerate(seq):
                if y > 0:
                    decoder_target[x, y - 1, s] = 1.  
        return [encoder_input, decoder_input], decoder_target

In [7]:
embed_size = 300
lstm_units = 100

In [8]:
# Encoder
encoder_inputs = Input(shape=(None,))
encoder_emb_layer = Embedding(input_dim=input_vocab, output_dim=embed_size)
encoder_embeddings = encoder_emb_layer(encoder_inputs)
encoder_lstm_layer1 = Bidirectional(LSTM(lstm_units, return_sequences=True, return_state=True))

encoder_outputs, fstate_h, fstate_c, bstate_h, bstate_c = encoder_lstm_layer1(encoder_embeddings)
state_h = Concatenate()([fstate_h, bstate_h])
state_c = Concatenate()([fstate_c, bstate_c])
encoder_states = [state_h, state_c]

# Decoder with attention
decoder_inputs = Input(shape=(None,))
decoder_emb_layer = Embedding(input_dim=target_vocab, output_dim=embed_size)
decoder_embeddings = decoder_emb_layer(decoder_inputs)
decoder_lstm_layer1 = LSTM(lstm_units*2, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm_layer1(decoder_embeddings, initial_state=encoder_states)

In [9]:
import tensorflow as tf
class AttentionLayer(tf.keras.layers.Layer):
    """
    This class implements Bahdanau attention (https://arxiv.org/pdf/1409.0473.pdf).
    There are three sets of weights introduced W_a, U_a, and V_a
     """

    def __init__(self, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        assert isinstance(input_shape, list)
        # Create a trainable weight variable for this layer.

        self.W_a = self.add_weight(name='W_a',
                                   shape=tf.TensorShape((input_shape[0][2], input_shape[0][2])),
                                   initializer='uniform',
                                   trainable=True)
        self.U_a = self.add_weight(name='U_a',
                                   shape=tf.TensorShape((input_shape[1][2], input_shape[0][2])),
                                   initializer='uniform',
                                   trainable=True)
        self.V_a = self.add_weight(name='V_a',
                                   shape=tf.TensorShape((input_shape[0][2], 1)),
                                   initializer='uniform',
                                   trainable=True)

        super(AttentionLayer, self).build(input_shape)  # Be sure to call this at the end

    def call(self, inputs, verbose=False):
        """
        inputs: [encoder_output_sequence, decoder_output_sequence]
        """
        assert type(inputs) == list
        encoder_out_seq, decoder_out_seq = inputs
        if verbose:
            print('encoder_out_seq>', encoder_out_seq.shape)
            print('decoder_out_seq>', decoder_out_seq.shape)

        def energy_step(inputs, states):
            """ Step function for computing energy for a single decoder state """

            assert_msg = "States must be a list. However states {} is of type {}".format(states, type(states))
            assert isinstance(states, list) or isinstance(states, tuple), assert_msg

            """ Some parameters required for shaping tensors"""
            en_seq_len, en_hidden = encoder_out_seq.shape[1], encoder_out_seq.shape[2]
            de_hidden = inputs.shape[-1]

            """ Computing S.Wa where S=[s0, s1, ..., si]"""
            # <= batch_size*en_seq_len, latent_dim
            reshaped_enc_outputs = K.reshape(encoder_out_seq, (-1, en_hidden))
            # <= batch_size*en_seq_len, latent_dim
            W_a_dot_s = K.reshape(K.dot(reshaped_enc_outputs, self.W_a), (-1, en_seq_len, en_hidden))
            if verbose:
                print('wa.s>',W_a_dot_s.shape)

            """ Computing hj.Ua """
            U_a_dot_h = K.expand_dims(K.dot(inputs, self.U_a), 1)  # <= batch_size, 1, latent_dim
            if verbose:
                print('Ua.h>',U_a_dot_h.shape)

            """ tanh(S.Wa + hj.Ua) """
            # <= batch_size*en_seq_len, latent_dim
            reshaped_Ws_plus_Uh = K.tanh(K.reshape(W_a_dot_s + U_a_dot_h, (-1, en_hidden)))
            if verbose:
                print('Ws+Uh>', reshaped_Ws_plus_Uh.shape)

            """ softmax(va.tanh(S.Wa + hj.Ua)) """
            # <= batch_size, en_seq_len
            e_i = K.reshape(K.dot(reshaped_Ws_plus_Uh, self.V_a), (-1, en_seq_len))
            # <= batch_size, en_seq_len
            e_i = K.softmax(e_i)

            if verbose:
                print('ei>', e_i.shape)

            return e_i, [e_i]

        def context_step(inputs, states):
            """ Step function for computing ci using ei """
            # <= batch_size, hidden_size
            c_i = K.sum(encoder_out_seq * K.expand_dims(inputs, -1), axis=1)
            if verbose:
                print('ci>', c_i.shape)
            return c_i, [c_i]

        def create_inital_state(inputs, hidden_size):
            # We are not using initial states, but need to pass something to K.rnn funciton
            fake_state = K.zeros_like(inputs)  # <= (batch_size, enc_seq_len, latent_dim
            fake_state = K.sum(fake_state, axis=[1, 2])  # <= (batch_size)
            fake_state = K.expand_dims(fake_state)  # <= (batch_size, 1)
            fake_state = K.tile(fake_state, [1, hidden_size])  # <= (batch_size, latent_dim
            return fake_state

        fake_state_c = create_inital_state(encoder_out_seq, encoder_out_seq.shape[-1])
        fake_state_e = create_inital_state(encoder_out_seq, encoder_out_seq.shape[1])  # <= (batch_size, enc_seq_len, latent_dim

        """ Computing energy outputs """
        # e_outputs => (batch_size, de_seq_len, en_seq_len)
        last_out, e_outputs, _ = K.rnn(
            energy_step, decoder_out_seq, [fake_state_e],
        )

        """ Computing context vectors """
        last_out, c_outputs, _ = K.rnn(
            context_step, e_outputs, [fake_state_c],
        )

        return c_outputs, e_outputs

    def compute_output_shape(self, input_shape):
        """ Outputs produced by the layer """
        return [
            tf.TensorShape((input_shape[1][0], input_shape[1][1], input_shape[1][2])),
            tf.TensorShape((input_shape[1][0], input_shape[1][1], input_shape[0][1]))
        ]

In [10]:
attn_layer = AttentionLayer(name='attention_layer')
attn_out, attn_energy = attn_layer([encoder_outputs, decoder_outputs])

TypeError: in converted code:

    <ipython-input-9-bc7c7a274fd3>:93 create_inital_state  *
        fake_state = K.tile(fake_state, [1, hidden_size])  # <= (batch_size, latent_dim
    /tmp/tmpop6robw2.py:153 create_inital_state
        fake_state = ag__.converted_call(K.tile, create_inital_state_scope.callopts, (fake_state, [1, hidden_size]), None, create_inital_state_scope)
    /home/john70/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/keras/backend.py:3014 tile
        return array_ops.tile(x, n)
    /home/john70/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/ops/gen_array_ops.py:11311 tile
        "Tile", input=input, multiples=multiples, name=name)
    /home/john70/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/framework/op_def_library.py:530 _apply_op_helper
        raise err
    /home/john70/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/framework/op_def_library.py:527 _apply_op_helper
        preferred_dtype=default_dtype)
    /home/john70/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py:1296 internal_convert_to_tensor
        ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
    /home/john70/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/framework/constant_op.py:286 _constant_tensor_conversion_function
        return constant(v, dtype=dtype, name=name)
    /home/john70/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/framework/constant_op.py:227 constant
        allow_broadcast=True)
    /home/john70/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/framework/constant_op.py:265 _constant_impl
        allow_broadcast=allow_broadcast))
    /home/john70/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/framework/tensor_util.py:545 make_tensor_proto
        "supported type." % (type(values), values))

    TypeError: Failed to convert object of type <class 'list'> to Tensor. Contents: [1, None]. Consider casting elements to a supported type.
