In [1]:
import matplotlib.pyplot as plt

import keras
from keras import backend as K
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras import regularizers
from keras.models import load_model
from keras.layers import Input, LSTM, Dense, Bidirectional, Concatenate, Average, GRU, LSTMCell, RNN, Embedding, TimeDistributed, Dropout, BatchNormalization
from keras.models import Model
import numpy as np
import pickle
import tensorflow as tf
from tensorflow.python.keras.utils import tf_utils

Using TensorFlow backend.


In [None]:
# Code from keras how-to

In [25]:
# Notebook-wide training configuration. These names are plain globals that the
# cells below read; note that the grid-search cells re-bind `latent_dim` and
# `batch_size` as loop variables.
batch_size = 64  # Mini-batch size passed to model.fit().
epochs = 100  # Upper bound on training epochs per fit() call.
latent_dim = 128  # Number of LSTM units (size of the encoder state).
num_samples = 50000  # Maximum number of lines of the data file to read.

# Path to the data file on disk: one comma-separated (source, target) pair per line.
data_path = 'data/old_eng_close.csv'
# Alternative data set:
# data_path = 'data/old_rus_all.csv'


In [26]:
# Vectorize the data: read comma-separated (source, target) word pairs and
# collect the character vocabulary of each side.
input_texts = []
target_texts = []
# '\n' is pre-seeded in the input alphabet because the encoder input is padded
# with it when the one-hot tensors are built.
input_characters = set("\n")
target_characters = set()
with open(data_path, 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')
# Only `num_samples` bounds the slice. The empty string produced by a trailing
# newline is rejected by the field-count check below, and the last record of a
# file that lacks a trailing newline is no longer dropped.
for line in lines[:num_samples]:
    fields = line.split(',')
    if len(fields) < 2:
        # Blank or malformed line: there is no (source, target) pair to unpack.
        continue
    input_text, target_text = fields[:2]
    # Skip pairs where either word is longer than 15 characters.
    if (len(input_text) > 15) or (len(target_text) > 15):
        continue
    # We use "tab" as the "start sequence" character
    # for the targets, and "\n" as "end sequence" character.
    target_text = '\t' + target_text + '\n'
    input_texts.append(input_text)
    target_texts.append(target_text)
    # set.update() adds every character of the string in one call.
    input_characters.update(input_text)
    target_characters.update(target_text)

In [27]:
# Freeze both alphabets into sorted lists so that every character maps to a
# stable position, then derive the tensor dimensions used by the models.
input_characters = sorted(input_characters)
target_characters = sorted(target_characters)
num_encoder_tokens, num_decoder_tokens = len(input_characters), len(target_characters)
max_encoder_seq_length = max(map(len, input_texts))
max_decoder_seq_length = max(map(len, target_texts))

# Summary of the data set that was just loaded.
for label, value in (
        ('Number of samples:', len(input_texts)),
        ('Number of unique input tokens:', num_encoder_tokens),
        ('Number of unique output tokens:', num_decoder_tokens),
        ('Max sequence length for inputs:', max_encoder_seq_length),
        ('Max sequence length for outputs:', max_decoder_seq_length)):
    print(label, value)

Number of samples: 12596
Number of unique input tokens: 42
Number of unique output tokens: 28
Max sequence length for inputs: 15
Max sequence length for outputs: 17


In [5]:
# Character -> integer position lookups, one per alphabet. The position is the
# character's index in the corresponding sorted character list.
input_token_index = {char: i for i, char in enumerate(input_characters)}
target_token_index = {char: i for i, char in enumerate(target_characters)}

In [6]:
# One-hot encode every (input, target) pair into three float32 tensors of
# shape (samples, max sequence length, alphabet size).
encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length, num_encoder_tokens),
    dtype='float32')
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')

for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.
    # Pad the remaining timesteps with '\n'. The slice starts at len(input_text)
    # rather than at the loop variable `t + 1`: for an empty input word the loop
    # body never runs, so `t` would be stale (or undefined for the first sample).
    encoder_input_data[i, len(input_text):, input_token_index['\n']] = 1.
    for t, char in enumerate(target_text):
        decoder_input_data[i, t, target_token_index[char]] = 1.
        if t > 0:
            # decoder_target_data is ahead of decoder_input_data by one
            # timestep and does not include the start character.
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.
    # Pad both decoder tensors with '\n' after the real characters. The target
    # tensor is shifted by one step, so its padding starts one position earlier.
    decoder_input_data[i, len(target_text):, target_token_index['\n']] = 1.
    decoder_target_data[i, len(target_text) - 1:, target_token_index['\n']] = 1.



In [139]:
# LSTM

# Early-stopping callback shared by every `model.fit(..., callbacks=[earlyStopping])`
# call in this notebook. It has to live at notebook level: the training cells
# read it as a global, so creating it only as a local inside build_model() left
# the name undefined after a kernel restart.
earlyStopping = EarlyStopping(monitor='val_acc', patience=10, verbose=0)


def build_model(latent_dim, optimizer, encoder_dropout, decoder_dropout):
    """Build and compile the plain LSTM encoder-decoder training model.

    The encoder LSTM consumes the one-hot input sequence and only its final
    hidden and cell states are kept; they initialise the decoder LSTM, whose
    per-timestep outputs go through a softmax Dense layer over the target
    alphabet.

    Reads the notebook globals `num_encoder_tokens` and `num_decoder_tokens`.

    Args:
        latent_dim: number of units of both the encoder and the decoder LSTM.
        optimizer: optimizer name or instance handed to `model.compile`.
        encoder_dropout: `dropout` rate of the encoder LSTM.
        decoder_dropout: `dropout` rate of the decoder LSTM.

    Returns:
        A compiled Keras `Model` taking `[encoder_inputs, decoder_inputs]` and
        producing the decoder's softmax outputs (categorical cross-entropy
        loss, accuracy metric).
    """
    encoder_inputs = Input(shape=(None, num_encoder_tokens))
    encoder = LSTM(latent_dim, return_state=True, dropout=encoder_dropout)
    # The encoder output sequence is discarded; only the states are used.
    encoder_outputs, state_h, state_c = encoder(encoder_inputs)
    encoder_states = [state_h, state_c]

    decoder_inputs = Input(shape=(None, num_decoder_tokens))
    decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, dropout=decoder_dropout)
    # The decoder's returned states are not needed for training.
    decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                         initial_state=encoder_states)
    decoder_dense = Dense(num_decoder_tokens, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)

    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

    model.compile(optimizer=optimizer, loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

# model.save('s2s.h5')

In [155]:
# Grid search over the plain LSTM seq2seq model: 3 state sizes x 2 optimizers x
# 3 encoder dropouts x 3 decoder dropouts = 54 training runs.
# NOTE: the loop variables are notebook globals, so `latent_dim`, `optimizer`,
# `encoder_dropout` and `decoder_dropout` keep their last values afterwards.
# `earlyStopping` must already exist as a notebook-level variable when this
# cell runs.
hists = list()
for latent_dim in [64, 128, 256]:
    for optimizer in ['RMSprop', 'Adam']:
        for encoder_dropout in [0.1, 0.3, 0.5]:
            for decoder_dropout in [0.1, 0.3, 0.5]:
                print()
                print("latent_dim, optimizer, encoder_dropout, decoder_dropout")
                print(latent_dim, optimizer, encoder_dropout, decoder_dropout)
                # A fresh model is built for every configuration.
                model = build_model(latent_dim, optimizer, encoder_dropout, decoder_dropout)
                hist = model.fit(
                    [encoder_input_data, decoder_input_data],
                    decoder_target_data,
                    batch_size=batch_size,
                    epochs=epochs,
                    validation_split=0.2,
                    callbacks=[earlyStopping],
                    verbose=0
                )
                # Best validation accuracy of this run and the zero-based index
                # of the epoch that reached it.
                m = max(hist.history["val_acc"])
                print("best val_acc:", m)
                print("on epoch", hist.history["val_acc"].index(m))
                hists.append(hist)
                
# Persist the objects returned by fit() for all runs.
# NOTE(review): this pickles the returned objects themselves, not just their
# `.history` dicts — confirm that is what downstream code expects.
with open("hists.pickle", "wb") as f:
    pickle.dump(hists, f)


latent_dim, optimizer, encoder_dropout, decoder_dropout
64 RMSprop 0.1 0.1
best val_acc: 0.8283938392198104
on epoch 99

latent_dim, optimizer, encoder_dropout, decoder_dropout
64 RMSprop 0.1 0.3
best val_acc: 0.8187775854677373
on epoch 62

latent_dim, optimizer, encoder_dropout, decoder_dropout
64 RMSprop 0.1 0.5
best val_acc: 0.8167282262585978
on epoch 74

latent_dim, optimizer, encoder_dropout, decoder_dropout
64 RMSprop 0.3 0.1
best val_acc: 0.8237996564159714
on epoch 98

latent_dim, optimizer, encoder_dropout, decoder_dropout
64 RMSprop 0.3 0.3
best val_acc: 0.8211422412902984
on epoch 98

latent_dim, optimizer, encoder_dropout, decoder_dropout
64 RMSprop 0.3 0.5
best val_acc: 0.816435457371278
on epoch 90

latent_dim, optimizer, encoder_dropout, decoder_dropout
64 RMSprop 0.5 0.1
best val_acc: 0.813034858035457
on epoch 73

latent_dim, optimizer, encoder_dropout, decoder_dropout
64 RMSprop 0.5 0.3
best val_acc: 0.8080803577604192
on epoch 64

latent_dim, optimizer, encoder_dr

  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '


  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '


  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '


In [2]:
# Next: inference mode (sampling).
# Here's the drill:
# 1) encode input and retrieve initial decoder state
# 2) run one step of decoder with this initial state
# and a "start of sequence" token as target.
# Output will be the next target token
# 3) Repeat with the current target token and current states

# Define sampling models
# This cell reads `encoder_inputs`, `encoder_states`, `decoder_inputs`,
# `decoder_lstm` and `decoder_dense` as notebook-level variables, so a cell
# that defines them at top level (not inside a build_model() call) must have
# been run first.
# Encoder-only model: maps an input sequence to the [h, c] state pair.
encoder_model = Model(encoder_inputs, encoder_states)

# State inputs of the step-wise decoder. Their width is latent_dim*2, which
# matches decoders created with `LSTM(latent_dim*2, ...)` (the BiLSTM variants).
decoder_state_input_h = Input(shape=(latent_dim*2,))

decoder_state_input_c = Input(shape=(latent_dim*2,))

decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
# Re-use the trained decoder layers, but feed the states in explicitly and
# return the updated states so decoding can proceed one step at a time.
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
# Inputs: [token, h, c]; outputs: [token probabilities, new h, new c].
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)


# Reverse-lookup token index to decode sequences back to
# something readable.
# Integer position -> character lookups (inverse of the *_token_index dicts).
reverse_input_char_index = {i: char for char, i in input_token_index.items()}
reverse_target_char_index = {i: char for char, i in target_token_index.items()}


def decode_sequence(input_seq):
    """Greedily decode one encoded input sequence into a target string.

    The input is run through `encoder_model` to obtain the initial decoder
    states. Starting from the '\\t' start token, `decoder_model` is then called
    one step at a time; at every step the most probable character is appended
    and fed back in together with the updated states.

    Decoding stops once '\\n' has been emitted or the result has become longer
    than `max_decoder_seq_length`. Assumes a batch of size 1.

    Args:
        input_seq: one-hot encoded input, as accepted by `encoder_model.predict`.

    Returns:
        The decoded string, including the terminating '\\n' if one was produced.
    """
    states_value = encoder_model.predict(input_seq)
    # Index of the token fed to the decoder next; starts with the start character.
    next_index = target_token_index['\t']
    decoded_sentence = ''

    while True:
        # One-hot target sequence of length 1 holding the current token.
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, next_index] = 1.

        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)
        states_value = [h, c]

        # Greedy choice: take the most probable character at the last timestep.
        next_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[next_index]
        decoded_sentence += sampled_char

        # Stop on the end character or when the output grew too long.
        if sampled_char == '\n' or len(decoded_sentence) > max_decoder_seq_length:
            break

    return decoded_sentence

def decode_target(ground_truth_vec):
    """Turn the first one-hot encoded sequence of a batch back into text.

    For every timestep the position of the largest value is mapped through
    `reverse_target_char_index`; the characters are joined and surrounding
    whitespace (which includes the '\\n' padding) is stripped.
    """
    first_sequence = ground_truth_vec[0, :, :]
    chars = [reverse_target_char_index[idx] for idx in np.argmax(first_sequence, axis=1)]
    return "".join(chars).strip()


In [13]:
# Spot-check the decoder on the first 100 samples of the data set: print the
# input word, the model's greedy decoding and the expected target side by side.
for seq_index in range(100):
    # The slice keeps the leading batch dimension (batch of size 1), which is
    # what decode_sequence() and decode_target() index into.
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    ground_truth = decode_target(decoder_target_data[seq_index: seq_index + 1])
    # strip() removes the trailing '\n' end-of-sequence character.
    decoded_sentence = decode_sequence(input_seq).strip()
    print('-')
    print('Input word:', input_texts[seq_index], sep="\t")
    print('Decoded word:', decoded_sentence, sep="\t")
    print("Ground truth:", ground_truth, sep="\t")

-
Input word:	абидныи
Decoded word:	абидый
Ground truth:	обидный
-
Input word:	аблань
Decoded word:	алоба
Ground truth:	яблоня
-
Input word:	абланьныи
Decoded word:	абланный
Ground truth:	яблоневый
-
Input word:	аблъко
Decoded word:	алобка
Ground truth:	яблоко
-
Input word:	абрѣдь
Decoded word:	арода
Ground truth:	акрида
-
Input word:	августьныи
Decoded word:	августный
Ground truth:	августный
-
Input word:	авъгаръ
Decoded word:	авгра
Ground truth:	авгар
-
Input word:	авъгустъ
Decoded word:	август
Ground truth:	август
-
Input word:	авьныи
Decoded word:	авний
Ground truth:	явный
-
Input word:	агода
Decoded word:	агода
Ground truth:	ягода
-
Input word:	адамантъ
Decoded word:	амман
Ground truth:	алмаз
-
Input word:	адамасъ
Decoded word:	аммар
Ground truth:	алмаз
-
Input word:	адовнии
Decoded word:	адовный
Ground truth:	адовый
-
Input word:	адрила
Decoded word:	адилина
Ground truth:	ветрила
-
Input word:	адъвъ
Decoded word:	адво
Ground truth:	адовый
-
Input word:	адьскыи
Decoded word:	адски

In [15]:
# BiLSTM
# Early-stopping callback shared by every `model.fit(..., callbacks=[earlyStopping])`
# call in this notebook. It has to live at notebook level: the training cells
# read it as a global, so creating it only as a local inside build_model() left
# the name undefined after a kernel restart.
earlyStopping = EarlyStopping(monitor='val_acc', patience=10, verbose=0)


def build_model(latent_dim, optimizer, encoder_dropout, decoder_dropout):
    """Build and compile the bidirectional-LSTM encoder-decoder training model.

    The encoder is a Bidirectional LSTM; the final hidden and cell states of
    its forward and backward passes are concatenated, so the decoder LSTM has
    `latent_dim * 2` units to accept them as its initial state. The decoder
    outputs go through a softmax Dense layer over the target alphabet.

    Reads the notebook globals `num_encoder_tokens` and `num_decoder_tokens`.

    Args:
        latent_dim: number of units of each encoder direction.
        optimizer: optimizer name or instance handed to `model.compile`.
        encoder_dropout: `recurrent_dropout` rate of the encoder LSTM.
        decoder_dropout: `recurrent_dropout` rate of the decoder LSTM.

    Returns:
        A compiled Keras `Model` taking `[encoder_inputs, decoder_inputs]` and
        producing the decoder's softmax outputs (categorical cross-entropy
        loss, accuracy metric).
    """
    encoder_inputs = Input(shape=(None, num_encoder_tokens))
    encoder = Bidirectional(LSTM(latent_dim, return_state=True, recurrent_dropout=encoder_dropout))

    # With return_state=True the wrapper returns the output followed by the
    # (h, c) states of the forward and then the backward LSTM.
    encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder(encoder_inputs)
    state_h = Concatenate()([forward_h, backward_h])
    state_c = Concatenate()([forward_c, backward_c])
    encoder_states = [state_h, state_c]

    decoder_inputs = Input(shape=(None, num_decoder_tokens))
    decoder_lstm = LSTM(latent_dim*2, return_sequences=True, return_state=True, recurrent_dropout=decoder_dropout)
    # The decoder's returned states are not needed for training.
    decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                         initial_state=encoder_states)
    decoder_dense = Dense(num_decoder_tokens, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)

    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

    model.compile(optimizer=optimizer, loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

# model.save('s2s.h5')

In [52]:
# Grid search over the BiLSTM seq2seq model: 3 state sizes x 1 optimizer x
# 2 encoder dropouts x 2 decoder dropouts x 2 batch sizes = 24 training runs.
# NOTE: the loop variables are notebook globals; in particular `latent_dim` and
# `batch_size` are re-bound here and keep their last values afterwards.
# `earlyStopping` must already exist as a notebook-level variable when this
# cell runs.
hists = list()
for latent_dim in [32, 64, 128]:
    for optimizer in ['Adam']:
        for encoder_dropout in [0.3, 0.5]:
            for decoder_dropout in [0.3, 0.5]:
                for batch_size in [64, 128]:
                    print()
                    print("latent_dim, optimizer, encoder_dropout, decoder_dropout, batch_size")
                    print(latent_dim, optimizer, encoder_dropout, decoder_dropout, batch_size)
                    # A fresh model is built for every configuration.
                    model = build_model(latent_dim, optimizer, encoder_dropout, decoder_dropout)
                    hist = model.fit(
                        [encoder_input_data, decoder_input_data],
                        decoder_target_data,
                        batch_size=batch_size,
                        epochs=epochs,
                        validation_split=0.15,
                        callbacks=[earlyStopping],
                        verbose=0
                    )
                    # Best validation accuracy of this run and the zero-based
                    # index of the epoch that reached it.
                    m = max(hist.history["val_acc"])
                    print("best val_acc:", m)
                    print("on epoch", hist.history["val_acc"].index(m))
                    hists.append(hist)
                
# NOTE(review): this writes to "hists.pickle", the same file name the plain
# LSTM grid search uses, so the earlier results file is overwritten.
with open("hists.pickle", "wb") as f:
    pickle.dump(hists, f)


latent_dim, optimizer, encoder_dropout, decoder_dropout, batch_size
32 Adam 0.3 0.3 64
best val_acc: 0.84112542262182
on epoch 97

latent_dim, optimizer, encoder_dropout, decoder_dropout, batch_size
32 Adam 0.3 0.3 128
best val_acc: 0.8375521736098772
on epoch 97

latent_dim, optimizer, encoder_dropout, decoder_dropout, batch_size
32 Adam 0.3 0.5 64
best val_acc: 0.839173642849058
on epoch 96

latent_dim, optimizer, encoder_dropout, decoder_dropout, batch_size
32 Adam 0.3 0.5 128
best val_acc: 0.8305858287536227
on epoch 99

latent_dim, optimizer, encoder_dropout, decoder_dropout, batch_size
32 Adam 0.5 0.3 64


KeyboardInterrupt: 

In [11]:
# BiLSTM-128

# Builds and trains a single BiLSTM encoder-decoder at notebook level (not via
# build_model), so `encoder_inputs`, `encoder_states`, `decoder_inputs`,
# `decoder_lstm` and `decoder_dense` become notebook-level variables that the
# inference cell reads.
latent_dim = 128
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = Bidirectional(LSTM(latent_dim, return_state=True, dropout=0.5))

# Output followed by the (h, c) states of the forward and backward LSTM.
encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder(encoder_inputs)

# Concatenate both directions; the resulting states are latent_dim*2 wide.
state_h = Concatenate()([forward_h, backward_h])
state_c = Concatenate()([forward_c, backward_c])
encoder_states = [state_h, state_c]

decoder_inputs = Input(shape=(None, num_decoder_tokens))
# The decoder has latent_dim*2 units so it can take the concatenated states.
decoder_lstm = LSTM(latent_dim*2, return_sequences=True, return_state=True, dropout=0.1)
# decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

optimizer = keras.optimizers.Adam(lr=3e-4)

model.compile(optimizer=optimizer, loss='categorical_crossentropy',
              metrics=['accuracy'])
# `batch_size`, `epochs` and `earlyStopping` are read from notebook-level
# variables; `batch_size` may have been re-bound by a grid-search cell.
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2,
         callbacks=[earlyStopping])

model.save('s2s.h5')

Train on 10445 samples, validate on 2612 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/10

  '. They will not be included '


In [None]:
# Attention LSTM
# Code from
# https://medium.com/@jbetker/implementing-seq2seq-with-attention-in-keras-63565c8e498c

In [8]:
# RNN "Cell" classes in Keras perform the actual data transformations at each timestep. Therefore, in order
# to add attention to LSTM, we need to make a custom subclass of LSTMCell.
class AttentionLSTMCell(LSTMCell):
    """LSTMCell that augments its input with an attention context vector.

    At every step the cell scores each timestep of a stored input sequence
    against its current carry state, forms the score-weighted sum of that
    sequence (the context vector), concatenates it to the regular step input
    and then runs the standard LSTMCell computation.

    `setInputSequence()` must be called before `call()`.
    """
    def __init__(self, **kwargs):
        # When True, call() returns the attention scores instead of the LSTM output.
        self.attentionMode = False
        super(AttentionLSTMCell, self).__init__(**kwargs)
    
    # Build is called to initialize the variables that our cell will use. We will let other Keras
    # classes (e.g. "Dense") actually initialize these variables.
    # NOTE(review): the three Dense helpers are plain attributes created here; confirm that their
    # weights are tracked as trainable weights of this cell in the Keras version in use.
    @tf_utils.shape_type_conversion
    def build(self, input_shape):        
        # Converts the input sequence into a sequence which can be matched up to the internal
        # hidden state.
        self.dense_constant = TimeDistributed(Dense(self.units, name="AttLstmInternal_DenseConstant"))
        
        # Transforms the internal hidden state into something that can be used by the attention
        # mechanism.
        self.dense_state = Dense(self.units, name="AttLstmInternal_DenseState")
        
        # Transforms the combined hidden state and converted input sequence into one attention
        # score per timestep (turned into probabilities by the softmax in call()).
        self.dense_transform = Dense(1, name="AttLstmInternal_DenseTransform")
        
        # We will augment the input into LSTMCell by concatenating the context vector. Modify
        # input_shape to reflect this. input_shape is indexed as a list of shapes here: the first
        # entry is the step input, the last entry is the constant (attended) sequence.
        batch, input_dim = input_shape[0]
        batch, timesteps, context_size = input_shape[-1]
        lstm_input = (batch, input_dim + context_size)
        
        # The LSTMCell superclass expects no constant input, so strip that out.
        return super(AttentionLSTMCell, self).build(lstm_input)
    
    # This must be called before call(). The "input sequence" is the output from the 
    # encoder. This function will do some pre-processing on that sequence which will
    # then be used in subsequent calls.
    def setInputSequence(self, input_seq):
        self.input_seq = input_seq
        # Projection of the sequence, computed once and re-used at every step.
        self.input_seq_shaped = self.dense_constant(input_seq)
        # Dynamic number of timesteps, taken from the second-to-last axis.
        self.timesteps = tf.shape(self.input_seq)[-2]
    
    # This is a utility method to adjust the output of this cell. When attention mode is
    # turned on, the cell outputs attention probability vectors across the input sequence.
    def setAttentionMode(self, mode_on=False):
        self.attentionMode = mode_on
    
    # This method sets up the computational graph for the cell. It implements the actual logic
    # that the model follows.
    # Note: `constants` is accepted but not read here; the attended sequence comes from the
    # attributes stored by setInputSequence().
    def call(self, inputs, states, constants):
        # Separate the state list into the two discrete state vectors.
        # ytm is the "memory state", stm is the "carry state".
        ytm, stm = states
        # We will use the "carry state" to guide the attention mechanism. Repeat it across all
        # input timesteps to perform some calculations on it.
        stm_repeated = K.repeat(self.dense_state(stm), self.timesteps)
        # Now apply our "dense_transform" operation on the sum of our transformed "carry state" 
        # and all encoder states. This will squash the resultant sum down to a vector of size
        # [batch,timesteps,1]
        # Note: Most sources I encounter use tanh for the activation here. I have found with this dataset
        # and this model, relu seems to perform better. It makes the attention mechanism far more crisp
        # and produces better translation performance, especially with respect to proper sentence termination.
        combined_stm_input = self.dense_transform(
            keras.activations.relu(stm_repeated + self.input_seq_shaped))
        # Softmax over axis 1 (the timestep axis) turns the scores into a probability for each
        # encoder output to receive attention.
        score_vector = keras.activations.softmax(combined_stm_input, 1)
        # In this implementation, we grant "partial attention" to each encoder output based on 
        # its probability computed above. Other options would be to only give attention
        # to the highest probability encoder output or some similar set.
        context_vector = K.sum(score_vector * self.input_seq, 1)
        
        # Finally, mutate the input vector. It will now contain the traditional inputs (like the seq2seq
        # we trained above) in addition to the attention context vector we calculated earlier in this method.
        inputs = K.concatenate([inputs, context_vector])
        
        # Call into the super-class to invoke the LSTM math.
        res = super(AttentionLSTMCell, self).call(inputs=inputs, states=states)
        
        # This if statement switches the return value of this method if "attentionMode" is turned on:
        # the scores, reshaped to (-1, timesteps), replace the first element of the superclass result;
        # the second element (presumably the new states — confirm) is passed through unchanged.
        if(self.attentionMode):
            return (K.reshape(score_vector, (-1, self.timesteps)), res[1])
        else:
            return res

# Custom implementation of the Keras LSTM that adds an attention mechanism.
# This is implemented by taking an additional input (using the "constants" of the
# RNN class) into the LSTM: The encoder output vectors across the entire input sequence.
class LSTMWithAttention(RNN):
    """RNN layer that drives an AttentionLSTMCell.

    The sequence to attend over is supplied through the `constants` argument
    of the layer call and handed to the cell before the time loop runs.
    """
    def __init__(self, units, **kwargs):
        # Only `units` is forwarded to the cell; the remaining keyword
        # arguments configure the RNN wrapper itself.
        cell = AttentionLSTMCell(units=units)
        self.units = units
        super(LSTMWithAttention, self).__init__(cell, **kwargs)
        
    @tf_utils.shape_type_conversion
    def build(self, input_shape):
        # input_shape is indexed as a list of shapes; the first entry is the
        # shape of the main input sequence.
        self.input_dim = input_shape[0][-1]
        self.timesteps = input_shape[0][-2]
        return super(LSTMWithAttention, self).build(input_shape) 
    
    # This call is invoked with the entire time sequence. The RNN sub-class is responsible
    # for breaking this up into calls into the cell for each step.
    # The "constants" variable is the key to our implementation. It was specifically added
    # to Keras to accommodate the "attention" mechanism we are implementing.
    def call(self, x, constants, **kwargs):
        # Remember the main input tensor (the first element if a list was passed).
        if isinstance(x, list):
            self.x_initial = x[0]
        else:
            self.x_initial = x
        
        # The only difference in the LSTM computational graph really comes from the custom
        # LSTM Cell that we utilize.
        # Clear the cell's cached dropout masks before running the sequence.
        self.cell._dropout_mask = None
        self.cell._recurrent_dropout_mask = None
        # Give the cell the sequence to attend over (the first constant).
        self.cell.setInputSequence(constants[0])
        return super(LSTMWithAttention, self).call(inputs=x, constants=constants, **kwargs)

# Below is test code to validate that this LSTM class and the associated cell create a
# valid computational graph.
# Smoke test: instantiate the layer with the current notebook-level
# `latent_dim` and call it once on a symbolic input to check that the graph
# can be built.
test = LSTMWithAttention(units=latent_dim, return_sequences=True, return_state=True)
# With attention mode on, the cell returns attention scores instead of LSTM outputs.
test.cell.setAttentionMode(True)
# attenc_inputs2 = Input(shape=(max_encoder_seq_length,))
# attenc_emb2 = Embedding(input_dim=vocab_in_size, output_dim=embedding_dim)
# The same one-hot input tensor serves both as the layer input and as the
# constant sequence to attend over.
attenc_emb2 = Input(shape=(None, num_encoder_tokens))
test(inputs=attenc_emb2, constants=attenc_emb2, initial_state=None)

[<tf.Tensor 'lstm_with_attention_1/transpose_1:0' shape=(?, ?, ?) dtype=float32>,
 <tf.Tensor 'lstm_with_attention_1/while/Exit_2:0' shape=(?, 128) dtype=float32>,
 <tf.Tensor 'lstm_with_attention_1/while/Exit_3:0' shape=(?, 128) dtype=float32>]

In [20]:
def build_attention_model(latent_dim, optimizer, encoder_dropout, decoder_dropout):
    """Build and compile the BiLSTM encoder / attention-LSTM decoder model.

    The bidirectional encoder returns its full output sequence, which the
    decoder (`LSTMWithAttention`) receives as `constants` to attend over, and
    its final forward/backward states, which are concatenated and used as the
    decoder's initial state. The decoder output passes through
    Dropout -> Dense(relu) -> Dropout -> Dense(softmax).

    Both inputs have fixed lengths. Reads the notebook globals
    `max_encoder_seq_length`, `max_decoder_seq_length`, `num_encoder_tokens`
    and `num_decoder_tokens`.

    Args:
        latent_dim: units of each encoder direction; the decoder has twice as many.
        optimizer: optimizer name or instance handed to `model.compile`.
        encoder_dropout: `recurrent_dropout` rate of the encoder LSTM.
        decoder_dropout: rate of the two Dropout layers after the decoder.

    Returns:
        A compiled Keras `Model` taking `[attenc_inputs, attdec_inputs]`.
    """
    attenc_inputs = Input(shape=(max_encoder_seq_length, num_encoder_tokens), name="attenc_inputs")
    attenc_lstm = Bidirectional(LSTM(latent_dim, return_sequences=True, return_state=True, recurrent_dropout=encoder_dropout))
    attenc_outputs, forward_h, forward_c, backward_h, backward_c = attenc_lstm(attenc_inputs)

    # Concatenate both directions; the resulting states are latent_dim*2 wide.
    attstate_h = Concatenate()([forward_h, backward_h])
    attstate_c = Concatenate()([forward_c, backward_c])
    attenc_states = [attstate_h, attstate_c]

    attdec_inputs = Input(shape=(max_decoder_seq_length, num_decoder_tokens))
    attdec_lstm = LSTMWithAttention(units=latent_dim*2, return_sequences=True, return_state=True)
    attdec_lstm_out, _, _ = attdec_lstm(inputs=attdec_inputs,
                                        constants=attenc_outputs,
                                        initial_state=attenc_states)
    attdec_d1 = Dense(latent_dim, activation="relu")
    attdec_d2 = Dense(num_decoder_tokens, activation="softmax")
    attdec_out = attdec_d2(Dropout(rate=decoder_dropout)(attdec_d1(Dropout(rate=decoder_dropout)(attdec_lstm_out))))

    attmodel = Model([attenc_inputs, attdec_inputs], attdec_out)
    # 'accuracy' (as in every other model of this notebook) instead of
    # 'categorical_accuracy': the metric is then reported under the name the
    # EarlyStopping monitor ('val_acc') and the grid search
    # (hist.history["val_acc"]) look up.
    attmodel.compile(optimizer=optimizer, loss="categorical_crossentropy", metrics=['accuracy'])
    return attmodel

# Train a single attention model (64 units per encoder direction, encoder
# dropout 0.5, decoder dropout 0.4). The optimizer is created through the
# `Adam` class, as in the BiLSTM-128 cell, instead of the lowercase `adam`
# alias, which is not available in every Keras release.
# `earlyStopping` must already exist as a notebook-level variable.
model = build_attention_model(64, keras.optimizers.Adam(lr=5e-3), 0.5, 0.4)
hist = model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=128,
    epochs=100,
    validation_split=0.15,
    callbacks=[earlyStopping],
    verbose=1
)

In [None]:
# Grid search over the attention model: 3 state sizes x 2 optimizers x
# 3 encoder dropouts x 3 decoder dropouts x 2 batch sizes = 108 training runs.
# NOTE: the loop variables are notebook globals; `latent_dim` and `batch_size`
# are re-bound here and keep their last values afterwards.
# `earlyStopping` must already exist as a notebook-level variable.
hists = list()
for latent_dim in [32, 64, 128]:
    for optimizer in ['Adam', 'rmsprop']:
        for encoder_dropout in [0.1, 0.3, 0.5]:
            # 0.1 was mistyped as "0,1", which yielded the rates 0 and 1.
            for decoder_dropout in [0.1, 0.3, 0.5]:
                for batch_size in [64, 128]:
                    # The body is indented under the batch_size loop; it was
                    # previously level with the `for` line (a syntax error).
                    print()
                    print("latent_dim, optimizer, encoder_dropout, decoder_dropout, batch_size")
                    print(latent_dim, optimizer, encoder_dropout, decoder_dropout, batch_size)
                    model = build_attention_model(latent_dim, optimizer, encoder_dropout, decoder_dropout)
                    hist = model.fit(
                        [encoder_input_data, decoder_input_data],
                        decoder_target_data,
                        batch_size=batch_size,
                        epochs=epochs,
                        validation_split=0.15,
                        callbacks=[earlyStopping],
                        verbose=0
                    )
                    # The history key depends on the metric name the model was
                    # compiled with; accept either spelling.
                    acc_key = "val_acc" if "val_acc" in hist.history else "val_categorical_accuracy"
                    val_acc = hist.history[acc_key]
                    # Best validation accuracy of this run and the zero-based
                    # index of the epoch that reached it.
                    m = max(val_acc)
                    print("best val_acc:", m)
                    print("on epoch", val_acc.index(m))
                    hists.append(hist)

with open("hists_att.pickle", "wb") as f:
    pickle.dump(hists, f)

In [21]:
# BiLSTM-128 + Att

# Builds and trains one attention model at notebook level, with 128 units per
# encoder direction. Unlike build_attention_model(), the encoder here applies
# input dropout (`encoder_dropout`) in addition to a fixed recurrent dropout of 0.2.
latent_dim = 128
# `Adam` class, as in the BiLSTM-128 cell, instead of the lowercase `adam`
# alias, which is not available in every Keras release.
optimizer = keras.optimizers.Adam(lr=5e-3)
encoder_dropout = 0.4
decoder_dropout = 0.5

attenc_inputs = Input(shape=(max_encoder_seq_length, num_encoder_tokens), name="attenc_inputs")
attenc_lstm = Bidirectional(LSTM(latent_dim, return_sequences=True, return_state=True, dropout=encoder_dropout, recurrent_dropout=0.2))
attenc_outputs, forward_h, forward_c, backward_h, backward_c = attenc_lstm(attenc_inputs)

# Concatenate both directions; the resulting states are latent_dim*2 wide.
attstate_h = Concatenate()([forward_h, backward_h])
attstate_c = Concatenate()([forward_c, backward_c])
attenc_states = [attstate_h, attstate_c]

attdec_inputs = Input(shape=(max_decoder_seq_length, num_decoder_tokens))
attdec_lstm = LSTMWithAttention(units=latent_dim*2, return_sequences=True, return_state=True)
# The full encoder output sequence is passed as `constants` to be attended over.
attdec_lstm_out, _, _ = attdec_lstm(inputs=attdec_inputs,
                                    constants=attenc_outputs,
                                    initial_state=attenc_states)
attdec_d1 = Dense(latent_dim, activation="relu")
attdec_d2 = Dense(num_decoder_tokens, activation="softmax")
attdec_out = attdec_d2(Dropout(rate=decoder_dropout)(attdec_d1(Dropout(rate=decoder_dropout)(attdec_lstm_out))))

attmodel = Model([attenc_inputs, attdec_inputs], attdec_out)
attmodel.compile(optimizer=optimizer, loss="categorical_crossentropy", metrics=['accuracy'])

# No callbacks are passed, so this run always trains for the full 100 epochs.
hist = attmodel.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=64,
    epochs=100,
    validation_split=0.20,
    verbose=1
)

Train on 10445 samples, validate on 2612 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/10

In [23]:
# attmodel.save("oldrus_rus_close_model.h5")

  '. They will not be included '


In [12]:
# units = 128
# from keras.models import load_model
# from keras.utils import CustomObjectScope

# with CustomObjectScope({
#     "AttentionLSTMCell": AttentionLSTMCell,
#     "LSTMWithAttention": LSTMWithAttention,
#     }):
#     model = load_model('oldrus_rus_close_model.h5')