## Import packages
* numpy - package for scientific computing with Python

In [None]:
import numpy as np

## Import keras packages
* Model - Import the functional API to define layers as functions
* Input - the input layer for the models defined using functional api
* LSTM - the Long Short-Term Memory RNN layer
* plot_model - module provides utility functions to plot a Keras model
* TensorBoard - TensorBoard is a visualization tool provided with TensorFlow.
* ModelCheckpoint - Saves the model after every epoch.

In [None]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense
from keras.utils import plot_model
from keras.callbacks import TensorBoard
from keras.callbacks import ModelCheckpoint

## Define model related constants
* Batch size for training.
* Number of epochs to train for.
* Latent dimensionality of the encoding space.
* Number of samples to train on.
* Path to the data txt file on disk. (Downloadable from http://www.manythings.org/anki/)

In [None]:
batch_size = 64  

In [None]:
epochs = 100  

In [None]:
latent_dimension = 256  

In [None]:
num_samples = 10000  

In [None]:
data_path = 'fra-eng/fra.txt'

## Vectorize the input
* Define lists and sets to hold the input and target words and characters respectively.
* Open the file from the data path and read the contents into a list, splitting each line at new line.
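* Iterate over the contents of the list and split each line at the tab character into input text and target text. The target text is then wrapped with a tab as its start-of-sequence character and \n as its end-of-sequence character.
* Only the first min(num_samples, number of lines - 1) lines are used: if num_samples is smaller than the number of lines, we iterate over the first num_samples (10,000) lines; otherwise we iterate until the end of the list, skipping its last element (normally the empty string left by the file's trailing newline).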
* For each character in the input and target text, add them to the sets we defined unless they already exist.
* Sort the input and target characters alphabetically. The sorted() function returns a new sorted list built from the elements of a given iterable.

In [None]:
input_texts = []
target_texts = []

In [None]:
input_characters = set()
target_characters = set()

In [None]:
with open(data_path, 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')

In [None]:
# Build the parallel sentence lists and the two character vocabularies.
# The last element of `lines` is never visited (len(lines) - 1); it is
# presumably the empty string produced by the file's trailing newline.
for line in lines[: min(num_samples, len(lines) - 1)]:
    # Keep only the first two tab-separated fields. Some exports of the
    # dataset carry an extra attribution column, which would make a plain
    # two-name unpacking of line.split('\t') raise ValueError.
    input_text, target_text = line.split('\t')[:2]
    # '\t' marks start-of-sequence and '\n' end-of-sequence for the decoder.
    target_text = '\t' + target_text + '\n'

    input_texts.append(input_text)
    target_texts.append(target_text)

    # set.update adds every character of the string and silently ignores
    # the ones that are already present.
    input_characters.update(input_text)
    target_characters.update(target_text)

In [None]:
len(input_texts)

In [None]:
input_texts

In [None]:
len(target_texts)

In [None]:
target_texts

In [None]:
len(input_characters)

In [None]:
# sorted() accepts any iterable and always returns a new list, so the set
# can be passed in directly.
input_characters = sorted(input_characters)

In [None]:
input_characters

In [None]:
len(target_characters)

In [None]:
# sorted() accepts any iterable and always returns a new list, so the set
# can be passed in directly.
target_characters = sorted(target_characters)

In [None]:
target_characters

## Define input related constants
* Set the number of unique input tokens as the length of input_characters
* Set the number of unique output tokens as the length of target_characters
* Set the maximum length of each input sequence as the length of the longest line in the input texts
* Set the maximum length of each output sequence as the length of the longest line in the target texts

In [None]:
num_encoder_tokens = len(input_characters)

In [None]:
num_decoder_tokens = len(target_characters)

In [None]:
# Length, in characters, of the longest input sentence.
max_encoder_seq_length = max(len(txt) for txt in input_texts)

In [None]:
# Length, in characters, of the longest target sentence (including the
# '\t' and '\n' markers that were added to every target text).
max_decoder_seq_length = max(len(txt) for txt in target_texts)

In [None]:
print('Max sequence length for inputs:', max_encoder_seq_length)

In [None]:
print('Max sequence length for outputs:', max_decoder_seq_length)

## Input preprocessing
* Index the input and output characters in two dictionaries respectively
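* Initialize three numpy arrays of zeroes (encoder input, decoder input and decoder target), each of dimensions [number of samples, max_seq_length, num_tokens], using the encoder or decoder sequence length and token count as appropriate.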
* Turn the sentences into 3 Numpy arrays, encoder_input_data, decoder_input_data, decoder_target_data:
    * encoder_input_data is a 3D array of shape (num_pairs, max_english_sentence_length, num_english_characters) containing a one-hot vectorization of the English sentences.
    * decoder_input_data is a 3D array of shape (num_pairs, max_french_sentence_length, num_french_characters) containing a one-hot vectorization of the French sentences.
    * decoder_target_data is the same as decoder_input_data but offset by one timestep. decoder_target_data[:, t, :] will be the same as decoder_input_data[:, t + 1, :].
* This is done as follows:
    * Use zip to map the contents of input_texts to target_texts and use enumerate to index them. 
    * Iterate over each character in the input/target text to perform one-hot encoding at the positions of the character in the np arrays. 

In [None]:
# Map every input character to its position in the input_characters list.
input_token_index = {char: i for i, char in enumerate(input_characters)}

In [None]:
input_token_index

In [None]:
# Map every target character to its position in the target_characters list.
target_token_index = {char: i for i, char in enumerate(target_characters)}

In [None]:
target_token_index

In [None]:
encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length, num_encoder_tokens),
    dtype='float32')

In [None]:
encoder_input_data.shape

In [None]:
encoder_input_data

In [None]:
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')

In [None]:
decoder_input_data.shape

In [None]:
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')

In [None]:
decoder_target_data.shape

In [None]:
# Fill the three zero-initialised arrays with one-hot encodings.
# i indexes the sentence pair, t the character position within a sentence.
for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    # Encoder input: one 1.0 per character of the source sentence.
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.
    for t, char in enumerate(target_text):
        # Decoder input: the target sentence, starting with its '\t' marker.
        decoder_input_data[i, t, target_token_index[char]] = 1.
        if t > 0:
            # Decoder target: the same sequence shifted one step earlier, so
            # it omits the first character and position t - 1 holds the
            # character the decoder should predict after seeing position t - 1
            # of its input.
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.

In [None]:
print(encoder_input_data)

In [None]:
print(decoder_input_data)

In [None]:
print(decoder_target_data)

## Define the encoder
* Define the input layer. The input to the encoder is a sequence of characters, each encoded as one-hot vectors with length of num_encoder_tokens.
* Define an LSTM layer with the return_state argument set to True. In addition to its regular output, the layer then also returns its final hidden state and final cell state. These states are used when defining the decoder.
* Discard `encoder_outputs` and only keep the states.

In [None]:
encoder_inputs = Input(shape=(None, num_encoder_tokens))

In [None]:
encoder = LSTM(latent_dimension, return_state=True)

In [None]:
encoder_outputs, state_h, state_c = encoder(encoder_inputs)

In [None]:
encoder_states = [state_h, state_c]

## Define the decoder
* The decoder input is defined as a sequence of French character one-hot encoded to binary vectors with a length of num_decoder_tokens.
* The LSTM layer is defined to both return sequences and state. The final hidden and cell states are ignored and only the output sequence of hidden states is referenced.
* The final hidden and cell state from the encoder is used to initialize the state of the decoder. This means every time that the encoder model encodes an input sequence, the final internal states of the encoder model are used as the starting point for outputting the first character in the output sequence. This also means that the encoder and decoder LSTM layers must have the same number of cells, in this case, 256.
* A Dense output layer is used to predict each character. This Dense is used to produce each character in the output sequence in a one-shot manner, rather than recursively, at least during training. This is because the entire target sequence required for input to the model is known during training.

In [None]:
decoder_inputs = Input(shape=(None, num_decoder_tokens))

In [None]:
decoder_lstm = LSTM(latent_dimension, return_sequences=True, return_state=True)

In [None]:
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)

In [None]:
decoder_dense = Dense(num_decoder_tokens, activation='softmax')

In [None]:
decoder_outputs = decoder_dense(decoder_outputs)

## Define the model
* Define the model with inputs for the encoder and the decoder and the output target sequence.
* Compile the model with rmsprop as optimizer which is the usual choice for recurrent neural networks and categorical_crossentropy as the loss function as the targets are categorical.
* Define a checkpoint. Only the best model so far (the one with the lowest training loss) is kept, saved to saved_models/weights.best.hdf5.
* start tensorboard from the terminal using the command 'tensorboard --logdir=/tmp/autoencoder'
* Fit the model. Set the validation_split argument in model.fit to 0.2, then the validation data used will be the last 20% of the data.
* Pass in the checkpoint callback and the tensorboard callback to the callbacks argument.

In [None]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [None]:
plot_model(model, to_file='model.png', show_shapes=True)

In [None]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

In [None]:
import os

# Location of the HDF5 file that receives the best weights seen so far.
filepath="saved_models/weights.best.hdf5"
# Create the target directory up front so that writing the checkpoint does
# not fail on a fresh checkout where saved_models/ does not exist yet.
os.makedirs(os.path.dirname(filepath), exist_ok=True)
# save_best_only with mode='min' keeps the file only when the monitored value
# decreases. NOTE(review): this monitors the training 'loss', not 'val_loss'.
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')

In [None]:
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2,
          callbacks=[TensorBoard(log_dir='/tmp/autoencoder'),checkpoint])

## Define the encoder model for sampling
* Because the training process and inference process (decoding sentences) are quite different, we use different models for both, albeit they all leverage the same inner layers.
* The encoder model is defined as taking the input layer from the encoder in the trained model (encoder_inputs) and outputting the hidden and cell state tensors (encoder_states).

In [None]:
encoder_model = Model(encoder_inputs, encoder_states)

In [None]:
plot_model(encoder_model, to_file='encoder_model.png', show_shapes=True)

## Define the decoder model for sampling
* The decoder requires the hidden and cell states from the encoder as the initial state of the newly defined decoder model. 
* Because the decoder is a separate standalone model, these states will be provided as input to the model, and therefore must first be defined as inputs.
* They can then be specified for use as the initial state of the decoder LSTM layer.
* The encoder is called once per input sequence, while the decoder is called repeatedly, once for each character that is to be generated in the translated sequence.
* On the first call, the hidden and cell states from the encoder will be used to initialize the decoder LSTM layer, provided as input to the model directly.
* On subsequent recursive calls to the decoder, the last hidden and cell state must be provided to the model. 
* Therefore, the decoder must output the hidden and cell states along with the predicted character on each call, so that these states can be assigned to a variable and used on each subsequent recursive call for a given input sequence of English text to be translated.

In [None]:
decoder_state_input_h = Input(shape=(latent_dimension,))

In [None]:
decoder_state_input_c = Input(shape=(latent_dimension,))

In [None]:
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

In [None]:
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)

In [None]:
decoder_states = [state_h, state_c]

In [None]:
decoder_outputs = decoder_dense(decoder_outputs)

In [None]:
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

In [None]:
plot_model(decoder_model, to_file='decoder_model.png', show_shapes=True)

## Sampling
* Reverse-lookup token index to decode sequences back to something readable.
* Define method to test the inference model:
    * Encode the input as state vectors and retrieve initial decoder state.
    * Generate empty target sequence of length 1.
    * Populate the first character of target sequence with the start character(tab).
    * Set stop condition to false and initialise an empty string for the output.
    * Feed the state vectors and 1-char target sequence to the decoder to produce predictions for the next character.
    * Sample the next character using these predictions (we simply use argmax).
    * Append the sampled character to the decoded sentence and make it the new 1-char target sequence.
    * Repeat until we generate the end-of-sequence character or we hit the character limit.
* Pass a subset of training set to this method for trying out decoding.

In [None]:
reverse_input_char_index = dict(
    (i, char) for char, i in input_token_index.items())

In [None]:
reverse_input_char_index

In [None]:
reverse_target_char_index = dict(
    (i, char) for char, i in target_token_index.items())

In [None]:
reverse_target_char_index

In [None]:
def decode_sequence(input_seq):
    """Greedily decode one encoded input sequence into target text.

    The encoder model is run once to obtain the initial decoder states. The
    decoder model is then fed one character at a time, starting with the
    start character '\\t', and at each step the most probable next
    character (argmax) is taken.

    Args:
        input_seq: Input passed unchanged to ``encoder_model.predict``;
            assumed to be a one-hot encoded batch holding a single
            sequence -- TODO confirm against callers.

    Returns:
        The decoded string. Decoding stops after the end character '\\n'
        has been produced (it is included in the result) or once the result
        is longer than ``max_decoder_seq_length``.
    """

    def one_hot_step(token_index):
        # A (1, 1, num_decoder_tokens) array with a single 1 at token_index.
        step = np.zeros((1, 1, num_decoder_tokens))
        step[0, 0, token_index] = 1.
        return step

    # Encoder states seed the decoder; '\t' is the first decoder input.
    states_value = encoder_model.predict(input_seq)
    target_seq = one_hot_step(target_token_index['\t'])

    pieces = []
    produced = 0
    while True:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Greedy choice: most probable token of the last time step.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        pieces.append(sampled_char)
        produced += len(sampled_char)

        # Stop on end-of-sequence or when the length limit is exceeded.
        if sampled_char == '\n' or produced > max_decoder_seq_length:
            break

        # The sampled character and the new states feed the next step.
        target_seq = one_hot_step(sampled_token_index)
        states_value = [h, c]

    return ''.join(pieces)

In [None]:
# Decode up to the first 100 training samples and print each result next to
# its source sentence. The bound is capped by the number of loaded samples so
# that a small dataset does not run past the end of input_texts.
for seq_index in range(min(100, len(input_texts))):
    # Slice instead of indexing to keep the leading axis of size 1.
    input_seq = encoder_input_data[seq_index: seq_index + 1]

    decoded_sentence = decode_sequence(input_seq)

    print('-')
    print('Input sentence:', input_texts[seq_index])
    print('Decoded sentence:', decoded_sentence)