In [None]:
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Dropout
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.utils import plot_model
from tensorflow.keras.optimizers import RMSprop

## Sequence to sequence auto encoder

In this notebook, we implement a sequence-to-sequence autoencoder, which we will train on an English/French translation task.

#### You first need to download the data to your computer. Go to http://www.manythings.org/anki/, pick your favorite language pair (it must be some_language/English!), and place the `.txt` file next to this notebook.

### The idea
We will use a first LSTM, which will be fed english sentences (after embedding of the words). We will capture the final hidden state of the LSTM, and use it to initialize the hidden state of a decoder LSTM.
Then, the decoder LSTM produces a sequence of outputs which correspond to the translated sentence (one-hot representations of the french words).

To simplify, during training, the decoder LSTM is fed, at each step, with the right french sentences (with an extra "<start>" word on the left), but with a delay of one. This is called teacher forcing.

During inference, the decoder LSTM is fed with its own output at every step. This makes the inference task more difficult sadly. There are ways to circumvent this, but they are out of the scope of today !

![image_seq2seq.png](image_seq2seq.png)

### The data

The following cell loads the data from a txt file. `en_texts` is the list of English sentences and `fr_texts` is the list of their French translations.

#### You may need to change the path !

In [None]:
import unicodedata
import string

# Training configuration.
num_samples = 50000  # Number of samples to train on (upper bound on the lines read).
latent_dim = 50      # Latent dimensionality of the encoding space.
epochs = 100         # Number of epochs to train for.
batch_size = 64      # Batch size for training.

# Tab-separated sentence-pair file, expected next to this notebook.
data_path = 'fra.txt'

# Accumulators for the selected English sentences and their French translations.
en_texts, fr_texts = [], []

def clean_string(s):
    """Normalise a sentence before tokenisation.

    The text is lower-cased, apostrophes and commas become spaces, digits are
    deleted, and a few contractions that the apostrophe removal left split in
    two are expanded (for example ``"i m "`` becomes ``"i am "``).

    Args:
        s: the raw sentence.

    Returns:
        The cleaned sentence as a new string.
    """
    # Single-character rewrites: both apostrophe forms and the comma -> space.
    to_space = str.maketrans({"'": " ", "’": " ", ",": " "})
    s = s.lower().translate(to_space)

    s = s.replace(" essaye ", " essaie ")

    # Drop every decimal digit.
    s = s.translate(str.maketrans("", "", "0123456789"))

    # Applied strictly in this order: the generic " re " rule runs first, so
    # the more specific "we re " / "they re " rules only see what it left.
    expansions = (
        (" re ", " are "),
        ("he s ", "he is "),
        ("we re ", "we are "),
        ("they re ", "they are "),
        ("i m ", "i am "),
    )
    for fragment, expanded in expansions:
        s = s.replace(fragment, expanded)
    return s
    
# Only sentences whose cleaned English text starts with one of these prefixes
# are kept by the loading loop. A broader selection, kept for reference:
#eng_prefixes = (
#    "i am ", "i m ",
#    "he is", "he s ",
#    "she is", "she s ",
#    "you are", "you re ",
#    "we are", "we re ",
#    "they are", "they re "
#)
eng_prefixes = ("i am ", "i m ")

# Read the whole file; only the first two tab-separated fields of each line
# (English sentence, French sentence) are used.
with open(data_path, 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')

# `len(lines) - 1` leaves out the last element produced by the split.
for line in lines[: min(num_samples, len(lines) - 1)]:
    fields = line.split('\t')
    if len(fields) < 2:
        # Blank or malformed line: skip it instead of failing on the
        # two-value unpacking below.
        continue
    input_text, target_text = fields[:2]
    # Strip accents: decompose the characters (NFD), then drop the combining
    # marks (Unicode category 'Mn').
    target_text = ''.join(
        c for c in unicodedata.normalize('NFD', target_text)
        if unicodedata.category(c) != 'Mn'
    )
    input_text = clean_string(input_text)
    if input_text.startswith(eng_prefixes) and len(target_text.split(' ')) < 10:
        # input_text is already cleaned. clean_string is not idempotent, so
        # applying it a second time could rewrite the sentence again.
        en_texts.append(input_text)
        fr_texts.append(target_text)

# Surround every target sentence with the start / end markers.
fr_texts = ['<start> ' + elt + ' <end>' for elt in fr_texts]

print(len(en_texts))
for (i, o) in zip(en_texts[:10], fr_texts[:10]):
    print(i, o)

Tokenization of the text.

The tokenizer object in tensorflow.keras.preprocessing.text is useful to convert words into integers, with some extra cleaning operation. Its useful methods in our case are:
1. `fit_on_texts` method on the list of sentences, which fits the tokenizer to the given vocabulary. To be called before doing anything else.
2. The `texts_to_sequences` method, called on the list of sentences: it returns a list of lists of integers, where each integer corresponds to a word.
3. `sequences_to_texts` does the opposite operation.

#### Fit one tokenizer per language, and compute the sequences (i.e. lists of lists of integers) `en_sequences`, `fr_sequences` corresponding to `en_texts` and `fr_texts`.

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
# Keep only vocab_size tokens.
vocab_size = 10000  # maximum vocab size for both languages

# English tokenizer: default character filters.
en_tokenizer = Tokenizer(
    num_words=vocab_size,
    oov_token='<UNK>',
)

# French tokenizer: explicit filter list that contains neither '<' nor '>',
# presumably so that the '<start>' / '<end>' markers added to fr_texts are
# kept as whole tokens.
fr_tokenizer = Tokenizer(
    num_words=vocab_size,
    oov_token='<UNK>',
    filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n',
)

# fit the tokenizers and compute the sequences:


#### Compute the sizes of the english and french vocabularies (the number of different words)

#### Compute the maximum lengths of the english and french sequences.

In [None]:
# Exercise placeholders: replace each `?` with the length of the longest
# sequence in en_sequences / fr_sequences respectively.
en_maxlen = ?
fr_maxlen = ?

print('Max length english:', en_maxlen)
print('Max length french:', fr_maxlen)

#### Convert the sequences back to texts, and look at the result.
The text has been cleaned. Rare words have been replaced by an unknown token, all characters have been lowered etc.

#### We will now prepare the inputs.

You will implement all of these operations yourself, without high-level helper functions.

#### First, prepare an array `input_encoder`. This array should contain the English sequences, padded on the right. Its shape should be `(len(en_sequences), en_maxlen)`. We adopt the convention that padding is the value 0.

In [None]:
import numpy as np

# One row per English sequence, en_maxlen columns, pre-filled with the
# padding value 0.
encoder_shape = (len(en_sequences), en_maxlen)
input_encoder = np.zeros(encoder_shape, dtype=np.float32)

## FILL THE ARRAY

#### Then, prepare an array `input_decoder` containing the french sequences. Its shape should be `(len(fr_sequences), fr_maxlen)`. Each element of this array corresponds to a sequence (with the "start" word: think about what would happen, or test yourself when you have the model, if we omit this "start" word in this input_decoder). Again the sequences are padded to the right.

In [None]:
# One row per French sequence, fr_maxlen columns, pre-filled with the
# padding value 0.
decoder_shape = (len(fr_sequences), fr_maxlen)
input_decoder = np.zeros(decoder_shape, dtype=np.float32)

## FILL THE ARRAY

#### Prepare an array `output_decoder` containing the french sequences. This time:
- don't include the "start" word (which is always the first in the fr_sequences)
- do include the "end" word
- do a one-hot encoding of the classes (without `to_categorical`), because we will use a `categorical_crossentropy` loss on the predictions (which will be a softmax over the French vocabulary)

In [None]:
# One-hot targets indexed as (sentence, time step, French word index).
# The built-in `bool` is used as dtype: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
output_decoder = np.zeros((len(fr_sequences), fr_maxlen, fr_vocab_size), dtype=bool)

## FILL THE ARRAY

#### We will now define the model.

#### We first define the encoder part, using the 'complicated' keras syntax. Complete the code below using the comments.

In [None]:
# Dimension of the word-embedding vectors (also used by the decoder embedding).
embedding_size = 50


# Exercise: declare an embedding layer for the English language.
encoder_embedding =

# Exercise: define an LSTM layer and make it return its state.
# NOTE(review): the decoder cell builds LSTM(256) and receives these states as
# its initial_state, so the number of units here presumably has to be 256 - confirm.
encoder_lstm = 

# We now apply the embedding and the lstm to the input
encoder_inputs = Input(shape=(None,))
embedded_encoder_inputs = encoder_embedding(encoder_inputs)
# The first returned value (the LSTM output) is discarded; only the states are kept.
_,state_h, state_c = encoder_lstm(embedded_encoder_inputs)

# Final hidden and cell states, handed to the decoder as its initial state.
encoder_states = [state_h, state_c]

#### [No code needed]  We now define the decoder part. Look at this code, try to understand what is happening !

In [None]:
# Decoder input: a variable-length sequence of token ids, embedded below.
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(input_dim=fr_vocab_size, output_dim=embedding_size)
embedded_decoder_inputs = decoder_embedding(decoder_inputs)

# Decoder LSTM, started from the encoder's final [h, c] states. It returns the
# full output sequence; its own final states are not needed for training.
decoder_lstm = LSTM(units=256, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(
    embedded_decoder_inputs,
    initial_state=encoder_states,
)

# Softmax over the French vocabulary at every time step.
decoder_dense = Dense(units=fr_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Training model: takes the English and the French token sequences and outputs
# a distribution over French words for each decoder step.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
model.summary()

#### [No code needed] What does the `translate` method do ?
`on_epoch_end` simply wraps `translate` (together with a teacher-forced prediction) so that it can be used as a Keras callback.

In [None]:
# Inference-time encoder: maps an English sequence to the [h, c] states.
encoder_model = Model(encoder_inputs, encoder_states)

# The state inputs must have the decoder LSTM's size; read it from the layer
# instead of repeating the literal used when the layer was created.
decoder_units = decoder_lstm.units
decoder_state_input_h = Input(shape=(decoder_units,))
decoder_state_input_c = Input(shape=(decoder_units,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Re-apply the already-built decoder layers (shared weights), this time
# starting from states supplied as explicit inputs.
embedded_decoder_inputs = decoder_embedding(decoder_inputs)
decoder_outputs2, state_h2, state_c2 = decoder_lstm(embedded_decoder_inputs, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)

# Sampling model: takes the decoder input tokens and the current states, and
# returns the word probabilities together with the updated states. The updated
# states are needed: translate() feeds them back in at the next step.
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2)

# summary() prints the table itself and returns None, so it is not wrapped in
# print() (that would add a stray "None" line).
encoder_model.summary()
decoder_model.summary()

def translate(input_seq, max_chars=52):
    """Greedily decode a translation for one input sequence.

    The sequence is encoded with ``encoder_model``; ``decoder_model`` is then
    run one step at a time, each step being fed the word it produced at the
    previous step (starting from ``'<start>'``).

    Args:
        input_seq: token ids of the sentence to translate, as a 1-D list or
            array. A 2-D ``(batch, time)`` array is also accepted, in which
            case only the last row is translated.
        max_chars: decoding stops as soon as the decoded string is longer
            than this many characters.

    Returns:
        The decoded words, each preceded by a space (``'<end>'`` included
        when it was produced).
    """
    # Hand the encoder an explicit (batch, time) array. Depending on the Keras
    # version, a bare nested list may instead be read as a batch of
    # one-token sequences, so that only the last word would be encoded.
    encoder_input = np.atleast_2d(np.asarray(input_seq))
    h, c = encoder_model.predict(encoder_input)
    if len(h) > 1:
        # Several rows were encoded: keep the states of the last one only.
        h, c = h[-1:], c[-1:]
    states_value = [h, c]

    # Target sequence of length 1 holding the start marker.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = fr_tokenizer.word_index['<start>']

    decoded_sentence = ''
    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        # Greedy choice: most probable word at the last time step.
        sampled_word_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = fr_tokenizer.sequences_to_texts([[sampled_word_index]])[0]
        decoded_sentence += ' ' + sampled_word

        # Exit condition: end marker produced, or maximum length reached.
        if sampled_word == '<end>' or len(decoded_sentence) > max_chars:
            break

        # Feed the sampled word and the updated states back into the decoder.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_word_index
        states_value = [h, c]

    return decoded_sentence
    
# The corresponding callback:
def on_epoch_end(epoch, _):
    """Print one randomly chosen training sentence and its translations.

    Two translations are shown: the one produced by ``translate`` (the decoder
    is fed its own outputs) and the one produced by the training model when it
    is given the reference decoder input (teacher forcing).

    Args:
        epoch: index of the epoch that just finished (unused).
        _: second argument passed by the callback (unused).
    """
    i = np.random.randint(0, len(en_sequences))
    # NOTE(review): cleaned_en_sentences is not defined in the visible cells;
    # presumably it is built in the "convert the sequences back to texts"
    # exercise - confirm.
    sentence = cleaned_en_sentences[i]
    translated_sentence = translate(en_sequences[i])

    # i:i + 1 slices keep the batch axis, so the model receives two arrays of
    # shape (1, maxlen) rather than nested Python lists.
    translations = model.predict([input_encoder[i:i + 1], input_decoder[i:i + 1]])
    translations_sequences = [[np.argmax(elt) for elt in trans] for trans in translations]
    translated_forced = fr_tokenizer.sequences_to_texts(translations_sequences)[0]
    # Keep the text before '<end>'. partition() returns the whole string when
    # the marker is absent, whereas slicing with find() == -1 would silently
    # drop the last character.
    translated_forced = translated_forced.partition('<end>')[0]

    print('-')
    print('Sentence:', sentence)
    print('Translated:', translated_sentence)
    print('Translated with teacher forcing:', translated_forced)
    
from tensorflow.keras.callbacks import LambdaCallback
# Callback object that invokes on_epoch_end at the end of each epoch.
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

# Optional manual check of translate() on the first sequences (disabled):
#for i in range(20):
#    print(translate(en_sequences[i]))


In [None]:
#### We now fit the model !

#### [No code needed] What does this cell do ? Are you satisfied with the results ?

In [None]:
# Teacher-forced predictions for the whole data set: the decoder is given the
# reference French input at every step.
translations = model.predict([input_encoder, input_decoder])
# Most probable word index at every time step of every sentence.
translations_sequences = [[np.argmax(elt) for elt in trans] for trans in translations]
translations = fr_tokenizer.sequences_to_texts(translations_sequences)
# Keep the text before '<end>'. partition() leaves a sentence untouched when
# the marker is absent; slicing with find() == -1 would drop its last character.
translations = [elt.partition('<end>')[0] for elt in translations]
for (tr, en) in zip(translations, cleaned_en_sentences):
    print(tr, en)

#### What do you think of this output ? What do you think is critically limiting this model ? How would you improve the results ?

#### More generally, what do you think of this seq2seq model ? Do you think it is biased in some ways ?