In [1]:
# Python ≥3.5 is required
import re
import sys
assert sys.version_info >= (3, 5)


def _major_minor(version):
    """Return the leading ``(major, minor)`` of a version string as a tuple of ints.

    Version strings must not be compared directly: string comparison is
    lexicographic, so "0.9" >= "0.20" is True and "10.0" >= "2.0" is False.
    Raises ValueError if `version` does not start with "<digits>.<digits>".
    """
    match = re.match(r"(\d+)\.(\d+)", version)
    if match is None:
        raise ValueError("Unrecognized version string: {!r}".format(version))
    return int(match.group(1)), int(match.group(2))


# Scikit-Learn ≥0.20 is required
import sklearn
assert _major_minor(sklearn.__version__) >= (0, 20)

# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
assert _major_minor(tf.__version__) >= (2, 0)

if not tf.config.list_physical_devices('GPU'):
    print("No GPU was detected. LSTMs and CNNs can be very slow without a GPU.")

# Common imports
import numpy as np
import os

# To make this notebook's output stable across runs
np.random.seed(42)
tf.random.set_seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "nlp"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure under `IMAGES_PATH`.

    The file is named ``<fig_id>.<fig_extension>`` and `resolution` is passed
    to `plt.savefig` as the DPI. When `tight_layout` is true,
    `plt.tight_layout()` is applied before the figure is written.
    """
    file_name = fig_id + "." + fig_extension
    target = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

No GPU was detected. LSTMs and CNNs can be very slow without a GPU.


In [None]:
### CHAR-RNN ###

In [None]:
## Loading and preparing the dataset ##

In [None]:
shakespeare_url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
filepath = keras.utils.get_file("shakespeare.txt", shakespeare_url)
# Decode explicitly: without `encoding`, open() falls back to the platform's
# default encoding, which is not UTF-8 on every system
with open(filepath, encoding="utf-8") as f:
    shakespeare_text = f.read()

Downloading data from https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt


In [None]:
print(shakespeare_text[:148])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?



In [None]:
# Let's print all the characters from the text
"".join(sorted(set(shakespeare_text.lower())))

"\n !$&',-.3:;?abcdefghijklmnopqrstuvwxyz"

In [None]:
# The default tokenization is at word-level encoding
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True) # char_level=True <=> every ch will be treated as a token
tokenizer.fit_on_texts(shakespeare_text)

In [None]:
tokenizer.texts_to_sequences(["First"])

[[20, 6, 9, 8, 3]]

In [None]:
tokenizer.sequences_to_texts([[20, 6, 9, 8, 3]])
# Tokenizer lowercases the characters by default

['f i r s t']

In [None]:
max_id       = len(tokenizer.word_index) # number of distinct characters
dataset_size = tokenizer.document_count  # total number of characters
max_id, dataset_size

(39, 1115394)

In [None]:
# Let's encode the full text so each ch is represented by its ID
# Also, subtract 1 to get IDs from 0 to 38, rather than from 1 to 39
[encoded]  = np.array(tokenizer.texts_to_sequences([shakespeare_text])) - 1

# Take the first 90% of the text for the training set
train_size = dataset_size * 90 // 100
dataset    = tf.data.Dataset.from_tensor_slices(encoded[:train_size])
# tf.data.Dataset will return each character one by one from this set

In [None]:
# The training set now consists of a single sequence of over a million characters
n_steps       = 100
window_length = n_steps + 1 # target = input shifted 1 character to the right

# Shift the window to the right 1 ch for each step
# 1st window will have characters 0 to 100
# 2nd window will have characters 1 to 101
# ...
dataset       = dataset.window(window_length, shift=1, drop_remainder=True)

# Now, the dataset looks like this:
# Dataset{window_1, window_2, ...}, where `window_i` is also a `Dataset` object of length `window_length`

In [None]:
# Now, the `dataset` is a nested dataset. We need to flatten it
# Nested dataset -> Flat dataset

# We need to convert each `window_i` to a 1-D tensor of `window_length` elements
# The dataset will look like this now:
# Dataset{batch_1, batch_2, ...}, where `batch_i` is a 1-D `Tensor` of length `window_length`

dataset = dataset.flat_map(lambda window: window.batch(window_length))

In [None]:
# Now, the dataset contains consecutive windows of 101 (`window_length`) characters each

In [None]:
np.random.seed(42)
tf.random.set_seed(42)

In [None]:
batch_size = 32

# Shuffle the windows from the dataset (not the inner characters)
# Then batch the dataset, so that each batch will contain 32 tensors ("windows")
dataset    = dataset.shuffle(10000).batch(batch_size)

# Then, separate the inputs (first 100 chars) from the targets (last 100 chars, i.e. the inputs shifted by 1 ch)
dataset    = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))

# windows[:, :-1] == everything without the last  ch
# windows[:, 1:]  == everything without the first ch

In [None]:
# Let's encode each character using a one-hot vector, because there are fairly few distinct characters (only 39)
dataset = dataset.map(lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))

In [None]:
# Finally, we just need to add prefetching
dataset = dataset.prefetch(1)

In [None]:
for X_batch, Y_batch in dataset.take(1):
    print(X_batch.shape, Y_batch.shape)
# (batch_size=32, (seq_len=100, one_hot_enc_len=39)) (batch_size=32, seq_len_target=100)

(32, 100, 39) (32, 100)


In [None]:
## Building and Training the Char-RNN Model ##

In [None]:
# `Warning`: the following code may take up to 24 hours to run, depending on your hardware.
# If you use a GPU, it may take just 1 or 2 hours, or less.

# `Note`: the GRU class will only use the GPU (if you have one) when using the default values
# for the following arguments: [activation, recurrent_activation, recurrent_dropout, unroll, use_bias and reset_after].
# This is why I commented out recurrent_dropout=0.2

model = keras.models.Sequential([
    keras.layers.GRU(128, return_sequences=True, input_shape=[None, max_id],
                     #dropout=0.2, recurrent_dropout=0.2),
                     dropout=0.2),
    keras.layers.GRU(128, return_sequences=True,
                     #dropout=0.2, recurrent_dropout=0.2),
                     dropout=0.2),
    keras.layers.TimeDistributed(keras.layers.Dense(max_id, activation="softmax")) # max_id = 39 (distinct characters)
])

model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")

# history = model.fit(dataset, epochs=10)

In [None]:
# ...

In [None]:
## Stateful RNN ##

In [None]:
# Until now, we have used only `stateless` RNNs: at each training iteration the model
# starts with a hidden state full of zeros, then it updates this state at each time step, and
# after the last time step, it throws it away, as it is not needed anymore. What if we told
# the RNN to preserve this final state after processing one training batch and use it as
# the initial state for the next training batch? This way the model can learn long-term
# patterns despite only backpropagating through short sequences. This is called a `stateful` RNN.

In [None]:
tf.random.set_seed(42)

In [None]:
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])
dataset = dataset.window(window_length, shift=n_steps, drop_remainder=True)
dataset = dataset.flat_map(lambda window: window.batch(window_length))
dataset = dataset.batch(1)
dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))
dataset = dataset.map(lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))
dataset = dataset.prefetch(1)
# This time, we have batches of size 1 (building larger batches, e.g. of 32 windows, is harder for a stateful RNN)

In [None]:
# `Note`: once again, I commented out recurrent_dropout=0.2 so you can get GPU acceleration
model = keras.models.Sequential([
    keras.layers.GRU(128, return_sequences=True, stateful=True,
                     #dropout=0.2, recurrent_dropout=0.2,
                     dropout=0.2,
                     # A stateful layer needs a fixed batch size, and it must match the
                     # batches fed to the model: the stateful `dataset` is built with
                     # `.batch(1)`, whereas the global `batch_size` is still 32 from the
                     # stateless pipeline.
                     batch_input_shape=[1, None, max_id]),
    keras.layers.GRU(128, return_sequences=True, stateful=True,
                     #dropout=0.2, recurrent_dropout=0.2),
                     dropout=0.2),
    # One softmax over the `max_id` distinct characters at every time step
    keras.layers.TimeDistributed(keras.layers.Dense(max_id, activation="softmax"))
])

In [None]:
# At the end of each *epoch*, we need to reset the states before we go back
# to the beginning of the text. For this, we can use a small callback.

class ResetStatesCallback(keras.callbacks.Callback):
    """Keras callback that resets the model's states at the start of every epoch."""

    def on_epoch_begin(self, epoch, logs=None):
        # `logs` defaults to None, like the base `Callback.on_epoch_begin` hook,
        # so the method can also be called without it. Neither argument is used.
        self.model.reset_states()

In [None]:
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")
# history = model.fit(dataset, epochs=50, callbacks=[ResetStatesCallback()])

In [None]:
# After this model is trained, it will only be possible to use it to make
# predictions for batches of the same size as were used during training.
# To avoid this restriction, create an identical stateless model,
# and copy the stateful model’s weights to this model.

In [None]:
### SENTIMENT ANALYSIS ###

In [None]:
tf.random.set_seed(42)

In [None]:
# Let's load the IMDB dataset
(X_train, y_train), (X_test, y_test) = keras.datasets.imdb.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [None]:
X_train[0][:10]

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65]

In [None]:
# Where are the movie reviews? Well, as you can see, the dataset is already preprocessed
# for you: X_train consists of a list of reviews, each of which is represented as a
# NumPy array of integers, where each integer represents a word. All punctuation was
# removed, and then words were converted to lowercase, split by spaces, and finally
# indexed by frequency (so low integers correspond to frequent words). The integers 0,
# 1, and 2 are special: they represent the padding token, the start-of-sequence (SOS)
# token, and unknown words, respectively. If you want to visualize a review, you can
# decode it like this:

In [None]:
word_index = keras.datasets.imdb.get_word_index()
id_to_word = {id_ + 3: word for word, id_ in word_index.items()}
for id_, token in enumerate(("<pad>", "<sos>", "<unk>")):
    id_to_word[id_] = token
" ".join([id_to_word[id_] for id_ in X_train[0][:10]])

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


'<sos> this film was just brilliant casting location scenery story'

In [None]:
# If you want to deploy your model to a mobile device or a web browser, and you don’t
# want to have to write a different preprocessing function every time, then you will
# want to handle preprocessing using only TensorFlow operations, so it can be included
# in the model itself. Let’s see how. First, let’s load the original IMDb reviews, as text
# (byte strings), using TensorFlow Datasets

In [None]:
import tensorflow_datasets as tfds
datasets, info = tfds.load("imdb_reviews", as_supervised=True, with_info=True)

[1mDownloading and preparing dataset imdb_reviews/plain_text/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]





0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteB77KFF/imdb_reviews-train.tfrecord


  0%|          | 0/25000 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteB77KFF/imdb_reviews-test.tfrecord


  0%|          | 0/25000 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteB77KFF/imdb_reviews-unsupervised.tfrecord


  0%|          | 0/50000 [00:00<?, ? examples/s]



[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


In [None]:
datasets.keys()

dict_keys(['test', 'train', 'unsupervised'])

In [None]:
train_size = info.splits["train"].num_examples
test_size  = info.splits["test"].num_examples

In [None]:
train_size, test_size

(25000, 25000)

In [None]:
# Let's see some review examples
for X_batch, y_batch in datasets["train"].batch(5).take(1):
    for review, label in zip(X_batch.numpy(), y_batch.numpy()):
        print("Review:", review.decode("utf-8")[:200], "...")
        print("Label:", label, "= Positive" if label else "= Negative")
        print()

Review: This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting  ...
Label: 0 = Negative

Review: I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However  ...
Label: 0 = Negative

Review: Mann photographs the Alberta Rocky Mountains in a superb fashion, and Jimmy Stewart and Walter Brennan give enjoyable performances as they always seem to do. <br /><br />But come on Hollywood - a Moun ...
Label: 0 = Negative

Review: This is the kind of film for a snowy Sunday afternoon when the rest of the world can go ahead with its own business as you descend into a big arm-chair and mellow for a couple of hours. Wonderful perf ...
Label: 1 = Positive

Review: As others have mentioned, all the women that go nude in 

In [None]:
def preprocess(X_batch, y_batch):
    """Turn a batch of raw reviews into a dense, padded tensor of words.

    Each review in `X_batch` is truncated to its first 300 characters
    (`tf.strings.substr` counts bytes unless `unit` is given), `<br />` tags
    and every character other than a letter or an apostrophe are replaced by
    spaces, and the result is split on whitespace. The labels in `y_batch`
    are returned unchanged.
    """
    truncated = tf.strings.substr(X_batch, 0, 300)
    without_tags = tf.strings.regex_replace(truncated, rb"<br\s*/?>", b" ")
    letters_only = tf.strings.regex_replace(without_tags, b"[^a-zA-Z']", b" ")
    # Splitting yields a ragged tensor, since reviews have different word counts
    words = tf.strings.split(letters_only)
    # Pad every review with `<pad>` so that all rows have the same length
    padded_words = words.to_tensor(default_value=b"<pad>")
    return padded_words, y_batch

In [None]:
preprocess(X_batch, y_batch)

(<tf.Tensor: shape=(5, 59), dtype=string, numpy=
 array([[b'This', b'was', b'an', b'absolutely', b'terrible', b'movie',
         b"Don't", b'be', b'lured', b'in', b'by', b'Christopher',
         b'Walken', b'or', b'Michael', b'Ironside', b'Both', b'are',
         b'great', b'actors', b'but', b'this', b'must', b'simply', b'be',
         b'their', b'worst', b'role', b'in', b'history', b'Even',
         b'their', b'great', b'acting', b'could', b'not', b'redeem',
         b'this', b"movie's", b'ridiculous', b'storyline', b'This',
         b'movie', b'is', b'an', b'early', b'nineties', b'US',
         b'propaganda', b'pi', b'<pad>', b'<pad>', b'<pad>', b'<pad>',
         b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>'],
        [b'I', b'have', b'been', b'known', b'to', b'fall', b'asleep',
         b'during', b'films', b'but', b'this', b'is', b'usually', b'due',
         b'to', b'a', b'combination', b'of', b'things', b'including',
         b'really', b'tired', b'being', b'warm', b'and', b'c

In [None]:
# Next, we need to construct the vocabulary. This requires going through the whole
# training set once, applying our `preprocess()` function, and using a `Counter` to count
# the number of occurrences of each word

In [None]:
from collections import Counter

vocabulary = Counter()
for X_batch, y_batch in datasets["train"].batch(32).map(preprocess):
    for review in X_batch:
        vocabulary.update(list(review.numpy()))

In [None]:
vocabulary.most_common()[:3]

[(b'<pad>', 214309), (b'the', 61137), (b'a', 38564)]

In [None]:
len(vocabulary)

53893

In [None]:
# Great! We probably don’t need our model to know all the words in the dictionary to
# get good performance, though, so let’s truncate the vocabulary, keeping only the
# 10,000 most common words

In [None]:
vocab_size = 10000
# `most_common(n)` yields the n highest-count (word, count) pairs, most frequent first
truncated_vocabulary = [word for word, _ in vocabulary.most_common(vocab_size)]

In [None]:
# Map each word of the truncated vocabulary to its rank (0 = most common word)
word_to_id = {word: index for index, word in enumerate(truncated_vocabulary)}
for word in b"This movie was faaaaaantastic".split():
    # Use the `.get()` default rather than `or`: ID 0 is a valid ID but is falsy,
    # so `word_to_id.get(word) or ...` would report the most common word as missing
    print(word_to_id.get(word, f"no ID was found for {word}"))

22
12
11
no ID was found for b'faaaaaantastic'


In [None]:
words      = tf.constant(truncated_vocabulary)
word_ids   = tf.range(len(truncated_vocabulary), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(words, word_ids)

num_oov_buckets = 1000
table           = tf.lookup.StaticVocabularyTable(vocab_init, num_oov_buckets)

In [None]:
table.lookup(tf.constant([b"This movie was faaaaaantastic".split()]))

<tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[   22,    12,    11, 10053]])>

In [None]:
# This is the final preprocessing step for our training dataset
def encode_words(X_batch, y_batch):
    """Look up every word of `X_batch` in the vocabulary `table`; `y_batch` passes through unchanged."""
    word_ids_batch = table.lookup(X_batch)
    return word_ids_batch, y_batch

train_set = datasets["train"].batch(32).map(preprocess)
train_set = train_set.map(encode_words).prefetch(1)

In [None]:
for X_batch, y_batch in train_set.take(1):
    print(X_batch)
    print(y_batch)

tf.Tensor(
[[  22   11   28 ...    0    0    0]
 [   6   21   70 ...    0    0    0]
 [4099 6881    1 ...    0    0    0]
 ...
 [  22   12  118 ...  331 1047    0]
 [1757 4101  451 ...    0    0    0]
 [3365 4392    6 ...    0    0    0]], shape=(32, 60), dtype=int64)
tf.Tensor([0 0 0 1 1 1 0 0 0 0 0 1 1 0 1 0 1 1 1 0 1 1 1 1 1 0 0 0 1 0 0 0], shape=(32,), dtype=int64)


In [None]:
# Now, we can create the model and train it
embed_size = 128

model = keras.models.Sequential([
    keras.layers.Embedding(vocab_size + num_oov_buckets,
                           embed_size,
                           mask_zero=True, # this means that padding tokens (IDS 0) will be ignored by all downstream layers
                           input_shape=[None]),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.GRU(128),
    keras.layers.Dense(1, activation="sigmoid")
])

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

history = model.fit(train_set, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
## Reusing Pretrained Embeddings ##

In [None]:
tf.random.set_seed(42)

In [None]:
TFHUB_CACHE_DIR = os.path.join(os.curdir, "my_tfhub_cache")
os.environ["TFHUB_CACHE_DIR"] = TFHUB_CACHE_DIR

In [None]:
import tensorflow_hub as hub

model = keras.Sequential([
    hub.KerasLayer("https://tfhub.dev/google/tf2-preview/nnlm-en-dim50/1",
                   dtype=tf.string, input_shape=[], output_shape=[50]),
    keras.layers.Dense(128, activation="relu"),
    keras.layers.Dense(1, activation="sigmoid")
])
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
for dirpath, dirnames, filenames in os.walk(TFHUB_CACHE_DIR):
    for filename in filenames:
        print(os.path.join(dirpath, filename))

./my_tfhub_cache/82c4aaf4250ffb09088bd48368ee7fd00e5464fe.descriptor.txt
./my_tfhub_cache/82c4aaf4250ffb09088bd48368ee7fd00e5464fe/saved_model.pb
./my_tfhub_cache/82c4aaf4250ffb09088bd48368ee7fd00e5464fe/variables/variables.data-00000-of-00001
./my_tfhub_cache/82c4aaf4250ffb09088bd48368ee7fd00e5464fe/variables/variables.index
./my_tfhub_cache/82c4aaf4250ffb09088bd48368ee7fd00e5464fe/assets/tokens.txt


In [None]:
# Next, we can just load the IMDb reviews dataset — no need to preprocess it
# (except for batching and prefetching) — and directly train the model

In [None]:
import tensorflow_datasets as tfds

datasets, info = tfds.load("imdb_reviews", as_supervised=True, with_info=True)
train_size = info.splits["train"].num_examples
batch_size = 32
train_set = datasets["train"].batch(batch_size).prefetch(1)
history = model.fit(train_set, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
### EXERCISES ###

In [None]:
# 1. What are the pros and cons of using a stateful RNN versus a stateless RNN?

# Stateless RNNs can only capture patterns whose length is less than, or equal to,
# the size of the windows the RNN is trained on. Conversely, stateful RNNs can
# capture longer-term patterns. However, implementing a stateful RNN is much
# harder—especially preparing the dataset properly. Moreover, stateful RNNs do
# not always work better, in part because consecutive batches are not independent
# and identically distributed (IID). Gradient Descent is not fond of non-IID datasets

In [None]:
# 2. Why do people use Encoder–Decoder RNNs rather than plain sequence-to-sequence RNNs for automatic translation?

# In general, if you translate a sentence one word at a time, the result will be
# terrible. For example, the French sentence “Je vous en prie” means “You are welcome,”
# but if you translate it one word at a time, you get “I you in pray.” Huh? It is much
# better to read the whole sentence first and then translate it. A plain sequence-to-sequence RNN
# would start translating a sentence immediately after reading the
# first word, while an Encoder–Decoder RNN will first read the whole sentence
# and then translate it. That said, one could imagine a plain sequence-to-sequence
# RNN that would output silence whenever it is unsure about what to say next (just
# like human translators do when they must translate a live broadcast).

In [None]:
# 3. How can you deal with variable-length input sequences? What about variable length output sequences?

# Variable-length input sequences can be handled by padding the shorter sequences
# so that all sequences in a batch have the same length, and using masking to
# ensure the RNN ignores the padding token. For better performance, you may
# also want to create batches containing sequences of similar sizes. Ragged tensors
# can hold sequences of variable lengths, and tf.keras will likely support them
# eventually, which will greatly simplify handling variable-length input sequences (at
# the time of this writing, it is not the case yet). Regarding variable-length output
# sequences, if the length of the output sequence is known in advance (e.g., if you
# know that it is the same as the input sequence), then you just need to configure
# the loss function so that it ignores tokens that come after the end of the sequence.
# Similarly, the code that will use the model should ignore tokens beyond the end
# of the sequence. But generally the length of the output sequence is not known
# ahead of time, so the solution is to train the model so that it outputs an
# end-of-sequence token at the end of each sequence.

In [None]:
# 4. What is beam search and why would you use it? What tool can you use to implement it?

# Beam search is a technique used to improve the performance of a trained
# Encoder–Decoder model, for example in a neural machine translation system.
# The algorithm keeps track of a short list of the k most promising output sentences
# (say, the top three), and at each decoder step it tries to extend them by one
# word; then it keeps only the k most likely sentences. The parameter k is called the
# beam width: the larger it is, the more CPU and RAM will be used, but also the
# more accurate the system will be. Instead of greedily choosing the most likely
# next word at each step to extend a single sentence, this technique allows the
# system to explore several promising sentences simultaneously. Moreover, this
# technique lends itself well to parallelization. You can implement beam search fairly
# easily using TensorFlow Addons.

In [None]:
# 5. What is an attention mechanism? How does it help?

# An attention mechanism is a technique initially used in Encoder–Decoder models
# to give the decoder more direct access to the input sequence, allowing it to
# deal with longer input sequences. At each decoder time step, the current
# decoder’s state and the full output of the encoder are processed by an alignment model
# that outputs an alignment score for each input time step. This score indicates
# which part of the input is most relevant to the current decoder time step. The
# weighted sum of the encoder output (weighted by their alignment score) is then
# fed to the decoder, which produces the next decoder state and the output for this
# time step. The main benefit of using an attention mechanism is the fact that the
# Encoder–Decoder model can successfully process longer input sequences.
# Another benefit is that the alignment scores make the model easier to debug and
# interpret: for example, if the model makes a mistake, you can look at which part
# of the input it was paying attention to, and this can help diagnose the issue. An
# attention mechanism is also at the core of the Transformer architecture, in the
# Multi-Head Attention layers.

In [None]:
# 6. What is the most important layer in the Transformer architecture? What is its purpose?

# The most important layer in the Transformer architecture is the Multi-Head
# Attention layer (the original Transformer architecture contains 18 of them,
# including 6 Masked Multi-Head Attention layers). It is at the core of language
# models such as BERT and GPT-2. Its purpose is to allow the model to identify
# which words are most aligned with each other, and then improve each word’s
# representation using these contextual clues

In [None]:
# 7. When would you need to use sampled softmax?

# Sampled softmax is used when training a classification model when there are
# many classes (e.g., thousands). It computes an approximation of the crossentropy loss
# based on the logit predicted by the model for the correct class, and
# the predicted logits for a sample of incorrect words. This speeds up training
# considerably compared to computing the softmax over all logits and then estimating
# the cross-entropy loss. After training, the model can be used normally, using the
# regular softmax function to compute all the class probabilities based on all the
# logits.

In [None]:
# 8. Embedded Reber grammars were used by Hochreiter and Schmidhuber in their
# paper about LSTMs. They are artificial grammars that produce strings such as
# “BPBTSXXVPSEPE.” Check out Jenny Orr’s nice introduction to this topic.
# Choose a particular embedded Reber grammar (such as the one represented on
# Jenny Orr’s page), then train an RNN to identify whether a string respects that
# grammar or not. You will first need to write a function capable of generating a
# training batch containing about 50% strings that respect the grammar, and 50% that don’t.

In [None]:
# First we need to build a function that generates strings based on a grammar.
# The grammar will be represented as a list of possible transitions for each state.
# A transition specifies the string to output (or a grammar to generate it) and the next state.

In [None]:
# This is the chosen Reber grammar: https://www.willamette.edu/~gorr/classes/cs449/reber.html

In [None]:
# List index = Current state
# Grammar is represented as a list of lists,
# where each inner list defines all the next conditional states

In [None]:
default_reber_grammar = [
    [("B", 1)],           # (state 0) =B=>(state 1)
    [("T", 2), ("P", 3)], # (state 1) =T=>(state 2) or =P=>(state 3)
    [("S", 2), ("X", 4)], # (state 2) =S=>(state 2) or =X=>(state 4)
    [("T", 3), ("V", 5)], # and so on...
    [("X", 3), ("S", 6)],
    [("P", 4), ("V", 6)],
    [("E", None)]]        # (state 6) =E=>(terminal state)

In [None]:
embedded_reber_grammar = [
    [("B", 1)],
    [("T", 2), ("P", 3)],
    [(default_reber_grammar, 4)], # "recursive"
    [(default_reber_grammar, 5)], # "recursive"
    [("T", 6)],
    [("P", 6)],
    [("E", None)]]

In [None]:
def generate_string(grammar):
    """Generate one random string by walking `grammar` from state 0.

    `grammar` is a list indexed by state; each entry lists the possible
    `(production, next_state)` transitions for that state. One transition is
    drawn uniformly with `np.random.randint` at every step until the next
    state is `None`. A production that is itself a list is treated as a
    nested grammar and expanded recursively.
    """
    pieces = []
    current = 0
    while current is not None:
        transitions = grammar[current]
        choice = np.random.randint(len(transitions))
        symbol, current = transitions[choice]
        # A list production is a sub-grammar (e.g. `default_reber_grammar`): expand it
        pieces.append(generate_string(grammar=symbol) if isinstance(symbol, list) else symbol)
    return "".join(pieces)

In [None]:
# Let's generate a few strings based on the `default` Reber grammar
np.random.seed(42)

for _ in range(25):
    print(generate_string(default_reber_grammar), end=" ")

BTXXTTVPXTVPXTTVPSE BPVPSE BTXSE BPVVE BPVVE BTSXSE BPTVPXTTTVVE BPVVE BTXSE BTXXVPSE BPTTTTTTTTVVE BTXSE BPVPSE BTXSE BPTVPSE BTXXTVPSE BPVVE BPVVE BPVVE BPTTVVE BPVVE BPVVE BTXXVVE BTXXVVE BTXXVPXVVE 

In [None]:
# Looks good. Now let's generate a few strings based on the `embedded` Reber grammar
np.random.seed(42)

for _ in range(25):
    print(generate_string(embedded_reber_grammar), end=" ")

BTBPTTTVPXTVPXTTVPSETE BPBPTVPSEPE BPBPVVEPE BPBPVPXVVEPE BPBTXXTTTTVVEPE BPBPVPSEPE BPBTXXVPSEPE BPBTSSSSSSSXSEPE BTBPVVETE BPBTXXVVEPE BPBTXXVPSEPE BTBTXXVVETE BPBPVVEPE BPBPVVEPE BPBTSXSEPE BPBPVVEPE BPBPTVPSEPE BPBTXXVVEPE BTBPTVPXVVETE BTBPVVETE BTBTSSSSSSSXXVVETE BPBTSSSXXTTTTVPSEPE BTBPTTVVETE BPBTXXTVVEPE BTBTXSETE 

In [None]:
# Okay, now we need a function to generate strings that do not respect the grammar.
# We could generate a random string, but the task would be a bit too easy,
# so instead we will generate a string that respects the grammar,
# and we will corrupt it by changing just one character

In [None]:
POSSIBLE_CHARS = "BEPSTVX"

def generate_corrupted_string(grammar, chars=POSSIBLE_CHARS):
    """Generate a string from `grammar`, then replace one of its characters.

    A random position is drawn with `np.random.randint`, and the character at
    that position is substituted by a different character drawn from `chars`
    with `np.random.choice`. The result has the same length as the generated
    string and differs from it at exactly one position.
    """
    valid = generate_string(grammar)
    position = np.random.randint(len(valid))
    original_char = valid[position]
    # Sorted so that the candidate order (and thus the draw) is reproducible
    candidates = sorted(set(chars) - {original_char})
    replacement = np.random.choice(candidates)
    # Substitute the character at `position` with the replacement
    return valid[:position] + replacement + valid[position + 1:]

In [None]:
# Let's look at a few corrupted strings
np.random.seed(42)

for _ in range(25):
    print(generate_corrupted_string(embedded_reber_grammar), end=" ")

BTBPTTTPPXTVPXTTVPSETE BPBTXEEPE BPBPTVVVEPE BPBTSSSSXSETE BPTTXSEPE BTBPVPXTTTTTTEVETE BPBTXXSVEPE BSBPTTVPSETE BPBXVVEPE BEBTXSETE BPBPVPSXPE BTBPVVVETE BPBTSXSETE BPBPTTTPTTTTTVPSEPE BTBTXXTTSTVPSETE BBBTXSETE BPBTPXSEPE BPBPVPXTTTTVPXTVPXVPXTTTVVEVE BTBXXXTVPSETE BEBTSSSSSXXVPXTVVETE BTBXTTVVETE BPBTXSTPE BTBTXXTTTVPSBTE BTBTXSETX BTBTSXSSTE 

In [None]:
# We cannot feed strings directly to an RNN, so we need to encode them somehow. One option would be to one-hot encode each character.
# Another option is to use embeddings. Let's go for the second option (but since there are just a handful of characters,
# one-hot encoding would probably be a good option as well). For embeddings to work, we need to convert each string into a sequence of character IDs.
# Let's write a function for that, using each character's index in the string of possible characters "BEPSTVX"

In [None]:
def string_to_ids(s, chars=POSSIBLE_CHARS):
    """Return the list of IDs of the characters of `s`, each ID being the character's index in `chars`.

    Raises ValueError (from `str.index`) if a character of `s` is not in `chars`.
    """
    return list(map(chars.index, s))

In [None]:
string_to_ids("BTTTXXVVETE")

[0, 4, 4, 4, 6, 6, 5, 5, 1, 4, 1]

In [None]:
# We can now generate the dataset, with 50% good strings, and 50% bad strings
# We generate the dataset based on the `embedded reber grammar`, not on the `default reber grammar` 
def generate_dataset(size):
    """Build a labelled dataset of `size` embedded Reber strings.

    The first `size // 2` samples are valid strings (label 1.), the remaining
    ones are corrupted strings (label 0.). All strings are encoded as
    character IDs with `string_to_ids`.

    Returns:
        X: a ragged tensor (ragged_rank=1) holding one ID sequence per sample.
        y: a NumPy array with one `[label]` row per sample.
    """
    n_good = size // 2
    n_bad = size - n_good

    # The valid strings are generated first, then the corrupted ones
    # (this order determines how the random number generator is consumed).
    good_strings = [string_to_ids(generate_string(embedded_reber_grammar))
                    for _ in range(n_good)]
    bad_strings = [string_to_ids(generate_corrupted_string(embedded_reber_grammar))
                   for _ in range(n_bad)]

    # Sequences have different lengths, hence the ragged tensor
    X = tf.ragged.constant(good_strings + bad_strings, ragged_rank=1)

    # One [1.] row per valid string followed by one [0.] row per corrupted string
    y = np.array([[1.]] * len(good_strings) + [[0.]] * len(bad_strings))

    return X, y

In [None]:
# Fix NumPy's seed so that the generated strings are the same on every run
np.random.seed(42)

# 10,000 training samples and 2,000 validation samples; each call returns
# the ragged tensor of character IDs and the matching array of 0./1. labels.
# The training set is generated first, so it consumes the random stream first.
X_train, y_train = generate_dataset(10000)
X_valid, y_valid = generate_dataset(2000)

In [None]:
# Let's take a look at the first training sequence
X_train[0]

<tf.Tensor: shape=(22,), dtype=int32, numpy=
array([0, 4, 0, 2, 4, 4, 4, 5, 2, 6, 4, 5, 2, 6, 4, 4, 5, 2, 3, 1, 4, 1],
      dtype=int32)>

In [None]:
y_train[0]

array([1.])

In [None]:
X_train[0].shape, X_train[1].shape

(TensorShape([22]), TensorShape([11]))

In [None]:
# Perfect! We are ready to create the RNN to identify good strings. We build a simple sequence binary classifier
# Both NumPy and TensorFlow are seeded before the layers are created.
np.random.seed(42)
tf.random.set_seed(42)

# Size of the vector learned for each character ID
embedding_size = 5

model = keras.models.Sequential([
    # Variable-length sequences of int32 character IDs, fed as a ragged tensor
    keras.layers.InputLayer(input_shape=[None], dtype=tf.int32, ragged=True),
    # One embedding per possible character (IDs are indices into POSSIBLE_CHARS)
    keras.layers.Embedding(input_dim=len(POSSIBLE_CHARS), output_dim=embedding_size),
    # return_sequences is not set, so only the GRU's last output is passed on
    keras.layers.GRU(30),
    # Single sigmoid unit: the labels are 1. for good strings and 0. for bad ones
    keras.layers.Dense(1, activation="sigmoid")
])

# SGD with Nesterov momentum; binary cross-entropy matches the single sigmoid output
optimizer = keras.optimizers.SGD(learning_rate=0.02, momentum=0.95, nesterov=True)
model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])

In [None]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 5)           35        
                                                                 
 gru (GRU)                   (None, 30)                3330      
                                                                 
 dense (Dense)               (None, 1)                 31        
                                                                 
Total params: 3,396
Trainable params: 3,396
Non-trainable params: 0
_________________________________________________________________


In [None]:
history = model.fit(X_train, y_train, epochs=20, validation_data=(X_valid, y_valid))

Epoch 1/20


  "shape. This may consume a large amount of memory." % value)


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# Now let's test our RNN on two tricky strings: the first one is bad while the second one is good. They only differ by the second to last character.
# If the RNN gets this right, it shows that it managed to notice the pattern that the second letter should always be equal to the second to last letter.
# That requires a fairly long short-term memory (which is the reason why we used a GRU cell).

In [None]:
# Two strings that differ only by their second to last character
test_strings = ["BPBTSSSSSSSXXTTVPXVPXTTTTTVVETE",
                "BPBTSSSSSSSXXTTVPXVPXTTTTTVVEPE"]
# Encode them the same way as the training data: character IDs in a ragged tensor
X_test = tf.ragged.constant(list(map(string_to_ids, test_strings)), ragged_rank=1)

# One row per string, holding the single sigmoid output of the model
y_proba = model.predict(X_test)
print("Estimated probability that these are Reber strings:")
for string, proba in zip(test_strings, y_proba):
    print("{}: {:.2f}%".format(string, 100 * proba[0]))

Estimated probability that these are Reber strings:
BPBTSSSSSSSXXTTVPXVPXTTTTTVVETE: 0.01%
BPBTSSSSSSSXXTTVPXVPXTTTTTVVEPE: 99.33%


In [None]:
# It worked fine. The RNN found the correct answers with very high confidence :)

In [2]:
# 9. Train an Encoder–Decoder model that can convert a date string from one format to another (e.g., from “April 22, 2019” to “2019-04-22”)

In [3]:
# Let's start by creating the dataset. We will use random days between 1000-01-01 and 9999-12-31

In [4]:
from datetime import date

# cannot use strftime()'s %B format since it depends on the locale
MONTHS = ["January", "February", "March", "April", "May", "June",
          "July", "August", "September", "October", "November", "December"]

def random_dates(n_dates):
    """Draw `n_dates` random dates between 1000-01-01 and 9999-12-31 (both inclusive).

    Args:
        n_dates: number of dates to generate.

    Returns:
        A pair of lists `(x, y)` of length `n_dates`:
        x: the dates in the input format, e.g. "September 20, 7075"
           (English month name, zero-padded day, year).
        y: the same dates in ISO format, e.g. "7075-09-20".
    """
    min_date = date(1000, 1, 1).toordinal()
    max_date = date(9999, 12, 31).toordinal()

    # np.random.randint() excludes its upper bound, so we pass max_date + 1
    # to make the last day (9999-12-31) reachable as well.
    ordinals = np.random.randint(min_date, max_date + 1, size=n_dates)
    # int() converts the NumPy integers to plain Python ints for fromordinal()
    dates    = [date.fromordinal(int(ordinal)) for ordinal in ordinals]

    # The month name comes from MONTHS rather than from strftime("%B"),
    # which would depend on the locale.
    x = [MONTHS[dt.month - 1] + " " + dt.strftime("%d, %Y") for dt in dates]
    y = [dt.isoformat() for dt in dates]

    return x, y

In [5]:
# Show a few random dates side by side: input format on the left, target format on the right.
# The seed is fixed so that the same dates are displayed on every run.
np.random.seed(42)

n_dates = 3
x_example, y_example = random_dates(n_dates)
print("{:25s}{:25s}".format("Input", "Target"))
print("-" * 50)
for input_str, target_str in zip(x_example[:n_dates], y_example[:n_dates]):
    print("{:25s}{:25s}".format(input_str, target_str))

Input                    Target                   
--------------------------------------------------
September 20, 7075       7075-09-20               
May 15, 8579             8579-05-15               
January 11, 7103         7103-01-11               


In [6]:
# Let's get the list of all possible characters in the inputs
INPUT_CHARS = "".join(sorted(set("".join(MONTHS) + "0123456789, ")))
INPUT_CHARS

' ,0123456789ADFJMNOSabceghilmnoprstuvy'

In [7]:
# And here's the list of possible characters in the outputs
OUTPUT_CHARS = "0123456789-"

In [8]:
# Let's write a function to convert a string to a list of character IDs, as we did in the previous exercise
def date_str_to_ids(date_str, chars=INPUT_CHARS):
    """Encode `date_str` as a list of integer IDs, one per character.

    Each ID is the position of the character within `chars`. A character that
    does not occur in `chars` makes `chars.index` raise a ValueError.
    """
    return list(map(chars.index, date_str))

In [9]:
x_example[0]

'September 20, 7075'

In [10]:
date_str_to_ids(x_example[0], INPUT_CHARS)

[19, 23, 31, 34, 23, 28, 21, 23, 32, 0, 4, 2, 1, 0, 9, 2, 9, 7]

In [11]:
y_example[0]

'7075-09-20'

In [12]:
date_str_to_ids(y_example[0], OUTPUT_CHARS)

[7, 0, 7, 5, 10, 0, 9, 10, 2, 0]

In [13]:
def prepare_date_strs(date_strs, chars=INPUT_CHARS):
    """Convert a list of date strings into a dense tensor of character IDs.

    Every string is encoded with `date_str_to_ids` using `chars`, all IDs are
    shifted up by one, and the result is converted to a regular tensor.
    Shifting frees ID 0 so that it can serve as the padding token ID.
    """
    ragged_ids = tf.ragged.constant(
        [date_str_to_ids(date_str, chars) for date_str in date_strs],
        ragged_rank=1)
    shifted_ids = ragged_ids + 1  # IDs now start at 1; 0 is the padding token ID
    return shifted_ids.to_tensor()

def create_dataset(n_dates):
    """Generate `n_dates` random dates and return them as a pair of ID tensors.

    The first tensor holds the input strings encoded with INPUT_CHARS, the
    second one holds the target strings encoded with OUTPUT_CHARS.
    """
    input_strs, target_strs = random_dates(n_dates)
    inputs = prepare_date_strs(input_strs, INPUT_CHARS)
    targets = prepare_date_strs(target_strs, OUTPUT_CHARS)
    return inputs, targets

In [14]:
# Fix NumPy's seed so that the same random dates are drawn on every run
np.random.seed(42)

# Each call returns the encoded inputs and the encoded targets.
# The sets are drawn one after the other from the same random stream
# (training first, then validation, then test).
X_train, Y_train = create_dataset(10000)
X_valid, Y_valid = create_dataset(2000)
X_test,  Y_test  = create_dataset(2000)

In [15]:
X_train[0]
# e.g.: September 22, 2019

<tf.Tensor: shape=(18,), dtype=int32, numpy=
array([20, 24, 32, 35, 24, 29, 22, 24, 33,  1,  5,  3,  2,  1, 10,  3, 10,
        8], dtype=int32)>

In [16]:
Y_train[0]
# YYYY-MM-DD

<tf.Tensor: shape=(10,), dtype=int32, numpy=array([ 8,  1,  8,  6, 11,  1, 10, 11,  3,  1], dtype=int32)>

In [17]:
## First version: a very basic seq2seq model ##

# Let's first try the simplest possible model: we feed in the input sequence, which first goes through the encoder
# (an embedding layer followed by a single LSTM layer), which outputs a vector,
# then it goes through a decoder (a single LSTM layer, followed by a dense output layer),
# which outputs a sequence of vectors, each representing the estimated probabilities for all possible output characters.
# Since the decoder expects a sequence as input, we repeat the vector (which is output by the encoder) as many times as the longest possible output sequence.

In [18]:
embedding_size    = 32
# Length of the target sequences (second dimension of Y_train)
max_output_length = Y_train.shape[1]

# Both NumPy and TensorFlow are seeded before the layers are created
np.random.seed(42)
tf.random.set_seed(42)

# Encoder: embeds the input character IDs, then summarizes the whole
# sequence into a single vector (the LSTM does not return sequences).
encoder = keras.models.Sequential([
    # + 1 because ID 0 is reserved for padding (character IDs start at 1)
    keras.layers.Embedding(input_dim=len(INPUT_CHARS) + 1,
                           output_dim=embedding_size,
                           input_shape=[None]),
    keras.layers.LSTM(128)
])

# Decoder: outputs one probability distribution per time step
decoder = keras.models.Sequential([
    keras.layers.LSTM(128, return_sequences=True),
    # One class per output character, + 1 for the padding ID 0
    keras.layers.Dense(len(OUTPUT_CHARS) + 1, activation="softmax")
])

# The encoder's output vector is repeated once per output time step
# so that the decoder receives a sequence as input.
model = keras.models.Sequential([
    encoder,
    keras.layers.RepeatVector(max_output_length),
    decoder
])

# The targets are integer character IDs, hence the sparse categorical loss
optimizer = keras.optimizers.Nadam()
model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])

In [19]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 sequential (Sequential)     (None, 128)               83680     
                                                                 
 repeat_vector (RepeatVector  (None, 10, 128)          0         
 )                                                               
                                                                 
 sequential_1 (Sequential)   (None, 10, 12)            133132    
                                                                 
Total params: 216,812
Trainable params: 216,812
Non-trainable params: 0
_________________________________________________________________


In [20]:
history = model.fit(X_train, Y_train, epochs=20, validation_data=(X_valid, Y_valid))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [21]:
# Let's use the model to make some predictions. We will need to be able to convert a sequence of character IDs to a readable string:
def ids_to_date_strs(ids, chars=OUTPUT_CHARS):
    """Decode sequences of character IDs into a list of strings.

    ID 0 (the padding token ID) is rendered as "?", and an ID k >= 1 is
    rendered as `chars[k - 1]`. One string is returned per sequence in `ids`.
    """
    alphabet = "?" + chars  # position 0 stands for the padding token
    decoded = []
    for sequence in ids:
        decoded.append("".join(alphabet[index] for index in sequence))
    return decoded

In [22]:
X_new = prepare_date_strs(["September 17, 2009", "July 14, 1789"])

In [23]:
ids = np.argmax(model.predict(X_new), axis=-1)
for date_str in ids_to_date_strs(ids):
    print(date_str)

2009-09-17
1789-07-14


In [24]:
# Perfect! :)
# However, since the model was only trained on input strings of length 18 (which is the length of the longest date),
# it does not perform well if we try to use it to make predictions on shorter sequences:

In [25]:
X_new = prepare_date_strs(["May 02, 2020", "July 14, 1789"])

In [26]:
ids = np.argmax(model.predict(X_new), axis=-1)
for date_str in ids_to_date_strs(ids):
    print(date_str)

2020-02-02
1789-09-14


In [27]:
# Oops! We need to ensure that we always pass sequences of the same length as during training,
# using padding if necessary. Let's write a little helper function for that

In [28]:
max_input_length = X_train.shape[1]

def prepare_date_strs_padded(date_strs):
    """Encode `date_strs` and right-pad the result up to `max_input_length`.

    The strings are encoded with `prepare_date_strs`. If the resulting tensor
    has fewer than `max_input_length` columns, zeros (the padding token ID)
    are appended on the right; otherwise it is returned unchanged.
    """
    X = prepare_date_strs(date_strs)
    n_missing = max_input_length - X.shape[1]
    if n_missing > 0:
        # No padding along the batch axis, `n_missing` zeros after each sequence
        X = tf.pad(X, [[0, 0], [0, n_missing]])
    return X

def convert_date_strs(date_strs):
    """Convert date strings to the target format using the trained `model`.

    The inputs are encoded and padded with `prepare_date_strs_padded`, the most
    likely character ID is selected at each output time step, and the IDs are
    decoded back to strings with `ids_to_date_strs`.
    """
    padded_inputs = prepare_date_strs_padded(date_strs)
    probas = model.predict(padded_inputs)
    # Class with the highest estimated probability at each time step
    best_ids = np.argmax(probas, axis=-1)
    return ids_to_date_strs(best_ids)

In [29]:
convert_date_strs(["May 02, 2020", "July 14, 1789"])

['2020-05-02', '1789-07-14']

In [30]:
# Cool! Granted, there are certainly much easier ways to write a date conversion tool
# (e.g., using regular expressions or even basic string manipulation), but you have to admit that using neural networks is way cooler.
# However, real-life sequence-to-sequence problems will usually be harder, so for the sake of completeness, let's build a more powerful model.

In [35]:
## Second version: feeding the shifted targets to the decoder (teacher forcing) ##

# Instead of feeding the decoder a simple repetition of the encoder's output vector, we can feed it the target sequence, shifted by one time step to the right.
# This way, at each time step the decoder will know what the previous target character was. This should help us tackle more complex sequence-to-sequence problems.

# Since the first output character of each target sequence has no previous character, we will need a new token to represent the start-of-sequence (sos).

# During inference, we won't know the target, so what will we feed the decoder? We can just predict one character at a time, starting with an sos token,
# then feeding the decoder all the characters that were predicted so far (we will look at this in more detail later in this notebook).

# But if the decoder's LSTM expects to get the previous target as input at each step, how shall we pass it the vector output by the encoder?
# Well, one option is to ignore the output vector, and instead use the encoder's LSTM state as the initial state of the decoder's LSTM
# (which requires that encoder's LSTM must have the same number of units as the decoder's LSTM).

# Now let's create the decoder's inputs (for training, validation and testing). The sos token will be represented using the last possible output character's ID + 1.

In [36]:
sos_id = len(OUTPUT_CHARS) + 1

def shifted_output_sequences(Y):
    """Return the target sequences `Y` shifted one time step to the right.

    A column filled with `sos_id` (the start-of-sequence token) is prepended
    and the last column of `Y` is dropped, so the result has the same shape
    as `Y`.
    """
    n_sequences = len(Y)
    sos_column = tf.fill(dims=(n_sequences, 1), value=sos_id)
    all_but_last = Y[:, :-1]
    return tf.concat([sos_column, all_but_last], axis=1)

X_train_decoder = shifted_output_sequences(Y_train)
X_valid_decoder = shifted_output_sequences(Y_valid)
X_test_decoder  = shifted_output_sequences(Y_test)

In [37]:
X_train_decoder

<tf.Tensor: shape=(10000, 10), dtype=int32, numpy=
array([[12,  8,  1, ..., 10, 11,  3],
       [12,  9,  6, ...,  6, 11,  2],
       [12,  8,  2, ...,  2, 11,  2],
       ...,
       [12, 10,  8, ...,  2, 11,  4],
       [12,  2,  2, ...,  3, 11,  3],
       [12,  8,  9, ...,  8, 11,  3]], dtype=int32)>

In [38]:
# Now let's build the model. It's not a simple sequential model anymore, so let's use the functional API

In [44]:
encoder_embedding_size = 32
decoder_embedding_size = 32
# Shared by both LSTMs: the encoder's state is used as the decoder's initial state
lstm_units = 128

# Both NumPy and TensorFlow are seeded before the layers are created
np.random.seed(42)
tf.random.set_seed(42)

# Encoder - Input (variable-length sequences of input character IDs)
encoder_input = keras.layers.Input(shape=[None], dtype=tf.int32)

# Embedding (+ 1 because ID 0 is reserved for padding)
encoder_embedding = keras.layers.Embedding(
    input_dim=len(INPUT_CHARS) + 1,
    output_dim=encoder_embedding_size)(encoder_input)

# LSTM (return_state=True also returns the two state tensors;
# the output itself is discarded, only the state is kept)
_, encoder_state_h, encoder_state_c = keras.layers.LSTM(
    lstm_units, return_state=True)(encoder_embedding)
encoder_state = [encoder_state_h, encoder_state_c]


# Decoder - Input (the target sequences shifted one step to the right)
decoder_input = keras.layers.Input(shape=[None], dtype=tf.int32)

# Embedding (+ 2: one extra ID for padding and one for the sos token)
decoder_embedding = keras.layers.Embedding(
    input_dim=len(OUTPUT_CHARS) + 2,
    output_dim=decoder_embedding_size)(decoder_input)

# LSTM (starts from the encoder's final state and outputs one vector per time step)
decoder_lstm_output = keras.layers.LSTM(lstm_units, return_sequences=True)(decoder_embedding, initial_state=encoder_state)

# Decoder - Output (one class per output character + padding; no class for the sos token)
decoder_output = keras.layers.Dense(len(OUTPUT_CHARS) + 1, activation="softmax")(decoder_lstm_output)


# Model (two inputs: the encoder's sequence and the decoder's shifted targets)
model = keras.models.Model(inputs=[encoder_input, decoder_input], outputs=[decoder_output])

# Optimizer + Compile (the targets are integer character IDs, hence the sparse loss)
optimizer = keras.optimizers.Nadam()
model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])

In [45]:
history = model.fit([X_train, X_train_decoder], Y_train, epochs=10, validation_data=([X_valid, X_valid_decoder], Y_valid))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [46]:
# Let's once again use the model to make some predictions. This time we need to predict characters one by one

In [47]:
sos_id = len(OUTPUT_CHARS) + 1

def predict_date_strs(date_strs):
    """Convert date strings with the encoder-decoder `model`, one character at a time.

    Decoding starts from a single start-of-sequence token (`sos_id`) per
    string. At each step the characters predicted so far are zero-padded to
    `max_output_length` and fed to the decoder, and the prediction for the
    current time step is appended to them. Returns the decoded strings.
    """
    encoder_inputs = prepare_date_strs_padded(date_strs)
    predicted = tf.fill(dims=(len(encoder_inputs), 1), value=sos_id)

    for step in range(max_output_length):
        # Right-pad the partial predictions to the decoder's full input length
        n_missing = max_output_length - predicted.shape[1]
        decoder_inputs = tf.pad(predicted, [[0, 0], [0, n_missing]])

        # Keep only the probabilities estimated for the current time step
        all_probas = model.predict([encoder_inputs, decoder_inputs])
        step_probas = all_probas[:, step:step+1]

        next_ids = tf.argmax(step_probas, axis=-1, output_type=tf.int32)
        predicted = tf.concat([predicted, next_ids], axis=1)

    # Drop the leading sos token before decoding
    return ids_to_date_strs(predicted[:, 1:])

In [48]:
predict_date_strs(["July 14, 1789", "May 01, 2020"])

['1789-07-14', '2020-05-01']

In [49]:
# Works fine! :)

In [50]:
## Third version: using TF-Addons's seq2seq implementation ##

In [52]:
# import tensorflow_addons as tfa

# np.random.seed(42)
# tf.random.set_seed(42)

# encoder_embedding_size = 32
# decoder_embedding_size = 32
# units = 128

# encoder_inputs   = keras.layers.Input(shape=[None], dtype=np.int32)
# decoder_inputs   = keras.layers.Input(shape=[None], dtype=np.int32)
# sequence_lengths = keras.layers.Input(shape=[], dtype=np.int32)

# encoder_embeddings = keras.layers.Embedding(len(INPUT_CHARS) + 1, encoder_embedding_size)(encoder_inputs)

# decoder_embedding_layer = keras.layers.Embedding(len(OUTPUT_CHARS) + 2, decoder_embedding_size)
# decoder_embeddings      = decoder_embedding_layer(decoder_inputs)

# encoder = keras.layers.LSTM(units, return_state=True)
# encoder_outputs, state_h, state_c = encoder(encoder_embeddings)
# encoder_state = [state_h, state_c]

# sampler = tfa.seq2seq.sampler.TrainingSampler()

# decoder_cell = keras.layers.LSTMCell(units)
# output_layer = keras.layers.Dense(len(OUTPUT_CHARS) + 1)

# decoder = tfa.seq2seq.basic_decoder.BasicDecoder(decoder_cell,
#                                                  sampler,
#                                                  output_layer=output_layer)

# final_outputs, final_state, final_sequence_lengths = decoder(
#     decoder_embeddings,
#     initial_state=encoder_state)
# Y_proba = keras.layers.Activation("softmax")(final_outputs.rnn_output)

# # Model
# model = keras.models.Model(inputs=[encoder_inputs, decoder_inputs], outputs=[Y_proba])
# optimizer = keras.optimizers.Nadam()
# model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])
# history = model.fit([X_train, X_train_decoder], Y_train, epochs=15, validation_data=([X_valid, X_valid_decoder], Y_valid))

In [53]:
## Fourth version: using TF-Addons's seq2seq implementation with a scheduled sampler ##

In [54]:
## Fifth version: using TFA seq2seq, the Keras subclassing API and attention mechanisms ##