This is a companion notebook for the book [Deep Learning with Python, Second Edition](https://www.manning.com/books/deep-learning-with-python-second-edition?a_aid=keras&a_bid=76564dff). For readability, it only contains runnable code blocks and section titles, and omits everything else in the book: text paragraphs, figures, and pseudocode.

**If you want to be able to follow what's going on, I recommend reading the notebook side by side with your copy of the book.**

This notebook was generated for TensorFlow 2.6.

## The Transformer architecture

### Understanding self-attention

#### Generalized self-attention: the query-key-value model

### Multi-head attention

### The Transformer encoder

**Getting the data**

In [None]:
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz # download the IMDB archive; -O saves it under its remote file name in the current directory
!tar -xf aclImdb_v1.tar.gz # extract the archive (the cells below expect this to produce an aclImdb/ directory)
!rm -r aclImdb/train/unsup # recursively delete the train/unsup subdirectory (the unsupervised data), leaving the other train subdirectories in place

**Preparing the data**

In [None]:
import os, pathlib, shutil, random # importing the necessary libraries
from tensorflow import keras # importing the necessary libraries
# Carve a validation set out of the training data by moving 20% of the files
# of each category from aclImdb/train/<category> to aclImdb/val/<category>.
batch_size = 32  # batch size used when the datasets are built from these directories
base_dir = pathlib.Path("aclImdb")
val_dir = base_dir / "val"
train_dir = base_dir / "train"
for category in ("neg", "pos"):
    val_category_dir = val_dir / category
    train_category_dir = train_dir / category
    # If this category was already split by an earlier run, leave it alone:
    # re-running would otherwise fail on the existing directory or move yet
    # another 20% of the remaining training files.
    if val_category_dir.is_dir() and any(val_category_dir.iterdir()):
        continue
    os.makedirs(val_category_dir, exist_ok=True)
    # os.listdir returns entries in arbitrary order, so sort first; only then
    # does the fixed seed below yield the same split on every machine.
    files = sorted(os.listdir(train_category_dir))
    random.Random(1337).shuffle(files)
    num_val_samples = int(0.2 * len(files))
    # Use an explicit start index: files[-0:] would select *every* file when
    # there are too few files for num_val_samples to be non-zero.
    val_files = files[len(files) - num_val_samples:]
    for fname in val_files:
        shutil.move(train_category_dir / fname,
                    val_category_dir / fname)

# Build one batched (text, label) dataset per split directory, in the order
# train, val, test. All three use the same batch size.
train_ds, val_ds, test_ds = (
    keras.utils.text_dataset_from_directory(split_dir, batch_size=batch_size)
    for split_dir in ("aclImdb/train", "aclImdb/val", "aclImdb/test")
)

# Text-only view of the training data (labels dropped).
text_only_train_ds = train_ds.map(lambda text, _label: text)

**Vectorizing the data**

In [None]:
from tensorflow.keras import layers # importing the necessary libraries
 
max_length = 600    # every review is padded/truncated to this many tokens
max_tokens = 20000  # upper bound on the vocabulary size

# Layer that turns raw strings into fixed-length sequences of integer indices.
text_vectorization = layers.TextVectorization(
    output_mode="int",
    output_sequence_length=max_length,
    max_tokens=max_tokens,
)
# Learn the vocabulary from the training text only.
text_vectorization.adapt(text_only_train_ds)


def _vectorize(text, label):
    """Replace the raw text of a (text, label) pair by its integer sequence."""
    return text_vectorization(text), label


# Apply the same vectorization to every split; num_parallel_calls=4 lets
# tf.data process up to four elements concurrently.
int_train_ds = train_ds.map(_vectorize, num_parallel_calls=4)
int_val_ds = val_ds.map(_vectorize, num_parallel_calls=4)
int_test_ds = test_ds.map(_vectorize, num_parallel_calls=4)

**Transformer encoder implemented as a subclassed `Layer`**

In [None]:
import tensorflow as tf # importing the necessary libraries
from tensorflow import keras # importing the necessary libraries
from tensorflow.keras import layers # importing the necessary libraries

class TransformerEncoder(layers.Layer):
    """Transformer encoder block.

    Applies multi-head self-attention and then a two-layer dense projection.
    Each of the two steps is wrapped in a residual connection followed by
    layer normalization.

    Args:
        embed_dim: Used as `key_dim` of the attention layer and as the output
            width of the dense projection. It is expected to match the last
            axis of the inputs so that the residual additions line up.
        dense_dim: Number of units in the ReLU layer of the dense projection.
        num_heads: Number of attention heads.
        **kwargs: Forwarded to `layers.Layer`.
    """

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        # Constructor arguments are kept so get_config() can report them.
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            key_dim=embed_dim,
            num_heads=num_heads,
        )
        self.dense_proj = keras.Sequential([
            layers.Dense(dense_dim, activation="relu"),
            layers.Dense(embed_dim),
        ])
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()

    def call(self, inputs, mask=None):
        """Run the encoder block on `inputs`.

        Args:
            inputs: Tensor attended over; it serves as both query and value.
            mask: Optional mask. When given, an extra axis is inserted at
                position 1 before it is passed on as the attention mask
                (assumes a rank-2 mask, e.g. (batch, sequence)).

        Returns:
            The layer-normalized output of the second residual step.
        """
        attention_mask = None if mask is None else mask[:, tf.newaxis, :]
        attended = self.attention(
            inputs, inputs, attention_mask=attention_mask)
        # First residual connection + normalization (around the attention).
        normed_attention = self.layernorm_1(inputs + attended)
        projected = self.dense_proj(normed_attention)
        # Second residual connection + normalization (around the projection).
        return self.layernorm_2(normed_attention + projected)

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        return {
            **super().get_config(),
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "dense_dim": self.dense_dim,
        }

**Using the Transformer encoder for text classification**

In [None]:
# Hyperparameters of the Transformer-encoder classifier.
vocab_size = 20000
embed_dim = 256
num_heads = 2
dense_dim = 32

# Variable-length sequences of integer token indices.
inputs = keras.Input(shape=(None,), dtype="int64")
x = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)(inputs)
# Encoder block, then reduction over the sequence axis, then dropout.
for layer in (
    TransformerEncoder(
        embed_dim=embed_dim, dense_dim=dense_dim, num_heads=num_heads),
    layers.GlobalMaxPooling1D(),
    layers.Dropout(rate=0.5),
):
    x = layer(x)
# Single sigmoid unit for the binary classification output.
outputs = layers.Dense(units=1, activation="sigmoid")(x)

model = keras.Model(inputs=inputs, outputs=outputs)
model.compile(
    optimizer="rmsprop",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
model.summary()

**Training and evaluating the Transformer encoder based model**

In [None]:
checkpoint_path = "transformer_encoder.keras"

# Checkpoint callback; with save_best_only=True only the best model seen so
# far is kept at checkpoint_path.
callbacks = [
    keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_path, save_best_only=True),
]
model.fit(
    int_train_ds,
    validation_data=int_val_ds,
    epochs=20,
    callbacks=callbacks,
)

# Reload the saved checkpoint; the custom layer class must be supplied so it
# can be deserialized.
model = keras.models.load_model(
    checkpoint_path,
    custom_objects={"TransformerEncoder": TransformerEncoder},
)
# evaluate() returns the loss followed by the compiled metrics, so index 1
# is the accuracy.
test_metrics = model.evaluate(int_test_ds)
print(f"Test acc: {test_metrics[1]:.3f}")

#### Using positional encoding to re-inject order information

**Implementing positional embedding as a subclassed layer**

In [None]:
class PositionalEmbedding(layers.Layer):
    """Sum of a token embedding and a learned position embedding.

    Args:
        sequence_length: Number of distinct positions the position embedding
            can represent (its `input_dim`).
        input_dim: Size of the token vocabulary (token embedding `input_dim`).
        output_dim: Width of both embeddings.
        **kwargs: Forwarded to `layers.Layer`.
    """

    def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
        super().__init__(**kwargs)
        # Constructor arguments are kept so get_config() can report them.
        self.sequence_length = sequence_length
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.token_embeddings = layers.Embedding(
            input_dim=input_dim, output_dim=output_dim)
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=output_dim)

    def call(self, inputs):
        """Embed `inputs` and add the embedding of each position index.

        Positions are 0 .. n-1, where n is the size of the last axis of
        `inputs`.
        """
        num_positions = tf.shape(inputs)[-1]
        position_ids = tf.range(num_positions)
        return (self.token_embeddings(inputs)
                + self.position_embeddings(position_ids))

    def compute_mask(self, inputs, mask=None):
        """Return a boolean mask that is False wherever `inputs` equals 0.

        The incoming `mask` argument is ignored.
        """
        return tf.math.not_equal(inputs, 0)

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        return {
            **super().get_config(),
            "output_dim": self.output_dim,
            "sequence_length": self.sequence_length,
            "input_dim": self.input_dim,
        }

#### Putting it all together: A text-classification Transformer

**Combining the Transformer encoder with positional embedding**

In [None]:
# Hyperparameters of the classifier that adds positional embeddings.
vocab_size = 20000
sequence_length = 600
embed_dim = 256
num_heads = 2
dense_dim = 32

# Variable-length sequences of integer token indices.
inputs = keras.Input(shape=(None,), dtype="int64")
x = PositionalEmbedding(
    sequence_length=sequence_length,
    input_dim=vocab_size,
    output_dim=embed_dim,
)(inputs)
# Encoder block, then reduction over the sequence axis, then dropout.
for layer in (
    TransformerEncoder(
        embed_dim=embed_dim, dense_dim=dense_dim, num_heads=num_heads),
    layers.GlobalMaxPooling1D(),
    layers.Dropout(rate=0.5),
):
    x = layer(x)
# Single sigmoid unit for the binary classification output.
outputs = layers.Dense(units=1, activation="sigmoid")(x)

model = keras.Model(inputs=inputs, outputs=outputs)
model.compile(
    optimizer="rmsprop",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
model.summary()

checkpoint_path = "full_transformer_encoder.keras"

# Checkpoint callback; with save_best_only=True only the best model seen so
# far is kept at checkpoint_path.
callbacks = [
    keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_path, save_best_only=True),
]
model.fit(
    int_train_ds,
    validation_data=int_val_ds,
    epochs=20,
    callbacks=callbacks,
)

# Reload the saved checkpoint; both custom layer classes must be supplied so
# they can be deserialized.
model = keras.models.load_model(
    checkpoint_path,
    custom_objects={
        "TransformerEncoder": TransformerEncoder,
        "PositionalEmbedding": PositionalEmbedding,
    },
)
# evaluate() returns the loss followed by the compiled metrics, so index 1
# is the accuracy.
test_metrics = model.evaluate(int_test_ds)
print(f"Test acc: {test_metrics[1]:.3f}")

### When to use sequence models over bag-of-words models?