In [68]:
import tensorflow as tf
import pandas as pd
import time
import numpy as np
import matplotlib.pyplot as plt

from tensorflow.keras.layers import Embedding, MultiHeadAttention, Dense, Input, Dropout, LayerNormalization
from transformers import DistilBertTokenizerFast #, TFDistilBertModel
from transformers import TFDistilBertForTokenClassification
from tqdm import tqdm_notebook as tqdm

# Positional Encoding

When training a Transformer network, since your data is fed into the model all at once, you need to encode the positions of your inputs using these sine and cosine formulas:
    
$$
PE_{(pos, 2i)}= sin\left(\frac{pos}{{10000}^{\frac{2i}{d}}}\right)
\tag{1}$$
<br>

$$
PE_{(pos, 2i+1)}= cos\left(\frac{pos}{{10000}^{\frac{2i}{d}}}\right)
\tag{2}$$
<br>

* $d$ is the dimension of the word embedding and positional encoding
* $pos$ is the position of the word.
* $i$ refers to each of the different dimensions of the positional encoding.

The sum of the positional encoding and word embedding is ultimately what is fed into the model. 
The sine and cosine values are small (between -1 and 1) so the word embeddings aren't significantly distorted, but this helps the Transformer attend to the relative positions of your input data.

## a) Angles

The function below computes the inner term of the sine and cosine equations: 

$$\frac{pos}{10000^{\frac{2i}{d}}} \tag{3}$$

In [69]:
def get_angles(pos, i, d, verbose=False):
    """
    Compute the argument pos / 10000^(2i/d) of the sine/cosine positional encoding.

    Arguments:
        pos -- Column vector containing the positions [[0], [1], ...,[N-1]]
        i --   Row vector containing the dimension indices [[0, 1, 2, ..., M-1]]
        d (integer) -- Encoding size
        verbose (bool) -- When True, print the positions and the denominators
                          before returning

    Returns:
        angles -- numpy array of shape (N, M): one row per position,
                  one column per encoding dimension
    """
    # i // 2 gives consecutive (even, odd) dimensions the same exponent,
    # so a sine column and the cosine column next to it share one frequency.
    exponent = (2 * (i//2)) / np.float32(d)
    scale = np.power(10000, exponent)

    if verbose:
        print(pos)
        print(scale)

    # (N, 1) / (1, M) broadcasts to (N, M)
    return pos / scale

In [70]:
# Example: angles for 4 positions and an encoding size of 8
position = 4
d_model = 8
pos = np.arange(position)[:, np.newaxis] # Column vector of positions (0 to position - 1)
dims = np.arange(d_model)[np.newaxis, :] # Row vector of dimension indices (0 to d_model - 1)

# verbose=True also prints the position column and the denominator row
get_angles(pos, dims, d_model, verbose=True)

[[0]
 [1]
 [2]
 [3]]
[[   1.    1.   10.   10.  100.  100. 1000. 1000.]]


array([[0.e+00, 0.e+00, 0.e+00, 0.e+00, 0.e+00, 0.e+00, 0.e+00, 0.e+00],
       [1.e+00, 1.e+00, 1.e-01, 1.e-01, 1.e-02, 1.e-02, 1.e-03, 1.e-03],
       [2.e+00, 2.e+00, 2.e-01, 2.e-01, 2.e-02, 2.e-02, 2.e-03, 2.e-03],
       [3.e+00, 3.e+00, 3.e-01, 3.e-01, 3.e-02, 3.e-02, 3.e-03, 3.e-03]])

## b) Positional Encoding

In [71]:
def positional_encoding(positions, d):
    """
    Build the table of sinusoidal positional encodings once, up front.

    Arguments:
        positions (int) -- Maximum number of positions to be encoded
        d (int) -- Encoding size

    Returns:
        pos_encoding -- float32 tensor of shape (1, positions, d) holding the
                        positional encodings
    """
    position_column = np.arange(positions)[:, np.newaxis]
    dimension_row = np.arange(d)[np.newaxis, :]

    # start from the raw angles, shape (positions, d), and overwrite them in place
    encoding = get_angles(position_column, dimension_row, d)

    # even columns (2i) take the sine of the angle ...
    encoding[:, 0::2] = np.sin(encoding[:, 0::2])
    # ... odd columns (2i+1) take the cosine
    encoding[:, 1::2] = np.cos(encoding[:, 1::2])

    # a leading axis of size 1 lets the table broadcast over the batch
    batched = encoding[np.newaxis, ...]

    return tf.cast(batched, dtype=tf.float32)

In [72]:
# Example: encodings for 4 positions with encoding size 8 -> tensor of shape (1, 4, 8)
positional_encoding(4, 8)

<tf.Tensor: shape=(1, 4, 8), dtype=float32, numpy=
array([[[ 0.0000000e+00,  1.0000000e+00,  0.0000000e+00,  1.0000000e+00,
          0.0000000e+00,  1.0000000e+00,  0.0000000e+00,  1.0000000e+00],
        [ 8.4147096e-01,  5.4030228e-01,  9.9833414e-02,  9.9500418e-01,
          9.9998331e-03,  9.9994999e-01,  9.9999981e-04,  9.9999952e-01],
        [ 9.0929741e-01, -4.1614684e-01,  1.9866933e-01,  9.8006660e-01,
          1.9998666e-02,  9.9980003e-01,  1.9999987e-03,  9.9999797e-01],
        [ 1.4112000e-01, -9.8999250e-01,  2.9552022e-01,  9.5533651e-01,
          2.9995501e-02,  9.9955004e-01,  2.9999956e-03,  9.9999553e-01]]],
      dtype=float32)>

# Masking

There are 2 types of masks that are useful when building your Transformer network: the <b>padding mask</b> and the <b>look-ahead mask</b>. Both help the softmax compute appropriate weights for the words of the input sentence.

## a) Padding Mask

After masking, your input should go from [87, 600, 0, 0, 0] to [87, 600, -1e9, -1e9, -1e9], so that when you take the softmax, the -1e9s become 0s.

In [73]:
def create_padding_mask(seq):
    """
    Flag the padding cells (entries equal to 0) of a batch of sequences.

    Arguments:
        seq -- (n, m) matrix

    Returns:
        mask -- (n, 1, 1, m) float32 tensor holding 1.0 where seq == 0
                and 0.0 everywhere else

    NOTE(review): here 1 marks a position to be masked OUT (the mask is meant
    to be multiplied by -1e9 and added to attention logits). Keras
    MultiHeadAttention's `attention_mask` uses the opposite convention
    (1 = attend) -- confirm before passing this mask to those layers.
    """
    is_padding = tf.math.equal(seq, 0)
    mask = tf.cast(is_padding, tf.float32)

    # two singleton axes so the mask can broadcast against
    # attention logits when it is added to them
    return mask[:, tf.newaxis, tf.newaxis, :]

In [74]:
# Example: three sequences of length 5; the zero entries are treated as padding
x = tf.constant([[7., 6., 0., 0., 1.], [1., 2., 3., 0., 0.], [0., 0., 0., 4., 5.]])
print(x)
# the resulting mask holds 1.0 wherever x == 0
create_padding_mask(x)

tf.Tensor(
[[7. 6. 0. 0. 1.]
 [1. 2. 3. 0. 0.]
 [0. 0. 0. 4. 5.]], shape=(3, 5), dtype=float32)


<tf.Tensor: shape=(3, 1, 1, 5), dtype=float32, numpy=
array([[[[0., 0., 1., 1., 0.]]],


       [[[0., 0., 0., 1., 1.]]],


       [[[1., 1., 1., 0., 0.]]]], dtype=float32)>

If we multiply this mask by -1e9 and add it to the input sequences, the 0s essentially become negative infinity. Notice the difference when taking the softmax of the original sequence and the masked sequence:

In [75]:
# softmax of the raw values: the zero (padding) entries still receive some weight
print(tf.keras.activations.softmax(x))
print()
# Adding mask * -1e9 pushes the padding entries to ~0 after the softmax.
# x is (3, 5) and the mask is (3, 1, 1, 5), so the sum broadcasts to (3, 1, 3, 5):
# each sequence's mask is applied to every row of x.
print(tf.keras.activations.softmax(x + create_padding_mask(x) * -1.0e9))

tf.Tensor(
[[7.2876638e-01 2.6809818e-01 6.6454895e-04 6.6454895e-04 1.8064313e-03]
 [8.4437370e-02 2.2952457e-01 6.2391245e-01 3.1062772e-02 3.1062772e-02]
 [4.8541026e-03 4.8541026e-03 4.8541026e-03 2.6502505e-01 7.2041273e-01]], shape=(3, 5), dtype=float32)

tf.Tensor(
[[[[7.2973627e-01 2.6845497e-01 0.0000000e+00 0.0000000e+00
    1.8088354e-03]
   [2.4472848e-01 6.6524094e-01 0.0000000e+00 0.0000000e+00
    9.0030573e-02]
   [6.6483542e-03 6.6483542e-03 0.0000000e+00 0.0000000e+00
    9.8670328e-01]]]


 [[[7.3057157e-01 2.6876226e-01 6.6619506e-04 0.0000000e+00
    0.0000000e+00]
   [9.0030566e-02 2.4472845e-01 6.6524088e-01 0.0000000e+00
    0.0000000e+00]
   [3.3333334e-01 3.3333334e-01 3.3333334e-01 0.0000000e+00
    0.0000000e+00]]]


 [[[0.0000000e+00 0.0000000e+00 0.0000000e+00 2.6894143e-01
    7.3105860e-01]
   [0.0000000e+00 0.0000000e+00 0.0000000e+00 5.0000000e-01
    5.0000000e-01]
   [0.0000000e+00 0.0000000e+00 0.0000000e+00 2.6894143e-01
    7.3105860e-01]]]], shap

## b) Look-Ahead Mask

In training, you will have access to the complete correct output of your training example. The look-ahead mask helps your model pretend that it correctly predicted a part of the output and see if, without looking ahead, it can correctly predict the next output.

For example, if the expected correct output is [1, 2, 3] and you wanted to see if given that the model correctly predicted the first value it could predict the second value, you would mask out the second and third values. So you would input the masked sequence [1, -1e9, -1e9] and see if it could generate [1, 2, -1e9].

In [76]:
def create_look_ahead_mask(size):
    """
    Returns a lower triangular matrix filled with ones: row i has ones in
    columns 0..i (the current and earlier positions) and zeros in the
    later columns.

    Arguments:
        size -- matrix size

    Returns:
        mask -- (size, size) tensor
    """
    all_ones = tf.ones((size, size))
    # num_lower=-1 keeps the whole lower triangle, num_upper=0 drops everything
    # above the main diagonal
    return tf.linalg.band_part(all_ones, -1, 0)

In [77]:
# Example: only the sequence length (x.shape[1] == 3) is used to build the mask;
# the random values of x themselves do not matter
x = tf.random.uniform((1, 3))
print(x)
create_look_ahead_mask(x.shape[1])

tf.Tensor([[0.04652643 0.70834446 0.68522584]], shape=(1, 3), dtype=float32)


<tf.Tensor: shape=(3, 3), dtype=float32, numpy=
array([[1., 0., 0.],
       [1., 1., 0.],
       [1., 1., 1.]], dtype=float32)>

# Self-Attention

<b>Scaled dot product attention</b> takes in a query, key, value, and an optional mask to return attention-based vector representations of the words in your sequence. This self-attention can be expressed as:
$$
\text{Attention}(Q, K, V)=\operatorname{softmax}\left(\frac{Q K^{T}}{\sqrt{d_{k}}}+{M}\right) V\tag{4}
$$

* $Q$ is the matrix of queries 
* $K$ is the matrix of keys
* $V$ is the matrix of values
* $M$ is the optional mask you choose to apply
* ${d_k}$ is the dimension of the keys, which is used to scale everything down so the softmax doesn't explode

In [78]:
def scaled_dot_product_attention(q, k, v, mask=None):
    """
    Calculate scaled dot-product attention: softmax(Q K^T / sqrt(d_k) + M) V.
    q, k, v must have matching leading dimensions.
    k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v.
    The mask has different shapes depending on its type (padding or look ahead)
    but it must be broadcastable for addition.

    Arguments:
        q -- query shape == (..., seq_len_q, depth)
        k -- key shape == (..., seq_len_k, depth)
        v -- value shape == (..., seq_len_v, depth_v)
        mask -- Tensor with shape broadcastable to (..., seq_len_q, seq_len_k),
                holding 1 at every position that must NOT be attended to
                (it is multiplied by -1e9 and added to the logits).
                Defaults to None, in which case no masking is applied.

    Returns:
        output -- attention output, shape (..., seq_len_q, depth_v)
        attention_weights -- softmax weights, shape (..., seq_len_q, seq_len_k)
    """

    # Q * K^T
    matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)

    # scale by sqrt(d_k) to keep the logits in a range where softmax is not saturated
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

    # push masked positions towards -infinity so that softmax gives them ~0 weight
    if mask is not None:
        # cast so boolean / integer masks are accepted as well as float ones
        mask = tf.cast(mask, scaled_attention_logits.dtype)
        scaled_attention_logits += (mask * -1e9)

    # softmax is normalized on the last axis (seq_len_k) so that the scores
    # add up to 1.
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)

    # attention_weights * V
    output = tf.matmul(attention_weights, v)   # (..., seq_len_q, depth_v)

    return output, attention_weights

# Transformer

<img src="images/transformer.png" alt="Transformer" width="900"/>

## Encoder

The Transformer's Encoder layer pairs a <b>multi-head self-attention</b> and <b>feed forward neural networks</b> to improve training speed and passes K and V matrices to the Decoder (later).
<br><br>
<img src="images/encoder_layer.png" alt="Encoder" width="250"/>

a) `MultiHeadAttention` - compute self-attention several times to detect different features <br>
b) `FullyConnected` - 2 Dense layers, independently applied (exact same parameters) to each position of attention

### a) FullyConnected

In [84]:
def FullyConnected(embedding_dim, fully_connected_dim, out_activation=None):
    """
    Build the two-layer feed-forward network used inside each encoder/decoder layer.

    Arguments:
        embedding_dim -- number of units of the second (output) Dense layer
        fully_connected_dim -- number of units of the first (hidden, ReLU) Dense layer
        out_activation -- activation of the output Dense layer (None = linear)

    Returns:
        tf.keras.Sequential made of the two Dense layers
    """
    hidden = tf.keras.layers.Dense(units=fully_connected_dim, activation='relu')  # (batch_size, seq_len, dff)
    projection = tf.keras.layers.Dense(units=embedding_dim, activation=out_activation)  # (batch_size, seq_len, d_model)
    return tf.keras.Sequential([hidden, projection])

### b) EncoderLayer

Apart from pairing the multi-head self-attention and feed forward neural network, some residual connections and layer normalizations are also used to speed up training.

In [80]:
class EncoderLayer(tf.keras.layers.Layer):
    """
    One Transformer encoder block: multi-head self-attention followed by a
    position-wise fully-connected feed-forward network.
    The output of each of the two sub-layers goes through dropout, is added to
    the sub-layer input (residual connection) and is then layer-normalized.
    """
    def __init__(self, embedding_dim, num_heads, fully_connected_dim, dropout_rate=0.1, layernorm_eps=1e-6):
        super().__init__()

        # key_dim is the size of EACH attention head
        self.mha = MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim)

        self.ffn = FullyConnected(embedding_dim=embedding_dim, fully_connected_dim=fully_connected_dim)

        self.layernorm1 = LayerNormalization(epsilon=layernorm_eps)
        self.layernorm2 = LayerNormalization(epsilon=layernorm_eps)

        self.dropout1 = Dropout(dropout_rate)
        self.dropout2 = Dropout(dropout_rate)

    def call(self, x, training, mask):
        """
        Forward pass for the Encoder Layer

        Arguments:
            x -- Tensor of shape (batch_size, input_seq_len, embedding_dim)
            training -- Boolean, set to true to activate
                        the training mode for dropout layers
            mask -- Mask forwarded unchanged as the `attention_mask` of the
                    MultiHeadAttention layer, used to keep the padding from
                    being treated as part of the input
        Returns:
            encoder_layer_out -- Tensor of shape (batch_size, input_seq_len, embedding_dim)
        """
        # --- sub-layer 1: self-attention (query, value and key are all x) ---
        attended = self.mha(x, value=x, key=x, attention_mask=mask)  # (batch_size, input_seq_len, embedding_dim)
        attended = self.dropout1(attended, training=training)
        # residual (skip) connection around the attention, then layer normalization
        attn_block_out = self.layernorm1(x + attended)  # (batch_size, input_seq_len, embedding_dim)

        # --- sub-layer 2: position-wise feed-forward network ---
        transformed = self.ffn(attn_block_out)  # (batch_size, input_seq_len, embedding_dim)
        transformed = self.dropout2(transformed, training=training)
        # residual connection around the feed-forward network, then layer normalization
        return self.layernorm2(transformed + attn_block_out)  # (batch_size, input_seq_len, embedding_dim)

### c) Encoder

In [81]:
class Encoder(tf.keras.layers.Layer):
    """
    Full encoder stack: token embedding, scaled and summed with the positional
    encoding, then dropout, then `num_layers` EncoderLayer blocks applied in sequence.
    """
    def __init__(self, num_layers, embedding_dim, num_heads, fully_connected_dim, input_vocab_size,
                maximum_position_encoding, dropout_rate=0.1, layernorm_eps=1e-6):
        super().__init__()

        self.embedding_dim = embedding_dim
        self.num_layers = num_layers

        self.embedding = Embedding(input_vocab_size, self.embedding_dim)
        # precomputed table of shape (1, maximum_position_encoding, embedding_dim)
        self.pos_encoding = positional_encoding(maximum_position_encoding, self.embedding_dim)

        self.enc_layers = [
            EncoderLayer(embedding_dim=self.embedding_dim,
                         num_heads=num_heads,
                         fully_connected_dim=fully_connected_dim,
                         dropout_rate=dropout_rate,
                         layernorm_eps=layernorm_eps)
            for _ in range(self.num_layers)
        ]

        self.dropout = Dropout(dropout_rate)

    def call(self, x, training, mask):
        """
        Forward pass for the Encoder

        Arguments:
            x -- Tensor of shape (batch_size, input_seq_len)
            training -- Boolean, set to true to activate
                        the training mode for dropout layers
            mask -- Mask handed to every EncoderLayer to ensure that the
                    padding is not treated as part of the input
        Returns:
            x -- Tensor of shape (batch_size, input_seq_len, embedding_dim)
        """
        # read the length before x is replaced by its embedding
        input_len = tf.shape(x)[1]

        embedded = self.embedding(x)  # (batch_size, input_seq_len, embedding_dim)
        # scale the embedding by sqrt(embedding_dim)
        embedded = embedded * tf.math.sqrt(tf.cast(self.embedding_dim, tf.float32))
        # add only the first input_len rows of the positional table
        embedded = embedded + self.pos_encoding[:, :input_len, :]

        x = self.dropout(embedded, training=training)

        # run the stack of encoder layers in order
        for enc_layer in self.enc_layers:
            x = enc_layer(x, training, mask)

        return x  # (batch_size, input_seq_len, embedding_dim)

## Decoder

The Decoder layer takes the K and V matrices generated by the Encoder and uses them in its second multi-head attention layer, together with the Q matrix computed from the Decoder's own (shifted-right) output.

<img src="images/decoder_layer.png" alt="Decoder" width="250"/>

### a) DecoderLayer

This time, we pair 2 multi-head attention layers with a feed forward neural network, once again with residual connections and layer normalization to speed up training.
    
1. Block 1 = multi-head self-attention that takes the new shifted-right input (with residual connection, dropout layer, look-ahead mask)
2. Block 2 = multi-head attention that takes Encoder's outputs K & V, and output Q from Block 1 (with dropout layer, layer normalization, residual connection)
3. Block 3 = feed forward neural network (with dropout layer, normalization layers, residual connection)

In [82]:
class DecoderLayer(tf.keras.layers.Layer):
    """
    One Transformer decoder block made of three sub-layers:
    a multi-head self-attention over the target input, a multi-head attention
    that combines the result with the encoder output, and a fully connected block.
    Each sub-layer output goes through dropout, a residual connection and
    layer normalization.
    """
    def __init__(self, embedding_dim, num_heads, fully_connected_dim, dropout_rate=0.1, layernorm_eps=1e-6):
        super().__init__()

        # mha1: self-attention over the target; mha2: attention over the encoder output
        self.mha1 = MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim)
        self.mha2 = MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim)

        self.ffn = FullyConnected(embedding_dim=embedding_dim, fully_connected_dim=fully_connected_dim)

        self.layernorm1 = LayerNormalization(epsilon=layernorm_eps)
        self.layernorm2 = LayerNormalization(epsilon=layernorm_eps)
        self.layernorm3 = LayerNormalization(epsilon=layernorm_eps)

        self.dropout1 = Dropout(dropout_rate)
        self.dropout2 = Dropout(dropout_rate)
        self.dropout3 = Dropout(dropout_rate)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        """
        Forward pass for the Decoder Layer

        Arguments:
            x -- Tensor of shape (batch_size, target_seq_len, embedding_dim)
            enc_output --  Tensor of shape (batch_size, input_seq_len, embedding_dim)
            training -- Boolean, set to true to activate
                        the training mode for dropout layers
            look_ahead_mask -- Mask for the target input, used as attention_mask
                               of the first (self-attention) block
            padding_mask -- Mask used as attention_mask of the second
                            multi-head attention block
        Returns:
            out3 -- Tensor of shape (batch_size, target_seq_len, embedding_dim)
            attn_weights_block1 -- Tensor of shape (batch_size, num_heads, target_seq_len, target_seq_len)
            attn_weights_block2 -- Tensor of shape (batch_size, num_heads, target_seq_len, input_seq_len)
        """
        # BLOCK 1 = self-attention over the target (query, value and key are all x),
        # restricted by the look-ahead mask
        self_attn, attn_weights_block1 = self.mha1(
            x, value=x, key=x,
            attention_mask=look_ahead_mask,
            return_attention_scores=True)  # (batch_size, target_seq_len, embedding_dim)
        self_attn = self.dropout1(self_attn, training=training)
        # residual connection + layer normalization
        out1 = self.layernorm1(self_attn + x)

        # BLOCK 2 = attention over the encoder output: the query comes from block 1,
        # value and key are both enc_output
        cross_attn, attn_weights_block2 = self.mha2(
            out1, value=enc_output, key=enc_output,
            attention_mask=padding_mask,
            return_attention_scores=True)  # (batch_size, target_seq_len, embedding_dim)
        cross_attn = self.dropout2(cross_attn, training=training)
        # residual connection + layer normalization
        out2 = self.layernorm2(cross_attn + out1)  # (batch_size, target_seq_len, embedding_dim)

        # BLOCK 3 = position-wise feed-forward network
        ffn_out = self.ffn(out2)  # (batch_size, target_seq_len, embedding_dim)
        ffn_out = self.dropout3(ffn_out, training=training)
        # residual connection + layer normalization
        out3 = self.layernorm3(ffn_out + out2)  # (batch_size, target_seq_len, embedding_dim)

        return out3, attn_weights_block1, attn_weights_block2
    

### b) Decoder

In [83]:
class Decoder(tf.keras.layers.Layer):
    """
    Full decoder stack: the target input is embedded, scaled, summed with the
    positional encoding and passed through dropout, then through a stack of
    `num_layers` DecoderLayer blocks.
    """
    def __init__(self, num_layers, embedding_dim, num_heads, fully_connected_dim, target_vocab_size,
                maximum_position_encoding, dropout_rate=0.1, layernorm_eps=1e-6):
        super().__init__()

        self.embedding_dim = embedding_dim
        self.num_layers = num_layers

        self.embedding = Embedding(target_vocab_size, self.embedding_dim)
        # precomputed table of shape (1, maximum_position_encoding, embedding_dim)
        self.pos_encoding = positional_encoding(maximum_position_encoding, self.embedding_dim)

        self.dec_layers = [
            DecoderLayer(embedding_dim=self.embedding_dim,
                         num_heads=num_heads,
                         fully_connected_dim=fully_connected_dim,
                         dropout_rate=dropout_rate,
                         layernorm_eps=layernorm_eps)
            for _ in range(self.num_layers)
        ]
        self.dropout = Dropout(dropout_rate)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        """
        Forward pass for the Decoder

        Arguments:
            x -- Tensor of shape (batch_size, target_seq_len)
            enc_output --  Tensor of shape (batch_size, input_seq_len, embedding_dim)
            training -- Boolean, set to true to activate
                        the training mode for dropout layers
            look_ahead_mask -- Mask for the target input (first attention block)
            padding_mask -- Mask for the second multi-head attention block
        Returns:
            x -- Tensor of shape (batch_size, target_seq_len, embedding_dim)
            attention_weights -- Dictionary with two entries per decoder layer:
                                 'decoder_layer{n}_block1_self_att' and
                                 'decoder_layer{n}_block2_decenc_att' (n starts at 1),
                                 holding the attention scores of block 1 and block 2
        """
        # read the length before x is replaced by its embedding
        target_len = tf.shape(x)[1]

        embedded = self.embedding(x)  # (batch_size, target_seq_len, embedding_dim)
        # scale the embedding by sqrt(embedding_dim)
        embedded = embedded * tf.math.sqrt(tf.cast(self.embedding_dim, tf.float32))
        # add only the first target_len rows of the positional table
        embedded = embedded + self.pos_encoding[:, :target_len, :]

        x = self.dropout(embedded, training=training)

        attention_weights = {}
        # layers are numbered from 1 in the dictionary keys
        for layer_number, dec_layer in enumerate(self.dec_layers, start=1):
            x, block1, block2 = dec_layer(x, enc_output, training, look_ahead_mask, padding_mask)

            attention_weights['decoder_layer{}_block1_self_att'.format(layer_number)] = block1
            attention_weights['decoder_layer{}_block2_decenc_att'.format(layer_number)] = block2

        # x.shape == (batch_size, target_seq_len, embedding_dim)
        return x, attention_weights

## Transformer

After the Nth Decoder layer, a dense layer and softmax are applied to predict the sequence's next output.
1. Pass input through Encoder with appropriate mask
2. Pass Encoder output and target through Decoder with appropriate mask
3. Apply linear transformation and softmax to get prediction

In [85]:
class Transformer(tf.keras.Model):
    """
    Complete transformer: an Encoder, a Decoder and a final softmax Dense layer
    over the target vocabulary.
    """
    def __init__(self, num_layers, embedding_dim, num_heads, fully_connected_dim, input_vocab_size,
                target_vocab_size, max_positional_encoding_input,
                max_positional_encoding_target, dropout_rate=0.1, layernorm_eps=1e-6):
        super().__init__()

        # hyper-parameters shared by the encoder and the decoder
        shared = dict(num_layers=num_layers,
                      embedding_dim=embedding_dim,
                      num_heads=num_heads,
                      fully_connected_dim=fully_connected_dim,
                      dropout_rate=dropout_rate,
                      layernorm_eps=layernorm_eps)

        self.encoder = Encoder(input_vocab_size=input_vocab_size,
                               maximum_position_encoding=max_positional_encoding_input,
                               **shared)

        self.decoder = Decoder(target_vocab_size=target_vocab_size,
                               maximum_position_encoding=max_positional_encoding_target,
                               **shared)

        # one probability per target-vocabulary entry
        self.final_layer = Dense(target_vocab_size, activation='softmax')

    def call(self, inp, tar, training, enc_padding_mask, look_ahead_mask, dec_padding_mask):
        """
        Forward pass for the entire Transformer
        Arguments:
            inp -- Tensor of shape (batch_size, input_seq_len)
            tar -- Tensor of shape (batch_size, target_seq_len)
            training -- Boolean, set to true to activate
                        the training mode for dropout layers
            enc_padding_mask -- Mask passed to the encoder to ensure that the
                                padding is not treated as part of the input
            look_ahead_mask -- Mask for the target input (first decoder attention block)
            dec_padding_mask -- Mask for the second multi-head attention block
                                of the decoder
        Returns:
            final_output -- Tensor of shape (batch_size, tar_seq_len, target_vocab_size)
                            with the softmax output of the final Dense layer
            attention_weights -- Dictionary of tensors containing the attention
                                 weights of every decoder layer
        """
        encoded = self.encoder(inp, training, enc_padding_mask)  # (batch_size, inp_seq_len, embedding_dim)

        # decoded.shape == (batch_size, tar_seq_len, embedding_dim)
        decoded, attention_weights = self.decoder(
            tar, encoded, training, look_ahead_mask, dec_padding_mask)

        final_output = self.final_layer(decoded)  # (batch_size, tar_seq_len, target_vocab_size)

        return final_output, attention_weights