<center><h1>Transformers Implementation</h1></center>

### Importing everything we need

In this notebook we rely on TensorFlow for automatic differentiation and optimization, so that we can skip the details of backpropagation and focus on the Transformer architecture itself.

In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

<h3>Positional Encoding</h3>

Positional encoding shape: **(length, d_model)**<br>

In [3]:
def positional_encoding(length, depth):
    """Build the sinusoidal positional-encoding table.

    For position ``pos`` and column ``i``::

        PE[pos, i] = sin(pos / 10000 ** (i / depth))         if i is even
        PE[pos, i] = cos(pos / 10000 ** ((i - 1) / depth))   if i is odd

    Args:
        length: Number of positions (rows); anything ``int()`` accepts.
        depth: Encoding dimension (columns), i.e. ``d_model``.

    Returns:
        A float64 ``numpy.ndarray`` of shape ``(length, depth)``.
    """
    length, depth = int(length), int(depth)
    positions = np.arange(length, dtype=np.float64)[:, np.newaxis]  # (length, 1)
    columns = np.arange(depth)                                      # (depth,)
    # Each sin/cos pair shares the exponent of its even column.
    exponents = (columns - columns % 2) / depth
    angles = positions / np.power(10000.0, exponents)               # (length, depth)
    # Even columns take the sine, odd columns the cosine.
    return np.where(columns % 2 == 0, np.sin(angles), np.cos(angles))

# Print the table for 3 and then 4 positions, both with depth 4.
for n_positions in (3, 4):
    print(positional_encoding(n_positions, 4))  # (length, depth)

[[ 0.          1.          0.          1.        ]
 [ 0.84147098  0.54030231  0.00999983  0.99995   ]
 [ 0.90929743 -0.41614684  0.01999867  0.99980001]]
[[ 0.          1.          0.          1.        ]
 [ 0.84147098  0.54030231  0.00999983  0.99995   ]
 [ 0.90929743 -0.41614684  0.01999867  0.99980001]
 [ 0.14112001 -0.9899925   0.0299955   0.99955003]]


<h3>Positional Embedding</h3>

Input shape: **(batch_size, length)** (integer token ids)<br>
Positional embedding output shape: **(batch_size, length, d_model)**<br>

In [4]:
# input_shape=(batch_size, length)      output_shape=(batch_size, length, depth)
class PositionalEmbedding(tf.keras.layers.Layer):
    """Token embedding followed by the addition of sinusoidal positional encodings.

    Args:
        vocab_size: Vocabulary size (``input_dim`` of the embedding layer).
        d_model: Embedding / model dimension (``output_dim`` of the embedding layer).
    """

    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.d_model = d_model
        # mask_zero=True: the embedding layer treats token id 0 as padding.
        self.embedding = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=d_model, mask_zero=True)

    def compute_mask(self, *args, **kwargs):
        """Delegate mask computation to the wrapped embedding layer."""
        return self.embedding.compute_mask(*args, **kwargs)

    def call(self, x):
        """Embed token ids and add the positional encoding.

        Args:
            x: Token ids of shape (batch_size, length).

        Returns:
            Tensor of shape (batch_size, length, d_model).
        """
        # The encoding table is built with NumPy, which needs a concrete length.
        # Prefer the static shape; fall back to the dynamic one (resolvable only
        # in eager mode) when the static length is unknown.
        length = x.shape[1]
        if length is None:
            length = tf.shape(x)[1]
        x = self.embedding(x)
        # The table is float64; cast it explicitly to the embedding dtype.
        encoding = tf.cast(positional_encoding(length, self.d_model), x.dtype)
        # (length, d_model) broadcasts over the batch dimension.
        return x + encoding

<h3>Multi-Head Attention</h3>

Input shape: **(batch_size, length, d_model)**<br>
MultiHeadAttention output shape: **(batch_size, length, d_model)**<br>

In [5]:
class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head scaled dot-product attention with optional causal masking.

    Args:
        d_model: Model dimension; must be divisible by ``heads``.
        heads: Number of attention heads.
        mask: If True, query position ``i`` may only attend to key
            positions ``j <= i``.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``heads``.
    """

    def __init__(self, d_model, heads, mask=False):
        super().__init__()
        if d_model % heads != 0:
            raise ValueError(f"d_model ({d_model}) must be divisible by heads ({heads})")
        self.d_model = d_model
        self.heads = heads
        self.mask = mask
        self.queries = tf.keras.layers.Dense(d_model,use_bias=False) #  input_shape=(batch_size, length, d_model)
        self.keys = tf.keras.layers.Dense(d_model,use_bias=False)
        self.values = tf.keras.layers.Dense(d_model,use_bias=False)
        self.out = tf.keras.layers.Dense(d_model,use_bias=False)

    def _split_heads(self, t):
        """Reshape (batch, length, d_model) into (batch, heads, length, d_model // heads)."""
        dims = tf.shape(t)  # dynamic shape, so unknown batch/length dimensions are fine
        t = tf.reshape(t, (dims[0], dims[1], self.heads, self.d_model // self.heads))
        return tf.transpose(t, perm=(0, 2, 1, 3))

    def call(self, q, k, v):
        """Apply attention.

        Args:
            q: Queries, shape (batch_size, length, d_model).
            k: Keys, shape (batch_size, lengthc, d_model).
            v: Values, shape (batch_size, lengthc, d_model).

        Returns:
            A tuple ``(output, scores)`` where ``output`` has shape
            (batch_size, length, d_model) and ``scores`` are the scaled
            dot-product scores *before* masking and softmax, with shape
            (batch_size, heads, length, lengthc).
        """
        q_dims = tf.shape(q)
        batch_size, length = q_dims[0], q_dims[1]
        lengthc = tf.shape(k)[1]

        Q = self._split_heads(self.queries(q))
        K = self._split_heads(self.keys(k))
        V = self._split_heads(self.values(v))
        # shapes= (batch_size, heads, length, d_model//heads=dk)

        dk = tf.cast(self.d_model // self.heads, Q.dtype)
        scores = tf.matmul(Q, K, transpose_b=True) / tf.sqrt(dk)

        logits = scores
        if self.mask:
            # Lower triangle (diagonal included) is visible; everything above it
            # gets -inf. Built this way it is also valid when length != lengthc.
            visible = tf.linalg.band_part(tf.ones(tf.stack([length, lengthc])), -1, 0) > 0
            logits = scores + tf.cast(tf.where(visible, 0.0, float('-inf')), scores.dtype)

        weights = tf.nn.softmax(logits)   # (batch_size, heads, length, lengthc)
        context = tf.matmul(weights, V)   # (batch_size, heads, length, dk)
        # Merge the heads back: (batch_size, length, d_model).
        merged = tf.reshape(tf.transpose(context, perm=(0, 2, 1, 3)), (batch_size, length, self.d_model))
        return self.out(merged), scores

<h3>Feed Forward Neural Network</h3>

Input shape: **(batch_size, length, d_model)**<br>
Inner layer shape: **(batch_size, length, dff)**<br>
FeedForward output shape: **(batch_size, length, d_model)**<br>

In [6]:
class FeedForward(tf.keras.layers.Layer):
    """Two-layer feed-forward block: Dense(dff) with ReLU, then Dense(d_model).

    Args:
        d_model: Number of units of the output layer.
        dff: Number of units of the inner (ReLU) layer.
    """

    def __init__(self, d_model, dff):
        super().__init__()
        # input_shape=(batch_size, length, d_model)
        self.l1 = tf.keras.layers.Dense(units=dff, activation=tf.keras.activations.relu)
        self.l2 = tf.keras.layers.Dense(units=d_model)

    def call(self, x):
        """Return ``l2(l1(x))``."""
        hidden = self.l1(x)
        return self.l2(hidden)

<h3>Encoder Layer</h3>

Input shape: **(batch_size, length, d_model)**<br>
EncoderLayer output shape: **(batch_size, length, d_model)**<br>

In [7]:
class EncoderLayer(tf.keras.layers.Layer):
    """One encoder block: self-attention and feed-forward, each followed by add & norm.

    Args:
        d_model: Model dimension.
        heads: Number of attention heads.
        dff: Inner dimension of the feed-forward block.
        masked_attention: Forwarded to ``MultiHeadAttention`` as its ``mask`` flag.
    """

    def __init__(self, d_model, heads, dff, masked_attention=False):
        super().__init__()
        # Sub-layer 1: self-attention + residual + layer norm.
        self.mha = MultiHeadAttention(d_model, heads, masked_attention)
        self.add1 = tf.keras.layers.Add()
        self.norm1 = tf.keras.layers.LayerNormalization()
        # Sub-layer 2: feed-forward + residual + layer norm.
        self.FFN = FeedForward(d_model, dff)
        self.add2 = tf.keras.layers.Add()
        self.norm2 = tf.keras.layers.LayerNormalization()

    def call(self, x):
        """Run the block on ``x``; the attention scores are discarded."""
        attended, _ = self.mha(x, x, x)
        after_attention = self.norm1(self.add1((attended, x)))

        fed_forward = self.FFN(after_attention)
        after_ffn = self.norm2(self.add2((fed_forward, after_attention)))
        return after_ffn

<h3>Encoder</h3>

Input shape: **(batch_size, length)** (integer token ids)<br>
Encoder output shape: **(batch_size, length, d_model)**<br>

In [8]:
class Encoder(tf.keras.layers.Layer):
    """Positional embedding followed by a stack of ``N`` encoder layers.

    Args:
        N: Number of encoder layers.
        vocab_size: Vocabulary size of the input tokens.
        d_model: Model dimension.
        heads: Number of attention heads per layer.
        dff: Inner dimension of each feed-forward block.
        masked_attention: Forwarded to every ``EncoderLayer``.
    """

    def __init__(self, N, vocab_size, d_model, heads, dff, masked_attention=False):
        super().__init__()
        self.positional_embedding = PositionalEmbedding(vocab_size, d_model)
        # Forward the caller's flag; it used to be hard-coded to False here.
        self.encoder_layers = [EncoderLayer(d_model, heads, dff, masked_attention=masked_attention) for _ in range(N)]
        self.N = N

    def call(self, x):
        """Embed the token ids ``x`` and pass them through every encoder layer in order."""
        x = self.positional_embedding(x)

        for layer in self.encoder_layers:
            x = layer(x)
        return x

<h3>Decoder Layer</h3>

Input shapes: **(batch_size, x_length, d_model)** and **(batch_size, context_length, d_model)**<br>
DecoderLayer output shape: **(batch_size, x_length, d_model)** <br>

In [9]:
class DecoderLayer(tf.keras.layers.Layer):
    """One decoder block: masked self-attention, cross-attention and feed-forward.

    Each of the three sub-layers is followed by a residual add and a layer
    normalization.

    Args:
        d_model: Model dimension.
        mheads: Number of heads of the masked self-attention.
        cheads: Number of heads of the cross-attention.
        dff: Inner dimension of the feed-forward block.
        masked_cross_attention: ``mask`` flag of the cross-attention.
    """

    def __init__(self, d_model, mheads, cheads, dff, masked_cross_attention=False):
        super().__init__()
        # Sub-layer 1: self-attention over the decoder input, always masked.
        self.mha1 = MultiHeadAttention(d_model, mheads, mask=True)
        self.add1 = tf.keras.layers.Add()
        self.norm1 = tf.keras.layers.LayerNormalization()

        # Sub-layer 2: attention whose keys/values come from the context.
        self.mha2 = MultiHeadAttention(d_model, cheads, masked_cross_attention)
        self.add2 = tf.keras.layers.Add()
        self.norm2 = tf.keras.layers.LayerNormalization()

        # Sub-layer 3: feed-forward.
        self.FFN = FeedForward(d_model, dff)
        self.add3 = tf.keras.layers.Add()
        self.norm3 = tf.keras.layers.LayerNormalization()

    def call(self, x, context):
        """Run the block; both attention score tensors are discarded."""
        self_attended, _ = self.mha1(x, x, x)
        after_self = self.norm1(self.add1((self_attended, x)))

        cross_attended, _ = self.mha2(after_self, context, context)
        after_cross = self.norm2(self.add2((cross_attended, after_self)))

        fed_forward = self.FFN(after_cross)
        after_ffn = self.norm3(self.add3((fed_forward, after_cross)))
        return after_ffn

<h3>Decoder</h3>

Input shapes: **(batch_size, x_length)** (integer token ids) and **(batch_size, context_length, d_model)**<br>
Decoder output shape: **(batch_size, x_length, d_model)** <br>

In [10]:
class Decoder(tf.keras.layers.Layer):
    """Positional embedding followed by a stack of ``N`` decoder layers.

    Args:
        N: Number of decoder layers.
        target_vocab_size: Vocabulary size of the decoder input tokens.
        d_model: Model dimension.
        mheads: Heads of each layer's masked self-attention.
        cheads: Heads of each layer's cross-attention.
        dff: Inner dimension of each feed-forward block.
        masked_cross_attention: Forwarded to every ``DecoderLayer``.
    """

    def __init__(self, N, target_vocab_size, d_model, mheads, cheads, dff, masked_cross_attention=False):
        super().__init__()
        self.positional_embedding = PositionalEmbedding(target_vocab_size, d_model)
        self.decoder_layers = [
            DecoderLayer(d_model, mheads, cheads, dff, masked_cross_attention)
            for _ in range(N)
        ]
        self.N = N

    def call(self, x, context):
        """Embed ``x`` and run it through every decoder layer with the same ``context``."""
        hidden = self.positional_embedding(x)
        for index in range(self.N):
            hidden = self.decoder_layers[index](hidden, context)
        return hidden

<h3>Transformer</h3>

Input shapes: **(batch_size, x_length)** and **(batch_size, context_length)** (integer token ids)<br>
Transformer output shape: **(batch_size, x_length, target_vocab_size)** <br>

In [11]:
class Transformer(tf.keras.Model):
    """Encoder-decoder Transformer ending in a Dense projection to the target vocabulary.

    Args:
        Ne: Number of encoder layers.
        Nd: Number of decoder layers.
        vocab_size: Vocabulary size of the encoder (context) tokens.
        target_vocab_size: Vocabulary size of the decoder tokens and of the output.
        d_model: Model dimension.
        gheads: Heads of the encoder attention.
        mheads: Heads of the decoder's masked self-attention.
        cheads: Heads of the decoder's cross-attention.
        dff: Inner dimension of the feed-forward blocks.
        masked_global_attention: Passed to the encoder as ``masked_attention``.
        masked_cross_attention: Passed to the decoder.
    """

    def __init__(self, Ne, Nd, vocab_size, target_vocab_size, d_model, gheads, mheads, cheads, dff, masked_global_attention=False, masked_cross_attention=False):
        super().__init__()
        self.encoder = Encoder(Ne, vocab_size, d_model, gheads, dff, masked_global_attention)
        self.decoder = Decoder(Nd, target_vocab_size, d_model, mheads, cheads, dff, masked_cross_attention)
        # Final projection: one output per target-vocabulary entry.
        self.out = tf.keras.layers.Dense(target_vocab_size)

    def call(self, x, context):
        """Encode ``context``, decode ``x`` against it and return the projected result."""
        memory = self.encoder(context)
        return self.out(self.decoder(x, memory))

<h3>Let's try it out</h3>

In [16]:
# Hyper-parameters and toy sequence lengths.
# NOTE(review): dff=2046 looks like a typo for 2048 — confirm before changing.
Ne, Nd, vocab_size, target_vocab_size, batch_size, d_model, mheads, cheads, gheads, dff, lengthx, lengthc = 6, 6, 1000, 100, 32, 512, 8, 8, 8, 2046, 9, 1

tf.random.set_seed(0)  # reproducible inputs and weight initialisation

# Token ids must be integers inside the vocabulary. Floats drawn from [0, 1)
# would all truncate to id 0, which the embedding treats as padding
# (mask_zero=True), so sample from [1, vocab).
# The decoder input uses the target vocabulary, the encoder input the source one.
# NOTE(review): `input` shadows the Python builtin; the name is kept in case
# other cells reference it.
input = tf.random.uniform(shape=(batch_size, lengthx), minval=1, maxval=target_vocab_size, dtype=tf.int32)
context_input = tf.random.uniform(shape=(batch_size, lengthc), minval=1, maxval=vocab_size, dtype=tf.int32)

transformer = Transformer(Ne, Nd, vocab_size, target_vocab_size, d_model, gheads, mheads, cheads, dff)

results = transformer(input, context_input)
print(results.shape)

(32, 9, 100)
