Note: If you're a recruiter for a company I applied to, I am willing to explain each part of this process, as well as re-implement the entire thing from scratch in front of you.

In [283]:
import numpy as np 
import matplotlib.pyplot as plt 
import tensorflow_datasets as tfds 
import tensorflow_text as text 
import tensorflow as tf

In [253]:
def angles(pos, i, d_model):
  """Return the positional-encoding angles pos / 10000**(2*(i//2)/d_model).

  Args:
    pos: position index (scalar or array); broadcast against `i`.
    i: embedding-dimension index (scalar or array).
    d_model: embedding width used to scale the exponent.

  Returns:
    `pos * angle_rates`, with the broadcast shape of `pos` and `i`.
  """
  # The rate depends only on the dimension *pair* (i // 2); the numerator must be 1,
  # not i, otherwise every rate is additionally scaled by the dimension index
  # (and dimension 0 collapses to zero for every position).
  angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
  return pos * angle_rates
def positional_encoding(pos, d_model):
  """Build the positional-encoding table as a float32 tensor of shape (1, pos, d_model).

  Args:
    pos: number of positions (rows) to generate.
    d_model: embedding width (columns).

  Returns:
    A tf.float32 tensor with a leading axis of size 1, holding sin() of the angle
    table in even columns and cos() of it in odd columns.
  """
  positions = np.arange(pos)[:, np.newaxis]    # column vector: one row per position
  dims = np.arange(d_model)[np.newaxis, :]     # row vector: one column per dimension
  table = angles(positions, dims, d_model)

  # Overwrite in place: even columns take the sine, odd columns the cosine.
  table[:, 0::2] = np.sin(table[:, 0::2])
  table[:, 1::2] = np.cos(table[:, 1::2])

  # Prepend an axis of size 1, then convert the numpy array to a float32 tensor.
  return tf.cast(table[np.newaxis, ...], tf.float32)

In [254]:
def create_padding_mask(seq):
  """Flag padding tokens: 1.0 where `seq` equals 0, 0.0 everywhere else.

  Two size-1 axes are inserted between the first and last axes, so a
  (batch_size, seq_len) input yields a (batch_size, 1, 1, seq_len) float32 mask.
  """
  is_padding = tf.math.equal(seq, 0)
  pad_mask = tf.cast(is_padding, tf.float32)
  return pad_mask[:, tf.newaxis, tf.newaxis, :]

In [255]:
def create_look_ahead_mask(size):
  """Return a (size, size) mask that is 1 strictly above the diagonal and 0 elsewhere."""
  # band_part(..., -1, 0) keeps the lower triangle including the diagonal.
  lower_triangle = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
  return 1 - lower_triangle

In [256]:
def scaled_dot_product_attention(q, k, v, mask):
  """Compute softmax(q @ k^T / sqrt(depth) + mask * -1e9) @ v.

  Args:
    q: queries, shape (..., seq_len_q, depth).
    k: keys, shape (..., seq_len_k, depth).
    v: values, shape (..., seq_len_k, depth_v).
    mask: tensor broadcastable to (..., seq_len_q, seq_len_k) holding 1 at
      positions to hide and 0 elsewhere, or None for no masking.

  Returns:
    (output, attention_weights): output has shape (..., seq_len_q, depth_v);
    attention_weights has shape (..., seq_len_q, seq_len_k) and sums to 1
    over the last axis.
  """
  scores = tf.matmul(q, k, transpose_b=True)
  depth = tf.cast(tf.shape(k)[-1], tf.float32)
  scores = scores / tf.math.sqrt(depth)

  # The mask is *added* (not multiplied): entries equal to 1 push the logit to
  # about -1e9 so softmax assigns them ~0 weight; entries equal to 0 leave the
  # logit untouched.
  if mask is not None:
    scores += (mask * -1e9)

  attention_weights = tf.nn.softmax(scores, axis=-1)

  # Weighted sum of the values. `v` must NOT be transposed here: the weights are
  # (..., seq_len_q, seq_len_k) and v is (..., seq_len_k, depth_v).
  output = tf.matmul(attention_weights, v)

  # The weights are returned alongside the output so callers can inspect them.
  return output, attention_weights

In [257]:
class MultiHeadAttention(tf.keras.layers.Layer):
  """Multi-head attention implemented as a Keras layer.

  q, k and v are each projected with a Dense(d_model) layer, split into
  `num_heads` heads of width `depth = d_model // num_heads`, passed through
  scaled_dot_product_attention, merged back and projected once more.
  """

  def __init__(self, d_model, num_heads):
    super().__init__()
    self.num_heads = num_heads
    self.d_model = d_model
    # d_model has to divide evenly across the heads.
    assert d_model % num_heads == 0
    self.depth = d_model // num_heads

    # Learned projections for queries, keys and values.
    self.wq = tf.keras.layers.Dense(d_model)
    self.wk = tf.keras.layers.Dense(d_model)
    self.wv = tf.keras.layers.Dense(d_model)

    # Final projection applied to the concatenated heads.
    self.dense = tf.keras.layers.Dense(d_model)

  def split_heads(self, x, batch_size):
    """Reshape the last axis into (num_heads, depth) and move the head axis to position 1.

    Result shape: (batch_size, num_heads, seq_len, depth).
    """
    per_head = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
    return tf.transpose(per_head, perm=[0, 2, 1, 3])

  def call(self, v, k, q, mask):
    """Run attention; returns (output, attention_weights).

    Note the argument order: values, keys, queries, then the mask.
    """
    # The batch size is read from q's leading axis.
    batch_size = tf.shape(q)[0]

    q_heads = self.split_heads(self.wq(q), batch_size)
    k_heads = self.split_heads(self.wk(k), batch_size)
    v_heads = self.split_heads(self.wv(v), batch_size)

    per_head_output, attention_weights = scaled_dot_product_attention(
        q_heads, k_heads, v_heads, mask)

    # Back to (batch_size, seq_len, num_heads, depth) so the heads can be merged.
    per_head_output = tf.transpose(per_head_output, perm=[0, 2, 1, 3])
    merged = tf.reshape(per_head_output, (batch_size, -1, self.d_model))

    return self.dense(merged), attention_weights



In [258]:
def point_wise_feed_forward_network(d_model, dff):
  """Return a two-layer Sequential: Dense(dff) with ReLU, then Dense(d_model)."""
  hidden = tf.keras.layers.Dense(dff, activation='relu')
  projection = tf.keras.layers.Dense(d_model)
  return tf.keras.Sequential([hidden, projection])


In [259]:
class EncoderLayer(tf.keras.layers.Layer):
  """One encoder block: self-attention, then a feed-forward network.

  Each sub-layer is followed by dropout, a residual connection and layer
  normalization.
  """

  def __init__(self, d_model, num_heads, dff, rate = 0.1):
    super().__init__()
    self.mha = MultiHeadAttention(d_model, num_heads)
    self.ffn = point_wise_feed_forward_network(d_model, dff)
    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)

  def call(self, x, training, mask):
    """Apply the block to `x`; `training` is forwarded to both dropout layers."""
    # Self-attention: x is used as values, keys and queries.
    attended, _ = self.mha(x, x, x, mask)
    attended = self.dropout1(attended, training=training)
    residual1 = self.layernorm1(attended + x)

    # Position-wise feed-forward network with its own residual connection.
    transformed = self.ffn(residual1)
    transformed = self.dropout2(transformed, training=training)
    return self.layernorm2(transformed + residual1)

In [260]:
class DecoderLayer(tf.keras.layers.Layer):
  """One decoder block: masked self-attention, attention over the encoder output, feed-forward.

  Each of the three sub-layers is followed by dropout, a residual connection
  and layer normalization.
  """

  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super().__init__()
    self.mha1 = MultiHeadAttention(d_model, num_heads)
    self.mha2 = MultiHeadAttention(d_model, num_heads)
    self.ffn = point_wise_feed_forward_network(d_model, dff)
    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)
    self.dropout3 = tf.keras.layers.Dropout(rate)
    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

  def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
    """Return (output, self_attention_weights, encoder_attention_weights).

    `training` is forwarded to the three dropout layers.
    """
    # Block 1: self-attention over the decoder input, using look_ahead_mask.
    self_attended, self_weights = self.mha1(x, x, x, look_ahead_mask)
    self_attended = self.dropout1(self_attended, training=training)
    residual1 = self.layernorm1(self_attended + x)

    # Block 2: values and keys come from the encoder output, queries from block 1.
    cross_attended, cross_weights = self.mha2(
        enc_output, enc_output, residual1, padding_mask)
    cross_attended = self.dropout2(cross_attended, training=training)
    residual2 = self.layernorm2(cross_attended + residual1)

    # Block 3: position-wise feed-forward network.
    transformed = self.ffn(residual2)
    transformed = self.dropout3(transformed, training=training)
    residual3 = self.layernorm3(transformed + residual2)

    return residual3, self_weights, cross_weights

In [261]:
class Encoder(tf.keras.layers.Layer):
  """Token embedding + positional encoding followed by `num_layers` EncoderLayer blocks."""

  def __init__(self, num_layers, d_model,num_heads, dff, input_vocab_size, maximum_positional_encoding,rate=0.1):
    super().__init__()
    self.d_model = d_model
    self.num_layers = num_layers
    # Precomputed once for the longest sequence supported; sliced per call.
    self.pos_encoding = positional_encoding(maximum_positional_encoding, self.d_model)
    self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
    self.enc = [EncoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(rate)

  def call(self, x, training, padding_mask):
    """Encode token ids `x`; `training` is forwarded to dropout and every encoder layer."""
    seq_len = tf.shape(x)[1]

    embedded = self.embedding(x)
    # Scale the embeddings by sqrt(d_model) before adding the positional encoding.
    embedded *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    embedded += self.pos_encoding[:, :seq_len, :]

    hidden = self.dropout(embedded, training=training)
    for layer in self.enc:
      hidden = layer(hidden, training, padding_mask)
    return hidden

In [262]:
class Decoder(tf.keras.layers.Layer):
  """Token embedding + positional encoding followed by `num_layers` DecoderLayer blocks."""

  def __init__(self, num_layers, d_model, num_heads,dff, target_vocab_size,maximum_positional_encoding,rate=0.1):
    super().__init__()
    self.num_layers = num_layers
    self.d_model = d_model
    self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
    # Precomputed once for the longest sequence supported; sliced per call.
    self.pos_encoding = positional_encoding(maximum_positional_encoding, d_model)
    self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(rate)

  def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
    """Decode token ids `x` against `enc_output`.

    Returns:
      (output, attention_weights), where attention_weights maps
      'decoder_layer{n}_block1' / 'decoder_layer{n}_block2' (n starting at 1)
      to the two attention-weight tensors produced by each decoder layer.
    """
    # Number of target positions; used to slice the positional-encoding table.
    seq_len = tf.shape(x)[1]

    hidden = self.embedding(x)
    # Scale the embeddings by sqrt(d_model) before adding the positional encoding.
    hidden *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    hidden += self.pos_encoding[:, :seq_len, :]
    hidden = self.dropout(hidden, training=training)

    attention_weights = {}
    for i, layer in enumerate(self.dec_layers):
      hidden, block1, block2 = layer(
          hidden, enc_output, training, look_ahead_mask, padding_mask)
      attention_weights[f'decoder_layer{i+1}_block1'] = block1
      attention_weights[f'decoder_layer{i+1}_block2'] = block2
    return hidden, attention_weights

In [263]:
class Transformer(tf.keras.Model):
  """Encoder-decoder Transformer followed by a Dense projection to the target vocabulary."""

  def __init__(self,num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size, pe_input, pe_target, rate=0.1):
    super().__init__()
    self.encoder = Encoder(num_layers, d_model, num_heads, dff, input_vocab_size, pe_input, rate)
    self.decoder = Decoder(num_layers, d_model,num_heads, dff, target_vocab_size, pe_target, rate)
    # No activation: the model outputs one raw score per target-vocabulary entry.
    self.final_layer = tf.keras.layers.Dense(target_vocab_size)

  def call(self, inputs, training):
    """Run the model on `inputs = (inp, tar)`.

    Args:
      inputs: pair of token-id tensors (source, target), packed into a single
        argument because Keras models take all inputs in the first argument.
      training: forwarded to the encoder and decoder.

    Returns:
      (final_output, attention_weights): the vocabulary scores and the dict of
      attention weights collected by the decoder.
    """
    inp, tar = inputs
    enc_padding_mask, look_ahead_mask, dec_padding_mask  = self.create_masks(inp, tar)
    enc_output = self.encoder(inp, training,enc_padding_mask)
    dec_output, attention_weights = self.decoder(tar, enc_output, training, look_ahead_mask, dec_padding_mask)
    final_output = self.final_layer(dec_output)
    return final_output, attention_weights

  def create_masks(self, inp, tar):
    """Build the three attention masks from the source and target token ids.

    Returns:
      (enc_padding_mask, look_ahead_mask, dec_padding_mask)
    """
    # Encoder self-attention: hide padded source positions.
    enc_padding_mask = create_padding_mask(inp)

    # Decoder attention over the encoder output: the keys are *source* positions,
    # so this mask must be built from `inp`, not `tar`. Building it from `tar`
    # hides the wrong positions and fails to broadcast whenever the source and
    # target lengths differ.
    dec_padding_mask = create_padding_mask(inp)

    # Decoder self-attention: hide future target positions and padded target
    # positions; the element-wise maximum combines both masks.
    look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
    dec_target_padding_mask = create_padding_mask(tar)
    look_ahead_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)

    return enc_padding_mask, look_ahead_mask, dec_padding_mask

In [264]:
# The model is now fully defined :) All that remains is to set up the hyperparameters.

In [265]:
# Model-size and regularization hyperparameters.
d_model = 128        # embedding / model width
num_heads = 8        # attention heads; d_model must be divisible by this
dff = 512            # hidden width of the feed-forward network
num_layers = 4       # encoder layers and decoder layers
dropout_rate = 0.1   # dropout rate

In [266]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Warmup learning-rate schedule.

  lr(step) = d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5)

  The rate grows linearly for the first `warmup_steps` steps and then decays
  with the inverse square root of the step number.
  """

  def __init__(self, d_model, warmup_steps = 4000):
    super(CustomSchedule, self).__init__()
    # Stored as float32 so it can be passed to tf.math.rsqrt.
    self.d_model = tf.cast(d_model, tf.float32)
    self.warmup_steps = warmup_steps

  def __call__(self, step):
    # The step may arrive as an integer counter; rsqrt needs a floating-point
    # input, so cast it first.
    step = tf.cast(step, tf.float32)
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)
    return tf.math.rsqrt(self.d_model)*tf.math.minimum(arg1, arg2)

In [267]:
# Adam driven by the warmup schedule, with non-default beta_2 and epsilon.
learning_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(
    learning_rate,
    beta_1=0.9,
    beta_2=0.98,
    epsilon=1e-9,
)

In [268]:
# reduction='none' keeps one loss value per element so padding can be masked out
# before averaging; from_logits=True because the model's final Dense layer has
# no activation.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,
    reduction='none',
)

In [269]:
def loss_function(real, pred):
  """Mean cross-entropy over the non-padding positions of `real` (padding id is 0)."""
  keep = tf.math.logical_not(tf.math.equal(real, 0))
  per_element_loss = loss_object(real, pred)

  # Zero out the loss wherever the target is padding.
  weights = tf.cast(keep, dtype=per_element_loss.dtype)
  masked_total = tf.reduce_sum(per_element_loss * weights)

  # Divide by the number of real (non-padding) positions, not by the tensor size.
  return masked_total / tf.reduce_sum(weights)

In [270]:
def accuracy_function(real, pred):
  """Fraction of non-padding positions (target id != 0) where argmax(pred) equals `real`."""
  predicted_ids = tf.argmax(pred, axis=2)
  matches = tf.equal(real, predicted_ids)
  not_padding = tf.math.logical_not(tf.math.equal(real, 0))

  # Only count a match when the target position is not padding.
  correct = tf.math.logical_and(not_padding, matches)

  correct_count = tf.reduce_sum(tf.cast(correct, dtype=tf.float32))
  real_count = tf.reduce_sum(tf.cast(not_padding, dtype=tf.float32))
  return correct_count / real_count


In [271]:
# Running means used to aggregate the per-batch loss and accuracy.
train_loss = tf.keras.metrics.Mean(
    name='train_loss',
)
train_accuracy = tf.keras.metrics.Mean(
    name='train_accuracy',
)

In [272]:
# Instantiate the model. The vocabulary sizes come from the `tokenizer` object,
# which must already be defined; positional encodings are precomputed for up to
# 1000 positions on each side.
transformer = Transformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=tokenizer.pt.get_vocab_size().numpy(),
    target_vocab_size=tokenizer.en.get_vocab_size().numpy(),
    pe_input=1000,
    pe_target=1000,
    rate=dropout_rate,
)

In [279]:
# Keras expects a loss *instance* or a callable, not a loss class. The masked
# loss_function is used because the model outputs raw scores (no activation on
# the final layer) and padded positions (token id 0) should not contribute.
# NOTE(review): Transformer.call returns (final_output, attention_weights);
# confirm the compiled loss/metric are applied to final_output only before
# relying on fit().
transformer.compile(optimizer = optimizer, loss = loss_function, metrics = ['accuracy'])

In [281]:
# Print the layer-by-layer parameter counts of the model.
transformer.summary()

Model: "transformer_13"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder_13 (Encoder)        multiple                  1787008   
                                                                 
 decoder_6 (Decoder)         multiple                  1955584   
                                                                 
 dense_471 (Dense)           multiple                  904290    
                                                                 
Total params: 4,646,882
Trainable params: 4,646,882
Non-trainable params: 0
_________________________________________________________________


This was probably the most intense build-from-scratch project I've done. It was pretty exhausting, but it taught me a lot about implementing an architecture in code, and it also taught me a lot about TensorFlow's rules. It's nice because, with this, I feel a lot more confident in my ability to build great deep learning architectures. This crap took me so many hours to understand before I could even start building, lol.