Built a random transformer-based model, mainly to show familiarity with TensorFlow. This is pretty simple stuff at this point. All you have to do now is preprocess some data for a task like text generation, and create the encoder, decoder, and look-ahead masks.
Now that I think about it, I think it's better to actually incorporate the mask building into each step of the model. I will be fixing this up in my next rendition of the model.
Note: The preprocessing for this model is meant for natural language generation tasks. If you want a QA-type model instead, we would take into account two outputs, both over the same vocabulary space (the context). If used for sentiment analysis, our output space would be all available sentiments, and we would simply adjust the last dense layer in the 'KennethsCustomTransformer' class to have as many units as there are outputs. This is nice to work on because it makes me feel aware of how to implement a lot of these systems that I just see in research papers.

In [None]:
import tensorflow as tf 
class KensReallyWeirdLayer(tf.keras.layers.Layer):
  """Dense projection followed by stacked ReLU -> sigmoid -> tanh activations.

  Computes ``tanh(sigmoid(relu(input @ w + b)))`` where ``w`` has shape
  ``(input_shape[-1], units)`` and ``b`` has shape ``(units,)``.

  Args:
    units: size of the last axis of the output.
  """
  def __init__(self, units):
    super(KensReallyWeirdLayer, self).__init__()
    self.units = units
  def build(self, input_shape):
    # add_weight registers the variables with Keras so they show up in
    # trainable_weights and are saved/restored with the layer.
    self.w = self.add_weight(
        shape=(input_shape[-1], self.units),
        initializer=tf.random_normal_initializer(),
        dtype='float32',
        trainable=True)
    self.b = self.add_weight(
        shape=(self.units,),
        initializer=tf.zeros_initializer(),
        dtype='float32',
        trainable=True)
  def call(self, input):
    """Apply the projection and the three chained activations to `input`."""
    # Affine projection; matmul takes the two operands separately.
    projected = tf.matmul(input, self.w) + self.b
    relu = tf.math.maximum(projected, 0.0)
    # 1 / (1 + e^-x), written with the built-in for numerical stability.
    sigmoidfollowingrelu = tf.math.sigmoid(relu)
    hyptangentofemall = tf.math.tanh(sigmoidfollowingrelu)
    return hyptangentofemall

In [None]:
import numpy as np 
def internal_value(pos, i, d_model):
  """Return the positional-encoding angle ``pos / 10000^(2*i / d_model)``.

  Args:
    pos: position index (scalar or array).
    i: frequency index (scalar or array, broadcast against `pos`).
    d_model: embedding width used to normalise the exponent.
  """
  return pos/np.power(10000, (2*i)/d_model)
def positional_encoding(max_position, d_model):
  """Build the sinusoidal positional-encoding table.

  Column ``2k`` holds ``sin(pos / 10000^(2k/d_model))`` and column ``2k+1``
  holds ``cos(pos / 10000^(2k/d_model))``, so each sin/cos pair shares one
  frequency.

  Args:
    max_position: number of positions (rows) to generate.
    d_model: embedding width (columns); may be odd.

  Returns:
    A float NumPy array of shape ``(max_position, d_model)``.
  """
  positions = np.arange(max_position)[:, np.newaxis]
  # Columns 2k and 2k+1 both use frequency index k.
  pair_index = np.arange(d_model)[np.newaxis, :] // 2
  angles = internal_value(positions, pair_index, d_model)
  # Write into a separate array so the cosine is taken from the raw angles,
  # not from values that were already passed through sin.
  values = np.empty_like(angles)
  values[:, 0::2] = np.sin(angles[:, 0::2])
  values[:, 1::2] = np.cos(angles[:, 1::2])
  return values

In [None]:
def dot_product_attention(value, key, query, dimensionality, mask):
  """Scaled dot-product attention: ``softmax(Q K^T / sqrt(d) + mask) V``.

  Args:
    value: tensor whose last two axes are (seq_len_k, depth_v).
    key: tensor whose last two axes are (seq_len_k, depth).
    query: tensor whose last two axes are (seq_len_q, depth).
    dimensionality: number the logits are scaled by (its square root is used).
    mask: tensor broadcastable to the (..., seq_len_q, seq_len_k) logits with
      1 at positions that must NOT be attended to, or None for no masking.

  Returns:
    Tensor whose last two axes are (seq_len_q, depth_v).
  """
  # Q K^T so that the last axis indexes keys; softmax then normalises over
  # the keys for each query.
  scores = tf.matmul(query, key, transpose_b=True)
  # sqrt needs a floating-point operand; `dimensionality` is usually an int.
  scores = scores / tf.math.sqrt(tf.cast(dimensionality, scores.dtype))
  if mask is not None:
    # Large negative logits become ~0 after the softmax.
    scores += tf.cast(mask, scores.dtype) * -1e12
  weights = tf.nn.softmax(scores, axis=-1)
  # (seq_len_q, seq_len_k) @ (seq_len_k, depth_v): value is not transposed.
  attout = tf.matmul(weights, value)
  return attout

In [None]:
class MultiHeadAttention(tf.keras.layers.Layer):
  """Multi-head attention built on `dot_product_attention`.

  Args:
    dimensionality: model width; must be divisible by `num_heads`.
    num_heads: number of attention heads.
    batch_size: kept for backward compatibility and stored on the instance;
      the actual batch size is read from the input at call time so partial
      batches also work.
  """
  def __init__(self,dimensionality, num_heads, batch_size):
    super(MultiHeadAttention, self).__init__()
    self.dimensionality = dimensionality
    self.num_heads = num_heads
    self.batch_size = batch_size
    assert self.dimensionality % self.num_heads == 0, (
        "dimensionality must be divisible by num_heads")
    # Learned projections for the query, key and value inputs.
    self.qW = tf.keras.layers.Dense(dimensionality)
    self.kW = tf.keras.layers.Dense(dimensionality)
    self.vW = tf.keras.layers.Dense(dimensionality)
    # Output projection applied after the heads are concatenated.
    self.dense = tf.keras.layers.Dense(dimensionality)
    self.depth = self.dimensionality // self.num_heads
  def _split_heads(self, x, batch_size):
    """Reshape (batch, seq, dimensionality) to (batch, heads, seq, depth)."""
    x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
    return tf.transpose(x, perm=[0, 2, 1, 3])
  def call(self,query, key, value, mask):
    """Attend from `query` over `key`/`value`.

    Args:
      query, key, value: tensors reshaped here as
        (batch, seq_len, dimensionality) after projection.
      mask: forwarded unchanged to `dot_product_attention`; presumably it
        must broadcast against (batch, heads, seq_len_q, seq_len_k) —
        TODO confirm against the mask-building code.

    Returns:
      Tensor of shape (batch, seq_len_q, dimensionality).
    """
    batch_size = tf.shape(query)[0]
    q = self._split_heads(self.qW(query), batch_size)
    k = self._split_heads(self.kW(key), batch_size)
    v = self._split_heads(self.vW(value), batch_size)
    # Scale by the per-head depth (sqrt(d_k)); passed as a float so the
    # square root downstream is taken on a floating-point value.
    attention_output = dot_product_attention(v, k, q, float(self.depth), mask)
    # Back to (batch, seq, heads, depth) and merge the heads.
    attention_output = tf.transpose(attention_output, perm = [0,2,1,3])
    attention_output = tf.reshape(attention_output, (batch_size, -1, self.dimensionality))
    output = self.dense(attention_output)
    return output

In [None]:
def WEIRDLayers(dff, d_model):
  """Return the custom feed-forward stack used by the encoder/decoder layers.

  The stack is, in order: LSTM(dff, returning full sequences), Dense(dff) with
  ReLU, LeakyReLU, LayerNormalization, and a final Dense(d_model).
  """
  stack = tf.keras.Sequential()
  stack.add(tf.keras.layers.LSTM(dff, return_sequences=True))
  stack.add(tf.keras.layers.Dense(dff, activation = 'relu'))
  stack.add(tf.keras.layers.LeakyReLU(alpha = 0.5))
  stack.add(tf.keras.layers.LayerNormalization(epsilon = 1e-7))
  # Project back to the model width.
  stack.add(tf.keras.layers.Dense(d_model))
  return stack

In [None]:
#Now we have an LSTM Layer, along with a FC Layer, into a LeakyRELU layer, into LayerNormalization into a final FC Layer.
#Now we can build the EncoderLayer and DecoderLayer
class EncoderLayer(tf.keras.layers.Layer):
  """One encoder block: self-attention, then two WEIRDLayers stacks.

  Args:
    d_model: model width.
    num_heads: number of attention heads.
    dff: hidden width of the WEIRDLayers stacks.
    batch_size: forwarded to MultiHeadAttention.
  """
  def __init__(self, d_model, num_heads, dff, batch_size):
    super(EncoderLayer, self).__init__()
    self.d_model = d_model
    self.num_heads = num_heads
    self.mha= MultiHeadAttention(self.d_model, self.num_heads, batch_size)
    self.weirdlayer1 = WEIRDLayers(dff, self.d_model)
    self.weirdlayer2 = WEIRDLayers(dff, self.d_model)
    # Sub-layers are created once here so their weights persist across calls
    # (building them inside call() would create fresh ones on every step).
    self.dropout1 = tf.keras.layers.Dropout(0.3)
    self.dropout2 = tf.keras.layers.Dropout(0.3)
    self.layernorm1 = tf.keras.layers.LayerNormalization()
    self.layernorm2 = tf.keras.layers.LayerNormalization()
  def call(self, input, mask, training=None):
    """Run the block.

    Args:
      input: tensor fed to self-attention as query, key and value.
      mask: attention mask forwarded to MultiHeadAttention.
      training: Keras training flag; controls whether dropout is active.
    """
    out1_1 = self.mha(input,input,input, mask)
    out1_2 = self.dropout1(out1_1, training=training)
    # Residual connection around the attention sub-layer.
    out1_3 = self.layernorm1(out1_2 + input)
    out2_1 = self.weirdlayer1(out1_3)
    out2_2 = self.dropout2(out2_1, training=training)
    # Residual connection around the first WEIRDLayers stack.
    out2_3 = self.layernorm2(out2_2 + out1_3)
    out3 = self.weirdlayer2(out2_3)
    return out3

In [None]:
class WeirdDecoderLayer(tf.keras.layers.Layer):
  """One decoder block: masked self-attention, encoder-decoder attention and
  three WEIRDLayers stacks joined by residual-style sums.

  Args:
    d_model: model width.
    num_heads: number of attention heads.
    dff: hidden width of the WEIRDLayers stacks and of the intermediate Dense.
    batch_size: forwarded to both MultiHeadAttention layers.
  """
  def __init__(self, d_model, num_heads, dff, batch_size):
    super(WeirdDecoderLayer, self).__init__()
    self.d_model = d_model
    self.num_heads = num_heads
    self.mha1 = MultiHeadAttention(d_model, num_heads, batch_size)
    self.mha2 = MultiHeadAttention(d_model, num_heads, batch_size)
    self.weirdlayer1 = WEIRDLayers(dff, d_model)
    self.weirdlayer2 = WEIRDLayers(dff, d_model)
    self.weirdlayer3 = WEIRDLayers(dff, d_model)
    self.layernorm1= tf.keras.layers.LayerNormalization()
    self.dense = tf.keras.layers.Dense(dff)
    self.layernorm2 = tf.keras.layers.LayerNormalization()
    self.dropout1 = tf.keras.layers.Dropout(0.4)
    self.dropout2 = tf.keras.layers.Dropout(0.3)
    # The custom activation layer defined in this file is KensReallyWeirdLayer.
    self.Kenthebeast123 = KensReallyWeirdLayer(d_model)
  def call(self, input, enc_input, mask_forward, zero_mask, training=None):
    """Run the block.

    Args:
      input: decoder-side tensor (query/key/value of the first attention).
      enc_input: encoder output, used as key and value of the second attention.
      mask_forward: mask for the decoder self-attention.
      zero_mask: mask for the encoder-decoder attention.
      training: Keras training flag; controls whether dropout is active.
    """
    out1_1 = self.mha1(input, input, input, mask_forward)
    out1_2 = self.layernorm1(out1_1 + input)
    out1_3 = self.weirdlayer1(out1_2 + out1_1 + input)
    out1_4 = self.layernorm2(out1_3 + out1_2 + out1_1)
    # The query is always the decoder output; key and value come from the
    # encoder so that the encoder acts as a memory store.
    out2_1 = self.mha2(out1_4, enc_input, enc_input, zero_mask)
    out2_2 = self.dropout1(out2_1, training=training)
    out2_3 = self.dense(out2_2)
    out2_4 = self.weirdlayer2(out2_3)
    out2_5 = self.dropout2(out2_4, training=training)
    out3 = self.weirdlayer3(out2_5 + out2_4+ out2_1)
    # NOTE(review): the original computed self.Kenthebeast123(out3) and then
    # discarded it, returning out3. The returned value is kept as-is; confirm
    # whether the KensReallyWeirdLayer output was meant to be returned instead.
    return out3

In [None]:
class CustomEncoder(tf.keras.layers.Layer):
  """Token embedding + positional encoding followed by a stack of EncoderLayers.

  Args:
    num_heads: attention heads per layer.
    num_layers: number of EncoderLayers.
    d_model: model/embedding width.
    batch_size: forwarded to every EncoderLayer.
    dff: hidden width of the WEIRDLayers stacks.
    distinct_vocab_size: size of the input vocabulary.
    max_position: number of positions in the positional-encoding table.
  """
  def __init__(self, num_heads,num_layers, d_model, batch_size, dff, distinct_vocab_size, max_position):
    super(CustomEncoder, self).__init__()
    self.num_layers = num_layers
    self.enc_layers = [EncoderLayer(d_model, num_heads,dff, batch_size) for _ in range(self.num_layers)]
    self.embedder = tf.keras.layers.Embedding(distinct_vocab_size, d_model)
    # positional_encoding returns a (max_position, d_model) NumPy array; cast
    # it to float32 and add a leading axis so it broadcasts over the batch
    # when added to the embeddings.
    self.pos_encoding = tf.cast(positional_encoding(max_position, d_model), tf.float32)[tf.newaxis, ...]
  def call(self, input, mask):
    """Encode `input` (indexed as (batch, seq_len) token ids) using `mask`."""
    x = self.embedder(input)
    sequenceLength = tf.shape(input)[1]
    x += self.pos_encoding[:, :sequenceLength, :]
    # Each layer consumes the previous layer's output, not the raw token ids.
    for layer in self.enc_layers:
      x = layer(x, mask)
    return x

In [15]:
class CustomDecoder(tf.keras.layers.Layer):
  """Token embedding + positional encoding followed by a stack of
  WeirdDecoderLayers.

  Args:
    num_layers: number of WeirdDecoderLayers.
    d_model: model/embedding width.
    dff: hidden width of the WEIRDLayers stacks.
    batch_size: forwarded to every WeirdDecoderLayer.
    num_heads: attention heads per layer.
    distinct_vocab_size: size of the target vocabulary.
    max_position: number of positions in the positional-encoding table.
  """
  def __init__(self, num_layers, d_model, dff,batch_size, num_heads, distinct_vocab_size, max_position):
    super(CustomDecoder, self).__init__()
    self.num_layers = num_layers
    self.d_model = d_model
    self.embedder = tf.keras.layers.Embedding(distinct_vocab_size, self.d_model)
    self.dec_layers = [WeirdDecoderLayer(d_model, num_heads, dff, batch_size) for _ in range(self.num_layers)]
    # positional_encoding returns a (max_position, d_model) NumPy array; cast
    # it to float32 and add a leading axis so it broadcasts over the batch
    # when added to the embeddings.
    self.pos_encoding = tf.cast(positional_encoding(max_position, d_model), tf.float32)[tf.newaxis, ...]
  def call(self, input, enc_output, look_ahead_mask, zero_mask):
    """Decode `input` (indexed as (batch, seq_len) token ids) against `enc_output`."""
    x = self.embedder(input)
    x += self.pos_encoding[:, :tf.shape(input)[1], :]
    # Each layer consumes the previous layer's output, not the raw token ids.
    for layer in self.dec_layers:
      x = layer(x, enc_output, look_ahead_mask, zero_mask)
    return x

In [16]:
class KennethsCustomTransformer(tf.keras.Model):
  """Encoder-decoder transformer with a final Dense projection to the output
  vocabulary.

  Args:
    dec_num_layers: number of decoder layers.
    enc_num_layers: number of encoder layers.
    d_model: model/embedding width.
    dff: hidden width of the WEIRDLayers stacks.
    num_heads: attention heads per layer.
    max_position: number of positions in the positional-encoding tables.
    input_vocab_size: vocabulary size for the encoder embedding.
    output_vocab_size: vocabulary size for the decoder embedding and the
      final Dense layer.
    batch_size: forwarded to the encoder and decoder.
    dec_zero_mask: mask for the decoder's encoder-decoder attention.
    enc_zero_mask: mask for the encoder self-attention.
    dec_look_ahead_mask: mask for the decoder self-attention.
  """
  def __init__(self, dec_num_layers, enc_num_layers, d_model, dff, num_heads, max_position, input_vocab_size,output_vocab_size, batch_size, dec_zero_mask, enc_zero_mask
               ,dec_look_ahead_mask):
    super(KennethsCustomTransformer, self).__init__()
    # The encoder embeds source tokens, the decoder embeds target tokens.
    self.encoder = CustomEncoder(num_heads, enc_num_layers, d_model, batch_size, dff, input_vocab_size, max_position)
    self.decoder = CustomDecoder(dec_num_layers, d_model, dff, batch_size, num_heads, output_vocab_size, max_position)
    self.dense = tf.keras.layers.Dense(output_vocab_size) #The number of units equals the number of outputs we are considering at each step.
    #The number of different dense layers corresponds to the number of different outputs we want. If this were question answering we'd have two dense layers that we can connect
    #together. However, here we are assuming a text-generation probability output that we choose the next word from. So we will use one output with an option size equal
    #to the entire output vocabulary size.
    # Keep the masks on the instance so call() can reach them.
    self.dec_zero_mask = dec_zero_mask
    self.enc_zero_mask = enc_zero_mask
    self.dec_look_ahead_mask = dec_look_ahead_mask
  def call(self, input, label):
    """Return the Dense(output_vocab_size) projection of the decoder output.

    Args:
      input: source token ids fed to the encoder.
      label: target token ids fed to the decoder.
    """
    enc_out = self.encoder(input, self.enc_zero_mask)
    dec_out = self.decoder(label, enc_out, self.dec_look_ahead_mask, self.dec_zero_mask)
    finalout = self.dense(dec_out)
    return finalout

In [20]:
#Now, to create an instance, we simply have to define the hyperparameters as well as the regular parameters, and then fit to our data!
#The preprocessing step is (usually) pretty simple, so once you have that set up all you have to do is feed in the input! We also need to take into account making the masks from the input
#we have. We could incorporate that straight into the model, but I'm a bit lazy and unfortunately I have a lot of other work to do for school.
#Anyways, this is my personally made weight transformer!

I made this to show, again, familiarity with building advanced models. I added randomness to show this. I will be using this model to process some data and labels a bit later on, but this is just the start. Next up on my list, I believe I will be making different transformer variations and testing them to see which performs best.