In [302]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.layers import Layer
from tensorflow.keras.initializers import GlorotUniform
from tensorflow.keras.activations import softmax
from tensorflow.keras.models import Model

In [533]:
# Dummy batch used to try out the layers below: 10 rows of 6 values, all ones.
x = tf.ones((10, 6), dtype=tf.float32)
x.shape

TensorShape([10, 6])

# Embeddings
Embeddings represent words (or any other discrete tokens) as vectors of numbers that a machine learning model can work with. In this notebook we will see how to use embeddings inside a model, relying on `tf.keras.layers.Embedding` for this purpose.
Later I may create my own embeddings using the word2vec algorithm.

In [534]:
# Embed the dummy batch with a 10-entry table of 512-dimensional vectors and show the output shape.
tf.keras.layers.Embedding(input_dim=10, output_dim=512)(x).shape

TensorShape([10, 6, 512])

# Positional Encoding

In [311]:
class positional_encoding(Layer):
    """Adds a fixed sinusoidal position table to its input.

    The table has one row per position (``input_shape[1]``) and one column per
    feature (``input_shape[-1]``). For position ``p`` and feature index ``i``
    the angle is ``p / 10000 ** (2 * (i // 2) / depth)``; even feature indices
    take the sine of the angle and odd ones the cosine.
    """

    def __init__(self, **kwargs):
        super(positional_encoding, self).__init__(**kwargs)

    def build(self, input_shape):
        """Pre-computes the (seq_len, depth) table for the given input shape.

        Raises:
            ValueError: if the sequence length or the feature size is unknown.
        """
        seq_len, depth = input_shape[1], input_shape[-1]
        if seq_len is None or depth is None:
            raise ValueError(
                'positional_encoding needs a known sequence length and feature size, '
                f'got input shape {input_shape}'
            )
        positions = np.arange(seq_len, dtype=np.float32)[:, np.newaxis]
        dims = np.arange(depth, dtype=np.float32)[np.newaxis, :]
        angles = positions / np.power(10000.0, 2 * (dims // 2) / depth)
        # Kept as a NumPy array on purpose: a tensor created here would belong to
        # whatever graph is being traced if the layer is first built inside a
        # tf.function, and could not be reused outside of it.
        self.pe = np.where(dims % 2 == 1, np.cos(angles), np.sin(angles)).astype(np.float32)

    def call(self, x):
        """Returns ``x`` plus the position table (broadcast over the leading axis)."""
        return x + tf.cast(self.pe, x.dtype)


## Multi-head Attention Mechanism

In [296]:
class multi_head_attention(Layer):
    """Multi-head scaled dot-product self-attention.

    Queries, keys and values are all linear projections of the same input.
    The projected features are split into ``heads`` equal chunks, attention is
    computed independently per chunk, and the chunks are concatenated again
    before the output projection ``WO``.

    Args:
        heads: number of attention heads; must divide the input feature size.
    """

    def __init__(self, heads=8, **kwargs):
        super(multi_head_attention, self).__init__(**kwargs)
        self.Q_initializer = GlorotUniform()
        self.K_initializer = GlorotUniform()
        self.V_initializer = GlorotUniform()
        self.WO_initializer = GlorotUniform()
        self.heads = heads

    def build(self, input_shape):
        """Creates the four square (n_dims, n_dims) projection matrices.

        Raises:
            ValueError: if the feature size is not divisible by ``heads``.
        """
        self.n_dims = input_shape[-1]
        if self.n_dims % self.heads != 0:
            raise ValueError(
                f'feature size {self.n_dims} is not divisible by the number of heads {self.heads}'
            )
        # Width of the slice each head works on; used to scale the attention scores.
        self.head_dims = self.n_dims // self.heads
        self.WQ = self.add_weight(shape=(self.n_dims, self.n_dims), initializer=self.Q_initializer, trainable=True)
        self.WK = self.add_weight(shape=(self.n_dims, self.n_dims), initializer=self.K_initializer, trainable=True)
        self.WV = self.add_weight(shape=(self.n_dims, self.n_dims), initializer=self.V_initializer, trainable=True)
        self.WO = self.add_weight(shape=(self.n_dims, self.n_dims), initializer=self.WO_initializer, trainable=True)

    def call(self, x):
        """Applies self-attention to ``x`` of shape (batch_size, seq_len, n_dims)."""
        Q = x @ self.WQ
        K = x @ self.WK
        V = x @ self.WV

        # Split the feature axis into heads and stack them on a new leading axis:
        # (heads, batch_size, seq_len, head_dims).
        Q = tf.stack(tf.split(Q, self.heads, axis=2))
        K = tf.stack(tf.split(K, self.heads, axis=2))
        V = tf.stack(tf.split(V, self.heads, axis=2))
        return self.attention(Q, K, V) @ self.WO

    def attention(self, Q, K, V):
        """Scaled dot-product attention on head-stacked tensors.

        Returns the per-head results concatenated back along the last axis.
        """
        scores = tf.matmul(Q, K, transpose_b=True)
        # Scale by the per-head width (the size of the dot product that produced
        # each score), not by the full model width.
        weights = softmax(scores / np.sqrt(self.head_dims), axis=-1)
        attended = tf.matmul(weights, V)
        return tf.concat(tf.unstack(attended, axis=0), axis=-1)

### Layer Normalization

In [297]:
class add_n_norm(Layer):
    """Residual connection followed by layer normalization.

    Computes ``LayerNorm(x + x_i)`` over the last axis, with a learned
    per-feature scale (``gamma``) and shift (``beta``).

    Args:
        epsilon: small constant added to the variance for numerical stability.
    """

    def __init__(self, epsilon=1e-6):
        super(add_n_norm, self).__init__()
        self.epsilon = epsilon

    def build(self, input_shape):
        self.n_dims = input_shape[-1]
        self.gamma = self.add_weight(shape=(self.n_dims,), initializer='ones', trainable=True)
        self.beta = self.add_weight(shape=(self.n_dims,), initializer='zeros', trainable=True)

    def call(self, x, x_i):
        """Normalizes the sum of a sublayer's output and its input.

        Args:
            x: output of the sublayer.
            x_i: input of the sublayer (the residual branch).

        Because the two tensors are summed before normalizing, the result does
        not depend on which of them is passed first.
        """
        # Add first, then normalize: the residual sum is what gets normalized.
        summed = x + x_i
        mean = tf.reduce_mean(summed, axis=-1, keepdims=True)
        variance = tf.math.reduce_variance(summed, axis=-1, keepdims=True)
        normalized = (summed - mean) / tf.sqrt(variance + self.epsilon)
        return normalized * self.gamma + self.beta

# Feed Forward Neural Network

In [298]:
class dense_layer(Layer):
    """Fully connected layer: ``activation(x @ w + b)``.

    Args:
        n_out: number of output features.
        activation: callable applied to the affine output, or ``None`` for a
            purely linear layer.
        **kwargs: forwarded to ``Layer`` (e.g. ``name``).
    """

    def __init__(self, n_out, activation, **kwargs):
        # No fixed name is passed, so Keras assigns a unique one per instance;
        # a shared hard-coded name makes every dense layer in a model collide.
        super().__init__(**kwargs)
        self.w_initializer = GlorotUniform()
        # Glorot scaling is derived from a weight matrix's fan-in/fan-out and
        # has no meaning for a bias vector; start biases at zero instead.
        self.b_initializer = tf.keras.initializers.Zeros()
        self.n_out = n_out
        self.activation = activation

    def build(self, input_shape):
        self.w = self.add_weight(shape=[input_shape[-1], self.n_out], initializer=self.w_initializer, trainable=True)
        self.b = self.add_weight(shape=[self.n_out,], initializer=self.b_initializer, trainable=True)

    def call(self, x):
        """Applies the affine map on the last axis of ``x``, then the activation if any."""
        z = x @ self.w + self.b
        if self.activation is not None:
            z = self.activation(z)
        return z

# Masked Multi-head Attention Mechanism

In [358]:
class masked_multi_head_attention(Layer):
    """Multi-head scaled dot-product attention with a causal (look-back only) mask.

    Keys and values are projected from ``x`` and queries from ``Q``. Score
    entries above the main diagonal are set to ``-inf`` before the softmax, so
    query position ``i`` only attends to key positions ``0..i``.

    Args:
        heads: number of attention heads; must divide the input feature size.
    """

    def __init__(self, heads=8, **kwargs):
        super(masked_multi_head_attention, self).__init__(**kwargs)
        self.Q_initializer = GlorotUniform()
        self.K_initializer = GlorotUniform()
        self.V_initializer = GlorotUniform()
        self.WO_initializer = GlorotUniform()
        self.heads = heads

    def build(self, input_shape):
        """Creates the projection matrices and the additive causal mask.

        ``input_shape`` is the shape of ``x`` (the first argument of ``call``);
        the mask is square with side ``input_shape[1]``.

        Raises:
            ValueError: if the feature size is not divisible by ``heads``.
        """
        self.n_dims = input_shape[-1]
        if self.n_dims % self.heads != 0:
            raise ValueError(
                f'feature size {self.n_dims} is not divisible by the number of heads {self.heads}'
            )
        # Width of the slice each head works on; used to scale the attention scores.
        self.head_dims = self.n_dims // self.heads
        self.WQ = self.add_weight(shape=(self.n_dims, self.n_dims), initializer=self.Q_initializer, trainable=True)
        self.WK = self.add_weight(shape=(self.n_dims, self.n_dims), initializer=self.K_initializer, trainable=True)
        self.WV = self.add_weight(shape=(self.n_dims, self.n_dims), initializer=self.V_initializer, trainable=True)
        self.WO = self.add_weight(shape=(self.n_dims, self.n_dims), initializer=self.WO_initializer, trainable=True)
        seq_len = input_shape[1]
        # Additive mask: 0 on and below the diagonal, -inf strictly above it.
        # Stored as a NumPy array so it is not tied to a graph being traced.
        self.a = np.triu(np.full((seq_len, seq_len), -np.inf, dtype=np.float32), k=1)

    def call(self, x, Q):
        """Applies masked attention.

        Args:
            x: source of keys and values, shape (batch_size, seq_len, n_dims).
            Q: source of queries; its sequence length must be compatible with
                the (seq_len, seq_len) mask built from ``x``.
        """
        Q = Q @ self.WQ
        K = x @ self.WK
        V = x @ self.WV

        # Split the feature axis into heads and stack them on a new leading axis.
        Q = tf.stack(tf.split(Q, self.heads, axis=2))
        K = tf.stack(tf.split(K, self.heads, axis=2))
        V = tf.stack(tf.split(V, self.heads, axis=2))
        return self.attention(Q, K, V) @ self.WO

    def attention(self, Q, K, V):
        """Masked scaled dot-product attention on head-stacked tensors."""
        # Scale by the per-head width (the size of the dot product that produced
        # each score), not by the full model width.
        scores = tf.matmul(Q, K, transpose_b=True) / np.sqrt(self.head_dims)
        scores = scores + tf.cast(self.a, scores.dtype)  # -inf entries get zero weight
        weights = softmax(scores, axis=-1)
        attended = tf.matmul(weights, V)
        return tf.concat(tf.unstack(attended, axis=0), axis=-1)

# Linear transformation

In [378]:
class linear(Layer):
    """Bias-free linear projection followed by a softmax over the last axis.

    Args:
        n_out: number of output classes / features.
        **kwargs: forwarded to ``Layer`` (e.g. ``name``).
    """

    def __init__(self, n_out, **kwargs):
        super(linear, self).__init__(**kwargs)
        self.n_out = n_out

    def build(self, input_shape):
        # Glorot initialisation, like the other projection matrices in this
        # notebook. An all-ones matrix makes every output unit compute the same
        # value, so the initial softmax is uniform whatever the input.
        self.w = self.add_weight(
            shape=(input_shape[-1], self.n_out),
            initializer=GlorotUniform(),
            trainable=True,
        )

    def call(self, x):
        """Returns ``softmax(x @ w)`` computed along the last axis."""
        return softmax(x @ self.w, axis=-1)

# Encoder

In [440]:
class encoder_layer(Layer):
    """One encoder block: self-attention and a two-layer feed-forward network,
    each followed by an ``add_n_norm`` residual step.

    Args:
        n_heads: number of attention heads.
        d_model: feature size of the block's input and output; the second
            feed-forward layer projects back to this size so the residual
            shapes match.
        d_ff: hidden size of the feed-forward network.
    """

    def __init__(self, n_heads = 8, d_model = 512, d_ff = 2048):
        super(encoder_layer, self).__init__()
        self.mha = multi_head_attention(n_heads)
        self.add_norm1 = add_n_norm()
        self.dense = dense_layer(d_ff, tf.nn.relu)
        self.dense1 = dense_layer(d_model, None)
        self.add_norm2 = add_n_norm()

    def call(self, x):
        """Transforms ``x`` and returns a tensor of the same shape."""
        attended = self.mha(x)
        x = self.add_norm1(attended, x)
        hidden = self.dense(x)
        projected = self.dense1(hidden)
        # add_n_norm takes (sublayer output, sublayer input), same order as above.
        return self.add_norm2(projected, x)

In [451]:
class encoder(Layer):
    """Embedding, positional encoding and a stack of ``encoder_layer`` blocks.

    Args:
        n_layers: number of encoder blocks.
        n_heads: number of attention heads in each block.
        input_dims: size of the embedding table (first ``Embedding`` argument).
        output_dims: width of each embedding vector (second ``Embedding`` argument).
    """

    def __init__(self, n_layers=6, n_heads=8, input_dims=6, output_dims=512):
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(input_dims, output_dims)
        self.pe = positional_encoding()
        self.layers = [encoder_layer(n_heads=n_heads) for _ in range(n_layers)]

    def call(self, x):
        """Embeds ``x``, adds the positional encoding and runs every block in order."""
        hidden = self.pe(self.embedding(x))
        for block in self.layers:
            hidden = block(hidden)
        return hidden

# Decoder

In [473]:
class _cross_attention(multi_head_attention):
    """Unmasked multi-head attention whose queries and keys/values come from
    different tensors. Reuses the weights and ``attention`` of
    ``multi_head_attention``; only the source of the queries differs.
    """

    def call(self, x, Q):
        # Same argument order as masked_multi_head_attention.call:
        # x feeds the keys and values, Q feeds the queries.
        Q = Q @ self.WQ
        K = x @ self.WK
        V = x @ self.WV

        Q = tf.stack(tf.split(Q, self.heads, axis=2))
        K = tf.stack(tf.split(K, self.heads, axis=2))
        V = tf.stack(tf.split(V, self.heads, axis=2))
        return self.attention(Q, K, V) @ self.WO


class decoder_layer(Layer):
    """One decoder block.

    Three sublayers, each followed by an ``add_n_norm`` residual step:
    masked self-attention over the decoder input, attention over the encoder
    output (queries from the decoder, keys/values from the encoder), and a
    two-layer feed-forward network.

    Args:
        n_heads: number of attention heads.
        d_model: feature size of the block's input and output.
        d_ff: hidden size of the feed-forward network.
    """

    def __init__(self, n_heads = 8, d_model = 512, d_ff = 2048):
        super(decoder_layer, self).__init__()
        self.mha1 = masked_multi_head_attention(n_heads)
        self.add_norm1 = add_n_norm()
        self.mha2 = _cross_attention(n_heads)
        self.add_norm2 = add_n_norm()
        self.dense = dense_layer(d_ff, tf.nn.relu)
        self.dense1 = dense_layer(d_model, None)
        self.add_norm3 = add_n_norm()

    def call(self, x, enc):
        """Transforms the decoder stream ``x`` using the encoder output ``enc``.

        Returns a tensor with the same shape as ``x``.
        """
        # Masked self-attention: queries, keys and values all come from x, so
        # the causal mask restricts each position to itself and earlier ones.
        x1 = self.mha1(x, x)
        x = self.add_norm1(x1, x)
        # Encoder-decoder attention: the decoder asks (queries), the encoder
        # answers (keys/values). No mask: the whole encoder output is visible.
        x1 = self.mha2(enc, x)
        x = self.add_norm2(x1, x)
        # Position-wise feed-forward network.
        x1 = self.dense1(self.dense(x))
        # add_n_norm takes (sublayer output, sublayer input) in every call here.
        return self.add_norm3(x1, x)

In [511]:
class decoder(Layer):
    """Input projection, positional encoding, a stack of ``decoder_layer``
    blocks and a three-layer dense head.

    Args:
        n_layers: number of decoder blocks.
        n_heads: number of attention heads in each block.
        output_dims: width the raw decoder input is projected to before the blocks.
        input_dims: width of the final dense layer, i.e. of the returned tensor's last axis.
    """

    def __init__(self, n_layers=6, n_heads=8, output_dims=512, input_dims=6):
        super().__init__()
        self.transform_input = dense_layer(output_dims, None)
        self.pe = positional_encoding()
        self.layers = [decoder_layer(n_heads=n_heads) for _ in range(n_layers)]
        self.dense = dense_layer(512, tf.nn.relu)
        self.dense1 = dense_layer(256, None)
        self.dense2 = dense_layer(input_dims, None)

    def call(self, x, enc):
        """Runs ``x`` through the decoder, giving every block the encoder output ``enc``."""
        hidden = self.pe(self.transform_input(x))
        for block in self.layers:
            hidden = block(hidden, enc)
        # Dense head: 512 (ReLU) -> 256 -> input_dims.
        for head in (self.dense, self.dense1, self.dense2):
            hidden = head(hidden)
        return hidden

# Transformer
For this transformer I did not use the linear transformation layer, because I am trying to predict the secondary structure of an RNA. Since the output is an adjacency matrix rather than a probability distribution over a vocabulary, I did not see the need for the final linear + softmax layer.

In [522]:
class Transformer(Model):
    """Encoder-decoder model built from the ``encoder`` and ``decoder`` layers.

    Args:
        input_dims: forwarded to both the encoder (embedding table size) and
            the decoder (width of its final dense layer).
        output_dims: accepted for interface compatibility but not used.
            NOTE(review): presumably meant to set the decoder's output width;
            confirm before wiring it in, since callers rely on ``input_dims``
            for that today.
        n_layers: number of blocks in the encoder and in the decoder.
        n_heads: number of attention heads per block.
    """

    def __init__(self, input_dims=6, output_dims=6, n_layers=6, n_heads=8):
        super(Transformer, self).__init__()
        self.encoder = encoder(n_layers= n_layers, n_heads= n_heads, input_dims=input_dims)
        self.decoder = decoder(n_layers= n_layers, n_heads= n_heads, input_dims=input_dims)

    def call(self, x, y):
        """Encodes ``x`` and decodes ``y`` against the encoder output."""
        encoded = self.encoder(x)
        return self.decoder(y, encoded)

    def compile(self, optimizer, loss):
        """Configures the optimizer and loss used by ``train_step``/``test_step``.

        Both may be given as objects or as Keras string identifiers
        (e.g. ``'adam'``, ``'mse'``).
        """
        # Let Keras resolve and register the optimizer instead of storing the
        # raw argument, so string identifiers work too.
        super(Transformer, self).compile(optimizer=optimizer)
        # train_step calls self.loss directly, so it must be a callable.
        self.loss = tf.keras.losses.get(loss)

    def train_step(self, data):
        """One optimisation step on a ``(x, y)`` batch.

        The targets ``y`` are also passed to the model as the decoder input.
        """
        x, y = data
        with tf.GradientTape() as tape:
            y_pred = self(x, y)
            loss = self.loss(y, y_pred)
        gradients = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
        return {'loss': loss}

    def test_step(self, data):
        """Evaluation counterpart of ``train_step``.

        Needed because ``call`` requires ``y``, which the default Keras
        evaluation step does not pass.
        """
        x, y = data
        y_pred = self(x, y)
        return {'loss': self.loss(y, y_pred)}

In [529]:
# Dummy batch of 10 samples: x is (10, 300) for the encoder, y is (10, 300, 300) for the decoder.
x = tf.ones((10, 300), dtype=tf.float32)
y = tf.ones((10, 300, 300), dtype=tf.float32)

In [530]:
# Instantiate the model, run one forward pass on the dummy batch, then print the parameter summary.
t = Transformer(input_dims=300)
z = t(x, y)
t.summary()

Model: "transformer_32"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder_20 (encoder)        multiple                  19055616  
                                                                 
 decoder_29 (decoder)        multiple                  25824812  
                                                                 
Total params: 44880428 (171.21 MB)
Trainable params: 44880428 (171.21 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [531]:
# Display the output of the forward pass.
z

<tf.Tensor: shape=(10, 300, 300), dtype=float32, numpy=
array([[[-1.0702186 ,  2.1606808 , -0.5551263 , ...,  1.0365806 ,
          2.3754857 , -0.84484637],
        [-1.0707722 ,  2.1600933 , -0.5549633 , ...,  1.0360175 ,
          2.377528  , -0.8467401 ],
        [-1.0707887 ,  2.1593242 , -0.5539858 , ...,  1.0356475 ,
          2.3789375 , -0.8486698 ],
        ...,
        [-1.0727466 ,  2.1484785 , -0.5517084 , ...,  1.0421611 ,
          2.3714552 , -0.8509084 ],
        [-1.0727916 ,  2.1479397 , -0.55349755, ...,  1.0425571 ,
          2.3714054 , -0.85161   ],
        [-1.0724458 ,  2.148208  , -0.5547035 , ...,  1.043347  ,
          2.371759  , -0.85240537]],

       [[-1.0702186 ,  2.1606808 , -0.5551263 , ...,  1.0365806 ,
          2.3754857 , -0.84484637],
        [-1.0707722 ,  2.1600933 , -0.5549633 , ...,  1.0360175 ,
          2.377528  , -0.8467401 ],
        [-1.0707887 ,  2.1593242 , -0.5539858 , ...,  1.0356475 ,
          2.3789375 , -0.8486698 ],
        ...