In [2]:
import numpy as np

In [3]:
import keras

In [4]:
import tensorflow as tf

In [5]:
from tensorflow.keras.layers import Input,Embedding,Dense,Dropout,LayerNormalization,MultiHeadAttention

In [5]:
# Encoder hyperparameters used by the cells below.
num_layers = 3    # how many Encoder_Layer blocks are stacked
num_heads = 8     # attention heads per layer; d_model is split evenly across them
d_model = 512     # width of the last axis of every layer input/output
dff = 2048        # hidden width of the feed-forward sub-layer
dropout = 0.2     # rate handed to every Dropout layer

In [6]:
def scaled_dot_product(q, k, v, mask=None):
    """Scaled dot-product attention: ``softmax(q @ k^T / sqrt(depth)) @ v``.

    Args:
        q: Query tensor; its last two axes are (query_len, depth).
        k: Key tensor; its last two axes are (key_len, depth).
        v: Value tensor; its last two axes are (key_len, value_depth).
        mask: Optional tensor that is added to the scaled scores before the
            softmax. It must broadcast against (..., query_len, key_len).

    Returns:
        A ``(values, attention_wts)`` tuple where ``attention_wts`` is the
        softmax over the key axis and ``values`` is ``attention_wts @ v``.
    """
    # transpose_b swaps only the last two axes of k, so any leading
    # batch/head axes are supported rather than exactly rank 4.
    scores = tf.matmul(q, k, transpose_b=True)

    # Prefer the static depth; fall back to the dynamic shape when the last
    # dimension is unknown at trace time.
    depth = q.shape[-1]
    if depth is None:
        depth = tf.shape(q)[-1]
    scores = scores / tf.math.sqrt(tf.cast(depth, scores.dtype))

    if mask is not None:
        scores = scores + mask

    attention_wts = tf.nn.softmax(scores, axis=-1)
    # The values must be weighted by the normalised attention weights, not by
    # the raw (pre-softmax) scores.
    values = tf.matmul(attention_wts, v)
    return values, attention_wts

In [28]:
class MultiHead_Attention(tf.keras.layers.Layer):
    """Multi-head self-attention built on a single fused q/k/v projection.

    The input is projected to ``3 * d_model`` features, split into
    ``no_of_heads`` heads, passed through ``scaled_dot_product`` and the heads
    are concatenated again before a final ``Dense(d_model)`` projection.

    Args:
        no_of_heads: Number of attention heads.
        d_model: Width of the input/output feature axis.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``no_of_heads``.
    """

    def __init__(self, no_of_heads, d_model):
        super(MultiHead_Attention, self).__init__()
        if d_model % no_of_heads != 0:
            # Otherwise the per-head reshape in call() cannot succeed.
            raise ValueError(
                f"d_model ({d_model}) must be divisible by no_of_heads ({no_of_heads})"
            )
        self.n_heads = no_of_heads
        self.head_dims = d_model // no_of_heads
        self.qkv = Dense(3 * d_model)
        self.dense = Dense(d_model)

    def call(self, inputs):
        """Apply self-attention to ``inputs`` of shape (batch, seq_len, d_model).

        Returns a tensor with the same three leading dimensions as ``inputs``
        and ``n_heads * head_dims`` features on the last axis.
        """
        # Use static dimensions when they are known and dynamic ones otherwise,
        # so the layer also works when batch or sequence length is None.
        dynamic_shape = tf.shape(inputs)
        batch_size = inputs.shape[0] if inputs.shape[0] is not None else dynamic_shape[0]
        max_inp_len = inputs.shape[1] if inputs.shape[1] is not None else dynamic_shape[1]

        qkv = self.qkv(inputs)
        qkv = tf.reshape(qkv, (batch_size, max_inp_len, self.n_heads, 3 * self.head_dims))
        # -> (batch, heads, seq_len, 3 * head_dims)
        qkv = tf.transpose(qkv, perm=(0, 2, 1, 3))
        q, k, v = tf.split(qkv, num_or_size_splits=3, axis=3)

        v, _ = scaled_dot_product(q, k, v)

        # Move the head axis back next to the feature axis before merging;
        # reshaping (batch, heads, seq_len, head_dims) directly would mix
        # features from different positions and heads.
        v = tf.transpose(v, perm=(0, 2, 1, 3))
        v = tf.reshape(v, (batch_size, max_inp_len, self.n_heads * self.head_dims))
        return self.dense(v)

In [29]:
class FeedForward(tf.keras.layers.Layer):
    """Two-layer feed-forward block: Dense(ffc, relu) -> Dropout -> Dense(d_model).

    Args:
        d_model: Number of units of the output projection.
        ffc: Number of units of the hidden ReLU projection.
        dropout: Rate of the dropout applied to the hidden activations.
    """

    def __init__(self, d_model, ffc, dropout):
        super().__init__()
        self.dense1 = Dense(units=ffc, activation="relu")
        self.dense2 = Dense(units=d_model)
        self.dropout = Dropout(rate=dropout)

    def call(self, inputs, training=False):
        """Expand, apply dropout (governed by ``training``) and project back."""
        hidden = self.dropout(self.dense1(inputs), training=training)
        return self.dense2(hidden)

In [30]:
class LayerNormalization(tf.keras.layers.Layer):
    """Normalise the last axis to zero mean / unit variance, then scale and shift.

    NOTE(review): this class reuses the name ``LayerNormalization`` that is
    also imported from ``tensorflow.keras.layers`` at the top of the file, so
    after this cell runs the name refers to this class.

    Args:
        parameter_shape: Shape of the learned ``gamma``/``beta`` weights. A
            bare int ``n`` is treated as the 1-D shape ``(n,)``.
        eps: Constant added to the variance before the square root.
    """

    def __init__(self, parameter_shape, eps=1e-5):
        super(LayerNormalization, self).__init__()
        self.parameter_shape = parameter_shape
        self.eps = eps

        # Keras 3 requires an iterable shape; wrap a bare int so both
        # ``LayerNormalization(512)`` and ``LayerNormalization((512,))`` work.
        if isinstance(parameter_shape, int):
            weight_shape = (parameter_shape,)
        else:
            weight_shape = parameter_shape

        # Pass everything by keyword: Keras 2 takes ``name`` as the first
        # positional argument of add_weight while Keras 3 takes ``shape``.
        self.gamma = self.add_weight(
            name="gamma", shape=weight_shape, initializer="ones", trainable=True
        )
        self.beta = self.add_weight(
            name="beta", shape=weight_shape, initializer="zeros", trainable=True
        )

    def call(self, inputs):
        """Return ``gamma * (inputs - mean) / sqrt(var + eps) + beta`` over the last axis."""
        mean = tf.reduce_mean(inputs, axis=-1, keepdims=True)
        centered = inputs - mean
        var = tf.reduce_mean(tf.square(centered), axis=-1, keepdims=True)
        std = tf.sqrt(var + self.eps)
        return self.gamma * (centered / std) + self.beta

In [31]:
class Encoder_Layer(tf.keras.layers.Layer):
    """One encoder block: self-attention and feed-forward sub-layers.

    Each sub-layer output goes through dropout, is added to the sub-layer's
    input (residual connection) and the sum is layer-normalised.

    Args:
        num_of_heads: Number of attention heads.
        d_model: Width of the feature axis.
        dff: Hidden width of the feed-forward sub-layer.
        dropout: Dropout rate used after both sub-layers and inside the
            feed-forward sub-layer.
    """

    def __init__(self, num_of_heads, d_model, dff, dropout):
        super(Encoder_Layer, self).__init__()
        self.mha = MultiHead_Attention(num_of_heads, d_model)
        self.dropout1 = Dropout(rate=dropout)
        self.ln1 = LayerNormalization(d_model)
        self.ff = FeedForward(d_model, dff, dropout)
        self.dropout2 = Dropout(rate=dropout)
        self.ln2 = LayerNormalization(d_model)

    def call(self, inputs, training=False):
        """Run both sub-layers; ``training`` switches the dropout layers on/off."""
        # Attention sub-layer with residual connection.
        mha_output = self.mha(inputs)
        mha_output = self.dropout1(mha_output, training=training)
        ln1_output = self.ln1(mha_output + inputs)

        # Feed-forward sub-layer with residual connection. ``training`` is
        # passed by keyword: Keras 3 rejects non-tensor positional arguments,
        # and this matches how the dropout layers are called.
        ffn_output = self.ff(ln1_output, training=training)
        ffn_output = self.dropout2(ffn_output, training=training)
        ln2_output = self.ln2(ffn_output + ln1_output)
        return ln2_output

In [32]:
class Encoder(tf.keras.layers.Layer):
    """Stack of ``num_layers`` identical ``Encoder_Layer`` blocks applied in sequence.

    Args:
        num_of_heads: Number of attention heads in every block.
        d_model: Width of the feature axis.
        dff: Hidden width of each block's feed-forward sub-layer.
        dropout: Dropout rate used inside every block.
        num_layers: How many blocks to stack.
    """

    def __init__(self, num_of_heads, d_model, dff, dropout, num_layers):
        super(Encoder, self).__init__()
        self.encoder_layer = [
            Encoder_Layer(num_of_heads, d_model, dff, dropout)
            for _ in range(num_layers)
        ]

    def call(self, inputs, training=False):
        """Feed ``inputs`` through every block, forwarding the ``training`` flag."""
        x = inputs
        for layer in self.encoder_layer:
            # Keyword form: Keras 3 only accepts tensors as positional
            # arguments to a layer call, so a positional bool would raise.
            x = layer(x, training=training)
        return x

In [33]:
# Build the encoder stack from the hyperparameters defined above.
encoder = Encoder(
    num_of_heads=num_heads,
    d_model=d_model,
    dff=dff,
    dropout=dropout,
    num_layers=num_layers,
)

In [38]:
# Random dummy batch laid out as (batch, sequence length, features) = (10, 30, 512).
x = np.random.random(size=(10, 30, 512))

In [39]:
# Run the dummy batch through the encoder stack; no training flag is passed here.
output=encoder(x)

In [40]:
# Display the output shape; the recorded run shows it equals the input's (10, 30, 512).
output.shape

TensorShape([10, 30, 512])