In [1]:
import numpy as np

In [2]:
import keras

In [3]:
import tensorflow as tf

In [4]:
from tensorflow.keras import Model

In [5]:
from tensorflow.keras.layers import Embedding,Dense,Dropout

## Encoder

In [6]:
class PositionEmbedding(tf.keras.layers.Layer):
    """Sum of a token embedding and a position embedding, both with fixed weights.

    Both lookup tables are filled from ``get_position_encoding`` and created
    with ``trainable=False``, so this layer adds no trainable parameters.

    Args:
        sequence_length: Number of rows in the position table.
        vocab_size: Number of rows in the token table.
        hidden_dim: Width of every embedding vector.
    """

    def __init__(self, sequence_length, vocab_size, hidden_dim):
        super(PositionEmbedding, self).__init__()
        word_embedding_matrix = self.get_position_encoding(vocab_size, hidden_dim)
        position_embedding_matrix = self.get_position_encoding(sequence_length, hidden_dim)
        self.word_embedding_layer = Embedding(
            input_dim=vocab_size, output_dim=hidden_dim,
            weights=[word_embedding_matrix],
            trainable=False
        )
        self.position_embedding_layer = Embedding(
            input_dim=sequence_length, output_dim=hidden_dim,
            weights=[position_embedding_matrix],
            trainable=False
        )

    def get_position_encoding(self, seq_len, d, n=10000):
        """Build a ``(seq_len, d)`` sinusoidal table.

        For row ``k`` and pair index ``i`` (``0 <= i < int(d/2)``):
        ``P[k, 2i] = sin(k / n**(2i/d))`` and ``P[k, 2i+1] = cos(k / n**(2i/d))``.
        When ``d`` is odd the last column is left at zero.

        Args:
            seq_len: Number of rows.
            d: Number of columns.
            n: Base of the frequency progression.

        Returns:
            A float NumPy array of shape ``(seq_len, d)``.
        """
        half = int(d / 2)
        P = np.zeros((seq_len, d))
        # Vectorised over rows and column pairs instead of a Python double loop,
        # which gets slow once seq_len is a realistic vocabulary size.
        row_index = np.arange(seq_len)[:, np.newaxis]
        denominators = np.power(n, 2 * np.arange(half) / d)
        angles = row_index / denominators
        P[:, 0:2 * half:2] = np.sin(angles)
        P[:, 1:2 * half:2] = np.cos(angles)
        return P

    def call(self, inputs):
        """Embed ``inputs`` and add the embedding of each position along the last axis.

        Positions are ``0 .. tf.shape(inputs)[-1] - 1``.
        NOTE(review): presumably the last axis must not be longer than
        ``sequence_length``, since positions index the position table - confirm.
        """
        position_indices = tf.range(tf.shape(inputs)[-1])
        embedded_words = self.word_embedding_layer(inputs)
        embedded_indices = self.position_embedding_layer(position_indices)
        return embedded_words + embedded_indices

In [7]:
def scaled_dot_product(q,k,v,mask=None):
    """Scaled dot-product attention over rank-4 tensors.

    Args:
        q: Queries; the last axis is the per-head depth used for scaling.
        k: Keys; its last two axes are swapped before the product with ``q``.
        v: Values, multiplied by the attention weights.
        mask: Optional tensor broadcastable to the score tensor. Entries equal
            to 1 mark positions to hide and entries equal to 0 keep them (the
            convention used by the masks built in ``Transformer``).

    Returns:
        ``(values, attention_wts)``: the weighted values and the softmax
        weights over the last axis.
    """
    depth = q.shape[-1]
    scores = tf.matmul(q, tf.transpose(k, perm=(0,1,3,2))) / np.sqrt(depth)
    if mask is not None:
        # Push hidden positions to a very negative logit so softmax gives them
        # ~0 weight; adding the raw 0/1 mask would only nudge the score by +1.
        scores = scores + mask * -1e9
    attention_wts = tf.nn.softmax(scores, axis=-1)
    # Weight the values with the normalised weights, not the raw scores.
    values = tf.matmul(attention_wts, v)
    return values, attention_wts

In [8]:
class MultiHead_Attention(tf.keras.layers.Layer):
    """Multi-head self-attention over a ``(batch, seq_len, features)`` input.

    A single Dense layer produces queries, keys and values together; a second
    Dense layer projects the merged heads to ``d_model`` features.

    Args:
        no_of_heads: Number of attention heads.
        d_model: Output width; must be divisible by ``no_of_heads``.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``no_of_heads``.
    """

    def __init__(self,no_of_heads,d_model):
        super(MultiHead_Attention,self).__init__()
        if d_model % no_of_heads != 0:
            raise ValueError("d_model must be divisible by no_of_heads")
        self.n_heads=no_of_heads
        self.head_dims=d_model//no_of_heads
        self.qkv=Dense(3*d_model)
        self.dense=Dense(d_model)

    def call(self,inputs,mask=None):
        """Apply self-attention.

        Args:
            inputs: Rank-3 tensor ``(batch, seq_len, features)``.
            mask: Optional mask forwarded to ``scaled_dot_product``. ``None``
                or ``False`` means no masking.

        Returns:
            Tensor of shape ``(batch, seq_len, d_model)``.
        """
        # Some callers use False as their "no mask" default.
        if mask is False:
            mask = None
        # Prefer the static length; fall back to the dynamic one when unknown.
        seq_len = inputs.shape[1]
        if seq_len is None:
            seq_len = tf.shape(inputs)[1]
        qkv = self.qkv(inputs)
        # -1 lets the batch dimension stay unknown (e.g. inside tf.function).
        qkv = tf.reshape(qkv, (-1, seq_len, self.n_heads, 3 * self.head_dims))
        qkv = tf.transpose(qkv, perm=(0,2,1,3))
        q,k,v = tf.split(qkv, num_or_size_splits=3, axis=3)
        context,attention_wts = scaled_dot_product(q,k,v,mask)
        # Back to (batch, seq_len, heads, head_dims) before merging heads;
        # reshaping the (batch, heads, seq_len, head_dims) layout directly
        # would interleave positions and heads.
        context = tf.transpose(context, perm=(0,2,1,3))
        context = tf.reshape(context, (-1, seq_len, self.n_heads*self.head_dims))
        return self.dense(context)

In [9]:
class FeedForward(tf.keras.layers.Layer):
    """Two-layer feed-forward block: Dense(ffc, relu) -> Dropout -> Dense(d_model).

    Args:
        d_model: Width of the output projection.
        ffc: Width of the hidden ReLU layer.
        dropout: Dropout rate applied to the hidden activations.
    """

    def __init__(self, d_model, ffc, dropout):
        super().__init__()
        self.dense1 = Dense(ffc, activation="relu")
        self.dense2 = Dense(d_model)
        self.dropout = Dropout(rate=dropout)

    def call(self, inputs, training=False):
        """Return ``dense2(dropout(dense1(inputs)))``; ``training`` controls the dropout."""
        hidden = self.dropout(self.dense1(inputs), training=training)
        return self.dense2(hidden)

In [10]:
class LayerNormalization(tf.keras.layers.Layer):
    """Layer normalisation over the last axis with a learned scale and shift.

    Args:
        parameter_shape: Shape of ``gamma`` and ``beta``; an int or a sequence
            of ints.
        eps: Constant added to the variance before the square root.
    """

    def __init__(self, parameter_shape, eps=1e-5):
        super(LayerNormalization, self).__init__()
        self.parameter_shape = parameter_shape
        self.eps = eps
        # Accept a bare int as well as a sequence of dimensions.
        try:
            weight_shape = tuple(parameter_shape)
        except TypeError:
            weight_shape = (parameter_shape,)
        # Keyword arguments only: the positional order of add_weight differs
        # between Keras versions (name first vs. shape first).
        self.gamma = self.add_weight(name="gamma", shape=weight_shape, initializer="ones", trainable=True)
        self.beta = self.add_weight(name="beta", shape=weight_shape, initializer="zeros", trainable=True)

    def call(self, inputs):
        """Normalise ``inputs`` along the last axis, then apply ``gamma`` and ``beta``."""
        mean = tf.reduce_mean(inputs, axis=-1, keepdims=True)
        var = tf.reduce_mean(tf.square(inputs - mean), axis=-1, keepdims=True)
        std = tf.sqrt(var + self.eps)
        y = (inputs - mean) / std
        out = self.gamma * y + self.beta
        return out

In [11]:
class Encoder_Layer(tf.keras.layers.Layer):
    """One encoder block: self-attention, then feed-forward.

    Each sub-layer is followed by dropout, a residual addition and
    LayerNormalization.

    Args:
        num_of_heads: Number of attention heads.
        d_model: Model width used by attention, feed-forward output and norms.
        dff: Hidden width of the feed-forward block.
        dropout: Dropout rate used throughout the block.
    """

    def __init__(self, num_of_heads, d_model, dff, dropout):
        super().__init__()
        self.mha = MultiHead_Attention(num_of_heads, d_model)
        self.dropout1 = Dropout(rate=dropout)
        self.ln1 = LayerNormalization(d_model)
        self.ff = FeedForward(d_model, dff, dropout)
        self.dropout2 = Dropout(rate=dropout)
        self.ln2 = LayerNormalization(d_model)

    def call(self, inputs, mask, training):
        """Run the block on ``inputs``; ``mask`` is handed to the attention layer."""
        # Attention sub-layer with residual connection.
        attended = self.dropout1(self.mha(inputs, mask), training=training)
        attn_normed = self.ln1(attended + inputs)

        # Feed-forward sub-layer with residual connection.
        transformed = self.dropout2(self.ff(attn_normed, training), training=training)
        return self.ln2(transformed + attn_normed)

In [12]:
class Encoder(tf.keras.layers.Layer):
    """Embedding followed by a stack of ``Encoder_Layer`` blocks.

    Args:
        inp_seq_len: Number of positions in the position-embedding table.
        inp_vocab_size: Number of rows in the token-embedding table.
        embed_dim: Embedding width. Each encoder layer adds its input to a
            ``d_model``-wide attention output, so this is expected to equal
            ``d_model``.
        num_of_heads: Attention heads per layer.
        d_model: Model width.
        dff: Hidden width of the feed-forward blocks.
        dropout: Dropout rate.
        num_layers: Number of stacked encoder layers.
    """

    def __init__(self,inp_seq_len, inp_vocab_size,embed_dim,num_of_heads,d_model,dff,dropout,num_layers):
        super(Encoder,self).__init__()
        self.embedding_layer = PositionEmbedding(inp_seq_len, inp_vocab_size,embed_dim)
        self.encoder_layer=[Encoder_Layer(num_of_heads,d_model,dff,dropout) for _ in range(num_layers)]

    def call(self,inputs,mask=None,training=False):
        """Embed ``inputs`` and pass them through every encoder layer in order.

        Args:
            inputs: Token ids handed to the embedding layer.
            mask: Optional attention mask passed to every layer. The default is
                ``None`` because ``scaled_dot_product`` treats only ``None`` as
                "no mask".
            training: Forwarded to each layer (controls dropout).
        """
        x=self.embedding_layer(inputs)
        for layer in self.encoder_layer:
            x=layer(x,mask,training)
        return x

## Decoder

In [13]:
class MaskedMultiHead_Attention(tf.keras.layers.Layer):
    """Multi-head self-attention that forwards a mask to the attention scores.

    A single Dense layer produces queries, keys and values together; a second
    Dense layer projects the merged heads to ``d_model`` features.

    Args:
        no_of_heads: Number of attention heads.
        d_model: Output width; must be divisible by ``no_of_heads``.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``no_of_heads``.
    """

    def __init__(self,no_of_heads,d_model):
        super(MaskedMultiHead_Attention,self).__init__()
        if d_model % no_of_heads != 0:
            raise ValueError("d_model must be divisible by no_of_heads")
        self.n_heads=no_of_heads
        self.head_dims=d_model//no_of_heads
        self.qkv=Dense(3*d_model)
        self.dense=Dense(d_model)

    def call(self,inputs,mask=None):
        """Apply masked self-attention.

        Args:
            inputs: Rank-3 tensor ``(batch, seq_len, features)``.
            mask: Optional mask forwarded to ``scaled_dot_product``.

        Returns:
            Tensor of shape ``(batch, seq_len, d_model)``.
        """
        # Prefer the static length; fall back to the dynamic one when unknown.
        seq_len = inputs.shape[1]
        if seq_len is None:
            seq_len = tf.shape(inputs)[1]
        qkv = self.qkv(inputs)
        # -1 lets the batch dimension stay unknown (e.g. inside tf.function).
        qkv = tf.reshape(qkv, (-1, seq_len, self.n_heads, 3 * self.head_dims))
        qkv = tf.transpose(qkv, perm=(0,2,1,3))
        q,k,v = tf.split(qkv, num_or_size_splits=3, axis=3)
        context,attention_wts = scaled_dot_product(q,k,v,mask)
        # Back to (batch, seq_len, heads, head_dims) before merging heads;
        # reshaping the (batch, heads, seq_len, head_dims) layout directly
        # would interleave positions and heads.
        context = tf.transpose(context, perm=(0,2,1,3))
        context = tf.reshape(context, (-1, seq_len, self.n_heads*self.head_dims))
        return self.dense(context)

In [44]:
class MultiHeadCross_Attention(tf.keras.layers.Layer):
    """Multi-head cross-attention: queries come from ``y``, keys and values from ``x``.

    Args:
        no_of_heads: Number of attention heads.
        d_model: Output width; must be divisible by ``no_of_heads``.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``no_of_heads``.
    """

    def __init__(self,no_of_heads,d_model):
        super(MultiHeadCross_Attention,self).__init__()
        if d_model % no_of_heads != 0:
            raise ValueError("d_model must be divisible by no_of_heads")
        self.n_heads=no_of_heads
        self.head_dims=d_model//no_of_heads
        self.kv=Dense(2*d_model)
        self.q=Dense(d_model)
        self.dense=Dense(d_model)

    @staticmethod
    def _length(tensor):
        """Return the static size of axis 1, or the dynamic size when unknown."""
        static_len = tensor.shape[1]
        return static_len if static_len is not None else tf.shape(tensor)[1]

    def call(self,x,y,mask=None):
        """Attend from ``y`` (queries) over ``x`` (keys and values).

        Args:
            x: Rank-3 tensor ``(batch, inp_len, features)``.
            y: Rank-3 tensor ``(batch, tar_len, features)``.
            mask: Optional mask forwarded to ``scaled_dot_product``.

        Returns:
            Tensor of shape ``(batch, tar_len, d_model)``.
        """
        inp_len = self._length(x)
        tar_len = self._length(y)
        kv = self.kv(x)
        q = self.q(y)
        # -1 lets the batch dimension stay unknown (e.g. inside tf.function).
        kv = tf.reshape(kv, (-1, inp_len, self.n_heads, 2 * self.head_dims))
        q = tf.reshape(q, (-1, tar_len, self.n_heads, self.head_dims))
        kv = tf.transpose(kv, perm=(0,2,1,3))
        q = tf.transpose(q, perm=(0,2,1,3))
        k,v = tf.split(kv, num_or_size_splits=2, axis=3)
        context,attention_wts = scaled_dot_product(q,k,v,mask)
        # Back to (batch, tar_len, heads, head_dims) before merging heads;
        # reshaping the (batch, heads, tar_len, head_dims) layout directly
        # would interleave positions and heads.
        context = tf.transpose(context, perm=(0,2,1,3))
        # Merge on the projection width, not on the incoming feature width.
        context = tf.reshape(context, (-1, tar_len, self.n_heads*self.head_dims))
        return self.dense(context)

In [45]:
class Decoder_Layer(tf.keras.layers.Layer):
    """One decoder block: masked self-attention, cross-attention, feed-forward.

    Each of the three sub-layers is followed by dropout, a residual addition
    and LayerNormalization.

    Args:
        d_model: Model width.
        dropout: Dropout rate used throughout the block.
        no_heads: Number of attention heads.
        dff: Hidden width of the feed-forward block.
    """

    def __init__(self, d_model, dropout, no_heads, dff):
        super().__init__()
        # Masked self-attention sub-layer.
        self.mha = MaskedMultiHead_Attention(no_heads, d_model)
        self.dropout1 = Dropout(rate=dropout)
        self.norm1 = LayerNormalization(d_model)

        # Cross-attention sub-layer.
        self.mhca = MultiHeadCross_Attention(no_heads, d_model)
        self.dropout2 = Dropout(rate=dropout)
        self.norm2 = LayerNormalization(d_model)

        # Feed-forward sub-layer.
        self.ff = FeedForward(d_model, dff, dropout)
        self.dropout3 = Dropout(rate=dropout)
        self.norm3 = LayerNormalization(d_model)

    def call(self, x, y, look_ahead_mask, padding_mask, training=False):
        """Run the block.

        ``y`` goes through self-attention with ``look_ahead_mask``; the result
        then attends over ``x`` with ``padding_mask`` before the feed-forward step.
        """
        self_attended = self.dropout1(self.mha(y, mask=look_ahead_mask), training=training)
        self_normed = self.norm1(self_attended + y)

        cross_attended = self.dropout2(
            self.mhca(x, self_normed, mask=padding_mask), training=training
        )
        cross_normed = self.norm2(cross_attended + self_normed)

        transformed = self.dropout3(self.ff(cross_normed, training), training=training)
        return self.norm3(transformed + cross_normed)

In [46]:
class Decoder(tf.keras.layers.Layer):
    """Embedding followed by a stack of ``Decoder_Layer`` blocks.

    Args:
        tar_seq_len: Number of positions in the position-embedding table.
        tar_vocab_size: Number of rows in the token-embedding table.
        embed_dim: Embedding width.
        n_layers: Number of stacked decoder layers.
        d_model: Model width.
        dropout: Dropout rate.
        no_heads: Attention heads per layer.
        dff: Hidden width of the feed-forward blocks.
    """

    def __init__(self, tar_seq_len, tar_vocab_size, embed_dim, n_layers, d_model, dropout, no_heads, dff):
        super().__init__()
        self.embedding_layer = PositionEmbedding(tar_seq_len, tar_vocab_size, embed_dim)
        self.decoder_layers = [
            Decoder_Layer(d_model, dropout, no_heads, dff) for _ in range(n_layers)
        ]

    def call(self, x, y, look_ahead_mask=None, padding_mask=None, training=False):
        """Embed ``y`` and run it through every decoder layer, each one also receiving ``x``."""
        hidden = self.embedding_layer(y)
        for block in self.decoder_layers:
            hidden = block(x, hidden, look_ahead_mask, padding_mask, training)
        return hidden

In [57]:
class Transformer(tf.keras.Model):
    """Encoder-decoder model with a final Dense projection to ``tar_vocab_size``.

    Args:
        inp_seq_len, inp_vocab_size: Sizes of the encoder embedding tables.
        tar_seq_len, tar_vocab_size: Sizes of the decoder embedding tables;
            ``tar_vocab_size`` is also the width of the output projection.
        embed_dim: Embedding width for both encoder and decoder.
        n_layers: Number of encoder layers and of decoder layers.
        dropout: Dropout rate.
        dff: Hidden width of the feed-forward blocks.
        n_heads: Attention heads per layer.
        d_model: Model width.
    """

    def __init__(self, inp_seq_len, inp_vocab_size, tar_seq_len, tar_vocab_size, embed_dim, n_layers, dropout, dff, n_heads, d_model):
        super().__init__()
        self.encoder = Encoder(inp_seq_len, inp_vocab_size, embed_dim, n_heads, d_model, dff, dropout, n_layers)
        self.decoder = Decoder(tar_seq_len, tar_vocab_size, embed_dim, n_layers, d_model, dropout, n_heads, dff)
        self.dense = Dense(tar_vocab_size)

    def create_padding_mask(self, seq):
        """Return a float mask of shape ``(batch, 1, 1, len)`` that is 1 where ``seq == 0``."""
        is_pad = tf.cast(tf.math.equal(seq, 0), tf.float32)
        return is_pad[:, tf.newaxis, tf.newaxis, :]

    def create_look_ahead_mask(self, size):
        """Return a ``(size, size)`` mask that is 1 strictly above the diagonal, 0 elsewhere."""
        lower_triangle = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
        return 1 - lower_triangle

    def create_masks(self, inputs, target):
        """Build ``(enc_padding_mask, look_ahead_mask, dec_padding_mask)``.

        The returned look-ahead mask is the element-wise maximum of the target
        padding mask and the triangular look-ahead mask.
        """
        enc_padding_mask = self.create_padding_mask(inputs)
        dec_padding_mask = self.create_padding_mask(target)
        causal_mask = self.create_look_ahead_mask(tf.shape(target)[1])
        combined_mask = tf.maximum(dec_padding_mask, causal_mask)
        return enc_padding_mask, combined_mask, dec_padding_mask

    def call(self, inputs, training=False):
        """Run the model on ``inputs = (x, y)`` and return the projected decoder output."""
        x, y = inputs
        enc_padding_mask, lookahead_mask, dec_padding_mask = self.create_masks(x, y)
        # Shape trace of the three masks.
        for traced_mask in (enc_padding_mask, dec_padding_mask, lookahead_mask):
            print(traced_mask.shape)
        encoder_op = self.encoder(x, mask=enc_padding_mask, training=training)
        # Cross-attention attends over encoder positions, hence the encoder padding mask.
        decoder_op = self.decoder(
            encoder_op,
            y,
            look_ahead_mask=lookahead_mask,
            padding_mask=enc_padding_mask,
            training=training,
        )
        return self.dense(decoder_op)

In [58]:
# Sequence lengths and vocabulary sizes for the source and target sides.
inp_seq_len = 10
tar_seq_len = 15
inp_vocab_size = 50
tar_vocab_size = 100

# Model dimensions.
embed_dim = 512
n_layers = 6
dropout = 0.1
dff = 2048
n_heads = 8
d_model = 512

In [59]:
# Build the model from the hyperparameters above; keywords guard against argument-order slips.
transformer = Transformer(
    inp_seq_len=inp_seq_len,
    inp_vocab_size=inp_vocab_size,
    tar_seq_len=tar_seq_len,
    tar_vocab_size=tar_vocab_size,
    embed_dim=embed_dim,
    n_layers=n_layers,
    dropout=dropout,
    dff=dff,
    n_heads=n_heads,
    d_model=d_model,
)

In [60]:
# Smoke-test batch of 150 sequences. The model feeds these into Embedding
# lookups, so they must be integer token ids; uniform floats in [0, 1) are not
# meaningful ids. Id 0 is left out because the model treats it as padding.
# Upper bounds match inp_vocab_size (50) and tar_vocab_size (100); the widths
# match inp_seq_len (10) and tar_seq_len (15).
rng = np.random.default_rng(0)  # fixed seed so re-runs see the same batch
x = rng.integers(1, 50, size=(150, 10))
y = rng.integers(1, 100, size=(150, 15))

In [61]:
# Forward pass in training mode (dropout active).
output = transformer([x, y], training=True)

(150, 1, 1, 10)
(150, 1, 1, 15)
(150, 1, 15, 15)


In [221]:
# One score per target-vocabulary entry for every target position of every sequence.
output.shape

TensorShape([150, 15, 100])

In [223]:
# Per-sublayer parameter counts; the non-trainable parameters are the frozen embedding tables.
transformer.summary()

Model: "transformer_18"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder_18 (Encoder)        multiple                  18945024  
                                                                 
 decoder_18 (Decoder)        multiple                  25283072  
                                                                 
 dense_501 (Dense)           multiple                  51300     
                                                                 
Total params: 44279396 (168.91 MB)
Trainable params: 44189796 (168.57 MB)
Non-trainable params: 89600 (350.00 KB)
_________________________________________________________________
