In [27]:
import tensorflow as tf
from tensorflow.keras.layers import LayerNormalization , Layer , Dense , ReLU , Dropout , Embedding
import math
from tensorflow.keras import Model
from tensorflow.keras.metrics import Mean
from tensorflow import linalg , ones , maximum , newaxis
from tensorflow import reshape , transpose , cast , matmul , train ,shape , math , float32 ,equal , argmax ,data ,GradientTape , TensorSpec,function, int64 , reduce_sum 
import numpy as np
from numpy.random import shuffle
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.losses import sparse_categorical_crossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import LearningRateSchedule
from tensorflow import int64,convert_to_tensor
from pickle import load
from time import time

In [28]:
class AddNormalization(Layer):
    """Residual connection followed by layer normalization ("Add & Norm")."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Default-configured LayerNormalization applied to the residual sum.
        self.layer_norm = LayerNormalization()

    def call(self, x, sublayer_x):
        """Return ``layer_norm(x + sublayer_x)``.

        Args:
            x: Tensor that was fed into the sublayer.
            sublayer_x: Tensor produced by the sublayer; added element-wise to ``x``.
        """
        return self.layer_norm(x + sublayer_x)
class FeedForward(Layer):
    """Two dense layers with a ReLU in between: Dense(d_ff) -> ReLU -> Dense(d_model)."""

    def __init__(self, d_ff, d_model, **kwargs):
        super().__init__(**kwargs)
        self.fullyconnected1 = Dense(d_ff)     # first (inner) dense layer
        self.fullyconnected2 = Dense(d_model)  # second dense layer, d_model units
        self.activation = ReLU()

    def call(self, x):
        """Apply the first dense layer, the ReLU, then the second dense layer."""
        hidden = self.activation(self.fullyconnected1(x))
        return self.fullyconnected2(hidden)

class PositionEmbeddingFixedWeights(Layer):
    """Sum of a word embedding and a position embedding, both with frozen weights.

    Both embedding tables are initialised from ``get_position_encoding`` and
    created with ``trainable=False``.
    """

    def __init__(self, seq_length, vocab_size, output_dim, **kwargs):
        super().__init__(**kwargs)
        word_table = self.get_position_encoding(vocab_size, output_dim)
        position_table = self.get_position_encoding(seq_length, output_dim)
        self.word_embedding_layer = Embedding(
            input_dim=vocab_size,
            output_dim=output_dim,
            weights=[word_table],
            trainable=False,
        )
        self.position_embedding_layer = Embedding(
            input_dim=seq_length,
            output_dim=output_dim,
            weights=[position_table],
            trainable=False,
        )

    def get_position_encoding(self, seq_len, d, n=10000):
        """Build a ``(seq_len, d)`` sinusoidal matrix.

        For every row ``k`` and every ``i < int(d/2)``, column ``2i`` holds
        ``sin(k / n**(2i/d))`` and column ``2i+1`` holds ``cos(k / n**(2i/d))``.
        Columns not reached by that loop stay zero.
        """
        encoding = np.zeros((seq_len, d))
        for pos in range(seq_len):
            for pair in np.arange(int(d/2)):
                angle = pos / np.power(n, 2*pair/d)
                encoding[pos, 2*pair] = np.sin(angle)
                encoding[pos, 2*pair + 1] = np.cos(angle)
        return encoding

    def call(self, inputs):
        """Embed ``inputs`` and add the embedding of each index along the last axis."""
        # One position index per element of the last input axis.
        positions = tf.range(tf.shape(inputs)[-1])
        return self.word_embedding_layer(inputs) + self.position_embedding_layer(positions)
class GetAttentionScores(Layer):
    """Scaled dot-product attention."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, queries, keys, values, d_k, mask):
        """Return ``softmax(Q K^T / sqrt(d_k) + (-1e9 * mask)) V``.

        Args:
            queries: Query tensor.
            keys: Key tensor; transposed on its last two axes for the dot product.
            values: Value tensor weighted by the attention distribution.
            d_k: Scaling value whose square root divides the raw scores.
            mask: Optional tensor. It is scaled by -1e9 and added to the scores,
                so non-zero entries effectively remove a position from the
                softmax. ``None`` disables masking.
        """
        # `math` in this notebook is rebound to tensorflow.math by the
        # `from tensorflow import ... math` line, so this is the same function.
        logits = tf.matmul(queries, keys, transpose_b=True) / tf.math.sqrt(d_k)

        if mask is not None:
            logits += -1e9 * tf.cast(mask, dtype=logits.dtype)

        # Softmax over the last axis, then weight the values.
        return tf.matmul(tf.nn.softmax(logits, axis=-1), values)


class MultiHeadAttention(Layer):
    """Multi-head attention built on top of ``GetAttentionScores``.

    Queries/keys are projected to ``d_k`` units and values to ``d_v`` units;
    each projection is then split evenly across ``h`` heads, attended in
    parallel, merged back and projected to ``d_model`` units.

    Args:
        h: Number of attention heads.
        d_k: Width of the query and key projections (split across heads).
        d_v: Width of the value projection (split across heads).
        d_model: Width of the final output projection.

    Raises:
        ValueError: If ``d_k`` or ``d_v`` is not divisible by ``h``; the
            head split in ``reshape_tensor`` could not succeed in that case.
    """

    def __init__(self, h, d_k, d_v, d_model, **kwargs):
        super().__init__(**kwargs)
        if d_k % h != 0 or d_v % h != 0:
            raise ValueError(
                f"d_k ({d_k}) and d_v ({d_v}) must both be divisible by the number of heads ({h})"
            )
        self.attention = GetAttentionScores()  # Scaled dot product attention
        self.heads = h  # Number of attention heads to use
        self.d_k = d_k  # Dimensionality of the linearly projected queries and keys
        self.d_v = d_v  # Dimensionality of the linearly projected values
        self.d_model = d_model  # Dimensionality of the model
        self.W_q = Dense(d_k)  # Learned projection matrix for the queries
        self.W_k = Dense(d_k)  # Learned projection matrix for the keys
        self.W_v = Dense(d_v)  # Learned projection matrix for the values
        self.W_o = Dense(d_model)  # Learned projection matrix for the multi-head output

    def reshape_tensor(self, x, heads, flag):
        """Split ``x`` into heads (``flag`` truthy) or merge heads back (``flag`` falsy).

        Splitting turns a rank-3 tensor into ``(dim0, heads, dim1, -1)``.
        Merging expects that rank-4 layout and returns ``(dim0, dim1, d_v)``;
        in this class the merge is only applied to the attention output, whose
        combined width is the value width ``d_v`` (not ``d_k``).
        """
        if flag:
            # Tensor shape after reshaping and transposing:
            # (batch_size, heads, seq_length, -1)
            x = reshape(x, shape=(shape(x)[0], shape(x)[1], heads, -1))
            x = transpose(x, perm=(0, 2, 1, 3))
        else:
            # Reverting the reshaping and transposing operations:
            # (batch_size, seq_length, d_v)
            x = transpose(x, perm=(0, 2, 1, 3))
            # Use the explicit integer d_v (rather than -1) so the last
            # dimension is a plain Python int for the Dense layer that follows.
            x = reshape(x, shape=(shape(x)[0], shape(x)[1], self.d_v))
        return x

    def call(self, x1, x2, x3, mask):
        """Attend from ``x1`` (queries) over ``x2`` (keys) and ``x3`` (values).

        Args:
            x1: Tensor projected into queries.
            x2: Tensor projected into keys.
            x3: Tensor projected into values.
            mask: Mask forwarded unchanged to ``GetAttentionScores`` (may be None).

        Returns:
            The merged heads projected through ``W_o`` (last axis ``d_model``).
        """
        # Rearrange queries, keys and values so all heads are computed in parallel.
        # Each resulting tensor has shape (batch_size, heads, seq_length, -1).
        q_reshaped = self.reshape_tensor(self.W_q(x1), self.heads, True)
        k_reshaped = self.reshape_tensor(self.W_k(x2), self.heads, True)
        v_reshaped = self.reshape_tensor(self.W_v(x3), self.heads, True)

        # NOTE(review): the scores are scaled by sqrt(d_k) for the full d_k,
        # while each head only sees d_k / heads features -- confirm this is intended.
        o_reshaped = self.attention(
            queries=q_reshaped,
            keys=k_reshaped,
            values=v_reshaped,
            d_k=cast(self.d_k, float32),
            mask=mask,
        )

        # Rearrange back the output into concatenated form:
        # (batch_size, seq_length, d_v)
        output = self.reshape_tensor(o_reshaped, self.heads, False)

        # Final linear projection: (batch_size, seq_length, d_model)
        return self.W_o(output)
#ENCODER PART
class EncoderLayer(Layer):
    """One encoder block: self-attention and feed-forward, each with dropout and Add & Norm."""

    def __init__(self, h, d_k, d_v, d_model, d_ff, rate, **kwargs):
        super().__init__(**kwargs)
        # Sublayer 1: multi-head self-attention
        self.multihead_attention = MultiHeadAttention(h=h, d_k=d_k, d_v=d_v, d_model=d_model)
        self.dropout1 = Dropout(rate)
        self.add_norm1 = AddNormalization()
        # Sublayer 2: feed-forward network
        self.feed_forward1 = FeedForward(d_ff=d_ff, d_model=d_model)
        self.dropout2 = Dropout(rate)
        self.add_norm2 = AddNormalization()

    def call(self, x, padding_mask=None, training=False):
        """Run the block on ``x``.

        Args:
            x: Input tensor; used as queries, keys and values of the attention.
            padding_mask: Mask handed to the attention sublayer (may be None).
            training: Forwarded to both dropout layers.
        """
        attended = self.dropout1(
            self.multihead_attention(x, x, x, mask=padding_mask),
            training=training,
        )
        # Residual around the attention sublayer.
        normed_attention = self.add_norm1(x, attended)

        transformed = self.dropout2(self.feed_forward1(normed_attention), training=training)
        # Residual around the feed-forward sublayer.
        return self.add_norm2(normed_attention, transformed)

class Encoder(Layer):
    """Embedding plus dropout followed by a stack of ``n`` ``EncoderLayer`` blocks."""

    def __init__(self, vocab_size, sequence_length, h, d_k, d_v, d_model, d_ff, n, rate,
                 **kwargs):
        super().__init__(**kwargs)
        self.pos_encoding = PositionEmbeddingFixedWeights(
            seq_length=sequence_length,
            vocab_size=vocab_size,
            output_dim=d_model,
        )
        self.dropout = Dropout(rate)
        self.encoder_layer = [
            EncoderLayer(h=h, d_k=d_k, d_v=d_v, d_model=d_model, d_ff=d_ff, rate=rate)
            for _ in range(n)
        ]

    def call(self, input_sentence, padding_mask, training):
        """Embed ``input_sentence`` and pass it through every encoder layer in order.

        Args:
            input_sentence: Token-id tensor handed to the embedding layer.
            padding_mask: Mask forwarded to every encoder layer.
            training: Forwarded to the dropout layer and to every encoder layer.
        """
        x = self.dropout(self.pos_encoding(input_sentence), training=training)
        for layer in self.encoder_layer:
            x = layer(x, padding_mask, training=training)
        return x

# DECODER PART
class DecoderLayer(Layer):
    """One decoder block: self-attention, encoder-decoder attention and feed-forward.

    Every sublayer is followed by dropout and an Add & Norm step.
    """

    def __init__(self, h, d_k, d_v, d_model, d_ff, rate, **kwargs):
        super().__init__(**kwargs)

        # Sublayer 1: self-attention over the decoder input
        self.multihead_att1 = MultiHeadAttention(h, d_k, d_v, d_model)
        self.dropout1 = Dropout(rate)
        self.add_norm1 = AddNormalization()

        # Sublayer 2: attention over the encoder output
        self.multihead_att2 = MultiHeadAttention(h, d_k, d_v, d_model)
        self.dropout2 = Dropout(rate)
        self.add_norm2 = AddNormalization()

        # Sublayer 3: feed-forward network
        self.feed_forward = FeedForward(d_ff, d_model)
        self.dropout3 = Dropout(rate)
        self.add_norm3 = AddNormalization()

    def call(self, x, encoder_output, lookahead_mask, padding_mask, training):
        """Run the block.

        Args:
            x: Decoder-side input tensor.
            encoder_output: Tensor used as keys and values of the second attention.
            lookahead_mask: Mask for the first (self-) attention.
            padding_mask: Mask for the second attention.
            training: Forwarded to all three dropout layers.
        """
        self_attended = self.dropout1(
            self.multihead_att1(x, x, x, lookahead_mask),
            training=training,
        )
        normed_self = self.add_norm1(x, self_attended)

        # Queries come from the decoder; keys and values are the encoder output.
        cross_attended = self.dropout2(
            self.multihead_att2(normed_self, encoder_output, encoder_output, padding_mask),
            training=training,
        )
        normed_cross = self.add_norm2(normed_self, cross_attended)

        transformed = self.dropout3(self.feed_forward(normed_cross), training=training)
        return self.add_norm3(normed_cross, transformed)


class Decoder(Layer):
    """Embedding plus dropout followed by a stack of ``n`` ``DecoderLayer`` blocks."""

    def __init__(self, vocab_size, sequence_length, h, d_k, d_v, d_model, d_ff, n, rate, **kwargs):
        super().__init__(**kwargs)
        self.pos_encoding = PositionEmbeddingFixedWeights(
            seq_length=sequence_length,
            vocab_size=vocab_size,
            output_dim=d_model,
        )
        self.dropout = Dropout(rate)
        self.decoder_layer = [
            DecoderLayer(h, d_k, d_v, d_model, d_ff, rate)
            for _ in range(n)
        ]

    def call(self, output_target, encoder_output, lookahead_mask, padding_mask, training):
        """Embed ``output_target`` and pass it through every decoder layer in order.

        Args:
            output_target: Token-id tensor handed to the embedding layer.
            encoder_output: Encoder result forwarded to every decoder layer.
            lookahead_mask: Mask forwarded as each layer's ``lookahead_mask``.
            padding_mask: Mask forwarded as each layer's ``padding_mask``.
            training: Forwarded to the dropout layer and to every decoder layer.
        """
        x = self.dropout(self.pos_encoding(output_target), training=training)
        for layer in self.decoder_layer:
            x = layer(
                x,
                encoder_output,
                lookahead_mask=lookahead_mask,
                padding_mask=padding_mask,
                training=training,
            )
        return x

class TransformerModel(Model):
    """Encoder-decoder model with a final dense projection to the decoder vocabulary."""

    def __init__(self, enc_vocab_size, dec_vocab_size, enc_seq_length, dec_seq_length,
                 h, d_k, d_v, d_model, d_ff_inner, n, rate, **kwargs):
        super().__init__(**kwargs)
        self.encoder = Encoder(
            enc_vocab_size, enc_seq_length, h, d_k, d_v, d_model, d_ff_inner, n, rate
        )
        self.decoder = Decoder(
            dec_vocab_size, dec_seq_length, h, d_k, d_v, d_model, d_ff_inner, n, rate
        )
        # Dense layer with one unit per decoder-vocabulary entry.
        self.model_last_layer = Dense(dec_vocab_size)

    def padding_mask(self, input):
        """Return a float32 mask that is 1.0 wherever ``input`` equals 0.

        Two singleton axes are inserted after the first axis (for a 2-D input
        the result is ``(dim0, 1, 1, dim1)``).
        """
        is_padding = cast(math.equal(input, 0), float32)
        return is_padding[:, newaxis, newaxis, :]

    def lookahead_mask(self, shape):
        """Return a ``(shape, shape)`` matrix with 1s strictly above the diagonal."""
        lower_triangle = linalg.band_part(ones((shape, shape)), -1, 0)
        return 1 - lower_triangle

    def call(self, encoder_input, decoder_input, training):
        """Run encoder, decoder and the output projection.

        Args:
            encoder_input: Token ids fed to the encoder (0 is treated as padding).
            decoder_input: Token ids fed to the decoder (0 is treated as padding).
            training: Forwarded to the encoder and the decoder.

        Returns:
            Output of the final dense layer applied to the decoder result.
        """
        source_padding = self.padding_mask(encoder_input)

        # The decoder self-attention mask is the element-wise maximum of the
        # decoder padding mask and the look-ahead mask.
        target_padding = self.padding_mask(decoder_input)
        target_lookahead = self.lookahead_mask(decoder_input.shape[1])
        target_mask = maximum(target_padding, target_lookahead)

        encoded = self.encoder(encoder_input, source_padding, training=training)

        # The second attention in each decoder layer is masked with the encoder padding mask.
        decoded = self.decoder(
            decoder_input, encoded, target_mask, source_padding, training=training
        )

        return self.model_last_layer(decoded)


In [29]:
class PrepareDataset:
    """Load a pickled two-column sentence array and turn it into padded id tensors.

    Attributes are set in ``__init__``:
        n_sentences: Maximum number of rows taken from the pickled array.
        train_split: Fraction of the loaded rows used as training data.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.n_sentences = 10000
        self.train_split = 0.9

    def create_tokenizer(self, dataset):
        """Return a new ``Tokenizer`` fitted on ``dataset``."""
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(dataset)
        return tokenizer

    def find_seq_length(self, dataset):
        """Return the largest whitespace-separated token count among ``dataset``."""
        return max(len(seq.split()) for seq in dataset)

    def find_vocab_size(self, tokenizer, dataset):
        """Fit ``tokenizer`` on ``dataset`` and return ``len(word_index) + 1``.

        The extra 1 accounts for id 0, which the rest of the notebook treats
        as padding.
        """
        tokenizer.fit_on_texts(dataset)
        return len(tokenizer.word_index) + 1

    def call(self, filename):
        """Build the training tensors from the pickle file ``filename``.

        Args:
            filename: Path to a pickle holding a 2-D array whose column 0 and
                column 1 contain the source and target sentences.

        Returns:
            Tuple ``(trainX, trainY, enc_seq_length, dec_seq_length,
            enc_vocab_size, dec_vocab_size)``.
        """
        # SECURITY: unpickling executes arbitrary code -- only load trusted files.
        # The context manager guarantees the handle is closed after loading.
        with open(filename, 'rb') as handle:
            clean_dataset = load(handle)

        dataset = clean_dataset[:self.n_sentences, :]

        # Wrap every sentence of both columns in start / end-of-string markers.
        # NOTE(review): if the pickled array has a fixed-width string dtype,
        # NumPy truncates assignments longer than the item width -- confirm the
        # dtype is object or wide enough.
        for i in range(len(dataset)):
            dataset[i, 0] = "<START> " + dataset[i, 0] + " <EOS>"
            dataset[i, 1] = "<START> " + dataset[i, 1] + " <EOS>"

        shuffle(dataset)

        # Split on the number of rows actually loaded, so that a file with
        # fewer than n_sentences rows still honours train_split.
        train = dataset[:int(len(dataset) * self.train_split)]

        # Tokenize, encode and pad column 0 (encoder side)
        enc_tokenizer = self.create_tokenizer(train[:, 0])
        enc_seq_length = self.find_seq_length(train[:, 0])
        enc_vocab_size = self.find_vocab_size(enc_tokenizer, train[:, 0])

        trainX = enc_tokenizer.texts_to_sequences(train[:, 0])
        trainX = pad_sequences(trainX, maxlen=enc_seq_length, padding="post")
        trainX = convert_to_tensor(trainX, dtype=int64)

        # Tokenize, encode and pad column 1 (decoder side)
        dec_tokenizer = self.create_tokenizer(train[:, 1])
        dec_seq_length = self.find_seq_length(train[:, 1])
        dec_vocab_size = self.find_vocab_size(dec_tokenizer, train[:, 1])

        trainY = dec_tokenizer.texts_to_sequences(train[:, 1])
        trainY = pad_sequences(trainY, maxlen=dec_seq_length, padding="post")
        trainY = convert_to_tensor(trainY, dtype=int64)

        return (trainX, trainY, enc_seq_length, dec_seq_length, enc_vocab_size, dec_vocab_size)

In [17]:
# Build the padded training tensors plus the sequence-length / vocabulary-size metadata
dataset = PrepareDataset()
trainX, trainY, enc_seq_length, dec_seq_length, enc_vocab_size, dec_vocab_size = dataset.call(
    "english-german-both.pkl"
)

In [18]:
# --- Model parameters ---
# h: number of self-attention heads
# d_k: dimensionality of the linearly projected queries and keys
# d_v: dimensionality of the linearly projected values
h, d_k, d_v = 8, 64, 64
# d_model: dimensionality of the model layers' outputs
# d_ff: dimensionality of the inner fully connected layer
d_model, d_ff = 512, 2048
# n: number of layers per stack (handed to both the encoder and the decoder)
n = 6

# --- Training parameters ---
epochs, batch_size = 2, 64
# Values intended for the Adam optimizer
beta_1, beta_2, epsilon = 0.9, 0.98, 1e-9
dropout_rate = 0.1

In [19]:
class LRScheduler(LearningRateSchedule):
    """Warm-up learning-rate schedule.

    Returns ``d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5)``:
    the rate grows linearly for the first ``warmup_steps`` steps and decays
    with the inverse square root of the step afterwards.

    Args:
        d_model: Model width; stored as a float32 tensor.
        warmup_steps: Number of warm-up steps (default 4000).
    """

    def __init__(self, d_model, warmup_steps=4000, **kwargs):
        super().__init__(**kwargs)
        self.d_model = cast(d_model, float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step_num):
        """Return the learning rate for optimizer step ``step_num``."""
        # Optimizers may hand over an integer step counter; a negative
        # fractional power is only defined for floats, so cast first.
        step_num = cast(step_num, float32)

        # Linearly increasing learning rate for the first warm-up steps, decreasing after
        arg1 = step_num ** -0.5
        arg2 = step_num * (self.warmup_steps ** -1.5)

        return (self.d_model ** -0.5) * math.minimum(arg1, arg2)

In [20]:
# Adam with the warm-up schedule. `epsilon` is passed explicitly: it is defined
# with the other training parameters but was previously never handed to the
# optimizer, so Adam silently fell back to its default value.
optimizer = Adam(
    learning_rate=LRScheduler(d_model),
    beta_1=beta_1,
    beta_2=beta_2,
    epsilon=epsilon,
)

In [21]:
# Pair every encoder sequence with its decoder sequence, then group into batches
train_dataset = data.Dataset.from_tensor_slices((trainX, trainY)).batch(batch_size)

In [22]:
# Instantiate the Transformer with the dataset-derived sizes and the hyperparameters above
training_model = TransformerModel(
    enc_vocab_size,
    dec_vocab_size,
    enc_seq_length,
    dec_seq_length,
    h,
    d_k,
    d_v,
    d_model,
    d_ff,
    n,
    dropout_rate,
)

In [23]:
# Running means of the per-batch loss and accuracy
train_loss = Mean(name="train_loss")
train_accuracy = Mean(name="train_accuracy")

In [24]:
# Track model and optimizer state; keep at most three checkpoints under ./checkpoints
checkpoint = train.Checkpoint(model=training_model, optimizer=optimizer)
checkpoint_manager = train.CheckpointManager(
    checkpoint,
    directory="./checkpoints",
    max_to_keep=3,
)

In [25]:
def loss_fcn(target, prediction):
    """Sparse categorical cross-entropy averaged over the non-padding targets.

    Args:
        target: Integer class ids; positions equal to 0 are treated as padding
            and excluded from the average.
        prediction: Logits (the loss is computed with ``from_logits=True``).

    Returns:
        Sum of the unmasked losses divided by the number of unmasked positions.
    """
    # 1.0 for real tokens, 0.0 for padding
    keep = cast(math.logical_not(equal(target, 0)), float32)
    per_position = sparse_categorical_crossentropy(target, prediction, from_logits=True)
    return reduce_sum(per_position * keep) / reduce_sum(keep)

# Defining the accuracy function
def accuracy_fcn(target, prediction):
    """Fraction of non-padding targets whose arg-max prediction is correct.

    Args:
        target: Integer class ids; positions equal to 0 are treated as padding.
        prediction: Scores whose arg-max along axis 2 is the predicted class.

    Returns:
        Number of correct unmasked positions divided by the number of unmasked positions.
    """
    not_padding = math.logical_not(equal(target, 0))
    # A hit must be both correct and a non-padding position.
    hits = math.logical_and(not_padding, equal(target, argmax(prediction, axis=2)))
    return reduce_sum(cast(hits, float32)) / reduce_sum(cast(not_padding, float32))

In [26]:
@function
def train_step(encoder_input, decoder_input, decoder_output):
    """Run one optimisation step and update the running loss / accuracy metrics.

    Uses the notebook-level ``training_model``, ``optimizer``, ``train_loss``
    and ``train_accuracy`` objects.

    Args:
        encoder_input: Batch fed to the encoder.
        decoder_input: Batch fed to the decoder.
        decoder_output: Target batch the prediction is compared against.
    """
    with GradientTape() as tape:
        # Forward pass in training mode
        prediction = training_model(encoder_input, decoder_input, training=True)
        loss = loss_fcn(decoder_output, prediction)
        accuracy = accuracy_fcn(decoder_output, prediction)

    # Read the weights after the forward pass, as the original did.
    variables = training_model.trainable_weights
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    train_loss(loss)
    train_accuracy(accuracy)

# Training loop: one pass over `train_dataset` per epoch, with progress output
# every 50 steps and a checkpoint every five epochs and after the final epoch.
for epoch in range(epochs):
    train_loss.reset_state()
    train_accuracy.reset_state()
    print("\nStart of epoch %d" % (epoch + 1))
    start_time = time()

    # Iterate over the dataset batches
    for step, (train_batchX, train_batchY) in enumerate(train_dataset):
        # NOTE(review): the encoder input drops its first token (the start
        # marker added in PrepareDataset.call) -- confirm this is intended.
        encoder_input = train_batchX[:, 1:]
        # Decoder input and target are the same sequence shifted by one position.
        decoder_input = train_batchY[:, :-1]
        decoder_output = train_batchY[:, 1:]
        train_step(encoder_input, decoder_input, decoder_output)

        if step % 50 == 0:
            print(f"Epoch {epoch+1} Step {step} Loss {train_loss.result():.4f} "
                  f"Accuracy {train_accuracy.result():.4f}")

    # Print epoch number and loss value at the end of every epoch
    print(f"Epoch {epoch+1}: Training Loss {train_loss.result():.4f}, "
          f"Training Accuracy {train_accuracy.result():.4f}")

    # Save a checkpoint after every five epochs, and always after the last
    # epoch so that runs shorter than five epochs still persist their weights.
    if (epoch + 1) % 5 == 0 or epoch + 1 == epochs:
        checkpoint_manager.save()
        print(f"Saved checkpoint at epoch {epoch+1}")

    print("Total time taken: %.2fs" % (time() - start_time))


Start of epoch 1
Epoch 1 Step 0 Loss 8.3999 Accuracy 0.0000
Epoch 1 Step 50 Loss 7.6825 Accuracy 0.1210
Epoch 1 Step 100 Loss 7.0614 Accuracy 0.1700
Epoch 1: Training Loss 6.7397, Training Accuracy 0.1898
Total time taken: 124.95s

Start of epoch 2
Epoch 2 Step 0 Loss 5.6838 Accuracy 0.2568
Epoch 2 Step 50 Loss 5.4632 Accuracy 0.2737
Epoch 2 Step 100 Loss 5.2908 Accuracy 0.2820
Epoch 2: Training Loss 5.1536, Training Accuracy 0.2897
Total time taken: 90.19s
