In [None]:
import numpy as np
import keras
from tensorflow.keras.layers import Embedding,Dense,Dropout

In [None]:
import tensorflow as tf

In [None]:
from tensorflow.keras import Model

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
inputs = []
outputs = []

# Only the first `max_lines` lines of the corpus file are considered.
max_lines = 20000

# Each usable line holds tab-separated columns: the first goes to `inputs`,
# the second to `outputs`; any further columns are ignored.
with open('spa.txt', encoding='utf-8') as data_file:
    for count, line in enumerate(data_file, start=1):
        if count > max_lines:
            break
        fields = line.rstrip().split('\t')
        if len(fields) < 2:
            # No complete sentence pair on this line; skip it.
            continue
        ip, op = fields[0], fields[1]
        inputs.append(ip)
        outputs.append(op)

In [None]:
# Lower-case both sides of the corpus.
inputs = list(map(str.lower, inputs))
outputs = list(map(str.lower, outputs))

In [None]:
# Wrap every target sentence in explicit start/end markers.
outputs = [f'<start> {sentence} <end>' for sentence in outputs]

In [None]:
import re

def clean_sentences(sentences):
    """Strip everything except ASCII letters and word-separating spaces.

    Whitespace is preserved (and collapsed to single spaces) so that word
    boundaries survive cleaning; removing it would fuse each sentence into a
    single token.

    Args:
        sentences: iterable of strings.

    Returns:
        A new list with one cleaned string per input sentence.
    """
    non_letter_or_space = re.compile(r'[^a-zA-Z\s]')

    cleaned_sentences = []
    for sentence in sentences:
        letters_and_spaces = non_letter_or_space.sub('', sentence)
        # Removing punctuation can leave doubled or trailing spaces behind.
        cleaned_sentences.append(' '.join(letters_and_spaces.split()))

    return cleaned_sentences

In [None]:
# Preview sentences 10-19 of the English side before clean_sentences is applied.
inputs[10:20]

['who?',
 'wow!',
 'fire!',
 'fire!',
 'fire!',
 'help!',
 'help!',
 'help!',
 'jump!',
 'jump.']

In [None]:
# Apply clean_sentences to the English side only; `outputs` is left untouched.
inputs=clean_sentences(inputs)

In [None]:
# Same slice as before, now showing the result of clean_sentences.
inputs[10:20]

['who', 'wow', 'fire', 'fire', 'fire', 'help', 'help', 'help', 'jump', 'jump']

In [None]:
# Fit a tokenizer on the source sentences, convert them to integer id
# sequences and right-pad every sequence to the longest one.
EnglishTokenizer = Tokenizer(oov_token="<UNK>")
EnglishTokenizer.fit_on_texts(inputs)
inp_sequences = EnglishTokenizer.texts_to_sequences(inputs)
max_inp_len = max(map(len, inp_sequences))
src_sequences = pad_sequences(inp_sequences, maxlen=max_inp_len, padding="post")
Englishword2index, Englishindex2word = EnglishTokenizer.word_index, EnglishTokenizer.index_word

In [None]:
# Same pipeline for the target sentences: fit, convert to id sequences and
# right-pad to the longest target sequence.
SpanishTokenizer = Tokenizer(oov_token="<UNK>")
SpanishTokenizer.fit_on_texts(outputs)
op_sequences = SpanishTokenizer.texts_to_sequences(outputs)
max_tar_len = max(map(len, op_sequences))
tar_sequences = pad_sequences(op_sequences, maxlen=max_tar_len, padding="post")
Spanishword2index, Spanishindex2word = SpanishTokenizer.word_index, SpanishTokenizer.index_word

In [None]:
# One extra slot on top of the tokenizer vocabulary: id 0 is the padding value
# produced by pad_sequences.
src_vocab_size = 1 + len(Englishword2index)
trg_vocab_size = 1 + len(Spanishword2index)
print(f"src_vocab_size: {src_vocab_size}")
print(f"tar_vocab_size: {trg_vocab_size}")

src_vocab_size: 14920
tar_vocab_size: 7853


In [None]:
# Longest tokenised sequence on each side (these are the padded widths).
print(f"max_inp_len: {max_inp_len}")
print(f"max_tar_len: {max_tar_len}")

max_inp_len: 1
max_tar_len: 14


In [None]:
class PositionEmbedding(tf.keras.layers.Layer):
    """Sum of a token embedding and a position embedding, both with fixed weights.

    Both lookup tables are filled by ``get_position_encoding`` and created with
    ``trainable=False``, so this layer contributes no trainable weights.

    Args:
        sequence_length: number of rows in the position table. ``call`` looks up
            positions ``0 .. len-1`` of the input's last axis, so inputs must
            not be longer than this.
        vocab_size: number of rows in the token table.
        hidden_dim: width of both embedding tables.
    """

    def __init__(self, sequence_length, vocab_size, hidden_dim):
        super(PositionEmbedding, self).__init__()
        word_embedding_matrix = self.get_position_encoding(vocab_size, hidden_dim)
        position_embedding_matrix = self.get_position_encoding(sequence_length, hidden_dim)
        self.word_embedding_layer = Embedding(
            input_dim=vocab_size, output_dim=hidden_dim,
            weights=[word_embedding_matrix],
            trainable=False
        )
        self.position_embedding_layer = Embedding(
            input_dim=sequence_length, output_dim=hidden_dim,
            weights=[position_embedding_matrix],
            trainable=False
        )

    def get_position_encoding(self, seq_len, d, n=10000):
        """Build a ``(seq_len, d)`` sinusoidal table.

        Row ``k`` holds ``sin(k / n**(2i/d))`` in column ``2i`` and
        ``cos(k / n**(2i/d))`` in column ``2i+1`` for ``i`` in ``range(d // 2)``.
        When ``d`` is odd the last column stays zero.
        """
        half = int(d / 2)
        P = np.zeros((seq_len, d))
        # Vectorised over rows and frequency indices instead of a Python
        # double loop; the values are the same.
        positions = np.arange(seq_len)[:, np.newaxis]
        denominators = np.power(n, 2 * np.arange(half) / d)
        angles = positions / denominators
        P[:, 0:2 * half:2] = np.sin(angles)
        P[:, 1:2 * half:2] = np.cos(angles)
        return P

    def call(self, inputs):
        """Embed integer ids ``inputs`` and add the position vectors."""
        position_indices = tf.range(tf.shape(inputs)[-1])
        embedded_words = self.word_embedding_layer(inputs)
        embedded_indices = self.position_embedding_layer(position_indices)
        return embedded_words + embedded_indices

In [None]:
def scaled_dot_product(q,k,v,mask=None):
    """Scaled dot-product attention.

    Args:
        q, k, v: query/key/value tensors whose last two axes are
            (positions, features).
        mask: optional tensor broadcastable to the score matrix in which 1
            marks a position that must NOT be attended to and 0 a visible one
            (the convention produced by Transformer.create_padding_mask and
            create_look_ahead_mask in this notebook).

    Returns:
        (values, attention_wts): the attention-weighted sum of ``v`` and the
        softmax weights used to compute it.
    """
    depth = tf.cast(tf.shape(q)[-1], q.dtype)
    scores = tf.matmul(q, k, transpose_b=True) / tf.math.sqrt(depth)
    if mask is not None:
        # Push masked logits towards -inf so softmax gives them ~0 weight.
        # Adding the raw 0/1 mask would raise their score instead.
        scores += tf.cast(mask, scores.dtype) * -1e9
    attention_wts = tf.nn.softmax(scores, axis=-1)
    # Weight the values by the normalised attention, not by the raw scores.
    values = tf.matmul(attention_wts, v)
    return values, attention_wts

In [None]:
class MultiHead_Attention(tf.keras.layers.Layer):
    """Multi-head self-attention over a ``(batch, seq, d_model)`` input.

    Args:
        no_of_heads: number of attention heads.
        d_model: width of the projections; each head gets
            ``d_model // no_of_heads`` features.
    """

    def __init__(self,no_of_heads,d_model):
        super(MultiHead_Attention,self).__init__()
        self.n_heads=no_of_heads
        self.head_dims=d_model//no_of_heads
        self.qkv=Dense(3*d_model)
        self.dense=Dense(d_model)

    def call(self,inputs,mask=None):
        """Attend over ``inputs``; ``mask`` is forwarded to scaled_dot_product."""
        batch_size,max_inp_len,d_model=inputs.shape
        dynamic_batch=tf.shape(inputs)[0]
        qkv=self.qkv(inputs)
        qkv=tf.reshape(qkv,(dynamic_batch, max_inp_len, self.n_heads, 3 * self.head_dims))
        qkv=tf.transpose(qkv,perm=(0,2,1,3))
        q,k,v=tf.split(qkv,num_or_size_splits=3,axis=3)
        # The mask was previously accepted but dropped here.
        v,attention_wts=scaled_dot_product(q,k,v,mask)
        # (batch, heads, seq, head_dims) -> (batch, seq, heads, head_dims):
        # without this, the reshape below mixes positions and heads.
        v=tf.transpose(v,perm=(0,2,1,3))
        v=tf.reshape(v,(dynamic_batch,max_inp_len,self.n_heads*self.head_dims))
        output=self.dense(v)
        return output

In [None]:
class FeedForward(tf.keras.layers.Layer):
    """Two Dense layers (ReLU, then linear) with dropout in between.

    Args:
        d_model: output width of the second Dense layer.
        ffc: width of the first (ReLU) Dense layer.
        dropout: dropout rate applied between the two layers.
    """

    def __init__(self,d_model,ffc,dropout):
        super().__init__()
        self.dense1 = Dense(ffc, activation="relu")
        self.dense2 = Dense(d_model)
        self.dropout = Dropout(rate=dropout)

    def call(self,inputs,training=False):
        """Apply dense1 -> dropout -> dense2 to ``inputs``."""
        hidden = self.dropout(self.dense1(inputs), training=training)
        return self.dense2(hidden)

In [None]:
class LayerNormalization(tf.keras.layers.Layer):
    """Layer normalisation over the last axis with learned scale and shift.

    Args:
        parameter_shape: shape of ``gamma``/``beta``; an int or a shape tuple.
        eps: constant added to the variance before the square root.
    """

    def __init__(self, parameter_shape, eps=1e-5):
        super(LayerNormalization, self).__init__()
        self.parameter_shape = parameter_shape
        self.eps = eps
        # Always hand add_weight a tuple, and pass the name by keyword: the
        # first positional parameter of add_weight differs between Keras
        # versions (name vs. shape).
        if isinstance(parameter_shape, int):
            weight_shape = (parameter_shape,)
        else:
            weight_shape = tuple(parameter_shape)
        self.gamma = self.add_weight(name="gamma", shape=weight_shape, initializer="ones", trainable=True)
        self.beta = self.add_weight(name="beta", shape=weight_shape, initializer="zeros", trainable=True)

    def call(self, inputs):
        """Normalise ``inputs`` along the last axis, then scale and shift."""
        mean = tf.reduce_mean(inputs, axis=-1, keepdims=True)
        var = tf.reduce_mean(tf.square(inputs - mean), axis=-1, keepdims=True)
        std = tf.sqrt(var + self.eps)
        y = (inputs - mean) / std
        out = self.gamma * y + self.beta
        return out

In [None]:
class Encoder_Layer(tf.keras.layers.Layer):
    """One encoder block: self-attention and feed-forward sub-layers.

    Each sub-layer output goes through dropout, is added to the sub-layer's
    input and then layer-normalised.
    """

    def __init__(self,num_of_heads,d_model,dff,dropout):
        super().__init__()
        self.mha = MultiHead_Attention(num_of_heads, d_model)
        self.dropout1 = Dropout(rate=dropout)
        self.ln1 = LayerNormalization(d_model)
        self.ff = FeedForward(d_model, dff, dropout)
        self.dropout2 = Dropout(rate=dropout)
        self.ln2 = LayerNormalization(d_model)

    def call(self,inputs,mask,training):
        """Run the attention sub-layer followed by the feed-forward sub-layer."""
        # Sub-layer 1: self-attention + residual + norm.
        attended = self.dropout1(self.mha(inputs, mask), training=training)
        attention_block = self.ln1(attended + inputs)

        # Sub-layer 2: position-wise feed-forward + residual + norm.
        transformed = self.dropout2(self.ff(attention_block, training), training=training)
        return self.ln2(transformed + attention_block)

In [None]:
class Encoder(tf.keras.layers.Layer):
    """Embedding followed by a stack of ``num_layers`` Encoder_Layer blocks.

    ``embed_dim`` and ``d_model`` need to match: each Encoder_Layer adds its
    input (the embedding output) to a ``d_model``-wide attention output.
    """

    def __init__(self,inp_seq_len, inp_vocab_size,embed_dim,num_of_heads,d_model,dff,dropout,num_layers):
        super(Encoder,self).__init__()
        self.embedding_layer = PositionEmbedding(inp_seq_len, inp_vocab_size,embed_dim)
        self.encoder_layer=[Encoder_Layer(num_of_heads,d_model,dff,dropout) for _ in range(num_layers)]

    def call(self,inputs,mask=None,training=False):
        """Encode integer id sequences.

        Args:
            inputs: integer token ids.
            mask: optional attention mask handed to every layer; ``None``
                means "no mask". ``False`` (the former default) is accepted
                and treated as ``None``.
            training: forwarded to the layers for dropout.
        """
        if mask is False:
            # Backward compatibility: False is not a usable mask and would
            # pass the `mask is not None` checks further down.
            mask = None
        x=self.embedding_layer(inputs)
        for layer in self.encoder_layer:
            x=layer(x,mask,training)
        return x

Decoder

In [None]:
class MaskedMultiHead_Attention(tf.keras.layers.Layer):
    """Multi-head self-attention that forwards ``mask`` to scaled_dot_product.

    Args:
        no_of_heads: number of attention heads.
        d_model: width of the projections; each head gets
            ``d_model // no_of_heads`` features.
    """

    def __init__(self,no_of_heads,d_model):
        super(MaskedMultiHead_Attention,self).__init__()
        self.n_heads=no_of_heads
        self.head_dims=d_model//no_of_heads
        self.qkv=Dense(3*d_model)
        self.dense=Dense(d_model)

    def call(self,inputs,mask=None):
        """Attend over ``inputs`` (batch, seq, d_model) under ``mask``."""
        batch_size,max_inp_len,d_model=inputs.shape
        dynamic_batch=tf.shape(inputs)[0]
        qkv=self.qkv(inputs)
        desired_shape = (dynamic_batch, max_inp_len, self.n_heads, 3 * self.head_dims)
        qkv=tf.reshape(qkv,desired_shape)
        qkv=tf.transpose(qkv,perm=(0,2,1,3))
        q,k,v=tf.split(qkv,num_or_size_splits=3,axis=3)
        v,attention_wts=scaled_dot_product(q,k,v,mask)
        # (batch, heads, seq, head_dims) -> (batch, seq, heads, head_dims):
        # without this, the reshape below mixes positions and heads.
        v=tf.transpose(v,perm=(0,2,1,3))
        v=tf.reshape(v,(dynamic_batch,max_inp_len,self.n_heads*self.head_dims))
        output=self.dense(v)
        return output

In [None]:
class MultiHeadCross_Attention(tf.keras.layers.Layer):
    """Multi-head attention where queries come from ``y`` and keys/values from ``x``.

    Args:
        no_of_heads: number of attention heads.
        d_model: width of the projections; each head gets
            ``d_model // no_of_heads`` features.
    """

    def __init__(self,no_of_heads,d_model):
        super(MultiHeadCross_Attention,self).__init__()
        self.n_heads=no_of_heads
        self.head_dims=d_model//no_of_heads
        self.kv=Dense(2*d_model)
        self.q=Dense(d_model)
        self.dense=Dense(d_model)

    def call(self,x,y,mask=None):
        """Let every position of ``y`` attend over the positions of ``x``.

        Args:
            x: (batch, source_len, d_model) tensor projected to keys/values.
            y: (batch, target_len, d_model) tensor projected to queries.
            mask: optional mask forwarded to scaled_dot_product.

        Returns:
            Tensor with one ``d_model``-wide vector per position of ``y``.
        """
        _,src_len,_=x.shape
        _,tar_len,_=y.shape
        kv=self.kv(x)
        q=self.q(y)
        kv=tf.reshape(kv,(tf.shape(x)[0], src_len, self.n_heads, 2 * self.head_dims))
        q=tf.reshape(q,(tf.shape(y)[0], tar_len, self.n_heads, self.head_dims))
        kv=tf.transpose(kv,perm=(0,2,1,3))
        q=tf.transpose(q,perm=(0,2,1,3))
        k,v=tf.split(kv,num_or_size_splits=2,axis=3)
        v,attention_wts=scaled_dot_product(q,k,v,mask)
        # (batch, heads, target_len, head_dims) -> (batch, target_len, heads, head_dims):
        # without this, the reshape below mixes positions and heads.
        v=tf.transpose(v,perm=(0,2,1,3))
        v=tf.reshape(v,(tf.shape(x)[0],tar_len,self.n_heads*self.head_dims))
        output=self.dense(v)
        return output

In [None]:
class Decoder_Layer(tf.keras.layers.Layer):
    """One decoder block: masked self-attention, cross-attention, feed-forward.

    Each sub-layer output goes through dropout, is added to the sub-layer's
    input and then layer-normalised.
    """

    def __init__(self,d_model,dropout,no_heads,dff):
        super().__init__()
        # Sub-layer 1: masked self-attention over the decoder stream.
        self.mha = MaskedMultiHead_Attention(no_heads, d_model)
        self.dropout1 = Dropout(rate=dropout)
        self.norm1 = LayerNormalization(d_model)

        # Sub-layer 2: cross-attention against the encoder output.
        self.mhca = MultiHeadCross_Attention(no_heads, d_model)
        self.dropout2 = Dropout(rate=dropout)
        self.norm2 = LayerNormalization(d_model)

        # Sub-layer 3: position-wise feed-forward network.
        self.ff = FeedForward(d_model, dff, dropout)
        self.dropout3 = Dropout(rate=dropout)
        self.norm3 = LayerNormalization(d_model)

    def call(self,x,y,look_ahead_mask,padding_mask,training=False):
        """Transform decoder stream ``y`` using encoder output ``x``.

        ``look_ahead_mask`` is used by the self-attention sub-layer and
        ``padding_mask`` by the cross-attention sub-layer.
        """
        self_attended = self.dropout1(self.mha(y, mask=look_ahead_mask), training=training)
        self_block = self.norm1(self_attended + y)

        cross_attended = self.dropout2(
            self.mhca(x, self_block, mask=padding_mask), training=training
        )
        cross_block = self.norm2(cross_attended + self_block)

        transformed = self.dropout3(self.ff(cross_block, training), training=training)
        return self.norm3(transformed + cross_block)

In [None]:
class Decoder(tf.keras.layers.Layer):
    """Embedding followed by a stack of ``n_layers`` Decoder_Layer blocks."""

    def __init__(self,tar_seq_len, tar_vocab_size,embed_dim,n_layers,d_model,dropout,no_heads,dff):
        super().__init__()
        self.embedding_layer = PositionEmbedding(tar_seq_len, tar_vocab_size, embed_dim)
        self.decoder_layers = [
            Decoder_Layer(d_model, dropout, no_heads, dff)
            for _ in range(n_layers)
        ]

    def call(self,x,y,look_ahead_mask=None,padding_mask=None,training=False):
        """Embed target ids ``y`` and pass them through every decoder layer.

        ``x`` (the encoder output) and both masks are handed unchanged to each
        layer.
        """
        hidden = self.embedding_layer(y)
        for block in self.decoder_layers:
            hidden = block(x, hidden, look_ahead_mask, padding_mask, training)
        return hidden

In [None]:
class Transformer(tf.keras.Model):
    """Encoder-decoder model ending in a Dense projection onto the target vocabulary.

    ``call`` expects ``inputs`` to be a pair ``(source_ids, target_ids)`` and
    returns one ``tar_vocab_size``-wide vector per target position.
    """

    def __init__(self,inp_seq_len,inp_vocab_size,tar_seq_len, tar_vocab_size,embed_dim,n_layers,dropout,dff,n_heads,d_model):
        super().__init__()
        self.encoder = Encoder(
            inp_seq_len, inp_vocab_size, embed_dim, n_heads, d_model, dff, dropout, n_layers
        )
        self.decoder = Decoder(
            tar_seq_len, tar_vocab_size, embed_dim, n_layers, d_model, dropout, n_heads, dff
        )
        self.dense = Dense(tar_vocab_size)

    def create_padding_mask(self,seq):
        """Return a (batch, 1, 1, seq_len) float mask: 1.0 where ``seq`` is 0, else 0.0."""
        is_padding = tf.math.equal(seq, 0)
        return tf.cast(is_padding, tf.float32)[:, tf.newaxis, tf.newaxis, :]

    def create_look_ahead_mask(self,size):
        """Return a (size, size) mask with 1 strictly above the diagonal, 0 elsewhere."""
        lower_triangle = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
        return 1 - lower_triangle

    def create_masks(self,inputs, target):
        """Build the encoder padding mask, combined decoder mask and decoder padding mask."""
        enc_padding_mask = self.create_padding_mask(inputs)
        dec_padding_mask = self.create_padding_mask(target)
        future_mask = self.create_look_ahead_mask(tf.shape(target)[1])
        # A target position is hidden if it is padding OR lies in the future.
        look_ahead_mask = tf.maximum(dec_padding_mask, future_mask)

        return enc_padding_mask, look_ahead_mask, dec_padding_mask

    def call(self,inputs,training=False):
        """Run encoder and decoder, then project onto the target vocabulary."""
        source_ids, target_ids = inputs
        enc_padding_mask, lookahead_mask, _ = self.create_masks(source_ids, target_ids)
        encoded = self.encoder(source_ids, mask=enc_padding_mask, training=training)
        decoded = self.decoder(
            encoded,
            target_ids,
            look_ahead_mask=lookahead_mask,
            padding_mask=enc_padding_mask,
            training=training,
        )
        return self.dense(decoded)

In [None]:
# Model hyper-parameters handed to the Transformer constructor.
# embed_dim and d_model are kept equal: the encoder/decoder layers add the
# embedding output to d_model-wide attention outputs.
embed_dim=512   # width of the PositionEmbedding tables
n_layers=6      # number of Encoder_Layer / Decoder_Layer blocks
dropout=0.1     # rate used by every Dropout layer
dff=2048        # width of the first Dense layer in FeedForward
n_heads=8       # attention heads per attention layer
d_model=512     # width of the attention / feed-forward outputs

In [None]:
# Instantiate the model with the corpus-derived sizes and the hyper-parameters above.
transformer = Transformer(
    inp_seq_len=max_inp_len,
    inp_vocab_size=src_vocab_size,
    tar_seq_len=max_tar_len,
    tar_vocab_size=trg_vocab_size,
    embed_dim=embed_dim,
    n_layers=n_layers,
    dropout=dropout,
    dff=dff,
    n_heads=n_heads,
    d_model=d_model,
)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 1% of the sentence pairs; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    src_sequences,
    tar_sequences,
    test_size=0.010,
    random_state=0,
)

In [None]:
# Shapes of the four arrays returned by train_test_split.
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((19800, 1), (200, 1), (19800, 14), (200, 14))

In [None]:
# Adam with a fixed learning rate and gradient-norm clipping.
optimizer = tf.keras.optimizers.Adam(
    learning_rate=0.001,
    beta_1=0.9,
    beta_2=0.98,
    epsilon=1e-9,
    clipnorm=5,
)

In [None]:
def masked_loss(y_true,y_pred):
  """Mean sparse categorical cross-entropy over positions where ``y_true`` != 0.

  ``y_pred`` is treated as logits. Positions whose label is 0 contribute
  nothing to either the numerator or the denominator.
  """
  per_position = tf.keras.losses.SparseCategoricalCrossentropy(
      from_logits=True, reduction='none'
  )(y_true, y_pred)
  keep = tf.cast(y_true != 0, dtype=per_position.dtype)
  return tf.reduce_sum(per_position * keep) / tf.reduce_sum(keep)

In [None]:
def masked_accuracy(y_true,y_pred):
  """Fraction of positions with ``y_true`` != 0 whose arg-max prediction is correct.

  Args:
    y_true: integer labels; 0 marks positions to ignore.
    y_pred: per-class scores with the class axis last.

  Returns:
    Scalar: correct non-ignored positions / number of non-ignored positions.
  """
  y_pred=tf.argmax(y_pred,axis=-1)
  y_pred=tf.cast(y_pred,dtype=y_true.dtype)
  keep=y_true!=0
  # Count a hit only where the label is not 0; otherwise ignored positions
  # predicted as 0 inflate the numerator while being absent from the denominator.
  match=tf.math.logical_and(y_true==y_pred,keep)
  match=tf.cast(match,dtype=tf.float32)
  mask=tf.cast(keep,dtype=tf.float32)
  return tf.reduce_sum(match)/tf.reduce_sum(mask)

In [None]:
# Cross-entropy of a uniform guess over the target vocabulary: log(trg_vocab_size).
# NOTE(review): presumably a reference value for the untrained model's loss.
expected_loss=tf.math.log(tf.cast(trg_vocab_size,tf.float32)).numpy()
expected_loss

8.968651

In [None]:
# Accuracy of a uniform random guess over the target vocabulary: 1 / trg_vocab_size.
expected_accuracy=tf.cast(1/trg_vocab_size,tf.float32).numpy()
expected_accuracy

0.00012733987

In [None]:
# masked_loss is both the training objective and a reported metric.
transformer.compile(
    optimizer=optimizer,
    loss=masked_loss,
    metrics=[masked_accuracy, masked_loss],
)

In [None]:
# Evaluate on the held-out pairs. The decoder is fed each target without its
# last token and is scored against the same target shifted left by one.
transformer.evaluate([X_test,y_test[:,:-1]],y_test[:,1:],steps=6,return_dict=True)



{'loss': 8.982586860656738,
 'masked_accuracy': 0.0,
 'masked_loss': 8.983532905578613}

In [None]:
# Train with Keras' built-in loop. Decoder inputs are the targets without
# their last token; labels are the targets shifted left by one.
# NOTE(review): validation_steps=20 may exceed the number of batches the
# held-out set can provide — confirm.
history = transformer.fit(
    [X_train, y_train[:, :-1]],
    y_train[:, 1:],
    steps_per_epoch=10,
    epochs=10,
    validation_data=([X_test, y_test[:, :-1]], y_test[:, 1:]),
    validation_steps=20,
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=3)],
)

Epoch 1/10


In [None]:
from tensorflow import data

In [None]:
# Define the batch size before it is used, so this cell runs on a fresh
# kernel (it was previously first assigned in the training-loop cell).
batch_size = 64

# Pair each source sequence with its target sequence and group them into batches.
train_dataset = data.Dataset.from_tensor_slices((X_train, y_train))
train_dataset = train_dataset.batch(batch_size)

In [None]:
@tf.function
def train_step(encoder_inputs,decoder_inputs,decoder_targets):
    """Run one optimisation step on a batch.

    Uses the module-level ``transformer`` and ``optimizer``.

    Returns:
        (loss, accuracy): masked loss and masked accuracy for this batch.
    """
    model_inputs = [encoder_inputs, decoder_inputs]
    with tf.GradientTape() as tape:
        logits = transformer(model_inputs, training=True)
        loss = masked_loss(decoder_targets, logits)
        accuracy = masked_accuracy(decoder_targets, logits)

    grads = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(grads, transformer.trainable_variables))

    return loss, accuracy

In [None]:
from time import time

In [None]:
from tensorflow import data, train, math, reduce_sum, cast, equal, argmax, float32, GradientTape, TensorSpec, function, int64

In [None]:
from tensorflow.keras.optimizers.schedules import LearningRateSchedule
class LRScheduler(LearningRateSchedule):
    """Warm-up learning-rate schedule.

    Returns ``d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5)``:
    the rate grows linearly for ``warmup_steps`` steps and then decays with
    the inverse square root of the step number.

    Args:
        d_model: model width used for the overall scale.
        warmup_steps: number of warm-up steps.
    """

    def __init__(self, d_model, warmup_steps=4000, **kwargs):
        super(LRScheduler, self).__init__(**kwargs)

        self.d_model = cast(d_model, float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step_num):
        # The step counter may arrive as an integer; cast it before raising
        # it to a negative power.
        step_num = cast(step_num, float32)

        arg1 = step_num ** -0.5
        arg2 = step_num * (self.warmup_steps ** -1.5)

        return (self.d_model ** -0.5) * math.minimum(arg1, arg2)

In [None]:
# Optimizer used by train_step in the custom training loop.
# NOTE(review): LRScheduler is defined in this notebook but not used here — confirm
# whether the fixed rate is intended.
optimizer = tf.keras.optimizers.Adam(
    learning_rate=0.01,
    beta_1=0.9,
    beta_2=0.98,
    epsilon=1e-9,
)

In [None]:
epochs=10
batch_size=64  # NOTE: train_dataset has already been batched before this line runs

# Started once, before the first epoch, so the final report covers the whole run.
start_time = time()

for epoch in range(epochs):
    print("\nStart of epoch %d" % (epoch + 1))
    total_loss=0.0
    total_accuracy=0.0
    num_steps=0

    # Iterate over the dataset batches
    for step, (train_batchX, train_batchY) in enumerate(train_dataset):

        # The source sequences carry no start marker, so they are fed whole
        # (as in the fit() cell); only the targets are shifted.
        encoder_input = train_batchX
        decoder_input = train_batchY[:, :-1]
        decoder_output = train_batchY[:, 1:]

        loss,accuracy=train_step(encoder_input, decoder_input, decoder_output)
        # Convert to Python floats for accumulation and formatting.
        loss=float(loss)
        accuracy=float(accuracy)
        total_loss+=loss
        total_accuracy+=accuracy
        num_steps+=1

        if step % 50 == 0:
            print(f'Epoch {epoch + 1} Step {step} Loss {loss:.4f} Accuracy {accuracy:.4f}')

    # Report per-batch averages rather than sums; guard against an empty dataset.
    average_loss = total_loss / max(num_steps, 1)
    average_accuracy = total_accuracy / max(num_steps, 1)
    print("Epoch %d: Training Loss %.4f, Training Accuracy %.4f" % (epoch + 1, average_loss, average_accuracy))
    print(f"Epoch {epoch + 1}/{epochs} - Loss: {average_loss:.4f}")

print("Total time taken: %.2fs" % (time() - start_time))


Start of epoch 1
