<a href="https://colab.research.google.com/github/LlakmalGamage/Translator-Using-Transformers/blob/main/translator_spanish_version_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Loading

In [7]:
import pandas as pd

spanish_file_path="data_spanish.csv"

In [8]:

df2=pd.read_csv(spanish_file_path)

In [9]:
import random
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow as tf
import string
import re
import csv



In [10]:
df2.head(20)

Unnamed: 0,english,spanish
0,Go.,Ve.
1,Go.,Vete.
2,Go.,Vaya.
3,Go.,Váyase.
4,Hi.,Hola.
5,Run!,¡Corre!
6,Run.,Corred.
7,Who?,¿Quién?
8,Fire!,¡Fuego!
9,Fire!,¡Incendio!


In [11]:
first_column_values_last_10_rows=df2['english'].tail(10)


paragraph = '\n'.join(first_column_values_last_10_rows)
# Print the paragraph
print(paragraph)

You can't view Flash content on an iPad. However, you can easily email yourself the URLs of these web pages and view that content on your regular computer when you get home.
A mistake young people often make is to start learning too many languages at the same time, as they underestimate the difficulties and overestimate their own ability to learn them.
No matter how much you try to convince people that chocolate is vanilla, it'll still be chocolate, even though you may manage to convince yourself and a few others that it's vanilla.
In 1969, Roger Miller recorded a song called "You Don't Want My Love." Today, this song is better known as "In the Summer Time." It's the first song he wrote and sang that became popular.
A child who is a native speaker usually knows many things about his or her language that a non-native speaker who has been studying for years still does not know and perhaps will never know.
There are four main causes of alcohol-related death. Injury from car accidents or v

In [12]:
len(df2)

118964

# Split the English and german translation pairs

In [13]:
#common function for both spanish and german

class split_pairs:
 def split_pairs_method(self,df1):
  text_pairs=[]

  for i in range(len(df1)):
    english,language=df1["english"][i],df1["spanish"][i]
    language="[start] " + language + " [end]"
    text_pairs.append((english,language))

  return text_pairs


In [14]:
#randomly selecting that if the above function work
class random_pair_test:
  def random_test_method(self,text_pairs):
   for i in range(3):
    print(random.choice(text_pairs))


In [15]:
#pairing for german text

spanish_text_pairs=split_pairs().split_pairs_method(df2)

random_pair_test().random_test_method(spanish_text_pairs)

('Life is tough.', '[start] La vida es dura. [end]')
('I wish I had been with her then.', '[start] Desearía haber estado con ella entonces. [end]')
('Tom is a very busy person.', '[start] Tom es una persona muy ocupada. [end]')


# Randomizing the data

In [16]:
random.shuffle(spanish_text_pairs)

# Split the data into training, validation,testing

In [17]:
#class for splitting text pairs in to train,test,validation
class splitting:
    def splitting_method(self,text_pairs):
        num_val_sample=int(0.15*len(text_pairs))
        num_train_samples=len(text_pairs) - 2 * num_val_sample
        train_pairs=text_pairs[:num_train_samples]
        val_pairs=text_pairs[num_train_samples:num_train_samples+num_val_sample]
        test_pairs=text_pairs[num_train_samples+num_val_sample:]

        print("Total Sentences: ",len(text_pairs))
        print("Training set size: ",len(train_pairs))
        print("Validation set size: ",len(val_pairs))
        print("Testng set size: ",len(test_pairs))
        return train_pairs,val_pairs,test_pairs



In [18]:
spanish_train_pairs,spanish_val_pairs,spanish_test_pairs=splitting().splitting_method(spanish_text_pairs)


Total Sentences:  118964
Training set size:  83276
Validation set size:  17844
Testng set size:  17844


In [19]:
print(len(spanish_test_pairs)+len(spanish_train_pairs)+len(spanish_val_pairs))

118964


In [20]:
print(spanish_val_pairs[200])

('Come along with us.', '[start] Síguenos. [end]')


# Removing Puncuations

In [21]:
strip_chars = string.punctuation + "¿"
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")


f"[{re.escape(strip_chars)}]"

f"{5+3}"

'8'

# Vectorizing the English and spanish text pairs

In [22]:
def custom_standardization(input_string):
    lowercase=tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowercase,f"[{re.escape(strip_chars)}]", "")

vocab_size=15000
sequence_length=20

source_vectorization=layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length,
)

target_vectorization=layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length+1,
    standardize=custom_standardization,
)

train_english_texts=[pair[0] for pair in spanish_train_pairs]
train_spanish_texts=[pair[1] for pair in spanish_train_pairs]

source_vectorization.adapt(train_english_texts)
target_vectorization.adapt(train_spanish_texts)

In [23]:
print(train_english_texts[1])
print(train_spanish_texts[1])

print(source_vectorization)

I don't feel like doing anything today.
[start] Hoy no tengo ganas de hacer nada. [end]
<keras.src.layers.preprocessing.text_vectorization.TextVectorization object at 0x7b550fa53910>


In [24]:
print(spanish_train_pairs[1:5])

[("I don't feel like doing anything today.", '[start] Hoy no tengo ganas de hacer nada. [end]'), ('When did he go to Europe?', '[start] ¿Cuándo se fue a Europa? [end]'), ('Where are the apples?', '[start] ¿Dónde están las manzanas? [end]'), ('Which brand do you prefer?', '[start] ¿Qué marca prefieres? [end]')]


# Preparing datasets for the translation task

In [25]:
batch_size=64

def format_dataset(eng,spn):
    eng=source_vectorization(eng)
    spn=target_vectorization(spn)
    return ({"english":eng,
             "spanish":spn[:,:-1],
             },spn[:,1:])

def make_dataset(pairs):
    eng_texts,spn_texts=zip(*pairs)
    eng_texts=list(eng_texts)
    spn_texts=list(spn_texts)
    dataset=tf.data.Dataset.from_tensor_slices((eng_texts,spn_texts))
    dataset=dataset.batch(batch_size)
    dataset=dataset.map(format_dataset,num_parallel_calls=4)

    return dataset.shuffle(2048).prefetch(16).cache()

train_data=make_dataset(spanish_train_pairs)
val_data=make_dataset(spanish_val_pairs)

# print(train_data)
# print(val_data)

for inputs, targets in train_data.take(1):
    print(f"inputs['english'].shape: {inputs['english'].shape}")
    print(f"inputs['spanish'].shape: {inputs['spanish'].shape}")
    print(f"targets.shape: {targets.shape}")



inputs['english'].shape: (64, 20)
inputs['spanish'].shape: (64, 20)
targets.shape: (64, 20)


In [26]:
print(list(train_data.as_numpy_iterator())[50])

({'english': array([[  44,   67, 1774, ...,    0,    0,    0],
       [  49,   14,   28, ...,    0,    0,    0],
       [  19,  376,  164, ...,    0,    0,    0],
       ...,
       [   6,   54,   68, ...,    0,    0,    0],
       [  13,   43,   83, ...,    0,    0,    0],
       [   6,   58,   87, ...,    0,    0,    0]]), 'spanish': array([[  2,  12,  18, ...,   0,   0,   0],
       [  2,  88, 364, ...,   0,   0,   0],
       [  2,  23, 425, ...,   0,   0,   0],
       ...,
       [  2,   8,   7, ...,   0,   0,   0],
       [  2,  59, 175, ...,   0,   0,   0],
       [  2,   8,   7, ...,   0,   0,   0]])}, array([[  12,   18, 1929, ...,    0,    0,    0],
       [  88,  364,   10, ...,    0,    0,    0],
       [  23,  425,   14, ...,    0,    0,    0],
       ...,
       [   8,    7,  136, ...,    0,    0,    0],
       [  59,  175,  133, ...,    0,    0,    0],
       [   8,    7,   26, ...,    0,    0,    0]]))


# Transformers encoder implemented as a subclassed Layer

In [27]:
class TransformerEncoder(layers.Layer):
    def __init__(self,embed_dim,dense_dim,num_heads,**kwargs):
        super().__init__(**kwargs)
        self.embed_dim=embed_dim
        self.dense_dim=dense_dim
        self.num_heads=num_heads
        self.attention=layers.MultiHeadAttention(
           num_heads=num_heads,key_dim=embed_dim)
        self.dense_proj=keras.Sequential(
            [layers.Dense(dense_dim,activation="relu"),
             layers.Dense(embed_dim),]
        )
        self.layernorm_1=layers.LayerNormalization()
        self.layernorm_2=layers.LayerNormalization()

    def call(self,inputs,mask=None):
        if mask is not None:
            mask=mask[:,tf.newaxis,:]
        attention_output=self.attention(
            inputs,inputs,attention_mask=mask
        )
        project_input=self.layernorm_1(inputs+attention_output)
        project_output=self.dense_proj(project_input)

        return self.layernorm_2(project_input+project_output)

    def get_config(self):
        config=super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "dense_dim": self.dense_dim,
        })
        return config

# Transformer decorder

In [28]:
class TransformerDecoder(layers.Layer):
    def __init__(self,embed_dim,dense_dim,num_heads,**kwargs):
        super().__init__(**kwargs)
        self.embed_dim=embed_dim
        self.dense_dim=dense_dim
        self.num_heads=num_heads
        self.attention_1=layers.MultiHeadAttention(
            num_heads=num_heads,key_dim=embed_dim)
        self.attention_2=layers.MultiHeadAttention(
            num_heads=num_heads,key_dim=embed_dim)
        self.dense_proj=keras.Sequential(
            [layers.Dense(dense_dim,activation="relu"),
             layers.Dense(embed_dim),]
        )
        self.layernorm_1=layers.LayerNormalization()
        self.layernorm_2=layers.LayerNormalization()
        self.layernorm_3=layers.LayerNormalization()
        self.supports_masking=True

    def get_config(self):
        config=super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "dense_dim": self.dense_dim,
        })
        return config

    def get_casual_attention_mask(self,inputs):
        input_shape=tf.shape(inputs)
        batch_size,sequence_length=input_shape[0],input_shape[1]
        i=tf.range(sequence_length)[:,tf.newaxis]
        j=tf.range(sequence_length)
        mask=tf.cast(i>=j,dtype="int32")
        mask=tf.reshape(mask,(1,input_shape[1],input_shape[1]))
        mult=tf.concat(
            [tf.expand_dims(batch_size,-1),
             tf.constant([1,1],dtype=tf.int32)],axis=0)

        return tf.tile(mask,mult)


    def call(self,inputs,encorder_outputs,mask=None):
        casual_mask=self.get_casual_attention_mask(inputs)
        if mask is not None:
            padding_mask=tf.cast(
                mask[:,tf.newaxis,:],dtype="int32"
            )
            padding_mask=tf.minimum(padding_mask,casual_mask)
        else:
            padding_mask=mask
        attention_output_1=self.attention_1(
            query=inputs,
            value=inputs,
            key=inputs,
            attention_mask=casual_mask
        )
        attention_output_1=self.layernorm_1(inputs+attention_output_1)
        attention_output_2=self.attention_2(
            query=attention_output_1,
            value=encorder_outputs,
            key=encorder_outputs,
            attention_mask=padding_mask,
        )
        attention_output_2=self.layernorm_2(
            attention_output_1+attention_output_2
        )
        proj_output=self.dense_proj(attention_output_2)
        return self.layernorm_3(attention_output_2+proj_output)




# Positional Encoding

In [29]:
class PositionalEmbedding(layers.Layer):
    def __init__(self,sequence_length,input_dim,output_dim,**kwargs):
        super().__init__(**kwargs)
        self.token_embeddings=layers.Embedding(
            input_dim=input_dim,output_dim=output_dim)
        self.position_embeddings=layers.Embedding(
            input_dim=sequence_length,output_dim=output_dim)
        self.sequence_length=sequence_length
        self.input_dim=input_dim
        self.output_dim=output_dim

    def call(self,inputs):
        length=tf.shape(inputs)[-1]
        positions=tf.range(start=0,limit=length,delta=1)
        embedded_tokens=self.token_embeddings(inputs)
        embedded_positions=self.position_embeddings(positions)

        return embedded_tokens + embedded_positions


    def compute_mask(self, inputs, mask=None):
        return tf.not_equal(inputs, 0)

    def get_config(self):
        config=super(PositionalEmbedding,self).get_config()
        config.update({
            "output_dim": self.output_dim,
            "sequence_length": self.sequence_length,
            "input_dim": self.input_dim,
        })
        return config

In [30]:
# from tensorflow.python.framework.ops import disable_eager_execution
# disable_eager_execution()

# End-to-End Transformer

In [31]:
embed_dim=256
dense_dim=2048
num_heads=8

encoder_inputs=keras.Input(shape=(None,), dtype="int64", name="english")
x=PositionalEmbedding(sequence_length,vocab_size,embed_dim)(encoder_inputs)
encoder_outputs=TransformerEncoder(embed_dim,dense_dim,num_heads)(x)

decorder_inputs=keras.Input(shape=(None,),dtype="int64",name="spanish")
x=PositionalEmbedding(sequence_length,vocab_size,embed_dim)(decorder_inputs)
x=TransformerDecoder(embed_dim,dense_dim,num_heads)(x,encoder_outputs)
x=layers.Dropout(0.5)(x)
decorder_outputs=layers.Dense(vocab_size,activation="softmax")(x)
transformer_spanish=keras.Model([encoder_inputs,decorder_inputs],decorder_outputs)

In [32]:
transformer_spanish.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 english (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 spanish (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 positional_embedding (Posi  (None, None, 256)            3845120   ['english[0][0]']             
 tionalEmbedding)                                                                                 
                                                                                                  
 positional_embedding_1 (Po  (None, None, 256)            3845120   ['spanish[0][0]']         

# Training the sequence-to-sequence Transformer

In [33]:
transformer_spanish.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

transformer_spanish.fit(train_data,epochs=50,validation_data=val_data)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x7b54b391e7d0>

In [49]:
import numpy as np

ger_vocab = target_vectorization.get_vocabulary()
ger_index_lookup = dict(zip(range(len(ger_vocab)), ger_vocab))
max_decoded_sentence_length = 20

def decode_sequence(input_sentence):
    tokenized_input_sentence = source_sentence = source_vectorization([input_sentence])
    decoded_sentence = "[start]"

    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = target_vectorization([decoded_sentence])[:, :-1]
        predictions = transformer_spanish([tokenized_input_sentence, tokenized_target_sentence])
        sampled_token_index = np.argmax(predictions[0, i, :])
        sampled_token = ger_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token
        if sampled_token == "[end]":
            break

    return decoded_sentence

In [50]:
ger_eng_texts = [pair[0] for pair in spanish_test_pairs]
for _ in range(20):
    input_sentence = random.choice(ger_eng_texts)
    print("-")
    print(input_sentence)
    print(decode_sequence(input_sentence))

-
There are no gods.
[start] no hay ningún peor [end]
-
Tom put his wallet on top of the dresser.
[start] tom puso su caja debajo de la [UNK] [end]
-
We should call Tom.
[start] deberíamos llamar a tom [end]
-
I'll be there.
[start] estaré allí [end]
-
What do you want me to say?
[start] qué quieres que yo diga [end]
-
Yesterday, I bought a book.
[start] compré un libro ayer [end]
-
I was concerned.
[start] estaba preocupado [end]
-
We have arrived safe and sound.
[start] hemos llegado a salvo y tom [end]
-
Are you saying that Tom can't understand French at all?
[start] estás diciendo que no esperaba a tom nada de francés [end]
-
He graduated from Harvard.
[start] Él se ha decidido en cantar [end]
-
We compared his work with hers.
[start] [UNK] su trabajo con el suyo [end]
-
Finish this.
[start] acabó esto [end]
-
She's smarter than him.
[start] ella es lo inteligente que él [end]
-
Where do you think Tom went?
[start] a dónde crees que fue tom [end]
-
He was given up for dead.
[start]

In [37]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [38]:
transformer_spanish.save('/content/drive/My Drive/Translators/spanish_model')