# The data

https://www.kaggle.com/datasets/mohamedlotfy50/wmt-2014-english-french/data
There over 4.5 million sentence pairs available. However, I will only use 25,000 pairs due to computational feasiblility.


In [1]:
import pandas as pd
import numpy as np

n_sentences = 25000

data = pd.read_csv(
    "./data/en-fr/wmt14_translate_fr-en_train.csv", nrows=n_sentences
).dropna()

data.head()

Unnamed: 0,en,fr
0,Resumption of the session,Reprise de la session
1,I declare resumed the session of the European ...,Je déclare reprise la session du Parlement eur...
2,"Although, as you will have seen, the dreaded '...","Comme vous avez pu le constater, le grand ""bog..."
3,You have requested a debate on this subject in...,Vous avez souhaité un débat à ce sujet dans le...
4,"In the meantime, I should like to observe a mi...","En attendant, je souhaiterais, comme un certai..."


# spliting the sentences into tokens


In [2]:
import random

original_en_sentences = [sent.strip().split(" ") for sent in data["en"]]
original_fr_sentences = [sent.strip().split(" ") for sent in data["fr"]]


for i in range(3):
    index = random.randint(0, 10000)
    print("English: ", " ".join(original_en_sentences[index]))
    print("French: ", " ".join(original_fr_sentences[index]), "\n")

English:  It has unfortunately been observed, I have to say, that the European Union has failed to reach agreement internally on specific tax measures. This indicates how difficult these dossiers are.
French:  Il a été remarqué, malheureusement, que l' Union européenne n' est parvenue à aucun accord interne concernant certaines mesures fiscales, ce qui prouve combien ces dossiers sont délicats. 

English:  Madam President, Mr President of the Commission, 'Shaping the new Europe' is an ambitious objective both for the Commission and for us all.
French:  Madame la Présidente, Monsieur le Président de la Commission, chers collègues, Shaping the new Europe : voilà qui représente un projet ambitieux, pour la Commission comme pour nous tous. 

English:  The Commission will endeavour, for all these specific guidelines, to quantify the data and to obtain specific information based on the implementation of the new Community framework for 2000 to 2006.
French:  Pour toutes ces lignes directrices

# Adding special tokens

#### I will add "< s >" to mark the start of a sentence and "< /s >" to mark the end of a sentence

This way
we prediction can be done for an arbitrary number of time steps. Using < s > as the starting token gives a
way to signal to the decoder that it should start predicting tokens from the target language.

if < /s > token is not used to mark the end of a sentence, the decoder cannot be signaled to
end a sentence. This can lead the model to enter an infinite loop of predictions.


In [3]:
en_sentences = [["<s>"] + sent + ["</s>"] for sent in original_en_sentences]
fr_sentences = [["<s>"] + sent + ["</s>"] for sent in original_fr_sentences]

for i in range(2):
    index = random.randint(0, 10000)
    print("English: ", " ".join(en_sentences[index]))
    print("German: ", " ".join(fr_sentences[index]), "\n")

English:  <s> In this respect, the excellent speech by Mrs Thyssen cannot be improved upon. </s>
German:  <s> Sur ce point, il n'y a rien à ajouter à l'excellente intervention de Mme Thyssen. </s> 

English:  <s> VOTE </s>
German:  <s> VOTES </s> 



# splitting training and validation dataset

#### 80% training, 10% validation and 10% for testing


In [4]:
from sklearn.model_selection import train_test_split
import numpy as np

(
    train_en_sentences,
    valid_test_en_sentences,
    train_fr_sentences,
    valid_test_fr_sentences,
) = train_test_split(en_sentences, fr_sentences, test_size=0.2)


(valid_en_sentences, test_en_sentences, valid_fr_sentences, test_fr_sentences) = (
    train_test_split(valid_test_en_sentences, valid_test_fr_sentences, test_size=0.5)
)


print(train_en_sentences[1])
print(train_fr_sentences[1])
print("\n")
print(test_en_sentences[0])
print(test_fr_sentences[0])

['<s>', 'Duff', 'and', 'Voggenhuber,', 'on', 'the', 'other', 'hand,', 'think', 'of', 'themselves', 'as', 'true', 'European', 'Parliamentarians', 'whose', 'task', 'it', 'is', 'to', 'further', 'the', 'interests', 'of', 'Europe', 'as', 'a', 'whole.', '</s>']
['<s>', 'MM.', 'Duff', 'et', 'Voggenhuber,', 'par', 'contre,', 'se', 'comportent', 'comme', 'des', 'parlementaires', 'européens', 'authentiques', 'poussés', 'par', "l'intérêt", 'commun', 'européen.', '</s>']


['<s>', 'In', 'Nice,', 'prior', 'to', 'the', 'great', 'enlargement,', 'I', 'would', 'ask', 'you,', 'please,', 'to', 'see', 'this', 'text', 'as', 'anticipating', 'events.', '</s>']
['<s>', 'À', 'Nice,', 'avant', 'le', 'grand', 'élargissement,', 'regardez', 'bien', '-', 'je', 'vous', 'en', 'prie,', 'je', 'vous', 'en', 'supplie', '-', 'ce', 'texte', 'de', 'manière', 'prospective.', '</s>']


### Defining sequence leghts fot the two languages


In [5]:
# Getting some basic statistics from the data

# convert train_en_sentences to a pandas series
pd.Series(train_en_sentences).str.len().describe(percentiles=[0.05, 0.5, 0.95])

count    20000.000000
mean        27.557300
std         15.738945
min          3.000000
5%           8.000000
50%         24.000000
95%         58.000000
max        150.000000
dtype: float64

In [6]:
pd.Series(train_fr_sentences).str.len().describe(percentiles=[0.05, 0.5, 0.95])

count    20000.000000
mean        28.873900
std         16.785401
min          3.000000
5%           8.000000
50%         26.000000
95%         61.000000
max        154.000000
dtype: float64

# from the train data statistics above, 95% of the english sentences have lengths of 57 while in the french, 95 % of sentences have lengths of 60


### padding the sentences with pad_sequences from keras


In [7]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

n_en_seq_length = 60
n_fr_seq_length = 60
unk_token = "<unk>"

train_en_sentences_padded = pad_sequences(
    train_en_sentences,
    maxlen=n_en_seq_length,
    # value=unk_token,
    dtype=object,
    truncating="post",
    padding="post",
)

valid_en_sentences_padded = pad_sequences(
    valid_en_sentences,
    maxlen=n_en_seq_length,
    # value=unk_token,
    dtype=object,
    truncating="post",
    padding="post",
)

test_en_sentences_padded = pad_sequences(
    test_en_sentences,
    maxlen=n_en_seq_length,
    # value=unk_token,
    dtype=object,
    truncating="post",
    padding="post",
)


train_fr_sentences_padded = pad_sequences(
    train_fr_sentences,
    maxlen=n_fr_seq_length,
    # value=unk_token,
    dtype=object,
    truncating="post",
    padding="post",
)

valid_fr_sentences_padded = pad_sequences(
    valid_fr_sentences,
    maxlen=n_fr_seq_length,
    # value=unk_token,
    dtype=object,
    truncating="post",
    padding="post",
)

test_fr_sentences_padded = pad_sequences(
    test_fr_sentences,
    maxlen=n_fr_seq_length,
    # value=unk_token,
    dtype=object,
    truncating="post",
    padding="post",
)

print(train_en_sentences_padded[1])

2024-10-14 19:53:30.275188: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-14 19:53:30.421815: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-14 19:53:30.475915: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-14 19:53:30.491911: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-14 19:53:30.605540: I tensorflow/core/platform/cpu_feature_guar

['<s>' 'Duff' 'and' 'Voggenhuber,' 'on' 'the' 'other' 'hand,' 'think' 'of'
 'themselves' 'as' 'true' 'European' 'Parliamentarians' 'whose' 'task'
 'it' 'is' 'to' 'further' 'the' 'interests' 'of' 'Europe' 'as' 'a'
 'whole.' '</s>' 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0]


# Converting to token IDs


In [8]:
from tensorflow.keras.layers import TextVectorization
import os

# using text vectorization
# text_vectorizer_en = TextVectorization(output_mode="int")
# text_vectorizer_fr = TextVectorization(output_mode="int")
# text_vectorizer_en.adapt(data["en"])
# text_vectorizer_fr.adapt(data["fr"])

en_vocabulary = []
with open(os.path.join("./data/en-fr", "vocab.en"), "r", encoding="utf-8") as en_file:
    for ri, row in enumerate(en_file):

        en_vocabulary.append(row.strip())

fr_vocabulary = []
with open(os.path.join("./data/en-fr", "vocab.fr"), "r", encoding="utf-8") as en_file:
    for ri, row in enumerate(en_file):

        fr_vocabulary.append(row.strip())

text_vectorizer_en = TextVectorization(output_mode="int")
text_vectorizer_fr = TextVectorization(output_mode="int")
text_vectorizer_en.adapt(en_vocabulary)
text_vectorizer_fr.adapt(fr_vocabulary)


en_vocabulary = text_vectorizer_en.get_vocabulary()
fr_vocabulary = text_vectorizer_fr.get_vocabulary()

I0000 00:00:1728950013.095001     996 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1728950013.212370     996 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1728950013.212424     996 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1728950013.217281     996 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1728950013.217339     996 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:0

In [9]:
en_unk_token = en_vocabulary.pop(1)
fr_unk_token = fr_vocabulary.pop(1)

en_unk_token, fr_unk_token

('[UNK]', '[UNK]')

In [10]:
import tensorflow as tf

pad_token = "[PAD]"

# English look up layer
en_lookup_layer = tf.keras.layers.StringLookup(
    vocabulary=en_vocabulary,
    oov_token=en_unk_token,
    mask_token=pad_token,
    pad_to_max_tokens=False,
)

# French look up layer
fr_lookup_layer = tf.keras.layers.StringLookup(
    vocabulary=fr_vocabulary,
    oov_token=en_unk_token,
    mask_token=pad_token,
    pad_to_max_tokens=False,
)

In [11]:
# dir(en_lookup_layer)
# en_lookup_layer.get_vocabulary()

# Defining the encoder


In [12]:
# takes n_en_seq_length of sentences
encoder_input = tf.keras.layers.Input(shape=(n_en_seq_length,), dtype=tf.string)

# using lookup layer into word IDs
encoder_wid_out = en_lookup_layer(encoder_input)

"""
With the tokens converted into IDs, route the generated word IDs to a token embedding layer.
Pass in the size of the vocabulary (derived from the en_lookup_layer's get_vocabulary()
method) and the embedding size (128) and finally then ask the layer to mask any zero-valued inputs
as they don’t contain any information:

"""
en_full_vocab_size = len(en_lookup_layer.get_vocabulary())
encoder_emb_out = tf.keras.layers.Embedding(en_full_vocab_size, 128, mask_zero=True)(
    encoder_wid_out
)


encoder_gru_out, encoder_gru_last_state = tf.keras.layers.GRU(
    256, return_sequences=True, return_state=True
)(encoder_emb_out)

encoder = tf.keras.models.Model(inputs=encoder_input, outputs=encoder_gru_out)

# Defining the Decoder with teacher forcing


In [14]:
decoder_input = tf.keras.layers.Input(shape=(n_fr_seq_length - 1,), dtype=tf.string)

# convert tokens to IDs using the de_lookup_layer
decoder_wid_out = fr_lookup_layer(decoder_input)

# decoder embedding layer
fr_full_vocab_size = len(fr_lookup_layer.get_vocabulary())
decoder_emb_out = tf.keras.layers.Embedding(fr_full_vocab_size, 128, mask_zero=True)(
    decoder_wid_out
)

# decoder layer>>> pass the last state of the encoder into the decoder
decoder_gru_out = tf.keras.layers.GRU(256, return_sequences=True)(
    decoder_emb_out, initial_state=encoder_gru_last_state
)