Build a basic Seq2Seq (Encoder–Decoder) model using LSTM layers in TensorFlow 2 to translate short English sentences into French.

In [1]:
import tensorflow as tf
import numpy as np

In [2]:
#Sample parallel corpus
english_sentences = ["hello", "how are you", "thank you", "good night"]
french_sentences = ["bonjour", "comment ça va", "merci", "bonne nuit"]

In [24]:
#Tokenize source (English)
src_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
src_tokenizer.fit_on_texts(english_sentences)
src_sequences = src_tokenizer.texts_to_sequences(english_sentences)
src_word_index = src_tokenizer.word_index
src_vocab_size = len(src_word_index) + 1

In [25]:
#Tokenize target (French) with <start> and <end> tokens
french_sentences = [f"<start> {s} <end>" for s in french_sentences]
tgt_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
tgt_tokenizer.fit_on_texts(french_sentences)
tgt_sequences = tgt_tokenizer.texts_to_sequences(french_sentences)
tgt_word_index = tgt_tokenizer.word_index
tgt_index_word = tgt_tokenizer.index_word
tgt_vocab_size = len(tgt_word_index) + 1

The pad_sequences function in TensorFlow (specifically, from tf.keras.preprocessing.sequence) is used to ensure that all sequences (lists of word indices) are the same length, which is required for efficient processing in deep learning models.

In [26]:
#Pad sequences
src_padded = tf.keras.preprocessing.sequence.pad_sequences(src_sequences, padding='post')
tgt_padded = tf.keras.preprocessing.sequence.pad_sequences(tgt_sequences, padding='post')

In [27]:
#Split target into decoder input and output
decoder_input = tgt_padded[:, :-1]
decoder_target = tf.keras.utils.to_categorical(tgt_padded[:, 1:], num_classes=tgt_vocab_size)

In [28]:
embedding_dim = 64
latent_dim = 64

In [29]:
# Encoder
encoder_inputs = tf.keras.Input(shape=(None,))
enc_emb = tf.keras.layers.Embedding(src_vocab_size, embedding_dim)(encoder_inputs)
_, state_h, state_c = tf.keras.layers.LSTM(latent_dim, return_state=True)(enc_emb)

# Decoder
decoder_inputs = tf.keras.Input(shape=(None,))
dec_emb_layer = tf.keras.layers.Embedding(tgt_vocab_size, embedding_dim)
dec_emb = dec_emb_layer(decoder_inputs)
decoder_lstm = tf.keras.layers.LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=[state_h, state_c])
decoder_dense = tf.keras.layers.Dense(tgt_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)


In [30]:
#Model compilation and training
model = tf.keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit([src_padded, decoder_input], decoder_target, epochs=300, verbose=0)

<keras.src.callbacks.history.History at 0x25317c34fd0>

In [31]:
# Encoder model
encoder_model = tf.keras.Model(encoder_inputs, [state_h, state_c])

# Decoder model
decoder_state_input_h = tf.keras.Input(shape=(latent_dim,))
decoder_state_input_c = tf.keras.Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_inputs_single = tf.keras.Input(shape=(1,))
dec_emb_inf = dec_emb_layer(decoder_inputs_single)

decoder_outputs_inf, state_h_inf, state_c_inf = decoder_lstm(
    dec_emb_inf, initial_state=decoder_states_inputs)
decoder_outputs_inf = decoder_dense(decoder_outputs_inf)

decoder_model = tf.keras.Model(
    [decoder_inputs_single] + decoder_states_inputs,
    [decoder_outputs_inf, state_h_inf, state_c_inf]
)

In [32]:
def translate(input_text):
    seq = src_tokenizer.texts_to_sequences([input_text])
    seq = tf.keras.preprocessing.sequence.pad_sequences(seq, maxlen=src_padded.shape[1], padding='post')

    if not any(seq[0]):
        return "[Unknown Input]"

    h, c = encoder_model.predict(seq)

    dec_input = np.array([[tgt_word_index['<start>']]])
    translated = []

    for _ in range(10):
        output_tokens, h, c = decoder_model.predict([dec_input, h, c])
        token = np.argmax(output_tokens[0, -1, :])
        word = tgt_index_word.get(token, '')
        if word == '<end>':
            break
        translated.append(word)
        dec_input = np.array([[token]])

    return ' '.join(translated)

In [33]:
print("Translate 'hello':", translate("hello"))
print("Translate 'thank you':", translate("thank you"))
print("Translate 'how are you':", translate("how are you"))
print("Translate 'good night':", translate("good night"))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 388ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 336ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 110ms/step
Translate 'hello': <start> bonjour
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 104ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
Translate 'thank you': <start> merci
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0