In [51]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Attention
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import os
import re
import string
from nltk.translate.bleu_score import sentence_bleu
import warnings
warnings.filterwarnings('ignore')

In [52]:
# Download and prepare the dataset

!wget https://www.statmt.org/europarl/v7/nl-en.tgz
!tar -xzf nl-en.tgz

en_path = "europarl-v7.nl-en.en"
nl_path = "europarl-v7.nl-en.nl"

max_samples = 50000

def read_lines(file_path, max_lines):
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = [line.strip() for i, line in enumerate(file) if i < max_lines]
    return lines

english_lines = read_lines(en_path, max_samples)
dutch_lines = read_lines(nl_path, max_samples)

print(f"Loaded {len(english_lines)} English sentences")
print(f"Loaded {len(dutch_lines)} Dutch sentences")
print("\nExample pairs:")
for i in range(3):
    print(f"English: {english_lines[i]}")
    print(f"Dutch: {dutch_lines[i]}")
    print()

--2025-07-06 16:35:31--  https://www.statmt.org/europarl/v7/nl-en.tgz
Resolving www.statmt.org (www.statmt.org)... 129.215.32.28
Connecting to www.statmt.org (www.statmt.org)|129.215.32.28|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 199077856 (190M) [application/x-gzip]
Saving to: ‘nl-en.tgz.3’


2025-07-06 16:35:43 (17.9 MB/s) - ‘nl-en.tgz.3’ saved [199077856/199077856]

Loaded 50000 English sentences
Loaded 50000 Dutch sentences

Example pairs:
English: Resumption of the session
Dutch: Hervatting van de zitting

English: I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.
Dutch: Ik verklaar de zitting van het Europees Parlement, die op vrijdag 17 december werd onderbroken, te zijn hervat. Ik wens u allen een gelukkig nieuwjaar en hoop dat u een goede vakantie heeft gehad.

English: Although, as you wi

In [53]:
# Text preprocessing

def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = ' '.join(text.split())
    text = '<start> ' + text + ' <end>'
    return text

english_lines = [preprocess_text(line) for line in english_lines]
dutch_lines = [preprocess_text(line) for line in dutch_lines]

train_en, test_en, train_nl, test_nl = train_test_split(
    english_lines, dutch_lines, test_size=0.2, random_state=42)

print(f"Training samples: {len(train_en)}")
print(f"Test samples: {len(test_en)}")

Training samples: 40000
Test samples: 10000


In [54]:
# Text vectorization (creating vocabularies)

def build_vectorizer(lines, max_vocab_size=20000):
    vectorizer = TextVectorization(
        max_tokens=max_vocab_size,
        standardize=None,
        split='whitespace',
        output_mode='int',
        output_sequence_length=None
    )
    vectorizer.adapt(lines)
    return vectorizer

en_vectorizer = build_vectorizer(train_en)
en_vocab = en_vectorizer.get_vocabulary()
en_vocab_size = len(en_vocab)
print(f"English vocabulary size: {en_vocab_size}")

nl_vectorizer = build_vectorizer(train_nl)
nl_vocab = nl_vectorizer.get_vocabulary()
nl_vocab_size = len(nl_vocab)
print(f"Dutch vocabulary size: {nl_vocab_size}")

en_word2idx = {word: idx for idx, word in enumerate(en_vocab)}
nl_word2idx = {word: idx for idx, word in enumerate(nl_vocab)}

en_idx2word = {idx: word for idx, word in enumerate(en_vocab)}
nl_idx2word = {idx: word for idx, word in enumerate(nl_vocab)}

English vocabulary size: 20000
Dutch vocabulary size: 20000


In [55]:
# Prepare datasets

def prepare_dataset(encoder_inputs, decoder_inputs, batch_size=16):
    encoder_inputs = en_vectorizer(encoder_inputs)
    decoder_inputs = nl_vectorizer(decoder_inputs)

    decoder_targets = decoder_inputs[:, 1:]
    decoder_inputs = decoder_inputs[:, :-1]

    dataset = tf.data.Dataset.from_tensor_slices(
        ((encoder_inputs, decoder_inputs), decoder_targets))
    dataset = dataset.shuffle(2048).batch(batch_size).prefetch(16)
    return dataset

train_dataset = prepare_dataset(train_en, train_nl)
val_dataset = prepare_dataset(test_en, test_nl)

for (encoder_inputs, decoder_inputs), decoder_targets in train_dataset.take(1):
    print("Encoder inputs shape:", encoder_inputs.shape)
    print("Decoder inputs shape:", decoder_inputs.shape)
    print("Decoder targets shape:", decoder_targets.shape)

    print("\nFirst example:")
    print("English:", " ".join([en_idx2word[i] for i in encoder_inputs[0].numpy() if i != 0]))
    print("Dutch input:", " ".join([nl_idx2word[i] for i in decoder_inputs[0].numpy() if i != 0]))
    print("Dutch target:", " ".join([nl_idx2word[i] for i in decoder_targets[0].numpy() if i != 0]))

Encoder inputs shape: (16, 165)
Decoder inputs shape: (16, 153)
Decoder targets shape: (16, 153)

First example:
English: <start> berger report [UNK] <end>
Dutch input: <start> verslagberger a500072000 <end>
Dutch target: verslagberger a500072000 <end>


In [56]:
# Build the Encoder

class Encoder(Model):
    def __init__(self, vocab_size, embedding_dim, hidden_units):
        super(Encoder, self).__init__()
        self.embedding = Embedding(vocab_size, embedding_dim, mask_zero=True)
        self.lstm = LSTM(hidden_units, return_sequences=True, return_state=True)

    def call(self, inputs):
        x = self.embedding(inputs)
        output, state_h, state_c = self.lstm(x)
        return output, state_h, state_c

embedding_dim = 256
hidden_units = 512

encoder = Encoder(en_vocab_size, embedding_dim, hidden_units)

sample_encoder_output, sample_encoder_state_h, sample_encoder_state_c = encoder(encoder_inputs)
print("Encoder output shape:", sample_encoder_output.shape)
print("Encoder state_h shape:", sample_encoder_state_h.shape)
print("Encoder state_c shape:", sample_encoder_state_c.shape)

Encoder output shape: (16, 165, 512)
Encoder state_h shape: (16, 512)
Encoder state_c shape: (16, 512)


In [57]:
# Decoder with Bahdanau Attention

class Decoder(Model):
    def __init__(self, vocab_size, embedding_dim, hidden_units):
        super(Decoder, self).__init__()
        self.embedding = Embedding(vocab_size, embedding_dim, mask_zero=True)
        self.lstm = LSTM(hidden_units, return_sequences=True, return_state=True)

        self.W1 = Dense(hidden_units)
        self.W2 = Dense(hidden_units)
        self.V = Dense(1)

        self.dense = Dense(vocab_size)

    def call(self, inputs, initial_state):

        encoder_output, state_h, state_c = initial_state

        x = self.embedding(inputs)  # shape: (batch_size, 1, embedding_dim)

        lstm_output, state_h, state_c = self.lstm(x, initial_state=[state_h, state_c])

        decoder_output_with_attention = tf.repeat(lstm_output, repeats=encoder_output.shape[1], axis=1)

        score = self.V(tf.nn.tanh(
            self.W1(encoder_output) + self.W2(decoder_output_with_attention)
        ))

        attention_weights = tf.nn.softmax(score, axis=1)

        context_vector = attention_weights * encoder_output
        context_vector = tf.reduce_sum(context_vector, axis=1, keepdims=True)

        attention_output = tf.concat([context_vector, lstm_output], axis=-1)

        output = self.dense(attention_output)

        return output, state_h, state_c

decoder = Decoder(nl_vocab_size, embedding_dim, hidden_units)

sample_decoder_output, _, _ = decoder(
    tf.random.uniform((16, 1), maxval=nl_vocab_size, dtype=tf.int64),
    initial_state=[sample_encoder_output, sample_encoder_state_h, sample_encoder_state_c])
print("Decoder output shape:", sample_decoder_output.shape)

Decoder output shape: (16, 1, 20000)


In [58]:
# Building the complete NMT model

class NMTModel(Model):
    def __init__(self, encoder, decoder):
        super(NMTModel, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def call(self, inputs):
        encoder_input, decoder_input = inputs

        encoder_output, state_h, state_c = self.encoder(encoder_input)
        decoder_state = [encoder_output, state_h, state_c]

        batch_size = tf.shape(encoder_input)[0]
        max_length = tf.shape(decoder_input)[1]

        outputs = tf.TensorArray(tf.float32, size=max_length)

        t0 = tf.constant(0)
        cond = lambda t, *_: t < max_length

        def body(t, outputs, state_h, state_c):
            current_input = decoder_input[:, t:t+1]

            output, state_h, state_c = self.decoder(
                current_input, [encoder_output, state_h, state_c])

            outputs = outputs.write(t, tf.squeeze(output, axis=1))
            return t + 1, outputs, state_h, state_c

        _, outputs, _, _ = tf.while_loop(
            cond, body, loop_vars=[t0, outputs, state_h, state_c],
            parallel_iterations=32)

        outputs = outputs.stack()
        outputs = tf.transpose(outputs, [1, 0, 2])
        return outputs


model = NMTModel(encoder, decoder)

model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'])

sample_output = model((encoder_inputs, decoder_inputs))
print("Model output shape:", sample_output.shape)

Model output shape: (16, 153, 20000)


In [None]:
# Training the model

checkpoint_cb = keras.callbacks.ModelCheckpoint(
    "nmt_model.keras", save_best_only=True)

early_stopping_cb = keras.callbacks.EarlyStopping(
    patience=5, restore_best_weights=True)

AUTOTUNE = tf.data.AUTOTUNE
train_dataset = train_dataset.prefetch(AUTOTUNE)
val_dataset = val_dataset.prefetch(AUTOTUNE)

history = model.fit(
    train_dataset,
    epochs=30,
    validation_data=val_dataset,
    callbacks=[checkpoint_cb, early_stopping_cb])

plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.legend()
plt.title('Loss')

plt.subplot(1, 2, 2)
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.legend()
plt.title('Accuracy')

plt.show()

Epoch 1/30
[1m1074/2500[0m [32m━━━━━━━━[0m[37m━━━━━━━━━━━━[0m [1m1:45:22[0m 4s/step - accuracy: 0.8388 - loss: 1.4884

In [None]:
# Inference (Translation) functions

class Translator:
    def __init__(self, encoder, decoder, en_vectorizer, nl_vectorizer, nl_idx2word, nl_word2idx):
        """
        Translator class for inference on trained NMT model.

        Args:
            encoder: Trained encoder model.
            decoder: Trained decoder model.
            en_vectorizer: TextVectorization for English.
            nl_vectorizer: TextVectorization for Dutch.
            nl_idx2word: Dictionary mapping Dutch token IDs to words.
            nl_word2idx: Dictionary mapping Dutch words to token IDs.
        """
        self.encoder = encoder
        self.decoder = decoder
        self.en_vectorizer = en_vectorizer
        self.nl_vectorizer = nl_vectorizer
        self.nl_idx2word = nl_idx2word
        self.nl_word2idx = nl_word2idx
        self.nl_vocab_size = len(nl_idx2word)

    def translate(self, input_sentence, max_length=50):
        """
        Translate a single English sentence to Dutch.

        Args:
            input_sentence (str): English sentence.
            max_length (int): Max decoding steps to prevent infinite loops.

        Returns:
            str: Generated Dutch translation.
        """
        input_sentence = preprocess_text(input_sentence)
        encoder_input = self.en_vectorizer([input_sentence])  # Shape: (1, seq_len)

        encoder_output, state_h, state_c = self.encoder(encoder_input)
        decoder_state = [encoder_output, state_h, state_c]

        decoder_input = tf.expand_dims([self.nl_word2idx['<start>']], 0)  # Shape: (1, 1)

        decoded_sentence = []

        for _ in range(max_length):

            output, state_h, state_c = self.decoder(decoder_input, decoder_state)
            decoder_state = [encoder_output, state_h, state_c]

            predicted_id = tf.argmax(output, axis=-1).numpy()[0, 0].item()
            predicted_word = self.nl_idx2word.get(predicted_id, '<unk>')

            if predicted_word == '<end>':
                break

            decoded_sentence.append(predicted_word)
            decoder_input = tf.expand_dims([predicted_id], 0)

        return ' '.join(decoded_sentence)

In [None]:
translator = Translator(encoder, decoder, en_vectorizer, nl_vectorizer, nl_idx2word, nl_word2idx)
import inspect
print(inspect.getsource(translator.translate))

In [None]:
# Evaluation with BLEU score

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def evaluate_translations(translator, test_sentences, reference_translations, n=100):
    """
    Evaluate model translations on a subset of test data using BLEU.

    Args:
        translator: Translator object with translate() method.
        test_sentences (list of str): English sentences to translate.
        reference_translations (list of str): Corresponding Dutch references.
        n (int): Number of samples to evaluate.

    Returns:
        float: Average BLEU score.
    """
    if n < len(test_sentences):
        np.random.seed(42)
        indices = np.random.choice(len(test_sentences), n, replace=False)
        test_sentences = [test_sentences[i] for i in indices]
        reference_translations = [reference_translations[i] for i in indices]

    smoothie = SmoothingFunction().method4
    bleu_scores = []

    for i, (src, ref) in enumerate(zip(test_sentences, reference_translations)):

        translation = translator.translate(src)

        ref = ref.replace('<start>', '').replace('<end>', '').strip()
        ref_tokens = ref.split()

        trans_tokens = translation.split()

        score = sentence_bleu([ref_tokens], trans_tokens, smoothing_function=smoothie)
        bleu_scores.append(score)

        if (i + 1) % 10 == 0:
            print(f"Evaluated {i+1}/{n} sentences", end='\r')

    avg_bleu = np.mean(bleu_scores)
    print(f"\nAverage BLEU score over {n} samples: {avg_bleu:.4f}")
    return avg_bleu

bleu_score = evaluate_translations(translator, test_en, test_nl)

In [None]:
# Output

def interactive_translation(translator):
    print("English to Dutch Translator (type 'quit' to exit)")
    while True:
        sentence = input("Enter English sentence: ")
        if sentence.lower() == 'quit':
            break
        translation = translator.translate(sentence)
        print(f"Translation: {translation}\n")

interactive_translation(translator)