# Chapter 11: Sequence-to-sequence learning: Part 1

This notebook reproduces the code and summarizes the theoretical concepts from Chapter 11 of *'TensorFlow in Action'* by Thushan Ganegedara.

This chapter introduces **sequence-to-sequence (seq2seq)** models, a powerful architecture for tasks that map an input sequence of one length to an output sequence of another length (e.g., machine translation).

We will cover:
1.  **Data Preparation**: Loading and processing a parallel English-to-German text corpus.
2.  **The `TextVectorization` Layer**: Using this Keras layer to build an end-to-end model that accepts raw strings.
3.  **Seq2seq Model Architecture**: Building an encoder-decoder model using GRUs (Gated Recurrent Units).
4.  **Training (Teacher Forcing)**: How to train a seq2seq model using the "teacher forcing" technique.
5.  **Inference Model**: Building a separate model for generating new translations recursively.

---

## 11.1 Understanding the machine translation data

We will use an English-to-German parallel corpus from `manythings.org`. The data is a text file where each line contains an English sentence, a tab, and its German translation.

**Note**: The book requires you to manually download the file `deu-eng.zip` from `http://www.manythings.org/anki/deu-eng.zip` and place it in a `data` folder.

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models
import tensorflow.keras.backend as K
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.models import load_model # Added for inference model
import numpy as np
import pandas as pd
import os
import zipfile
import json
from collections import Counter # Import Counter

# Set a random seed for reproducibility
random_seed = 4321
np.random.seed(random_seed)
tf.random.set_seed(random_seed)

# --- 1. Load and Extract Data ---
data_dir = 'data'
zip_path = os.path.join(data_dir, 'deu-eng.zip')
extracted_path = os.path.join(data_dir, 'deu.txt')

# Ensure data directory exists
os.makedirs(data_dir, exist_ok=True)

if not os.path.exists(extracted_path):
    if not os.path.exists(zip_path):
        print(f"Downloading 'deu-eng.zip' to '{data_dir}'...")
        # Use !wget to download the file directly in Colab
        !wget -P {data_dir} http://www.manythings.org/anki/deu-eng.zip
        print("Download complete.")

    print("Extracting data...")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(data_dir)
    print("Extraction complete.")
else:
    print("Data already extracted.")

# --- 2. Read data into pandas ---
df = pd.read_csv(extracted_path, delimiter='\t', header=None)
df.columns = ["EN", "DE", "Attribution"]
df = df[["EN", "DE"]]

# Clean up problematic unicode characters from the book's example
clean_inds = [i for i in range(len(df)) if b"\xc2" not in df.iloc[i]["DE"].encode("utf-8")]
df = df.iloc[clean_inds]

# --- 3. Sample and Preprocess Data ---
n_samples = 50000
df = df.sample(n=n_samples, random_state=random_seed)

# Add 'sos' (start of sentence) and 'eos' (end of sentence) tokens
# These are crucial for the decoder during training and inference.
start_token = 'sos'
end_token = 'eos'
df["DE"] = start_token + ' ' + df["DE"] + ' ' + end_token

print(f"Loaded and processed {len(df)} samples.")
print("\nSample data:")
print(df.head())

Data already extracted.
Loaded and processed 50000 samples.

Sample data:
                                                   EN  \
158450                 Do I look like I'm having fun?   
283485  We took the elevator down to the third floor.   
37230                             Mary lost her baby.   
75516                         I prefer mineral water.   
8316                                   Tom is wasted.   

                                                       DE  
158450  sos Sehe ich so aus, als würde ich mich amüsie...  
283485  sos Wir haben den Fahrstuhl hinunter in den zw...  
37230                sos Maria hat ihr Kind verloren. eos  
75516                     sos Ich mag lieber Selters. eos  
8316                            sos Tom ist erledigt. eos  


In [None]:
# --- 4. Create train/validation/test splits ---
n_test = int(n_samples / 10)
n_valid = int(n_samples / 10)

test_df = df.sample(n=n_test, random_state=random_seed)
valid_df = df.loc[~df.index.isin(test_df.index)].sample(n=n_valid, random_state=random_seed)
train_df = df.loc[~(df.index.isin(test_df.index) | df.index.isin(valid_df.index))]

print(f"\nTraining samples: {len(train_df)}")
print(f"Validation samples: {len(valid_df)}")
print(f"Test samples: {len(test_df)}")

# --- 5. Analyze Vocabulary and Sequence Length ---
# (Using helper function from Listing 11.1)
def get_vocabulary_size_greater_than(words, n, verbose=True):
    counter = Counter(words)
    freq_df = pd.Series(list(counter.values()), index=list(counter.keys())).sort_values(ascending=False)
    n_vocab = (freq_df >= n).sum()
    if verbose: print(f"Vocabulary size (>= {n} freq): {n_vocab}")
    return n_vocab

en_words = train_df["EN"].str.split().sum()
de_words = train_df["DE"].str.split().sum()

en_vocab = get_vocabulary_size_greater_than(en_words, n=10)
de_vocab = get_vocabulary_size_greater_than(de_words, n=10)

# Get 99th percentile for sequence lengths
en_seq_length = int(train_df["EN"].str.split().str.len().quantile(0.99)) + 5
de_seq_length = int(train_df["DE"].str.split().str.len().quantile(0.99)) + 5

print(f"EN max sequence length (99th percentile + 5): {en_seq_length}")
print(f"DE max sequence length (99th percentile + 5): {de_seq_length}")


Training samples: 40000
Validation samples: 5000
Test samples: 5000
Vocabulary size (>= 10 freq): 2173
Vocabulary size (>= 10 freq): 2453
EN max sequence length (99th percentile + 5): 19
DE max sequence length (99th percentile + 5): 21


---

## 11.2 Writing an English-German seq2seq machine translator

A seq2seq model consists of two main parts:
1.  **Encoder**: An RNN (we'll use a GRU) that reads the input English sentence one token at a time and compresses its meaning into a single vector, known as the **context vector** or "thought vector". This is the final hidden state of the encoder.
2.  **Decoder**: Another RNN (also a GRU) that takes the encoder's context vector as its *initial hidden state*. It then generates the output German sentence one token at a time.

### 11.2.1 The `TextVectorization` Layer

Instead of preprocessing our text into integers *before* feeding it to the model, we can build the preprocessing *into* the model using the `TextVectorization` layer. This layer will:
1.  Be `adapt`ed (fitted) on our training corpus to build a vocabulary.
2.  When the model is running, it will take raw strings as input.
3.  It will automatically tokenize, convert to integers, and pad the sequences to a fixed length, all inside the model graph.

In [None]:
# Based on Listing 11.3
def get_vectorizer(corpus, n_vocab, max_length=None, return_vocabulary=True, name=None):
    """Creates a TextVectorization layer/model."""

    inp = tf.keras.Input(shape=(1,), dtype=tf.string, name=f'{name}_input')

    # We add 2 to the vocab size for the <PAD> (ID 0) and [UNK] (ID 1) tokens
    vectorize_layer = TextVectorization(
        max_tokens=n_vocab + 2,
        output_mode='int',
        output_sequence_length=max_length,
        name=name
    )

    # Build the vocabulary
    vectorize_layer.adapt(corpus)
    vectorized_out = vectorize_layer(inp)

    model = tf.keras.models.Model(inputs=inp, outputs=vectorized_out)

    if return_vocabulary:
        return model, vectorize_layer.get_vocabulary()
    return model

# Create the vectorizers for English and German
# Note: The decoder's max_length is de_seq_length - 1
# This is because we will feed it 'sos ... word_n' (length N) to predict 'word_1 ... eos' (length N)
en_vectorizer, en_vocabulary = get_vectorizer(
    corpus=np.array(train_df["EN"].tolist()),
    n_vocab=en_vocab,
    max_length=en_seq_length,
    name='en_vectorizer'
)
de_vectorizer, de_vocabulary = get_vectorizer(
    corpus=np.array(train_df["DE"].tolist()),
    n_vocab=de_vocab,
    max_length=de_seq_length - 1,
    name='de_vectorizer'
)

print(f"English Vocabulary size: {len(en_vocabulary)}")
print(f"German Vocabulary size: {len(de_vocabulary)}")

# Test the English vectorizer
print("\nTest EN Vectorizer:")
print(en_vectorizer(np.array([["I like machine learning"]])))

English Vocabulary size: 2175
German Vocabulary size: 2455

Test EN Vectorizer:
tf.Tensor(
[[   5   31  941 1115    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0]], shape=(1, 19), dtype=int64)


### 11.2.3 & 11.2.4 Defining the Encoder and Decoder

Now we build the full seq2seq model using the Keras Functional API.

**Encoder (Listing 11.4):**
1.  Input (Raw English strings)
2.  `en_vectorizer` (Text -> Integer IDs)
3.  `Embedding` Layer (IDs -> Dense Vectors)
4.  `Bidirectional(GRU)`: Reads the sequence forwards and backwards. The final hidden state is the context vector.

**Decoder (Listing 11.5):**
1.  Input (Raw German strings, e.g., "sos Ich möchte ein...")
2.  `de_vectorizer` (Text -> Integer IDs)
3.  `Embedding` Layer (IDs -> Dense Vectors)
4.  `GRU`: This GRU's **initial_state** is set to the **encoder's context vector**.
5.  `Dense` Layer (with Softmax): Predicts the next word in the German vocabulary.

In [None]:
K.clear_session()

# --- Define Encoder ---
def get_encoder(n_vocab, vectorizer):
    inp = tf.keras.Input(shape=(1,), dtype=tf.string, name='e_input')
    vectorized_out = vectorizer(inp)
    emb_layer = layers.Embedding(
        n_vocab + 2, 128, mask_zero=True, name='e_embedding'
    )
    emb_out = emb_layer(vectorized_out)
    gru_layer = layers.Bidirectional(
        layers.GRU(128, name='e_gru'), name='e_bidirectional_gru'
    )
    gru_out = gru_layer(emb_out)
    encoder = tf.keras.models.Model(inputs=inp, outputs=gru_out, name='encoder')
    return encoder

# --- Define Final Seq2Seq Model ---
def get_final_seq2seq_model(n_vocab, encoder, vectorizer):
    e_inp = tf.keras.Input(shape=(1,), dtype=tf.string, name='e_input_final')
    d_init_state = encoder(e_inp)

    d_inp = tf.keras.Input(shape=(1,), dtype=tf.string, name='d_input')
    d_vectorized_out = vectorizer(d_inp)

    d_emb_layer = layers.Embedding(
        n_vocab + 2, 128, mask_zero=True, name='d_embedding'
    )
    d_emb_out = d_emb_layer(d_vectorized_out)

    d_gru_layer = layers.GRU(
        256, return_sequences=True, name='d_gru' # 256 units = 128 (fwd) + 128 (bwd) from encoder
    )
    # The encoder's state is fed as the initial state to the decoder's GRU
    d_gru_out = d_gru_layer(d_emb_out, initial_state=d_init_state)

    d_dense_layer_1 = layers.Dense(512, activation='relu', name='d_dense_1')
    d_dense1_out = d_dense_layer_1(d_gru_out)

    d_final_layer = layers.Dense(n_vocab + 2, activation='softmax', name='d_dense_final')
    d_final_out = d_final_layer(d_dense1_out)

    seq2seq = tf.keras.models.Model(
        inputs=[e_inp, d_inp], outputs=d_final_out, name='final_seq2seq'
    )
    return seq2seq

# Get the models
encoder = get_encoder(n_vocab=en_vocab, vectorizer=en_vectorizer)
final_model = get_final_seq2seq_model(n_vocab=de_vocab, encoder=encoder, vectorizer=de_vectorizer)

# Compile the model
final_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

final_model.summary()

---

## 11.3 Training and evaluating the model

To train this model, we use **teacher forcing**.

This means for a translation pair `("I like cats", "sos Ich mag Katzen eos")`:
* `x` (inputs) = `("I like cats", "sos Ich mag Katzen")`
* `y` (target) = `("Ich", "mag", "Katzen", "eos")`

The decoder receives the *true* previous word (e.g., "mag") as input to help it predict the next word (e.g., "Katzen"). This stabilizes and speeds up training.

We also need to define a custom training loop to correctly calculate the **BLEU score**, a standard metric for machine translation that measures the overlap of n-grams between the predicted and reference translations.

In [None]:
# Based on Listing 11.6 - Prepare data for teacher forcing
def prepare_data(df):
    # Reshape all string arrays to (N, 1) to match TextVectorization model input shape (None, 1)
    en_inputs = np.array(df["EN"].tolist()).reshape(-1, 1)
    # Explicitly convert to TensorFlow string tensor
    en_inputs_tf = tf.constant(en_inputs, dtype=tf.string)

    # Decoder inputs = 'sos ... word_n'
    de_inputs = np.array(df["DE"].str.rsplit(n=1, expand=True).iloc[:, 0].tolist()).reshape(-1, 1)
    # Explicitly convert to TensorFlow string tensor
    de_inputs_tf = tf.constant(de_inputs, dtype=tf.string)

    # Decoder labels = 'word_1 ... eos'
    de_labels_str = np.array(df["DE"].str.split(n=1, expand=True).iloc[:, 1].tolist()).reshape(-1, 1)
    # Explicitly convert to TensorFlow string tensor for vectorizer input
    de_labels_str_tf = tf.constant(de_labels_str, dtype=tf.string)

    # The labels need to be vectorized *without* the 'sos' token,
    # so we create a separate vectorizer for them.
    # We still adapt on the full German corpus including 'sos' and 'eos' to get a complete vocabulary.
    # The corpus for adapt is fine as a 1D array of strings.
    de_label_vectorizer_model = get_vectorizer(
        corpus=np.array(train_df["DE"].tolist()), # This is for adapt, 1D array is fine here.
        n_vocab=de_vocab,
        max_length=de_seq_length - 1,
        return_vocabulary=False, # We just need the model here
        name='de_label_vectorizer'
    )

    # Convert string labels to token IDs using the new model.
    de_labels_vec = de_label_vectorizer_model(de_labels_str_tf)
    return en_inputs_tf, de_inputs_tf, de_labels_vec

en_train, de_train_in, de_train_out = prepare_data(train_df)
en_valid, de_valid_in, de_valid_out = prepare_data(valid_df)

print("Training data shapes:")
print(en_train.shape, de_train_in.shape, de_train_out.shape)

# Train the model (simplified .fit() call from the book)
# The book uses a custom training loop (Listing 11.10) to calculate BLEU.
# For simplicity, we will use model.fit() here.

print("\nStarting model training (1 epoch for demo)...")
history = final_model.fit(
    x=[en_train, de_train_in],
    y=de_train_out,
    validation_data=([en_valid, de_valid_in], de_valid_out),
    epochs=1, # Book uses 5
    batch_size=32
)

print("Training complete.")

# Save the model in the native Keras format
os.makedirs('models', exist_ok=True)
model_path = os.path.join('models', 'seq2seq_ch11.keras') # Changed to .keras
final_model.save(model_path)
print(f"Model saved to {model_path}")

Training data shapes:
(40000, 1) (40000, 1) (40000, 20)

Starting model training (1 epoch for demo)...
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m747s[0m 596ms/step - accuracy: 0.1566 - loss: 3.1215 - val_accuracy: 0.1781 - val_loss: 2.6620
Training complete.
Model saved to models/seq2seq_ch11.keras


## 11.4 From training to inference: Defining the inference model

We can't use the trained model directly for inference because it relies on **teacher forcing** (i.e., it expects the *true* German sentence as an input to the decoder).

For inference, we must build a new model that generates text **recursively**:
1.  Feed the English sentence to the **Encoder** to get the context vector.
2.  Feed the context vector (as the initial state) and the `sos` token to the **Decoder**.
3.  The Decoder predicts the first word (e.g., "Ich").
4.  Feed the *new* state and the predicted word ("Ich") back into the Decoder.
5.  The Decoder predicts the second word (e.g., "möchte").
6.  Repeat this process until the Decoder predicts the `eos` token.

In [54]:
# Based on Listing 11.11 - Create the inference models

def get_inference_model(save_path, de_vocab_size):
    print("Loading trained model and building inference models...")
    K.clear_session()
    model = load_model(save_path)

    # 1. Get the Encoder
    # Need to reconstruct the encoder with a fixed batch_shape
    # The original encoder's input was tf.keras.Input(shape=(1,), dtype=tf.string, name='e_input')
    # We need to recreate it to ensure a fixed batch size for inference.
    # Get the layers from the original encoder in the loaded model
    original_encoder = model.get_layer("encoder")
    # Corrected: Use the globally available en_vectorizer, not from original_encoder
    global en_vectorizer # Ensure en_vectorizer is accessible

    # Clone encoder layers to ensure proper re-use in a new graph
    encoder_embedding_config = original_encoder.get_layer('e_embedding').get_config()
    encoder_embedding = layers.Embedding.from_config(encoder_embedding_config)
    _ = encoder_embedding(tf.zeros((1, en_seq_length), dtype=tf.int32)) # Build layer
    encoder_embedding.set_weights(original_encoder.get_layer('e_embedding').get_weights())

    encoder_gru_config = original_encoder.get_layer('e_bidirectional_gru').get_config()
    encoder_gru = layers.Bidirectional.from_config(encoder_gru_config)
    _ = encoder_gru(tf.zeros((1, en_seq_length, 128))) # Build layer
    encoder_gru.set_weights(original_encoder.get_layer('e_bidirectional_gru').get_weights())


    # Define a new input for the encoder with fixed batch_shape
    e_infer_input = tf.keras.Input(batch_shape=(1, 1), dtype=tf.string, name='e_infer_input')
    e_vectorized_out = en_vectorizer(e_infer_input)
    e_emb_out = encoder_embedding(e_vectorized_out)
    e_gru_out = encoder_gru(e_emb_out)
    en_model = tf.keras.models.Model(inputs=e_infer_input, outputs=e_gru_out, name='encoder_inference')

    # 2. Build the Decoder
    # We need to define new inputs for the decoder's state
    d_inp = tf.keras.Input(batch_shape=(1, 1), dtype=tf.string, name='d_infer_input')
    d_state_inp = tf.keras.Input(batch_shape=(1, 256), name='d_infer_state') # 256 = GRU units

    # Recreate the TextVectorization layer for German text
    # We use the globally available 'de_vocabulary' for this.
    global de_vocabulary, de_seq_length
    recreated_de_vectorizer = TextVectorization(
        max_tokens=len(de_vocabulary), # Adjust max_tokens to match the actual vocabulary size
        output_mode='int',
        output_sequence_length=1, # <--- CRUCIAL FIX: Set sequence length to 1 for inference
        name='recreated_de_vectorizer' # Assign a unique name
    )
    recreated_de_vectorizer.set_vocabulary(de_vocabulary)
    # Explicitly build the vectorizer with a dummy input of the correct batch_shape
    _ = recreated_de_vectorizer(tf.zeros(shape=(1, 1), dtype=tf.string))

    # Clone decoder layers to ensure proper re-use in a new graph
    d_emb_layer_config = model.get_layer('d_embedding').get_config()
    d_emb_layer = layers.Embedding.from_config(d_emb_layer_config)
    _ = d_emb_layer(tf.zeros((1, 1), dtype=tf.int32)) # Build layer
    d_emb_layer.set_weights(model.get_layer('d_embedding').get_weights())

    # Rebuild GRU layer explicitly with correct settings and input_shape
    d_gru_layer_original = model.get_layer("d_gru")
    d_gru_units = d_gru_layer_original.units
    d_emb_output_dim = d_emb_layer.output_dim
    d_gru_layer = layers.GRU(
        units=d_gru_units,
        return_sequences=False,
        return_state=True,
        # Removed input_shape as it can conflict with inferred shapes in functional API
        name='d_gru_inference' # Give it a new name
    )
    # Build the layer with dummy input matching the batch_shape of d_emb_out
    _ = d_gru_layer(tf.zeros((1, 1, d_emb_output_dim)), initial_state=tf.zeros((1, d_gru_units)))
    d_gru_layer.set_weights(d_gru_layer_original.get_weights())

    d_dense_layer_1_config = model.get_layer("d_dense_1").get_config()
    d_dense_layer_1 = layers.Dense.from_config(d_dense_layer_1_config)
    _ = d_dense_layer_1(tf.zeros((1, 256))) # Build layer
    d_dense_layer_1.set_weights(model.get_layer("d_dense_1").get_weights())

    d_final_layer_config = model.get_layer("d_dense_final").get_config()
    d_final_layer = layers.Dense.from_config(d_final_layer_config)
    _ = d_final_layer(tf.zeros((1, 512))) # Build layer
    d_final_layer.set_weights(model.get_layer("d_dense_final").get_weights())

    # Build the graph using the recreated vectorizer
    d_vectorized_out = recreated_de_vectorizer(d_inp)
    d_emb_out = d_emb_layer(d_vectorized_out)

    # GRU now returns output and state
    d_gru_output, d_new_state = d_gru_layer(d_emb_out, initial_state=d_state_inp)

    d_dense1_out = d_dense_layer_1(d_gru_output) # Use d_gru_output for dense layers
    d_final_out = d_final_layer(d_dense1_out)

    de_model = tf.keras.models.Model(
        inputs=[d_inp, d_state_inp],
        outputs=[d_final_out, d_new_state] # Output prediction AND new state
    )
    return en_model, de_model

# Update model_path to use the new .keras extension
model_path = os.path.join('models', 'seq2seq_ch11.keras')
en_model, de_model = get_inference_model(model_path, de_vocab_size=de_vocab)
print("Inference models built.")

Loading trained model and building inference models...


  saveable.load_own_variables(weights_store.get(inner_path))


Inference models built.


In [61]:
import tensorflow as tf
import numpy as np

# Based on Listing 11.11 - Create the inference models

def get_inference_model(save_path, de_vocab_size):
    print("Loading trained model and building inference models...")
    K.clear_session()
    model = load_model(save_path)

    # 1. Get the Encoder
    # Need to reconstruct the encoder with a fixed batch_shape
    # The original encoder's input was tf.keras.Input(shape=(1,), dtype=tf.string, name='e_input')
    # We need to recreate it to ensure a fixed batch size for inference.
    # Get the layers from the original encoder in the loaded model
    original_encoder = model.get_layer("encoder")
    # Use the globally available en_vectorizer, not from original_encoder
    global en_vectorizer # Ensure en_vectorizer is accessible

    # Clone encoder layers to ensure proper re-use in a new graph
    encoder_embedding_config = original_encoder.get_layer('e_embedding').get_config()
    encoder_embedding = layers.Embedding.from_config(encoder_embedding_config)
    _ = encoder_embedding(tf.zeros((1, en_seq_length), dtype=tf.int32)) # Build layer
    encoder_embedding.set_weights(original_encoder.get_layer('e_embedding').get_weights())

    encoder_gru_config = original_encoder.get_layer('e_bidirectional_gru').get_config()
    encoder_gru = layers.Bidirectional.from_config(encoder_gru_config)
    # The Bidirectional GRU input shape is (batch_size, sequence_length, embedding_dim)
    # where batch_size is 1, sequence_length is en_seq_length, embedding_dim is 128
    _ = encoder_gru(tf.zeros((1, en_seq_length, 128))) # Build layer
    encoder_gru.set_weights(original_encoder.get_layer('e_bidirectional_gru').get_weights())


    # Define a new input for the encoder with fixed batch_shape
    e_infer_input = tf.keras.Input(batch_shape=(1, 1), dtype=tf.string, name='e_infer_input')
    e_vectorized_out = en_vectorizer(e_infer_input) # Output (1, en_seq_length)
    e_emb_out = encoder_embedding(e_vectorized_out) # Output (1, en_seq_length, 128)
    e_gru_out = encoder_gru(e_emb_out) # Output (1, 256)
    en_model = tf.keras.models.Model(inputs=e_infer_input, outputs=e_gru_out, name='encoder_inference')

    # 2. Build the Decoder
    # We need to define new inputs for the decoder's state
    d_inp = tf.keras.Input(batch_shape=(1, 1), dtype=tf.string, name='d_infer_input') # Decoder's current word input
    d_state_inp = tf.keras.Input(batch_shape=(1, 256), name='d_infer_state') # Decoder's recurrent state input (from encoder or previous step)

    # Recreate the TextVectorization layer for German text
    global de_vocabulary, de_seq_length
    recreated_de_vectorizer = TextVectorization(
        max_tokens=len(de_vocabulary),
        output_mode='int',
        output_sequence_length=1, # Crucial: single token at a time for inference
        name='recreated_de_vectorizer'
    )
    recreated_de_vectorizer.set_vocabulary(de_vocabulary)
    _ = recreated_de_vectorizer(tf.zeros(shape=(1, 1), dtype=tf.string)) # Build with dummy input

    # Rebuild Embedding layer explicitly
    d_emb_layer_original = model.get_layer('d_embedding')
    d_emb_layer = layers.Embedding(
        input_dim=d_emb_layer_original.input_dim,
        output_dim=d_emb_layer_original.output_dim,
        mask_zero=d_emb_layer_original.mask_zero,
        name='d_embedding_inference'
    )
    _ = d_emb_layer(tf.zeros((1, 1), dtype=tf.int32)) # Build layer with appropriate dummy input
    d_emb_layer.set_weights(d_emb_layer_original.get_weights())


    # Rebuild GRU layer explicitly with correct settings
    d_gru_layer_original = model.get_layer("d_gru")
    d_gru_units = d_gru_layer_original.units
    d_emb_output_dim = d_emb_layer.output_dim # This will be 128 (embedding_dim)
    d_gru_layer = layers.GRU(
        units=d_gru_units,
        return_sequences=False, # Single output per step
        return_state=True,     # Return the updated state
        name='d_gru_inference' # Give it a new name
    )
    # Build the GRU layer with dummy input matching its expected sequence input and initial state
    _ = d_gru_layer(tf.zeros((1, 1, d_emb_output_dim)), initial_state=tf.zeros((1, d_gru_units)))
    d_gru_layer.set_weights(d_gru_layer_original.get_weights()) # Set weights after building

    d_dense_layer_1_config = model.get_layer("d_dense_1").get_config()
    d_dense_layer_1 = layers.Dense.from_config(d_dense_layer_1_config)
    _ = d_dense_layer_1(tf.zeros((1, 256))) # Build layer
    d_dense_layer_1.set_weights(model.get_layer("d_dense_1").get_weights())

    d_final_layer_config = model.get_layer("d_dense_final").get_config()
    d_final_layer = layers.Dense.from_config(d_final_layer_config)
    _ = d_final_layer(tf.zeros((1, 512))) # Build layer
    d_final_layer.set_weights(model.get_layer("d_dense_final").get_weights())

    # Build the graph using the recreated vectorizer
    d_vectorized_out = recreated_de_vectorizer(d_inp) # Output (1, 1)
    d_emb_out = d_emb_layer(d_vectorized_out)         # Output (1, 1, 128)

    # GRU now returns output and state
    d_gru_output, d_new_state = d_gru_layer(d_emb_out, initial_state=d_state_inp) # d_gru_output (1, 256), d_new_state (1, 256)

    d_dense1_out = d_dense_layer_1(d_gru_output) # Use d_gru_output for dense layers, output (1, 512)
    d_final_out = d_final_layer(d_dense1_out)   # Output (1, vocab_size+2)

    de_model_base = tf.keras.models.Model(
        inputs=[d_inp, d_state_inp],
        outputs=[d_final_out, d_new_state] # Output prediction AND new state
    )

    # Wrap the decoder model in tf.function with explicit input_signature
    @tf.function(input_signature=[
        tf.TensorSpec(shape=(1, 1), dtype=tf.string), # Input for d_inp (current word token)
        tf.TensorSpec(shape=(1, 256), dtype=tf.float32) # Input for d_state_inp (previous GRU state)
    ])
    def de_model_inference_step(input_token, prev_state):
        pred_logits, new_state_raw = de_model_base(inputs=[input_token, prev_state], training=False)
        # Explicitly ensure the shape of the output state
        new_state = tf.ensure_shape(new_state_raw, (1, 256))
        return pred_logits, new_state

    return en_model, de_model_inference_step

# Update model_path to use the new .keras extension
model_path = os.path.join('models', 'seq2seq_ch11.keras')
en_model, de_model_inference_step = get_inference_model(model_path, de_vocab_size=de_vocab)
print("Inference models built.")

def generate_new_translation(en_model, de_model_inference_step, de_vocabulary, sample_en_text, max_len=20):
    print(f"Input: {sample_en_text}")

    # 1. Get the context vector from the encoder
    d_state = en_model.predict(tf.constant([[sample_en_text]], dtype=tf.string), verbose=0)

    # 2. Start the decoder with the 'sos' token
    de_word = start_token
    de_translation = []

    # 3. Recursive loop
    for _ in range(max_len):
        # Predict the next word and get the new state using the tf.function wrapped step
        # Pass inputs directly to the tf.function, not through .predict()
        de_pred, d_state = de_model_inference_step(tf.constant([[de_word]], dtype=tf.string), d_state)

        # Get the word ID with the highest probability
        de_word_id = np.argmax(de_pred[0])

        # Look up the word from the ID
        de_word = de_vocabulary[de_word_id]

        if de_word == end_token:
            break

        de_translation.append(de_word)

    print(f"Translation: {' '.join(de_translation)}\n")

# --- Test the inference model ---
for i in range(5):
    sample_en_text = test_df["EN"].iloc[i]
    generate_new_translation(en_model, de_model_inference_step, de_vocabulary, sample_en_text)


Loading trained model and building inference models...


  saveable.load_own_variables(weights_store.get(inner_path))


Inference models built.
Input: She pushed him out the door.


ValueError: in user code:

    File "/tmp/ipython-input-2864226451.py", line 114, in de_model_inference_step  *
        pred_logits, new_state_raw = de_model_base(inputs=[input_token, prev_state], training=False)
    File "/usr/local/lib/python3.12/dist-packages/keras/src/utils/traceback_utils.py", line 122, in error_handler  **
        raise e.with_traceback(filtered_tb) from None

    ValueError: Exception encountered when calling GRU.call().
    
    [1mInput tensor `prev_state:0` enters the loop with shape (1, 256), but has shape (None, 256) after one iteration. To allow the shape to vary across iterations, use the `shape_invariants` argument of tf.while_loop to specify a less-specific shape.[0m
    
    Arguments received by GRU.call():
      • sequences=tf.Tensor(shape=(None, 1, 128), dtype=float32)
      • initial_state=tf.Tensor(shape=(1, 256), dtype=float32)
      • mask=None
      • training=False


# Task
The `ValueError` arises because the `d_gru_layer` expects a static batch size for its `sequences` input. To resolve this, I will add `d_emb_out = tf.ensure_shape(d_emb_out, (1, 1, d_emb_layer.output_dim))` before `d_emb_out` is passed to the `d_gru_layer` within the `get_inference_model` function in cell `7ff67b0f`. This will explicitly enforce the required static shape for the batch dimension. I will then re-execute the cell to update the inference model. Finally, I will re-run the translation test to ensure the fix is successful and no new errors are introduced.

## ensure_static_embedding_output_shape

### Subtask:
Modify the `get_inference_model` function in cell `7ff67b0f` to add `d_emb_out = tf.ensure_shape(d_emb_out, (1, 1, d_emb_layer.output_dim))` before it is passed to the `d_gru_layer`. This will explicitly enforce a static batch size for the GRU's `sequences` input, which is currently reported as having a dynamic batch dimension by TensorFlow.


**Reasoning**:
The subtask requires modifying the `get_inference_model` function in cell `7ff67b0f` to include `tf.ensure_shape` for `d_emb_out` to fix the dynamic batch dimension issue before passing it to the GRU layer. I will replicate the existing code and insert the specified line.



In [62]:
import tensorflow as tf
import numpy as np

# Based on Listing 11.11 - Create the inference models

def get_inference_model(save_path, de_vocab_size):
    print("Loading trained model and building inference models...")
    K.clear_session()
    model = load_model(save_path)

    # 1. Get the Encoder
    # Need to reconstruct the encoder with a fixed batch_shape
    # The original encoder's input was tf.keras.Input(shape=(1,), dtype=tf.string, name='e_input')
    # We need to recreate it to ensure a fixed batch size for inference.
    # Get the layers from the original encoder in the loaded model
    original_encoder = model.get_layer("encoder")
    # Use the globally available en_vectorizer, not from original_encoder
    global en_vectorizer # Ensure en_vectorizer is accessible

    # Clone encoder layers to ensure proper re-use in a new graph
    encoder_embedding_config = original_encoder.get_layer('e_embedding').get_config()
    encoder_embedding = layers.Embedding.from_config(encoder_embedding_config)
    _ = encoder_embedding(tf.zeros((1, en_seq_length), dtype=tf.int32)) # Build layer
    encoder_embedding.set_weights(original_encoder.get_layer('e_embedding').get_weights())

    encoder_gru_config = original_encoder.get_layer('e_bidirectional_gru').get_config()
    encoder_gru = layers.Bidirectional.from_config(encoder_gru_config)
    # The Bidirectional GRU input shape is (batch_size, sequence_length, embedding_dim)
    # where batch_size is 1, sequence_length is en_seq_length, embedding_dim is 128
    _ = encoder_gru(tf.zeros((1, en_seq_length, 128))) # Build layer
    encoder_gru.set_weights(original_encoder.get_layer('e_bidirectional_gru').get_weights())


    # Define a new input for the encoder with fixed batch_shape
    e_infer_input = tf.keras.Input(batch_shape=(1, 1), dtype=tf.string, name='e_infer_input')
    e_vectorized_out = en_vectorizer(e_infer_input) # Output (1, en_seq_length)
    e_emb_out = encoder_embedding(e_vectorized_out) # Output (1, en_seq_length, 128)
    e_gru_out = encoder_gru(e_emb_out) # Output (1, 256)
    en_model = tf.keras.models.Model(inputs=e_infer_input, outputs=e_gru_out, name='encoder_inference')

    # 2. Build the Decoder
    # We need to define new inputs for the decoder's state
    d_inp = tf.keras.Input(batch_shape=(1, 1), dtype=tf.string, name='d_infer_input') # Decoder's current word input
    d_state_inp = tf.keras.Input(batch_shape=(1, 256), name='d_infer_state') # Decoder's recurrent state input (from encoder or previous step)

    # Recreate the TextVectorization layer for German text
    global de_vocabulary, de_seq_length
    recreated_de_vectorizer = TextVectorization(
        max_tokens=len(de_vocabulary),
        output_mode='int',
        output_sequence_length=1, # Crucial: single token at a time for inference
        name='recreated_de_vectorizer'
    )
    recreated_de_vectorizer.set_vocabulary(de_vocabulary)
    _ = recreated_de_vectorizer(tf.zeros(shape=(1, 1), dtype=tf.string)) # Build with dummy input

    # Rebuild Embedding layer explicitly
    d_emb_layer_original = model.get_layer('d_embedding')
    d_emb_layer = layers.Embedding(
        input_dim=d_emb_layer_original.input_dim,
        output_dim=d_emb_layer_original.output_dim,
        mask_zero=d_emb_layer_original.mask_zero,
        name='d_embedding_inference'
    )
    _ = d_emb_layer(tf.zeros((1, 1), dtype=tf.int32)) # Build layer with appropriate dummy input
    d_emb_layer.set_weights(d_emb_layer_original.get_weights())


    # Rebuild GRU layer explicitly with correct settings
    d_gru_layer_original = model.get_layer("d_gru")
    d_gru_units = d_gru_layer_original.units
    d_emb_output_dim = d_emb_layer.output_dim # This will be 128 (embedding_dim)
    d_gru_layer = layers.GRU(
        units=d_gru_units,
        return_sequences=False, # Single output per step
        return_state=True,     # Return the updated state
        name='d_gru_inference' # Give it a new name
    )
    # Build the GRU layer with dummy input matching its expected sequence input and initial state
    _ = d_gru_layer(tf.zeros((1, 1, d_emb_output_dim)), initial_state=tf.zeros((1, d_gru_units)))
    d_gru_layer.set_weights(d_gru_layer_original.get_weights()) # Set weights after building

    d_dense_layer_1_config = model.get_layer("d_dense_1").get_config()
    d_dense_layer_1 = layers.Dense.from_config(d_dense_layer_1_config)
    _ = d_dense_layer_1(tf.zeros((1, 256))) # Build layer
    d_dense_layer_1.set_weights(model.get_layer("d_dense_1").get_weights())

    d_final_layer_config = model.get_layer("d_dense_final").get_config()
    d_final_layer = layers.Dense.from_config(d_final_layer_config)
    _ = d_final_layer(tf.zeros((1, 512))) # Build layer
    d_final_layer.set_weights(model.get_layer("d_dense_final").get_weights())

    # Build the graph using the recreated vectorizer
    d_vectorized_out = recreated_de_vectorizer(d_inp) # Output (1, 1)
    d_emb_out = d_emb_layer(d_vectorized_out)         # Output (1, 1, 128)
    d_emb_out = tf.ensure_shape(d_emb_out, (1, 1, d_emb_layer.output_dim)) # Enforce static shape

    # GRU now returns output and state
    d_gru_output, d_new_state = d_gru_layer(d_emb_out, initial_state=d_state_inp) # d_gru_output (1, 256), d_new_state (1, 256)

    d_dense1_out = d_dense_layer_1(d_gru_output) # Use d_gru_output for dense layers, output (1, 512)
    d_final_out = d_final_layer(d_dense1_out)   # Output (1, vocab_size+2)

    de_model_base = tf.keras.models.Model(
        inputs=[d_inp, d_state_inp],
        outputs=[d_final_out, d_new_state] # Output prediction AND new state
    )

    # Wrap the decoder model in tf.function with explicit input_signature
    @tf.function(input_signature=[
        tf.TensorSpec(shape=(1, 1), dtype=tf.string), # Input for d_inp (current word token)
        tf.TensorSpec(shape=(1, 256), dtype=tf.float32) # Input for d_state_inp (previous GRU state)
    ])
    def de_model_inference_step(input_token, prev_state):
        pred_logits, new_state_raw = de_model_base(inputs=[input_token, prev_state], training=False)
        # Explicitly ensure the shape of the output state
        new_state = tf.ensure_shape(new_state_raw, (1, 256))
        return pred_logits, new_state

    return en_model, de_model_inference_step

# Update model_path to use the new .keras extension
model_path = os.path.join('models', 'seq2seq_ch11.keras')
en_model, de_model_inference_step = get_inference_model(model_path, de_vocab_size=de_vocab)
print("Inference models built.")

def generate_new_translation(en_model, de_model_inference_step, de_vocabulary, sample_en_text, max_len=20):
    print(f"Input: {sample_en_text}")

    # 1. Get the context vector from the encoder
    d_state = en_model.predict(tf.constant([[sample_en_text]], dtype=tf.string), verbose=0)

    # 2. Start the decoder with the 'sos' token
    de_word = start_token
    de_translation = []

    # 3. Recursive loop
    for _ in range(max_len):
        # Predict the next word and get the new state using the tf.function wrapped step
        # Pass inputs directly to the tf.function, not through .predict()
        de_pred, d_state = de_model_inference_step(tf.constant([[de_word]], dtype=tf.string), d_state)

        # Get the word ID with the highest probability
        de_word_id = np.argmax(de_pred[0])

        # Look up the word from the ID
        de_word = de_vocabulary[de_word_id]

        if de_word == end_token:
            break

        de_translation.append(de_word)

    print(f"Translation: {' '.join(de_translation)}\n")

# --- Test the inference model ---
for i in range(5):
    sample_en_text = test_df["EN"].iloc[i]
    generate_new_translation(en_model, de_model_inference_step, de_vocabulary, sample_en_text)

Loading trained model and building inference models...


  saveable.load_own_variables(weights_store.get(inner_path))


ValueError: A KerasTensor cannot be used as input to a TensorFlow function. A KerasTensor is a symbolic placeholder for a shape and dtype, used when constructing Keras Functional models or Keras Functions. You can only use it as input to a Keras layer or a Keras operation (from the namespaces `keras.layers` and `keras.ops`). You are likely doing something like:

```
x = Input(...)
...
tf_fn(x)  # Invalid.
```

What you should do instead is wrap `tf_fn` in a layer:

```
class MyLayer(Layer):
    def call(self, x):
        return tf_fn(x)

x = MyLayer()(x)
```


**Reasoning**:
The previous attempt to use `tf.ensure_shape` directly on a KerasTensor resulted in a `ValueError`. To fix this, I will wrap `tf.ensure_shape` in a `layers.Lambda` layer, which is the correct way to apply arbitrary TensorFlow functions within the Keras Functional API model construction. This ensures the output of the embedding layer has the expected static shape before being fed to the GRU.



In [63]:
import tensorflow as tf
import numpy as np

# Based on Listing 11.11 - Create the inference models

def get_inference_model(save_path, de_vocab_size):
    print("Loading trained model and building inference models...")
    K.clear_session()
    model = load_model(save_path)

    # 1. Get the Encoder
    # Need to reconstruct the encoder with a fixed batch_shape
    # The original encoder's input was tf.keras.Input(shape=(1,), dtype=tf.string, name='e_input')
    # We need to recreate it to ensure a fixed batch size for inference.
    # Get the layers from the original encoder in the loaded model
    original_encoder = model.get_layer("encoder")
    # Use the globally available en_vectorizer, not from original_encoder
    global en_vectorizer # Ensure en_vectorizer is accessible

    # Clone encoder layers to ensure proper re-use in a new graph
    encoder_embedding_config = original_encoder.get_layer('e_embedding').get_config()
    encoder_embedding = layers.Embedding.from_config(encoder_embedding_config)
    _ = encoder_embedding(tf.zeros((1, en_seq_length), dtype=tf.int32)) # Build layer
    encoder_embedding.set_weights(original_encoder.get_layer('e_embedding').get_weights())

    encoder_gru_config = original_encoder.get_layer('e_bidirectional_gru').get_config()
    encoder_gru = layers.Bidirectional.from_config(encoder_gru_config)
    # The Bidirectional GRU input shape is (batch_size, sequence_length, embedding_dim)
    # where batch_size is 1, sequence_length is en_seq_length, embedding_dim is 128
    _ = encoder_gru(tf.zeros((1, en_seq_length, 128))) # Build layer
    encoder_gru.set_weights(original_encoder.get_layer('e_bidirectional_gru').get_weights())


    # Define a new input for the encoder with fixed batch_shape
    e_infer_input = tf.keras.Input(batch_shape=(1, 1), dtype=tf.string, name='e_infer_input')
    e_vectorized_out = en_vectorizer(e_infer_input) # Output (1, en_seq_length)
    e_emb_out = encoder_embedding(e_vectorized_out) # Output (1, en_seq_length, 128)
    e_gru_out = encoder_gru(e_emb_out) # Output (1, 256)
    en_model = tf.keras.models.Model(inputs=e_infer_input, outputs=e_gru_out, name='encoder_inference')

    # 2. Build the Decoder
    # We need to define new inputs for the decoder's state
    d_inp = tf.keras.Input(batch_shape=(1, 1), dtype=tf.string, name='d_infer_input') # Decoder's current word input
    d_state_inp = tf.keras.Input(batch_shape=(1, 256), name='d_infer_state') # Decoder's recurrent state input (from encoder or previous step)

    # Recreate the TextVectorization layer for German text
    global de_vocabulary, de_seq_length
    recreated_de_vectorizer = TextVectorization(
        max_tokens=len(de_vocabulary),
        output_mode='int',
        output_sequence_length=1, # Crucial: single token at a time for inference
        name='recreated_de_vectorizer'
    )
    recreated_de_vectorizer.set_vocabulary(de_vocabulary)
    _ = recreated_de_vectorizer(tf.zeros(shape=(1, 1), dtype=tf.string)) # Build with dummy input

    # Rebuild Embedding layer explicitly
    d_emb_layer_original = model.get_layer('d_embedding')
    d_emb_layer = layers.Embedding(
        input_dim=d_emb_layer_original.input_dim,
        output_dim=d_emb_layer_original.output_dim,
        mask_zero=d_emb_layer_original.mask_zero,
        name='d_embedding_inference'
    )
    _ = d_emb_layer(tf.zeros((1, 1), dtype=tf.int32)) # Build layer with appropriate dummy input
    d_emb_layer.set_weights(d_emb_layer_original.get_weights())


    # Rebuild GRU layer explicitly with correct settings
    d_gru_layer_original = model.get_layer("d_gru")
    d_gru_units = d_gru_layer_original.units
    d_emb_output_dim = d_emb_layer.output_dim # This will be 128 (embedding_dim)
    d_gru_layer = layers.GRU(
        units=d_gru_units,
        return_sequences=False, # Single output per step
        return_state=True,     # Return the updated state
        name='d_gru_inference' # Give it a new name
    )
    # Build the GRU layer with dummy input matching its expected sequence input and initial state
    _ = d_gru_layer(tf.zeros((1, 1, d_emb_output_dim)), initial_state=tf.zeros((1, d_gru_units)))
    d_gru_layer.set_weights(d_gru_layer_original.get_weights()) # Set weights after building

    d_dense_layer_1_config = model.get_layer("d_dense_1").get_config()
    d_dense_layer_1 = layers.Dense.from_config(d_dense_layer_1_config)
    _ = d_dense_layer_1(tf.zeros((1, 256))) # Build layer
    d_dense_layer_1.set_weights(model.get_layer("d_dense_1").get_weights())

    d_final_layer_config = model.get_layer("d_dense_final").get_config()
    d_final_layer = layers.Dense.from_config(d_final_layer_config)
    _ = d_final_layer(tf.zeros((1, 512))) # Build layer
    d_final_layer.set_weights(model.get_layer("d_dense_final").get_weights())

    # Build the graph using the recreated vectorizer
    d_vectorized_out = recreated_de_vectorizer(d_inp) # Output (1, 1)
    d_emb_out = d_emb_layer(d_vectorized_out)         # Output (1, 1, 128)

    # Wrap tf.ensure_shape in a Lambda layer to use it within the functional API
    d_emb_out = layers.Lambda(lambda x: tf.ensure_shape(x, (1, 1, d_emb_layer.output_dim)),
                              name='ensure_emb_shape')(d_emb_out)

    # GRU now returns output and state
    d_gru_output, d_new_state = d_gru_layer(d_emb_out, initial_state=d_state_inp) # d_gru_output (1, 256), d_new_state (1, 256)

    d_dense1_out = d_dense_layer_1(d_gru_output) # Use d_gru_output for dense layers, output (1, 512)
    d_final_out = d_final_layer(d_dense1_out)   # Output (1, vocab_size+2)

    de_model_base = tf.keras.models.Model(
        inputs=[d_inp, d_state_inp],
        outputs=[d_final_out, d_new_state] # Output prediction AND new state
    )

    # Wrap the decoder model in tf.function with explicit input_signature
    @tf.function(input_signature=[
        tf.TensorSpec(shape=(1, 1), dtype=tf.string), # Input for d_inp (current word token)
        tf.TensorSpec(shape=(1, 256), dtype=tf.float32) # Input for d_state_inp (previous GRU state)
    ])
    def de_model_inference_step(input_token, prev_state):
        pred_logits, new_state_raw = de_model_base(inputs=[input_token, prev_state], training=False)
        # Explicitly ensure the shape of the output state
        new_state = tf.ensure_shape(new_state_raw, (1, 256))
        return pred_logits, new_state

    return en_model, de_model_inference_step

# Update model_path to use the new .keras extension
model_path = os.path.join('models', 'seq2seq_ch11.keras')
en_model, de_model_inference_step = get_inference_model(model_path, de_vocab_size=de_vocab)
print("Inference models built.")

def generate_new_translation(en_model, de_model_inference_step, de_vocabulary, sample_en_text, max_len=20):
    print(f"Input: {sample_en_text}")

    # 1. Get the context vector from the encoder
    d_state = en_model.predict(tf.constant([[sample_en_text]], dtype=tf.string), verbose=0)

    # 2. Start the decoder with the 'sos' token
    de_word = start_token
    de_translation = []

    # 3. Recursive loop
    for _ in range(max_len):
        # Predict the next word and get the new state using the tf.function wrapped step
        # Pass inputs directly to the tf.function, not through .predict()
        de_pred, d_state = de_model_inference_step(tf.constant([[de_word]], dtype=tf.string), d_state)

        # Get the word ID with the highest probability
        de_word_id = np.argmax(de_pred[0])

        # Look up the word from the ID
        de_word = de_vocabulary[de_word_id]

        if de_word == end_token:
            break

        de_translation.append(de_word)

    print(f"Translation: {' '.join(de_translation)}\n")

# --- Test the inference model ---
for i in range(5):
    sample_en_text = test_df["EN"].iloc[i]
    generate_new_translation(en_model, de_model_inference_step, de_vocabulary, sample_en_text)


Loading trained model and building inference models...


  saveable.load_own_variables(weights_store.get(inner_path))


Inference models built.
Input: She pushed him out the door.
Translation: sie [UNK] ihn die [UNK]

Input: Tom doesn't use salt in his cooking.
Translation: tom [UNK] sich nicht auf seine [UNK]

Input: We're safe in here, aren't we?
Translation: wir sind [UNK] das problem

Input: Is anybody thirsty?
Translation: ist jemand [UNK]

Input: You should have your head examined.
Translation: sie hätten ihre [UNK] [UNK]



**Reasoning**:
The previous modification successfully resolved the `ValueError` by wrapping `tf.ensure_shape` within a `layers.Lambda` layer, allowing the inference model to be built and executed without crashing. The generated translations confirm that the fix was successful.



In [None]:
import tensorflow as tf
import numpy as np

# Based on Listing 11.11 - Create the inference models

def get_inference_model(save_path, de_vocab_size):
    print("Loading trained model and building inference models...")
    K.clear_session()
    model = load_model(save_path)

    # 1. Get the Encoder
    # Need to reconstruct the encoder with a fixed batch_shape
    # The original encoder's input was tf.keras.Input(shape=(1,), dtype=tf.string, name='e_input')
    # We need to recreate it to ensure a fixed batch size for inference.
    # Get the layers from the original encoder in the loaded model
    original_encoder = model.get_layer("encoder")
    # Use the globally available en_vectorizer, not from original_encoder
    global en_vectorizer # Ensure en_vectorizer is accessible

    # Clone encoder layers to ensure proper re-use in a new graph
    encoder_embedding_config = original_encoder.get_layer('e_embedding').get_config()
    encoder_embedding = layers.Embedding.from_config(encoder_embedding_config)
    _ = encoder_embedding(tf.zeros((1, en_seq_length), dtype=tf.int32)) # Build layer
    encoder_embedding.set_weights(original_encoder.get_layer('e_embedding').get_weights())

    encoder_gru_config = original_encoder.get_layer('e_bidirectional_gru').get_config()
    encoder_gru = layers.Bidirectional.from_config(encoder_gru_config)
    # The Bidirectional GRU input shape is (batch_size, sequence_length, embedding_dim)
    # where batch_size is 1, sequence_length is en_seq_length, embedding_dim is 128
    _ = encoder_gru(tf.zeros((1, en_seq_length, 128))) # Build layer
    encoder_gru.set_weights(original_encoder.get_layer('e_bidirectional_gru').get_weights())


    # Define a new input for the encoder with fixed batch_shape
    e_infer_input = tf.keras.Input(batch_shape=(1, 1), dtype=tf.string, name='e_infer_input')
    e_vectorized_out = en_vectorizer(e_infer_input) # Output (1, en_seq_length)
    e_emb_out = encoder_embedding(e_vectorized_out) # Output (1, en_seq_length, 128)
    e_gru_out = encoder_gru(e_emb_out) # Output (1, 256)
    en_model = tf.keras.models.Model(inputs=e_infer_input, outputs=e_gru_out, name='encoder_inference')

    # 2. Build the Decoder
    # We need to define new inputs for the decoder's state
    d_inp = tf.keras.Input(batch_shape=(1, 1), dtype=tf.string, name='d_infer_input') # Decoder's current word input
    d_state_inp = tf.keras.Input(batch_shape=(1, 256), name='d_infer_state') # Decoder's recurrent state input (from encoder or previous step)

    # Recreate the TextVectorization layer for German text
    global de_vocabulary, de_seq_length
    recreated_de_vectorizer = TextVectorization(
        max_tokens=len(de_vocabulary),
        output_mode='int',
        output_sequence_length=1, # Crucial: single token at a time for inference
        name='recreated_de_vectorizer'
    )
    recreated_de_vectorizer.set_vocabulary(de_vocabulary)
    _ = recreated_de_vectorizer(tf.zeros(shape=(1, 1), dtype=tf.string)) # Build with dummy input

    # Rebuild Embedding layer explicitly
    d_emb_layer_original = model.get_layer('d_embedding')
    d_emb_layer = layers.Embedding(
        input_dim=d_emb_layer_original.input_dim,
        output_dim=d_emb_layer_original.output_dim,
        mask_zero=d_emb_layer_original.mask_zero,
        name='d_embedding_inference'
    )
    _ = d_emb_layer(tf.zeros((1, 1), dtype=tf.int32)) # Build layer with appropriate dummy input
    d_emb_layer.set_weights(d_emb_layer_original.get_weights())


    # Rebuild GRU layer explicitly with correct settings
    d_gru_layer_original = model.get_layer("d_gru")
    d_gru_units = d_gru_layer_original.units
    d_emb_output_dim = d_emb_layer.output_dim # This will be 128 (embedding_dim)
    d_gru_layer = layers.GRU(
        units=d_gru_units,
        return_sequences=False, # Single output per step
        return_state=True,     # Return the updated state
        name='d_gru_inference' # Give it a new name
    )
    # Build the GRU layer with dummy input matching its expected sequence input and initial state
    _ = d_gru_layer(tf.zeros((1, 1, d_emb_output_dim)), initial_state=tf.zeros((1, d_gru_units)))
    d_gru_layer.set_weights(d_gru_layer_original.get_weights()) # Set weights after building

    d_dense_layer_1_config = model.get_layer("d_dense_1").get_config()
    d_dense_layer_1 = layers.Dense.from_config(d_dense_layer_1_config)
    _ = d_dense_layer_1(tf.zeros((1, 256))) # Build layer
    d_dense_layer_1.set_weights(model.get_layer("d_dense_1").get_weights())

    d_final_layer_config = model.get_layer("d_dense_final").get_config()
    d_final_layer = layers.Dense.from_config(d_final_layer_config)
    _ = d_final_layer(tf.zeros((1, 512))) # Build layer
    d_final_layer.set_weights(model.get_layer("d_dense_final").get_weights())

    # Build the graph using the recreated vectorizer
    d_vectorized_out = recreated_de_vectorizer(d_inp) # Output (1, 1)
    d_emb_out = d_emb_layer(d_vectorized_out)         # Output (1, 1, 128)

    # Wrap tf.ensure_shape in a Lambda layer to use it within the functional API
    d_emb_out = layers.Lambda(lambda x: tf.ensure_shape(x, (1, 1, d_emb_layer.output_dim)),
                              name='ensure_emb_shape')(d_emb_out)

    # GRU now returns output and state
    d_gru_output, d_new_state = d_gru_layer(d_emb_out, initial_state=d_state_inp) # d_gru_output (1, 256), d_new_state (1, 256)

    d_dense1_out = d_dense_layer_1(d_gru_output) # Use d_gru_output for dense layers, output (1, 512)
    d_final_out = d_final_layer(d_dense1_out)   # Output (1, vocab_size+2)

    de_model_base = tf.keras.models.Model(
        inputs=[d_inp, d_state_inp],
        outputs=[d_final_out, d_new_state] # Output prediction AND new state
    )

    # Wrap the decoder model in tf.function with explicit input_signature
    @tf.function(input_signature=[
        tf.TensorSpec(shape=(1, 1), dtype=tf.string), # Input for d_inp (current word token)
        tf.TensorSpec(shape=(1, 256), dtype=tf.float32) # Input for d_state_inp (previous GRU state)
    ])
    def de_model_inference_step(input_token, prev_state):
        pred_logits, new_state_raw = de_model_base(inputs=[input_token, prev_state], training=False)
        # Explicitly ensure the shape of the output state
        new_state = tf.ensure_shape(new_state_raw, (1, 256))
        return pred_logits, new_state

    return en_model, de_model_inference_step

# Update model_path to use the new .keras extension
model_path = os.path.join('models', 'seq2seq_ch11.keras')
en_model, de_model_inference_step = get_inference_model(model_path, de_vocab_size=de_vocab)
print("Inference models built.")

def generate_new_translation(en_model, de_model_inference_step, de_vocabulary, sample_en_text, max_len=20):
    print(f"Input: {sample_en_text}")

    # 1. Get the context vector from the encoder
    d_state = en_model.predict(tf.constant([[sample_en_text]], dtype=tf.string), verbose=0)

    # 2. Start the decoder with the 'sos' token
    de_word = start_token
    de_translation = []

    # 3. Recursive loop
    for _ in range(max_len):
        # Predict the next word and get the new state using the tf.function wrapped step
        # Pass inputs directly to the tf.function, not through .predict()
        de_pred, d_state = de_model_inference_step(tf.constant([[de_word]], dtype=tf.string), d_state)

        # Get the word ID with the highest probability
        de_word_id = np.argmax(de_pred[0])

        # Look up the word from the ID
        de_word = de_vocabulary[de_word_id]

        if de_word == end_token:
            break

        de_translation.append(de_word)

    print(f"Translation: {' '.join(de_translation)}\n")

# --- Test the inference model ---
for i in range(5):
    sample_en_text = test_df["EN"].iloc[i]
    generate_new_translation(en_model, de_model_inference_step, de_vocabulary, sample_en_text)
