<a href="https://colab.research.google.com/github/Jakelinecs/Tareas-Machine-Learning/blob/main/N34.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
"""
==================================================================================================
COMPLETE PRACTICE: AUTOMATIC TRANSLATION AND IMAGE CAPTIONING (KERAS/PYTORCH)
==================================================================================================

This script contains the implementation of the Seq2Seq model (character-level LSTM) for Automatic
Translation (Point 2) and the implementation of the Keras Advanced Task for Image
Captioning (Point 3), simulating the architecture of the PyTorch model.py file.

Note: The automatic translation code is fully documented line by line
to meet the task requirement.
"""

import numpy as np
# Line 39: Imports the NumPy library, essential for efficient handling of arrays and tensors.
import keras
# Line 40: Imports the Keras library to build and train the neural network.
import os
# Line 41: Imports the 'os' module to interact with the operating system (commands and paths).
from pathlib import Path
# Line 42: Imports the 'Path' class from pathlib to handle file paths.

# === LINES 5-34: INTRODUCTION/SETUP (Original script text) ===
# (These lines are informative comments in the original script and are omitted here as they are prose,
# but in the final file they must be present as the original introductory comment block).
# Lines 38-41: Import block.

# ------------------------------------------------------------------
# 2. AUTOMATIC TRANSLATION (lstm_seq2seq.py implementation)
# ------------------------------------------------------------------

"""
## Data download
"""

import zipfile  # stdlib; only needed for this extraction step

# Download the English-French sentence-pair archive (get_file caches it on disk)
# and keep the local path of the ZIP file.
fpath = keras.utils.get_file(origin="http://www.manythings.org/anki/fra-eng.zip")
# Absolute directory containing the downloaded archive; the corpus is extracted there.
dirpath = Path(fpath).parent.absolute()
# Extract with the standard library rather than shelling out to `unzip`:
# this works when the path contains spaces, does not need an external binary,
# and raises an exception on a corrupt archive instead of failing silently.
with zipfile.ZipFile(fpath) as archive:
    archive.extractall(dirpath)

"""
## Configuration
"""

# Training hyperparameters.
batch_size, epochs = 64, 100  # samples per batch / number of training epochs
latent_dim = 256  # number of units passed to the LSTM layers
num_samples = 10000  # upper bound on the number of sentence pairs read
# Location of the extracted corpus text file, next to the downloaded archive.
data_path = os.path.join(dirpath, "fra.txt")

"""
## Data preparation
"""

# Read the corpus and collect sentence pairs plus the character vocabularies.
input_texts = []  # source sentences (first tab-separated column)
target_texts = []  # target sentences (second column), wrapped in start/end markers
input_characters = set()
target_characters = set()
with open(data_path, "r", encoding="utf-8") as f:
    lines = f.read().split("\n")
# The slice never includes the final element of `lines`
# (presumably the empty string left by the file's trailing newline).
for line in lines[: min(num_samples, len(lines) - 1)]:
    # Each line must hold exactly three tab-separated fields; the third is ignored.
    input_text, target_text, _ = line.split("\t")
    # "\t" marks the start of a target sequence and "\n" marks its end.
    target_text = "\t" + target_text + "\n"
    input_texts.append(input_text)
    target_texts.append(target_text)
    # set.update adds every character of the string; duplicates are ignored.
    input_characters.update(input_text)
    target_characters.update(target_text)

# Turn the character sets into sorted lists so token indices are deterministic.
input_characters = sorted(input_characters)
target_characters = sorted(target_characters)
# Vocabulary sizes (width of the one-hot axis).
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
# Longest sentence on each side, measured in characters.
max_encoder_seq_length = max(len(txt) for txt in input_texts)
max_decoder_seq_length = max(len(txt) for txt in target_texts)

# Report the dataset statistics.
for label, value in (
    ("Number of samples:", len(input_texts)),
    ("Number of unique input tokens:", num_encoder_tokens),
    ("Number of unique output tokens:", num_decoder_tokens),
    ("Max sequence length for inputs:", max_encoder_seq_length),
    ("Max sequence length for outputs:", max_decoder_seq_length),
):
    print(label, value)

# Character -> integer index lookups used for one-hot encoding.
input_token_index = {ch: idx for idx, ch in enumerate(input_characters)}
target_token_index = {ch: idx for idx, ch in enumerate(target_characters)}

# One-hot tensors, all shaped (num_pairs, max_sequence_length, vocabulary_size).
encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length, num_encoder_tokens),
    dtype="float32",
)
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype="float32",
)
# Same layout as decoder_input_data, but shifted one timestep to the left.
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype="float32",
)

for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.0
    # Pad the remaining timesteps with the space character. The offset comes from
    # len() rather than the loop variable, which would be stale (or undefined on
    # the first pair) whenever a sentence is empty.
    encoder_input_data[i, len(input_text) :, input_token_index[" "]] = 1.0

    for t, char in enumerate(target_text):
        # The decoder input keeps every character, including the leading "\t".
        decoder_input_data[i, t, target_token_index[char]] = 1.0
        if t > 0:
            # The target is the input shifted by one step, so it omits the
            # start character and position t - 1 holds character t.
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.0
    decoder_input_data[i, len(target_text) :, target_token_index[" "]] = 1.0
    # The target holds one character fewer than the input, so its padding
    # starts one step earlier (clamped at 0 for an empty target).
    decoder_target_data[
        i, max(len(target_text) - 1, 0) :, target_token_index[" "]
    ] = 1.0

"""
## Build the model
"""
from tensorflow.keras.layers import LSTM, Dense, Input
from tensorflow.keras.models import Model

# Training graph: the encoder LSTM reads the one-hot source sequence and its final
# states seed the decoder LSTM, which predicts the target sequence step by step.
# NOTE: the inference section later retrieves layers by position
# (model.layers[2], [3], [4]), so the construction below should not be reordered.

# Encoder input: variable-length sequences of one-hot vectors.
encoder_inputs = keras.Input(shape=(None, num_encoder_tokens))
# return_state=True makes the layer return its output AND the final h and c states.
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)

# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = keras.Input(shape=(None, num_decoder_tokens))

# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
# Softmax over the target vocabulary, applied to the LSTM output sequence.
decoder_dense = Dense(num_decoder_tokens, activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

"""
## Train the model
"""

# Configure the optimizer, the loss and the reported metric.
model.compile(
    optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"]
)
# NOTE: training is disabled here (the call below is commented out), so the model
# saved further down still has its initial, untrained weights and the decoded
# sentences produced later will not be meaningful until fit() is re-enabled.
# model.fit(
#     [encoder_input_data, decoder_input_data],
#     decoder_target_data,
#     batch_size=batch_size,
#     epochs=epochs,
#     validation_split=0.2,
# )
# Save model
model.save("s2s_model.keras")
# Writes the model to "s2s_model.keras" in the current working directory.

"""
## Run inference (sampling)

1. encode input and retrieve initial decoder state
2. run one step of decoder with this initial state
and a "start of sequence" token as target.
Output will be the next target token.
3. Repeat with the current target token and current states
"""

# Define sampling models
# Restore the model and construct the encoder and decoder.
# The layers are looked up by position, which relies on the layer order of the
# training model built earlier in this file.
model = keras.models.load_model("s2s_model.keras")

# Inference encoder: source sequence -> [h, c] states.
encoder_inputs = model.input[0]  # input_1
encoder_outputs, state_h_enc, state_c_enc = model.layers[2].output  # lstm_1
encoder_states = [state_h_enc, state_c_enc]
encoder_model = Model(encoder_inputs, encoder_states)

# Inference decoder: (decoder input, previous h, previous c) -> (token probabilities, new h, new c).
decoder_inputs = model.input[1]  # input_2
# Fresh inputs through which the caller feeds the previous states at each step.
decoder_state_input_h = keras.Input(shape=(latent_dim,))
decoder_state_input_c = keras.Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
# Reuse the decoder LSTM layer taken from the loaded model.
decoder_lstm = model.layers[3]
decoder_outputs, state_h_dec, state_c_dec = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs
)
# Updated states, returned so the caller can feed them back on the next step.
decoder_states = [state_h_dec, state_c_dec]
# Reuse the output Dense layer taken from the loaded model.
decoder_dense = model.layers[4]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states
)

# Inverse vocabularies (integer index -> character), used to turn predicted
# indices back into readable text.
reverse_input_char_index = {idx: ch for ch, idx in input_token_index.items()}
reverse_target_char_index = {idx: ch for ch, idx in target_token_index.items()}


def decode_sequence(input_seq):
    """Greedily decode one encoded source sequence into target text.

    The encoder model turns ``input_seq`` into initial decoder states; the
    decoder model is then run one character at a time, feeding each predicted
    character and the updated states back in. A batch of size 1 is assumed.

    Args:
        input_seq: One-hot encoded source sequence passed to
            ``encoder_model.predict``.

    Returns:
        The decoded string. It ends with the ``"\\n"`` stop character when that
        character was predicted; otherwise decoding stops once the text is
        longer than ``max_decoder_seq_length`` characters.
    """
    # Encode the source into the initial decoder states.
    states = encoder_model.predict(input_seq, verbose=0)

    # First decoder input: a single timestep holding the start character "\t".
    step_input = np.zeros((1, 1, num_decoder_tokens))
    step_input[0, 0, target_token_index["\t"]] = 1.0

    pieces = []
    while True:
        output_tokens, h, c = decoder_model.predict(
            [step_input] + states, verbose=0
        )

        # Greedy choice: the most probable token at the last timestep.
        best_index = np.argmax(output_tokens[0, -1, :])
        best_char = reverse_target_char_index[best_index]
        pieces.append(best_char)

        # Stop on the end marker or once the length limit is exceeded.
        if best_char == "\n" or len(pieces) > max_decoder_seq_length:
            break

        # The predicted character becomes the next one-step decoder input.
        step_input = np.zeros((1, 1, num_decoder_tokens))
        step_input[0, 0, best_index] = 1.0

        # Carry the updated states into the next step.
        states = [h, c]
    return "".join(pieces)


"""
You can now generate decoded sentences as such:
"""

# Decode the first 20 training samples as a quick qualitative check.
for seq_index in range(20):
    # Slice instead of indexing so the leading batch axis (size 1) is kept.
    input_seq = encoder_input_data[seq_index : seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    # One separator line, then the source sentence and its decoded output.
    for row in (
        ("-",),
        ("Input sentence:", input_texts[seq_index]),
        ("Decoded sentence:", decoded_sentence),
    ):
        print(*row)

# ------------------------------------------------------------------
# 3. IMAGE CAPTIONING (Advanced Task: Rewriting PyTorch model in Keras)
# ------------------------------------------------------------------

from tensorflow.keras.layers import Embedding, Dropout, Concatenate, Lambda
from tensorflow.keras.applications import ResNet152
from tensorflow.keras import backend as K

# --- Hyperparameters based on the PyTorch model.py file ---
EMBED_SIZE_IC = 256    # embed_size
HIDDEN_SIZE_IC = 512   # hidden_size
# Placeholder caption-vocabulary size; replace it with the real caption vocabulary.
# It is written as a literal because VOCAB_SIZE is only assigned further down
# the file, so referencing it here raised NameError when run top to bottom.
VOCAB_SIZE_IC = 10000
RESNET_FC_IN_FEATURES = 2048  # Output of ResNet152 before mapping.

# =================================================================
# CNN ENCODER (Simulates EncoderCNN)
# =================================================================

def build_keras_encoder_ic():
    """Build the CNN image encoder (Keras counterpart of EncoderCNN).

    Returns:
        A 3-tuple ``(encoder_model, image_input, encoder_output)``: the
        standalone encoder ``Model``, its image input tensor and its projected
        feature tensor of width ``EMBED_SIZE_IC``.
    """
    # Images are channels-last: (height, width, channels).
    image_input = Input(shape=(224, 224, 3), name='ic_image_input')

    # ImageNet ResNet152 without its classification head; pooling='avg'
    # applies global average pooling to the final feature map.
    backbone = ResNet152(weights='imagenet', include_top=False, pooling='avg')

    # Projection standing in for self.linear: pooled features -> EMBED_SIZE_IC.
    projection = Dense(EMBED_SIZE_IC, activation='relu', name='ic_encoder_linear')
    encoder_output = projection(backbone(image_input))

    # A BatchNormalization layer could follow here to mirror self.bn.

    cnn_encoder = Model(inputs=image_input, outputs=encoder_output, name='IC_Encoder_CNN')
    return cnn_encoder, image_input, encoder_output

# =================================================================
# RNN DECODER (Simulates DecoderRNN)
# =================================================================

def build_keras_decoder_ic(feature_input_tensor):
    """Build the caption decoder graph (Keras counterpart of DecoderRNN).

    Args:
        feature_input_tensor: Image feature tensor; it is given a time axis and
            prepended to the embedded caption as the first timestep.

    Returns:
        A 2-tuple ``(caption_inputs, outputs)``: the caption input tensor and
        the softmax distribution over ``VOCAB_SIZE_IC`` tokens at every
        timestep of the combined sequence (image step + caption steps).
    """
    # Caption token ids, variable length.
    caption_inputs = Input(shape=(None,), name='ic_caption_input')

    # Token embedding (stands in for self.embed).
    embeddings = Embedding(
        VOCAB_SIZE_IC, EMBED_SIZE_IC, name='ic_decoder_embedding'
    )(caption_inputs)

    # Mirror torch.cat((features.unsqueeze(1), embeddings), 1): insert a time
    # axis into the image feature and place it in front of the caption.
    add_time_axis = Lambda(lambda x: K.expand_dims(x, axis=1), name='ic_feature_step')
    feature_step = add_time_axis(feature_input_tensor)
    combined_input = Concatenate(axis=1, name='ic_combined_input')(
        [feature_step, embeddings]
    )

    # Recurrent layer (stands in for self.lstm). The batch axis comes first
    # here, which corresponds to batch_first=True on the PyTorch side.
    hiddens = LSTM(
        HIDDEN_SIZE_IC, return_sequences=True, name='ic_decoder_lstm'
    )(combined_input)

    # Per-timestep vocabulary distribution (stands in for self.linear).
    outputs = Dense(
        VOCAB_SIZE_IC, activation='softmax', name='ic_decoder_output'
    )(hiddens)

    return caption_inputs, outputs

# =================================================================
# ASSEMBLY OF THE COMPLETE CAPTIONING MODEL
# =================================================================

def create_full_captioning_model():
    """Assemble, compile and summarize the full image-captioning model.

    Connects the CNN encoder to the caption decoder, compiles the resulting
    model with Adam and categorical cross-entropy, and prints its summary.

    Returns:
        The compiled Keras ``Model`` mapping ``[image, caption]`` inputs to the
        per-timestep token distribution. The function previously returned
        ``None``, so callers that ignore the result are unaffected.
    """
    # Only the tensors are needed here; the standalone encoder model is unused.
    _, image_input_tensor, feature_output_tensor = build_keras_encoder_ic()
    caption_input_tensor, decoder_output_tensor = build_keras_decoder_ic(feature_output_tensor)

    full_model = Model(
        inputs=[image_input_tensor, caption_input_tensor],
        outputs=decoder_output_tensor,
        name='Image_Captioning_Seq2Seq'
    )

    # Compilation (required for training)
    full_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    print("\n--- ADVANCED TASK: IMAGE CAPTIONING MODEL IN KERAS ---")
    print("Complete image captioning model (PyTorch model.py simulation) created.")
    full_model.summary()

    # Hand the model back so it can actually be trained or inspected.
    return full_model

# Build the captioning model and print its summary when the file runs as a script.
if __name__ == '__main__':
    # NOTE: the automatic-translation section above is module-level code, so it
    # runs regardless of this guard; only the captioning model creation below
    # is conditional.
    create_full_captioning_model()

In [3]:
"""
================================================================================================
3.2 RESEARCH: EXECUTING THE CAPTIONING MODEL IN KERAS AND MIGRATING PYTORCH WEIGHTS
================================================================================================

https://github.com/yunjey/pytorch-tutorial/tree/master/tutorials/03-advanced/image_captioning

This block describes the conceptual steps required to migrate a model trained in
PyTorch (like the image captioning one) to be executed and used in Keras/TensorFlow.

--- KEY STEPS FOR MIGRATION ---

1. DEFINE THE EQUIVALENT ARCHITECTURE IN KERAS:
   The structure of the PyTorch model (EncoderCNN + DecoderRNN) must be replicated
   in Keras using the Functional Model API. The Keras LSTM, Dense, and Embedding layers
   must have exactly the same dimensions and configurations as the PyTorch layers.

2. LOADING PYTORCH WEIGHTS:
   PyTorch (torch.load) is used to load the weight file (.pth or .pt).
   This results in a state dictionary (state_dict) containing the weight tensors
   by layer name (e.g., 'lstm.weight_ih_l0').

3. TENSOR MAPPING AND TRANSPOSING (Critical Step):
   Keras/TensorFlow and PyTorch have different conventions for storing weights.

   A. LINEAR/DENSE LAYERS (nn.Linear vs. keras.layers.Dense):
      - PyTorch stores weights in the format: [output_dim, input_dim].
      - Keras/TensorFlow stores weights in the format: [input_dim, output_dim].
      - REQUIRED ACTION: PyTorch weight tensors must be **TRANSPOSED** (using .T or np.transpose)
        before being assigned to the Keras Dense layer.

        # Conceptual example:
        # keras_weights = pytorch_weights.transpose()
        # keras_biases = pytorch_biases (Biases generally don't need transposing)

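   B. LSTM/GRU LAYERS (nn.LSTM vs. keras.layers.LSTM):
      - The recurrent weights are packed differently in each framework. For an LSTM,
        PyTorch stores four tensors per layer: weight_ih [4*hidden, input],
        weight_hh [4*hidden, hidden], bias_ih and bias_hh. Keras stores three:
        kernel [input, 4*units], recurrent_kernel [units, 4*units] and a single bias.
      - REQUIRED ACTION (LSTM): **transpose** weight_ih and weight_hh and **sum**
        bias_ih + bias_hh, then pass them in the order expected by Keras:
        [kernel, recurrent_kernel, bias]. Both frameworks order the LSTM gates as
        input, forget, cell, output, so the gate blocks do not need to be reordered.
      - REQUIRED ACTION (GRU): the gate order differs (PyTorch: reset, update, new;
        Keras: update, reset, candidate), so the tensors must be **split per gate,
        reordered, and then transposed**.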

4. ASSIGNING WEIGHTS IN KERAS:
   Once the tensors have been converted and restructured, they are injected into the
   corresponding Keras layers using the method:

   # Example:
   # keras_layer = model.get_layer('keras_layer_name')
   # keras_layer.set_weights([converted_weights, converted_biases])

This process ensures that the Keras model uses the intelligence learned by the PyTorch model.
"""

In [3]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Concatenate, Lambda
from tensorflow.keras.applications import ResNet152
from tensorflow.keras import backend as K

# --- Hyperparameters mirroring the PyTorch model ---
EMBED_SIZE, HIDDEN_SIZE = 256, 512  # embedding dimension, LSTM units
VOCAB_SIZE = 10000  # example vocabulary size; adjust to the actual one

# =================================================================
# 1. CNN ENCODER (Simulates EncoderCNN)
# =================================================================

def build_keras_encoder_ic():
    """Build the image-encoder graph that produces the image feature vector.

    Returns:
        A 2-tuple ``(image_input, encoder_output)``: the image input tensor and
        the projected feature tensor of width ``EMBED_SIZE``.
    """
    # Channels-last image input, 224x224 RGB.
    image_input = Input(shape=(224, 224, 3), name='ic_image_input')

    # ImageNet ResNet152 without its top dense layer, with global average pooling.
    backbone = ResNet152(weights='imagenet', include_top=False, pooling='avg')

    # Projection standing in for self.linear: pooled features -> EMBED_SIZE.
    # The decoder uses this vector as the first timestep of its input.
    projection = Dense(EMBED_SIZE, activation='relu', name='ic_encoder_linear')
    encoder_output = projection(backbone(image_input))

    return image_input, encoder_output

# =================================================================
# 2. RNN DECODER (Simulates DecoderRNN)
# =================================================================

def build_keras_decoder_ic(feature_input_tensor):
    """Build the decoder graph that generates the caption step by step.

    Args:
        feature_input_tensor: Image feature tensor; it is given a time axis and
            prepended to the embedded caption as the first timestep.

    Returns:
        A 2-tuple ``(caption_inputs, decoder_outputs)``: the caption input
        tensor and the softmax distribution over ``VOCAB_SIZE`` tokens at every
        timestep of the combined sequence (image step + caption steps).
    """
    # Caption token ids, variable length.
    caption_inputs = Input(shape=(None,), name='ic_caption_input')

    # Token embedding (stands in for self.embed); mask_zero=True is enabled.
    embeddings = Embedding(
        VOCAB_SIZE, EMBED_SIZE, mask_zero=True, name='ic_decoder_embedding'
    )(caption_inputs)

    # Insert a time axis into the image feature and place it in front of the
    # embedded caption, so the image acts as the first token.
    add_time_axis = Lambda(lambda x: K.expand_dims(x, axis=1), name='ic_feature_step')
    feature_step = add_time_axis(feature_input_tensor)
    combined_input = Concatenate(axis=1, name='ic_combined_input')(
        [feature_step, embeddings]
    )

    # Recurrent layer (stands in for self.lstm); return_sequences=True yields
    # one output per timestep so a token can be predicted at each step.
    hiddens = LSTM(
        HIDDEN_SIZE, return_sequences=True, name='ic_decoder_lstm'
    )(combined_input)

    # Per-timestep vocabulary distribution (stands in for self.linear).
    output_layer = Dense(VOCAB_SIZE, activation='softmax', name='ic_decoder_output')
    decoder_outputs = output_layer(hiddens)

    return caption_inputs, decoder_outputs

# =================================================================
# 3. COMPLETE CAPTIONING MODEL (Training)
# =================================================================

# Wire the encoder features into the decoder to obtain the training graph.
image_input_tensor, feature_output_tensor = build_keras_encoder_ic()
caption_input_tensor, decoder_output_tensor = build_keras_decoder_ic(
    feature_output_tensor
)

# Full model: [image, caption input] -> per-timestep caption token distribution.
captioning_model = Model(
    [image_input_tensor, caption_input_tensor],
    decoder_output_tensor,
    name='Image_Captioning_Keras_Simulation',
)

# Example compilation (left disabled):
# captioning_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# captioning_model.summary()

In [3]:
"""
================================================================================================
4. ADVANCED TASK: ADVANCED RESEARCH
================================================================================================

This block contains the answers to the advanced research questions,
structured as documentation within the Python file.
"""

# ------------------------------------------------------------------
# 4.1. Translation between Japanese and English (Jp <-> En)
# ------------------------------------------------------------------

"""
QUESTION: What steps would be taken to translate between Japanese and English?

Japanese (SOV: Subject-Object-Verb) and English (SVO: Subject-Verb-Object)
are typologically very different, requiring advanced tokenization methods
and reordering.
"""

def japanese_english_translation_steps():
    """Print the headings of the key steps for Japanese <-> English translation.

    The explanation of each step lives in the comments next to its heading.
    Returns ``None``.
    """
    headings = (
        # Step 1 (critical): Japanese is written without spaces, so the text
        # has to be segmented morphologically. Subword algorithms such as
        # SentencePiece or WordPiece split it into units that cope with the
        # different scripts (Kanji, Hiragana, Katakana).
        "1. Subword Tokenization:",
        # Step 2: an advanced model (Transformer, or NMT with attention) is
        # needed to learn the reordering between the two sentence structures,
        # e.g. the verb goes to the end in Japanese and to the middle in English.
        "2. Syntactic Reordering (SOV <-> SVO):",
        # Step 3: training needs a large, high-quality parallel corpus
        # (e.g. ASPEC) so the model can learn these complex mapping rules.
        "3. Use of Extensive Parallel Corpus:",
    )
    for heading in headings:
        print(heading)

# ------------------------------------------------------------------
# 4.2. Advanced Automatic Translation Methods
# ------------------------------------------------------------------

def advanced_translation_methods():
    """Print the headings of translation methods that go beyond basic Seq2Seq.

    The explanation of each method lives in the comments next to its heading.
    Returns ``None``.
    """
    headings = (
        # A. NMT with attention. Mechanism: the decoder computes an attention
        # vector, a weighted average of ALL encoder hidden states. Advantage:
        # the model can dynamically focus on the relevant parts of the source
        # sentence for each output word, which removes the 'bottleneck' of
        # basic Seq2Seq.
        "\nA. Attention-based NMT:",
        # B. Transformers (state of the art). Architecture: RNNs (LSTM/GRU) are
        # dropped entirely in favour of Self-Attention and Multi-Head
        # Attention. Advantage: the whole sequence is processed in parallel,
        # which greatly speeds up training and captures long-range
        # dependencies better.
        "B. Transformer Models:",
    )
    for heading in headings:
        print(heading)

# ------------------------------------------------------------------
# 4.3. Text-to-Image Generation
# ------------------------------------------------------------------

def text_to_image_generation():
    """Print the headings of the text-to-image approaches (inverse of captioning).

    The field is dominated by diffusion models; the explanation of each
    approach lives in the comments next to its heading. Returns ``None``.
    """
    headings = (
        # 1. Diffusion models (state of the art). Mechanism: the model is
        # trained to reverse a progressive noising process (denoising).
        # Conditioning: the text prompt is encoded (e.g. with CLIP) and
        # injected as a condition at every denoising step, steering the image
        # towards the description. Examples: Stable Diffusion, DALL-E 2.
        "\n1. Diffusion Models (State-of-the-Art):",
        # 2. Conditional GANs. Mechanism: two networks compete; the Generator
        # creates the image from text and the Discriminator judges whether the
        # image is realistic and matches the text description.
        "2. Conditional GANs:",
    )
    for heading in headings:
        print(heading)

# =================================================================
# EXECUTION (Call the functions to see the documentation)
# =================================================================

if __name__ == '__main__':
    # Each research section: the banner to print, then the function that
    # prints that section's headings.
    sections = (
        ("--- 4.1. Japanese <-> English Translation ---", japanese_english_translation_steps),
        ("\n--- 4.2. Advanced Automatic Translation Methods ---", advanced_translation_methods),
        ("\n--- 4.3. Text-to-Image Generation ---", text_to_image_generation),
    )
    for banner, show_section in sections:
        print(banner)
        show_section()