In [1]:
# Cell 1: Install Libraries and Import Modules
# !pip install wandb tensorflow numpy pandas scikit-learn matplotlib

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Embedding, LSTM, GRU, Dense, Dropout, Concatenate, AdditiveAttention, Attention
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split # Though not directly used in sweep, good to have
import wandb
from wandb.integration.keras import WandbMetricsLogger, WandbModelCheckpoint

import os
import re
import time
import unicodedata
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker # For heatmap ticks

In [2]:
# Cell 2: Wandb Login
# Run this cell and follow the instructions to log in to your W&B account.
wandb.login()

wandb: Currently logged in as: ce21b097 (ce21b097-indian-institute-of-technology-madras) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin


True

In [3]:
# Cell 3: Data Loading and Initial Parsing

def load_data(filepath):
    """Loads data from a TSV file."""
    try:
        df = pd.read_csv(filepath, sep='\t', header=None, on_bad_lines='skip', names=['native', 'roman', 'count'])
        df.dropna(subset=['native', 'roman'], inplace=True)
        input_texts = df['roman'].astype(str).tolist()
        target_texts = df['native'].astype(str).tolist()
        return input_texts, target_texts
    except Exception as e:
        print(f"Error loading data from {filepath}: {e}")
        return [], []

# --- Define file paths ---
dataset_base_dir = 'dakshina_dataset_v1.0' 
language = 'hi' # Hindi

train_file = os.path.join(dataset_base_dir, language, 'lexicons', f'{language}.translit.sampled.train.tsv')
dev_file = os.path.join(dataset_base_dir, language, 'lexicons', f'{language}.translit.sampled.dev.tsv')
test_file = os.path.join(dataset_base_dir, language, 'lexicons', f'{language}.translit.sampled.test.tsv')

# Load data
input_texts_train_full, target_texts_train_full = load_data(train_file)
input_texts_val, target_texts_val = load_data(dev_file)
input_texts_test, target_texts_test = load_data(test_file)

# For sweeps, we can use the same split as before or the full training data
input_texts_train, target_texts_train = input_texts_train_full, target_texts_train_full

print(f"Training samples: {len(input_texts_train)}")
print(f"Validation samples: {len(input_texts_val)}")
print(f"Test samples: {len(input_texts_test)}")

if len(input_texts_train) > 0 and len(target_texts_train) > 0:
    print("\nSample training data:")
    for i in range(min(3, len(input_texts_train))):
        print(f"Input: {input_texts_train[i]}, Target: {target_texts_train[i]}")
else:
    print("No training data loaded. Please check file paths and content.")

Training samples: 44202
Validation samples: 4358
Test samples: 4502

Sample training data:
Input: an, Target: अं
Input: ankganit, Target: अंकगणित
Input: uncle, Target: अंकल


In [4]:
# Cell 4: Data Preprocessing - Vocabulary, Tokenization, Padding

# --- Character sets and tokenization ---
input_characters = set()
target_characters = set()

# Use all available text data (train_full, val, test) to build a comprehensive vocabulary
# This ensures that characters encountered in val/test are known.
all_input_texts_for_vocab = input_texts_train_full + input_texts_val + input_texts_test
all_target_texts_for_vocab = target_texts_train_full + target_texts_val + target_texts_test


for text in all_input_texts_for_vocab:
    for char in str(text): # Ensure text is string
        if char not in input_characters:
            input_characters.add(char)

for text in all_target_texts_for_vocab:
    for char in str(text): # Ensure text is string
        if char not in target_characters:
            target_characters.add(char)

# Add special tokens
SOS_TOKEN = '\t' # Start Of Sequence
EOS_TOKEN = '\n' # End Of Sequence
target_characters.add(SOS_TOKEN)
target_characters.add(EOS_TOKEN)


input_char_list = sorted(list(input_characters))
target_char_list = sorted(list(target_characters))

num_encoder_tokens = len(input_char_list)
num_decoder_tokens = len(target_char_list)

input_token_index = {char: i for i, char in enumerate(input_char_list)}
target_token_index = {char: i for i, char in enumerate(target_char_list)}

reverse_input_char_index = {i: char for char, i in input_token_index.items()}
reverse_target_char_index = {i: char for char, i in target_token_index.items()}

# Determine max sequence lengths from all data splits
all_input_texts_combined = input_texts_train_full + input_texts_val + input_texts_test
all_target_texts_combined = target_texts_train_full + target_texts_val + target_texts_test

max_encoder_seq_length = max(len(str(text)) for text in all_input_texts_combined)
max_decoder_seq_length = max(len(str(text)) for text in all_target_texts_combined) + 2 # +2 for SOS and EOS

print(f"\nNumber of unique input tokens: {num_encoder_tokens}")
print(f"Number of unique output tokens: {num_decoder_tokens}")
print(f"Max sequence length for inputs: {max_encoder_seq_length}")
print(f"Max sequence length for outputs: {max_decoder_seq_length}")


# --- Vectorize the data ---
def vectorize_data(input_texts, target_texts):
    encoder_input_data = np.zeros((len(input_texts), max_encoder_seq_length), dtype="float32")
    decoder_input_data = np.zeros((len(target_texts), max_decoder_seq_length), dtype="float32")
    decoder_target_data = np.zeros((len(target_texts), max_decoder_seq_length, num_decoder_tokens), dtype="float32")

    for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
        for t, char in enumerate(str(input_text)): # Ensure input_text is string
            if t < max_encoder_seq_length and char in input_token_index:
                 encoder_input_data[i, t] = input_token_index[char]
        
        processed_target_text = SOS_TOKEN + str(target_text) + EOS_TOKEN # Ensure target_text is string
        for t, char in enumerate(processed_target_text):
            if t < max_decoder_seq_length:
                if char in target_token_index:
                    decoder_input_data[i, t] = target_token_index[char]
                    if t > 0: 
                        decoder_target_data[i, t - 1, target_token_index[char]] = 1.0 
    
    return encoder_input_data, decoder_input_data, decoder_target_data

# Vectorize the training and validation sets (using the 'train' subset for initial sweep training if preferred)
encoder_input_train, decoder_input_train, decoder_target_train = vectorize_data(input_texts_train, target_texts_train)
encoder_input_val, decoder_input_val, decoder_target_val = vectorize_data(input_texts_val, target_texts_val)
# Test data will be vectorized later before final evaluation.

print("\nShape of encoder_input_train:", encoder_input_train.shape)
print("Shape of decoder_input_train:", decoder_input_train.shape)
print("Shape of decoder_target_train:", decoder_target_train.shape)
print("Shape of encoder_input_val:", encoder_input_val.shape)


Number of unique input tokens: 26
Number of unique output tokens: 65
Max sequence length for inputs: 20
Max sequence length for outputs: 21

Shape of encoder_input_train: (44202, 20)
Shape of decoder_input_train: (44202, 21)
Shape of decoder_target_train: (44202, 21, 65)
Shape of encoder_input_val: (4358, 20)


In [5]:
# Cell 5: Bahdanau Attention Layer Definition

class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units) # For encoder outputs
        self.W2 = tf.keras.layers.Dense(units) # For decoder hidden state
        self.V = tf.keras.layers.Dense(1)      # To compute the score

    def call(self, query, values):
        # query shape == (batch_size, hidden_size) (decoder hidden state)
        # values shape == (batch_size, max_len, hidden_size) (encoder outputs)

        # Expand query to broadcast addition along sequence length dimension
        # query_with_time_axis shape == (batch_size, 1, hidden_size)
        query_with_time_axis = tf.expand_dims(query, 1)

        # score shape == (batch_size, max_len, 1)
        # self.V applied to sum of W1(values) and W2(query)
        # W1(values) shape == (batch_size, max_len, units)
        # W2(query_with_time_axis) shape == (batch_size, 1, units)
        # tf.nn.tanh(...) shape == (batch_size, max_len, units)
        score = self.V(tf.nn.tanh(
            self.W1(values) + self.W2(query_with_time_axis)))

        # attention_weights shape == (batch_size, max_len, 1)
        attention_weights = tf.nn.softmax(score, axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights


In [6]:
# Cell 6: Attention Model Building Function

def build_attention_seq2seq_model(config):
    # Encoder
    encoder_inputs = Input(shape=(None,), name="encoder_inputs")
    enc_emb = Embedding(num_encoder_tokens, config.input_embedding_size, name="encoder_embedding")(encoder_inputs)

    # Select RNN cell type
    if config.cell_type == "LSTM":
        RNNCellEncoder = LSTM
        RNNCellDecoder = LSTM
    elif config.cell_type == "GRU":
        RNNCellEncoder = GRU
        RNNCellDecoder = GRU
    else:
        RNNCellEncoder = keras.layers.SimpleRNN
        RNNCellDecoder = keras.layers.SimpleRNN

    # For simplicity and effective attention, encoder returns all sequences.
    # Using a single layer for encoder as suggested for simplicity, but configurable.
    encoder_outputs_list = []
    current_encoder_output = enc_emb
    encoder_states = [] # For the final state

    for i in range(config.encoder_layers):
        is_last_layer = (i == config.encoder_layers - 1)
        encoder_rnn = RNNCellEncoder(config.hidden_size,
                                     return_sequences=True, # Crucial for attention
                                     return_state=True,
                                     dropout=config.dropout_rate,
                                     name=f"encoder_{config.cell_type}_{i}")
        if config.cell_type == "LSTM":
            current_encoder_output, state_h, state_c = encoder_rnn(current_encoder_output)
            if is_last_layer:
                encoder_states = [state_h, state_c]
        else: # GRU or SimpleRNN
            current_encoder_output, state_h = encoder_rnn(current_encoder_output)
            if is_last_layer:
                encoder_states = [state_h]
    
    encoder_all_outputs = current_encoder_output # These are the 'values' for attention

    # Decoder
    decoder_inputs = Input(shape=(None,), name="decoder_inputs")
    dec_emb_layer = Embedding(num_decoder_tokens, config.input_embedding_size, name="decoder_embedding")
    dec_emb = dec_emb_layer(decoder_inputs)

    # Decoder RNN - single layer as suggested for simplicity, but configurable
    # It will process one timestep at a time in the training model for clarity,
    # though Keras handles the loop.
    decoder_rnn_layer = RNNCellDecoder(config.hidden_size,
                                   return_sequences=True,
                                   return_state=True,
                                   dropout=config.dropout_rate,
                                   name=f"decoder_{config.cell_type}")

    # Attention Layer
    attention_layer = BahdanauAttention(config.hidden_size) # Attention units often same as hidden_size

    all_decoder_outputs = []
    
    # Initial decoder hidden state from last encoder state
    # If multiple encoder layers, encoder_states will be from the *last* encoder layer.
    # If multiple decoder layers, this initial state is for the *first* decoder layer.
    # For this example, we'll assume single layer decoder for state handling ease.
    # If decoder_layers > 1, state management becomes more complex in custom loops.
    # For now, let's make decoder_layers = 1 for this example to match the "simplicity" note.
    # If config.decoder_layers > 1, the loop below needs careful state piping.
    
    # For training, Keras handles the looping over timesteps.
    # We need to prepare the inputs for the attention mechanism.
    # The decoder's hidden state at each step `t` will query the encoder_all_outputs.

    # We need a setup that allows Keras to build the graph.
    # Using an RNN with a custom cell or a more explicit loop with functional API.
    # Keras `RNN` layer with a custom cell is one way.
    # Another is to use the `AdditiveAttention` layer if we want Bahdanau, or `Attention` for Luong.
    
    # Simpler approach for Keras:
    # Decoder RNN runs first, then its output is used with context.
    # Let's use the Keras built-in AdditiveAttention or Attention layer for conciseness.
    # `tf.keras.layers.Attention` is Luong-style (multiplicative/dot-product).
    # `tf.keras.layers.AdditiveAttention` is Bahdanau-style.
    
    # For teaching purposes, let's try a manual loop-like structure within functional API for one step,
    # then adapt to how Keras expects full sequences.

    # Correct way for training with Keras layers:
    # The decoder RNN will take the embedded input and its previous state.
    # Its output (query) and encoder_all_outputs (values) go to attention.
    # The context vector from attention is concatenated with RNN output and fed to Dense.

    # Initial decoder hidden state:
    decoder_hidden_states = encoder_states # From last encoder layer

    # Decoder processing loop (conceptual for training; Keras handles this with return_sequences=True)
    # For TF functional API, we'd pass the whole sequence
    
    # Let's use the decoder_rnn_layer to process the entire sequence of decoder embeddings
    if config.cell_type == "LSTM":
        decoder_rnn_outputs, _, _ = decoder_rnn_layer(dec_emb, initial_state=decoder_hidden_states)
    else: # GRU / SimpleRNN
        decoder_rnn_outputs, _ = decoder_rnn_layer(dec_emb, initial_state=decoder_hidden_states)
    
    # decoder_rnn_outputs shape: (batch_size, target_seq_len, decoder_hidden_size)
    # encoder_all_outputs shape: (batch_size, input_seq_len, encoder_hidden_size)
    # Note: For Bahdanau, encoder_hidden_size and decoder_hidden_size can be different.
    # Our BahdanauAttention layer takes W1(values) and W2(query).
    # If hidden_size is same for encoder and decoder, it simplifies. Assume config.hidden_size is used for both.
    
    # Using tf.keras.layers.AdditiveAttention (Bahdanau-style)
    # query is decoder_rnn_outputs, value is encoder_all_outputs
    # The attention layer will compute context vector for each decoder timestep.
    context_vector_seq, attention_weights_seq = tf.keras.layers.AdditiveAttention(name="attention_layer")(
        [decoder_rnn_outputs, encoder_all_outputs], return_attention_scores=True
    )
    # context_vector_seq shape: (batch_size, target_seq_len, encoder_hidden_size)
    # attention_weights_seq shape: (batch_size, target_seq_len, input_seq_len)

    # Concatenate context vector with decoder RNN output
    decoder_concat_input = Concatenate(axis=-1, name="concat_layer")([decoder_rnn_outputs, context_vector_seq])

    # Final output layer
    decoder_dense = Dense(num_decoder_tokens, activation="softmax", name="decoder_output_dense")
    decoder_outputs_final = decoder_dense(decoder_concat_input)

    model = Model([encoder_inputs, decoder_inputs], decoder_outputs_final)
    
    # Compile
    optimizer_choice = tf.keras.optimizers.Adam(learning_rate=config.learning_rate)
    # Allow other optimizers from config if specified
    if hasattr(config, 'optimizer'):
        if config.optimizer == 'nadam':
            optimizer_choice = tf.keras.optimizers.Nadam(learning_rate=config.learning_rate)
        elif config.optimizer == 'rmsprop':
            optimizer_choice = tf.keras.optimizers.RMSprop(learning_rate=config.learning_rate)
    
    model.compile(optimizer=optimizer_choice, loss="categorical_crossentropy", metrics=["accuracy"])
    return model

# Quick test of model building (optional, for dev)
# class DummyConfig:
#     input_embedding_size = 64
#     hidden_size = 128
#     encoder_layers = 1
#     decoder_layers = 1 # For attention model, decoder RNN is often single layer before attention combination
#     cell_type = "GRU"
#     dropout_rate = 0.1
#     learning_rate = 0.001
#     optimizer = "adam"
# test_config = DummyConfig()
# if num_encoder_tokens > 0 and num_decoder_tokens > 0 : # Only if vocab is built
#     try:
#         test_attention_model = build_attention_seq2seq_model(test_config)
#         test_attention_model.summary()
#         print("Attention model built successfully.")
#     except Exception as e:
#         print(f"Error building attention model: {e}")
# else:
#     print("Vocab not built, skipping model build test.")

In [7]:
# Cell 7: Inference Models for Attention Model

def build_attention_inference_models(training_model, config):
    # --- Encoder Model ---
    encoder_inputs_inf = training_model.get_layer("encoder_inputs").input
    encoder_embedding_inf = training_model.get_layer("encoder_embedding")(encoder_inputs_inf)
    
    current_encoder_output_inf = encoder_embedding_inf
    encoder_states_inf = [] # Final states of the last encoder layer
    
    for i in range(config.encoder_layers):
        encoder_rnn_layer_inf = training_model.get_layer(f"encoder_{config.cell_type}_{i}")
        if config.cell_type == "LSTM":
            current_encoder_output_inf, state_h_enc, state_c_enc = encoder_rnn_layer_inf(current_encoder_output_inf)
            if i == config.encoder_layers - 1: # Last layer
                encoder_states_inf = [state_h_enc, state_c_enc]
        else: # GRU or SimpleRNN
            current_encoder_output_inf, state_h_enc = encoder_rnn_layer_inf(current_encoder_output_inf)
            if i == config.encoder_layers - 1: # Last layer
                encoder_states_inf = [state_h_enc]
    
    encoder_all_outputs_inf = current_encoder_output_inf # All hidden states from last encoder layer
    encoder_model_inf = Model(encoder_inputs_inf, [encoder_all_outputs_inf] + encoder_states_inf)

    # --- Decoder Model for Inference (step-by-step) ---
    decoder_hidden_size = config.hidden_size # Assuming decoder hidden size is same as attention units
    
    # Inputs for the decoder step
    decoder_input_single_step = Input(shape=(1,), name="decoder_input_single_step") # One token
    encoder_all_outputs_as_input = Input(shape=(max_encoder_seq_length, config.hidden_size), name="encoder_all_outputs_as_input") # From encoder

    # Decoder initial states (list of tensors, one for h, one for c if LSTM)
    decoder_initial_states_inputs = []
    if config.cell_type == "LSTM":
        decoder_state_input_h = Input(shape=(decoder_hidden_size,), name="decoder_state_h_input")
        decoder_state_input_c = Input(shape=(decoder_hidden_size,), name="decoder_state_c_input")
        decoder_initial_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    else: # GRU or SimpleRNN
        decoder_state_input_h = Input(shape=(decoder_hidden_size,), name="decoder_state_h_input")
        decoder_initial_states_inputs = [decoder_state_input_h]

    # Get layers from the trained model
    dec_emb_layer_inf = training_model.get_layer("decoder_embedding")
    decoder_rnn_layer_inf = training_model.get_layer(f"decoder_{config.cell_type}") # Assumes single layer decoder for simplicity
    attention_layer_inf = training_model.get_layer("attention_layer") # Keras AdditiveAttention
    decoder_dense_inf = training_model.get_layer("decoder_output_dense")
    concat_layer_inf = training_model.get_layer("concat_layer")


    # Decoder step execution
    dec_emb_single_step = dec_emb_layer_inf(decoder_input_single_step) # Shape (batch, 1, embedding_dim)

    # Decoder RNN step
    if config.cell_type == "LSTM":
        decoder_rnn_output_step, state_h_dec, state_c_dec = decoder_rnn_layer_inf(
            dec_emb_single_step, initial_state=decoder_initial_states_inputs
        )
        decoder_new_states = [state_h_dec, state_c_dec]
    else: # GRU or SimpleRNN
        decoder_rnn_output_step, state_h_dec = decoder_rnn_layer_inf(
            dec_emb_single_step, initial_state=decoder_initial_states_inputs
        )
        decoder_new_states = [state_h_dec]
    # decoder_rnn_output_step shape (batch, 1, decoder_hidden_size)
    
    # Attention step
    # query is decoder_rnn_output_step, value is encoder_all_outputs_as_input
    context_vector_step, attention_weights_step = attention_layer_inf(
        [decoder_rnn_output_step, encoder_all_outputs_as_input], return_attention_scores=True
    )
    # context_vector_step shape: (batch, 1, encoder_hidden_size)
    # attention_weights_step shape: (batch, 1, input_seq_len) (for AdditiveAttention)
    # Squeeze out the time dimension from attention weights for plotting: (batch, input_seq_len)
    squeezed_attention_weights = tf.squeeze(attention_weights_step, axis=1)


    # Concatenate context vector with decoder RNN output for this step
    decoder_concat_input_step = concat_layer_inf([decoder_rnn_output_step, context_vector_step])
    # decoder_concat_input_step shape (batch, 1, combined_hidden_size)

    # Final dense layer for this step
    decoder_output_final_step = decoder_dense_inf(decoder_concat_input_step) # Shape (batch, 1, num_decoder_tokens)
    # Squeeze out the time dimension for output probabilities
    squeezed_decoder_output = tf.squeeze(decoder_output_final_step, axis=1)


    decoder_model_inf = Model(
        [decoder_input_single_step, encoder_all_outputs_as_input] + decoder_initial_states_inputs, 
        [squeezed_decoder_output, squeezed_attention_weights] + decoder_new_states
    )
    
    return encoder_model_inf, decoder_model_inf

In [8]:
# Cell 8: Beam Search Decode Function for Attention Model

def decode_sequence_beam_search_attention(input_seq_vectorized, encoder_model, decoder_model, beam_width, config):
    # Encode the input to get all encoder outputs and final states
    encoder_outputs = encoder_model.predict(input_seq_vectorized, verbose=0)
    encoder_all_hidden_states = encoder_outputs[0] # This is the `values` for attention
    
    # Initial decoder states come from the rest of encoder_outputs
    # If LSTM: [state_h, state_c], if GRU/RNN: [state_h]
    initial_decoder_states = encoder_outputs[1:] 

    # Start with the SOS token
    start_token_idx = target_token_index[SOS_TOKEN]
    
    # Initial beam: (sequence_indices, log_probability, last_decoder_states, list_of_attention_weights_for_seq)
    # Sequence is a list of token indices
    initial_beam = [([start_token_idx], 0.0, initial_decoder_states, [])] 
    live_hypotheses = initial_beam

    completed_hypotheses = []

    for _ in range(max_decoder_seq_length): # Max decode steps
        new_hypotheses_candidates = []
        
        # If all live hypotheses have ended or no live hypotheses left
        if not live_hypotheses or all(h[0][-1] == target_token_index[EOS_TOKEN] for h in live_hypotheses if len(h[0]) > 1):
            break

        for seq_indices, score, current_decoder_states, attn_weights_list in live_hypotheses:
            # If EOS token is the last token, this hypothesis is complete
            if seq_indices[-1] == target_token_index[EOS_TOKEN] and len(seq_indices) > 1:
                completed_hypotheses.append((seq_indices, score / (len(seq_indices)-1), current_decoder_states, attn_weights_list)) # Normalize score
                continue # Don't expand completed hypotheses

            # Prepare decoder input for the next step
            last_token_idx_input = np.array([[seq_indices[-1]]])
            
            decoder_model_inputs = [last_token_idx_input, encoder_all_hidden_states] + current_decoder_states
            
            # Predict next token probabilities, attention_weights, and new decoder states
            decoder_pred_outputs = decoder_model.predict(decoder_model_inputs, verbose=0)
            
            output_token_probs = decoder_pred_outputs[0]      # Shape (batch=1, num_decoder_tokens)
            attention_weights_step = decoder_pred_outputs[1]  # Shape (batch=1, input_seq_len)
            new_decoder_states = decoder_pred_outputs[2:]     # List of state tensors

            # Using log probabilities
            log_probs = np.log(output_token_probs[0] + 1e-9) # Add epsilon for stability
            
            # Get top N candidates (N=beam_width)
            top_k_indices = np.argsort(log_probs)[-beam_width:]
            
            for token_idx in top_k_indices:
                new_seq_indices = seq_indices + [token_idx]
                new_score = score + log_probs[token_idx]
                new_attn_weights_list = attn_weights_list + [attention_weights_step[0]] # Store current step's attention
                new_hypotheses_candidates.append((new_seq_indices, new_score, new_decoder_states, new_attn_weights_list))
        
        # Sort all candidates by score and prune to keep only beam_width
        if new_hypotheses_candidates:
            live_hypotheses = sorted(new_hypotheses_candidates, key=lambda x: x[1], reverse=True)[:beam_width]
        else: # No new candidates, all might have completed or something went wrong
            live_hypotheses = [] 

    # Add any remaining live hypotheses to completed (if they didn't end with EOS but max_len reached)
    for seq_indices, score, states, attn_list in live_hypotheses:
         completed_hypotheses.append((seq_indices, score / (len(seq_indices)-1) if len(seq_indices) > 1 else score, states, attn_list))

    if not completed_hypotheses: # Handle cases where no hypothesis completes (e.g. very short max_decoder_seq_length)
        # Fallback: use the best live hypothesis if any, or return empty
        if live_hypotheses:
             best_hypothesis = max(live_hypotheses, key=lambda x: x[1]/len(x[0]) if len(x[0]) > 1 else x[1])
        else:
            return "", np.array([]) # Return empty string and empty attention weights
    else:
        # Choose the best hypothesis from completed ones (highest normalized score)
        best_hypothesis = max(completed_hypotheses, key=lambda x: x[1])
        
    decoded_sentence_indices = best_hypothesis[0]
    final_attention_weights_list = best_hypothesis[3] # List of attention arrays for each decoded step
    
    # Convert indices to characters
    decoded_sentence = ""
    for token_idx in decoded_sentence_indices:
        if token_idx == target_token_index[SOS_TOKEN]:
            continue
        if token_idx == target_token_index[EOS_TOKEN]:
            break
        if token_idx in reverse_target_char_index:
             decoded_sentence += reverse_target_char_index[token_idx]
    
    # Stack attention weights into a matrix: (target_len, input_len)
    # Exclude SOS token's attention if it was part of the loop
    # The length of final_attention_weights_list should match length of decoded_sentence (approx)
    if final_attention_weights_list:
        attention_matrix = np.array(final_attention_weights_list)
        # Ensure attention_matrix is 2D, e.g. (len_output_seq, len_input_seq)
        if attention_matrix.ndim == 1: # If only one step, may need reshape
            attention_matrix = np.expand_dims(attention_matrix, axis=0)
    else:
        attention_matrix = np.array([]) # Empty if no decoding steps produced attention

    return decoded_sentence, attention_matrix

In [9]:
# Cell 9: Training and Evaluation Function for Sweep (train_evaluate_attention)

def train_evaluate_attention():
    keras.backend.clear_session()
    run = wandb.init() # Project/entity inherited from sweep
    config = wandb.config

    print(f"--- Attention Model: Building model for run {run.id if run else 'N/A'} with config: {dict(config)} ---")
    attention_training_model = build_attention_seq2seq_model(config)
    
    early_stopping = EarlyStopping(monitor='val_loss', patience=config.early_stopping_patience, restore_best_weights=True, verbose=1)
    wandb_metrics_logger = WandbMetricsLogger(log_freq="epoch")

    print(f"--- Attention Model: Starting training for run {run.id if run else 'N/A'} ---")
    history = attention_training_model.fit(
        [encoder_input_train, decoder_input_train],
        decoder_target_train,
        batch_size=config.batch_size,
        epochs=config.epochs,
        validation_data=([encoder_input_val, decoder_input_val], decoder_target_val),
        callbacks=[early_stopping, wandb_metrics_logger],
        verbose=1
    )

    wandb.log({"val_exact_match_accuracy_attention": history.history['val_accuracy'][-1]})
    
#    # --- Evaluation with Beam Search (on validation set for sweep metric) ---
#    print(f"--- Attention Model: Building inference models for run {run.id if run else 'N/A'} ---")
#    encoder_model_inf, decoder_model_inf = build_attention_inference_models(attention_training_model, config)
#
#    correct_predictions = 0
#    total_predictions = encoder_input_val.shape[0]
#    
#    if total_predictions == 0:
#        print("No validation data for attention model evaluation.")
#        wandb.log({"val_exact_match_accuracy_attention": 0.0})
#        return
#
#    eval_table_data_attention = []
#    print(f"--- Attention Model: Starting validation evaluation for run {run.id if run else 'N/A'} ---")
#    for i in range(min(total_predictions, 1000)): # Evaluate on a subset of val for speed during sweep
#        current_input_vector = encoder_input_val[i:i+1]
#        original_input_text = input_texts_val[i]
#        original_target_text = target_texts_val[i]
#        
#        decoded_sentence, _ = decode_sequence_beam_search_attention( # We don't need attention weights here
#            current_input_vector, encoder_model_inf, decoder_model_inf, config.beam_size, config
#        )
#        
#        if decoded_sentence == original_target_text:
#            correct_predictions += 1
#        
#        if i < 3 and run: # Log first 3 examples to W&B Table per run
#            eval_table_data_attention.append([original_input_text, original_target_text, decoded_sentence])
#
#    if eval_table_data_attention and run:
#        try:
#            wandb.log({"eval_examples_attention": wandb.Table(data=eval_table_data_attention,
#                                                       columns=["Input", "True Target", "Predicted Target"])})
#        except Exception as e:
#            print(f"Error logging eval_examples_attention to W&B: {e}")  
#
#    val_exact_match_accuracy = correct_predictions / min(total_predictions, 1000) if min(total_predictions, 1000) > 0 else 0.0
#    
#    if run:
#        try:
#            wandb.log({"val_exact_match_accuracy_attention": val_exact_match_accuracy})
#        except Exception as e:
#            print(f"Error logging val_exact_match_accuracy_attention to W&B: {e}")
#    
#    print(f"Run {run.id if run else 'Unknown'} | Attention Model Validation Accuracy (Beam {config.beam_size}): {val_exact_match_accuracy:.4f}")
#    print(f"--- Attention Model: Finished evaluation for run {run.id if run else 'N/A'} ---")

In [10]:
# Cell 10: Wandb Sweep Configuration for Attention Model

sweep_config_attention = {
    'method': 'bayes',  # Bayesian optimization, or 'random', 'grid'
    'metric': {
        'name': 'val_exact_match_accuracy_attention', # Custom metric from beam search eval
        'goal': 'maximize'   
    },
    'parameters': {
        'input_embedding_size': {
            'values': [32, 64, 128] 
        },
        'hidden_size': {
            'values': [64, 128, 256] 
        },
        'encoder_layers': {
            'values': [1, 2]
        },
        'decoder_layers': {
            'values': [1, 2]
        },
        'cell_type': {
            'values': ['GRU', 'LSTM']
        },
        'dropout_rate': {
            'values': [0.2, 0.3]
        },
        'learning_rate': {
            'values': [0.001]
        },
        'batch_size': {
            'values': [64, 128, 256]
        },
        'epochs': { # Max epochs, early stopping will handle actual duration
            'values': [50] # Reduced for quicker sweep, increase for final model
        },
        'early_stopping_patience': {
            'values': [5]
        },
        'beam_size': { # This is for evaluation
            'values': [1, 3, 5] # 1 is greedy
        },
        'optimizer': {
            'values': ['adam', 'nadam']
        }
    }
}

# Note on hyperparameters for Q5(a):
# Yes, hyperparameters should be tuned again for the attention model.
# The optimal configuration for a vanilla seq2seq model might not be optimal
# for a model with an attention mechanism due to differences in architecture,
# parameter count, and learning dynamics. The sweep above is an example.
# For simplicity, the problem statement suggests single-layer encoder/decoder.
# The sweep config can be adjusted accordingly (e.g., 'encoder_layers': {'values': [1]}).
# The current `build_attention_seq2seq_model` is designed for a single-layer decoder RNN
# before combining with attention context.

In [None]:
# Cell 11: Start Sweep Agent for Attention Model

# --- Initialize Sweep for Attention Model ---
# Ensure your W&B entity and project are correctly set.
# You might want a new project name or a way to distinguish these runs.
sweep_id_attention = wandb.sweep(
    sweep_config_attention, 
    entity="ce21b097-indian-institute-of-technology-madras", # Replace with your entity
    project="CE21B097 - DA6401 - Assignment 3" # Example: New project or tag runs
)

# --- Run Agent for Attention Model ---
# Adjust 'count' as needed.
wandb.agent(sweep_id_attention, function=train_evaluate_attention, count=15) # Example: 10 runs

print("\n--- Attention Model Sweep Finished ---")
print("Go to your W&B project page to see the results.")

Create sweep with ID: 1wuogarq
Sweep URL: https://wandb.ai/ce21b097-indian-institute-of-technology-madras/CE21B097%20-%20DA6401%20-%20Assignment%203/sweeps/1wuogarq


wandb: Agent Starting Run: duke9oan with config:
wandb: 	batch_size: 64
wandb: 	beam_size: 3
wandb: 	cell_type: GRU
wandb: 	decoder_layers: 1
wandb: 	dropout_rate: 0.2
wandb: 	early_stopping_patience: 5
wandb: 	encoder_layers: 1
wandb: 	epochs: 50
wandb: 	hidden_size: 64
wandb: 	input_embedding_size: 128
wandb: 	learning_rate: 0.001
wandb: 	optimizer: nadam


--- Attention Model: Building model for run duke9oan with config: {'batch_size': 64, 'beam_size': 3, 'cell_type': 'GRU', 'decoder_layers': 1, 'dropout_rate': 0.2, 'early_stopping_patience': 5, 'encoder_layers': 1, 'epochs': 50, 'hidden_size': 64, 'input_embedding_size': 128, 'learning_rate': 0.001, 'optimizer': 'nadam'} ---
--- Attention Model: Starting training for run duke9oan ---
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
 20/691 [..............................] - ETA: 11s - loss: 0.7485 - ac