In [1]:
# Cell 1: Install Libraries (if not already installed)
# !pip install wandb tensorflow numpy pandas scikit-learn

# Import necessary libraries
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras # Ensures keras is from tensorflow
from tensorflow.keras.layers import Input, Embedding, LSTM, GRU, Dense, Dropout # Removed Bidirectional, AdditiveAttention, Attention as not used in Q2 model
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping # Removed ModelCheckpoint as WandbModelCheckpoint is used
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split # Not strictly used in the sweep, but good for general use
import wandb
from wandb.integration.keras import WandbMetricsLogger, WandbModelCheckpoint


import os
import re
import time
import unicodedata

In [2]:
# Cell 2: Wandb Login
# Make sure to replace with your actual entity and project names in the sweep config later
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mce21b097[0m ([33mce21b097-indian-institute-of-technology-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
# Cell 3: Data Loading and Initial Parsing

def load_data(filepath):
    """Loads data from a TSV file."""
    try:
        df = pd.read_csv(filepath, sep='\t', header=None, on_bad_lines='skip', names=['native', 'roman', 'count'])
        # The problem statement is romanized (input) to native (target)
        # e.g., "ajanabee अजनबी" -> x = "ajanabee", y = "अजनबी"
        # The Dakshina dataset lexicon format is: native_word, romanization, count
        # So, for our task: input_texts = df['roman'], target_texts = df['native']
        
        # Handle cases where lines might not have 3 columns or have NaN values
        df.dropna(subset=['native', 'roman'], inplace=True)
        
        input_texts = df['roman'].astype(str).tolist()
        target_texts = df['native'].astype(str).tolist()
        return input_texts, target_texts
    except Exception as e:
        print(f"Error loading data from {filepath}: {e}")
        return [], []

# --- Define file paths ---
# Make sure these paths are correct for your local project structure
# Assuming dakshina_dataset_v1.0 is in the same directory as the notebook
dataset_base_dir = 'dakshina_dataset_v1.0' 
language = 'hi' # Hindi

train_file = os.path.join(dataset_base_dir, language, 'lexicons', f'{language}.translit.sampled.train.tsv')
dev_file = os.path.join(dataset_base_dir, language, 'lexicons', f'{language}.translit.sampled.dev.tsv')
test_file = os.path.join(dataset_base_dir, language, 'lexicons', f'{language}.translit.sampled.test.tsv')

# Load data
input_texts_train_full, target_texts_train_full = load_data(train_file)
input_texts_val, target_texts_val = load_data(dev_file)
input_texts_test, target_texts_test = load_data(test_file) # Test set for final evaluation after sweep

# For faster sweep iterations, you might want to use a subset of the training data
# For now, let's use the full training data. If sweeps are too slow, consider sampling.
input_texts_train, target_texts_train = input_texts_train_full, target_texts_train_full

print(f"Training samples: {len(input_texts_train)}")
print(f"Validation samples: {len(input_texts_val)}")
print(f"Test samples: {len(input_texts_test)}")

if len(input_texts_train) > 0 and len(target_texts_train) > 0:
    print("\nSample training data:")
    for i in range(min(3, len(input_texts_train))):
        print(f"Input: {input_texts_train[i]}, Target: {target_texts_train[i]}")
else:
    print("No training data loaded. Please check file paths and content.")

if len(input_texts_val) == 0:
    print("No validation data loaded. Sweeps will not work correctly without validation data.")



Training samples: 44202
Validation samples: 4358
Test samples: 4502

Sample training data:
Input: an, Target: अं
Input: ankganit, Target: अंकगणित
Input: uncle, Target: अंकल


In [4]:
# Cell 4: Data Preprocessing - Vocabulary, Tokenization, Padding

# --- Character sets and tokenization ---
input_characters = set()
target_characters = set()

for text in input_texts_train:
    for char in text:
        if char not in input_characters:
            input_characters.add(char)

for text in target_texts_train:
    for char in text: # Add SOS and EOS tokens
        if char not in target_characters:
            target_characters.add(char)

# Add special tokens
SOS_TOKEN = '\t' # Start Of Sequence
EOS_TOKEN = '\n' # End Of Sequence
# Ensure target characters include SOS and EOS, even if not in training data explicitly for some reason
target_characters.add(SOS_TOKEN)
target_characters.add(EOS_TOKEN)


input_char_list = sorted(list(input_characters))
target_char_list = sorted(list(target_characters))

num_encoder_tokens = len(input_char_list)
num_decoder_tokens = len(target_char_list)

# Create char-to-index and index-to-char mappings
input_token_index = {char: i for i, char in enumerate(input_char_list)}
target_token_index = {char: i for i, char in enumerate(target_char_list)}

reverse_input_char_index = {i: char for char, i in input_token_index.items()}
reverse_target_char_index = {i: char for char, i in target_token_index.items()}

# Determine max sequence lengths
max_encoder_seq_length = max(len(text) for text in input_texts_train + input_texts_val)
max_decoder_seq_length = max(len(text) for text in target_texts_train + target_texts_val) + 2 # +2 for SOS and EOS

print(f"\nNumber of unique input tokens: {num_encoder_tokens}")
print(f"Number of unique output tokens: {num_decoder_tokens}")
print(f"Max sequence length for inputs: {max_encoder_seq_length}")
print(f"Max sequence length for outputs: {max_decoder_seq_length}")


# --- Vectorize the data ---
def vectorize_data(input_texts, target_texts, is_training=True):
    encoder_input_data = np.zeros((len(input_texts), max_encoder_seq_length), dtype="float32")
    decoder_input_data = np.zeros((len(target_texts), max_decoder_seq_length), dtype="float32")
    decoder_target_data = np.zeros((len(target_texts), max_decoder_seq_length, num_decoder_tokens), dtype="float32") # For sparse_categorical_crossentropy, this should be integers

    for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
        for t, char in enumerate(input_text):
            if char in input_token_index: # Handle chars not in vocab if any (should not happen if vocab from train)
                 encoder_input_data[i, t] = input_token_index[char]
            # else: ignore unknown char or map to a special <UNK> token if defined
        
        # Decoder target data is ahead of decoder input data by one timestep
        # and includes the start token.
        # Decoder input: <SOS> char1 char2 ...
        # Decoder target: char1 char2 ... <EOS>
        
        processed_target_text = SOS_TOKEN + target_text + EOS_TOKEN
        for t, char in enumerate(processed_target_text):
            if t < max_decoder_seq_length:
                if char in target_token_index:
                    decoder_input_data[i, t] = target_token_index[char]
                    if t > 0: # decoder_target_data will be one timestep ahead
                        # For sparse_categorical_crossentropy, target should be (batch, seq_len) with token indices
                        # However, standard Keras examples use one-hot for TimeDistributed(Dense)
                        # If using sparse_categorical_crossentropy, decoder_target_data should be integer indices
                        decoder_target_data[i, t - 1, target_token_index[char]] = 1.0 # One-hot
            # else: sequence is longer than max_decoder_seq_length, truncate
    
    # If using sparse_categorical_crossentropy, decoder_target_data should be:
    # decoder_target_data_sparse = np.zeros((len(target_texts), max_decoder_seq_length), dtype="float32")
    # ... loop ...
    # if t > 0 and char in target_token_index:
    #    decoder_target_data_sparse[i, t-1] = target_token_index[char]
    # return encoder_input_data, decoder_input_data, decoder_target_data_sparse
    
    return encoder_input_data, decoder_input_data, decoder_target_data


encoder_input_train, decoder_input_train, decoder_target_train = vectorize_data(input_texts_train, target_texts_train)
encoder_input_val, decoder_input_val, decoder_target_val = vectorize_data(input_texts_val, target_texts_val)

print("\nShape of encoder_input_train:", encoder_input_train.shape)
print("Shape of decoder_input_train:", decoder_input_train.shape)
print("Shape of decoder_target_train:", decoder_target_train.shape)



Number of unique input tokens: 26
Number of unique output tokens: 65
Max sequence length for inputs: 20
Max sequence length for outputs: 21

Shape of encoder_input_train: (44202, 20)
Shape of decoder_input_train: (44202, 21)
Shape of decoder_target_train: (44202, 21, 65)


In [5]:
# Cell 5: Model Building Function

def build_seq2seq_model(config):
    """Builds the Encoder-Decoder model based on wandb config."""
    
    # Encoder
    encoder_inputs = Input(shape=(None,), name="encoder_inputs") # None allows variable sequence length for encoder input
    emb_enc = Embedding(num_encoder_tokens, config.input_embedding_size, name="encoder_embedding")(encoder_inputs)
    
    # Select RNN cell type
    if config.cell_type == "LSTM":
        RNNCell = LSTM
    elif config.cell_type == "GRU":
        RNNCell = GRU
    else: # Default to Vanilla RNN
        RNNCell = keras.layers.SimpleRNN # Corrected SimpleRNN layer access

    encoder_rnn_output = emb_enc
    encoder_states_list = []

    for i in range(config.encoder_layers):
        is_last_layer = (i == config.encoder_layers - 1)
        rnn_layer = RNNCell(config.hidden_size, 
                            return_sequences=not is_last_layer, # Only last layer returns just state
                            return_state=True, 
                            dropout=config.dropout_rate if config.encoder_layers > 1 and i < config.encoder_layers-1 else 0.0, # Dropout between layers
                            name=f"encoder_{config.cell_type}_{i}")
        
        if config.cell_type == "LSTM":
            encoder_rnn_output, state_h, state_c = rnn_layer(encoder_rnn_output)
            encoder_states = [state_h, state_c]
        else: # GRU or SimpleRNN
            encoder_rnn_output, state_h = rnn_layer(encoder_rnn_output)
            encoder_states = [state_h]
        
        if is_last_layer: # We need the states from the last encoder layer for the decoder
            encoder_states_list = encoder_states
        else: # If intermediate layer, its output is sequence for next layer
             encoder_rnn_output = Dropout(config.dropout_rate)(encoder_rnn_output) if config.dropout_rate > 0 else encoder_rnn_output


    # Decoder
    decoder_inputs = Input(shape=(None,), name="decoder_inputs")
    # For decoder embedding, we can use a separate one or share with encoder if vocabularies were merged.
    # Here, using a separate embedding layer for the decoder.
    emb_dec = Embedding(num_decoder_tokens, config.input_embedding_size, name="decoder_embedding")(decoder_inputs) # Using same embedding dim as input
    
    decoder_rnn_output = emb_dec
    
    for i in range(config.decoder_layers):
        rnn_layer = RNNCell(config.hidden_size, 
                            return_sequences=True, 
                            return_state=True, 
                            dropout=config.dropout_rate if config.decoder_layers > 1 and i < config.decoder_layers-1 else 0.0,
                            name=f"decoder_{config.cell_type}_{i}")
        
        # Initialize decoder with encoder's final states
        # For the first decoder layer, initialize with encoder_states_list
        # For subsequent decoder layers, they will initialize with their own previous states (Keras handles this)
        initial_state_arg = encoder_states_list if i == 0 else None 
        
        if config.cell_type == "LSTM":
            decoder_rnn_output, _, _ = rnn_layer(decoder_rnn_output, initial_state=initial_state_arg)
        else: # GRU or SimpleRNN
            decoder_rnn_output, _ = rnn_layer(decoder_rnn_output, initial_state=initial_state_arg)
        
        if i < config.decoder_layers - 1: # Apply dropout between decoder layers
             decoder_rnn_output = Dropout(config.dropout_rate)(decoder_rnn_output) if config.dropout_rate > 0 else decoder_rnn_output

    # Final output layer
    decoder_dense = Dense(num_decoder_tokens, activation="softmax", name="decoder_output_dense")
    decoder_outputs = decoder_dense(decoder_rnn_output)

    # Define the model
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

    # Compile the model
    # Optimizer can also be part of the sweep
    optimizer_choice = tf.keras.optimizers.Adam(learning_rate=config.learning_rate)
    if hasattr(config, 'optimizer'):
        if config.optimizer == 'rmsprop':
            optimizer_choice = tf.keras.optimizers.RMSprop(learning_rate=config.learning_rate)
        elif config.optimizer == 'sgd':
            optimizer_choice = tf.keras.optimizers.SGD(learning_rate=config.learning_rate)
            
    model.compile(optimizer=optimizer_choice, loss="categorical_crossentropy", metrics=["accuracy"])
    # If decoder_target_data was integer indices, use "sparse_categorical_crossentropy"
    
    return model


In [6]:
# Cell 6: Inference Models and Beam Search Decode Function

def build_inference_models(training_model, config):
    # --- Encoder Model for Inference ---
    # Fix: Create Input layer instead of getting it from training model
    encoder_inputs_inf = Input(shape=(None,), name="encoder_inputs_inf") 
    
    # Get encoder embedding layer output through the layer's weights
    encoder_embedding_layer = training_model.get_layer("encoder_embedding")
    encoder_embedding_inf = encoder_embedding_layer(encoder_inputs_inf)
    
    current_encoder_output = encoder_embedding_inf
    encoder_states_inf_list = []

    # Rest of the encoder model building...
    for i in range(config.encoder_layers):
        encoder_rnn_layer_inf = training_model.get_layer(f"encoder_{config.cell_type}_{i}")
        if config.cell_type == "LSTM":
            current_encoder_output, state_h_enc, state_c_enc = encoder_rnn_layer_inf(current_encoder_output)
            if i == config.encoder_layers - 1:
                 encoder_states_inf_list = [state_h_enc, state_c_enc]
        else: # GRU or SimpleRNN
            current_encoder_output, state_h_enc = encoder_rnn_layer_inf(current_encoder_output)
            if i == config.encoder_layers - 1:
                encoder_states_inf_list = [state_h_enc]

    # Create encoder model with the new input
    encoder_model_inf = Model(encoder_inputs_inf, encoder_states_inf_list)
    # --- Decoder Model for Inference ---
    decoder_state_input_h_list = []
    decoder_state_input_c_list = [] # Only for LSTM
    decoder_states_inputs_inf_list = []

    for i in range(config.decoder_layers):
        state_h = Input(shape=(config.hidden_size,), name=f"decoder_state_input_h_{i}")
        decoder_state_input_h_list.append(state_h)
        decoder_states_inputs_inf_list.append(state_h)
        if config.cell_type == "LSTM":
            state_c = Input(shape=(config.hidden_size,), name=f"decoder_state_input_c_{i}")
            decoder_state_input_c_list.append(state_c)
            decoder_states_inputs_inf_list.append(state_c)


    decoder_inputs_inf_single_step = Input(shape=(1,), name="decoder_inputs_single_step") # Input is one char at a time
    decoder_embedding_inf = training_model.get_layer("decoder_embedding")(decoder_inputs_inf_single_step)

    current_decoder_output_inf = decoder_embedding_inf
    decoder_states_output_inf_list = []

    # The initial states for the *first* decoder layer during inference come from the encoder.
    # However, the inference decoder model needs to be general and accept states for *all its layers*.
    # For the first step, we pass encoder_states to the first decoder layer.
    # For subsequent steps, we pass the output states of the previous step.
    
    # We need to reconstruct the state inputs for each decoder layer carefully
    # Keras layers expect a list of states if stateful, or if return_state=True
    
    temp_decoder_states_inputs_inf = []
    if config.cell_type == "LSTM":
        for i in range(config.decoder_layers):
            temp_decoder_states_inputs_inf.extend([decoder_state_input_h_list[i], decoder_state_input_c_list[i]])
    else: # GRU/SimpleRNN
        for i in range(config.decoder_layers):
            temp_decoder_states_inputs_inf.append(decoder_state_input_h_list[i])

    idx = 0
    for i in range(config.decoder_layers):
        decoder_rnn_layer_inf = training_model.get_layer(f"decoder_{config.cell_type}_{i}")
        
        # Prepare initial_state for this specific layer from the input states list
        if config.cell_type == "LSTM":
            layer_initial_states = [temp_decoder_states_inputs_inf[idx], temp_decoder_states_inputs_inf[idx+1]]
            idx += 2
            current_decoder_output_inf, state_h_dec, state_c_dec = decoder_rnn_layer_inf(
                current_decoder_output_inf, initial_state=layer_initial_states
            )
            decoder_states_output_inf_list.extend([state_h_dec, state_c_dec])
        else: # GRU or SimpleRNN
            layer_initial_states = [temp_decoder_states_inputs_inf[idx]]
            idx += 1
            current_decoder_output_inf, state_h_dec = decoder_rnn_layer_inf(
                current_decoder_output_inf, initial_state=layer_initial_states
            )
            decoder_states_output_inf_list.append(state_h_dec)
            
    decoder_dense_inf = training_model.get_layer("decoder_output_dense")
    decoder_outputs_inf = decoder_dense_inf(current_decoder_output_inf)
    
    decoder_model_inf = Model(
        [decoder_inputs_inf_single_step] + temp_decoder_states_inputs_inf, 
        [decoder_outputs_inf] + decoder_states_output_inf_list
    )
    
    return encoder_model_inf, decoder_model_inf


def decode_sequence_beam_search(input_seq_vectorized, encoder_model, decoder_model, beam_width, config):
    # Encode the input as state vectors.
    states_value_list = encoder_model.predict(input_seq_vectorized, verbose=0)
    
    if not isinstance(states_value_list, list):
        states_value_list = [states_value_list]  # Convert to list if single state
    
    # Initialize decoder states
    current_states_for_decoder_model = []
    
    # Populate initial states for the first decoder layer from encoder
    current_states_for_decoder_model.extend(states_value_list)
    
    # Populate zero states for subsequent decoder layers (if any)
    batch_size = 1  # For single sequence decoding
    num_states_per_layer = 2 if config.cell_type == "LSTM" else 1
    for _ in range(1, config.decoder_layers):
        for _ in range(num_states_per_layer):
            current_states_for_decoder_model.append(np.zeros((batch_size, config.hidden_size)))

    # Start with the SOS token
    target_seq = np.array([[target_token_index[SOS_TOKEN]]])
    
    # Initial beam: (sequence_indices, log_probability, states_for_decoder_model)
    initial_beam = [([target_token_index[SOS_TOKEN]], 0.0, current_states_for_decoder_model)]
    live_hypotheses = initial_beam

    for _ in range(max_decoder_seq_length):
        new_hypotheses = []
        for seq_indices, score, current_states in live_hypotheses:
            if seq_indices[-1] == target_token_index[EOS_TOKEN] and len(seq_indices) > 1:
                new_hypotheses.append((seq_indices, score, current_states))
                continue

            # Predict next token
            last_token_idx = np.array([[seq_indices[-1]]])
            
            # Make sure all inputs have batch_size=1 and correct shapes
            decoder_model_inputs = [last_token_idx] + [
                np.reshape(state, (1, -1)) if state.shape[0] != 1 else state 
                for state in current_states
            ]
            
            output_tokens_probs_list = decoder_model.predict(decoder_model_inputs, verbose=0)
            
            output_tokens_probs = output_tokens_probs_list[0]  # Shape should be (1, 1, num_decoder_tokens)
            new_states_list = output_tokens_probs_list[1:]

            # Calculate log probabilities for all tokens
            log_probs = np.log(output_tokens_probs[0, 0] + 1e-9)  # Get probabilities for the first (and only) timestep
            top_k_indices = np.argsort(log_probs)[-beam_width:]  # Get indices of top k probabilities
            
            for token_idx in top_k_indices:
                if token_idx < len(log_probs):  # Add bounds check
                    new_seq_indices = seq_indices + [token_idx]
                    new_score = score + log_probs[token_idx]
                    new_hypotheses.append((new_seq_indices, new_score, new_states_list))

        # Sort and keep top beam_width hypotheses
        if new_hypotheses:  # Only sort if we have new hypotheses
            live_hypotheses = sorted(new_hypotheses, key=lambda x: x[1], reverse=True)[:beam_width]
        else:
            break  # No valid hypotheses, end decoding

        if all(h[0][-1] == target_token_index[EOS_TOKEN] for h in live_hypotheses if len(h[0]) > 1):
            break

    # Choose best hypothesis and convert to text
    if not live_hypotheses:  # Handle the case where we have no valid hypotheses
        return ""
        
    best_hypothesis = max(live_hypotheses, key=lambda x: x[1]/len(x[0]) if len(x[0]) > 1 else x[1])
    decoded_sentence_indices = best_hypothesis[0]
    
    decoded_sentence = ""
    for token_idx in decoded_sentence_indices:
        if token_idx == target_token_index[SOS_TOKEN]:
            continue
        if token_idx == target_token_index[EOS_TOKEN]:
            break
        if token_idx in reverse_target_char_index:
             decoded_sentence += reverse_target_char_index[token_idx]
    return decoded_sentence


In [7]:
# Cell 7: Training and Evaluation Function (train_evaluate) (Modified)
from tqdm import tqdm
def train_evaluate():
    keras.backend.clear_session() # <<< --- ADD THIS LINE TO CLEAR KERAS SESSION
    
    # The agent (wandb.agent) calls this function.
    # It sets up wandb.config. We initialize W&B to connect to this run.
    run = wandb.init() # Project and entity are typically inherited from the sweep environment.
                       
    config = wandb.config # This is populated by the W&B agent

    # Build the training model
    print(f"--- Building model for run {run.id if run else 'N/A'} with config: {dict(config)} ---")
    training_model = build_seq2seq_model(config)
    # training_model.summary() # Optional: for debugging model structure
    
    # Callbacks
    early_stopping = EarlyStopping(monitor='val_loss', 
                                   patience=config.early_stopping_patience, 
                                   restore_best_weights=True, 
                                   verbose=1)
    wandb_metrics_logger = WandbMetricsLogger(log_freq="epoch")

    # Train the model
    print(f"--- Starting training for run {run.id if run else 'N/A'} ---")
    history = training_model.fit(
        [encoder_input_train, decoder_input_train],
        decoder_target_train,
        batch_size=config.batch_size,
        epochs=config.epochs, 
        validation_data=([encoder_input_val, decoder_input_val], decoder_target_val),
        callbacks=[early_stopping, wandb_metrics_logger],
        verbose=1 
    )
    
    wandb.log({"val_exact_match_accuracy": history.history['val_accuracy'][-1]})
    # # --- Evaluation with Beam Search ---
    # # Build inference models from the *trained* training_model weights
    # print(f"--- Building inference models for run {run.id if run else 'N/A'} ---")
    # encoder_model_inf, decoder_model_inf = build_inference_models(training_model, config)

    # correct_predictions = 0
    # total_predictions = encoder_input_val.shape[0] 
    
    # if total_predictions == 0:
    #     print("No validation data to evaluate.")
    #     wandb.log({"val_exact_match_accuracy": 0.0})
    #     # wandb.finish() # Agent handles finishing the run
    #     return

    # eval_table_data = []
    # print(f"--- Starting evaluation for run {run.id if run else 'N/A'} ---")
    # for i in tqdm(range(total_predictions)):
    #     current_input_vector = encoder_input_val[i:i+1] 
    #     original_input_text = input_texts_val[i] 
    #     original_target_text = target_texts_val[i]
        
    #     decoded_sentence = decode_sequence_beam_search(
    #         current_input_vector, 
    #         encoder_model_inf, 
    #         decoder_model_inf, 
    #         config.beam_size,
    #         config 
    #     )
        
    #     if decoded_sentence == original_target_text:
    #         correct_predictions += 1
            
    #     if i < 5: 
    #         eval_table_data.append([original_input_text, original_target_text, decoded_sentence])

    # if eval_table_data:
    #     try: # Add try-except for wandb logging as a precaution
    #         wandb.log({"eval_examples": wandb.Table(data=eval_table_data,
    #                                                columns=["Input", "True Target", "Predicted Target"])})
    #     except Exception as e:
    #         print(f"Error logging eval_examples to W&B: {e}")


    # val_exact_match_accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0.0
    
    # try:
    #     wandb.log({"val_exact_match_accuracy": val_exact_match_accuracy}) 
    # except Exception as e:
    #     print(f"Error logging val_exact_match_accuracy to W&B: {e}")

    
    # print(f"Run {run.id if run else 'Unknown'} | Validation Exact Match Accuracy (Beam Size {config.beam_size}): {val_exact_match_accuracy:.4f}")
    # print(f"--- Finished evaluation for run {run.id if run else 'N/A'} ---")
    # # The W&B agent calling this function will handle wandb.finish()


In [8]:
# Cell 8: Wandb Sweep Configuration

sweep_config = {
    'method': 'bayes',  # Bayesian optimization, or 'random', 'grid'
    'metric': {
        'name': 'val_exact_match_accuracy', # Custom metric from beam search eval
        'goal': 'maximize'   
    },
    'parameters': {
        'input_embedding_size': {
            'values': [32, 64, 128] 
        },
        'hidden_size': {
            'values': [64, 128, 256] 
        },
        'encoder_layers': {
            'values': [1, 2]
        },
        'decoder_layers': {
            'values': [1, 2]
        },
        'cell_type': {
            'values': ['RNN', 'GRU', 'LSTM']
        },
        'dropout_rate': {
            'values': [0.2, 0.3]
        },
        'learning_rate': {
            'values': [0.001, 0.0001]
        },
        'batch_size': {
            'values': [64, 128, 356]
        },
        'epochs': { # Max epochs, early stopping will handle actual duration
            'values': [50] # Reduced for quicker sweep, increase for final model
        },
        'early_stopping_patience': {
            'values': [5]
        },
        'beam_size': { # This is for evaluation
            'values': [1, 3, 5] # 1 is greedy
        },
        'optimizer': {
            'values': ['adam', 'nadam']
        }
    }
}

# Add a note about sweep strategy:
# Smart strategies:
# 1. Bayesian optimization (`method: 'bayes'`) is generally more efficient than random or grid search.
# 2. Early Stopping: Already implemented to stop unpromising runs early, saving compute.
# 3. Iterative Sweeps: Start with broader ranges and fewer epochs/smaller dataset subset.
#    Analyze results (parallel coordinates, correlation plots from W&B) to identify promising regions.
#    Then, conduct a more focused sweep with narrowed ranges, more epochs, or the full dataset.
#    (For this assignment, a single comprehensive sweep as configured might be sufficient if time permits,
#     otherwise, reduce `epochs` or `count` for the agent initially).
# 4. Prioritize parameters: Learning rate, cell type, and hidden size are often critical.
#    Dropout and number of layers can be fine-tuned once a good base is found.
# 5. Reduce `count` for `wandb.agent` for initial testing of the pipeline.


In [9]:
history.history['val_accuracy'][-1]

NameError: name 'history' is not defined

In [10]:
# Cell 9: Start the Sweep Agent

# --- Initialize Sweep ---
sweep_id = wandb.sweep(sweep_config, entity="ce21b097-indian-institute-of-technology-madras", project="CE21B097 - DA6401 - Assignment 3")

# --- Run Agent ---
# The 'count' parameter specifies how many runs the agent should perform.
# For a thorough search, this might be 20-50 or more, depending on your compute budget.
# For testing, set count to a small number like 3-5.
wandb.agent(sweep_id, function=train_evaluate, count=30) # Example: 10 runs

print("\n--- Sweep Finished ---")
print("Go to your W&B project page to see the results, including:")
print("- Accuracy v/s Created plot")
print("- Parallel Co-ordinates plot")
print("- Correlation Summary table")


Create sweep with ID: a9aclnzc
Sweep URL: https://wandb.ai/ce21b097-indian-institute-of-technology-madras/CE21B097%20-%20DA6401%20-%20Assignment%203/sweeps/a9aclnzc


[34m[1mwandb[0m: Agent Starting Run: 0ysiqu3z with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	decoder_layers: 2
[34m[1mwandb[0m: 	dropout_rate: 0.2
[34m[1mwandb[0m: 	early_stopping_patience: 5
[34m[1mwandb[0m: 	encoder_layers: 2
[34m[1mwandb[0m: 	epochs: 50
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	input_embedding_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: adam





Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


--- Building model for run 0ysiqu3z with config: {'batch_size': 128, 'beam_size': 3, 'cell_type': 'GRU', 'decoder_layers': 2, 'dropout_rate': 0.2, 'early_stopping_patience': 5, 'encoder_layers': 2, 'epochs': 50, 'hidden_size': 64, 'input_embedding_size': 128, 'learning_rate': 0.001, 'optimizer': 'adam'} ---
--- Starting training for run 0ysiqu3z ---
Epoch 1/50
[1m346/346[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 60ms/step - accuracy: 0.0692 - loss: 1.2046 - val_accuracy: 0.0787 - val_loss: 1.0016
Epoch 2/50
[1m346/346[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 58ms/step - accuracy: 0.0861 - loss: 1.0232 - val_accuracy: 0.0942 - val_loss: 0.9222
Epoch 3/50
[1m346/346[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 56ms/step - accuracy: 0.1036 - loss: 0.9480 - val_accuracy: 0.1047 - val_loss: 0.8651
Epoch 4/50
[1m346/346[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 58ms/step - accuracy: 0.1149 - loss: 0.8913 - val_accuracy: 0.1200 - val_loss:

0,1
epoch/accuracy,▁▂▂▃▃▄▄▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇█████████████████
epoch/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
epoch/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch/loss,█▇▆▅▅▄▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch/val_accuracy,▁▂▂▃▃▄▅▅▅▆▆▆▇▇▇▇▇▇▇▇████████████████████
epoch/val_loss,█▇▆▆▅▄▄▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_exact_match_accuracy,▁

0,1
epoch/accuracy,0.26523
epoch/epoch,49.0
epoch/learning_rate,0.001
epoch/loss,0.30303
epoch/val_accuracy,0.26017
epoch/val_loss,0.26034
val_exact_match_accuracy,0.26017


[34m[1mwandb[0m: Agent Starting Run: btvjpe57 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	decoder_layers: 2
[34m[1mwandb[0m: 	dropout_rate: 0.2
[34m[1mwandb[0m: 	early_stopping_patience: 5
[34m[1mwandb[0m: 	encoder_layers: 2
[34m[1mwandb[0m: 	epochs: 50
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	input_embedding_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	optimizer: nadam
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


--- Building model for run btvjpe57 with config: {'batch_size': 64, 'beam_size': 5, 'cell_type': 'LSTM', 'decoder_layers': 2, 'dropout_rate': 0.2, 'early_stopping_patience': 5, 'encoder_layers': 2, 'epochs': 50, 'hidden_size': 64, 'input_embedding_size': 32, 'learning_rate': 0.0001, 'optimizer': 'nadam'} ---
--- Starting training for run btvjpe57 ---
Epoch 1/50
[1m691/691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 29ms/step - accuracy: 0.0463 - loss: 1.3440 - val_accuracy: 0.0650 - val_loss: 1.1780
Epoch 2/50
[1m691/691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 28ms/step - accuracy: 0.0664 - loss: 1.2218 - val_accuracy: 0.0683 - val_loss: 1.1481
Epoch 3/50
[1m691/691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 29ms/step - accuracy: 0.0712 - loss: 1.1944 - val_accuracy: 0.0710 - val_loss: 1.1320
Epoch 4/50
[1m691/691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 29ms/step - accuracy: 0.0739 - loss: 1.1786 - val_accuracy: 0.0717 - val_loss

0,1
epoch/accuracy,▁▂▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇█████
epoch/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
epoch/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch/loss,█▇▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁
epoch/val_accuracy,▁▁▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇█████
epoch/val_loss,█▇▇▇▇▆▆▆▆▆▅▅▅▄▄▄▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁
val_exact_match_accuracy,▁

0,1
epoch/accuracy,0.12234
epoch/epoch,49.0
epoch/learning_rate,0.0001
epoch/loss,0.90264
epoch/val_accuracy,0.11887
epoch/val_loss,0.85844
val_exact_match_accuracy,0.11887


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: lqzsvpbj with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	decoder_layers: 1
[34m[1mwandb[0m: 	dropout_rate: 0.3
[34m[1mwandb[0m: 	early_stopping_patience: 5
[34m[1mwandb[0m: 	encoder_layers: 2
[34m[1mwandb[0m: 	epochs: 50
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	input_embedding_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	optimizer: adam
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


--- Building model for run lqzsvpbj with config: {'batch_size': 64, 'beam_size': 5, 'cell_type': 'RNN', 'decoder_layers': 1, 'dropout_rate': 0.3, 'early_stopping_patience': 5, 'encoder_layers': 2, 'epochs': 50, 'hidden_size': 256, 'input_embedding_size': 128, 'learning_rate': 0.0001, 'optimizer': 'adam'} ---
--- Starting training for run lqzsvpbj ---
Epoch 1/50
[1m691/691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 32ms/step - accuracy: 0.0717 - loss: 1.2193 - val_accuracy: 0.0919 - val_loss: 1.0769
Epoch 2/50
[1m691/691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 29ms/step - accuracy: 0.0979 - loss: 1.1070 - val_accuracy: 0.1056 - val_loss: 0.9511
Epoch 3/50
[1m691/691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 29ms/step - accuracy: 0.1106 - loss: 0.9793 - val_accuracy: 0.1145 - val_loss: 0.8683
Epoch 4/50
[1m691/691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 29ms/step - accuracy: 0.1215 - loss: 0.9031 - val_accuracy: 0.1249 - val_loss

0,1
epoch/accuracy,▁▂▄▄▄▄▅▅▅▅▅▅▅▅▅▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇████▇████
epoch/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
epoch/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch/loss,█▇▆▅▄▄▄▄▄▄▃▃▄▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁
epoch/val_accuracy,▁▂▃▃▄▃▄▅▅▅▅▅▅▅▅▅▅▆▆▆▆▆▆▆▅▇▇▇▇▇▇▇▇▇██████
epoch/val_loss,█▆▅▅▅▄▅▄▄▄▃▃▄▃▃▃▃▃▃▃▂▃▂▂▂▂▃▂▂▂▁▁▁▂▁▁▁▁▁▁
val_exact_match_accuracy,▁

0,1
epoch/accuracy,0.19398
epoch/epoch,49.0
epoch/learning_rate,0.0001
epoch/loss,0.55314
epoch/val_accuracy,0.19414
epoch/val_loss,0.50421
val_exact_match_accuracy,0.19414


[34m[1mwandb[0m: Agent Starting Run: h6vwylba with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	decoder_layers: 1
[34m[1mwandb[0m: 	dropout_rate: 0.3
[34m[1mwandb[0m: 	early_stopping_patience: 5
[34m[1mwandb[0m: 	encoder_layers: 2
[34m[1mwandb[0m: 	epochs: 50
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	input_embedding_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	optimizer: adam
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


--- Building model for run h6vwylba with config: {'batch_size': 64, 'beam_size': 3, 'cell_type': 'RNN', 'decoder_layers': 1, 'dropout_rate': 0.3, 'early_stopping_patience': 5, 'encoder_layers': 2, 'epochs': 50, 'hidden_size': 64, 'input_embedding_size': 64, 'learning_rate': 0.0001, 'optimizer': 'adam'} ---
--- Starting training for run h6vwylba ---
Epoch 1/50
[1m691/691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 13ms/step - accuracy: 0.0454 - loss: 1.3413 - val_accuracy: 0.0725 - val_loss: 1.1234
Epoch 2/50
[1m691/691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 13ms/step - accuracy: 0.0769 - loss: 1.1633 - val_accuracy: 0.0814 - val_loss: 1.0797
Epoch 3/50
[1m691/691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 12ms/step - accuracy: 0.0837 - loss: 1.1227 - val_accuracy: 0.0852 - val_loss: 1.0512
Epoch 4/50
[1m691/691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 13ms/step - accuracy: 0.0880 - loss: 1.0934 - val_accuracy: 0.0888 - val_loss: 1.0

0,1
epoch/accuracy,▁▃▃▃▄▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇███████████
epoch/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
epoch/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch/loss,█▆▆▅▅▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁
epoch/val_accuracy,▁▂▂▃▃▄▅▅▅▅▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇███████████
epoch/val_loss,█▇▇▆▅▅▅▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁
val_exact_match_accuracy,▁

0,1
epoch/accuracy,0.14193
epoch/epoch,49.0
epoch/learning_rate,0.0001
epoch/loss,0.81768
epoch/val_accuracy,0.13438
epoch/val_loss,0.78517
val_exact_match_accuracy,0.13438


[34m[1mwandb[0m: Agent Starting Run: nj2tm2g2 with config:
[34m[1mwandb[0m: 	batch_size: 356
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	decoder_layers: 2
[34m[1mwandb[0m: 	dropout_rate: 0.2
[34m[1mwandb[0m: 	early_stopping_patience: 5
[34m[1mwandb[0m: 	encoder_layers: 1
[34m[1mwandb[0m: 	epochs: 50
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	input_embedding_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: nadam
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


--- Building model for run nj2tm2g2 with config: {'batch_size': 356, 'beam_size': 5, 'cell_type': 'GRU', 'decoder_layers': 2, 'dropout_rate': 0.2, 'early_stopping_patience': 5, 'encoder_layers': 1, 'epochs': 50, 'hidden_size': 256, 'input_embedding_size': 64, 'learning_rate': 0.001, 'optimizer': 'nadam'} ---
--- Starting training for run nj2tm2g2 ---
Epoch 1/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 250ms/step - accuracy: 0.0567 - loss: 1.2560 - val_accuracy: 0.0715 - val_loss: 1.0170
Epoch 2/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 240ms/step - accuracy: 0.0786 - loss: 1.0446 - val_accuracy: 0.0827 - val_loss: 0.9488
Epoch 3/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 253ms/step - accuracy: 0.0944 - loss: 0.9608 - val_accuracy: 0.0979 - val_loss: 0.8666
Epoch 4/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 225ms/step - accuracy: 0.1100 - loss: 0.8840 - val_accuracy: 0.1193 - val_

0,1
epoch/accuracy,▁▁▂▂▃▃▄▅▅▆▆▆▇▇▇▇▇▇▇▇▇▇▇█████████████████
epoch/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇██
epoch/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch/loss,█▇▆▆▆▅▅▄▃▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch/val_accuracy,▁▁▂▃▃▄▅▅▆▆▇▇▇▇██████████████████████████
epoch/val_loss,█▇▇▆▆▅▄▄▃▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_exact_match_accuracy,▁

0,1
epoch/accuracy,0.31846
epoch/epoch,42.0
epoch/learning_rate,0.001
epoch/loss,0.09806
epoch/val_accuracy,0.27797
epoch/val_loss,0.17905
val_exact_match_accuracy,0.27797


[34m[1mwandb[0m: Agent Starting Run: gh0gbznu with config:
[34m[1mwandb[0m: 	batch_size: 356
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	decoder_layers: 2
[34m[1mwandb[0m: 	dropout_rate: 0.3
[34m[1mwandb[0m: 	early_stopping_patience: 5
[34m[1mwandb[0m: 	encoder_layers: 1
[34m[1mwandb[0m: 	epochs: 50
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	input_embedding_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	optimizer: nadam
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


--- Building model for run gh0gbznu with config: {'batch_size': 356, 'beam_size': 5, 'cell_type': 'LSTM', 'decoder_layers': 2, 'dropout_rate': 0.3, 'early_stopping_patience': 5, 'encoder_layers': 1, 'epochs': 50, 'hidden_size': 256, 'input_embedding_size': 64, 'learning_rate': 0.0001, 'optimizer': 'nadam'} ---
--- Starting training for run gh0gbznu ---
Epoch 1/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 282ms/step - accuracy: 0.0455 - loss: 1.3691 - val_accuracy: 0.0593 - val_loss: 1.1562
Epoch 2/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 277ms/step - accuracy: 0.0619 - loss: 1.2082 - val_accuracy: 0.0642 - val_loss: 1.1487
Epoch 3/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 285ms/step - accuracy: 0.0645 - loss: 1.2054 - val_accuracy: 0.0636 - val_loss: 1.1482
Epoch 4/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 282ms/step - accuracy: 0.0663 - loss: 1.1967 - val_accuracy: 0.0667 - va

0,1
epoch/accuracy,▁▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇████
epoch/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
epoch/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch/loss,█▇▇▆▆▅▅▅▅▅▅▄▄▄▄▄▄▄▄▃▃▃▃▃▃▃▃▃▃▃▂▂▂▂▂▁▁▁▁▁
epoch/val_accuracy,▁▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▅▄▅▅▅▅▅▅▅▅▆▆▆▆▆▆█████
epoch/val_loss,███▇▇▇▆▆▆▆▅▅▅▅▄▄▄▄▄▄▄▃▄▄▃▃▃▃▃▃▃▂▂▂▂▁▁▁▁▁
val_exact_match_accuracy,▁

0,1
epoch/accuracy,0.12701
epoch/epoch,49.0
epoch/learning_rate,0.0001
epoch/loss,0.86836
epoch/val_accuracy,0.11786
epoch/val_loss,0.84756
val_exact_match_accuracy,0.11786


[34m[1mwandb[0m: Agent Starting Run: yehc77n9 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	decoder_layers: 1
[34m[1mwandb[0m: 	dropout_rate: 0.2
[34m[1mwandb[0m: 	early_stopping_patience: 5
[34m[1mwandb[0m: 	encoder_layers: 2
[34m[1mwandb[0m: 	epochs: 50
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	input_embedding_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: nadam
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


--- Building model for run yehc77n9 with config: {'batch_size': 64, 'beam_size': 5, 'cell_type': 'LSTM', 'decoder_layers': 1, 'dropout_rate': 0.2, 'early_stopping_patience': 5, 'encoder_layers': 2, 'epochs': 50, 'hidden_size': 64, 'input_embedding_size': 64, 'learning_rate': 0.001, 'optimizer': 'nadam'} ---
--- Starting training for run yehc77n9 ---
Epoch 1/50
[1m691/691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 23ms/step - accuracy: 0.0656 - loss: 1.2026 - val_accuracy: 0.0881 - val_loss: 0.9758
Epoch 2/50
[1m691/691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 22ms/step - accuracy: 0.1007 - loss: 0.9868 - val_accuracy: 0.1109 - val_loss: 0.8784
Epoch 3/50
[1m691/691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 22ms/step - accuracy: 0.1250 - loss: 0.8861 - val_accuracy: 0.1358 - val_loss: 0.7891
Epoch 4/50
[1m691/691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 22ms/step - accuracy: 0.1452 - loss: 0.8066 - val_accuracy: 0.1477 - val_loss:

0,1
epoch/accuracy,▁▂▃▄▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇██████████████
epoch/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
epoch/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch/loss,█▆▅▅▄▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch/val_accuracy,▁▂▄▄▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇███▇██████████
epoch/val_loss,█▇▆▅▅▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_exact_match_accuracy,▁

0,1
epoch/accuracy,0.23494
epoch/epoch,49.0
epoch/learning_rate,0.001
epoch/loss,0.43397
epoch/val_accuracy,0.22004
epoch/val_loss,0.42699
val_exact_match_accuracy,0.22004


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: arxk04o1 with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	decoder_layers: 2
[34m[1mwandb[0m: 	dropout_rate: 0.3
[34m[1mwandb[0m: 	early_stopping_patience: 5
[34m[1mwandb[0m: 	encoder_layers: 2
[34m[1mwandb[0m: 	epochs: 50
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	input_embedding_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: nadam
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


--- Building model for run arxk04o1 with config: {'batch_size': 128, 'beam_size': 1, 'cell_type': 'GRU', 'decoder_layers': 2, 'dropout_rate': 0.3, 'early_stopping_patience': 5, 'encoder_layers': 2, 'epochs': 50, 'hidden_size': 64, 'input_embedding_size': 128, 'learning_rate': 0.001, 'optimizer': 'nadam'} ---
--- Starting training for run arxk04o1 ---
Epoch 1/50
[1m346/346[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 46ms/step - accuracy: 0.0562 - loss: 1.2408 - val_accuracy: 0.0754 - val_loss: 1.0064
Epoch 2/50
[1m346/346[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 42ms/step - accuracy: 0.0891 - loss: 1.0139 - val_accuracy: 0.0967 - val_loss: 0.9083
Epoch 3/50
[1m346/346[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 42ms/step - accuracy: 0.1056 - loss: 0.9374 - val_accuracy: 0.1127 - val_loss: 0.8441
Epoch 4/50
[1m346/346[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 43ms/step - accuracy: 0.1185 - loss: 0.8806 - val_accuracy: 0.1218 - val_loss

0,1
epoch/accuracy,▁▂▃▃▃▄▄▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇███████████████
epoch/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇██
epoch/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch/loss,█▇▆▆▅▅▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch/val_accuracy,▁▂▃▃▄▄▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇████████████████
epoch/val_loss,█▇▆▆▆▅▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_exact_match_accuracy,▁

0,1
epoch/accuracy,0.25198
epoch/epoch,49.0
epoch/learning_rate,0.001
epoch/loss,0.34255
epoch/val_accuracy,0.25333
epoch/val_loss,0.28092
val_exact_match_accuracy,0.25333


[34m[1mwandb[0m: Agent Starting Run: a6vnocnb with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	decoder_layers: 2
[34m[1mwandb[0m: 	dropout_rate: 0.2
[34m[1mwandb[0m: 	early_stopping_patience: 5
[34m[1mwandb[0m: 	encoder_layers: 1
[34m[1mwandb[0m: 	epochs: 50
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	input_embedding_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	optimizer: nadam
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


--- Building model for run a6vnocnb with config: {'batch_size': 64, 'beam_size': 3, 'cell_type': 'LSTM', 'decoder_layers': 2, 'dropout_rate': 0.2, 'early_stopping_patience': 5, 'encoder_layers': 1, 'epochs': 50, 'hidden_size': 64, 'input_embedding_size': 32, 'learning_rate': 0.0001, 'optimizer': 'nadam'} ---
--- Starting training for run a6vnocnb ---
Epoch 1/50
[1m691/691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 24ms/step - accuracy: 0.0487 - loss: 1.3222 - val_accuracy: 0.0635 - val_loss: 1.1477
Epoch 2/50
[1m691/691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 23ms/step - accuracy: 0.0662 - loss: 1.1947 - val_accuracy: 0.0658 - val_loss: 1.1349
Epoch 3/50
[1m691/691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 23ms/step - accuracy: 0.0706 - loss: 1.1816 - val_accuracy: 0.0705 - val_loss: 1.1298
Epoch 4/50
[1m691/691[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 23ms/step - accuracy: 0.0739 - loss: 1.1768 - val_accuracy: 0.0707 - val_loss

0,1
epoch/accuracy,▁▂▃▃▄▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇█████
epoch/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
epoch/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch/loss,█▆▆▆▆▅▅▅▅▅▅▅▅▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁
epoch/val_accuracy,▁▁▂▂▃▄▄▄▄▄▄▄▄▄▄▄▅▅▄▄▅▅▅▆▆▆▆▇▆▇▇▇▇▇▇█████
epoch/val_loss,██▇▇▇▇▇▇▆▆▆▇▆▆▆▆▅▅▅▅▄▃▃▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁
val_exact_match_accuracy,▁

0,1
epoch/accuracy,0.10972
epoch/epoch,49.0
epoch/learning_rate,0.0001
epoch/loss,0.95876
epoch/val_accuracy,0.10305
epoch/val_loss,0.92312
val_exact_match_accuracy,0.10305


[34m[1mwandb[0m: Agent Starting Run: pnio67bb with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	decoder_layers: 2
[34m[1mwandb[0m: 	dropout_rate: 0.2
[34m[1mwandb[0m: 	early_stopping_patience: 5
[34m[1mwandb[0m: 	encoder_layers: 2
[34m[1mwandb[0m: 	epochs: 50
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	input_embedding_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	optimizer: nadam
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


--- Building model for run pnio67bb with config: {'batch_size': 128, 'beam_size': 5, 'cell_type': 'GRU', 'decoder_layers': 2, 'dropout_rate': 0.2, 'early_stopping_patience': 5, 'encoder_layers': 2, 'epochs': 50, 'hidden_size': 256, 'input_embedding_size': 128, 'learning_rate': 0.0001, 'optimizer': 'nadam'} ---
--- Starting training for run pnio67bb ---
Epoch 1/50
[1m346/346[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 205ms/step - accuracy: 0.0532 - loss: 1.3037 - val_accuracy: 0.0685 - val_loss: 1.0566
Epoch 2/50
[1m346/346[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 202ms/step - accuracy: 0.0727 - loss: 1.0825 - val_accuracy: 0.0757 - val_loss: 0.9990
Epoch 3/50
[1m346/346[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 200ms/step - accuracy: 0.0821 - loss: 1.0337 - val_accuracy: 0.0836 - val_loss: 0.9551
Epoch 4/50
[1m346/346[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 202ms/step - accuracy: 0.0907 - loss: 0.9826 - val_accuracy: 0.0886 - va

: 