# Import Libraries and Modules

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Embedding, LSTM, GRU, Dense, Dropout, Concatenate, AdditiveAttention, Attention
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split # Though not directly used in sweep, good to have
import wandb
from wandb.integration.keras import WandbMetricsLogger, WandbModelCheckpoint

import os
import re
import time
import unicodedata
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker # For heatmap ticks

# Wandb Login

In [2]:
# Authenticate this session with Weights & Biases before any sweep or run is created.
wandb.login()

wandb: Currently logged in as: ce21b097 (ce21b097-indian-institute-of-technology-madras) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin


True

# Data Loading and Initial Parsing

In [3]:
def load_data(filepath):
    """Load (romanized, native-script) word pairs from a lexicon TSV file.

    The file is read as header-less, tab-separated text with the columns
    ``native``, ``roman`` and ``count``. Every field is kept as a literal
    string: default NA parsing is disabled so that real words such as
    "nan", "null" or "NA" are not turned into missing values and dropped,
    and numeric-looking tokens are not coerced to numbers.

    Args:
        filepath: Path to the TSV file.

    Returns:
        ``(input_texts, target_texts)``: two equal-length lists of strings
        holding the ``roman`` and ``native`` columns respectively. Rows whose
        ``native`` or ``roman`` field is missing or empty are skipped. If the
        file cannot be read or parsed, a message is printed and ``([], [])``
        is returned.
    """
    try:
        df = pd.read_csv(
            filepath,
            sep='\t',
            header=None,
            on_bad_lines='skip',
            names=['native', 'roman', 'count'],
            dtype=str,              # never coerce tokens such as "007" to numbers
            keep_default_na=False,  # "nan" / "null" / "NA" are words, not missing values
        )
        # Short rows can still yield NaN for absent trailing columns; empty
        # fields arrive as '' now that NA parsing is off. Drop both kinds.
        df = df.dropna(subset=['native', 'roman'])
        df = df[(df['native'] != '') & (df['roman'] != '')]
        input_texts = df['roman'].tolist()
        target_texts = df['native'].tolist()
        return input_texts, target_texts
    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller see empty data.
        print(f"Error loading data from {filepath}: {e}")
        return [], []

# --- Locate the lexicon files for the chosen language ---
dataset_base_dir = 'dakshina_dataset_v1.0'
language = 'hi'  # Hindi

lexicon_dir = os.path.join(dataset_base_dir, language, 'lexicons')
train_file, dev_file, test_file = (
    os.path.join(lexicon_dir, f'{language}.translit.sampled.{split_name}.tsv')
    for split_name in ('train', 'dev', 'test')
)

# --- Read the three splits ---
input_texts_train_full, target_texts_train_full = load_data(train_file)
input_texts_val, target_texts_val = load_data(dev_file)
input_texts_test, target_texts_test = load_data(test_file)

# The sweep trains on the full training split (no further sub-sampling).
input_texts_train = input_texts_train_full
target_texts_train = target_texts_train_full

# --- Report split sizes and a few example pairs ---
for split_label, split_inputs in (
    ("Training", input_texts_train),
    ("Validation", input_texts_val),
    ("Test", input_texts_test),
):
    print(f"{split_label} samples: {len(split_inputs)}")

if input_texts_train and target_texts_train:
    print("\nSample training data:")
    for sample_input, sample_target in zip(input_texts_train[:3], target_texts_train[:3]):
        print(f"Input: {sample_input}, Target: {sample_target}")
else:
    print("No training data loaded. Please check file paths and content.")

Training samples: 44202
Validation samples: 4358
Test samples: 4502

Sample training data:
Input: an, Target: अं
Input: ankganit, Target: अंकगणित
Input: uncle, Target: अंकल


# Data Preprocessing - Vocabulary, Tokenization, Padding

In [4]:
# --- Character vocabularies ---
# Characters are collected over train + dev + test so that every character
# appearing in any split receives an index.
all_input_texts_for_vocab = input_texts_train_full + input_texts_val + input_texts_test
all_target_texts_for_vocab = target_texts_train_full + target_texts_val + target_texts_test

input_characters = {char for text in all_input_texts_for_vocab for char in str(text)}
target_characters = {char for text in all_target_texts_for_vocab for char in str(text)}

# Special tokens framing every target sequence
SOS_TOKEN = '\t'  # Start Of Sequence
EOS_TOKEN = '\n'  # End Of Sequence
target_characters |= {SOS_TOKEN, EOS_TOKEN}

# Sorted character lists give a deterministic char <-> index mapping.
# NOTE(review): indices start at 0 and no index is reserved for padding, so
# zero-padded positions downstream share index 0 with a real character.
input_char_list = sorted(input_characters)
target_char_list = sorted(target_characters)

num_encoder_tokens = len(input_char_list)
num_decoder_tokens = len(target_char_list)

input_token_index = dict(zip(input_char_list, range(num_encoder_tokens)))
target_token_index = dict(zip(target_char_list, range(num_decoder_tokens)))

reverse_input_char_index = dict(enumerate(input_char_list))
reverse_target_char_index = dict(enumerate(target_char_list))

# --- Maximum sequence lengths, measured over all splits ---
all_input_texts_combined = list(all_input_texts_for_vocab)
all_target_texts_combined = list(all_target_texts_for_vocab)

max_encoder_seq_length = max(map(len, map(str, all_input_texts_combined)))
# Targets are wrapped in SOS and EOS, hence the two extra positions.
max_decoder_seq_length = max(map(len, map(str, all_target_texts_combined))) + 2

print(f"\nNumber of unique input tokens: {num_encoder_tokens}")
print(f"Number of unique output tokens: {num_decoder_tokens}")
print(f"Max sequence length for inputs: {max_encoder_seq_length}")
print(f"Max sequence length for outputs: {max_decoder_seq_length}")

# --- Vectorizing the data ---
def vectorize_data(input_texts, target_texts):
    """Turn paired texts into the three arrays used for teacher-forced training.

    Args:
        input_texts: Source (romanized) strings.
        target_texts: Target (native-script) strings, paired by position with
            ``input_texts``.

    Returns:
        ``(encoder_input_data, decoder_input_data, decoder_target_data)``:

        * ``encoder_input_data``: ``(len(input_texts), max_encoder_seq_length)``
          array of input character indices.
        * ``decoder_input_data``: ``(len(target_texts), max_decoder_seq_length)``
          array of target character indices for ``SOS + text + EOS``.
        * ``decoder_target_data``: one-hot array of shape
          ``(len(target_texts), max_decoder_seq_length, num_decoder_tokens)``
          holding the same sequence shifted one step to the left.

        All arrays are float32 and zero-initialised. Positions past the end of
        a text, and characters missing from the vocabulary, stay at zero.
    """
    n_inputs, n_targets = len(input_texts), len(target_texts)
    encoder_input_data = np.zeros((n_inputs, max_encoder_seq_length), dtype="float32")
    decoder_input_data = np.zeros((n_targets, max_decoder_seq_length), dtype="float32")
    decoder_target_data = np.zeros(
        (n_targets, max_decoder_seq_length, num_decoder_tokens), dtype="float32"
    )

    for row, (source, target) in enumerate(zip(input_texts, target_texts)):
        # Encoder side: one index per character, truncated to the fixed width.
        for pos, ch in enumerate(str(source)[:max_encoder_seq_length]):
            idx = input_token_index.get(ch)
            if idx is not None:
                encoder_input_data[row, pos] = idx

        # Decoder side: frame the target with SOS/EOS before indexing.
        framed_target = "".join((SOS_TOKEN, str(target), EOS_TOKEN))
        for pos, ch in enumerate(framed_target[:max_decoder_seq_length]):
            idx = target_token_index.get(ch)
            if idx is None:
                continue
            decoder_input_data[row, pos] = idx
            if pos:
                # The target at step pos-1 is the decoder input at step pos.
                decoder_target_data[row, pos - 1, idx] = 1.0

    return encoder_input_data, decoder_input_data, decoder_target_data

# Vectorize the training and validation splits.
# The test split is vectorized later, before final evaluation.
(encoder_input_train,
 decoder_input_train,
 decoder_target_train) = vectorize_data(input_texts_train, target_texts_train)
(encoder_input_val,
 decoder_input_val,
 decoder_target_val) = vectorize_data(input_texts_val, target_texts_val)

# Report array shapes as a quick sanity check.
print("\nShape of encoder_input_train:", encoder_input_train.shape)
for array_label, array_value in (
    ("decoder_input_train", decoder_input_train),
    ("decoder_target_train", decoder_target_train),
    ("encoder_input_val", encoder_input_val),
):
    print(f"Shape of {array_label}:", array_value.shape)


Number of unique input tokens: 26
Number of unique output tokens: 65
Max sequence length for inputs: 20
Max sequence length for outputs: 21

Shape of encoder_input_train: (44202, 20)
Shape of decoder_input_train: (44202, 21)
Shape of decoder_target_train: (44202, 21, 65)
Shape of encoder_input_val: (4358, 20)


# Bahdanau Attention Layer Definition

In [5]:
class BahdanauAttention(tf.keras.layers.Layer):
    """Additive (Bahdanau-style) attention over a sequence of value vectors.

    The score for each position is ``V(tanh(W1(values) + W2(query)))``; the
    scores are softmax-normalised along the sequence axis and used to form a
    weighted sum of ``values``.

    Args:
        units: Width of the intermediate projections ``W1`` and ``W2``.
        **kwargs: Standard Keras ``Layer`` keyword arguments (e.g. ``name``),
            forwarded to the base class.
    """

    def __init__(self, units, **kwargs):
        super().__init__(**kwargs)
        self.units = units  # kept so get_config() can rebuild the layer
        self.W1 = tf.keras.layers.Dense(units)  # Projects the encoder outputs (values)
        self.W2 = tf.keras.layers.Dense(units)  # Projects the decoder hidden state (query)
        self.V = tf.keras.layers.Dense(1)       # Collapses each position to a scalar score

    def call(self, query, values):
        """Compute the context vector and attention weights.

        Args:
            query: Decoder hidden state; expected shape (batch_size, hidden_size).
            values: Encoder outputs; expected shape (batch_size, max_len, hidden_size).

        Returns:
            ``(context_vector, attention_weights)``: the weighted sum of
            ``values`` over axis 1, and the softmax weights with a trailing
            dimension of 1.
        """
        # Add a time axis so the query broadcasts across every encoder position.
        query_with_time_axis = tf.expand_dims(query, 1)

        score = self.V(tf.nn.tanh(
            self.W1(values) + self.W2(query_with_time_axis)))

        # Normalise over the sequence axis (axis=1).
        attention_weights = tf.nn.softmax(score, axis=1)

        # Weighted sum of the values along the sequence axis.
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

    def get_config(self):
        """Return the constructor arguments so Keras can serialize the layer."""
        config = super().get_config()
        config.update({"units": self.units})
        return config

# Attention Model Building Function

In [6]:
def build_attention_seq2seq_model(config):
    """Build and compile the teacher-forced attention seq2seq training model.

    Architecture: embedding -> ``config.encoder_layers`` stacked RNN layers
    (all returning full sequences) -> a single decoder RNN initialised with
    the final state of the last encoder layer -> Keras ``AdditiveAttention``
    with the decoder outputs as query and the encoder outputs as value ->
    concatenation of decoder output and context -> softmax over target tokens.

    Args:
        config: Object exposing ``input_embedding_size``, ``hidden_size``,
            ``encoder_layers``, ``cell_type``, ``dropout_rate`` and
            ``learning_rate``, and optionally ``optimizer`` ('nadam' or
            'rmsprop'; anything else keeps Adam). ``cell_type`` values other
            than "LSTM" / "GRU" fall back to ``SimpleRNN``.
            NOTE(review): ``config.decoder_layers`` is not read here; the
            decoder always has exactly one RNN layer.

    Returns:
        A compiled ``Model`` taking ``[encoder_inputs, decoder_inputs]`` and
        producing per-timestep token probabilities. Layer names
        ("encoder_embedding", "encoder_<cell>_<i>", "decoder_embedding",
        "decoder_<cell>", "attention_layer", "concat_layer",
        "decoder_output_dense") are relied on by the inference-model builder.
    """
    # One RNN class serves both encoder and decoder.
    rnn_cls = {"LSTM": LSTM, "GRU": GRU}.get(config.cell_type, keras.layers.SimpleRNN)

    # --- Encoder ---
    encoder_inputs = Input(shape=(None,), name="encoder_inputs")
    encoder_all_outputs = Embedding(
        num_encoder_tokens, config.input_embedding_size, name="encoder_embedding"
    )(encoder_inputs)

    encoder_states = []  # Final state(s) of the last encoder layer
    for i in range(config.encoder_layers):
        encoder_rnn = rnn_cls(config.hidden_size,
                              return_sequences=True,  # attention needs every timestep
                              return_state=True,
                              dropout=config.dropout_rate,
                              name=f"encoder_{config.cell_type}_{i}")
        # LSTM yields (outputs, h, c); GRU / SimpleRNN yield (outputs, h).
        # Each layer overwrites the states, so the last layer's states remain.
        encoder_all_outputs, *encoder_states = encoder_rnn(encoder_all_outputs)

    # --- Decoder (single layer, run over the whole teacher-forced sequence) ---
    decoder_inputs = Input(shape=(None,), name="decoder_inputs")
    dec_emb = Embedding(
        num_decoder_tokens, config.input_embedding_size, name="decoder_embedding"
    )(decoder_inputs)

    decoder_rnn_layer = rnn_cls(config.hidden_size,
                                return_sequences=True,
                                return_state=True,
                                dropout=config.dropout_rate,
                                name=f"decoder_{config.cell_type}")
    decoder_rnn_outputs, *_ = decoder_rnn_layer(dec_emb, initial_state=encoder_states)

    # --- Attention: decoder outputs query the encoder outputs ---
    context_vector_seq, attention_weights_seq = tf.keras.layers.AdditiveAttention(name="attention_layer")(
        [decoder_rnn_outputs, encoder_all_outputs], return_attention_scores=True
    )
    # context_vector_seq shape: (batch_size, target_seq_len, encoder_hidden_size)
    # attention_weights_seq shape: (batch_size, target_seq_len, input_seq_len)

    # Concatenate the context vector with the decoder RNN output
    decoder_concat_input = Concatenate(axis=-1, name="concat_layer")([decoder_rnn_outputs, context_vector_seq])

    # Final per-timestep distribution over target tokens
    decoder_dense = Dense(num_decoder_tokens, activation="softmax", name="decoder_output_dense")
    decoder_outputs_final = decoder_dense(decoder_concat_input)

    model = Model([encoder_inputs, decoder_inputs], decoder_outputs_final)

    # --- Compile: Adam unless the config names another supported optimizer ---
    optimizer_choice = tf.keras.optimizers.Adam(learning_rate=config.learning_rate)
    if hasattr(config, 'optimizer'):
        if config.optimizer == 'nadam':
            optimizer_choice = tf.keras.optimizers.Nadam(learning_rate=config.learning_rate)
        elif config.optimizer == 'rmsprop':
            optimizer_choice = tf.keras.optimizers.RMSprop(learning_rate=config.learning_rate)

    model.compile(optimizer=optimizer_choice, loss="categorical_crossentropy", metrics=["accuracy"])
    return model

# Inference Models for Attention Model

In [7]:
def build_attention_inference_models(training_model, config):
    """Build step-wise encoder and decoder models that reuse the trained layers.

    Args:
        training_model: Model produced by ``build_attention_seq2seq_model``;
            its layers are looked up by name and reused, so weights are shared.
        config: The same configuration used to build ``training_model``
            (``encoder_layers``, ``cell_type`` and ``hidden_size`` are read).

    Returns:
        ``(encoder_model_inf, decoder_model_inf)`` where

        * ``encoder_model_inf`` maps an input sequence to
          ``[all_encoder_outputs] + final_states`` of the last encoder layer
          (two states for LSTM, one otherwise);
        * ``decoder_model_inf`` maps
          ``[single_token, all_encoder_outputs] + states`` to
          ``[token_probs, attention_weights] + new_states`` for one decoding
          step, with the length-1 time axis removed from the first two outputs.
    """
    # --- Encoder model ---
    # The InputLayer's output is the symbolic tensor fed to the network.
    encoder_inputs_inf = training_model.get_layer("encoder_inputs").output
    encoder_all_outputs_inf = training_model.get_layer("encoder_embedding")(encoder_inputs_inf)

    encoder_states_inf = []  # Final state(s) of the last encoder layer
    for i in range(config.encoder_layers):
        encoder_rnn_layer_inf = training_model.get_layer(f"encoder_{config.cell_type}_{i}")
        # LSTM yields (outputs, h, c); GRU / SimpleRNN yield (outputs, h).
        encoder_all_outputs_inf, *encoder_states_inf = encoder_rnn_layer_inf(encoder_all_outputs_inf)

    encoder_model_inf = Model(encoder_inputs_inf, [encoder_all_outputs_inf] + encoder_states_inf)

    # --- Decoder model (one step) ---
    decoder_input_single_step = Input(shape=(1,), name="decoder_input_single_step")  # One token
    encoder_all_outputs_as_input = Input(
        shape=(max_encoder_seq_length, config.hidden_size),
        name="encoder_all_outputs_as_input",
    )

    # One state input for GRU / SimpleRNN, two (h and c) for LSTM.
    if config.cell_type == "LSTM":
        state_input_names = ("decoder_state_h_input", "decoder_state_c_input")
    else:
        state_input_names = ("decoder_state_h_input",)
    decoder_initial_states_inputs = [
        Input(shape=(config.hidden_size,), name=state_name) for state_name in state_input_names
    ]

    # Trained layers reused for the step (single-layer decoder assumed).
    dec_emb_layer_inf = training_model.get_layer("decoder_embedding")
    decoder_rnn_layer_inf = training_model.get_layer(f"decoder_{config.cell_type}")
    attention_layer_inf = training_model.get_layer("attention_layer")  # Keras AdditiveAttention
    concat_layer_inf = training_model.get_layer("concat_layer")
    decoder_dense_inf = training_model.get_layer("decoder_output_dense")

    dec_emb_single_step = dec_emb_layer_inf(decoder_input_single_step)  # (batch, 1, embedding_dim)

    # RNN step: output is (batch, 1, hidden_size) followed by the new state(s).
    decoder_rnn_output_step, *decoder_new_states = decoder_rnn_layer_inf(
        dec_emb_single_step, initial_state=decoder_initial_states_inputs
    )

    # Attention step: query is the decoder output, value is the encoder outputs.
    context_vector_step, attention_weights_step = attention_layer_inf(
        [decoder_rnn_output_step, encoder_all_outputs_as_input], return_attention_scores=True
    )
    # context_vector_step: (batch, 1, hidden_size)
    # attention_weights_step: (batch, 1, max_encoder_seq_length)

    decoder_concat_input_step = concat_layer_inf([decoder_rnn_output_step, context_vector_step])
    decoder_output_final_step = decoder_dense_inf(decoder_concat_input_step)  # (batch, 1, num_tokens)

    # Drop the length-1 time axis with Reshape layers rather than raw tf.squeeze:
    # Keras 3 rejects TensorFlow ops applied to symbolic Keras tensors, while
    # Reshape performs the same squeeze in either Keras version.
    squeezed_attention_weights = keras.layers.Reshape(
        (max_encoder_seq_length,), name="squeeze_attention_weights"
    )(attention_weights_step)
    squeezed_decoder_output = keras.layers.Reshape(
        (decoder_dense_inf.units,), name="squeeze_decoder_output"
    )(decoder_output_final_step)

    decoder_model_inf = Model(
        [decoder_input_single_step, encoder_all_outputs_as_input] + decoder_initial_states_inputs,
        [squeezed_decoder_output, squeezed_attention_weights] + decoder_new_states
    )

    return encoder_model_inf, decoder_model_inf

# Beam Search Decode Function for Attention Model

In [8]:
def decode_sequence_beam_search_attention(input_seq_vectorized, encoder_model, decoder_model, beam_width, config,
                                          *, token_index=None, reverse_index=None, max_len=None,
                                          sos_token=None, eos_token=None):
    """Decode one input sequence with beam search, collecting attention weights.

    Args:
        input_seq_vectorized: Encoder input for a single example (batch of 1).
        encoder_model: Object whose ``predict`` returns
            ``[all_encoder_outputs, *final_states]``.
        decoder_model: Object whose ``predict`` takes
            ``[last_token, all_encoder_outputs, *states]`` and returns
            ``[token_probs, attention_weights, *new_states]``.
        beam_width: Number of hypotheses kept per step (values below 1 are
            treated as 1).
        config: Unused; kept for interface compatibility.
        token_index, reverse_index, max_len, sos_token, eos_token: Optional
            overrides for the target vocabulary mappings, the maximum number
            of decoding steps and the start/end tokens. When left as ``None``
            the notebook-level ``target_token_index``,
            ``reverse_target_char_index``, ``max_decoder_seq_length``,
            ``SOS_TOKEN`` and ``EOS_TOKEN`` are used.

    Returns:
        ``(decoded_sentence, attention_matrix)``: the decoded string (without
        SOS/EOS) and a 2-D array with one row of attention weights per
        generated step (including the EOS step). Hypotheses are ranked by
        log-probability divided by the number of generated tokens. If no
        hypothesis exists, ``("", np.array([]))`` is returned.
    """
    # Resolve defaults lazily so the notebook globals remain the default vocabulary.
    token_index = target_token_index if token_index is None else token_index
    reverse_index = reverse_target_char_index if reverse_index is None else reverse_index
    max_len = max_decoder_seq_length if max_len is None else max_len
    sos_idx = token_index[SOS_TOKEN if sos_token is None else sos_token]
    eos_idx = token_index[EOS_TOKEN if eos_token is None else eos_token]
    # A width of 0 would make the `[-beam_width:]` slice select every token.
    beam_width = max(1, int(beam_width))

    def _is_finished(seq):
        # The lone SOS token never counts as finished, whatever its index is.
        return len(seq) > 1 and seq[-1] == eos_idx

    def _normalized(score, seq):
        # Length-normalise by the number of generated tokens (SOS excluded).
        return score / (len(seq) - 1) if len(seq) > 1 else score

    # Encode once; the encoder outputs are the attention values for every step.
    encoder_outputs = encoder_model.predict(input_seq_vectorized, verbose=0)
    encoder_all_hidden_states = encoder_outputs[0]
    initial_decoder_states = list(encoder_outputs[1:])  # [h, c] for LSTM, [h] otherwise

    # Each hypothesis: (token indices, summed log-prob, decoder states, attention rows)
    live_hypotheses = [([sos_idx], 0.0, initial_decoder_states, [])]
    completed_hypotheses = []

    for _ in range(max_len):
        # Stop once nothing is left to expand. The finished-check must treat the
        # initial [SOS] hypothesis as unfinished, otherwise decoding would stop
        # before the first step.
        if not live_hypotheses or all(_is_finished(h[0]) for h in live_hypotheses):
            break

        candidates = []
        for seq_indices, score, decoder_states, attn_rows in live_hypotheses:
            if _is_finished(seq_indices):
                completed_hypotheses.append(
                    (seq_indices, _normalized(score, seq_indices), decoder_states, attn_rows)
                )
                continue

            # One decoder step from the last emitted token.
            step_inputs = [np.array([[seq_indices[-1]]]), encoder_all_hidden_states] + list(decoder_states)
            step_outputs = decoder_model.predict(step_inputs, verbose=0)
            token_probs = step_outputs[0]        # (1, num_decoder_tokens)
            attention_step = step_outputs[1]     # (1, input_seq_len)
            new_states = list(step_outputs[2:])

            log_probs = np.log(token_probs[0] + 1e-9)  # epsilon guards against log(0)

            for token_idx in np.argsort(log_probs)[-beam_width:]:
                token_idx = int(token_idx)
                candidates.append((
                    seq_indices + [token_idx],
                    score + float(log_probs[token_idx]),
                    new_states,
                    attn_rows + [attention_step[0]],
                ))

        # Keep the best `beam_width` expansions by raw cumulative log-probability.
        live_hypotheses = sorted(candidates, key=lambda h: h[1], reverse=True)[:beam_width]

    # Whatever is still live (finished at the break, or cut off by max_len)
    # competes with the completed hypotheses on the same normalised score.
    for seq_indices, score, decoder_states, attn_rows in live_hypotheses:
        completed_hypotheses.append(
            (seq_indices, _normalized(score, seq_indices), decoder_states, attn_rows)
        )

    if not completed_hypotheses:
        return "", np.array([])

    best_hypothesis = max(completed_hypotheses, key=lambda h: h[1])
    decoded_sentence_indices = best_hypothesis[0]
    final_attention_weights_list = best_hypothesis[3]

    # Convert indices back to characters, skipping SOS and stopping at EOS.
    decoded_chars = []
    for token_idx in decoded_sentence_indices:
        if token_idx == sos_idx:
            continue
        if token_idx == eos_idx:
            break
        if token_idx in reverse_index:
            decoded_chars.append(reverse_index[token_idx])
    decoded_sentence = "".join(decoded_chars)

    if final_attention_weights_list:
        attention_matrix = np.array(final_attention_weights_list)
        # Guarantee a 2-D (output_steps, input_len) matrix.
        if attention_matrix.ndim == 1:
            attention_matrix = np.expand_dims(attention_matrix, axis=0)
    else:
        attention_matrix = np.array([])

    return decoded_sentence, attention_matrix

# Training and Evaluation Function for Sweep (train_evaluate_attention)

In [9]:
def train_evaluate_attention():
    """Train one attention model for the current W&B sweep run and log its score.

    Reads hyperparameters from ``wandb.config``, builds the model with
    ``build_attention_seq2seq_model``, trains it on the pre-vectorized
    training arrays with early stopping on ``val_loss``, and logs the sweep
    metric ``val_exact_match_accuracy_attention``.

    The logged value is the validation ``accuracy`` of the epoch with the
    lowest ``val_loss``. That is the epoch whose weights EarlyStopping
    restores, so the metric describes the model that is actually kept rather
    than the last (possibly worse) epoch.

    NOTE(review): despite the metric's name, this is the Keras ``accuracy``
    metric the model is compiled with (computed per timestep under teacher
    forcing), not word-level exact match from free-running decoding.
    """
    keras.backend.clear_session()
    run = wandb.init()  # Project/entity inherited from sweep
    config = wandb.config
    run_label = run.id if run else 'N/A'

    print(f"--- Attention Model: Building model for run {run_label} with config: {dict(config)} ---")
    attention_training_model = build_attention_seq2seq_model(config)

    early_stopping = EarlyStopping(monitor='val_loss', patience=config.early_stopping_patience, restore_best_weights=True, verbose=1)
    wandb_metrics_logger = WandbMetricsLogger(log_freq="epoch")

    print(f"--- Attention Model: Starting training for run {run_label} ---")
    history = attention_training_model.fit(
        [encoder_input_train, decoder_input_train],
        decoder_target_train,
        batch_size=config.batch_size,
        epochs=config.epochs,
        validation_data=([encoder_input_val, decoder_input_val], decoder_target_val),
        callbacks=[early_stopping, wandb_metrics_logger],
        verbose=1
    )

    # Report the score of the best-val_loss epoch (first one on ties), matching
    # the weights kept by restore_best_weights=True.
    best_epoch = int(np.argmin(history.history['val_loss']))
    wandb.log({"val_exact_match_accuracy_attention": history.history['val_accuracy'][best_epoch]})

# Wandb Sweep Configuration for Attention Model

In [10]:
# Candidate values for every swept hyperparameter.
# NOTE(review): in the code visible in this notebook, `decoder_layers` and
# `beam_size` are not read by the model builder or the training function;
# confirm they are consumed elsewhere before relying on them.
_attention_sweep_values = {
    'input_embedding_size': [32, 64, 128],
    'hidden_size': [64, 128, 256],
    'encoder_layers': [1, 2],
    'decoder_layers': [1, 2],
    'cell_type': ['GRU', 'LSTM'],
    'dropout_rate': [0.2, 0.3],
    'learning_rate': [0.001],
    'batch_size': [64, 128, 256],
    'epochs': [50],
    'early_stopping_patience': [5],
    'beam_size': [1, 3, 5],
    'optimizer': ['adam', 'nadam'],
}

# Bayesian-optimization sweep that maximizes the metric logged by the training function.
sweep_config_attention = {
    'method': 'bayes',
    'metric': {
        'name': 'val_exact_match_accuracy_attention',
        'goal': 'maximize',
    },
    'parameters': {
        param_name: {'values': candidate_values}
        for param_name, candidate_values in _attention_sweep_values.items()
    },
}

# Start Sweep Agent for Attention Model

In [None]:
# --- Sweep settings ---
WANDB_ENTITY = "ce21b097-indian-institute-of-technology-madras"  # Replace with your entity
WANDB_PROJECT = "CE21B097 - DA6401 - Assignment 3"  # Example: New project or tag runs
SWEEP_RUN_COUNT = 15  # Number of runs the agent executes

# --- Register the sweep for the attention model ---
sweep_id_attention = wandb.sweep(
    sweep_config_attention,
    entity=WANDB_ENTITY,
    project=WANDB_PROJECT,
)

# --- Run the agent: each run calls train_evaluate_attention once ---
wandb.agent(sweep_id_attention, function=train_evaluate_attention, count=SWEEP_RUN_COUNT)

print("\n--- Attention Model Sweep Finished ---")
print("Go to your W&B project page to see the results.")

# (OR) Load the best model's saved test-set predictions directly

In [21]:
# Load the saved test-set predictions. The file's first column is the row index
# written at save time; read it as the index so it does not appear as a
# spurious "Unnamed: 0" data column.
predictions_attention = pd.read_csv(
    "predictions_attention/test_predictions_attention.tsv", sep='\t', index_col=0
)

In [25]:
# Preview the last ten rows of the loaded predictions.
predictions_attention.tail(10)

Unnamed: 0,Input (Roman),Actual (Devanagari),Predicted (Devanagari)
4492,have,हैव,हावे
4493,hong,हॉन्ग,होंग
4494,half,हॉफ,हाल्फा
4495,hoaf,हॉफ,होफ
4496,hounga,होऊंगा,होंग
4497,holding,होल्डिंग,होल्डिंग
4498,hoshangabaad,होशंगाबाद,होशनागबाद
4499,hoshangabad,होशंगाबाद,होशनंगद
4500,hostes,होस्टेस,होस्टेस
4501,hostess,होस्टेस,होस्टेस
