In [None]:
import io
import json
import numpy as np
import pandas as pd
import random
import re
import tensorflow as tf
import unicodedata
import os

from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Training data: one tab-separated sentence pair per line. The path is built with
# os.path.join so the notebook is not tied to Windows path separators.
# NOTE(review): no explicit encoding is passed, so the platform default is used — confirm the file's encoding.
TRAIN_PATH = os.path.join('Datasets', 'Spa-Eng', 'Spa_Eng_train_file.txt')
with open(TRAIN_PATH) as file:
  train = [line.rstrip() for line in file]

# Separate the target (Eng, first column) and input (Spa, second column) sentences into separate lists.
# Lines without both columns (e.g. blank lines) are skipped; a single such line would
# otherwise make zip() stop early and the two-name unpacking below fail.
SEPARATOR = '\t'
train_pairs = [fields[:2] for fields in (pair.split(SEPARATOR) for pair in train) if len(fields) >= 2]
train_target, train_input = map(list, zip(*train_pairs))

# Unicode normalization
def normalize_unicode(s):
    """Return `s` with combining marks removed.

    The string is decomposed with NFD, then every code point whose Unicode
    category is 'Mn' is dropped; all other characters are kept in order.
    """
    decomposed = unicodedata.normalize('NFD', s)
    base_chars = filter(lambda ch: unicodedata.category(ch) != 'Mn', decomposed)
    return ''.join(base_chars)

def preprocess_sentence(s):
  """Clean one raw sentence before tokenization.

  Steps, in order: strip combining marks via normalize_unicode, surround each
  of ? . ! , ¿ with spaces, collapse every run of spaces and double quotes
  into a single space, and trim leading/trailing whitespace.
  """
  # NOTE(review): '¡' is not in the punctuation set below, so it stays attached to the following word — confirm intended.
  without_marks = normalize_unicode(s)
  spaced_punctuation = re.sub(r"([?.!,¿])", r" \1 ", without_marks)
  single_spaced = re.sub(r'[" "]+', " ", spaced_punctuation)
  return single_spaced.strip()

# Run the same cleaning over the source (Spa) and target (Eng) training sentences.
train_preprocessed_input = list(map(preprocess_sentence, train_input))
train_preprocessed_target = list(map(preprocess_sentence, train_target))

def tag_target_sentences(sentences):
  """Wrap every sentence as '<sos> sentence <eos>' and return the results as a list."""
  return [' '.join(('<sos>', sentence, '<eos>')) for sentence in sentences]

train_tagged_preprocessed_target = tag_target_sentences(train_preprocessed_target)

# One filter set shared by both tokenizers so they cannot drift apart.
# Compared with a filter that strips all punctuation, this one leaves . , ! ? and the
# angle brackets alone, so sentence punctuation and the <sos>/<eos> tags survive as tokens.
TOKENIZER_FILTERS = '"#$%&()*+-/:;=@[\\]^_`{|}~\t\n'

# Tokenizer for the Spanish input sentences. Note how we're not filtering punctuation.
source_tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='<unk>', filters=TOKENIZER_FILTERS)
source_tokenizer.fit_on_texts(train_preprocessed_input)

# +1 leaves room for index 0, which is used for padding and never assigned to a word.
source_vocab_size = len(source_tokenizer.word_index) + 1

# Tokenizer for the English target sentences (fitted on the <sos>/<eos>-tagged text).
target_tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='<unk>', filters=TOKENIZER_FILTERS)
target_tokenizer.fit_on_texts(train_tagged_preprocessed_target)

target_vocab_size = len(target_tokenizer.word_index) + 1

# Encoder inputs are the Spanish sentences as integer id sequences.
train_encoder_inputs = source_tokenizer.texts_to_sequences(train_preprocessed_input)

def generate_decoder_inputs_targets(sentences, tokenizer):
  """Build shifted decoder input/target id sequences from tagged sentences.

  Parameters:
  - sentences: iterable of target sentences passed to tokenizer.texts_to_sequences.
  - tokenizer: object exposing texts_to_sequences(sentences) -> list of id lists.

  Returns:
  - (decoder_inputs, decoder_targets): two lists of id lists; the inputs omit
    the last id of each sequence and the targets omit the first one.
  """
  decoder_inputs, decoder_targets = [], []
  for token_ids in tokenizer.texts_to_sequences(sentences):
    decoder_inputs.append(token_ids[:-1])   # all but the last token
    decoder_targets.append(token_ids[1:])   # all but the first token
  return decoder_inputs, decoder_targets

# Shifted decoder input/target id sequences for the training split.
train_decoder_inputs, train_decoder_targets = generate_decoder_inputs_targets(train_tagged_preprocessed_target, 
                                                                              target_tokenizer)

# Longest tokenized training sequence on each side; these lengths are reused for padding below
# and by process_dataset for the other splits.
max_encoding_len = len(max(train_encoder_inputs, key=len))
max_decoding_len = len(max(train_decoder_inputs, key=len))

# Pad (and, if ever needed, cut) at the end of each sequence.
padded_train_encoder_inputs = pad_sequences(train_encoder_inputs, max_encoding_len, padding='post', truncating='post')
padded_train_decoder_inputs = pad_sequences(train_decoder_inputs, max_decoding_len, padding='post', truncating='post')
padded_train_decoder_targets = pad_sequences(train_decoder_targets, max_decoding_len, padding='post', truncating='post')

# Sanity check: map the first padded decoder input back to text.
target_tokenizer.sequences_to_texts([padded_train_decoder_inputs[0]])

# Validation data; the path is built with os.path.join so it is not tied to Windows separators.
# NOTE(review): no explicit encoding is passed, so the platform default is used — confirm the file's encoding.
VAL_PATH = os.path.join('Datasets', 'Spa-Eng', 'Spa_Eng_Val_file.txt')
with open(VAL_PATH) as file:
  val = [line.rstrip() for line in file]

def process_dataset(dataset):
  """Turn raw tab-separated sentence pairs into padded model inputs.

  Applies the same pipeline as the training split: split each line on
  SEPARATOR (first column = English target, second = Spanish source),
  preprocess, tag the targets with <sos>/<eos>, vectorize with the fitted
  source_tokenizer / target_tokenizer, and pad to max_encoding_len /
  max_decoding_len. Those tokenizers and lengths are read from module scope.

  Parameters:
  - dataset: iterable of str, one sentence pair per line.

  Returns:
  - (padded_encoder_inputs, padded_decoder_inputs, padded_decoder_targets)
  """
  # Split the English and Spanish sentences into separate lists. Lines without
  # both columns (e.g. blank lines) are skipped; a single such line would otherwise
  # make zip() stop early and the unpacking below fail.
  pairs = [fields[:2] for fields in (line.split(SEPARATOR) for line in dataset) if len(fields) >= 2]
  target_sentences, source_sentences = map(list, zip(*pairs))

  # Unicode normalization and inserting spaces around punctuation.
  preprocessed_input = [preprocess_sentence(s) for s in source_sentences]
  preprocessed_output = [preprocess_sentence(s) for s in target_sentences]

  # Tag target sentences with <sos> and <eos> tokens.
  tagged_preprocessed_output = tag_target_sentences(preprocessed_output)

  # Vectorize encoder source sentences.
  encoder_inputs = source_tokenizer.texts_to_sequences(preprocessed_input)

  # Vectorize and create decoder input and target sentences.
  decoder_inputs, decoder_targets = generate_decoder_inputs_targets(tagged_preprocessed_output, 
                                                                    target_tokenizer)
  
  # Pad all collections to the lengths derived from the training split.
  padded_encoder_inputs = pad_sequences(encoder_inputs, max_encoding_len, padding='post', truncating='post')
  padded_decoder_inputs = pad_sequences(decoder_inputs, max_decoding_len, padding='post', truncating='post')
  padded_decoder_targets = pad_sequences(decoder_targets, max_decoding_len, padding='post', truncating='post')

  return padded_encoder_inputs, padded_decoder_inputs, padded_decoder_targets

# Run the validation split through the same pipeline as the training data.
(padded_val_encoder_inputs,
 padded_val_decoder_inputs,
 padded_val_decoder_targets) = process_dataset(val)

In [None]:
# Hyperparameters
embedding_dim = 128
hidden_dim = 256
default_dropout=0.2
batch_size = 128
epochs = 30

# Fixed sequence lengths (the batch dimension stays unspecified). They are taken from the
# padded training data instead of being hard-coded, so the Input shapes always match the
# arrays produced by pad_sequences.
encoder_inputs = layers.Input(shape=(max_encoding_len,), name='encoder_inputs')
decoder_inputs = layers.Input(shape=(max_decoding_len,), name='decoder_inputs')

# Encoder: embed the source ids (id 0 is treated as padding) and run a GRU that returns
# both the per-step outputs (needed for attention) and its final state.
encoder_embeddings = layers.Embedding(source_vocab_size, 
                                       embedding_dim, 
                                       mask_zero=True, 
                                       name='encoder_embeddings')(encoder_inputs)

encoder_outputs, encoder_state = layers.GRU(hidden_dim, 
                                            return_sequences=True, 
                                            return_state=True, 
                                            dropout=default_dropout, 
                                            name='encoder_gru')(encoder_embeddings)

# Decoder: same structure, started from the encoder's final state.
decoder_embeddings = layers.Embedding(target_vocab_size, 
                                       embedding_dim, 
                                       mask_zero=True, 
                                       name='decoder_embeddings')(decoder_inputs)

decoder_outputs, decoder_state = layers.GRU(hidden_dim, 
                                            return_sequences=True, 
                                            return_state=True, 
                                            dropout=default_dropout, 
                                            name='decoder_gru')(decoder_embeddings, 
                                                                initial_state=encoder_state)

# Dot-product attention: score every decoder step against every encoder step, softmax the
# scores, and use them to take a weighted sum of the encoder outputs.
attention_scores = layers.Dot(axes=[2, 2], name='attention_scores')([decoder_outputs, encoder_outputs])
attention_weights = layers.Activation('softmax', name='attention_weights')(attention_scores)
context_vector = layers.Dot(axes=[2, 1], name='context_vector')([attention_weights, encoder_outputs])
decoder_combined_context = layers.Concatenate(axis=-1, name='decoder_combined_context')([context_vector, decoder_outputs])

# Per-step probability distribution over the target vocabulary.
y_proba = layers.Dense(target_vocab_size, activation='softmax', name='decoder_dense')(decoder_combined_context)

In [7]:
# Assemble the trainable model: encoder ids + decoder ids in, per-step token probabilities out.
model_with_attention = tf.keras.Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=y_proba,
    name='fixed_input_gru_luong_attention',
)

# Targets are integer token ids, hence the sparse loss and metric.
model_with_attention.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['sparse_categorical_accuracy'],
)

# Print the layer table.
model_with_attention.summary()

Model: "fixed_input_gru_luong_attention"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_inputs (InputLayer)    [(None, 78)]         0           []                               
                                                                                                  
 decoder_inputs (InputLayer)    [(None, 81)]         0           []                               
                                                                                                  
 encoder_embeddings (Embedding)  (None, 78, 128)     3210624     ['encoder_inputs[0][0]']         
                                                                                                  
 decoder_embeddings (Embedding)  (None, 81, 128)     1711744     ['decoder_inputs[0][0]']         
                                                                    

In [8]:
# Define callbacks
checkpoint_filepath = 'Weights/FixedInputGRULuongAttention.weights.h5'
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_filepath,
                                                         save_weights_only=True,
                                                         verbose=1)

early_stopping_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

# Resume from the previous checkpoint when one exists. On a fresh run there is no file yet,
# and an unconditional load would stop the notebook before any training happens.
if os.path.exists(checkpoint_filepath):
    model_with_attention.load_weights(checkpoint_filepath)

# Train the model
# NOTE(review): this runs a single epoch per execution even though `epochs` is defined with the
# other hyperparameters; with only one epoch the early-stopping patience of 3 cannot trigger.
history = model_with_attention.fit(
    [padded_train_encoder_inputs, padded_train_decoder_inputs], 
    padded_train_decoder_targets,
    batch_size=batch_size,
    epochs=1,
    validation_data=([padded_val_encoder_inputs, padded_val_decoder_inputs], padded_val_decoder_targets),
    callbacks=[checkpoint_callback, early_stopping_callback]
)

Epoch 1: saving model to Weights\FixedInputGRULuongAttention.weights.h5


In [None]:
# Save the trained model; create the output folder first so a fresh checkout does not fail on a missing directory.
os.makedirs('SavedModel', exist_ok=True)
model_with_attention.save('SavedModel/FixedInputGRULuongAttention.keras')


In [None]:
# Persist both tokenizers as JSON and load them back.
# The same path and the same UTF-8 encoding are used for writing and reading: the files are
# written with ensure_ascii=False, so reading them with a different platform default
# encoding would corrupt non-ASCII tokens such as '¿'.
TOKENIZER_DIR = 'WordTokenizers'
os.makedirs(TOKENIZER_DIR, exist_ok=True)
source_tokenizer_path = os.path.join(TOKENIZER_DIR, 'fixed_source_tokenizer.json')
target_tokenizer_path = os.path.join(TOKENIZER_DIR, 'fixed_target_tokenizer.json')

source_tokenizer_json = source_tokenizer.to_json()
with io.open(source_tokenizer_path, 'w', encoding='utf-8') as f:
    f.write(json.dumps(source_tokenizer_json, ensure_ascii=False))

target_tokenizer_json = target_tokenizer.to_json()
with io.open(target_tokenizer_path, 'w', encoding='utf-8') as f:
    f.write(json.dumps(target_tokenizer_json, ensure_ascii=False))

# Reload from disk so the rest of the notebook uses exactly what was saved.
with io.open(source_tokenizer_path, encoding='utf-8') as f:
    data = json.load(f)
    source_tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(data)

with io.open(target_tokenizer_path, encoding='utf-8') as f:
    data = json.load(f)
    target_tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(data)

In [11]:
# Test data; the path is built with os.path.join so it is not tied to Windows separators.
# NOTE(review): no explicit encoding is passed, so the platform default is used — confirm the file's encoding.
TEST_PATH = os.path.join('Datasets', 'Spa-Eng', 'Spa_Eng_test_file.txt')
with open(TEST_PATH) as file:
  test = [line.rstrip() for line in file]

# Preprocess test dataset
padded_test_encoder_inputs, padded_test_decoder_inputs, padded_test_decoder_targets = process_dataset(test)

# Evaluate the model on the test set.
model_with_attention.evaluate([padded_test_encoder_inputs, padded_test_decoder_inputs], padded_test_decoder_targets)

# These are the layers of our trained model.
[layer.name for layer in model_with_attention.layers]



['encoder_inputs',
 'decoder_inputs',
 'encoder_embeddings',
 'decoder_embeddings',
 'encoder_gru',
 'decoder_gru',
 'attention_scores',
 'attention_weights',
 'context_vector',
 'decoder_combined_context',
 'decoder_dense']

In [12]:
def translate_with_attention(input_sentence, model, source_tokenizer, target_tokenizer, max_encoding_len, max_decoding_len):
    """
    Translate a Spanish sentence to English using a trained Seq2Seq model with Luong Attention.

    The trained layers are fetched from `model` by name ('encoder_embeddings',
    'encoder_gru', 'decoder_embeddings', 'decoder_gru', 'decoder_dense') and
    re-used for step-by-step greedy decoding: at every step the token with the
    highest probability is chosen and fed back as the next decoder input.
    Decoding stops when <eos> is predicted or after max_decoding_len steps.

    Parameters:
    - input_sentence: str, input sentence in Spanish.
    - model: tf.keras.Model, the trained model with attention.
    - source_tokenizer: Tokenizer for the source language (Spanish).
    - target_tokenizer: Tokenizer for the target language (English); its
      word_index must contain '<sos>' and '<eos>'.
    - max_encoding_len: int, maximum encoding sequence length.
    - max_decoding_len: int, maximum decoding sequence length.

    Returns:
    - Translated sentence as a string (predicted words joined by spaces, without <eos>).
    """
    # Look up the layers and special-token ids once; they do not change between steps.
    encoder_embedding_layer = model.get_layer('encoder_embeddings')
    encoder_gru = model.get_layer('encoder_gru')
    decoder_embedding_layer = model.get_layer('decoder_embeddings')
    decoder_gru = model.get_layer('decoder_gru')
    output_layer = model.get_layer('decoder_dense')

    start_token_index = target_tokenizer.word_index['<sos>']
    end_token_index = target_tokenizer.word_index['<eos>']

    # Preprocess input sentence
    preprocessed_sentence = preprocess_sentence(input_sentence)

    # Tokenize and pad the input sentence. truncating='post' matches how the training data
    # was padded, so an over-long input keeps its beginning instead of its end.
    input_sequence = source_tokenizer.texts_to_sequences([preprocessed_sentence])
    padded_input_sequence = pad_sequences(input_sequence, maxlen=max_encoding_len,
                                          padding='post', truncating='post')

    # Pass the input through the encoder to get the encoder outputs and initial decoder state
    encoder_outputs, encoder_state = encoder_gru(encoder_embedding_layer(padded_input_sequence))

    # Initialize the decoder input with the <sos> token and its state with the encoder state
    decoder_input = np.array([[start_token_index]])
    decoder_state = encoder_state

    # Translation result
    translated_sentence = []

    # Iteratively decode each timestep
    for _ in range(max_decoding_len):
        # Embed the current input token and advance the decoder GRU by one step
        decoder_embedding_output = decoder_embedding_layer(decoder_input)
        decoder_output, decoder_state = decoder_gru(decoder_embedding_output, initial_state=decoder_state)

        # Dot-product (Luong) attention scores against every encoder step
        attention_scores = tf.matmul(decoder_output, encoder_outputs, transpose_b=True)

        # Attention weights (softmax over scores)
        attention_weights = tf.nn.softmax(attention_scores, axis=-1)

        # Context vector: weighted sum of encoder outputs
        context_vector = tf.matmul(attention_weights, encoder_outputs)

        # Concatenate context vector and decoder output (same order as in the training graph)
        decoder_combined_context = tf.concat([context_vector, decoder_output], axis=-1)

        # Pass through the dense layer to generate token probabilities
        token_probs = output_layer(decoder_combined_context)

        # Greedy choice: the token with the highest probability
        predicted_token_index = tf.argmax(token_probs[0, 0]).numpy()

        # Stop if <eos> token is predicted
        if predicted_token_index == end_token_index:
            break

        # Append the predicted word to the translation
        translated_sentence.append(target_tokenizer.index_word.get(predicted_token_index, '<unk>'))

        # Feed the prediction back in as the next decoder input
        decoder_input = np.array([[predicted_token_index]])

    return ' '.join(translated_sentence)

# Smoke-test the inference pipeline on one Spanish sentence.
spanish_sentence = "Lo mejor de las suertes"
translated_english_sentence = translate_with_attention(
    input_sentence=spanish_sentence,
    model=model_with_attention,
    source_tokenizer=source_tokenizer,
    target_tokenizer=target_tokenizer,
    max_encoding_len=max_encoding_len,
    max_decoding_len=max_decoding_len,
)

# Show the source sentence next to the model's translation.
print(f"Spanish: {spanish_sentence}")
print(f"English: {translated_english_sentence}")

Spanish: Lo mejor de las suertes
English: do you prefer the best .


In [13]:
# Convert the model to TFLite format
converter = tf.lite.TFLiteConverter.from_keras_model(model_with_attention)

# Enable support for Select TensorFlow ops
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS]

# Disable experimental lowering of tensor list operations
converter._experimental_lower_tensor_list_ops = False

# Enable optimization for size and latency
converter.optimizations = [tf.lite.Optimize.DEFAULT]

# Convert the model
tflite_model = converter.convert()

# Save the TFLite model to disk. The folder is created if missing and the path is built
# with os.path.join so it is not tied to Windows separators.
TFLITE_DIR = 'TFLiteModel'
os.makedirs(TFLITE_DIR, exist_ok=True)
with open(os.path.join(TFLITE_DIR, 'fixed_input_gru_luong_attention.tflite'), 'wb') as f:
    f.write(tflite_model)



INFO:tensorflow:Assets written to: C:\Users\jstha\AppData\Local\Temp\tmpqgp7aalh\assets


INFO:tensorflow:Assets written to: C:\Users\jstha\AppData\Local\Temp\tmpqgp7aalh\assets
