In [1]:
import unicodedata
import re
import numpy as np
import json
import tensorflow as tf
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

max_encoding_len = 79
max_decoding_len = 81


In [2]:
# Load the TFLite model
interpreter = tf.lite.Interpreter(model_path='TFLiteModel/fixed_input_gru_luong_attention.tflite')
# Allocate tensors
interpreter.allocate_tensors()

In [3]:
# Get input and output details
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

print("Input Details:", input_details)
print("Output Details:", output_details)

with open('WordTokenizers\\fixed_source_tokenizer.json') as f:
    data = json.load(f)
    source_tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(data)

with open('WordTokenizers\\fixed_target_tokenizer.json') as f:
    data = json.load(f)
    target_tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(data)

Input Details: [{'name': 'serving_default_encoder_inputs:0', 'index': 0, 'shape': array([ 1, 78]), 'shape_signature': array([-1, 78]), 'dtype': <class 'numpy.float32'>, 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}, 'sparsity_parameters': {}}, {'name': 'serving_default_decoder_inputs:0', 'index': 1, 'shape': array([ 1, 81]), 'shape_signature': array([-1, 81]), 'dtype': <class 'numpy.float32'>, 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}, 'sparsity_parameters': {}}]
Output Details: [{'name': 'StatefulPartitionedCall:0', 'index': 138, 'shape': array([    1,     1, 13373]), 'shape_signature': array([   -1,    -1, 13373]), 'dtype': <class 'numpy.float32'>, 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtyp

In [4]:
import unicodedata
import re
import numpy as np
import json
import tensorflow as tf
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load tokenizers
def load_tokenizers():
    with open('WordTokenizers\\fixed_source_tokenizer.json') as f:
        data = json.load(f)
        source_tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(data)

    with open('WordTokenizers\\fixed_target_tokenizer.json') as f:
        data = json.load(f)
        target_tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(data)

    return source_tokenizer, target_tokenizer

# Define the function to normalize Unicode characters
def normalize_unicode(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

# Define the function to preprocess a sentence
def preprocess_sentence(s):
    s = normalize_unicode(s)
    # Add spaces around punctuation marks (., ?, !, etc.)
    s = re.sub(r"([?.!,¿])", r" \1 ", s)
    # Replace multiple spaces with a single space
    s = re.sub(r'[" "]+', " ", s)
    # Strip leading and trailing spaces
    s = s.strip()
    return s

# Define the function to tag target sentences
def tag_target_sentences(sentences):
    tagged_sentences = map(lambda s: (' ').join(['<sos>', s, '<eos>']), sentences)
    return list(tagged_sentences)

# Combined function to preprocess the input and return padded sequence
def preprocess_input_sequence(user_input, max_encoding_len=78):
    # Load tokenizers
    source_tokenizer, target_tokenizer = load_tokenizers()

    # Preprocess the input sentence
    unicode_input = normalize_unicode(user_input)
    preprocess_input = preprocess_sentence(unicode_input)
    
    # Tag target sentences (assuming the user input is a target sentence)
    tagged_input = tag_target_sentences([preprocess_input])
    
    # Convert the sentence to a sequence using source tokenizer
    train_encoder_inputs = source_tokenizer.texts_to_sequences(tagged_input)
    
    # Pad the sequence to the maximum encoding length
    padded_input_sequence = pad_sequences(train_encoder_inputs, maxlen=max_encoding_len, padding='post')
    
    return np.array(padded_input_sequence, dtype=np.float32)

In [5]:
def decode_sequence_tflite(interpreter, encoder_input, target_tokenizer, max_decoding_len=81):

    start_token_index = target_tokenizer.word_index['<sos>']
    end_token_index = target_tokenizer.word_index['<eos>']

    decoder_input = np.zeros((1, 81), dtype=np.float32)
    decoder_input[0, 0] = start_token_index  # First token is <sos>

    translated_tokens = []

    for i in range(max_decoding_len):
        # Set encoder and decoder inputs
        interpreter.set_tensor(0, encoder_input)  # Encoder input at index 0
        interpreter.set_tensor(1, decoder_input)  # Decoder input at index 1

        # Run inference
        interpreter.invoke()

        # Get the output probabilities
        output_probs = interpreter.get_tensor(138)  # Output at index 138
        predicted_token_index = np.argmax(output_probs[0, 0])  # Pick token with max probability

        # Check for end of sentence
        if predicted_token_index == end_token_index:
            break

        # Append token to result
        translated_tokens.append(target_tokenizer.index_word.get(predicted_token_index, '<unk>'))

        # Update decoder input
        if i + 1 < 81:
            decoder_input[0, i + 1] = predicted_token_index

    return ' '.join(translated_tokens)


In [6]:
def encode_input(input_sequence):
    # Set the encoder input tensor
    interpreter.set_tensor(input_details[0]['index'], input_sequence)
    interpreter.invoke()

    # Extract the encoder outputs and state
    encoder_outputs = interpreter.get_tensor(output_details[0]['index'])
    encoder_state = interpreter.get_tensor(output_details[1]['index'])
    return encoder_outputs, encoder_state


In [7]:
def translate_tflite(input_sentence, interpreter, source_tokenizer, target_tokenizer, max_encoding_len=79, max_decoding_len=81):
    encoder_input = preprocess_input_sequence(input_sentence)
    translated_sentence = decode_sequence_tflite(interpreter, encoder_input, target_tokenizer, max_decoding_len)
    return translated_sentence


In [15]:
# Example Usage:
spanish_sentence = "Right"
# Translate the sentence
translated_english_sentence = translate_tflite(
    spanish_sentence, 
    interpreter, 
    source_tokenizer, 
    target_tokenizer,
)

print(f"Spanish: {spanish_sentence}")
print(f"English: {translated_english_sentence}")


Spanish: Right
English: let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let let
