# Encoder Decoder 

### Imports

In [1]:
import pandas as pd
import numpy as np
import re
import string
from datasets import load_dataset
from sklearn.utils import shuffle

### Importing Data

In [59]:
#df = pd.read_csv('ASL_English.csv')
df = pd.read_csv('/kaggle/input/english-gloss-dataset/train.csv')
df.head()

Unnamed: 0,gloss,text
0,﻿MEMBERSHIP PARLIAMENT SEE MINUTE\n,﻿membership of parliament see minutes\n
1,APPROVAL MINUTE DESC-PREVIOUS SIT SEE MINUTE\n,approval of minutes of previous sitting see mi...
2,MEMBERSHIP PARLIAMENT SEE MINUTE\n,membership of parliament see minutes\n
3,VERIFICATION CREDENTIALS SEE MINUTE\n,verification of credentials see minutes\n
4,DOCUMENT RECEIVE SEE MINUTE\n,documents received see minutes\n


In [60]:
df.shape

(87710, 2)

### Preprocessing

Note:
- Replace the numbers/digits
- Check regarding Finger spellings
- Check if it is required to add start and end tokens to target sequences

In [61]:
replacements = {'1': " one ", '2': " two ", '3': " three ", '4': " four ", 
                '5': " five ", '6': " six ", '7': " seven ", '8': " eight ", 
                '9': " nine ", '0': " zero "}

# Apply number replacement to text (English input)
df['text'] = df['text'].apply(lambda x: re.sub(r'(\d)', lambda m: replacements[m.group()], x))

# Apply number replacement to gloss (ASL output)
df['gloss'] = df['gloss'].apply(lambda x: re.sub(r'(\d)', lambda m: replacements[m.group()], x))

df['gloss'] = df['gloss'].apply(lambda x: re.sub(r'\s*\(wh\)\s*', ' ', x))

# Remove extra spaces
df['text'] = df['text'].apply(lambda x: x.strip())
df['gloss'] = df['gloss'].apply(lambda x: x.strip())

# Convert to lowercase
df['text'] = df['text'].apply(lambda x: x.lower())
df['gloss'] = df['gloss'].apply(lambda x: x.lower())

# Remove double spaces
df['text'] = df['text'].apply(lambda x: re.sub(r'\s+', ' ', x))
df['gloss'] = df['gloss'].apply(lambda x: re.sub(r'\s+', ' ', x))

# Remove punctuation
df['text'] = df['text'].apply(lambda x: ''.join(ch for ch in x if ch not in string.punctuation))
df['gloss'] = df['gloss'].apply(lambda x: ''.join(ch for ch in x if ch not in string.punctuation))

# **Fix: Properly remove "(wh)" from gloss**


# Remove any additional double spaces left after removal
df['gloss'] = df['gloss'].apply(lambda x: re.sub(r'\s+', ' ', x).strip())

# Add tokens to gloss (since it's now the target output)
df['gloss'] = df['gloss'].apply(lambda x: 'START_ ' + x.strip() + ' _END')

In [62]:
df.head(10)

Unnamed: 0,gloss,text
0,START_ ﻿membership parliament see minute _END,﻿membership of parliament see minutes
1,START_ approval minute descprevious sit see mi...,approval of minutes of previous sitting see mi...
2,START_ membership parliament see minute _END,membership of parliament see minutes
3,START_ verification credentials see minute _END,verification of credentials see minutes
4,START_ document receive see minute _END,documents received see minutes
5,START_ write statement and descoral question t...,written statements and oral questions tabling ...
6,START_ petition see minute _END,petitions see minutes
7,START_ text agreement descforward by council s...,texts of agreements forwarded by the council s...
8,START_ action take on parliament xposs resolut...,action taken on parliaments resolutions see mi...
9,START_ agenda for next sit see minute _END,agenda for next sitting see minutes


In [63]:
df.tail(10)

Unnamed: 0,gloss,text
87700,START_ document receive see minute _END,documents received see minutes
87701,START_ write declaration include in register r...,written declarations included in the register ...
87702,START_ forwarding text adopt during sit see mi...,forwarding of texts adopted during the sitting...
87703,START_ date descforthcoming sit see minute _END,dates of forthcoming sittings see minutes
87704,START_ name you what _END,what is your name
87705,START_ name you what _END,what is your name
87706,START_ name you what _END,whats your name
87707,START_ name you what _END,whats your name
87708,START_ eat apple _END,i eat an apple
87709,START_ eat orange _END,i eat an orange


In [None]:
# df['ASL Gloss'].str.len().sort_values(ascending=False).head()
# df['English'].str.len().sort_values(ascending=False).head()

In [64]:
# Vocabulary of English (text column)
all_eng_words = set()
for eng in df['text']:
    for word in eng.split():
        all_eng_words.add(word)

# Vocabulary of ASL (gloss column)
all_ASL_words = set()
for asl in df['gloss']:
    for word in asl.split():
        all_ASL_words.add(word)


In [65]:
# Compute max length for English (source) text
length_list = [len(sentence.split()) for sentence in df['text']]  # English as input
max_length_src = np.max(length_list)  
print("Max length source (English):", max_length_src)

Max length source (English): 64


In [66]:
# Compute max length for ASL Gloss (target)
length_list = [len(sentence.split()) for sentence in df['gloss']]  # ASL Gloss as output
max_length_tar = np.max(length_list)  
print("Max length target (ASL Gloss):", max_length_tar)

Max length target (ASL Gloss): 57


In [67]:
input_words = sorted(list(all_eng_words))  # English as input
target_words = sorted(list(all_ASL_words))  # ASL Gloss as output

# Calculate vocab size for both source (English) and target (ASL Gloss)
num_encoder_tokens = len(all_eng_words) + 1  # English vocab size
num_decoder_tokens = len(all_ASL_words) + 1  # ASL Gloss vocab size

num_encoder_tokens, num_decoder_tokens


(20583, 15111)

In [68]:
# Add 1 for zero padding in both encoder (English) and decoder (ASL Gloss)
num_encoder_tokens += 1  # Padding for English input
num_decoder_tokens += 1  # Padding for ASL Gloss output

num_encoder_tokens, num_decoder_tokens


(20584, 15112)

In [69]:
# Create word to token dictionary for both source and target
input_token_index = dict([(word, i+1) for i, word in enumerate(input_words)])
target_token_index = dict([(word, i+1) for i, word in enumerate(target_words)])

# Create token to word dictionary for both source and target
reverse_input_char_index = dict((i, word) for word, i in input_token_index.items())
reverse_target_char_index = dict((i, word) for word, i in target_token_index.items())

In [70]:
def write_list_to_file(var_list):
    outputFile = open( "myVars.txt", "w")
    outputFile.write(str(var_list))
    outputFile.flush()
    outputFile.close()

var_list = [max_length_src, max_length_tar, num_encoder_tokens, num_decoder_tokens, input_token_index, target_token_index, reverse_target_char_index]
write_list_to_file(var_list)

In [54]:
shuffle(df).head(10)

Unnamed: 0,gloss,text
25558,START_ descvoluntarily xwe have take xit upon ...,voluntarily we have taken it upon ourselves t...
58890,START_ this will aid integration in case futur...,this will aid integration in case of future eu...
32446,START_ descre be no revolution descn descjust ...,there is no revolution then just some clarifi...
39883,START_ increase in temperature will descconsid...,the increase in temperature will considerably ...
4214,START_ address generalise lack information be ...,addressing the generalised lack of information...
27521,START_ for rest xi believe that descre be cons...,for the rest i believe that there is a consen...
19310,START_ xi would like to take this opportunity ...,i would like to take this opportunity to urge ...
85066,START_ descso as xyou see xwe be act on descse...,so as you see we are acting on several fronts
30849,START_ consumer descoften pay for service se c...,consumers often pay for the services of these ...
59625,START_ descn descre will be no need for quota ...,then there will be no need for quotas


Make a 90–10 train and test split and write a Python generator function to load the data in batches as follows:

In [71]:
from sklearn.model_selection import train_test_split
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model
import tensorflow as tf

In [72]:
# Train - Test Split
X, y = df['text'], df['gloss']  # English as input (X), ASL Gloss as output (y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

X_train.shape, X_test.shape

((78939,), (8771,))

Save the train and test dataframes for reproducing the results later, as they are shuffled.

In [None]:
#X_train.to_pickle('Weights_ASL/X_train.pkl')
#X_test.to_pickle('Weights_ASL/X_test.pkl')

In [73]:
import numpy as np

def generate_batch(X=X_train, y=y_train, batch_size=128):
    ''' Generate a batch of data '''
    while True:
        for j in range(0, len(X), batch_size):
            current_batch_size = min(batch_size, len(X) - j)  # Adjust for the last batch
            
            encoder_input_data = np.zeros((current_batch_size, max_length_src), dtype='float32')
            decoder_input_data = np.zeros((current_batch_size, max_length_tar), dtype='float32')
            decoder_target_data = np.zeros((current_batch_size, max_length_tar, num_decoder_tokens), dtype='float32')

            for i, (input_text, target_text) in enumerate(zip(X.iloc[j:j+current_batch_size], y.iloc[j:j+current_batch_size])):
                for t, word in enumerate(input_text.split()):
                    encoder_input_data[i, t] = input_token_index.get(word, 0)  # Handle unknown words

                for t, word in enumerate(target_text.split()):
                    if t < len(target_text.split()) - 1:
                        decoder_input_data[i, t] = target_token_index.get(word, 0)  # Handle unknown words
                    if 0 < t < max_length_tar:  # Ensure valid indexing
                        decoder_target_data[i, t - 1, target_token_index.get(word, 0)] = 1.  # One-hot encoding

            yield ([encoder_input_data, decoder_input_data], decoder_target_data)


Encoder - Decoder Model Architecture

In [74]:
latent_dim = 50

In [75]:
# Encoder
encoder_inputs = Input(shape=(None,))
enc_emb =  Embedding(num_encoder_tokens, latent_dim, mask_zero = True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)

# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

In [76]:
# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(num_decoder_tokens, latent_dim, mask_zero = True)
dec_emb = dec_emb_layer(decoder_inputs)

# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)

# Use a softmax to generate a probability distribution over the target vocabulary for each time step
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [77]:
# Model Summary
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 50)     1029200     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 50)     755600      input_2[0][0]                    
______________________________________________________________________________________________

In [79]:
# Compile the model
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=[tf.keras.metrics.Accuracy(), tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])

In [80]:
from keras import callbacks
earlystopping = callbacks.EarlyStopping(monitor="val_loss",
                                        mode="min", patience=5,
                                        restore_best_weights=True, verbose = 1)

In [84]:
train_samples = len(X_train)
val_samples = len(X_test)
batch_size = 128
epochs = 30
# epochs = 10

In [None]:
model.fit(generate_batch(X_test, y_test, batch_size = batch_size),
            batch_size = batch_size,
            steps_per_epoch = train_samples//batch_size,
            epochs=epochs,
            validation_data = generate_batch(X_test, y_test, batch_size = batch_size),
            validation_steps = val_samples//batch_size, callbacks=[earlystopping])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30

Always remember to save the weights

In [None]:
model.save_weights('nmt_weights_v5.h5')
model.save('model_v5.h5')

model.save('model_v4.h5')

Load the weights, if you close the application

In [None]:
#model.load_weights('nmt_weights_v4.h5')

Inference Setup

In [None]:
# Encode the input sequence to get the "thought vectors"
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder setup
# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

dec_emb2= dec_emb_layer(decoder_inputs) # Get the embeddings of the decoder sequence

# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2) # A dense softmax layer to generate prob dist. over the target vocabulary

# Final decoder model
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2)

Finally, we generate the output sequence by invoking the above setup in a loop as follows

Decode sample sequeces

In [None]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0] = target_token_index['START_']

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += ' '+sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '_END' or
           len(decoded_sentence) > 50):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    return decoded_sentence

Evaluation on Train Dataset

In [None]:
train_gen = generate_batch(X_train, y_train, batch_size = 1)
k=-1

k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input ASL sentence:', X_train.iloc[k:k+1].values[0])
print('Actual English Translation:', y_train.iloc[k:k+1].values[0][6:-4])
print('Predicted English Translation:', decoded_sentence[:-4])

In [None]:
asl_sentence = []
true_eng_trans = []
pred_eng_trans = []

for i in range(10):
    k+=1
    (input_seq, actual_output), _ = next(train_gen)
    decoded_sentence = decode_sequence(input_seq)
    asl_sentence.append(X_train.iloc[k:k+1].values[0])
    true_eng_trans.append(y_train.iloc[k:k+1].values[0][6:-4])
    pred_eng_trans.append(decoded_sentence[:-4])

for i in range(10):
    print('Input ASL sentence:', asl_sentence[i])
    print('Actual English Translation:', true_eng_trans[i])
    print('Predicted English Translation:', pred_eng_trans[i])
    print()

In [None]:
def preprocess_sentence(sentence):
    # lower case to standardize the sentence and remove extra spaces
    sentence = sentence.lower().strip()
    # if QM-wig or 6 Ws or How is in the sentence, then it is a question
    words = ['who', 'what', 'when', 'where', 'why', 'how']
    question_flag = 0
    if 'qm-wig' in sentence or any(word in sentence for word in words):
        question_flag = 1
    sentence = sentence.replace('qm-wig', '')

    # remove punctuation (isn't required but im still including it)
    sentence = re.sub(r"([?.!,])", "", sentence)
    # replace numbers with words
    number_replacements = {'1': " one ", '2':" two ", '3':" three ", '4':" four ", 
                           '5':" five ", '6':" six ", '7':" seven ", '8':" eight ", 
                           '9':" nine ", '0':" zero "}
    for key, value in number_replacements.items():
        sentence = sentence.replace(key, value)
    # remove extra spaces
    sentence = re.sub(r'[" "]+', " ", sentence)
    sentence = sentence.strip()

    words = sentence.split()
    result = []
    # Empty temporary list to store single letters
    temp = []
    for word in words:
        if len(word) == 1:
            temp.append(word)
        else:
            # If there are any single letters in the temporary list,
            # join them with a dash and append to the result list
            if temp:
                result.append('-'.join(temp))
                temp = []
            # Append the non-single letter word to the result list
            result.append(word)
    if temp:
        result.append('-'.join(temp))
    
    # Save the dashed words in a list so that it can be replaced later
    replaced_words = [match for match in result if "-" in match]
    # Replace the single letters with 'XXXXX' in the result list
    result = ["xxxxx" if '-' in element else element for element in result]
    # Join the words in the result list back into a string sentence
    sentence = ' '.join(result)

    return sentence, question_flag, replaced_words

In [None]:
sentences_to_test = ['CITY YOU LIVE','TODAY, YOUR LAST CLASS WHAT',
                     'YOUR NEXT CLASS WHAT','YOUR NAME WHAT','YOU LIKE YOUR WORK',
                     'YOU WORK WHERE','YOUR NAME WHAT','HELLO MY NAME B O B',
                     'HOW YOU','ME BUSY BUSY BUSY','ME HAPPY SEE YOU','HOW YOUR DAY',
                     'ALL DAY WORK ME','YOU WORK YOU DODO','SCHOOL ME WORK']
sentences_translation = []
sentences_counter = 0

In [None]:
import re
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model
import numpy as np
import time
import ast

class colors:
    RED_BOLD = '\033[91m' + '\033[1m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    UNDERLINE = '\033[4m'
    UNDERLINE_GREEN = '\033[4m' + '\033[92m'

def read_list_from_file():
    inputFile = open( "/kaggle/working/myVars.txt", "r")
    lines = inputFile.readlines()

    objects = []
    for line in lines:
        objects.append(ast.literal_eval(line))
    
    return objects[0][0], objects[0][1], objects[0][2], objects[0][3], objects[0][4], objects[0][5], objects[0][6]

# get the start time
st_final = time.time()
st = time.time()

max_length_src, max_length_tar, num_encoder_tokens, num_decoder_tokens, input_token_index, target_token_index, reverse_target_char_index = read_list_from_file()

print(colors.UNDERLINE_GREEN + 'Importing Variables:' + colors.ENDC, round(time.time() - st, 2), 'seconds')
st = time.time()

latent_dim = 50

# Encoder
encoder_inputs = Input(shape=(None,))
enc_emb =  Embedding(num_encoder_tokens, latent_dim, mask_zero = True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(num_decoder_tokens, latent_dim, mask_zero = True)
dec_emb = dec_emb_layer(decoder_inputs)

'''
We set up our decoder to return full output sequences, and to return internal states as well. 
We don't use the return states in the training model, but we will use them in inference.
'''
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)
# Define the model that will turn `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

print(colors.UNDERLINE_GREEN + 'Setting up Model:' + colors.ENDC, round(time.time() - st, 2), 'seconds')
st = time.time()

model.load_weights('/kaggle/working/nmt_weights_v5.h5')

print(colors.UNDERLINE_GREEN + 'Loading Weights:' + colors.ENDC, round(time.time() - st, 2), 'seconds')
st = time.time()

### INFERENCING ###
encoder_model = Model(encoder_inputs, encoder_states) # Encode the input sequence to get the "thought vectors"

# Decoder setup - Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
dec_emb2 = dec_emb_layer(decoder_inputs) # Get the embeddings of the decoder sequence

# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2) # A dense softmax layer to generate prob dist. over the target vocabulary

# Final decoder model
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2)

print(colors.UNDERLINE_GREEN + 'Setting up Decoder:' + colors.ENDC, round(time.time() - st, 2), 'seconds')
st = time.time()

# Reverse-lookup token index to decode sequences back to something readable.
def decode_sequence(input_text):
    encoder_input_data = np.zeros((1, max_length_src), dtype='float32')
    error_word = ''
    try:
        for i, input_text in enumerate([input_text]):
            #print(colors.WARNING + "i:", i, " | input_text: ", input_text, "" + colors.ENDC)
            for t, word in enumerate(input_text.split()):
                error_word = word
                encoder_input_data[i, t] = input_token_index[word]
    except:
        return colors.RED_BOLD + '"' + error_word + '" doesn\'t exist in the dataset.' + colors.ENDC
    
    states_value = encoder_model.predict(encoder_input_data)
    
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_token_index['START_']
    stop_condition = False
    decoded_sentence = ''

    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += ' ' + sampled_char
        
        if (sampled_char == '_END' or len(decoded_sentence) > 50):
            stop_condition = True
        
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]
    
    return decoded_sentence[:-4]

def preprocess_sentence(sentence):
    # lower case to standardize the sentence and remove extra spaces
    sentence = sentence.lower().strip()
    # if QM-wig or 6 Ws or How is in the sentence, then it is a question
    words = ['who', 'what', 'when', 'where', 'why', 'how']
    question_flag = 0
    if 'qm-wig' in sentence or any(word in sentence for word in words):
        question_flag = 1
    sentence = sentence.replace('qm-wig', '')

    # remove punctuation (isn't required but im still including it)
    sentence = re.sub(r"([?.!,])", "", sentence)
    # replace numbers with words
    number_replacements = {'1': " one ", '2':" two ", '3':" three ", '4':" four ", 
                           '5':" five ", '6':" six ", '7':" seven ", '8':" eight ", 
                           '9':" nine ", '0':" zero "}
    for key, value in number_replacements.items():
        sentence = sentence.replace(key, value)
    # remove extra spaces
    sentence = re.sub(r'[" "]+', " ", sentence)
    sentence = sentence.strip()

    words = sentence.split()
    result = []
    # Empty temporary list to store single letters
    temp = []
    for word in words:
        if len(word) == 1:
            temp.append(word)
        else:
            # If there are any single letters in the temporary list,
            # join them with a dash and append to the result list
            if temp:
                result.append('-'.join(temp))
                temp = []
            # Append the non-single letter word to the result list
            result.append(word)
    if temp:
        result.append('-'.join(temp))
    
    # Save the dashed words in a list so that it can be replaced later
    replaced_words = [match for match in result if "-" in match]
    # Replace the single letters with 'XXXXX' in the result list
    result = ["xxxxx" if '-' in element else element for element in result]
    # Join the words in the result list back into a string sentence
    sentence = ' '.join(result)

    return sentence, question_flag, replaced_words

sentences_to_test = ['CITY YOU LIVE','TODAY, YOUR LAST CLASS WHAT',
                     'YOUR NEXT CLASS WHAT','YOUR NAME WHAT','YOU LIKE YOUR WORK',
                     'YOU WORK WHERE','YOUR NAME WHAT','HELLO MY NAME B O B',
                     'HOW YOU','ME BUSY BUSY BUSY','ME HAPPY SEE YOU','HOW YOUR DAY',
                     'ALL DAY WORK ME','YOU WORK YOU DODO','SCHOOL ME WORK']
sentences_translation = []
sentences_counter = 0

while sentences_counter < len(sentences_to_test):
    #input_text = input(colors.WARNING + 'Input ASL sentence: ' + colors.ENDC)
    #prep_input, question_flag, replaced_words = preprocess_sentence(input_text)
    prep_input, question_flag, replaced_words = preprocess_sentence(sentences_to_test[sentences_counter])
    if prep_input == 'exit':
        break
    
    # if only 1 word is given, then no need to decode
    decoded_sentence = decode_sequence(prep_input) if len(prep_input.split()) > 1 else prep_input

    # if '?' not in decoded sentence and original input had 'QM-wig' then add '?' at the end
    if '?' not in decoded_sentence and question_flag == 1:
        decoded_sentence = decoded_sentence.strip() + '?'

    # Replace the 'XXXXX' with the original single letter words
    for word in replaced_words:
        decoded_sentence = decoded_sentence.replace('xxxxx', word.replace('-',''), 1)
    decoded_sentence = decoded_sentence.replace('xxxxx', '')
    
    # if decoded sentence contains ['who', 'what', 'when', 'where', 'why', 'how'] then add '?' at the end
    if any(word in decoded_sentence for word in ['who', 'what', 'when', 'where', 'why', 'how']) and '?' not in decoded_sentence:
        decoded_sentence = decoded_sentence.strip() + '?'
     
    sentences_translation.append(decoded_sentence)
    # Outputs 
    """
    print(colors.WARNING + '\nInput ASL sentence:' + colors.ENDC + "'" + input_text + "'")
    print(colors.WARNING + 'Preprocessed Input:' + colors.ENDC + "'" + prep_input + "'")
    print(colors.WARNING + 'Predicted English Translation:' + colors.ENDC, decoded_sentence)
    print(colors.UNDERLINE_GREEN + 'Decoding Sequence:' + colors.ENDC, round(time.time() - st, 2), 'seconds')
    """
    sentences_counter += 1

# Print sentences to test and their translations

for i, sentence in enumerate(sentences_to_test):
    print(colors.WARNING + 'Input ASL sentence:' + colors.ENDC + "'" + sentence + "'")
    print(colors.WARNING + 'Predicted English Translation:' + colors.ENDC, sentences_translation[i])
    print()

print(colors.UNDERLINE_GREEN + 'Total Execution time:' + colors.ENDC, round(time.time() - st_final, 2), 'seconds')