# Machine Translator

In this project, we build a translation tool using Keras models and a sequence-to-sequence (seq2seq) neural network. The model translates at the character level.

In [60]:
## To allow for GPU computation
import os

# Location of the CUDA v11.7 runtime DLLs that TensorFlow needs for GPU support.
cuda_bin_dir = "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.7/bin"

# os.add_dll_directory is only available on Windows and raises if the
# directory does not exist, so register the path only when both hold.
# On any other machine the notebook simply continues without it.
if hasattr(os, "add_dll_directory") and os.path.isdir(cuda_bin_dir):
    os.add_dll_directory(cuda_bin_dir)

## Library Imports
import numpy as np
import tensorflow as tf
from tensorflow import keras
import re
from keras.layers import Input, LSTM, Dense
from keras.models import Model, load_model

In [61]:
## Model Hyperparameters
# Everything a reader might want to tune lives in this cell.
batch_size = 64        # samples per training batch
epochs = 100           # number of training epochs
latent_dim = 256       # width of the LSTM state that encodes a sentence
num_samples = 10_000   # upper bound on the number of corpus lines used

In [62]:
## Import training translations
data_path = "fra.txt"

# Read the whole corpus and keep it as a list with one entry per line.
# Splitting on '\n' leaves a trailing empty entry when the file ends with a
# newline; the preprocessing cell accounts for that.
with open(data_path, 'r', encoding='utf-8') as f:
    raw_text = f.read()
lines = raw_text.split('\n')

In [63]:
## Data Preprocessing
# Lists holding the source sentences and their target translations
input_docs = []
target_docs = []
# Vocabulary sets. This is a character-level model, so every token is a
# single character.
input_tokens = set()
target_tokens = set()

# The "- 1" skips the last entry of `lines` (the empty string left behind by
# splitting a file that ends with a newline).
for line in lines[:min(num_samples, len(lines)-1)]:
    # Only the first two tab-separated fields are used: source and target.
    input_doc, target_doc = line.split('\t')[:2]
    # We use "tab" as the "start sequence" character
    # for the targets, and "\n" as "end sequence" character.
    target_doc = "\t" + target_doc + "\n"
    input_docs.append(input_doc)
    target_docs.append(target_doc)

    # Iterating a string yields its characters, so update() adds every
    # character of the sentence to the vocabulary in one call.
    input_tokens.update(input_doc)
    target_tokens.update(target_doc)

# The space character doubles as the padding token below, so it must be part
# of both vocabularies even if no sentence happens to contain one.
input_tokens.add(" ")
target_tokens.add(" ")

input_tokens = sorted(input_tokens)
target_tokens = sorted(target_tokens)
num_encoder_tokens = len(input_tokens)
num_decoder_tokens = len(target_tokens)
max_encoder_seq_length = max(len(txt) for txt in input_docs)
max_decoder_seq_length = max(len(txt) for txt in target_docs)

print("Number of samples:", len(input_docs))
print("Number of unique input tokens:", num_encoder_tokens)
print("Number of unique output tokens:", num_decoder_tokens)
print("Max sequence length for inputs:", max_encoder_seq_length)
print("Max sequence length for outputs:", max_decoder_seq_length)

# For one-hot encoding, create a token to index dictionary
# (tokens are single characters)
input_features_dict = {token: i for i, token in enumerate(input_tokens)}
target_features_dict = {token: i for i, token in enumerate(target_tokens)}

# The encoder and decoder accept vectors as input, so vectorize the text.
# Each array has shape (sentence, timestep, vocabulary index) and holds the
# one-hot encoding of every character of every sentence.
encoder_input_data = np.zeros(
    (len(input_docs), max_encoder_seq_length, num_encoder_tokens),
    dtype='float32')
decoder_input_data = np.zeros(
    (len(input_docs), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')
decoder_target_data = np.zeros(
    (len(input_docs), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')

# Populate the above matrices
for line, (input_doc, target_doc) in enumerate(zip(input_docs, target_docs)):

    for timestep, token in enumerate(input_doc):
        encoder_input_data[line, timestep, input_features_dict[token]] = 1.0
    # Pad the remaining timesteps with spaces. The offset is taken from the
    # sentence length rather than the loop variable, so an empty sentence is
    # padded from position 0 instead of reusing a stale `timestep`.
    encoder_input_data[line, len(input_doc):, input_features_dict[" "]] = 1.0

    for timestep, token in enumerate(target_doc):
        # Teacher forcing: the decoder input at step t is the true character
        # at step t, and the target at step t is the true character at t + 1.
        decoder_input_data[line, timestep, target_features_dict[token]] = 1.0
        if timestep > 0:
            # decoder_target_data is ahead by one timestep
            # and does not include the start character.
            decoder_target_data[line, timestep - 1, target_features_dict[token]] = 1.0
    # Pad with spaces. The target holds one character fewer than the input
    # (no start character), so its padding begins one step earlier.
    decoder_input_data[line, len(target_doc):, target_features_dict[" "]] = 1.0
    decoder_target_data[line, len(target_doc) - 1:, target_features_dict[" "]] = 1.0

Number of samples: 10000
Number of unique input tokens: 71
Number of unique output tokens: 93
Max sequence length for inputs: 15
Max sequence length for outputs: 59


In [64]:
## Building the Model
# NOTE: the inference cell later fetches these layers by position
# (training_model.layers[2], [3] and [4]), so adding layers or changing the
# structure here means those indices must be re-checked.

# Encoder input: sequences of any length (None) of one-hot vectors over the
# input vocabulary.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
# return_state=True makes the LSTM return its final hidden and cell states
# in addition to its output.
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_hidden, state_cell = encoder_lstm(encoder_inputs)

# Discard 'encoder_outputs' and keep only the states
encoder_states = [state_hidden, state_cell]

# Decoder input: sequences of any length of one-hot vectors over the target
# vocabulary. The decoder starts from 'encoder_states'.
decoder_inputs = Input(shape=(None, num_decoder_tokens))

# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
# The softmax layer turns each decoder output step into a probability
# distribution over the target vocabulary.
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will be trained: it maps
# [encoder_inputs, decoder_inputs] to decoder_outputs.
training_model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [65]:
## Training the Model
# The targets are one-hot vectors over the target vocabulary, hence the
# categorical cross-entropy loss.
training_model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# The model receives the source sentence and the target sentence as decoder
# input, and learns to predict the target shifted one character ahead.
# 20% of the samples are held out for validation.
training_model.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_target_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
)

# Persist the trained model so the inference cell can reload it from disk.
training_model.save('training_model.h5')

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [66]:
## Setup for Testing
# Build the sampling (inference) models from the saved training model.
# The training model only works when the target sequence is already known,
# because it is fed the true previous character at every step (teacher
# forcing). At inference time we do not know what the French should be for
# the English we pass in, so the decoder has to run one step at a time,
# feeding its own prediction back in.
training_model = load_model('training_model.h5')

# Encoder model: source sequence -> final [hidden, cell] states.
# layers[2] is expected to be the encoder LSTM of the training model.
encoder_inputs = training_model.input[0]
encoder_outputs, state_h_enc, state_c_enc = training_model.layers[2].output
encoder_states = [state_h_enc, state_c_enc]
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder model: the states become explicit inputs so that they can be
# supplied, and read back, at every decoding step.
decoder_inputs = training_model.input[1]
decoder_state_input_hidden = Input(shape=(latent_dim,))
decoder_state_input_cell = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_hidden, decoder_state_input_cell]

# layers[3] is expected to be the trained decoder LSTM; it is reused here.
decoder_lstm = training_model.layers[3]
decoder_outputs, state_hidden, state_cell = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_hidden, state_cell]

# layers[4] is expected to be the trained softmax output layer.
decoder_dense = training_model.layers[4]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs, *decoder_states_inputs],
    [decoder_outputs, *decoder_states])

# Index -> token dictionaries, the inverse of the feature dictionaries
reverse_input_features_dict = {
    i: token for token, i in input_features_dict.items()}
reverse_target_features_dict = {
    i: token for token, i in target_features_dict.items()}

def decode_sequence(test_input):
    """Translate one encoded source sentence by greedy, step-by-step decoding.

    Args:
        test_input: encoder input passed straight to ``encoder_model.predict``.

    Returns:
        The decoded string. Decoding ends once the end-of-sequence character
        (newline) has been produced, or once the result is longer than
        ``max_decoder_seq_length``; the last character produced is included.
    """
    # The encoder's final states are the decoder's starting states.
    states_value = encoder_model.predict(test_input)

    # Decoding begins from the start-of-sequence character (tab).
    sampled_token_index = target_features_dict['\t']
    decoded_sentence = ''

    while True:
        # One-hot encode the most recent character as a length-1 sequence.
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.0

        # One decoder step: token probabilities plus the updated states.
        output_tokens, hidden_state, cell_state = decoder_model.predict(
            [target_seq] + states_value)
        states_value = [hidden_state, cell_state]

        # Greedy choice: keep the most probable token.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token = reverse_target_features_dict[sampled_token_index]
        decoded_sentence += sampled_token

        # Exit condition: either find the stop token or exceed the max length.
        if sampled_token == '\n' or len(decoded_sentence) > max_decoder_seq_length:
            return decoded_sentence

In [69]:
## Testing
# Decode 20 sentences taken from the training set (indices 100 to 119)
# and show each next to its source sentence.
for seq_index in range(100, 120):
    # Slice rather than index so the leading sample axis is kept.
    test_input = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(test_input)
    print('-')
    print('Input sentence:', input_docs[seq_index])
    print('Decoded sentence:', decoded_sentence)

-
Input sentence: Hop in.
Decoded sentence: Montez.

-
Input sentence: Hug me.
Decoded sentence: Serrez-moi dans vos bras !

-
Input sentence: Hug me.
Decoded sentence: Serrez-moi dans vos bras !

-
Input sentence: I fell.
Decoded sentence: Je suis tombée.

-
Input sentence: I fell.
Decoded sentence: Je suis tombée.

-
Input sentence: I fled.
Decoded sentence: J'ai fui.

-
Input sentence: I knit.
Decoded sentence: Je tricote.

-
Input sentence: I know.
Decoded sentence: Je sais.

-
Input sentence: I left.
Decoded sentence: Je suis partie.

-
Input sentence: I left.
Decoded sentence: Je suis partie.

-
Input sentence: I lied.
Decoded sentence: J'ai menti.

-
Input sentence: I lost.
Decoded sentence: J'ai perdu.

-
Input sentence: I paid.
Decoded sentence: Je payai.

-
Input sentence: I paid.
Decoded sentence: Je payai.

-
Input sentence: I paid.
Decoded sentence: Je payai.

-
Input sentence: I quit.
Decoded sentence: J'arrête.

-
Input sentence: I quit.
Decoded sentence: J'arrête.

-
In

The current model works quite well for English-French translation. However, it performs poorly on English-Spanish translation. The next step is to use Word2Vec embeddings in place of the one-hot encodings, so that context can be tracked and computation is faster.