### This notebook leverages a state-of-the-art Transformer model for high-quality Romanized-to-Khmer translation.

In [1]:
import numpy as np
import pandas as pd
from keras.models import Model
from keras.layers import Input, Dense, Embedding, Dropout, LayerNormalization
from keras.layers import MultiHeadAttention, GlobalAveragePooling1D, Add
from keras.optimizers import Adam

# Read the parallel corpora; the CSVs have no header row and only column 0 is used.
data_rom = pd.read_csv("csv/data_rom_500.csv", header=None)  # source side: Romanized text
data_kh = pd.read_csv("csv/data_kh_500.csv", header=None)    # target side: Khmer text

# Training hyperparameters.
batch_size = 32   # samples per gradient step
epochs = 100      # full passes over the training data
latent_dim = 256  # latent dimensionality of the encoding space

# Source sentences: stringified and stripped of surrounding whitespace.
input_texts = [str(raw).strip() for raw in data_rom[0]]

# Target sentences: wrapped in '\t' (start marker) and '\n' (end marker).
target_texts = ['\t' + str(raw).strip() + '\n' for raw in data_kh[0]]

# Character vocabularies, sorted so token indices are reproducible across runs.
input_characters = sorted(set("".join(input_texts)))
target_characters = sorted(set("".join(target_texts)))

num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
max_encoder_seq_length = max(map(len, input_texts))
max_decoder_seq_length = max(map(len, target_texts))

# Report corpus statistics.
for label, value in (
    ('Number of samples:', len(input_texts)),
    ('Number of unique input tokens:', num_encoder_tokens),
    ('Number of unique output tokens:', num_decoder_tokens),
    ('Max sequence length for inputs:', max_encoder_seq_length),
    ('Max sequence length for outputs:', max_decoder_seq_length),
):
    print(label, value)

# Character -> integer index lookups.
input_token_index = {char: i for i, char in enumerate(input_characters)}
target_token_index = {char: i for i, char in enumerate(target_characters)}

# Allocate zero-filled one-hot tensors of shape (samples, timesteps, vocabulary).
# Positions past the end of a sequence stay all-zero (padding).
num_samples = len(input_texts)
encoder_shape = (num_samples, max_encoder_seq_length, num_encoder_tokens)
decoder_shape = (num_samples, max_decoder_seq_length, num_decoder_tokens)

encoder_input_data = np.zeros(encoder_shape, dtype="float32")
decoder_input_data = np.zeros(decoder_shape, dtype="float32")
decoder_target_data = np.zeros(decoder_shape, dtype="float32")

# Fill in the one-hot entries sample by sample.
for row, (source, target) in enumerate(zip(input_texts, target_texts)):
    for step, ch in enumerate(source):
        encoder_input_data[row, step, input_token_index[ch]] = 1.0
    for step, ch in enumerate(target):
        token = target_token_index[ch]
        decoder_input_data[row, step, token] = 1.0
        if step > 0:
            # The target is the decoder input shifted left by one timestep,
            # so it omits the leading start marker.
            decoder_target_data[row, step - 1, token] = 1.0

# Define Transformer block (Self-Attention and Feed Forward)
# Define Transformer block (Self-Attention and Feed Forward)
def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0.1):
    """Apply one encoder block to ``inputs`` and return the resulting tensor.

    The block runs multi-head self-attention over ``inputs``, then dropout and
    layer normalization, then a two-layer feed-forward network whose second
    layer projects back to the last dimension of ``inputs``. The feed-forward
    result is added to the original ``inputs``.

    Args:
        inputs: Tensor the block is applied to.
        head_size: ``key_dim`` passed to ``MultiHeadAttention``.
        num_heads: Number of attention heads.
        ff_dim: Units of the hidden (ReLU) feed-forward layer.
        dropout: Rate used by both ``Dropout`` layers.

    Returns:
        The sum of ``inputs`` and the feed-forward output.
    """
    # Self-attention: the sequence attends to itself (query == value).
    hidden = MultiHeadAttention(num_heads=num_heads, key_dim=head_size)(inputs, inputs)

    # Remaining layers are applied strictly in sequence.
    pipeline = (
        Dropout(dropout),
        LayerNormalization(),
        Dense(ff_dim, activation="relu"),
        Dropout(dropout),
        Dense(inputs.shape[-1]),  # project back so the residual add is shape-compatible
    )
    for layer in pipeline:
        hidden = layer(hidden)

    return Add()([inputs, hidden])

def transformer_decoder(inputs, encoder_output, head_size, num_heads, ff_dim, dropout=0.1):
    """Apply one decoder block to ``inputs`` and return the resulting tensor.

    The block runs causally masked multi-head self-attention over ``inputs``,
    then cross-attention from that result onto ``encoder_output``, then a
    two-layer feed-forward network whose second layer projects back to the
    last dimension of ``inputs``. Dropout and layer normalization follow each
    attention step. The feed-forward result is added to the normalized
    self-attention output.

    Args:
        inputs: Decoder-side tensor the block is applied to.
        encoder_output: Encoder tensor used as the value/key of cross-attention.
        head_size: ``key_dim`` passed to both ``MultiHeadAttention`` layers.
        num_heads: Number of attention heads.
        ff_dim: Units of the hidden (ReLU) feed-forward layer.
        dropout: Rate used by every ``Dropout`` layer in the block.

    Returns:
        The sum of the normalized self-attention output and the feed-forward
        output.
    """
    # Self-Attention Layer for Decoder.
    # The causal mask stops position t from attending to positions > t.
    # Without it, teacher-forced training can read the next target token
    # directly from the decoder input, and the model is useless for
    # step-by-step generation.
    attention = MultiHeadAttention(num_heads=num_heads, key_dim=head_size)(
        inputs, inputs, use_causal_mask=True)
    attention = Dropout(dropout)(attention)
    attention = LayerNormalization()(attention)

    # Encoder-Decoder Attention Layer: queries come from the decoder,
    # values/keys from the encoder, so no causal mask is applied here.
    cross_attention = MultiHeadAttention(num_heads=num_heads, key_dim=head_size)(attention, encoder_output)
    cross_attention = Dropout(dropout)(cross_attention)
    cross_attention = LayerNormalization()(cross_attention)

    # Feed Forward Layer; the second Dense restores the input feature size so
    # the residual add below is shape-compatible.
    ff = Dense(ff_dim, activation="relu")(cross_attention)
    ff = Dropout(dropout)(ff)
    ff = Dense(inputs.shape[-1])(ff)
    return Add()([attention, ff])

# Transformer hyperparameters
head_size = 64   # key_dim of every attention layer
num_heads = 8    # attention heads per layer
ff_dim = 256     # hidden units of each feed-forward sub-layer
num_layers = 4   # number of stacked encoder blocks and of decoder blocks

# Encoder: a stack of encoder blocks over the one-hot source sequence.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder_output = encoder_inputs
for _ in range(num_layers):
    encoder_output = transformer_encoder(
        encoder_output, head_size=head_size, num_heads=num_heads, ff_dim=ff_dim)

# Decoder: a stack of decoder blocks, each attending to the final encoder
# output, followed by a softmax over the target vocabulary at every timestep.
decoder_inputs = Input(shape=(None, num_decoder_tokens))
x = decoder_inputs
for _ in range(num_layers):
    x = transformer_decoder(
        x, encoder_output, head_size=head_size, num_heads=num_heads, ff_dim=ff_dim)
decoder_output = Dense(num_decoder_tokens, activation="softmax")(x)

# End-to-end training model: (source, shifted target) -> next-character distribution.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_output)

model.compile(
    optimizer=Adam(),
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

# Fit with teacher forcing; validation_split=0.2 holds out part of the data
# for validation.
model.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_target_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
)

# Persist the trained model to disk.
model.save("transformer_rom_to_khmer.h5")

# Create inference models
# Both models are built from tensors of the trained graph, so they share the
# trained weights instead of creating new, randomly initialized layers.
encoder_model = Model(encoder_inputs, encoder_output)

# The decoder stack cross-attends to `encoder_output`, which depends on
# `encoder_inputs`. A model that declares only a decoder-side input is
# therefore a disconnected graph, and calling `transformer_decoder` again
# would create an untrained block. The inference decoder takes both inputs
# and emits the trained softmax output.
# Greedy decoding: start from the '\t' token, predict with
# [encoder one-hot, decoder one-hot so far], append the argmax of the last
# timestep, and stop at '\n' or at max_decoder_seq_length.
decoder_state_input = decoder_inputs  # alias kept so the existing name stays defined
decoder_model = Model([encoder_inputs, decoder_inputs], decoder_output)

# Reverse-lookup token index to decode sequences back to text
reverse_input_char_index = {i: char for char, i in input_token_index.items()}
reverse_target_char_index = {i: char for char, i in target_token_index.items()}


2025-10-08 19:27:46.244145: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Number of samples: 500
Number of unique input tokens: 23
Number of unique output tokens: 68
Max sequence length for inputs: 20
Max sequence length for outputs: 13
Epoch 1/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 553ms/step - accuracy: 0.0362 - loss: 1.6617 - val_accuracy: 0.0769 - val_loss: 1.8048
Epoch 2/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 317ms/step - accuracy: 0.0658 - loss: 1.4607 - val_accuracy: 0.0769 - val_loss: 1.7089
Epoch 3/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 355ms/step - accuracy: 0.0642 - loss: 1.4226 - val_accuracy: 0.0769 - val_loss: 1.6358
Epoch 4/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 401ms/step - accuracy: 0.0525 - loss: 1.4493 - val_accuracy: 0.0769 - val_loss: 1.6393
Epoch 5/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 433ms/step - accuracy: 0.0390 - loss: 1.5592 - val_accuracy: 0.0769 - val_loss: 1.6785
Epoch 6/100
[1m13/13

