In [1]:
import os
import sys
import numpy as np
import pandas as pd
import pickle
import re
import unicodedata
import tensorflow as tf
import matplotlib.pyplot as plt

from tensorflow.keras.layers import Input, Embedding, Dense, LayerNormalization, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

2025-12-14 14:23:18.694336: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-12-14 14:23:18.694577: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-12-14 14:23:18.729419: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-12-14 14:23:20.288550: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off,

In [3]:
# config
EMBED_DIM = 128
LATENT_DIM = 256
NUM_HEADS = 4
BATCH_SIZE = 64
EPOCHS = 50

BASE_DIR = os.path.dirname(os.path.abspath('.'))
DATA_PATH = os.path.join(BASE_DIR, "data", "raw", "eng_khm_data.csv")
MODEL_PATH = os.path.join(BASE_DIR, "models", "transformer_romanizer.keras")
ASSETS_PATH = os.path.join(BASE_DIR, "data", "processed", "transformer_romanization_assets.pkl")
RESULTS_PATH = os.path.join(BASE_DIR, "results")

os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
os.makedirs(os.path.dirname(ASSETS_PATH), exist_ok=True)
os.makedirs(RESULTS_PATH, exist_ok=True)

In [4]:
# load and preprocess data
df = pd.read_csv(DATA_PATH)
dataset = []

for _, row in df.iterrows():
    normalized_khm = re.sub(r"[^\u1780-\u17FF]", "", row['khm'])
    normalized_khm = unicodedata.normalize('NFC', normalized_khm)
    
    normalized_eng = re.sub(r"[^a-z]", "", row['eng'].lower())

    dataset.append((normalized_khm, normalized_eng))

print(f"Dataset size: {len(dataset)}")
print("Sample pairs:", dataset[:3])

Dataset size: 28576
Sample pairs: [('ប្រដែ', 'brodae'), ('អសង្ខៃយ', 'aasangkheyy'), ('ឆាតកភ័យ', 'chhatkophey')]


In [5]:
# tokenize
khm_tokenizer = Tokenizer(char_level=True, filters='', oov_token='<unk>')
khm_tokenizer.fit_on_texts([pair[0] for pair in dataset])

eng_tokenizer = Tokenizer(char_level=True, filters='', oov_token='<unk>')
eng_tokenizer.fit_on_texts(["\t", "\n"] + [pair[1] for pair in dataset])

print(f"Khmer vocab size: {len(khm_tokenizer.word_index) + 1}")
print(f"English vocab size: {len(eng_tokenizer.word_index) + 1}")

Khmer vocab size: 79
English vocab size: 30


In [7]:
# create sequences
encoder_inputs, decoder_inputs, decoder_targets = [], [], []
max_khm_len = max(len(pair[0]) for pair in dataset) or 1
max_eng_len = max(len(pair[1]) for pair in dataset) or 1

for khm, eng in dataset:
    # encoder sequence (khmer)
    khm_seq = khm_tokenizer.texts_to_sequences([khm])[0]
    encoder_inputs.append(khm_seq)

    # decoder sequences (english)
    eng_seq = eng_tokenizer.texts_to_sequences([eng])[0]
    decoder_input = [eng_tokenizer.word_index['\t']] + eng_seq
    decoder_target = eng_seq + [eng_tokenizer.word_index['\n']]
    
    decoder_inputs.append(decoder_input)
    decoder_targets.append(decoder_target)

X_train_full = pad_sequences(encoder_inputs, maxlen=max_khm_len, padding='post')
decoder_input_data_full = pad_sequences(decoder_inputs, maxlen=max_eng_len + 1, padding='post')
decoder_target_data_full = pad_sequences(decoder_targets, maxlen=max_eng_len + 1, padding='post')

# Train/test split
X_train, X_test, dec_in_train, dec_in_test, dec_tgt_train, dec_tgt_test = train_test_split(
    X_train_full, decoder_input_data_full, decoder_target_data_full, 
    test_size=0.2, random_state=42
)

num_encoder_tokens = len(khm_tokenizer.word_index) + 1
num_decoder_tokens = len(eng_tokenizer.word_index) + 1

print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")
print(f"Max Khmer length: {max_khm_len}")
print(f"Max English length: {max_eng_len}")

Training samples: 22860
Test samples: 5716
Max Khmer length: 24
Max English length: 25


In [8]:
# build transformer model
# Encoder
encoder_inputs = Input(shape=(None,), name='encoder_inputs')
enc_emb = Embedding(input_dim=num_encoder_tokens, output_dim=EMBED_DIM)(encoder_inputs)
encoder_dense = Dense(LATENT_DIM, activation='relu')(enc_emb)

# Decoder
decoder_inputs = Input(shape=(None,), name='decoder_inputs')
dec_emb = Embedding(input_dim=num_decoder_tokens, output_dim=EMBED_DIM)(decoder_inputs)
decoder_dense_input = Dense(LATENT_DIM, activation='relu')(dec_emb)

# Multi-head attention (encoder->decoder)
attention_output = tf.keras.layers.MultiHeadAttention(
    num_heads=NUM_HEADS, key_dim=LATENT_DIM
)(decoder_dense_input, encoder_dense)
attention_output = LayerNormalization()(attention_output + decoder_dense_input)
dropout_output = Dropout(0.2)(attention_output)

# Output layer
decoder_outputs = Dense(num_decoder_tokens, activation='softmax')(dropout_output)

# Build and compile model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs, name='transformer_seq2seq')
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

E0000 00:00:1765697120.550638  168107 cuda_executor.cc:1309] INTERNAL: CUDA Runtime error: Failed call to cudaGetRuntimeVersion: Error loading CUDA libraries. GPU will not be used.: Error loading CUDA libraries. GPU will not be used.
W0000 00:00:1765697120.560534  168107 gpu_device.cc:2342] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [9]:
# train
checkpoint = ModelCheckpoint(MODEL_PATH, save_best_only=True, monitor='val_loss', verbose=1)

history = model.fit(
    [X_train, dec_in_train],
    np.expand_dims(dec_tgt_train, -1),
    validation_data=([X_test, dec_in_test], np.expand_dims(dec_tgt_test, -1)),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    callbacks=[checkpoint]
)

Epoch 1/50
[1m110/358[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m16s[0m 65ms/step - accuracy: 0.7061 - loss: 1.0200

KeyboardInterrupt: 

In [None]:
# save assets
assets = {
    "khm_tokenizer": khm_tokenizer,
    "eng_tokenizer": eng_tokenizer,
    "max_khm_len": max_khm_len,
    "max_eng_len": max_eng_len
}

with open(ASSETS_PATH, "wb") as file:
    pickle.dump(assets, file)

print(f"Model saved to: {MODEL_PATH}")
print(f"Assets saved to: {ASSETS_PATH}")

In [None]:
# plot training curves
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Transformer Seq2Seq Loss (Khmer->English)')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Transformer Seq2Seq Accuracy (Khmer->English)')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.tight_layout()
plt.savefig(os.path.join(RESULTS_PATH, 'transformer_training_curves.png'))
plt.show()


NameError: name 'DATA_PATH' is not defined