In [None]:
import os

import pandas as pd
import sentencepiece as spm

# Path of the training CSV; used both for the DataFrame and as the
# SentencePiece training corpus.
TRAIN_CSV: str = "en_train.csv"

# Load dataset: drop the unnamed first CSV column and any row with a missing value.
df_train = pd.read_csv(TRAIN_CSV).drop(columns=["Unnamed: 0"]).dropna()

# File of token model
TOKEN_MODEL_FILE: str = "subword_model"
TOKEN_MODEL_FILE_FULL: str = f"{TOKEN_MODEL_FILE}.model"

# Id reserved for padding. Keras `pad_sequences` pads with 0 by default and
# `Embedding(mask_zero=True)` masks id 0, so id 0 must not be a real token.
# SentencePiece's defaults put <unk> at 0 and define no pad piece, hence the
# explicit id layout passed to the trainer below.
PAD_ID: int = 0

# Train a SentencePiece model (if not already trained)
# NOTE(review): the trainer reads the raw CSV line by line, so the index/class
# columns and the header also end up in the corpus — confirm this is intended.
if not os.path.exists(TOKEN_MODEL_FILE_FULL):
    print("SentencePiece model does not exist, training one right now!")
    spm.SentencePieceTrainer.Train(
        f"--input={TRAIN_CSV} --model_prefix={TOKEN_MODEL_FILE}"
        f" --pad_id={PAD_ID} --unk_id=1 --bos_id=2 --eos_id=3"
    )

# Load SentencePiece model
sp = spm.SentencePieceProcessor()
sp.Load(TOKEN_MODEL_FILE_FULL)

# A model file cached from an earlier run keeps whatever id layout it was
# trained with; surface the mismatch instead of silently padding with <unk>.
if sp.pad_id() != PAD_ID:
    print(
        f"Warning: {TOKEN_MODEL_FILE_FULL} does not reserve id {PAD_ID} for padding; "
        "delete it and re-run this cell to retrain the tokenizer."
    )


# Tokenization functions
def encode_text(text):
    """Tokenize ``text`` with the notebook-level SentencePiece processor ``sp``.

    Args:
        text: Input handed unchanged to ``sp.encode``.

    Returns:
        The result of ``sp.encode`` when asked for integer output
        (``out_type=int``), i.e. subword ids rather than piece strings.
    """
    token_ids = sp.encode(text, out_type=int)
    return token_ids


def decode_text(sequence):
    """Turn token ids back into text using the notebook-level processor ``sp``.

    Args:
        sequence: Token ids handed unchanged to ``sp.decode``.

    Returns:
        Whatever ``sp.decode`` produces for ``sequence``.
    """
    decoded = sp.decode(sequence)
    return decoded

In [None]:
import numpy as np
from tensorflow.keras.layers import LSTM, Dense, Embedding, Input
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Convert text to sequences of subword ids
X_train = df_train["before"].apply(encode_text)
y_train = df_train["after"].apply(encode_text)

# Define sequence length (longer sequences are truncated, shorter ones padded)
max_len = 20

# Start / end markers for the decoder, taken from the tokenizer.
bos_id = sp.bos_id()
eos_id = sp.eos_id()
if bos_id < 0 or eos_id < 0:
    raise ValueError(
        "The SentencePiece model must define BOS and EOS ids for seq2seq training."
    )

# Teacher forcing: at step t the decoder sees token t-1 and must predict token t.
#   decoder input : [BOS, y0, y1, ...]
#   decoder target: [y0,  y1, ..., EOS]
# Without the BOS prefix the first output token is never a training target and
# there is nothing to feed the decoder at inference time; without EOS the model
# never learns an explicit end-of-sequence signal.
decoder_input_ids = [[bos_id, *ids] for ids in y_train]
decoder_target_ids = [[*ids, eos_id] for ids in y_train]

# Ensure proper right-padding (pad value is 0, the id masked by mask_zero below)
X_train_padded = pad_sequences(
    X_train, maxlen=max_len, padding="post", truncating="post"
)
decoder_input_padded = pad_sequences(
    decoder_input_ids, maxlen=max_len, padding="post", truncating="post"
)
decoder_target_padded = pad_sequences(
    decoder_target_ids, maxlen=max_len, padding="post", truncating="post"
)

# Earlier names for the decoder input / decoder target arrays, kept as aliases
# so any other cell that refers to them still resolves.
y_train_padded = decoder_input_padded
y_train_shifted = decoder_target_padded

# Vocabulary sizes
input_vocab_size = sp.GetPieceSize()
output_vocab_size = input_vocab_size  # Both input and output vocab are the same

# Model Hyperparameters
embedding_dim = 128
hidden_units = 256

# Encoder: embeds the source ids and keeps only the final LSTM state (h, c).
# mask_zero=True makes the Embedding emit a mask for id-0 (padding) positions.
encoder_inputs = Input(shape=(max_len,))
enc_emb = Embedding(input_vocab_size, embedding_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(
    hidden_units, return_state=True, recurrent_initializer="glorot_uniform", unroll=True
)
_, state_h, state_c = encoder_lstm(enc_emb)
encoder_states = [state_h, state_c]

# Decoder: starts from the encoder state and emits one vocabulary
# distribution per time step (return_sequences=True + softmax Dense).
decoder_inputs = Input(shape=(max_len,))
dec_emb_layer = Embedding(output_vocab_size, embedding_dim, mask_zero=True)
dec_emb = dec_emb_layer(decoder_inputs)
decoder_lstm = LSTM(
    hidden_units,
    return_sequences=True,
    return_state=True,
    recurrent_initializer="glorot_uniform",
    unroll=True,
)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = Dense(output_vocab_size, activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

# Define and compile model; targets are integer ids, hence the sparse loss.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(
    optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)

# Train the model (last 20% of the rows are held out for validation)
model.fit(
    [X_train_padded, decoder_input_padded],
    decoder_target_padded,
    epochs=10,
    validation_split=0.2,
)

# Save Model
model.save("text_normalization_model.keras")

In [None]:
# Inference Function
def predict_normalized_text(input_text):
    """Normalize ``input_text`` with greedy autoregressive decoding.

    The encoder input is tokenized and right-padded to ``max_len``. The decoder
    input starts as ``[BOS, 0, 0, ...]``; after every forward pass the most
    likely token at the current step is written into the next decoder slot, so
    each prediction is conditioned on the tokens generated so far. Decoding
    stops at EOS, at the padding id (0), or after ``max_len`` steps.

    Feeding an all-zero decoder input in a single pass does not work: every
    decoder position is then masked padding, so the output cannot depend on
    previously generated tokens.

    NOTE(review): the first generated token is only meaningful if the decoder
    was trained on inputs that begin with this BOS id — confirm against the
    training cell.

    Args:
        input_text: Raw text to normalize.

    Returns:
        The decoded text for the generated token ids (special ids excluded).

    Raises:
        ValueError: If the tokenizer does not define BOS/EOS ids.
    """
    pad_id = 0  # value used by pad_sequences and masked by mask_zero=True
    bos_id = sp.bos_id()
    eos_id = sp.eos_id()
    if bos_id < 0 or eos_id < 0:
        raise ValueError(
            "The SentencePiece model must define BOS and EOS ids for decoding."
        )

    encoder_seq = pad_sequences(
        [encode_text(input_text)], maxlen=max_len, padding="post", truncating="post"
    )

    # Same (1, max_len) shape as the encoder input; only slot 0 is filled so far.
    decoder_seq = np.zeros_like(encoder_seq)
    decoder_seq[0, 0] = bos_id

    generated = []
    for step in range(max_len):
        # The model has a fixed input length, so the whole (partially filled)
        # decoder sequence is re-run and only position `step` is read.
        probs = model.predict([encoder_seq, decoder_seq], verbose=0)
        next_id = int(np.argmax(probs[0, step]))
        if next_id in (eos_id, pad_id):
            break
        generated.append(next_id)
        if step + 1 < max_len:
            decoder_seq[0, step + 1] = next_id

    return decode_text(generated)


# Example predictions: each sample is normalized and printed in turn; the
# trailing comment on a sample states the output it is expected to produce.
for sample in (
    "12:47",  # Expected: "twelve forty-seven"
    "$3.16",  # Expected: "three dollars, sixteen cents"
):
    print(predict_normalized_text(sample))