In [None]:
import os

import pandas as pd
import sentencepiece as spm

# Load dataset
df_train = pd.read_csv("en_train.csv")

# File of token model
TOKEN_MODEL_FILE: str = "subword_model"
TOKEN_MODEL_FILE_FULL: str = f"{TOKEN_MODEL_FILE}.model"

# Train a SentencePiece model (if not already trained)
if not os.path.exists(TOKEN_MODEL_FILE_FULL):
    print("SentencePiece model does not exist, training one right now!")
    # Reserve id 0 for <pad>. SentencePiece's defaults put <unk> at id 0 and
    # disable <pad>, but the rest of this notebook pads sequences with 0 and
    # builds its Embedding layers with mask_zero=True, so with the defaults
    # every unknown token would be masked as padding and every padded position
    # would decode as <unk>.
    # NOTE(review): the trainer reads the raw CSV lines (header and delimiters
    # included), not only the text that is tokenized later.
    # NOTE(review): a model file already on disk is reused as-is; delete it to
    # retrain with these ids.
    spm.SentencePieceTrainer.Train(
        f"--input=en_train.csv --model_prefix={TOKEN_MODEL_FILE} "
        "--pad_id=0 --unk_id=1 --bos_id=2 --eos_id=3"
    )

# Load SentencePiece model
sp: spm.SentencePieceProcessor = spm.SentencePieceProcessor()
sp.Load(TOKEN_MODEL_FILE_FULL)


# Tokenization functions
def encode_text(text):
    """Tokenize ``text`` with the module-level SentencePiece processor ``sp``.

    Args:
        text: Input forwarded unchanged to ``sp.encode``.

    Returns:
        Whatever ``sp.encode`` returns for ``text`` (its default output type).
    """
    token_ids = sp.encode(text)
    return token_ids


def decode_text(sequence):
    """Turn a token sequence back into text via the module-level processor ``sp``.

    Args:
        sequence: Tokens forwarded unchanged to ``sp.decode``.

    Returns:
        Whatever ``sp.decode`` returns for ``sequence``.
    """
    decoded = sp.decode(sequence)
    return decoded

In [None]:
import numpy as np
from tensorflow.keras.layers import LSTM, Dense, Embedding, Input
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fixed sequence length: longer sequences are cut, shorter ones are padded
max_len = 10

# Tokenize the "before" (model input) and "after" (model target) columns
X_train = df_train["before"].apply(encode_text)
y_train = df_train["after"].apply(encode_text)

# Pad and truncate on the right so every row has exactly max_len ids
_pad_options = {"maxlen": max_len, "padding": "post", "truncating": "post"}
X_train_padded = pad_sequences(X_train, **_pad_options)
y_train_padded = pad_sequences(y_train, **_pad_options)

# Model hyperparameters
embedding_dim = 128
hidden_units = 256

# Input and output text share one tokenizer, so both vocab sizes are equal
input_vocab_size = sp.GetPieceSize()
output_vocab_size = input_vocab_size

# Encoder: embed the source ids (id 0 is masked as padding) and keep only the
# LSTM's final hidden/cell state
encoder_inputs = Input(shape=(max_len,))
_encoder_embedding = Embedding(input_vocab_size, embedding_dim, mask_zero=True)
enc_emb = _encoder_embedding(encoder_inputs)
encoder_lstm = LSTM(hidden_units, return_state=True)
_encoder_last_output, state_h, state_c = encoder_lstm(enc_emb)
encoder_states = [state_h, state_c]

# Decoder: embed the target-side ids and run an LSTM that starts from the
# encoder's final state and emits one output per timestep
decoder_inputs = Input(shape=(max_len,))
_decoder_embedding = Embedding(output_vocab_size, embedding_dim, mask_zero=True)
dec_emb = _decoder_embedding(decoder_inputs)
decoder_lstm = LSTM(hidden_units, return_sequences=True)
decoder_outputs = decoder_lstm(dec_emb, initial_state=encoder_states)

# Output layer: per-timestep softmax over the vocabulary
_output_layer = Dense(output_vocab_size, activation="softmax")
output = _output_layer(decoder_outputs)


MODEL_NAME: str = "text_normalization_model.keras"

# Reuse a previously saved model when one exists; otherwise assemble a fresh
# one from the encoder/decoder graph defined above and compile it
if not os.path.exists(MODEL_NAME):
    model = Model([encoder_inputs, decoder_inputs], output)
    model.compile(
        optimizer="adam",
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )
else:
    model = load_model(MODEL_NAME)

In [None]:
# Prepare decoder inputs for teacher forcing: the target shifted RIGHT by one
# position with the tokenizer's BOS id in the first slot. At step t the decoder
# then sees the tokens before t and is trained to emit token t, so the first
# target token is also learned and decoding can start from BOS alone.
# (Feeding the unshifted target and training on a left-shifted copy never
# teaches the first token and leaves no start symbol to begin decoding from.)
_bos_id = sp.bos_id()
if _bos_id <= 0:
    # id 0 is the padding/mask value and a negative id means BOS is disabled
    raise ValueError("The SentencePiece model must define a BOS id greater than 0")

y_train_shifted = np.zeros_like(y_train_padded)
y_train_shifted[:, 0] = _bos_id
y_train_shifted[:, 1:] = y_train_padded[:, :-1]

# Add callbacks for better training: stop once val_loss has not improved for 5
# epochs (restoring the best weights) and checkpoint the best model to disk
callbacks = [
    EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True),
    ModelCheckpoint(
        MODEL_NAME, monitor="val_loss", save_best_only=True, mode="min", verbose=1
    ),
]

# Train with validation split and callbacks.
# Inputs: [encoder ids, BOS-prefixed decoder ids]; labels: the unshifted target.
# NOTE(review): inference must seed the decoder with the same BOS id.
history = model.fit(
    [X_train_padded, y_train_shifted],
    y_train_padded,
    epochs=50,  # More epochs with early stopping
    validation_split=0.2,
    callbacks=callbacks,
)

In [None]:
# Reload the model stored at MODEL_NAME (the path the training checkpoint
# callback writes to) so inference uses the saved model rather than whatever
# is currently held in memory.
model = load_model(MODEL_NAME)


# Inference Function
def predict_normalized_text(input_text):
    """Normalize ``input_text`` by greedy autoregressive decoding.

    The input is tokenized and right-padded/truncated to ``max_len``. The
    decoder is seeded with the tokenizer's BOS id and then, one step at a
    time, fed its own most likely prediction until it emits the padding id
    (0) or ``max_len`` tokens have been produced.

    Feeding an all-zero decoder input instead does not work: with
    ``mask_zero=True`` every decoder timestep is masked, so the output
    cannot depend on previously generated tokens.

    NOTE(review): this is only meaningful if training fed the decoder a
    BOS-prefixed, right-shifted copy of the target -- confirm against the
    training cell.

    Args:
        input_text: Raw text to normalize.

    Returns:
        The decoded prediction, as returned by ``decode_text``.

    Raises:
        ValueError: If the tokenizer has no usable BOS id (disabled, or
            colliding with the padding id 0).
    """
    encoder_input = pad_sequences(
        [encode_text(input_text)], maxlen=max_len, padding="post", truncating="post"
    )

    bos_id = sp.bos_id()
    if bos_id <= 0:
        raise ValueError("The SentencePiece model must define a BOS id greater than 0")

    # Start with BOS followed by padding; later slots are filled as we decode.
    decoder_input = np.zeros_like(encoder_input)
    decoder_input[0, 0] = bos_id

    predicted_ids = []
    for step in range(max_len):
        probabilities = model.predict([encoder_input, decoder_input], verbose=0)
        next_id = int(np.argmax(probabilities[0, step]))
        if next_id == 0:
            # id 0 is the padding value, which training targets place right
            # after the last real token: treat it as end-of-sequence.
            break
        predicted_ids.append(next_id)
        if step + 1 < max_len:
            decoder_input[0, step + 1] = next_id

    return decode_text(predicted_ids)


# Example predictions: print the model's normalization of a few sample inputs
for _example_text in ("Jul 15", "November 26, 1903", "Australian"):
    print(predict_normalized_text(_example_text))