# 9. Encoder-Decoder for converting dates

_Exercise: Train an Encoder–Decoder model that can convert a date string from one format to another (e.g., from "April 22, 2019" to "2019-04-22")._

## Set up environment

In [89]:
import datetime

import random

from sklearn.model_selection import train_test_split

import keras

import tensorflow as tf

import numpy as np

## Prepare data

In [64]:
SAMPLE_SIZE = 10000

In [65]:
def generate_random_dates(start_year, end_year, count):
    random_dates = []
    for _ in range(count):
        year = random.randint(start_year, end_year)
        month = random.randint(1, 12)
        day = random.randint(1, 28)  # To avoid invalid dates
        random_date = datetime.date(year, month, day)
        random_dates.append(random_date)
    return random_dates

In [66]:
dates = generate_random_dates(1900, 2023, SAMPLE_SIZE)
X = [date.strftime("%B %d, %Y") for date in dates]
y = list(map(lambda date: date.isoformat(), dates))

In [67]:
X[:5]

['September 10, 1917',
 'June 21, 2011',
 'January 07, 1969',
 'September 06, 1917',
 'April 15, 2001']

In [68]:
y[:5]

['1917-09-10', '2011-06-21', '1969-01-07', '1917-09-06', '2001-04-15']

In [69]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)
X_valid, X_test, y_valid, y_test = train_test_split(X_test, y_test, test_size=0.5)

print(len(X_train))
print(len(X_valid))
print(len(X_test))

8000
1000
1000


## Train model

In [70]:
START_OF_SEQ = "<"
END_OF_SEQ = ">"

DELIMITER = "-"
COMMA = ","
SPACE = " "

In [71]:
DAYS_MONTHS = [f"{day:02}" for day in range(1, 32)]
MONTHS = [
    "January",
    "February",
    "March",
    "April",
    "May",
    "June",
    "July",
    "August",
    "September",
    "October",
    "November",
    "December",
]
YEARS = [str(year) for year in range (1900, 2100)]

In [72]:
@tf.function
def split_dash(input_str):
  return tf.strings.split(input_str, sep="-")

In [73]:
text_vec_layer_read = keras.layers.TextVectorization(standardize="strip_punctuation",
    vocabulary=DAYS_MONTHS + MONTHS + YEARS
)

text_vec_layer_iso = keras.layers.TextVectorization(standardize=None, split=split_dash,
    vocabulary=DAYS_MONTHS + YEARS + [START_OF_SEQ, END_OF_SEQ, DELIMITER]
)

In [74]:
VOCAB_SIZE_READ = text_vec_layer_read.vocabulary_size()
print(f"Readable format vocabulary size is {VOCAB_SIZE_READ}")

VOCAB_SIZE_ISO = text_vec_layer_iso.vocabulary_size()
print(f"ISO format vocabulary size is {VOCAB_SIZE_ISO}")

Readable format vocabulary size is 245
ISO format vocabulary size is 236


In [75]:
print(text_vec_layer_read.get_vocabulary())

['', '[UNK]', '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December', '1900', '1901', '1902', '1903', '1904', '1905', '1906', '1907', '1908', '1909', '1910', '1911', '1912', '1913', '1914', '1915', '1916', '1917', '1918', '1919', '1920', '1921', '1922', '1923', '1924', '1925', '1926', '1927', '1928', '1929', '1930', '1931', '1932', '1933', '1934', '1935', '1936', '1937', '1938', '1939', '1940', '1941', '1942', '1943', '1944', '1945', '1946', '1947', '1948', '1949', '1950', '1951', '1952', '1953', '1954', '1955', '1956', '1957', '1958', '1959', '1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968', '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978', '1979', '1980', '1981', '1982', '1983', '1984'

In [76]:
print(text_vec_layer_iso.get_vocabulary())

['', '[UNK]', '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '1900', '1901', '1902', '1903', '1904', '1905', '1906', '1907', '1908', '1909', '1910', '1911', '1912', '1913', '1914', '1915', '1916', '1917', '1918', '1919', '1920', '1921', '1922', '1923', '1924', '1925', '1926', '1927', '1928', '1929', '1930', '1931', '1932', '1933', '1934', '1935', '1936', '1937', '1938', '1939', '1940', '1941', '1942', '1943', '1944', '1945', '1946', '1947', '1948', '1949', '1950', '1951', '1952', '1953', '1954', '1955', '1956', '1957', '1958', '1959', '1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968', '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', 

In [77]:
print(text_vec_layer_read("November 02, 1966"))
print(text_vec_layer_iso(f"{START_OF_SEQ}-1932-10-24-{END_OF_SEQ}"))

tf.Tensor([ 43   3 111], shape=(3,), dtype=int64)
tf.Tensor([233  65  11  25 234], shape=(5,), dtype=int64)


In [78]:
# X_train_dec = tf.constant([f"{START_OF_SEQ}-{date}" for date in y_train])
# X_valid_dec = tf.constant([f"{START_OF_SEQ}-{date}" for date in y_valid])
# X_test_dec = tf.constant([f"{START_OF_SEQ}-{date}" for date in y_test])

X_train_dec = text_vec_layer_read(X_train)
X_valid_dec = text_vec_layer_read(X_valid)
X_test_dec = text_vec_layer_read(X_test)

# y_train_dec = text_vec_layer_iso([f"{date}-{END_OF_SEQ}" for date in y_train])
# y_valid_dec = text_vec_layer_iso([f"{date}-{END_OF_SEQ}" for date in y_valid])
# y_test_dec = text_vec_layer_iso([f"{date}-{END_OF_SEQ}" for date in y_test])

y_train_dec = text_vec_layer_iso(y_train)
y_valid_dec = text_vec_layer_iso(y_valid)
y_test_dec = text_vec_layer_iso(y_test)

In [None]:
X_train = tf.constant(X_train)
X_valid = tf.constant(X_valid)
X_test = tf.constant(X_test)

In [56]:
text_vec_layer_read(X_train[:5])

<tf.Tensor: shape=(5, 3), dtype=int64, numpy=
array([[ 40,   7,  60],
       [ 36,   9,  67],
       [ 33,  25,  90],
       [ 42,  28, 149],
       [ 37,  15,  53]])>

In [57]:
text_vec_layer_iso(X_test_dec[:5])

<tf.Tensor: shape=(5, 4), dtype=int64, numpy=
array([[233,  96,   3,  10],
       [233,  67,   5,  19],
       [233, 109,   9,  24],
       [233, 126,   4,   5],
       [233,  84,   4,  17]])>

In [58]:
y_train_dec[:5]

<tf.Tensor: shape=(5, 3), dtype=int64, numpy=
array([[ 48,   9,   7],
       [ 55,   5,   9],
       [ 78,   2,  25],
       [137,  11,  28],
       [ 41,   6,  15]])>

In [34]:
encoder_inputs = keras.layers.Input(shape=[], dtype=tf.string)
decoder_inputs = keras.layers.Input(shape=[], dtype=tf.string)

In [35]:
EMBED_SIZE_READ = VOCAB_SIZE_READ // 2
EMBED_SIZE_ISO = VOCAB_SIZE_ISO // 2

encoder_input_ids = text_vec_layer_read(encoder_inputs)
decoder_input_ids = text_vec_layer_iso(decoder_inputs)

encoder_embedding_layer = keras.layers.Embedding(
    VOCAB_SIZE_READ, EMBED_SIZE_READ
)
decoder_embedding_layer = keras.layers.Embedding(
    VOCAB_SIZE_ISO, EMBED_SIZE_ISO
)

encoder_embeddings = encoder_embedding_layer(encoder_input_ids)
decoder_embeddings = decoder_embedding_layer(decoder_input_ids)

In [36]:
encoder = tf.keras.layers.LSTM(128, return_state=True)
encoder_outputs, *encoder_state = encoder(encoder_embeddings)

In [37]:
decoder = tf.keras.layers.LSTM(128, return_sequences=True)
decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_state)

In [38]:
output_layer = tf.keras.layers.Dense(EMBED_SIZE_ISO, activation="softmax")
Y_proba = output_layer(decoder_outputs)

In [39]:
model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs], outputs=[Y_proba])
model.compile(
    loss="sparse_categorical_crossentropy", optimizer="nadam", metrics=["accuracy"]
)
model.fit(
    (X_train, X_train_dec),
    y_train_dec,
    epochs=10,
    validation_data=((X_valid, X_valid_dec), y_valid_dec),
)

Epoch 1/10


[1m120/250[0m [32m━━━━━━━━━[0m[37m━━━━━━━━━━━[0m [1m2s[0m 22ms/step - accuracy: 3.4954e-04 - loss: nan

KeyboardInterrupt: 

#### Simple seq2seq model

In [80]:
embedding_size = 32
max_output_length = 3

early_stopping_cb = keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=10, restore_best_weights=True
)

encoder = keras.Sequential(
    [
        keras.layers.Embedding(
            input_dim=VOCAB_SIZE_READ + 1, output_dim=embedding_size, input_shape=[None]
        ),
        keras.layers.LSTM(128),
    ]
)

decoder = keras.Sequential(
    [
        keras.layers.LSTM(128, return_sequences=True),
        keras.layers.Dense(VOCAB_SIZE_ISO + 1, activation="softmax"),
    ]
)

model = keras.Sequential(
    [encoder, keras.layers.RepeatVector(max_output_length), decoder]
)

optimizer = keras.optimizers.Nadam()
model.compile(
    loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"]
)
history = model.fit(
    X_train_dec, y_train_dec, epochs=20, validation_data=(X_valid_dec, y_valid_dec), callbacks=[early_stopping_cb]
)

  super().__init__(**kwargs)


Epoch 1/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 20ms/step - accuracy: 0.0478 - loss: 4.4100 - val_accuracy: 0.0753 - val_loss: 3.4241
Epoch 2/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 16ms/step - accuracy: 0.1019 - loss: 3.3245 - val_accuracy: 0.3190 - val_loss: 2.7823
Epoch 3/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 18ms/step - accuracy: 0.3930 - loss: 2.5645 - val_accuracy: 0.5427 - val_loss: 2.1615
Epoch 4/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 19ms/step - accuracy: 0.5962 - loss: 2.0286 - val_accuracy: 0.6690 - val_loss: 1.7947
Epoch 5/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 19ms/step - accuracy: 0.6721 - loss: 1.7184 - val_accuracy: 0.6727 - val_loss: 1.6719
Epoch 6/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 16ms/step - accuracy: 0.6773 - loss: 1.6087 - val_accuracy: 0.6787 - val_loss: 1.5572
Epoch 7/20
[1m250/250

In [103]:
def convert(date_read):
    date_vec = text_vec_layer_read([date_read]) # needs to pass array because that is the shape that the model was train on
    predictions = model.predict(date_vec)
    
    word_idxs = np.argmax(predictions, axis=2) # choose word indexes with highest probability
    word_idxs = word_idxs.reshape((-1)) # drop first axis (we are only passing single date)

    result = []
    for index in word_idxs:
        result.append(text_vec_layer_iso.get_vocabulary()[index])

    return DELIMITER.join(result)

In [106]:
result = convert("April 22, 2019")
assert result == "2019-04-22"

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step


#### Encoder-Decoder

In [None]:
SOS_ID = VOCAB_SIZE_ISO + 1

def shifted_output_sequences(Y):
    sos_tokens = tf.fill(dims=(len(Y), 1), value=SOS_ID)
    sos_tokens = tf.cast(sos_tokens, dtype=Y.dtype)  # Ensure dtype matches Y
    return tf.concat([sos_tokens, Y[:, :-1]], axis=1)

X_train_decoder = shifted_output_sequences(y_train_dec)
X_valid_decoder = shifted_output_sequences(y_valid_dec)
X_test_decoder = shifted_output_sequences(y_test_dec)

In [116]:
X_train_decoder

<tf.Tensor: shape=(8000, 3), dtype=int64, numpy=
array([[237, 115,   7],
       [237, 139,   5],
       [237, 140,   9],
       ...,
       [237,  67,   3],
       [237,  69,   3],
       [237,  51,   7]])>

In [119]:
encoder_embedding_size = 32
decoder_embedding_size = 32
lstm_units = 128

encoder_input = keras.layers.Input(shape=[None], dtype=tf.int32)

encoder_embedding = keras.layers.Embedding(
    input_dim=VOCAB_SIZE_READ + 1,
    output_dim=encoder_embedding_size)(encoder_input)

_, encoder_state_h, encoder_state_c = keras.layers.LSTM(
    lstm_units, return_state=True)(encoder_embedding)

encoder_state = [encoder_state_h, encoder_state_c]

decoder_input = keras.layers.Input(shape=[None], dtype=tf.int32)
decoder_embedding = keras.layers.Embedding(
    input_dim=VOCAB_SIZE_ISO + 2,
    output_dim=decoder_embedding_size)(decoder_input)
decoder_lstm_output = keras.layers.LSTM(lstm_units, return_sequences=True)(
    decoder_embedding, initial_state=encoder_state)
decoder_output = keras.layers.Dense(VOCAB_SIZE_ISO + 1,
                                    activation="softmax")(decoder_lstm_output)

model = keras.Model(inputs=[encoder_input, decoder_input],
                           outputs=[decoder_output])

optimizer = keras.optimizers.Nadam()
model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer,
              metrics=["accuracy"])
history = model.fit([X_train_dec, X_train_decoder], y_train_dec, epochs=10,
                    validation_data=([X_valid_dec, X_valid_decoder], y_valid_dec))

Epoch 1/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 21ms/step - accuracy: 0.0488 - loss: 4.4242 - val_accuracy: 0.3003 - val_loss: 3.0404
Epoch 2/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 19ms/step - accuracy: 0.3484 - loss: 2.7374 - val_accuracy: 0.5250 - val_loss: 2.1726
Epoch 3/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 18ms/step - accuracy: 0.5968 - loss: 1.9524 - val_accuracy: 0.7297 - val_loss: 1.4394
Epoch 4/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 18ms/step - accuracy: 0.8094 - loss: 1.1966 - val_accuracy: 0.9780 - val_loss: 0.6073
Epoch 5/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 17ms/step - accuracy: 0.9932 - loss: 0.4259 - val_accuracy: 0.9987 - val_loss: 0.1812
Epoch 6/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 18ms/step - accuracy: 0.9999 - loss: 0.1298 - val_accuracy: 0.9997 - val_loss: 0.0787
Epoch 7/10
[1m250/250

In [135]:
def convert_encoder_decoder(date_read):
    # Convert the input date to a tensor using the text vectorization layer
    date_vec = text_vec_layer_read([date_read])  # Pass as a list

    # Initialize the decoder input with the SOS_ID
    decoder_input = tf.constant([[SOS_ID]])

    # Initialize an empty list to store the predicted tokens
    predicted_tokens = []

    # Iterate over the maximum output length
    for _ in range(max_output_length):
        # Predict the next token
        predictions = model.predict([date_vec, decoder_input])
        next_token = np.argmax(predictions[0, -1, :])  # Get the token with the highest probability

        # Append the predicted token to the list
        predicted_tokens.append(next_token)

        # Update the decoder input for the next iteration
        decoder_input = tf.concat([decoder_input, tf.constant([[next_token]])], axis=1)

    # Convert the predicted tokens to their corresponding strings
    result = DELIMITER.join([text_vec_layer_iso.get_vocabulary()[token] for token in predicted_tokens])

    return result

result = convert_encoder_decoder("April 22, 2019")
assert result == "2019-04-22"

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 192ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 191ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step


#### Transformer