In [1]:
from datetime import date
import numpy as np
import tensorflow as tf
from tensorflow import keras

2024-01-02 15:44:56.355712: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-02 15:44:56.390584: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-02 15:44:56.390621: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-02 15:44:56.391386: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-02 15:44:56.396277: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-02 15:44:56.397368: I tensorflow/core/platform/cpu_feature_guard.cc:1

In [2]:
MONTHS_DICT = {
    1: "January",
    2: "February",
    3: "March",
    4: "April",
    5: "May",
    6: "June",
    7: "July",
    8: "August",
    9: "September",
    10: "October",
    11: "November",
    12: "December"
              }

In [3]:
def generate_data(n_samples):
    X, y = [], []
    ordinal_min = date(1000, 1, 1).toordinal()
    ordinal_max = date(9999, 12, 31).toordinal()
    
    ordinal_random = np.random.randint(ordinal_max - ordinal_min, size=n_samples) + ordinal_min

    for ordinal in ordinal_random:
        dt = date.fromordinal(ordinal)
        month = MONTHS_DICT[dt.month]
        y.append(dt.isoformat())
        X.append(month + " " + dt.strftime("%d, %Y"))
        
    return X,y

In [4]:
INPUT_CHARS = sorted(set("".join(MONTHS_DICT.values()) + "1234567890, "))
OUTPUT_CHARS = "0123456789-"
def vectorize_input(data):
    return [INPUT_CHARS.index(char) for char in data]

def vectorize_output(data):
    return [OUTPUT_CHARS.index(char) for char in data]

In [5]:
def create_dataset(n_samples, batch_size=32):
    x, y = generate_data(n_samples)
    
    X = [vectorize_input(dt) for dt in x]
    Y = [vectorize_output(dt) for dt in y]
    X, Y = tf.ragged.constant(X, ragged_rank=1), tf.ragged.constant(Y,ragged_rank=1)

    X, Y = (X + 1).to_tensor(), (Y + 1).to_tensor()
    
    dataset = tf.data.Dataset.from_tensor_slices((X,Y))
    # dataset = dataset.shuffle(n_samples)
    dataset = dataset.batch(batch_size)
    
    return dataset, X.shape, Y.shape
    


In [6]:
train_dataset, MAX_INPUT_SHAPE, MAX_OUTPUT_SHAPE = create_dataset(n_samples=15000)
test_dataset, _, _ = create_dataset(n_samples=3000)
val_dataset, _, _ = create_dataset(n_samples=2000)

2024-01-02 15:45:02.955529: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:06:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-01-02 15:45:02.990585: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2256] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [7]:
embedding_size = 32

encoder = keras.models.Sequential([
    keras.layers.Embedding(input_dim=len(INPUT_CHARS) + 1, output_dim=embedding_size, input_shape=[None]),
    keras.layers.LSTM(128)
])

decoder = keras.models.Sequential([
    keras.layers.LSTM(128, return_sequences=True),
    keras.layers.Dense(len(OUTPUT_CHARS) + 1, activation="softmax")   
])

model = keras.models.Sequential([
    encoder,
    keras.layers.RepeatVector(MAX_OUTPUT_SHAPE[1]),
    decoder
])

In [8]:
optimizer = keras.optimizers.Nadam()
model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])
model.fit(train_dataset, epochs=10, validation_data=val_dataset)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7f9166e944f0>

In [9]:
def prepare_input(date_strs):
    ids = [vectorize_input(str) for str in date_strs]
    X = tf.ragged.constant(ids, ragged_rank=1)
    return (X + 1).to_tensor()

In [10]:
X_new = prepare_input(["September 17, 2009", "July 14, 1789"])

In [11]:
ids = np.argmax(model.predict(X_new), axis=-1)



In [12]:
for id in ids:
    print(''.join([OUTPUT_CHARS[index -1] for index in id]))

2009-09-17
1789-07-14


In [13]:
X_new = prepare_input(["May 02, 2020", "July 14, 1789"])

In [14]:
ids = np.argmax(model.predict(X_new), axis=-1)
for id in ids:
    print(''.join([OUTPUT_CHARS[index -1] for index in id]))

2020-02-02
1789-02-14


In [15]:
X_new = prepare_input(["May 02, 2020", "September 17, 2009"])

In [16]:
X_new[0]

<tf.Tensor: shape=(18,), dtype=int32, numpy=
array([17, 21, 38,  1,  3,  5,  2,  1,  5,  3,  5,  3,  0,  0,  0,  0,  0,
        0], dtype=int32)>

In [17]:
def prepare_date_strs(date_strs, chars=INPUT_CHARS):
    X_ids = [vectorize_input(dt) for dt in date_strs]
    X = tf.ragged.constant(X_ids, ragged_rank=1)
    return (X + 1).to_tensor()

In [18]:
def prepare_date_strs_padded(date_strs):
    X = prepare_date_strs(date_strs)
    if X.shape[1] < MAX_INPUT_SHAPE[1]:
        X = tf.pad(X, [[0, 0], [0, MAX_INPUT_SHAPE[1] - X.shape[1]]])
    return X

def convert_date_strs(date_strs):
    X = prepare_date_strs_padded(date_strs)
    ids = np.argmax(model.predict(X), axis=-1)
    return ids

In [19]:
pred = convert_date_strs(["July 14, 1789", "May 01, 2020", "August 01, 1993", "November 14, 1996", 
                           "May 22, 4322", "May 01, 1999", "July 14, 4111"])



In [20]:
for id in pred:
    print(''.join([OUTPUT_CHARS[index -1] for index in id]))

1789-07-14
2020-05-01
1993-08-01
1996-11-14
4322-05-22
1999-05-01
4111-07-14


## Teacher Forcing

In [21]:
TOKEN_ID = len(OUTPUT_CHARS) + 1
def shift_output(dataset):
    X = np.concatenate([X for X,Y in dataset], axis=0)
    Y = np.concatenate([Y for X,Y in dataset], axis=0)

    sos_token = tf.fill(dims=(Y.shape[0], 1), value=TOKEN_ID)
    X_decoder = np.concatenate([sos_token, Y[:, :-1]], axis=1)
    
    return tf.constant(X), X_decoder, tf.constant(Y)

In [22]:
X_train, X_train_decoder, y_train = shift_output(train_dataset)
X_test, X_test_decoder, y_test = shift_output(test_dataset)
X_val, X_val_decoder, y_val = shift_output(val_dataset)

In [23]:
encoder_input_layer = keras.layers.Input(shape=[None], dtype=tf.int32)
encoder_embedding_layer = keras.layers.Embedding(input_dim=len(INPUT_CHARS)+ 1, output_dim=512)(encoder_input_layer)
output, encoder_h_state, encoder_c_state = keras.layers.LSTM(128, return_state=True)(encoder_embedding_layer)

encoder_state = [encoder_h_state, encoder_c_state]

decoder_input_layer = keras.layers.Input(shape=[None], dtype=tf.int32)
decoder_embedding_layer = keras.layers.Embedding(input_dim=len(OUTPUT_CHARS)+ 2, output_dim=512)(decoder_input_layer)
decoder_LSTM_output = keras.layers.LSTM(128, return_sequences=True)(decoder_embedding_layer, initial_state=encoder_state)
decoder_output = keras.layers.Dense(len(OUTPUT_CHARS)+1, activation="softmax")(decoder_LSTM_output)

model = keras.models.Model(inputs=[encoder_input_layer, decoder_input_layer], outputs=[decoder_output])

optimizer = keras.optimizers.Nadam()
model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])
history = model.fit([X_train, X_train_decoder], y_train, epochs=20, validation_data=([X_val, X_val_decoder], y_val))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [24]:
test_dates = ["July 14, 1789", "May 01, 2020"]

In [25]:
def predict_date_strs(date_strs):
    X = prepare_date_strs_padded(date_strs)
    print(X)
    Y_pred = tf.fill(dims=[len(X), 1], value=TOKEN_ID)
    for index in range(MAX_OUTPUT_SHAPE[1]):
        X_decoder = tf.pad(Y_pred, [[0,0],[0, MAX_OUTPUT_SHAPE[1] - Y_pred.shape[1]]])
        pred_indices = np.argmax(model.predict([X, X_decoder])[:,index:index+1], axis=-1)
        Y_pred = np.concatenate((Y_pred, pred_indices), axis=1)
    return Y_pred

In [28]:
y_pred = predict_date_strs(["July 14, 1789", "May 01, 2020", "August 01, 1993", "November 14, 1996", 
                           "May 22, 4322", "May 01, 1999", "July 14, 4111"])

tf.Tensor(
[[16 36 28 38  1  4  7  2  1  4 10 11 12  0  0  0  0  0]
 [17 21 38  1  3  4  2  1  5  3  5  3  0  0  0  0  0  0]
 [13 36 25 36 34 35  1  3  4  2  1  4 12 12  6  0  0  0]
 [18 31 37 24 29 22 24 33  1  4  7  2  1  4 12 12  9  0]
 [17 21 38  1  5  5  2  1  7  6  5  5  0  0  0  0  0  0]
 [17 21 38  1  3  4  2  1  4 12 12 12  0  0  0  0  0  0]
 [16 36 28 38  1  4  7  2  1  7  4  4  4  0  0  0  0  0]], shape=(7, 18), dtype=int32)


In [29]:
for dt in y_pred[:,1:]:
    print(''.join([OUTPUT_CHARS[index -1] for index in dt]))

1789-07-14
2020-05-01
1993-08-01
1996-11-14
4322-05-22
1999-05-01
4111-07-14
