In [1]:
from datetime import date
import numpy as np
import tensorflow as tf
from tensorflow import keras

2023-12-27 16:34:26.426262: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-27 16:34:26.459320: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-27 16:34:26.459349: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-27 16:34:26.460311: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-27 16:34:26.465220: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-27 16:34:26.465989: I tensorflow/core/platform/cpu_feature_guard.cc:1

In [2]:
N_SAMPLES = 10

MONTHS_DICT = {
    1: "January",
    2: "February",
    3: "March",
    4: "April",
    5: "May",
    6: "June",
    7: "July",
    8: "August",
    9: "September",
    10: "October",
    11: "November",
    12: "December"
              }

ordinal_min = date(999, 1, 1).toordinal()
ordinal_max = date(9999, 12, 31).toordinal()
ordinal_random = np.random.randint(ordinal_min, ordinal_max)

dt = date.fromordinal(ordinal_random)
tm_year, tm_month, tm_day = dt.timetuple().tm_year, dt.timetuple().tm_mon, dt.timetuple().tm_mday

In [3]:
def generate_data(n_samples):
    X, y = [], []
    ordinal_min = date(1000, 1, 1).toordinal()
    ordinal_max = date(9999, 12, 31).toordinal()
    
    ordinal_random = np.random.randint(ordinal_max - ordinal_min, size=n_samples) + ordinal_min

    for ordinal in ordinal_random:
        dt = date.fromordinal(ordinal)
        tm_year, tm_month, tm_day = dt.timetuple().tm_year, dt.timetuple().tm_mon, dt.timetuple().tm_mday
        month = MONTHS_DICT[tm_month]
        y.append(dt.isoformat())
        X.append(f"{month} {tm_day}, {tm_year}")
        
    return X,y

In [4]:
INPUT_CHARS = sorted(set("".join(MONTHS_DICT.values()) + "1234567890, "))
OUTPUT_CHARS = "0123456789-"
def vectorize_input(data):
    return [INPUT_CHARS.index(char) for char in data]

def vectorize_output(data):
    return [OUTPUT_CHARS.index(char) for char in data]

In [5]:
def create_dataset(n_samples, batch_size=32):
    x, y = generate_data(n_samples)
    
    X = [vectorize_input(dt) for dt in x]
    Y = [vectorize_output(dt) for dt in y]
    X, Y = tf.ragged.constant(X, ragged_rank=1), tf.ragged.constant(Y,ragged_rank=1)

    X, Y = (X + 1).to_tensor(), (Y + 1).to_tensor()
    
    dataset = tf.data.Dataset.from_tensor_slices((X,Y))
    dataset = dataset.shuffle(n_samples)
    dataset = dataset.batch(batch_size)
    
    return dataset, X.shape, Y.shape
    


In [6]:
train_dataset, MAX_INPUT_SHAPE, MAX_OUTPUT_SHAPE = create_dataset(n_samples=15000)
test_dataset, _, _ = create_dataset(n_samples=3000)
val_dataset, _, _ = create_dataset(n_samples=2000)

2023-12-27 16:34:33.512150: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:06:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-12-27 16:34:33.532898: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2256] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [7]:
embedding_size = 32

encoder = keras.models.Sequential([
    keras.layers.Embedding(input_dim=len(INPUT_CHARS) + 1, output_dim=embedding_size, input_shape=[None]),
    keras.layers.LSTM(128)
])

decoder = keras.models.Sequential([
    keras.layers.LSTM(128, return_sequences=True),
    keras.layers.Dense(len(OUTPUT_CHARS) + 1, activation="softmax")   
])

model = keras.models.Sequential([
    encoder,
    keras.layers.RepeatVector(MAX_OUTPUT_SHAPE[1]),
    decoder
])

In [8]:
optimizer = keras.optimizers.Nadam()
model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])
model.fit(train_dataset, epochs=20, validation_data=val_dataset)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7f00326eef10>

In [9]:
def prepare_input(date_strs):
    ids = [vectorize_input(str) for str in date_strs]
    X = tf.ragged.constant(ids, ragged_rank=1)
    return (X + 1).to_tensor()

In [10]:
X_new = prepare_input(["September 17, 2009", "July 14, 1789"])

In [11]:
ids = np.argmax(model.predict(X_new), axis=-1)



In [12]:
for id in ids:
    print(''.join([OUTPUT_CHARS[index -1] for index in id]))

2009-09-17
1789-07-14


In [13]:
X_new = prepare_input(["May 02, 2020", "July 14, 1789"])

In [14]:
ids = np.argmax(model.predict(X_new), axis=-1)
for id in ids:
    print(''.join([OUTPUT_CHARS[index -1] for index in id]))

2020-04-20
1789-09-14


In [15]:
X_new = prepare_input(["May 02, 2020", "September 17, 2009"])

In [16]:
X_new[0]

<tf.Tensor: shape=(18,), dtype=int32, numpy=
array([17, 21, 38,  1,  3,  5,  2,  1,  5,  3,  5,  3,  0,  0,  0,  0,  0,
        0], dtype=int32)>

In [17]:
def prepare_date_strs(date_strs, chars=INPUT_CHARS):
    X_ids = [vectorize_input(dt) for dt in date_strs]
    X = tf.ragged.constant(X_ids, ragged_rank=1)
    return (X + 1).to_tensor()

In [18]:
def prepare_date_strs_padded(date_strs):
    X = prepare_date_strs(date_strs)
    print(X)
    if X.shape[1] < MAX_INPUT_SHAPE[1]:
        X = tf.pad(X, [[0, 0], [0, MAX_INPUT_SHAPE[1] - X.shape[1]]])
        print(X)
    return X

def convert_date_strs(date_strs):
    X = prepare_date_strs_padded(date_strs)
    ids = np.argmax(model.predict(X), axis=-1)
    return ids

In [19]:
pred = convert_date_strs(["May 11, 2020", "September 17, 2009"])

tf.Tensor(
[[17 21 38  1  4  4  2  1  5  3  5  3  0  0  0  0  0  0]
 [20 24 32 35 24 29 22 24 33  1  4 10  2  1  5  3  3 12]], shape=(2, 18), dtype=int32)


In [20]:
for id in pred:
    print(''.join([OUTPUT_CHARS[index -1] for index in id]))

2020-05-11
2009-09-17
