In [1]:
import argparse 
import datetime
import random

import numpy as np
import tensorflow as tf


args = argparse.Namespace()
args.seed = 101
args.epochs = 20
args.batch_size = 32
    
def random_date_output():
    ordinal_min = datetime.date(1000, 1, 1).toordinal()
    ordinal_max = datetime.date(9999, 12, 31).toordinal()
    return datetime.date.fromordinal(random.randint(ordinal_min, ordinal_max))

def output_date_to_input(date) -> str:
    months = ["January", "February", "March", "April", "May", "June",
          "July", "August", "September", "October", "November", "December"]
    
    return "{} {}, {}".format(months[date.month - 1], date.day, date.year)


def date_to_digits(str_date):
    str_date = str_date.lower()
    return [ord(s) for s in str_date]

def output_date_to_digits(str_date):
    return [10 if s=='-' else int(s) for s in str_date]

def input_tensor_to_date(np_date) -> str:
    date_str = "".join([chr(i) for i in np_date])
    return date_str[0].upper() + date_str[1:]

def output_tensor_to_date(tensor_date) -> str:
    return "".join(["-" if i==10 else str(i.numpy()) for i in tensor_date])

def prediction_to_output_tensor(prediction):
    return tf.argmax(prediction, axis=1)

def build_dataset(dates_num):
    input_list = []
    output_list = []
    for i in range(dates_num):
        rd = random_date_output()
        ird = output_date_to_input(rd)
        ird = date_to_digits(ird)
        rd = output_date_to_digits(str(rd))
        input_list.append(ird)
        output_list.append(rd)
    return tf.ragged.constant(input_list), tf.constant(output_list)


train_dataset = build_dataset(20000)
val_dataset = build_dataset(10000)

in_num = ord("z")
out_num = 11
max_out_len = len(train_dataset[1][0])

2022-02-08 15:00:32.009478: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def simple_enc_dec():    
    encoder = tf.keras.models.Sequential([
        tf.keras.layers.Embedding(in_num, 32, input_shape=[None]),
        tf.keras.layers.LSTM(64, return_sequences=False)])

    # RepeatVector is a must. we translate +- 17 words seq into 10 words seq. return_sequences = True for encoder leads up to 17 words output
    # with return_sequences = False, it won't be possible to reconstruct a full seq, but one letter.
    decoder = tf.keras.models.Sequential([
        tf.keras.layers.RepeatVector(max_out_len),
        tf.keras.layers.LSTM(64, return_sequences=True),
        tf.keras.layers.Dense(out_num, activation="softmax")])
    model = tf.keras.models.Sequential([encoder, decoder])

    model.compile(optimizer=tf.keras.optimizers.Nadam(learning_rate=1e-3),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
    monitor='val_loss'
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor=monitor, patience=3, mode='auto', restore_best_weights=True, verbose=1)
    reduce_lr_on_plateau = tf.keras.callbacks.ReduceLROnPlateau(monitor=monitor, factor=0.1, patience=2, min_delta=1e-4, mode='auto', verbose=1)
    model.summary()
    model.fit(train_dataset[0], train_dataset[1], batch_size=args.batch_size, epochs=args.epochs, validation_data=(val_dataset[0], val_dataset[1]), callbacks=[early_stopping, reduce_lr_on_plateau])

    for i in range(5):
        print("***")
        print(val_dataset[0][i])
        print(input_tensor_to_date(val_dataset[0][i]))
        prediction = model(val_dataset[0])[i]
        output_tensor = prediction_to_output_tensor(prediction)
        print(output_tensor)
        print(output_tensor_to_date(output_tensor))
    
if True:
    simple_enc_dec()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 sequential (Sequential)     (None, 64)                28736     
                                                                 
 sequential_1 (Sequential)   (None, 10, 11)            33739     
                                                                 
Total params: 62,475
Trainable params: 62,475
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20




Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 00011: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 12/20
Epoch 00012: early stopping
***
tf.Tensor([ 97 112 114 105 108  32  50  48  44  32  57  48  51  50], shape=(14,), dtype=int32)
April 20, 9032
tf.Tensor([ 9  0  3  2 10  0  4 10  2  0], shape=(10,), dtype=int64)
9032-04-20
***
tf.Tensor([ 97 112 114 105 108  32  57  44  32  57  56  51  54], shape=(13,), dtype=int32)
April 9, 9836
tf.Tensor([ 9  8  3  6 10  0  4 10  0  9], shape=(10,), dtype=int64)
9836-04-09
***
tf.Tensor([102 101  98 114 117  97 114 121  32  53  44  32  56  53  53  52], shape=(16,), dtype=int32)
February 5, 8554
tf.Tensor([ 8  5  5  4 10  0  2 10  0  5], shape=(10,), dtype=int64)
8554-02-05
***
tf.Tensor([106 117 108 121  32  49  49  44  32  57  49  56  51], shape=(13,), dtype=int32)
July 11, 9183
tf.Tensor([ 9  1  8  3 10  0  7 10  1  1], shape=(10,), dtype=int64)
91