# Exercice de formatage de date sur différents modèles d'encodeur

In [58]:
from datetime import date
import tensorflow as tf
import tensorflow.keras as keras
import numpy as np

# cannot use strftime()'s %B format since it depends on the locale
MONTHS = ["January", "February", "March", "April", "May", "June",
          "July", "August", "September", "October", "November", "December"]

def random_dates(n_dates):
    min_date = date(1000, 1, 1).toordinal()
    max_date = date(9999, 12, 31).toordinal()

    ordinals = np.random.randint(max_date - min_date, size=n_dates) + min_date
    dates = [date.fromordinal(ordinal) for ordinal in ordinals]

    x = [MONTHS[dt.month - 1] + " " + dt.strftime("%d, %Y") for dt in dates]
    y = [dt.isoformat() for dt in dates]
    return x, y

np.random.seed(42)

n_dates = 3
x_example, y_example = random_dates(n_dates)
print("{:25s}{:25s}".format("Input", "Target"))
print("-" * 50)
for idx in range(n_dates):
    print("{:25s}{:25s}".format(x_example[idx], y_example[idx]))

# Liste de tous les char possibles en entrée et en sortie.
INPUT_CHARS = "".join(sorted(set("".join(MONTHS)))) + "0123456789, "
#'ADFJMNOSabceghilmnoprstuvy01234567890, '

OUTPUT_CHARS = "0123456789-"

# Ce que nous avons besoin de faire:
# - nous devons attribuer à chaque char un identifiant.
# - puis pour chaque "phrase", le tokenizer, ici ça sera simplement récupérer chaque caractère
# - On train un modèle dessus

Input                    Target                   
--------------------------------------------------
September 20, 7075       7075-09-20               
May 15, 8579             8579-05-15               
January 11, 7103         7103-01-11               


In [76]:
# Pour créer une table de lookup, mais que si le vocabulaire est au niveau du mot
def create_lookup_table(phrases): # phrases est une liste de liste de mots. Autrement dit, les mots doivent déjà être tokenizé
    # Créer la table de lookup
    words = tf.constant(phrases)
    word_ids = tf.range(len(phrases), dtype=tf.int64)
    print(f"words: {words}")
    print(f"word_ids: {word_ids}")

    num_oov_buckets = 1000
    vocab_init = tf.lookup.KeyValueTensorInitializer(keys=words, values=word_ids)
    encoder_table = tf.lookup.StaticVocabularyTable(vocab_init, num_oov_buckets)

    def encode_words(X_batch):
        return encoder_table.lookup(X_batch)

    return encoder_table, encode_words, words, word_ids

def split_string(string):
    return [s for s in string]
    
# decoder_vocab_init = tf.lookup.KeyValueTensorInitializer(keys=word_ids, values=words)
# decoder_table = tf.lookup.StaticVocabularyTable(decoder_vocab_init, num_oov_buckets)


# def decode_words(X_batch):
#     return decoder_vocab_init.lookup(X_batch)

encoder_table, encode_words, _, _ = create_lookup_table([char for char in INPUT_CHARS])
encode_words(tf.constant(split_string('September 20, 7075')))

words: [b'A' b'D' b'F' b'J' b'M' b'N' b'O' b'S' b'a' b'b' b'c' b'e' b'g' b'h'
 b'i' b'l' b'm' b'n' b'o' b'p' b'r' b's' b't' b'u' b'v' b'y' b'0' b'1'
 b'2' b'3' b'4' b'5' b'6' b'7' b'8' b'9' b',' b' ']
word_ids: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37]


<tf.Tensor: shape=(18,), dtype=int64, numpy=
array([ 7, 11, 19, 22, 11, 16,  9, 11, 20, 37, 28, 26, 36, 37, 33, 26, 33,
       31])>

In [74]:
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True, split='') # we want the space as a char too
tokenizer.fit_on_texts([INPUT_CHARS, OUTPUT_CHARS])
max_id = len(tokenizer.word_index) # number of distinct characters
dataset_size = tokenizer.document_count
print(f"max_id:{max_id}")
print(f"dataset_size:{dataset_size}")

def tokenize_encode_lines(lines):
    return tokenizer.texts_to_sequences(lines)


max_id:34
dataset_size:2


[[5, 21, 26, 28, 21, 2, 19, 21, 27, 33, 8, 6, 32, 33, 13, 6, 13, 11],
 [5, 21, 26, 28, 21, 2, 19, 21, 27, 33, 8, 6, 32, 33, 13, 6, 13, 12]]

In [139]:
import tensorflow as tf

def create_dataset(n_dates):
    x, y = random_dates(n_dates)
    return x, y

def shifted_output_sequences(Y):
    sos_id = 0
    sos_tokens = tf.fill(dims=(1,), value=sos_id)
    return tf.concat([sos_tokens, Y], axis=0)

def disp_dataset(ds, limit=None, print_elements=True):
    count = 0
    for x in ds:
        count += 1
        if print_elements:
            print(x)
        if limit is not None and count >= limit:
            break
    print(f"Count:{count}")

dataset = create_dataset(10000) # return X de taille 10000 et Y de taille 10000
# On transforme les phrases en liste d'id
dataset = (tokenize_encode_lines(dataset[0]), tokenize_encode_lines(dataset[1]))
# On transforme la liste en tenseurs. comme les phrases n'ont pas toutes la même longeueur, on rajoute un padding d'id 0: c'est l'idée du ragged.constant.to_tensor()
dataset = (tf.ragged.constant(dataset[0]).to_tensor(), tf.ragged.constant(dataset[1]).to_tensor())
dataset = tf.data.Dataset.from_tensor_slices(dataset)
dataset = dataset.map(lambda x,y: ((x, shifted_output_sequences(y)),y))
# Nous aurons besoin de 2 inputs : 1 pour l'encodeur, et l'autre pour le décodeur, qui n'est que la cible y, où on met un token d'id 0 au début pour shifter. On choisit le token d'id 0 arbitrairement, car nous n'avons pas de token <sos> ou <eos>, mais un simple 0 pour essayer d'indiquer l'inimportance.
disp_dataset(dataset, limit=1)

test_set = dataset.take(1000)
train_set = dataset.skip(1000)
valid_set = train_set.take(1000)
train_set = train_set.skip(1000)

train_set = train_set.batch(32).prefetch(1)
valid_set = valid_set.batch(32).prefetch(1)
test_set = test_set.batch(32).prefetch(1)


((<tf.Tensor: shape=(18,), dtype=int32, numpy=
array([ 5, 21, 26, 28, 21,  2, 19, 21, 27, 33,  8,  6, 32, 33, 13,  6, 13,
       11], dtype=int32)>, <tf.Tensor: shape=(11,), dtype=int32, numpy=array([ 0, 13,  6, 13, 11, 34,  6, 15, 34,  8,  6], dtype=int32)>), <tf.Tensor: shape=(10,), dtype=int32, numpy=array([13,  6, 13, 11, 34,  6, 15, 34,  8,  6], dtype=int32)>)
Count:1


In [143]:
import tensorflow_addons as tfa

np.random.seed(42)
tf.random.set_seed(42)

def create_simple_model():
    encoder_embedding_size = 32
    decoder_embedding_size = 32
    units = 128

    encoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
    decoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
    sequence_lengths = keras.layers.Input(shape=[], dtype=np.int32)

    vocab_size = len(INPUT_CHARS) + 1 # on prend en compte le token d'id 0 qui est le padding
    embeddings = keras.layers.Embedding(len(INPUT_CHARS) + 1, encoder_embedding_size)
    encoder_embeddings = embeddings(encoder_inputs)
    decoder_embeddings = embeddings(decoder_inputs)

    encoder = keras.layers.LSTM(units)
    encoder_outputs, state_h, state_c = encoder(encoder_embeddings)
    encoder_state = [state_h, state_c]

    sampler = tfa.seq2seq.sampler.TrainingSampler()

    decoder_cell = keras.layers.LSTMCell(units)
    output_layer = keras.layers.Dense(vocab_size)

    decoder = tfa.seq2seq.basic_decoder.BasicDecoder(decoder_cell,
                                                    sampler,
                                                    output_layer=output_layer)
    final_outputs, final_state, final_sequence_lengths = decoder(
        decoder_embeddings,
        initial_state=encoder_state,
        sequence_lengths=sequence_lengths)

    Y_proba =tf.nn.softmax(final_outputs.rnn_outputs)
    model = keras.models.Model(inputs=[encoder_inputs, decoder_inputs],
                            outputs=[Y_proba])
    return model

model = create_simple_model()
optimizer = keras.optimizers.Nadam()
model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer,
              metrics=["accuracy"])
history = model.fit(train_set, epochs=15,
                    validation_data=valid_set)

Epoch 1/15


InvalidArgumentError: 2 root error(s) found.
  (0) Invalid argument:  Incompatible shapes: [32,10] vs. [32,11]
	 [[node Equal (defined at <ipython-input-143-9d3f51f4b3b6>:48) ]]
	 [[gradient_tape/model_4/embedding_8/embedding_lookup/Reshape/_136]]
  (1) Invalid argument:  Incompatible shapes: [32,10] vs. [32,11]
	 [[node Equal (defined at <ipython-input-143-9d3f51f4b3b6>:48) ]]
0 successful operations.
0 derived errors ignored. [Op:__inference_train_function_21103]

Function call stack:
train_function -> train_function


In [134]:
Y = tf.constant([2,3,4])
sos_tokens = tf.fill(dims=(1,), value=0)
sos_tokens
tf.concat([sos_tokens, Y], axis=0)

<tf.Tensor: shape=(4,), dtype=int32, numpy=array([0, 2, 3, 4], dtype=int32)>