In [1]:

import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing

import numpy as np
import os
import time



In [2]:
path_to_file = '/content/iliadaodysseia.txt'


#%%

# Read the corpus as raw bytes and decode it explicitly as UTF-8.
# The context manager guarantees the file handle is closed even if the
# read or decode step raises (the previous bare open() leaked the handle).
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')
# length of text is the number of characters in it
print(f'Length of text: {len(text)} characters')


#%%

# Peek at the first 250 characters of the corpus.
print(text[:250])


#%%

# The vocabulary is the sorted set of distinct characters in the corpus.
vocab = sorted(set(text))
print(f'{len(vocab)} unique characters')


#%%

# Lookup layer: character -> integer id.
ids_from_chars = preprocessing.StringLookup(
    vocabulary=list(vocab))


#%%

# Inverse lookup layer: integer id -> character. It reuses the forward
# layer's vocabulary so both directions agree on the id assignment.
chars_from_ids = preprocessing.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(), invert=True, encoding='utf-8')


#%%

def text_from_ids(ids):
  """Turn a tensor of character ids back into joined string(s).

  The ids are passed through the `chars_from_ids` lookup layer and the
  resulting characters are concatenated along the last axis.
  """
  chars = chars_from_ids(ids)
  return tf.strings.reduce_join(chars, axis=-1)


#%%

# Split the corpus into individual Unicode characters and map each one to
# its integer id with the forward lookup layer.
all_ids = ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))
all_ids


#%%

# Dataset that yields the character ids one element at a time.
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)


#%%

# Sanity check: decode and print the first 10 characters of the stream.
for ids in ids_dataset.take(10):
    print(chars_from_ids(ids).numpy().decode('utf-8'))


#%%

# Number of characters in each model input sequence.
seq_length = 100
# Number of complete (seq_length + 1)-character chunks in the corpus.
# NOTE(review): not referenced again in the visible cells.
examples_per_epoch = len(text)//(seq_length+1)


#%%

# Group the id stream into chunks of seq_length + 1 ids; the extra id lets
# each chunk be split into an input and a one-step-shifted target of
# seq_length ids each. The trailing incomplete chunk is dropped.
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)

# Show the first chunk as a tensor of individual characters.
for seq in sequences.take(1):
  print(chars_from_ids(seq))


#%%

# Show the first five chunks joined back into text.
for seq in sequences.take(5):
  print(text_from_ids(seq).numpy())


#%%

def split_input_target(sequence):
    """Split a sequence into an (input, target) pair offset by one step.

    The input is every element except the last; the target is every
    element except the first, so target[i] is the element following input[i].
    """
    return sequence[:-1], sequence[1:]


#%%

# Turn every (seq_length + 1)-id chunk into an (input, target) pair.
dataset = sequences.map(split_input_target)


#%%

# Sanity check: print the first input/target pair as text.
for input_example, target_example in dataset.take(1):
    print("Input :", text_from_ids(input_example).numpy())
    print("Target:", text_from_ids(target_example).numpy())


#%%

# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# Shuffle the pairs, group them into fixed-size batches (dropping the last
# incomplete batch) and prefetch so input preparation overlaps with training.
# NOTE(review): this rebinds `dataset`, so re-running this cell without
# re-running the `sequences.map(...)` cell above would batch a second time.
dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE))

dataset


#%%

# Length of the vocabulary in chars
# NOTE(review): the model-construction cells pass
# len(ids_from_chars.get_vocabulary()) rather than this value.
vocab_size = len(vocab)

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024


#%%
class MyModel(tf.keras.Model):
  """Character-level language model: Embedding -> GRU -> Dense logits."""

  def __init__(self, vocab_size, embedding_dim, rnn_units):
    """Build the layers.

    Args:
      vocab_size: Number of distinct token ids; used as the embedding input
        size and as the width of the output logits.
      embedding_dim: Size of each embedding vector.
      rnn_units: Number of units in the GRU layer.
    """
    # Call the base initializer without extra positional arguments; passing
    # `self` again (as the previous code did) hands the instance to
    # Model.__init__ as a stray argument.
    super().__init__()
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    # A single GRU that returns both the per-step outputs and the final state.
    # (The layer used to be constructed twice, with the first instance
    # immediately overwritten.)
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    """Run the model on a batch of token ids.

    Args:
      inputs: Batch of token-id sequences.
      states: Optional initial GRU state; when None the GRU's own initial
        state for the embedded input is used.
      return_state: When True, also return the final GRU state.
      training: Forwarded to every layer.

    Returns:
      The logits, or a `(logits, states)` tuple when `return_state` is True.
    """
    x = inputs
    x = self.embedding(x, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    else:
      return x
#%%


Length of text: 1573038 characters

Τη µάνητα, θεά, τραγουδά µας του ξακουστού Αχιλλέα,

ανάθεµα τη, πίκρες που 'δωκε στους Αχαιούς περίσσιες
και πλήθος αντρειωµένες έστειλε ψυχές στον Άδη κάτω
παλικαριών, στους σκύλους ρίχνοντας να φανέ τα κορµιά τους
και στα όρνια ολούθε έτσι το θέλ
119 unique characters


Τ
η
 
µ
ά
ν
η
τ
α
tf.Tensor(
[b'\n' b'\xce\xa4' b'\xce\xb7' b' ' b'\xc2\xb5' b'\xce\xac' b'\xce\xbd'
 b'\xce\xb7' b'\xcf\x84' b'\xce\xb1' b',' b' ' b'\xce\xb8' b'\xce\xb5'
 b'\xce\xac' b',' b' ' b'\xcf\x84' b'\xcf\x81' b'\xce\xb1' b'\xce\xb3'
 b'\xce\xbf' b'\xcf\x85' b'\xce\xb4' b'\xce\xac' b' ' b'\xc2\xb5'
 b'\xce\xb1' b'\xcf\x82' b' ' b'\xcf\x84' b'\xce\xbf' b'\xcf\x85' b' '
 b'\xce\xbe' b'\xce\xb1' b'\xce\xba' b'\xce\xbf' b'\xcf\x85' b'\xcf\x83'
 b'\xcf\x84' b'\xce\xbf' b'\xcf\x8d' b' ' b'\xce\x91' b'\xcf\x87'
 b'\xce\xb9' b'\xce\xbb' b'\xce\xbb' b'\xce\xad' b'\xce\xb1' b',' b'\n'
 b'\n' b'\xce\xb1' b'\xce\xbd' b'\xce\xac' b'\xce\xb8' b'\xce\xb5'
 b'\xc2\xb5' b'\xce\xb1' b' ' b'

In [None]:

# NOTE(review): this cell and the following ones up to the `fit` call carry
# no execution count (`In [None]`), so the saved outputs were not produced
# by this `MyModel` instance — confirm with Restart & Run All.
model = MyModel(
    # Be sure the vocabulary size matches the `StringLookup` layers.
    vocab_size=len(ids_from_chars.get_vocabulary()),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)


#%%

# Run one batch through the untrained model to check the output shape.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")


#%%

model.summary()


#%%

# Sample one next-character id per time step from the logits of the first
# example in the batch, then drop the trailing sample axis.
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()


#%%

# Compare the input text with the characters sampled from the model.
print("Input:\n", text_from_ids(input_example_batch[0]).numpy())
print()
print("Next Char Predictions:\n", text_from_ids(sampled_indices).numpy())


#%%

# The model outputs raw logits, hence from_logits=True.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)


#%%

# Loss on the example batch, plus the exponential of that mean loss.
example_batch_loss = loss(target_example_batch, example_batch_predictions)
mean_loss = example_batch_loss.numpy().mean()
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("Mean loss:        ", mean_loss)
print(tf.exp(mean_loss).numpy())

#%%

model.compile(optimizer='adam', loss=loss)


#%%

# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Saves only the model weights, using the per-epoch file name above.
# NOTE(review): the only visible use of this callback is in a commented-out
# `fit` call, so no checkpoints are written by the visible code.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)


#%%


In [None]:

# Number of passes over the training dataset.
EPOCHS = 20
# Variant that also writes checkpoints (currently disabled):
# history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])
# Train without callbacks; `history` holds the object returned by fit.
history = model.fit(dataset, epochs=EPOCHS)

In [7]:
class OneStep(tf.keras.Model):
  """Wraps a trained model to sample one character at a time from strings."""

  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    """Store the model and lookup layers and precompute the logit mask.

    Args:
      model: Model called with `inputs`, `states` and `return_state=True`.
      chars_from_ids: Lookup layer mapping ids to characters.
      ids_from_chars: Lookup layer mapping characters to ids.
      temperature: Divisor applied to the logits before sampling.
    """
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Ids of the tokens that must never be sampled ("" and "[UNK]"),
    # shaped as a column so each row is one sparse index.
    blocked_ids = self.ids_from_chars(['', '[UNK]'])[:, None]
    vocabulary_length = len(ids_from_chars.get_vocabulary())
    # Sparse vector over the vocabulary holding -inf at every blocked id.
    blocked_sparse = tf.SparseTensor(
        values=[-float('inf')]*len(blocked_ids),
        indices=blocked_ids,
        dense_shape=[vocabulary_length])
    # Dense additive mask applied to the logits in generate_one_step.
    self.prediction_mask = tf.sparse.to_dense(blocked_sparse)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    """Sample the next character for every string in `inputs`.

    Args:
      inputs: Batch of strings to continue.
      states: Optional model state from a previous step.

    Returns:
      A `(predicted_chars, states)` tuple: the sampled characters and the
      model state to feed into the next step.
    """
    # Strings -> characters -> dense tensor of token ids.
    chars = tf.strings.unicode_split(inputs, 'UTF-8')
    token_ids = self.ids_from_chars(chars).to_tensor()

    # logits has shape [batch, char, next_char_logits].
    logits, states = self.model(inputs=token_ids, states=states,
                                return_state=True)

    # Keep only the final time step, scale by the temperature, then add the
    # mask so "" and "[UNK]" cannot be generated.
    last_step_logits = logits[:, -1, :]
    scaled_logits = last_step_logits/self.temperature
    masked_logits = scaled_logits + self.prediction_mask

    # Draw one token id per batch element and drop the sample axis.
    sampled_ids = tf.squeeze(
        tf.random.categorical(masked_logits, num_samples=1), axis=-1)

    # Token ids -> characters, returned together with the model state.
    return self.chars_from_ids(sampled_ids), states

In [8]:
# Wrap the current `model` for single-step sampling (default temperature).
# NOTE(review): the execution counts (In [8] here, In [4] on the later
# `model = CustomTraining(...)` cell) suggest `model` was the CustomTraining
# instance defined further down when this ran — confirm on a fresh Run All.
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

In [12]:
# Generate 2000 characters from a seed string and time the run.
start = time.time()
# No carried-over state for the first step.
states = None
# Seed text; each later step feeds back only the character just sampled.
next_char = tf.constant(['Και είπε τοτε '])
result = [next_char]

for n in range(2000):
  # Thread the model state through so each step continues the sequence.
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

# Concatenate the seed and all sampled characters into one string per batch row.
result = tf.strings.join(result)
end = time.time()
print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print('\nRun time:', end - start)

Και είπε τοτε µάνταλο' και πήδησαν οι δούλες
 να τρέξει στα καράβια πρόθυµα στο αρχοντικό σου µέσα
µε τον τρανό, κουράγιο του άκουσεν ο Πάτροκλος και πλάντου,
 και µια µωρό το πρώτα σύγνεφο πηδούν να βγουν πάχτη,

που οι δυο τους Αχαιούς, που εγώ τη φλέπνο µας, στον κάµπο, µες στο νου του.
Κι όπως τον είδε, ο θείος Αγαδήνορας, και βούλιαξαν στον κάµπο·
στον ώµο ο ξεχωρίζεις τότε µας, που από κριγιούς κουράγιο.
 Κάθε φορά η κουλήλιος λάθεψα, και µόνο αλήθεια ζεύεικες κρατούσε το κοντάρι,
το γιο να µένουν άκουσε και µου' κει ποι όλη έστεια του Κρόνου ο γιος του Αρχίσου,
 σε γιο τους Τρώες στους λιονταρόκαρδους υγιούς του Ατρέα και ρίχτες,
πια µην ντραπείς και µαύρη µοίρα,
να ίδες βοσκοί που στέκουνταν στο φως τα πλήθια κράζει:
« Ποιος τότε αναγελώντας τον, να ξεπεράσει Μειείτε,
να µη γλιτώσει, υγιούς εγώ δε σώκουµαι στον κόχτιο αµαξουνό, να σέρνει
καρτέρι' είµαι τ' αρνιά τους βρήκαν και τις κόρες του, του ρήγα τα χοριάζει,
ως που πια τα φουσάτα παίρνοντας την πέρφανη καρδιά του µόνο
και 

In [3]:
class CustomTraining(MyModel):
  """MyModel variant with a hand-written training step."""

  @tf.function
  def train_step(self, inputs):
    """Run one optimization step on a batch.

    Args:
      inputs: An `(inputs, labels)` pair for one batch.

    Returns:
      A dict with the batch loss under the key 'loss'.
    """
    inputs, labels = inputs
    with tf.GradientTape() as tape:
      predictions = self(inputs, training=True)
      loss = self.loss(labels, predictions)
    # Differentiate with respect to, and update, this instance's own
    # variables. The previous code used the notebook-global `model`, which
    # only worked when that global happened to be this very object.
    grads = tape.gradient(loss, self.trainable_variables)
    self.optimizer.apply_gradients(zip(grads, self.trainable_variables))

    return {'loss': loss}

In [4]:
# Build the custom-training model, sized from the lookup layer's vocabulary.
# NOTE(review): this rebinds the global `model` used by earlier cells.
model = CustomTraining(
    vocab_size=len(ids_from_chars.get_vocabulary()),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

In [5]:
# Adam optimizer; the loss takes raw logits, matching the model's Dense output.
model.compile(optimizer = tf.keras.optimizers.Adam(),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))

In [10]:
# Train for 20 epochs; the returned History object is displayed, not stored.
model.fit(dataset, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7fbcf67c2310>