<a href="https://colab.research.google.com/github/JonasMarma/TG-Eng-Info-UFABC/blob/main/text_gen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Bibliografia:

https://www.tensorflow.org/text/tutorials/text_generation

Ver as notas com estrelinhas para sugestões de como melhorar o código!

Ideias minhas:

E se fizer com que o vocabulário seja composto por palavras?

Acho que só isso kkkj

**If you want the model to generate text faster the easiest thing you can do is batch the text generation. In the example below the model generates 5 outputs in about the same time it took to generate 1 above.**

# Preparação dos dados

In [None]:
import tensorflow as tf

import numpy as np
import os
import time

In [None]:
# Fetch the Shakespeare corpus and keep the local path to the file.
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

# Read the raw bytes, then decode them explicitly as UTF-8.
# The context manager guarantees the file handle is closed after reading.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')
# The length of the text is the number of characters in it.
print(f'Length of text: {len(text)} characters')

# The unique characters in the file, in sorted order.
vocab = sorted(set(text))
print(f'{len(vocab)} unique characters')

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt
Length of text: 1115394 characters
65 unique characters


In [None]:
# Lookup layer that maps each character to an integer id.
# mask_token=None: no mask token is configured for this layer.
ids_from_chars = tf.keras.layers.experimental.preprocessing.StringLookup(
    mask_token=None,
    vocabulary=list(vocab))

# Inverse lookup layer that maps an integer id back to its character.
# It is built from the vocabulary reported by the forward layer, so both
# layers share the same id assignment.
chars_from_ids = tf.keras.layers.experimental.preprocessing.StringLookup(
    mask_token=None,
    invert=True,
    vocabulary=ids_from_chars.get_vocabulary())

# Helper that turns ids directly into text.
# https://www.tensorflow.org/api_docs/python/tf/strings/reduce_join
def text_from_ids(ids):
  """Convert `ids` to characters and join them along the last axis."""
  chars = chars_from_ids(ids)
  return tf.strings.reduce_join(chars, axis=-1)

In [None]:
# Split the text into individual characters
all_chars = tf.strings.unicode_split(text, 'UTF-8')

# Run all of those characters through the char -> id conversion layer
all_ids = ids_from_chars(all_chars)
print(all_ids)

# Turn that sequence of ids into a dataset
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)
print(ids_dataset)

tf.Tensor([19 48 57 ... 46  9  1], shape=(1115394,), dtype=int64)
<TensorSliceDataset shapes: (), types: tf.int64>


In [None]:
# Each training example uses a sequence of 100 characters
seq_length = 100
# The number of examples per epoch is text_length / sequence_length, where
# each example spans seq_length + 1 characters.
examples_per_epoch = len(text)//(seq_length+1)

In [None]:
# Group the ids into sequences of seq_length + 1 elements.
# drop_remainder=True discards a final group that would be shorter.
seq_length = 100

sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)
print(sequences)

# Show 5 examples, decoded back to text
for seq in sequences.take(5):
  print(text_from_ids(seq).numpy())

<BatchDataset shapes: (101,), types: tf.int64>
b'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
b'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'
b"now Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us ki"
b"ll him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be d"
b'one: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor citi'


In [None]:
# Turn one sequence into an (input, target) pair shifted by one position.
# Example:
#   seq    = tensorflow
#   input  = tensorflo
#   target = ensorflow
def split_input_target(sequence):
    """Return `(sequence without its last item, sequence without its first item)`."""
    return sequence[:-1], sequence[1:]

In [None]:
# Apply the function to every sequence to build the (input, target) training dataset:
dataset = sequences.map(split_input_target)

In [None]:
# Shuffle the examples and group them into training batches

# Number of (input, target) pairs per training batch
BATCH_SIZE = 64

# Size of the shuffle buffer.
# (TF data is designed to work with possibly infinite sequences, so it does
# not try to shuffle the whole sequence in memory; it shuffles elements
# inside a buffer of this size instead.)
BUFFER_SIZE = 10000

# Same pipeline as a single chained expression, written one stage per line.
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

dataset

<PrefetchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

# Definição do Modelo

**tf.keras.layers.Embedding:**

The input layer. A trainable lookup table that will map each character-ID to a vector with embedding_dim dimensions.

**tf.keras.layers.GRU:**

A type of RNN with size units=rnn_units (You can also use an **LSTM** layer here.)

**tf.keras.layers.Dense:**

The output layer, with vocab_size outputs. It outputs one logit for each character in the vocabulary. These logits are the (unnormalized) log-likelihoods of each character according to the model.

In [None]:
# Length of the vocabulary in chars
# NOTE(review): the model below is built with
# len(ids_from_chars.get_vocabulary()) rather than this value — confirm
# whether this variable is still needed.
vocab_size = len(vocab)

# The embedding dimension (size of each character's embedding vector)
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

In [None]:
class MyModel(tf.keras.Model):
  """Character-level language model: Embedding -> GRU -> Dense.

  Args:
    vocab_size: Number of distinct token ids. Used as the embedding input
      size and as the number of output logits.
    embedding_dim: Size of each embedding vector.
    rnn_units: Number of units in the GRU layer.
  """

  def __init__(self, vocab_size, embedding_dim, rnn_units):
    # The bound super().__init__ already receives the instance; passing
    # `self` again would hand the base constructor a stray positional arg.
    super().__init__()
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    """Run a forward pass through the three layers.

    Args:
      inputs: Token ids fed to the embedding layer.
      states: Optional initial state for the GRU. When None, the state
        returned by `self.gru.get_initial_state` is used.
      return_state: When True, also return the GRU state after the pass.
      training: Forwarded to every layer call.

    Returns:
      The Dense layer output, or the tuple `(output, states)` when
      `return_state` is True.
    """
    # Pass the signal through the embedding layer
    x = inputs
    x = self.embedding(x, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    # Pass the signal through the RNN, which may start from an existing
    # state and may or may not be in training mode
    x, states = self.gru(x, initial_state=states, training=training)
    # Finally, pass the signal through the dense layer
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    else:
      return x

In [None]:
# Instantiate the model with the hyperparameters defined earlier.
model = MyModel(
    # Be sure the vocabulary size matches the `StringLookup` layers.
    vocab_size=len(ids_from_chars.get_vocabulary()),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

# Rodando o modelo sem treinamento

In [None]:
# First, just check what the output looks like on a single batch:
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

# Print the layer-by-layer summary of the model.
model.summary()

(64, 100, 66) # (batch_size, sequence_length, vocab_size)
Model: "my_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  16896     
_________________________________________________________________
gru (GRU)                    multiple                  3938304   
_________________________________________________________________
dense (Dense)                multiple                  67650     
Total params: 4,022,850
Trainable params: 4,022,850
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Draw one sample per position from the logits of the first example in the
# batch, then drop the trailing sample axis and convert to a NumPy array.
sampled_indices = tf.squeeze(
    tf.random.categorical(example_batch_predictions[0], num_samples=1),
    axis=-1).numpy()

# Show the input text next to what the untrained model sampled for it.
print("Input:\n", text_from_ids(input_example_batch[0]).numpy())
print()
print("Next Char Predictions:\n", text_from_ids(sampled_indices).numpy())

Input:
 b"hy blood,\nCongeal'd with this, do make me wipe off both.\n3 KING HENRY VI\n\nYORK:\nThe army of the quee"

Next Char Predictions:
 b"o&psxF;meXXIPHjbFlQkmdAL BrTCW$PA--lN PcJ[UNK]bEltCX:eJyE;k.s[UNK]gGNmpfuQaJgZ;.DTewl?S\nJnPgW,&KiQXFe',Ovj-s"


In [None]:
# The model's Dense layer has no activation, so the loss is told that it
# receives logits (from_logits=True).
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

# Evaluate the loss of the untrained model on the example batch.
example_batch_loss = loss(target_example_batch, example_batch_predictions)
mean_loss = example_batch_loss.numpy().mean()
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("Mean loss:        ", mean_loss)

Prediction shape:  (64, 100, 66)  # (batch_size, sequence_length, vocab_size)
Mean loss:         4.1898317


# Treinamento do modelo

In [None]:
# Configure training with the Adam optimizer and the `loss` object.
model.compile(optimizer='adam', loss=loss)

In [None]:
# Folder that receives the training checkpoints
checkpoint_dir = './training_checkpoints'
# Checkpoint file name pattern inside that folder
# (presumably "{epoch}" is filled in by the callback — confirm)
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback that stores only the model weights at the path pattern above.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    save_weights_only=True,
    filepath=checkpoint_prefix)

In [None]:
# Number of training epochs passed to model.fit
EPOCHS = 20

In [None]:
# Train on the batched dataset with the checkpoint callback attached;
# the return value of fit is kept in `history`.
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
class OneStep(tf.keras.Model):
  """Wraps a model and the two lookup layers to sample one character per call.

  Args:
    model: Model called with `inputs`, `states` and `return_state=True`.
    chars_from_ids: Layer mapping ids to characters.
    ids_from_chars: Layer mapping characters to ids.
    temperature: Divisor applied to the logits before sampling.
  """

  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Additive mask over the vocabulary: -inf at the id of "[UNK]" and zero
    # everywhere else, so that "[UNK]" is never sampled.
    unk_ids = self.ids_from_chars(['[UNK]'])[:, None]
    vocab_len = len(ids_from_chars.get_vocabulary())
    unk_mask = tf.SparseTensor(
        indices=unk_ids,
        values=[float('-inf')] * len(unk_ids),
        # One entry per vocabulary item.
        dense_shape=[vocab_len])
    self.prediction_mask = tf.sparse.to_dense(unk_mask)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    """Sample the next character for each input string.

    Args:
      inputs: Strings to continue.
      states: Optional model state carried over from a previous call.

    Returns:
      A tuple `(predicted_chars, states)`.
    """
    # Strings -> characters -> ids, converted to a dense tensor.
    token_ids = self.ids_from_chars(
        tf.strings.unicode_split(inputs, 'UTF-8')).to_tensor()

    # logits has shape [batch, char, next_char_logits]
    logits, states = self.model(inputs=token_ids, states=states,
                                return_state=True)

    # Keep only the last position, scale by the temperature, then add the
    # mask that rules out "[UNK]".
    last_logits = logits[:, -1, :] / self.temperature + self.prediction_mask

    # Sample one id per batch entry and drop the sample axis.
    sampled = tf.random.categorical(last_logits, num_samples=1)
    next_ids = tf.squeeze(sampled, axis=-1)

    # Map the sampled ids back to characters and hand back the model state.
    return self.chars_from_ids(next_ids), states

In [None]:
# Wrap the trained model and both lookup layers for step-by-step generation.
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

Rodar o onestep em loop para gerar texto:

In [None]:
# Measure the elapsed time with time.perf_counter(): it is a monotonic clock
# intended for intervals, whereas time.time() follows the wall clock and can
# jump if the system time is adjusted.
start = time.perf_counter()
states = None
# Seed text; each generated character is fed back in as the next input.
next_char = tf.constant(['ROMEO:'])
result = [next_char]

# Generate 500 characters, carrying the model state between steps.
for n in range(500):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

# Concatenate the seed and all generated characters into one string tensor.
result = tf.strings.join(result)
end = time.perf_counter()
print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print('\nRun time:', end - start)

ROMEO:
Such puttingued graves of us he violent his need,
And by the hand of maids be his life.
Masters all, good request; here it is.
Down with that spring, for, lords on Richard's lord,--

OFFBROKE:
O Clifford, your spirit, in fear it.

LUCIO:
I warrant their touchments, be married.

ANTONIO:
And how my minds to you befate you grace:
And I will trius. Siciling!
Blunt, the matter,--I must confess or grief on't.

MENENIUS:
Go to have so many great estame.

JULIET:
It is not thunder; and so to cut it, d 

________________________________________________________________________________

Run time: 2.8446035385131836


# Salvar o modelo

In [None]:
# Export the one-step generator to the 'one_step' directory, then load it
# back from there.
tf.saved_model.save(one_step_model, 'one_step')
one_step_reloaded = tf.saved_model.load('one_step')





INFO:tensorflow:Assets written to: one_step/assets


INFO:tensorflow:Assets written to: one_step/assets


Rodando o modelo salvo:

In [None]:
# Same generation loop as before, this time using the reloaded model.
states = None
next_char = tf.constant(['ROMEO:'])
result = [next_char]

# Generate 500 characters, carrying the model state between steps.
for n in range(500):
  next_char, states = one_step_reloaded.generate_one_step(next_char, states=states)
  result.append(next_char)

# Join the seed and the generated characters and print the decoded text.
print(tf.strings.join(result)[0].numpy().decode("utf-8"))

ROMEO:
Howest thy tears and swift? a cunning,
I ne'er she were so, death's contexts: 'twere pretteen
Which his true quanis.

ESCALUS:
This is a pather-bawd. They are in a time.
Your fellows to seek an house, ewes had lint--
For hasty man! would you be heard?

EDABESBY:
About her vergels are.

AUTOLYCUS:
If you this wilt that were with honour from my father's
Defe in my tent? marry her able then together
To die Claudio does from the orcasa wide ait
There is no vanian's garments, for my hands
Showing it
