<a href="https://colab.research.google.com/github/Kozzlov/nlp_tf_text_generation_rnn/blob/main/nlp_tf_text_generation_rnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing
import numpy as np 
import os
import time

#download the Shakespeare corpus; get_file returns the local path of the file
path_to_file = tf.keras.utils.get_file(
    'shakespeare.txt',
    'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

#read the raw bytes and decode them explicitly as UTF-8; the context manager
#closes the file handle deterministically instead of leaving it open
with open(path_to_file, 'rb') as corpus_file:
  text = corpus_file.read().decode(encoding='utf-8')
#length of the text is the number of characters in it
print('Length of the: {} characters'.format(len(text)))
#the vocabulary is the sorted list of unique characters in the corpus
vocab = sorted(set(text))
print('{} unique characters'.format(len(vocab)))

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt
Length of the: 1115394 characters
65 unique characters


In [3]:
#text processing
#vectorising the text: split every example string into its characters
example_texts = ['abcdefg', 'xyz']
chars = tf.strings.unicode_split(example_texts, input_encoding='UTF-8')

#StringLookup maps each character to an integer id
ids_from_chars = preprocessing.StringLookup(vocabulary=list(vocab))
ids = ids_from_chars(chars)

#an inverted StringLookup recovers human-readable characters from the ids;
#it is built from get_vocabulary() of the forward layer so that both layers
#share the same id <-> character mapping (same class as above, referenced
#through the `preprocessing` alias imported at the top of the notebook)
chars_from_ids = preprocessing.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(), invert=True)
chars = chars_from_ids(ids)

#tf.strings.reduce_join joins the characters back into strings
tf.strings.reduce_join(chars, axis=-1).numpy()
def text_from_ids(ids):
  """Decode character ids into text by joining the looked-up characters along the last axis."""
  decoded_chars = chars_from_ids(ids)
  return tf.strings.reduce_join(decoded_chars, axis=-1)

In [4]:
#the prediction task
#convert the whole corpus into one long tensor of character ids
all_ids = ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

#preview the first five characters; the loop variable is not called `ids`
#so that it does not overwrite the `ids` tensor created in the vectorising cell
for char_id in ids_dataset.take(5):
  print(chars_from_ids(char_id).numpy().decode('utf-8'))

seq_length = 100
examples_per_epoch = len(text)//(seq_length+1)

#group the ids into chunks of seq_length+1 so that each chunk can later be
#split into an input and a target of seq_length characters each;
#drop_remainder discards a final chunk that would be shorter
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)
# for seq in sequences.take(1):
#   print(chars_from_ids(seq))
# for seq in sequences.take(5):
#   print(text_from_ids(seq).numpy())

def split_input_target(sequences):
  """Return an (input, target) pair: everything but the last element, and everything but the first."""
  return sequences[:-1], sequences[1:]
# split_input_target(list("Tensorflow"))

# Turn every chunk of ids into an (input, target) pair via split_input_target.
dataset = sequences.map(split_input_target)

# Decode and print the first pair; per the recorded output the target is the
# input shifted by one character.
for input_example, target_example in dataset.take(1):
  print("Input :", text_from_ids(input_example).numpy())
  print("Target:", text_from_ids(target_example).numpy())

F
i
r
s
t
Input : b'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'
Target: b'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '


In [5]:
#create training batches
BATCH_SIZE = 64
BUFFER_SIZE = 10000
#buffersize to shuffle the dataset
#(TF data is designed to work with possibly infinite sequences,
#so it doesn't attempt to shuffle the entire sequence in memory. Instead,
#it maintains a buffer in which it shuffles elements).

#the pipeline is rebuilt from `sequences` instead of re-assigning `dataset`
#from itself: this keeps the cell idempotent, so running it a second time
#does not batch an already batched dataset
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE))

dataset

<PrefetchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

In [6]:
#build the model
#number of unique characters in the corpus
#NOTE(review): the cell that instantiates the model passes
#len(ids_from_chars.get_vocabulary()) instead of this value (67 vs. 65 in the
#recorded outputs), so this variable looks unused - confirm before relying on it
vocab_size = len(vocab)
#the embedding dimension
embedding_dim = 256
#number of rnn units
rnn_units = 1024

class Model(tf.keras.Model):
  """Character-level language model: Embedding -> GRU -> Dense (logits)."""

  def __init__(self, vocab_size, embedding_dim, rnn_units):
    """Create the three layers.

    Args:
      vocab_size: number of distinct token ids; used as the embedding input
        size and as the number of output logits.
      embedding_dim: size of each embedding vector.
      rnn_units: number of GRU units.
    """
    # super() already binds the instance; passing `self` again would hand the
    # base initializer a stray positional argument.
    super().__init__()
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    # return_sequences: one output per timestep; return_state: also return the
    # final state so generation can continue from it.
    self.gru = tf.keras.layers.GRU(
        rnn_units,
        return_sequences=True,
        return_state=True)
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    """Run a forward pass.

    Args:
      inputs: batch of token ids.
      states: optional GRU state to start from; when None the GRU's initial
        state is used.
      return_state: when True, also return the final GRU state.
      training: forwarded to every layer.

    Returns:
      The logits, shaped (batch_size, sequence_length, vocab_size), or the
      tuple (logits, states) when return_state is True.
    """
    x = self.embedding(inputs, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    return x

In [7]:
model = Model(
    #the vocabulary size must match the StringLookup layers: it is taken from
    #ids_from_chars.get_vocabulary() rather than len(vocab)
    #(67 vs. 65 in the recorded outputs)
    vocab_size = len(ids_from_chars.get_vocabulary()),
    embedding_dim = embedding_dim,
    rnn_units = rnn_units)

# For each character the model looks up the embedding,
# runs the GRU one timestep with the embedding as input,
# and applies the dense layer to generate logits predicting 
# the log-likelihood of the next character:


In [8]:
#run the untrained model on one batch and check the shape of its output;
#the batch and the predictions are kept as globals because later cells
#(sampling and the loss check) reuse them
for input_example_batch, target_example_batch in dataset.take(1):
  example_batch_predictions = model(input_example_batch)
  print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 67) # (batch_size, sequence_length, vocab_size)


In [9]:
#print the per-layer parameter counts of the model
#NOTE(review): presumably requires the forward pass in the previous cell to have run first - confirm
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  17152     
_________________________________________________________________
gru (GRU)                    multiple                  3938304   
_________________________________________________________________
dense (Dense)                multiple                  68675     
Total params: 4,024,131
Trainable params: 4,024,131
Non-trainable params: 0
_________________________________________________________________


In [10]:
#sample one next-character id per timestep for the first example in the batch
first_example_logits = example_batch_predictions[0]
sampled_indices = tf.squeeze(
    tf.random.categorical(first_example_logits, num_samples=1),
    axis=-1).numpy()
#decode the input and the characters predicted by the untrained model
print("Input:\n", text_from_ids(input_example_batch[0]).numpy())
print()
print("Next char predictions:\n", text_from_ids(sampled_indices).numpy())

Input:
 b" I fly to 'scape their hands?\nAh, tutor, look where bloody Clifford comes!\n\nCLIFFORD:\nChaplain, away"

Next char predictions:
 b"$f -J.loGD'NO::Ta& ugUzSnPQqm!v'-pnHBezDrPK!zj',KHu!UBEq?Bsl3ajI?[UNK]gIvoNs?Lu:$ kFf\njIx,aOrg;aIwLQ"


In [11]:
#train the model
#the model outputs logits, hence from_logits=True
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
#loss of the untrained model on the example batch from the shape-check cell
example_batch_loss = loss(target_example_batch, example_batch_predictions)
mean_loss = example_batch_loss.numpy().mean()
print("pred shape: ", example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")
print("mean loss:  ", mean_loss)
#sanity check for the untrained model: exp(mean loss) should be close to the
#vocabulary size (exp(4.206) is about 67 in the recorded run)
#NOTE(review): this is not the last statement of the cell, so its value is
#computed but not displayed
tf.exp(mean_loss).numpy()

model.compile(optimizer='adam', loss=loss)

#directory in which the checkpoints are saved during training
checkpoint_dir = './training_checkpoints'
#checkpoint file name pattern
#NOTE(review): "{epoch}" is presumably filled in by ModelCheckpoint - confirm
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

#save only the weights, not the full model
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath = checkpoint_prefix,
    save_weights_only = True)

pred shape:  (64, 100, 67) # (batch_size, sequence_length, vocab_size)
mean loss:   4.206151


In [24]:
#execute training
#NOTE(review): this cell's execution count (24) does not follow the previous
#cell (11); confirm the notebook still reproduces with Restart & Run All
EPOCHS = 25 
#checkpoint_callback is passed to fit so that checkpoints are written while training runs
history = model.fit(dataset, 
                    epochs=EPOCHS,
                    callbacks=[checkpoint_callback])

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [25]:
#generate text 
class Step(tf.keras.Model):
  """Single-step text generator built on a trained model and its lookup layers."""

  def __init__(self, model, chars_froom_ids, ids_from_chars, temperature=1.0):
    """Store the model and lookup layers and build the prediction mask.

    Args:
      model: model called as model(inputs=..., states=..., return_state=True)
        and returning (logits, states).
      chars_froom_ids: (sic) layer mapping token ids back to characters. The
        misspelled parameter name is kept so existing keyword callers keep
        working.
      ids_from_chars: layer mapping characters to token ids.
      temperature: divisor applied to the logits before sampling.
    """
    super().__init__()
    self.temperature = temperature
    self.model = model
    # Use the argument that was passed in, not a notebook global that happens
    # to carry the correctly spelled name.
    self.chars_from_ids = chars_froom_ids
    self.ids_from_chars = ids_from_chars

    #create a mask to prevent "" or "[UNK]" from being generated
    skip_ids = self.ids_from_chars(['', '[UNK]'])[:, None]
    sparse_mask = tf.SparseTensor(
        # put -inf at each bad index
        values=[-float('inf')]*len(skip_ids),
        indices=skip_ids,
        # match the shape to the vocabulary
        dense_shape=[len(ids_from_chars.get_vocabulary())])
    self.prediction_mask = tf.sparse.to_dense(sparse_mask)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    """Sample the next character for every string in `inputs`.

    Args:
      inputs: batch of strings to continue.
      states: model state returned by the previous step, or None to start.

    Returns:
      A tuple (predicted_chars, states): one sampled character per input
      string and the model state to pass to the next call.
    """
    #convert strings to token ids
    input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
    input_ids = self.ids_from_chars(input_chars).to_tensor()
    #run the model
    #predicted_logits.shape is [batch, char, next_char_logits]
    predicted_logits, states = self.model(inputs=input_ids,
                                          states=states,
                                          return_state=True)
    #use only the last prediction
    predicted_logits = predicted_logits[:, -1, :]
    #scale by the temperature; the result must be assigned back to
    #predicted_logits, otherwise the temperature has no effect on sampling
    predicted_logits = predicted_logits/self.temperature
    #apply the prediction mask: prevent "" or "[UNK]" from being generated
    predicted_logits = predicted_logits + self.prediction_mask
    #sample the output logits to generate token ids
    predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
    predicted_ids = tf.squeeze(predicted_ids, axis=1)
    #convert from token ids to characters
    predicted_chars = self.chars_from_ids(predicted_ids)
    #return the characters and model state
    return predicted_chars, states

In [26]:
#wrap the trained model and both lookup layers in the single-step generator
one_step_model = Step(model, chars_from_ids, ids_from_chars)
#try to generate text in a loop
start = time.time()
#no state yet: the first call starts from states=None
states = None
#seed text that the generated text continues from
next_char = tf.constant(['ROMEO:'])
result = [next_char]

#generate 1000 characters, feeding each sampled character and the returned
#state back into the next step
for n in range(1000):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

#concatenate the seed and all generated characters into one string per batch entry
result = tf.strings.join(result)
end = time.time()

print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
#elapsed time comes from time.time(), i.e. seconds
print(f"\nRun time: {end - start}")

ROMEO:
Menerity, my meed hath gentle a vole.

Nurse:
My bushles will mark you this earth again,
Not Mortageous Cautia? she must needs above;
But execution, am I at their backs,
Spuking him from thence, that this hard hate his brows:
For never was my follying to the worse.
Here, Awaking, that he here.

DORCAS:
If the law which way, though we will confess
I then lure and long tribute's majesty
To strive the kities of doom and neighbours.
Now, nurse, go thy way, into me,
And what you will, if she be, 'sint thou keep her
Solicits, and with him on his breast, and joy
The more envointed of his lady's heir!

TRANIO:
Fear me not.

EXETER:
No mother, boy.

DUKE VINCENTIO:
Do you thanks.

DUKE VINCENTIO:
He followeds: defend us breathe against the house
Ireld that be revenged on those I am,
For her no deep desperate which you have
To hear thee say and you my servants
From our better to the blore. Although
I do not like the loss of mine,
Which burn in meaning, if thou art all us.

ROMEO:
Good art