# Text generation with an RNN
https://www.tensorflow.org/tutorials/text/text_generation

In [4]:
import os
import time
import json

import tensorflow as tf
import numpy as np

In [5]:
dataset = tf.keras.utils.get_file(
    "shakespeare.txt",
    "https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt"
)

In [6]:
# Read the whole corpus as text. The context manager closes the file handle
# deterministically, and the explicit encoding avoids depending on the
# platform's default codec.
with open(dataset, "r", encoding="utf-8") as corpus_file:
    text = corpus_file.read()
print(text[:250])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



In [7]:
# Every distinct character in the corpus, in sorted order.
vocabulary = list(set(text))
vocabulary.sort()
print(len(vocabulary), "unique characters in dataset")
print(vocabulary)

65 unique characters in dataset
['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [8]:
# Lookup table: character -> integer id (its position in the sorted vocabulary).
character_to_index = dict(zip(vocabulary, range(len(vocabulary))))
# Reverse lookup: integer id -> character, kept as an array so that an array
# of ids can be decoded in one indexing operation.
index_to_character = np.array(vocabulary)

# The entire corpus encoded as one flat array of integer ids.
vectorized_dataset = np.array(
    [character_to_index[symbol] for symbol in text]
)

In [9]:
print(json.dumps(character_to_index, indent=4)[:100] + "...")

{
    "\n": 0,
    " ": 1,
    "!": 2,
    "$": 3,
    "&": 4,
    "'": 5,
    ",": 6,
    "-": 7,
 ...


In [10]:
print("Character to integer mapping example")
print(text[:13])
print(vectorized_dataset[:13])

Character to integer mapping example
First Citizen
[18 47 56 57 58  1 15 47 58 47 64 43 52]


In [11]:
maximum_sequence_length = 100
examples_per_epoch = len(text) // (maximum_sequence_length + 1)
print(f"Training with {examples_per_epoch} examples per epoch")

Training with 11043 examples per epoch


In [12]:
dataset_helper = tf.data.Dataset.from_tensor_slices(vectorized_dataset)
for i in dataset_helper.take(5):
    print(index_to_character[i.numpy()])

F
i
r
s
t


In [13]:
sequences = dataset_helper.batch(
    maximum_sequence_length + 1,
    drop_remainder=True
)
for item in sequences.take(5):
    print(repr("".join(index_to_character[item.numpy()])))

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'
"now Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us ki"
"ll him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be d"
'one: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor citi'


In [14]:
def split_input_target(sequence):
    """Split a sequence into an (input, target) pair offset by one step.

    The input is every element except the last and the target is every
    element except the first, so target[i] is the element that follows
    input[i] in the original sequence.
    """
    return sequence[:-1], sequence[1:]

prepared_dataset = sequences.map(split_input_target)
prepared_dataset

<MapDataset shapes: ((100,), (100,)), types: (tf.int32, tf.int32)>

In [15]:
# Decode the first (input, target) pair back to text to show the one-step offset.
for input_example, target_example in prepared_dataset.take(1):
    input_text = "".join(index_to_character[input_example.numpy()])
    target_text = "".join(index_to_character[target_example.numpy()])
    print("Input data:", repr(input_text))
    print("Target data:", repr(target_text))

Input data: 'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'
Target data: 'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '


In [16]:
BATCH_SIZE = 64
BUFFER_SIZE = 10000

shuffled_dataset = (
    prepared_dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

shuffled_dataset

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int32, tf.int32)>

In [17]:
vocabulary_size = len(vocabulary)
# Tutorial had the embedding dimension at 256, but after looking up some
# metrics and what it should be based on, I decided to drop it down to 64.
# See https://en.wikipedia.org/wiki/Word2vec#Dimensionality
# Also https://datascience.stackexchange.com/a/48194
embedding_dimension = 64
rnn_units = 1024

In [19]:
checkpoint_dir = "./training-checkpoints/text-generation-with-an-rnn"
def build_model(vocabulary_size, embedding_dimension, rnn_units, batch_size):
    """Assemble the character-level model: Embedding -> GRU -> Dense.

    Args:
        vocabulary_size: Number of distinct character ids; used both as the
            embedding input size and as the width of the output layer.
        embedding_dimension: Width of each character embedding vector.
        rnn_units: Number of units in the GRU layer.
        batch_size: Fixed batch size baked into the model's input shape
            (the GRU is created with stateful=True).

    Returns:
        An uncompiled tf.keras.Sequential model. The final Dense layer has no
        activation, so it emits one raw score per vocabulary entry for every
        time step.
    """
    embedding_layer = tf.keras.layers.Embedding(
        vocabulary_size,
        embedding_dimension,
        batch_input_shape=[batch_size, None],
    )
    recurrent_layer = tf.keras.layers.GRU(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer="glorot_uniform",
    )
    output_layer = tf.keras.layers.Dense(vocabulary_size)
    return tf.keras.Sequential([embedding_layer, recurrent_layer, output_layer])

model = build_model(
    vocabulary_size=vocabulary_size,
    embedding_dimension=embedding_dimension,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE,
)
# Resume from the most recent checkpoint when one exists. On a fresh run the
# checkpoint directory is empty, tf.train.latest_checkpoint returns None, and
# the model simply keeps its random initialisation instead of crashing.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is not None:
    model.load_weights(latest_checkpoint)
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (64, None, 64)            4160      
_________________________________________________________________
gru_1 (GRU)                  (64, None, 1024)          3348480   
_________________________________________________________________
dense_1 (Dense)              (64, None, 65)            66625     
Total params: 3,419,265
Trainable params: 3,419,265
Non-trainable params: 0
_________________________________________________________________


In [20]:
for input_batch, target_batch in shuffled_dataset.take(1):
    predictions = model(input_batch)
    print(predictions.shape, "# (batch_size, sequence_length, vocabulary_size)")

(64, 100, 65) # (batch_size, sequence_length, vocabulary_size)


In [21]:
# Apparently random sampling should be used rather than argmax to avoid loops.
# So this piece of code treats the output values as logits (unnormalized
# log-probabilities) and samples from that distribution, rather than just
# choosing the one that's highest.
sampled_indices = tf.random.categorical(predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()
sampled_indices

array([25, 21, 34, 20, 39, 30, 16,  1, 21, 21, 21, 10,  0, 22, 52,  1,  1,
       61, 44,  1, 52, 53, 59,  1,  1, 43, 50, 60,  1, 57,  1, 50, 53, 55,
       53, 51, 40, 43, 43, 58, 41, 43, 12, 39, 47, 43, 52, 45, 57, 47, 53,
       59,  1,  1,  6, 50, 60,  0,  0, 32, 19, 33, 17, 17, 26,  1, 25, 24,
       21, 38, 13, 14, 17, 32, 20, 10,  0, 26, 43, 58,  1, 39, 46, 39, 59,
       45, 57, 53, 57, 57, 58,  1, 57, 43, 57, 50,  5, 46, 63,  1],
      dtype=int64)

In [22]:
# Convert the tensor to a NumPy array before using it as an index, as the
# other decoding cells do, rather than relying on implicit conversion.
print("Input:\n", repr("".join(index_to_character[input_batch[0].numpy()])), "\n")
print("Output:\n", repr("".join(index_to_character[sampled_indices])))

Input:
 " RICHARD III:\nAy, if yourself's remembrance wrong yourself.\n\nQUEEN ELIZABETH:\nBut thou didst kill my" 

Output:
 "MIVHaRD III:\nJn  wf nou  elv s loqombeetce?aiengsiou  ,lv\n\nTGUEEN MLIZABETH:\nNet ahaugsosst sesl'hy "


In [23]:
def loss(labels, logits):
    """Sparse categorical cross-entropy computed on raw logits.

    Args:
        labels: Integer class ids (the target characters).
        logits: Unnormalized model outputs; from_logits=True makes the loss
            function apply the normalisation itself.

    Returns:
        The tensor returned by
        tf.keras.losses.sparse_categorical_crossentropy; no extra reduction
        is applied here.
    """
    return tf.keras.losses.sparse_categorical_crossentropy(
        y_true=labels,
        y_pred=logits,
        from_logits=True,
    )

batch_loss = loss(target_batch, predictions)
print("Predictions shape (batch_size, sequence_length, vocabulary_size)")
print(predictions.shape, "\n")
print("scalar_loss:", batch_loss.numpy().mean())

Predictions shape (batch_size, sequence_length, vocabulary_size)
(64, 100, 65) 

scalar_loss: 1.310571


In [24]:
model.compile(optimizer="adam", loss=loss)

In [25]:
checkpoint_prefix = os.path.abspath(
    os.path.join(checkpoint_dir, "ckpt_{epoch}")
)

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True,
)

In [28]:
EPOCHS = 30

In [29]:
history = model.fit(
    shuffled_dataset,
    epochs=EPOCHS,
    callbacks=[checkpoint_callback],
)

Train for 172 steps
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [30]:
# Rebuild the network with a batch size of 1 for generation, then restore the
# trained weights from the most recent checkpoint.
model = build_model(
    vocabulary_size,
    embedding_dimension,
    rnn_units,
    batch_size=1,
)
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    # tf.train.latest_checkpoint returns None when nothing has been saved;
    # fail with a clear message instead of passing None to load_weights.
    raise FileNotFoundError(
        f"No checkpoint found in {checkpoint_dir!r}; train the model first."
    )
model.load_weights(latest_checkpoint)
model.build(tf.TensorShape([1, None]))
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (1, None, 64)             4160      
_________________________________________________________________
gru_2 (GRU)                  (1, None, 1024)           3348480   
_________________________________________________________________
dense_2 (Dense)              (1, None, 65)             66625     
Total params: 3,419,265
Trainable params: 3,419,265
Non-trainable params: 0
_________________________________________________________________


In [31]:
def generate_text(model, start_string, characters_to_generate=1000, temperature=1.0):
    """Generate text by repeatedly sampling the next character from the model.

    Args:
        model: Model to sample from. It is called on a (1, n) batch of
            character ids and its recurrent state is reset before generation
            starts, so it is expected to be the stateful, batch-size-1 model.
        start_string: Non-empty seed text. Every character must be a key of
            character_to_index.
        characters_to_generate: How many characters to sample after the seed.
        temperature: Positive divisor applied to the logits before sampling.
            Values below 1 sharpen the distribution (more predictable text);
            values above 1 flatten it (more surprising text).

    Returns:
        start_string followed by the generated characters.

    Raises:
        ValueError: If start_string is empty, characters_to_generate is
            negative, or temperature is not positive.
        KeyError: If start_string contains a character that is not in
            character_to_index.
    """
    # Validate up front so bad arguments fail with a clear message instead of
    # an opaque error from inside the model or the sampler.
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if characters_to_generate < 0:
        raise ValueError("characters_to_generate must be non-negative")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    input_eval = [
        character_to_index[character]
        for character in start_string
    ]
    # tf.expand_dims inserts a dimension at the specified index.
    # In this case it converts our shape from (n,) to (1, n).
    input_eval = tf.expand_dims(input_eval, 0)

    generated_output = []

    model.reset_states()
    for _ in range(characters_to_generate):
        predictions = model(input_eval)
        # tf.squeeze here does the opposite of tf.expand_dims:
        # it drops the leading batch dimension of size 1.
        predictions = tf.squeeze(predictions, 0)

        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)
        # One sample is drawn per time step; only the last step's sample is
        # the next character.
        predicted_id = predicted_id[-1, 0].numpy()

        # Pass in the predicted character as input on the next round
        input_eval = tf.expand_dims([predicted_id], 0)
        generated_output.append(index_to_character[predicted_id])

    return start_string + "".join(generated_output)

In [32]:
print(generate_text(model, start_string="ROMEO: "))

ROMEO: Catulban
And prince is call'd friendship: 'tis the fapalioor opes
And plague in mighty fenLy my service or speed
Whether for partly to rest to-night,
Intending two swore to be part.

KING RICHARD II:
Thanks, good Lord deliver, you much.

SAMPSON:
Let us like a great day will hunt this wolk and leave and happy prove
A serve the garments name my fortunes to we know your royal rest!
And yet I come in; but old dangers.
Would not they seek again: if any be, if he calls?

First Soldier:
Nor I, boys.

Second Keeper:
Help, nerd-applauded in thy veins,
That companious villain in thy waychings I left to us,
And every day to cure this feat, or dies
Give signifies the have with him, proud attession
To the present de you have dark declined;
And high A man of worship,
When, fairs a fortunation?

FLORIZEL:
We were fit
To the poor king's demands will out of such
d was here.
You swear to then?

ANGELO:
He that hath been with the other credal order out, Romeo may die but fast By me, I, being aged