https://www.tensorflow.org/tutorials/sequences/text_generation

In [1]:
import tensorflow as tf
import numpy as np
import os
import time

  from ._conv import register_converters as _register_converters


In [2]:
# Switch TensorFlow 1.x to eager execution before any other TensorFlow call.
tf.enable_eager_execution()
# Keep the version as the last expression: a notebook cell only displays the
# value of its final expression.
tf.__version__

'1.13.1'

In [5]:
# Read the whole corpus into memory as one string. The encoding is pinned so
# the result does not depend on the platform's default locale.
with open('./data/shakespeare.txt', 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [10]:
# Distinct characters of the corpus, in sorted order.
unique_chars = set(text)
vocab = sorted(unique_chars)
len(vocab)

65

#### Preprocessing

In [11]:
# Two lookup tables: character -> integer id, and integer id -> character.
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = np.array(vocab)

# Encode the entire corpus as an array of integer ids.
encoded_chars = [char2idx[ch] for ch in text]
text_as_int = np.array(encoded_chars)

#### Create training examples and targets

In [14]:
seq_length = 100
# Each training example is cut from a chunk of seq_length + 1 characters
# (the input plus the one-position-shifted target), so that is the divisor.
examples_per_epoch = len(text) // (seq_length + 1)

# Stream of individual character ids.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Peek at the first few ids together with the characters they decode to.
for char_id in char_dataset.take(5):
    print(char_id)
    print(idx2char[char_id.numpy()])

Instructions for updating:
Colocations handled automatically by placer.
tf.Tensor(18, shape=(), dtype=int64)
F
tf.Tensor(47, shape=(), dtype=int64)
i
tf.Tensor(56, shape=(), dtype=int64)
r
tf.Tensor(57, shape=(), dtype=int64)
s
tf.Tensor(58, shape=(), dtype=int64)
t


In [15]:
# Group the character stream into chunks of seq_length + 1 ids; an incomplete
# final chunk is dropped.
chunk_size = seq_length + 1
sequences = char_dataset.batch(chunk_size, drop_remainder=True)

for chunk in sequences.take(5):
    decoded = ''.join(idx2char[chunk.numpy()])
    print(repr(decoded))

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'
"now Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us ki"
"ll him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be d"
'one: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor citi'


In [29]:
def split_input_target(chunk):
    """Turn one chunk into an (input, target) pair shifted by one position.

    The input is the chunk without its last element; the target is the chunk
    without its first element.
    """
    return chunk[:-1], chunk[1:]

# Apply the split to every chunk, giving a dataset of (input, target) pairs.
dataset = sequences.map(split_input_target)

In [17]:
# Decode the first (input, target) pair back to text for inspection.
for input_example, target_example in dataset.take(1):
    input_chars = ''.join(idx2char[input_example.numpy()])
    target_chars = ''.join(idx2char[target_example.numpy()])
    print('Input data: ', repr(input_chars))
    print('Target data:', repr(target_chars))

Input data:  'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'
Target data: 'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '


In [18]:
# Walk through the first five time steps: each input id and the id expected next.
first_inputs = input_example[:5]
first_targets = target_example[:5]
for step, (in_id, out_id) in enumerate(zip(first_inputs, first_targets)):
    print("Step {:4d}".format(step))
    print("  input: {} ({:s})".format(in_id, repr(idx2char[in_id])))
    print("  expected output: {} ({:s})".format(out_id, repr(idx2char[out_id])))

Step    0
  input: 18 ('F')
  expected output: 47 ('i')
Step    1
  input: 47 ('i')
  expected output: 56 ('r')
Step    2
  input: 56 ('r')
  expected output: 57 ('s')
Step    3
  input: 57 ('s')
  expected output: 58 ('t')
Step    4
  input: 58 ('t')
  expected output: 1 (' ')


#### Create training batches

In [30]:
BATCH_SIZE = 64
# Buffer size to shuffle the dataset.
# (tf.data is designed to work with possibly infinite sequences, so it does
# not attempt to shuffle the entire sequence in memory. Instead, it maintains
# a buffer in which it shuffles elements.)
BUFFER_SIZE = 1000

steps_per_epoch = examples_per_epoch // BATCH_SIZE

# Build the batched pipeline from `sequences` rather than from `dataset`
# itself, so re-running this cell does not batch already-batched data.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)
dataset

<DatasetV1Adapter shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

#### Build Model

In [20]:
# Size of the character vocabulary.
vocab_size = len(vocab)
# Dimensionality of the character embedding vectors.
embedding_dim = 256
# Number of units in the recurrent layer.
rnn_units = 1024

Note that we're using a stateful RNN.  
A nice tutorial on `stateful` RNNs: https://fairyonice.github.io/Stateful-LSTM-model-training-in-Keras.html

In [26]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the character-level model: Embedding -> stateful GRU -> Dense.

    Args:
        vocab_size: number of distinct characters; used as the embedding input
            dimension and as the width of the output layer.
        embedding_dim: size of each embedding vector.
        rnn_units: number of GRU units.
        batch_size: fixed batch dimension baked into the model's input shape.

    Returns:
        A `tf.keras.Sequential` model made of the three layers above.
    """
    embedding = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[batch_size, None],
    )
    # Note: the GRU is created with stateful=True.
    recurrent = tf.keras.layers.GRU(
        rnn_units,
        recurrent_activation='sigmoid',
        return_sequences=True,
        recurrent_initializer='glorot_uniform',
        stateful=True,
    )
    output = tf.keras.layers.Dense(vocab_size)
    return tf.keras.Sequential([embedding, recurrent, output])

# Model used for training, built with the training batch size.
model = build_model(vocab_size, embedding_dim, rnn_units, BATCH_SIZE)

In [34]:
# Run a single batch through the model to check the shape of its output.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    shape_note = "# (batch_size, sequence_length, vocab_size)"
    print(example_batch_predictions.shape, shape_note)

(64, 100, 65) # (batch_size, sequence_length, vocab_size)


In [33]:
# Print the layer-by-layer summary of the model.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (64, None, 256)           16640     
_________________________________________________________________
gru (GRU)                    (64, None, 1024)          3935232   
_________________________________________________________________
dense (Dense)                (64, None, 65)            66625     
Total params: 4,018,497
Trainable params: 4,018,497
Non-trainable params: 0
_________________________________________________________________


To get actual predictions from the model we need to sample from the output distribution, to get actual character indices. This distribution is defined by the logits over the character vocabulary. 

Note: It is important to _sample_ from this distribution as taking the _argmax_ of the distribution can easily get the model stuck in a loop.

Try it for the first example in the batch:

In [44]:
# Draw one sample per time step from the logits of the first example in the
# batch, then drop the trailing sample axis and convert to a NumPy array.
first_example_logits = example_batch_predictions[0]
sampled_indices = tf.squeeze(
    tf.random.categorical(first_example_logits, 1),
    axis=-1,
).numpy()
sampled_indices

array([46, 61,  7, 38, 21, 24, 35, 36, 14, 42, 53, 50, 15,  6, 22, 23, 14,
       22,  4, 37, 41, 16, 16, 54, 11, 47, 49, 17, 20, 46, 14, 30, 61, 48,
       37, 57, 54, 35, 59, 57, 60, 33,  3, 22, 22, 32,  0, 20,  1, 36, 56,
       22, 64, 23, 21, 35,  4, 44, 53, 60, 43, 26, 16, 27, 61, 20, 38, 34,
       59, 51, 55, 63, 28, 54, 21, 18,  0, 53, 35, 57, 15, 47, 54, 31, 60,
       64, 22, 26, 11, 42, 56, 46,  5, 19, 46, 15, 58, 62, 49, 29])

In [46]:
def loss(labels, logits):
    """Sparse categorical cross-entropy computed on raw logits.

    Args:
        labels: the target values passed through as the first argument.
        logits: model outputs, treated as logits (``from_logits=True``).

    Returns:
        Whatever `tf.keras.losses.sparse_categorical_crossentropy` returns
        for these arguments.
    """
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy
    return crossentropy(labels, logits, from_logits=True)

# Configure training: Adam with its default settings, and the `loss` function.
adam = tf.train.AdamOptimizer()
model.compile(loss=loss, optimizer=adam)

In [49]:
# Checkpoints: weights-only files written under `checkpoint_dir`.
checkpoint_dir = './checkpoints/'
# Derive the prefix from checkpoint_dir so the two paths cannot drift apart;
# the `{epoch}` placeholder is filled in with the epoch number on save.
checkpoint_prefix = os.path.join(checkpoint_dir, 'text_generation_{epoch}')
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

#### Train

In [50]:
# Number of epochs passed to model.fit.
EPOCHS = 3

In [51]:
# The dataset is repeated, and each epoch is bounded by `steps_per_epoch`.
training_data = dataset.repeat()
history = model.fit(
    training_data,
    epochs=EPOCHS,
    steps_per_epoch=steps_per_epoch,
    shuffle=False,
    callbacks=[checkpoint_callback],
)

Epoch 1/3
Instructions for updating:
Use tf.train.CheckpointManager to manage checkpoints rather than manually editing the Checkpoint proto.
Epoch 2/3
Epoch 3/3


#### Generate text

To keep this prediction step simple, use a batch size of 1.

Because of the way the RNN state is passed from timestep to timestep, the model only accepts a fixed batch size once built. 

To run the model with a different `batch_size`, we need to rebuild the model and restore the weights from the checkpoint.


In [54]:
# Show the path prefix of the newest checkpoint found in checkpoint_dir.
tf.train.latest_checkpoint(checkpoint_dir)

'./checkpoints/text_generation_3'

In [59]:
# Rebuild the same architecture with a batch size of 1, then load the weights
# of the most recent checkpoint into it.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
model.load_weights(latest_checkpoint)

<tensorflow.python.training.checkpointable.util.CheckpointLoadStatus at 0xb3183f710>

In [69]:
num_generate = 1000
start_string = 'ROMEO: '

# Encode the seed string as ids and add a leading batch axis.
seed_ids = [char2idx[ch] for ch in start_string]
input_eval = tf.expand_dims(seed_ids, 0)

text_generated = []

# Start from a clean recurrent state, then generate one character per step.
model.reset_states()
for _ in range(num_generate):
    # Drop the batch axis so the logits are (time steps, vocabulary).
    logits = tf.squeeze(model(input_eval), axis=0)

    # Sample for every time step, but keep only the sample for the last one.
    sampled = tf.random.categorical(logits, num_samples=1)
    predicted_id = sampled[-1, 0]

    text_generated.append(idx2char[predicted_id])

    # The sampled character becomes the only input of the next step.
    input_eval = tf.expand_dims([predicted_id], 0)

start_string + ''.join(text_generated)

"ROMEO: with goft die but\nA somains ponsulan one nibe:\nHe as it night ant nobleman dool mark not make essect\nThat disco. Hels I motter.\n\nCOMIILANUS:\nAy, I wall prove to promp of these tomd:\nAnd why then eeties have ames, the peruce? wast nole, O'll pety't here ap to\nAngolownd holdraly, by to him eams\nHomstunge of grous his buakes dofe, and thee, 'tis sir, his cancest,\nI have a gutient to such a tidness and for miss\nAnd we provest them not chemrings from the best mants shollem.\n\nFordeat:\nFor his raulise, and good you strange\nTo ach mane the but doging ryems, as anse, she betand.\n\nTINCENII:\nThy bate as veaces, thy inliact to you,\nOr well and oldible themble, sorsward,'-\nPENENIUS:\nNo, let's sir, hele, for the if\nthings wat you the heads will for's horese,\nYour hatten Ispice acking and shall bester\nAs sibuld prisons, I had you sin,\nHe do you mice, and my lord, and comes of not these: for. Signon hewer'd thithel\nAnd hathering bience. Came, we hold\nnot, prove here 

In [71]:
# Print (rather than display the repr) so newlines render as line breaks.
generated_text = start_string + ''.join(text_generated)
print(generated_text)

ROMEO: with goft die but
A somains ponsulan one nibe:
He as it night ant nobleman dool mark not make essect
That disco. Hels I motter.

COMIILANUS:
Ay, I wall prove to promp of these tomd:
And why then eeties have ames, the peruce? wast nole, O'll pety't here ap to
Angolownd holdraly, by to him eams
Homstunge of grous his buakes dofe, and thee, 'tis sir, his cancest,
I have a gutient to such a tidness and for miss
And we provest them not chemrings from the best mants shollem.

Fordeat:
For his raulise, and good you strange
To ach mane the but doging ryems, as anse, she betand.

TINCENII:
Thy bate as veaces, thy inliact to you,
Or well and oldible themble, sorsward,'-
PENENIUS:
No, let's sir, hele, for the if
things wat you the heads will for's horese,
Your hatten Ispice acking and shall bester
As sibuld prisons, I had you sin,
He do you mice, and my lord, and comes of not these: for. Signon hewer'd thithel
And hathering bience. Came, we hold
not, prove here of it. She eles.

CORINIU
Se

## Advanced: Customized Training

The above training procedure is simple, but does not give you much control.

So now that you've seen how to run the model manually, let's unpack the training loop and implement it ourselves. This gives a starting point if, for example, you want to implement _curriculum learning_ to help stabilize the model's open-loop output. 

We will use `tf.GradientTape` to track the gradients. You can learn more about this approach by reading the [eager execution guide](https://www.tensorflow.org/guide/eager).

The procedure works as follows:

* First, initialize the RNN state. We do this by calling the `tf.keras.Model.reset_states` method.

* Next, iterate over the dataset (batch by batch) and calculate the *predictions* associated with each.

* Open a `tf.GradientTape`, and calculate the predictions and loss in that context.

* Calculate the gradients of the loss with respect to the model variables using the `tf.GradientTape.gradient` method.

* Finally, take a step downwards by using the optimizer's `tf.train.Optimizer.apply_gradients` method.



In [73]:
# New model instance for the hand-written training loop, built with the
# training batch size, plus the optimizer that will apply the gradients.
model = build_model(len(vocab), embedding_dim, rnn_units, BATCH_SIZE)

optimizer = tf.train.AdamOptimizer()

In [None]:
EPOCHS = 1

for epoch in range(EPOCHS):
    start = time.time()
    
    hidden = model.reset_states()
    
    for (batch_n, (inp, target)) in enumerate(dataset):
        with tf.GradientTape() as tape:
            predictions = model(inp)
            loss = tf.losses.sparse_softmax_cross_entropy(target, predictions)
        
        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))