In [None]:
import numpy as np
import os
import time

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,LSTM,GRU,Embedding

tf.__version__

## Tensorflow Text generation

#### A character-based LSTM or GRU

* The dataset is of Shakespeare's writing from Andrej Karpathy's [The Unreasonable Effectiveness of Recurrent Neural Networks](http://karpathy.github.io/2015/05/21/rnn-effectiveness/). 

* Given a sequence of characters from this data ("Shakespear"), train a model to predict the next character in the sequence ("e"). Longer sequences of text can be generated by calling the model repeatedly.

#### Sample output 

* The model was trained for 30 epochs, and started with the string "Q":

<pre>
QUEENE:
I had thought thou hadst a Roman; for the oracle,
Thus by All bids the man against the word,
Which are so weak of care, by old care done;
Your children were in your holy love,
And the precipitation through the bleeding throne.

BISHOP OF ELY:
Marry, and will, my lord, to weep in such a one were prettiest;
Yet now I was adopted heir
Of the world's lamentable day,
To watch the next way with his father with his face?

ESCALUS:
The cause why then we are all resolved more sons.

VOLUMNIA:
O, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, it is no sin it should be dead,
And love and pale as any will to that word.

QUEEN ELIZABETH:
But how long have I heard the soul for this world,
And show his hands of life be proved to stand.

PETRUCHIO:
I say he look'd on, if I must be content
To stay him from the fatal of our country's bliss.
His lordship pluck'd from this sentence then for prey,
And then let us twain, being the moon,
were she such a case as fills m
</pre>

* While some of the sentences are grammatical, most do not make sense. The model has not learned the meaning of words, but consider:

    - The model is character-based. When training started, the model did not know how to spell an English word, or that words were even a unit of text.

    - The structure of the output resembles a play—blocks of text generally begin with a speaker name, in all capital letters similar to the dataset.

    - The model is trained on small batches of text (100 characters each), and is still able to generate a longer sequence of text with coherent structure.

### Get the Shakespeare dataset


In [None]:
# Fetch the Shakespeare corpus and get the path of the local copy.
path_to_file = tf.keras.utils.get_file(
    'shakespeare.txt',
    'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

# Read the raw bytes and decode them explicitly as UTF-8. The context manager
# closes the file handle as soon as the read finishes instead of leaking it.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')
print ('Length of text: {} characters'.format(len(text)))

In [None]:
print(text[:250])

In [None]:
# Sorted list of the distinct characters in the corpus; a character's position
# in this list becomes its integer id in the lookup tables built from it.
vocab = sorted(set(text)) # The unique characters in the file
print (f'{len(vocab)} unique characters')

### Vectorize the text

* Map strings to a numerical representation. 
* Create two lookup tables: 
    - One mapping characters to numbers
    - One mapping numbers to characters.

In [None]:
# Forward lookup: character -> integer id (its position in the sorted vocabulary).
char2idx = {char: index for index, char in enumerate(vocab)}
# Reverse lookup: integer id -> character, kept as an array so it can be
# indexed with whole arrays of ids at once.
idx2char = np.array(vocab)

# Encode the entire corpus as an array of integer ids.
text_as_int = np.array([char2idx[char] for char in text])
text_as_int.shape

#### integer representation for each character. 

In [None]:

# Preview the character -> id table. Zipping with range(20) stops the
# iteration after at most 20 characters; the range value itself is unused.
for char,_ in zip(char2idx, range(20)):
    print('  {:4s}: {:3d},'.format(repr(char), char2idx[char]))


#### The mapping of the first 13 characters from the text 

In [None]:
print (f'{repr(text[:13])} -> {text_as_int[:13]}')

### Create training examples and targets

* The text is divided into example sequences of characters of length 'seq_length'

* For each input sequence, the corresponding targets contain the same length of text, except shifted one character to the right.


In [None]:
# The maximum length sentence we want for a single input in characters
# The maximum length sentence we want for a single input in characters
seq_length = 100
# Each example consumes seq_length+1 characters: the input plus one extra
# character, so the target can be the input shifted by one position.
examples_per_epoch = len(text)// (seq_length+1)
print(examples_per_epoch)

# Create training examples / targets
# Stream the encoded corpus one character id at a time.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
print(type(char_dataset))
# Sanity check: decode and print the first five elements of the stream.
for i in char_dataset.take(5):
  print(idx2char[i.numpy()])

In [None]:
# Group consecutive character ids into chunks of seq_length+1;
# drop_remainder=True discards a final chunk that would be shorter.
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

# Decode and show the first five chunks as text.
for item in sequences.take(5):
  print(repr(''.join(idx2char[item.numpy()])))

#### Create input and target text

* For each sequence, duplicate and shift it to form the input and target text

In [None]:
def split_input_target(chunk):
    """Split a chunk into an (input, target) pair offset by one position.

    The input is every element except the last; the target is every element
    except the first, so target[i] is the element that follows input[i].
    """
    return chunk[:-1], chunk[1:]

dataset = sequences.map(split_input_target)

In [None]:
for input_example, target_example in  dataset.take(1):
  print ('Input data: ', repr(''.join(idx2char[input_example.numpy()])))
  print ('Target data:', repr(''.join(idx2char[target_example.numpy()])))

In [None]:
# Walk through the first five timesteps of one example, showing each input id
# and the id the model is expected to predict for it.
# NOTE: input_example / target_example are the loop variables left over from
# the previous cell, so that cell must have been run first.
for i, (input_idx, target_idx) in enumerate(zip(input_example[:5], target_example[:5])):
    print("Step {:4d}".format(i))
    print("  input: {} ({:s})".format(input_idx, repr(idx2char[input_idx])))
    print("  expected output: {} ({:s})".format(target_idx, repr(idx2char[target_idx])))

### Create training batches

* Shuffle the data and pack it into batches.

In [None]:
# BATCH_SIZE: number of sequences per training batch.
# BUFFER_SIZE: size of the buffer passed to shuffle().
BATCH_SIZE,BUFFER_SIZE = 64, 10000
# NOTE: this rebinds `dataset`. Re-running this cell without first re-running
# the cell that creates `dataset` would batch the already-batched data again.
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
dataset

### The Model

#### Model Class: Sequential
* Layers:
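    - Embedding: The input layer. A trainable lookup table that maps each character index to a vector of length 'embedding_dim'.
    - LSTM or GRU: The recurrent layer, with 'rnn_units' units.
    - Dense: The output layer, with 'vocab_size' outputs (one logit per character in the vocabulary).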

In [None]:
# Number of distinct characters; sizes the embedding table and the output layer.
vocab_size = len(vocab) 
# Length of the embedding vector learned for each character.
embedding_dim = 256
# Number of units in the recurrent layer.
rnn_units = 1024

In [None]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  """Assemble the Embedding -> LSTM -> Dense character model.

  Args:
    vocab_size: Size of the embedding table and number of output logits.
    embedding_dim: Length of each embedding vector.
    rnn_units: Number of units in the LSTM layer.
    batch_size: Fixed batch size baked into the input shape; the LSTM is
      stateful, so the model only accepts this batch size.

  Returns:
    An uncompiled Sequential model that emits logits for every timestep.
  """
  # Layers are created in input-to-output order, then stacked.
  embedding = Embedding(vocab_size, embedding_dim,
                        batch_input_shape=[batch_size, None])
  recurrent = LSTM(rnn_units,
                   return_sequences=True,
                   stateful=True,
                   recurrent_initializer='glorot_uniform')
  projection = Dense(vocab_size)
  return Sequential([embedding, recurrent, projection])

In [None]:
# Instantiate the training model from the hyperparameter constants. Use the
# `vocab_size` constant rather than recomputing len(vocab), so the training
# model and the batch-size-1 inference model are built from the same value.
model = build_model(
  vocab_size=vocab_size,
  embedding_dim=embedding_dim,
  rnn_units=rnn_units,
  batch_size=BATCH_SIZE)

#### Operation 
1) For each character the model looks up the embedding,  
2) runs the LSTM or the GRU one timestep with the embedding as input, and   
3) applies the dense layer to generate logits predicting the log-likelihood of the next character  

![](text_generation_training.png)

### Test the model


In [None]:
# Run the (still untrained) model on two batches to check the output shape.
# The variables from the last iteration (input_example_batch,
# target_example_batch, example_batch_predictions) are reused by later cells.
print("(batch_size, sequence_length, vocab_size)")
for input_example_batch, target_example_batch in dataset.take(2):
  example_batch_predictions = model(input_example_batch)
  print(example_batch_predictions.shape) 

In [None]:
model.summary()

* To get actual predictions from the model we need to sample from the output distribution, to get actual character indices. This distribution is defined by the logits over the character vocabulary.

* This gives us, at each timestep, a prediction of the next character index:


In [None]:
# Sample one character id per timestep from the logits of the first sequence
# in the batch.
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
# Drop the trailing num_samples axis and convert the result to a NumPy array.
sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()
sampled_indices

In [None]:
print(f"Input: \n{repr(''.join(idx2char[input_example_batch[0]]))}")

In [None]:
print(f"Next Char Predictions: \n{repr(''.join(idx2char[sampled_indices ]))}")

### Adam Optimizer, and  sparse_categorical_crossentropy loss function


In [None]:
def loss(labels, logits):
  """Sparse categorical cross-entropy between integer labels and raw logits.

  `from_logits=True` is required because the model's final Dense layer has no
  softmax activation.
  """
  crossentropy = tf.keras.losses.sparse_categorical_crossentropy
  return crossentropy(labels, logits, from_logits=True)

# Evaluate the loss on the example batch produced by the untrained model, then
# reduce the result to a single number with mean() for display.
example_batch_loss  = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape)
print("scalar_loss:      ", example_batch_loss.numpy().mean())

In [None]:
model.compile(optimizer='adam', loss=loss)

### Configure checkpoints to save the weights

* Use tf.keras.callbacks.ModelCheckpoint to save checkpoints during training.

In [None]:
# Directory and file name where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix, save_weights_only=True)

### Train the Model

In [None]:
EPOCHS=30

In [None]:
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

### Generate text

#### Restore the latest checkpoint

* To keep this prediction step simple, we use a batch size of 1.
* Because of the way the RNN state is passed from timestep to timestep, the model only accepts a fixed batch size once built. Therefore to run the model with a different 'batch_size', we need to rebuild the model and restore the weights from the checkpoint.


In [None]:
# Rebuild the same architecture with a batch size of 1 for text generation.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

# tf.train.latest_checkpoint returns None when the directory holds no
# checkpoint; fail early with a clear message instead of passing None on to
# load_weights.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    raise FileNotFoundError(
        f"No checkpoint found in {checkpoint_dir!r}; run the training cell first.")
model.load_weights(latest_checkpoint)

model.build(tf.TensorShape([1, None]))

In [None]:
model.summary()

### The prediction loop

* Choose a starting string, initialize the RNN state and set the number of characters to generate.

* Get the prediction distribution of the next character using the start string and the RNN state.

* Then, use a categorical distribution to calculate the index of the predicted character. Use this predicted character as our next input to the model.

* The RNN state returned by the model is fed back into the model so that it now has more context.
* After predicting the next character, the updated RNN state is again fed back into the model, so each new prediction is conditioned on the characters generated so far. (No learning happens at this stage; the weights stay fixed.)


![](text_generation_sampling.png)



In [None]:
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
  """Generate text one character at a time with a trained stateful model.

  Args:
    model: Model built for batch size 1 that maps a batch of character ids to
      per-timestep logits over the vocabulary.
    start_string: Non-empty seed text. Every character must be a key of
      `char2idx`.
    num_generate: Number of characters to generate after the seed.
    temperature: Positive divisor applied to the logits before sampling.
      Low temperatures result in more predictable text; higher temperatures
      result in more surprising text.

  Returns:
    `start_string` followed by the generated characters.

  Raises:
    ValueError: If `start_string` is empty or `temperature` is not positive.
    KeyError: If `start_string` contains a character missing from `char2idx`.
  """
  # Validate up front: an empty seed gives the model nothing to predict from,
  # and a non-positive temperature makes the scaled logits meaningless.
  if not start_string:
    raise ValueError("start_string must contain at least one character")
  if temperature <= 0:
    raise ValueError("temperature must be positive")

  # Convert the start string to character ids and add a batch dimension of 1.
  input_eval = [char2idx[s] for s in start_string]
  input_eval = tf.expand_dims(input_eval, 0)

  text_generated = []  # Generated characters, joined into a string at the end

  # Batch size == 1. Clear the recurrent state so a previous call does not
  # influence this one.
  model.reset_states()
  for _ in range(num_generate):
    predictions = model(input_eval)
    predictions = tf.squeeze(predictions, 0)  # remove the batch dimension

    # Scale the logits by the temperature, then sample from the categorical
    # distribution; only the sample for the last timestep is kept.
    predictions = predictions / temperature
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

    # The sampled character becomes the next input; the stateful model carries
    # its hidden state over from the previous call.
    input_eval = tf.expand_dims([predicted_id], 0)

    text_generated.append(idx2char[predicted_id])

  return (start_string + ''.join(text_generated))

In [None]:
print(generate_text(model, start_string=u"ROMEO: "))

Looking at the generated text, you'll see the model knows when to capitalize and make paragraphs, and it imitates a Shakespeare-like writing vocabulary. With the small number of training epochs, it has not yet learned to form coherent sentences.