In [1]:
import json
import os
from typing import *

import numpy as np
import tensorflow as tf

In [2]:
def split_input_target(text_chunk, split_index=1):
    """Derive an (input, target) pair from one chunk of text.

    The input is the chunk with its last ``split_index`` items removed; the
    target is the chunk with its first ``split_index`` items removed, i.e. the
    same sequence shifted ``split_index`` positions ahead. Works on anything
    that supports slicing.

    Example
    -------
    >>> split_input_target("Python")
    ('Pytho', 'ython')
    """
    # Express the two windows as slice objects, then apply them in one place.
    input_window = slice(None, -split_index)
    target_window = slice(split_index, None)

    return text_chunk[input_window], text_chunk[target_window]

## Purpose
Generate text using a character-based recurrent neural network trained on the novel *Great Expectations* by Charles Dickens. We train the network on this text so that, given a character sequence such as `thousan`, it predicts the next character in the sequence, `d`. Repeating this process, calling the model again on the evolving sequence each time, produces longer sequences of text.

In [3]:
# Great Expectations by Charles Dickens
text_url = 'https://www.gutenberg.org/files/1400/1400-0.txt'

# Downloads to the Keras cache if the file isn't already there
file_path = tf.keras.utils.get_file(fname='1400-0.txt', origin=text_url)

In [4]:
# Decode explicitly as UTF-8 instead of relying on the platform's default
# encoding, so the character count (and the fixed offset used to strip the
# Project Gutenberg preamble) is the same on every operating system.
with open(file_path, encoding='utf-8') as fp:
    text = fp.read()

print(f'Lenght of text: {len(text)} characters')

Lenght of text: 1013445 characters


The first 824 characters are not part of the book. They are notes and licensing information from Project Gutenberg and shouldn't be part of the training data, so let's remove them.

In [5]:
# Drop the 824-character Project Gutenberg preamble so the text starts at "Chapter I".
# NOTE(review): this rebinds `text` in place, so re-running this cell on its own
# strips a further 824 characters -- re-run the file-reading cell first.
text = text[824:]

In [6]:
# Peek at the opening of the trimmed text to confirm the preamble is gone.
opening_excerpt = text[:300]
print(opening_excerpt)

Chapter I

My father's family name being Pirrip, and my Christian name Philip, my
infant tongue could make of both names nothing longer or more explicit
than Pip. So, I called myself Pip, and came to be called Pip.

I give Pirrip as my father's family name, on the authority of his
tombstone and my s


Next, let's create a mapping from characters to integers so that the characters can be represented as integers.

In [7]:
# Sorted vocabulary of distinct characters; a character's position is its integer id.
unique_chars = sorted(set(text))

# Forward (char -> id) and reverse (id -> char) lookups over the same ordering.
char2idx = dict(zip(unique_chars, range(len(unique_chars))))
idx2char = dict(enumerate(unique_chars))

# Array form of the reverse lookup, for decoding whole id arrays by fancy indexing.
index_to_char = np.array(unique_chars)

In [8]:
## Save for use in inference
# Both lookups are written with the same JSON settings. JSON object keys are
# always strings, so the integer keys of idx2char come back as strings when the
# file is loaded again.
for mapping_filename, mapping in (("char2idx.json", char2idx), ("idx2char.json", idx2char)):
    with open(mapping_filename, "w") as fp:
        json.dump(mapping, fp, indent=4, sort_keys=True)

In [9]:
# Sample output: the first ten (character, id) pairs of the mapping
first_ten_pairs = list(char2idx.items())[:10]
for char_key, char_id in first_ten_pairs:
    print(f"{repr(char_key):4s}: {char_id}")

'\n': 0
' ' : 1
'!' : 2
'$' : 3
'%' : 4
'&' : 5
"'" : 6
'(' : 7
')' : 8
'*' : 9


In [10]:
# Encode the whole book as an array of character ids.
book_vector = np.array(list(map(char2idx.__getitem__, text)))

# Sample mapping: the same window of the raw text and of its encoded form
sample_window = slice(10, 27)
print(f"{text[sample_window]} ----> {book_vector[sample_window]}")


My father's fami ----> [ 0 40 78  1 59 54 73 61 58 71  6 72  1 59 54 66 62]


In [11]:
# Number of characters in a single model input sequence
sequence_length = 100

# Each training example consumes sequence_length + 1 characters (the input plus
# its one-character-shifted target), because the character dataset is batched in
# chunks of that size before being split. Dividing by sequence_length alone
# over-counts the examples available per epoch.
examples_per_epoch = len(text) // (sequence_length + 1)

In [12]:
# Stream of individual character ids, one dataset element per character.
char_dataset = tf.data.Dataset.from_tensor_slices(book_vector)

# Sanity check: decode the first eight ids back to characters
for char_id in char_dataset.take(8):
    decoded_char = idx2char[char_id.numpy()]
    print(decoded_char)

C
h
a
p
t
e
r
 


In [13]:
# Group the characters into chunks of sequence_length + 1 (101 here): the extra
# character lets each chunk be split into an input and a shifted target of
# sequence_length characters each. A final short chunk is dropped.
chunk_length = sequence_length + 1
sequences = char_dataset.batch(chunk_length, drop_remainder=True)

# Turn every chunk into an (input, target) pair.
dataset = sequences.map(split_input_target)

In [14]:
# Decode the first (input, target) pair back to text. Each side is 100
# characters long; the target is the input shifted one character ahead.
for input_example, target_example in dataset.take(1):
    decoded_input = ''.join(index_to_char[input_example.numpy()])
    decoded_target = ''.join(index_to_char[target_example.numpy()])
    print('Input data: ', repr(decoded_input))
    print('Target data:', repr(decoded_target))

Input data:  "Chapter I\n\nMy father's family name being Pirrip, and my Christian name Philip, my\ninfant tongue coul"
Target data: "hapter I\n\nMy father's family name being Pirrip, and my Christian name Philip, my\ninfant tongue could"


In [15]:
# Walk through the first five time steps of the example pair, showing which
# character the model sees and which one it is expected to predict.
first_steps = zip(input_example[:5], target_example[:5])
for step, (input_index, target_index) in enumerate(first_steps):
    input_char = repr(idx2char[input_index.numpy()])
    target_char = repr(idx2char[target_index.numpy()])
    print(
        f"Step {step:4d}"
        f" input: {input_index} ({input_char})"
        f" expected output: {target_index} ({target_char})"
    )

Step    0 input: 30 ('C') expected output: 61 ('h')
Step    1 input: 61 ('h') expected output: 54 ('a')
Step    2 input: 54 ('a') expected output: 69 ('p')
Step    3 input: 69 ('p') expected output: 73 ('t')
Step    4 input: 73 ('t') expected output: 58 ('e')


In [16]:
## Training set up

# Number of (input, target) sequence pairs in one training batch
batch_size = 64

# The number of training steps in each epoch, derived from the example count.
# Informational only: it is not passed to model.fit below.
steps_per_epoch = examples_per_epoch // batch_size

# tf.data shuffles through a fixed-size in-memory buffer rather than the whole
# dataset, since it's designed to work with the possibility of endless data
buffer = 1000

# NOTE(review): `dataset` is rebound here, so re-running this cell on its own
# batches already-batched data -- re-run the cell that maps split_input_target first.
shuffled_sequences = dataset.shuffle(buffer)
dataset = shuffled_sequences.batch(batch_size, drop_remainder=True)

dataset

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

## The Model
![DNN Layout](images/dnn-layout.png "DNN Layout")

In [40]:
# Number of distinct characters in the text; this is the size of the model's
# input vocabulary and of its output layer.
# NOTE(review): the build_lstm_model calls pass len(unique_chars) directly
# rather than this name.
vocabulary_length = len(unique_chars)

# Size of the vector each character id is embedded into
embedding_dimension = 512

# Number of units in the recurrent (LSTM) layer
num_rnn_units = 2048

In [41]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    ``logits`` are treated as unnormalised scores (``from_logits=True``), so the
    model's final layer needs no softmax. The result is whatever Keras returns
    for the given inputs; no further reduction is applied here.
    """
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy
    return crossentropy(labels, logits, from_logits=True)

In [74]:
def build_lstm_model(vocab_size: int, embedding_dim: int, num_rnn_units: int, batch_size: int):
    """Assemble the embedding -> stateful LSTM -> dense character model.

    Parameters
    ----------
    vocab_size
        Number of distinct character ids; sizes both the embedding table and
        the output layer.
    embedding_dim
        Length of the vector each character id is embedded into.
    num_rnn_units
        Number of units in the LSTM layer.
    batch_size
        Fixed batch size baked into the input shape (the LSTM is stateful).

    Returns
    -------
    The un-compiled ``tf.keras.Sequential`` model, whose final layer emits one
    score per vocabulary entry for every time step.
    """
    # Fixed batch size, variable sequence length.
    embedding = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[batch_size, None],
    )
    # return_sequences=True yields an output for every time step, not just the last.
    recurrent = tf.keras.layers.LSTM(
        num_rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
    # No activation: the layer outputs raw scores.
    projection = tf.keras.layers.Dense(vocab_size)

    return tf.keras.Sequential([embedding, recurrent, projection])


# Training model, built for the training batch size.
model = build_lstm_model(
    vocab_size=len(unique_chars),
    embedding_dim=embedding_dimension,
    num_rnn_units=num_rnn_units,
    batch_size=batch_size,
)

W0516 11:20:56.071878 139957010396928 tf_logging.py:161] <tensorflow.python.keras.layers.recurrent.UnifiedLSTM object at 0x7f46e83ea6a0>: Note that this layer is not optimized for performance. Please use tf.keras.layers.CuDNNLSTM for better performance on GPU.


In [75]:
# Run one batch through the untrained model to confirm the output shape.
for batch_input_example, batch_target_example in dataset.take(1):
    batch_predictions_example = model(batch_input_example)
    predictions_shape = batch_predictions_example.shape
    print(predictions_shape, "# (batch, sequence_length, vocabulary_length)")

(64, 100, 84) # (batch, sequence_length, vocabulary_length)


In [76]:
# Loss of the untrained model on the example batch, averaged to a single number
# for display.
example_batch_loss = loss(batch_target_example, batch_predictions_example)
mean_example_loss = example_batch_loss.numpy().mean()

print("Prediction shape: ", batch_predictions_example.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss: ", mean_example_loss)

Prediction shape:  (64, 100, 84)  # (batch_size, sequence_length, vocab_size)
scalar_loss:  4.4309316


In [77]:
# `os` (needed by the checkpoint cell) is imported in the notebook's first
# import cell rather than here, so all dependencies are declared in one place.

# Attach the Adam optimiser and the logits-based cross-entropy loss defined above.
model.compile(optimizer='adam', loss=loss)

model.summary()

Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (64, None, 512)           43008     
_________________________________________________________________
unified_lstm_22 (UnifiedLSTM (64, None, 2048)          20979712  
_________________________________________________________________
dense_12 (Dense)             (64, None, 84)            172116    
Total params: 21,194,836
Trainable params: 21,194,836
Non-trainable params: 0
_________________________________________________________________


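84 unique chars * 512 embedding dims = 43,008

4 gates * ((512 embedding dims + 2,048 LSTM units) * 2,048 LSTM units + 2,048 bias units) = 20,979,712

2,048 LSTM units * 84 unique chars + 84 bias units = 172,116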

In [78]:
# Directory where the checkpoints will be saved
checkpoint_dir = './lstm_training_checkpoints'

# Checkpoint file name template; `{epoch}` is left as a placeholder for the
# ModelCheckpoint callback
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Save only the weights, not the full model
checkpoint_options = dict(filepath=checkpoint_prefix, save_weights_only=True)
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(**checkpoint_options)

In [98]:
# Number of passes over the training dataset
EPOCHS = 10

# Train the model, with the checkpoint callback attached so weights are saved
# during training.
training_callbacks = [checkpoint_callback]
history = model.fit(dataset, epochs=EPOCHS, callbacks=training_callbacks)

Epoch 1/10
Epoch 2/10
 23/156 [===>..........................] - ETA: 23s - loss: 0.4477

KeyboardInterrupt: 

## Prediction
Because of the way the RNN state is passed from timestep to timestep, the model only accepts a fixed batch size once built.

To run the model with a different batch_size, we need to rebuild the model and restore the weights from the checkpoint.

In [99]:
# Rebuild the same architecture with a batch size of 1 for text generation.
pred_model = build_lstm_model(vocab_size=len(unique_chars), embedding_dim=embedding_dimension,
                              num_rnn_units=num_rnn_units, batch_size=1)

# tf.train.latest_checkpoint returns None when the directory holds no
# checkpoint; fail with a clear message instead of passing None to load_weights.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    raise FileNotFoundError(
        f"No checkpoint found in {checkpoint_dir!r}; train the model before generating text."
    )
pred_model.load_weights(latest_checkpoint)

pred_model.build(tf.TensorShape([1, None]))

W0516 14:15:25.972876 139957010396928 tf_logging.py:161] <tensorflow.python.keras.layers.recurrent.UnifiedLSTM object at 0x7f46ad7a76a0>: Note that this layer is not optimized for performance. Please use tf.keras.layers.CuDNNLSTM for better performance on GPU.


In [100]:
def generate_text(model, start_string, temperature=1.0, num_generate=1000):
    """Generate text one character at a time, seeded with ``start_string``.

    The model is first fed the whole seed; after that, each sampled character
    is fed back in as the next single-character input. Relies on the
    notebook-level ``char2idx`` / ``idx2char`` lookups.

    Parameters
    ----------
    model
        Model called on a batch of one sequence of character ids and returning
        per-position scores over the vocabulary. Its states are reset before
        generation starts.
    start_string : str
        Non-empty seed text; every character must be a key of ``char2idx``.
    temperature : float, default 1.0
        Positive value the scores are divided by before sampling. Low
        temperatures give more predictable text, higher temperatures more
        surprising text.
    num_generate : int, default 1000
        Number of characters to generate after the seed.

    Returns
    -------
    str
        ``start_string`` followed by the generated characters.

    Raises
    ------
    ValueError
        If ``start_string`` is empty, ``temperature`` is not positive, or
        ``num_generate`` is negative.
    KeyError
        If ``start_string`` contains a character missing from ``char2idx``.
    """
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")
    if num_generate < 0:
        raise ValueError(f"num_generate must be non-negative, got {num_generate}")

    # Report every out-of-vocabulary character at once rather than failing on
    # the first one with a bare KeyError.
    unknown_chars = sorted(set(start_string) - set(char2idx))
    if unknown_chars:
        raise KeyError(f"start_string contains characters outside the vocabulary: {unknown_chars}")

    # Convert the start string to character ids and add the batch dimension
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    # Generated characters, joined into a string at the end
    text_generated = []

    # Here batch size == 1
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)
        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)

        # Scale the scores by the temperature, then sample the next character
        # from the categorical distribution at the last position
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # The predicted character becomes the next input; the model's retained
        # state carries the earlier context
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return start_string + ''.join(text_generated)

In [101]:
# Generate from the book's opening heading at a low temperature (0.3).
generated_text = generate_text(pred_model, 'Chapter I\n\n', 0.3)
print(generated_text)

Chapter I

Castering among the table aside, and I saw a passpociate little corner
where the spoller with the torchlered white coat cards, and what the small
bank-oomears, too, to establish and about the old time. And when I saw
him through the signal flock that I had passed the least four
and thoughtful to think I ought to me, I
saw that the pavement was had shut off any sunshine. It is
not particularly help him to dead stop. I strengthered all
the strongest from the table while he thought his countraps he had usually left
me and would take us hole. The time was just when I stopped they all had so long d norselves, and that she
had talled a good fer could not decide or pursuit, then,
and conquired with the excitement he furnished, as we sat down
to close upon the dead legible works in my fortunes who never came to the file on the flighter's as she will do has work yourself to get him
as little and despering some word of a creature to do than we are in a business,--and as I approached t