# Text Generation with GRUs on Emma by Jane Austen

An example of text generation with gated recurrent units. The model is trained on data from Jane Austen's Emma. The model attempts to predict the following character based on a sequence lenght of 50 characters. After the model is trained, sample text predictions are shown at the bottom of the notebook with a handful of starting phrases to get the predictions going.

This code was adapted from the following tutorial on `tensorflow.org`: https://www.tensorflow.org/text/tutorials/text_generation

In [1]:
import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow import keras

import numpy as np
import os
import time

In [2]:
# READ IN ALL LINES FROM EMMA
file = 'GPT/emma.txt'

with open(file) as data:
    raw_dataset = [x for x in data.readlines()]

# Create clean version of emma
no_new_lines = [x.replace('\n', '').strip() for x in raw_dataset]
while("" in no_new_lines) : 
    no_new_lines.remove("")

dataset = no_new_lines[14:13696]

text = " ".join(dataset)
print(len(text))

880022


In [3]:
vocab = sorted(set(text))
print(f"# of unique characters in the text: {len(vocab)}")

# of unique characters in the text: 76


## Vectorize the text

In [4]:
# encodes characters to IDs
ids_from_chars = preprocessing.StringLookup(vocabulary=list(vocab), mask_token=None)

In [5]:
# reverse encodes the IDs back into characters
chars_from_ids = tf.keras.layers.experimental.preprocessing.StringLookup(vocabulary=ids_from_chars.get_vocabulary(),
                                                                         invert=True, mask_token=None)

In [6]:
# function to turn tensors of IDs back into text
def text_from_ids(ids):
  return tf.strings.reduce_join(chars_from_ids(ids), axis=-1)

## Creating Examples and Targets texts for GRUs

In [7]:
all_ids = ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))

In [8]:
# a tensor of all the IDs in a numpy array equal to the length of the text
all_ids

<tf.Tensor: shape=(880022,), dtype=int64, numpy=array([25, 33, 33, ..., 63, 62,  9])>

In [9]:
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

In [10]:
seq_length = 50
examples_per_epoch = len(text)//(seq_length+1)

In [11]:
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)

In [12]:
for seq in sequences.take(5):
    print(text_from_ids(seq).numpy())

b'EMMA By Jane Austen VOLUME I CHAPTER I Emma Woodhou'
b'se, handsome, clever, and rich, with a comfortable '
b'home and happy disposition, seemed to unite some of'
b' the best blessings of existence; and had lived nea'
b'rly twenty-one years in the world with very little '


In [13]:
# This function takes each individual sequence and breaks breaks it into the first 1-100 char as input
# and 2-101 characters as target.
def split_input_target(sequence):
    input_text = sequence[:-1]
    target_text = sequence[1:]
    return input_text, target_text

In [14]:
dataset = sequences.map(split_input_target)

In [15]:
for input_example, target_example in dataset.take(2):
    print("Input :", text_from_ids(input_example).numpy())
    print("Target:", text_from_ids(target_example).numpy())

Input : b'EMMA By Jane Austen VOLUME I CHAPTER I Emma Woodho'
Target: b'MMA By Jane Austen VOLUME I CHAPTER I Emma Woodhou'
Input : b'se, handsome, clever, and rich, with a comfortable'
Target: b'e, handsome, clever, and rich, with a comfortable '


### Prep dataset for training

In [16]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE))

dataset

<PrefetchDataset shapes: ((64, 50), (64, 50)), types: (tf.int64, tf.int64)>

# Building the model

In [17]:
# Length of the vocabulary in chars
vocab_size = len(vocab)

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 512

In [18]:
class MyModel(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, rnn_units):
    super().__init__(self)
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    x = inputs
    x = self.embedding(x, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    else:
      return x

In [19]:
model = MyModel(
    # Be sure the vocabulary size matches the `StringLookup` layers.
    vocab_size=len(ids_from_chars.get_vocabulary()),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

In [20]:
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 50, 77) # (batch_size, sequence_length, vocab_size)


In [21]:
model.summary()

Model: "my_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  19712     
_________________________________________________________________
gru (GRU)                    multiple                  1182720   
_________________________________________________________________
dense (Dense)                multiple                  39501     
Total params: 1,241,933
Trainable params: 1,241,933
Non-trainable params: 0
_________________________________________________________________


# Training the Model

In [22]:
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

In [23]:
model.compile(optimizer='adam', loss=loss)

In [24]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [25]:
EPOCHS = 10

In [26]:
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Text Generation

In [27]:
class OneStep(tf.keras.Model):
  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=0.6):
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Create a mask to prevent "[UNK]" from being generated.
    skip_ids = self.ids_from_chars(['[UNK]'])[:, None]
    sparse_mask = tf.SparseTensor(
        # Put a -inf at each bad index.
        values=[-float('inf')]*len(skip_ids),
        indices=skip_ids,
        # Match the shape to the vocabulary
        dense_shape=[len(ids_from_chars.get_vocabulary())])
    self.prediction_mask = tf.sparse.to_dense(sparse_mask)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    # Convert strings to token IDs.
    input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
    input_ids = self.ids_from_chars(input_chars).to_tensor()

    # Run the model.
    # predicted_logits.shape is [batch, char, next_char_logits]
    predicted_logits, states = self.model(inputs=input_ids, states=states,
                                          return_state=True)
    # Only use the last prediction.
    predicted_logits = predicted_logits[:, -1, :]
    predicted_logits = predicted_logits/self.temperature
    # Apply the prediction mask: prevent "[UNK]" from being generated.
    predicted_logits = predicted_logits + self.prediction_mask

    # Sample the output logits to generate token IDs.
    predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
    predicted_ids = tf.squeeze(predicted_ids, axis=-1)

    # Convert from token ids to characters
    predicted_chars = self.chars_from_ids(predicted_ids)

    # Return the characters and model state.
    return predicted_chars, states

In [28]:
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

In [32]:
start = time.time()
states = None
next_char = tf.constant(['Emma', 'The boy', 'Long ago', 'Once upon a time ', 'She knew'])
result = [next_char]

for n in range(100):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

result = tf.strings.join(result)
end = time.time()

for result_ in result:
    print(result_.numpy().decode('utf-8'))
    print()

print('\n' + '_'*80 + '\nRun time:', end - start)

Emma. The heat what was with the same personal chance of the whole. You must take the sort of speaking o

The boy. Emma was soon concerned, and after all the screeg perhaps are quite as if he would not be so very 

Long ago. He was only to say that if she would never have thought of the ladies respect, and was not been a 

Once upon a time there not to be as much mischief of the engagement was by the last to the party from her to another 

She knewy more mistaken--and a few more than she found as to the party, however, she looked of any the Aston


________________________________________________________________________________
Run time: 0.13445401191711426
