In [1]:
import tensorflow as tf
import numpy as np
import os
import time

# Fetch the Shakespeare corpus through Keras' get_file helper and keep the
# local path it returns for the cells below.
SHAKESPEARE_URL = 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt'
path_to_file = tf.keras.utils.get_file('shakespeare.txt', SHAKESPEARE_URL)



In [2]:
# Read the corpus as raw bytes and decode it explicitly as UTF-8.
# The context manager closes the file handle as soon as the read finishes
# instead of leaving it open for the lifetime of the kernel.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode('utf-8')

# Sanity check: total size and the first 100 characters.
print('Length of text: {} characters'.format(len(text)))
print(text[:100])



Length of text: 1115394 characters
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [3]:
# Vocabulary: every distinct character that occurs in the text, in sorted order
vocab = list(set(text))
vocab.sort()



In [4]:
# Vectorize the text.
# idx2char: integer id -> character (array form, so it can be indexed with ids)
# char2idx: character -> integer id (the character's position in `vocab`)
idx2char = np.array(vocab)
char2idx = dict(zip(vocab, range(len(vocab))))

# Encode the whole text as an array of integer ids using char2idx
text_as_int = np.array([char2idx[ch] for ch in text])


In [5]:
# Training examples and targets.
# The text is divided into example sequences; each input sequence contains
# seq_length characters, and its target is the same length of text shifted
# one character to the right. A chunk therefore needs seq_length + 1 characters.
seq_length = 100
chunk_length = seq_length + 1
examples_per_epoch = len(text) // chunk_length

# Dataset that yields the encoded text one character id at a time
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Peek at the first five ids, decoded back to characters
for char_id in char_dataset.take(5):
    print(idx2char[char_id.numpy()])




F
i
r
s
t


In [6]:
# Group the character stream into chunks of seq_length + 1 ids; a final
# chunk that would come up short is dropped (drop_remainder=True).
sequences = char_dataset.batch(seq_length + 1, drop_remainder=True)

# Show the first five chunks decoded back to text
for chunk in sequences.take(5):
    chunk_chars = idx2char[chunk.numpy()]
    print(repr(''.join(chunk_chars)))


'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'
"now Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us ki"
"ll him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be d"
'one: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor citi'


In [7]:
# Turn each chunk into an (input, target) pair: both have the same length,
# and the target is the input shifted one position to the right.
def split_input_target(chunk):
    """Split a chunk into an input and a target offset by one position.

    :param chunk: any sliceable sequence (here, a chunk of character ids)
    :return: tuple ``(chunk without its last item, chunk without its first item)``
    """
    return chunk[:-1], chunk[1:]

# Apply split_input_target to every chunk so the dataset yields (input, target) pairs
dataset = sequences.map(map_func=split_input_target)


In [8]:
# Decode the first (input, target) pair back to text to check the one-character shift.
# input_exp / target_exp stay defined after the loop and are reused by the next cell.
for input_exp, target_exp in dataset.take(1):
    input_chars = idx2char[input_exp.numpy()]
    target_chars = idx2char[target_exp.numpy()]
    print('Input data', repr(''.join(input_chars)))
    print("Target data", repr(''.join(target_chars)))

Input data 'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'
Target data 'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '


In [9]:
"""
    Each index of these vectors is processed as one
    time step. For the input at time step 0, the model
    receives the index for "F" and tries to predict
    the index for "i" as the next character. At the
    next time step, it does the same thing, but the RNN
    considers the previous step's context in addition
    to the current input character.
"""

# Walk through the first five time steps of the example pair, printing the
# input id/character and the target id/character the model should predict.
first_steps = zip(input_exp[:5], target_exp[:5])
for step, (in_id, out_id) in enumerate(first_steps):
    in_char = repr(idx2char[in_id])
    out_char = repr(idx2char[out_id])
    print("Step {:4d}".format(step))
    print("\tinput: {} ({:s})".format(in_id, in_char))
    print("\toutput: {} ({:s})".format(out_id, out_char))


Step    0
	input: 18 ('F')
	output: 47 ('i')
Step    1
	input: 47 ('i')
	output: 56 ('r')
Step    2
	input: 56 ('r')
	output: 57 ('s')
Step    3
	input: 57 ('s')
	output: 58 ('t')
Step    4
	input: 58 ('t')
	output: 1 (' ')


In [10]:
BATCH_SIZE = 64
# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# Build the training pipeline from `sequences` instead of reassigning
# `dataset` from itself: re-running this cell then yields the same
# (BATCH_SIZE, seq_length) batches rather than batching an already-batched
# dataset a second time.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(1)
)

dataset

<PrefetchDataset shapes: ((64, 100), (64, 100)), types: (tf.int32, tf.int32)>

In [11]:
# Model hyperparameters.
# The model is built from three layers:
#   Embedding: input layer that maps each character id to a vector of size embedding_dim
#   GRU:       a special type of RNN with units=rnn_units
#   Dense:     output layer with vocab_size outputs

# Size of each character embedding vector
embedding_dim = 256

# Number of GRU units
rnn_units = 1024

# Number of distinct characters
vocab_size = len(vocab)

In [12]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the Embedding -> GRU -> Dense character model.

    :param vocab_size: number of distinct characters; used as the embedding
        input size and as the number of Dense outputs
    :param embedding_dim: size of each embedding vector
    :param rnn_units: number of GRU units
    :param batch_size: fixed batch dimension of the input (the sequence
        dimension is left as None)
    :return: an uncompiled ``tf.keras.Sequential`` model
    """
    embedding_layer = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=(batch_size, None),
    )
    # return_sequences=True gives an output for every time step;
    # stateful=True is why the batch size is fixed in the input shape.
    gru_layer = tf.keras.layers.GRU(
        units=rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
    output_layer = tf.keras.layers.Dense(vocab_size)

    return tf.keras.Sequential([embedding_layer, gru_layer, output_layer])


# Training model, built for batches of BATCH_SIZE sequences
model = build_model(
    vocab_size,
    embedding_dim,
    rnn_units,
    BATCH_SIZE,
)


In [13]:
"""
    For each character, the model looks up the
    embedding, runs the GRU for one time step with
    the embedding as input, and applies the dense
    layer to generate logits predicting the log-likelihood of the next character.
"""
# Run the untrained model on a single batch and check the output shape.
# The loop variables and the predictions stay defined for the cells below.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    predictions_shape = example_batch_predictions.shape
    print(predictions_shape, "# (batch_size, sequence_length, vocab_size)")


(64, 100, 65) # (batch_size, sequence_length, vocab_size)


In [14]:
# Print the per-layer summary (layer types, output shapes, parameter counts)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           16640     
_________________________________________________________________
gru (GRU)                    (64, None, 1024)          3938304   
_________________________________________________________________
dense (Dense)                (64, None, 65)            66625     
Total params: 4,021,569
Trainable params: 4,021,569
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Sample one character id per time step from the logits of the first example
# in the batch, then drop the trailing num_samples axis and convert to NumPy.
first_example_logits = example_batch_predictions[0]
sampled_indices = tf.squeeze(
    tf.random.categorical(first_example_logits, num_samples=1),
    axis=-1,
).numpy()
sampled_indices

array([45, 41, 58, 29, 41, 49, 33, 38, 21, 60, 14, 10, 44, 64, 50, 21, 14,
       25, 21,  6, 24, 13, 33, 18, 20, 38, 63, 54, 18, 59, 53, 40, 45, 11,
       32, 62, 38, 52, 51,  2, 16, 19, 19, 27, 11, 15, 55, 14, 44, 40, 25,
       37, 52, 11,  2, 21,  2, 14, 63, 10, 53, 56,  3, 46, 40,  8, 26,  3,
       58, 47, 30,  6, 53,  2, 14, 44, 12, 32, 15, 12, 56, 47,  6, 29, 60,
        3,  8,  4, 32, 44, 52, 22, 34, 11, 63, 42, 34, 38, 36, 61],
      dtype=int64)

In [16]:
# Decode the first input sequence and the ids sampled from the untrained model
first_input_chars = idx2char[input_example_batch[0]]
predicted_chars = idx2char[sampled_indices]

print("Input: \n", repr(''.join(first_input_chars)))
print("Next Char Predictions: \n", repr(''.join(predicted_chars)))

Input: 
 "se and liberty,\nWhich have for long run by the hideous law,\nAs mice by lions--hath pick'd out an act"
Next Char Predictions: 
 'gctQckUZIvB:fzlIBMI,LAUFHZypFuobg;TxZnm!DGGO;CqBfbMYn;!I!By:or$hb.N$tiR,o!Bf?TC?ri,Qv$.&TfnJV;ydVZXw'


In [17]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    :param labels: integer class ids (here, target character ids)
    :param logits: unnormalized model outputs; ``from_logits=True`` tells
        Keras they have not been passed through a softmax
    :return: the loss tensor returned by Keras
    """
    return tf.keras.losses.sparse_categorical_crossentropy(
        y_true=labels,
        y_pred=logits,
        from_logits=True,
    )

# Mean loss of the untrained model on the example batch
example_batch_loss = loss(target_example_batch, example_batch_predictions)
np.mean(example_batch_loss.numpy())


4.1739507

In [18]:
# Directory where training checkpoints are written
checkpoint_dir = './training_checkpoint'

# Checkpoint file name template containing an '{epoch}' placeholder.
# NOTE: "predix" is a misspelling of "prefix"; the name is kept unchanged
# because it is a notebook-level variable.
checkpoint_predix = os.path.join(checkpoint_dir, 'ckpt_{epoch}')

# Callback that saves only the model weights (not the full model) to that path
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_predix,
                                                         save_weights_only=True)

# Attach the Adam optimizer and the loss function defined above
model.compile(optimizer='adam', loss=loss)


In [19]:
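# Training is disabled in this run; uncomment the two lines below to train.
# NOTE: the next cell restores weights from `checkpoint_dir`, so checkpoints
# written by an earlier training run must already exist there.
# EPOCHS = 10

# model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])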



In [20]:
# Rebuild the model with batch_size=1 for text generation (build_model fixes
# the batch size in the input shape) and restore the trained weights.
#
# tf.train.latest_checkpoint returns None when the directory holds no
# checkpoint, which is the case on a fresh run while training is commented
# out. Check for that before rebinding `model`, so the failure is explicit
# and the existing model is left untouched.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    raise FileNotFoundError(
        "No checkpoint found in {!r}; train the model (model.fit with "
        "checkpoint_callback) before running this cell.".format(checkpoint_dir)
    )

model = build_model(vocab_size, embedding_dim=embedding_dim,
                    rnn_units=rnn_units, batch_size=1)
model.load_weights(latest_checkpoint)
model.build(tf.TensorShape((1, None)))
model.summary()


Two checkpoint references resolved to different objects (<tensorflow.python.keras.layers.embeddings.Embedding object at 0x000001B13852F198> and <tensorflow.python.keras.engine.input_layer.InputLayer object at 0x000001B1385B1B70>).

Two checkpoint references resolved to different objects (<tensorflow.python.keras.layers.recurrent_v2.GRU object at 0x000001B138590F98> and <tensorflow.python.keras.layers.embeddings.Embedding object at 0x000001B13852F198>).

Two checkpoint references resolved to different objects (<tensorflow.python.keras.layers.core.Dense object at 0x000001B1385B1780> and <tensorflow.python.keras.layers.recurrent_v2.GRU object at 0x000001B138590F98>).
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            16640     
_________________________________________________________________
gru_1 (GRU)                  (1, None, 1

In [21]:
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
    """
    Generate text one character at a time, seeded with ``start_string``.

    The start string is encoded with ``char2idx`` and fed to the model. At
    every step the logits are divided by ``temperature``, the next character
    id is sampled from the categorical distribution over the last time step,
    and that id becomes the model's next input. The model's states are reset
    once before the loop and not between steps, so generation relies on the
    model keeping its recurrent state across calls (build_model creates the
    GRU with ``stateful=True``).

    :param model: model that accepts a ``(1, sequence_length)`` batch of
        character ids and returns logits over the vocabulary
    :param start_string: non-empty seed text; every character must be a key
        of ``char2idx`` (an unknown character raises ``KeyError``)
    :param num_generate: number of characters to generate (default 1000)
    :param temperature: positive scaling factor for the logits; low values
        give more predictable text, high values more surprising text
    :return: ``start_string`` followed by the generated characters
    :raises ValueError: if ``start_string`` is empty, ``num_generate`` is
        negative, or ``temperature`` is not positive
    """
    # Validate up front: an empty seed gives the model a zero-length input,
    # and a non-positive temperature makes the logit scaling meaningless.
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if num_generate < 0:
        raise ValueError("num_generate must be non-negative")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)  # add the batch dimension

    text_generated = []

    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)

        # remove batch dimension
        predictions = tf.squeeze(predictions, axis=0)

        # Scale the logits, then sample; [-1, 0] picks the sample drawn for
        # the last time step.
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # The sampled character is the only input for the next step
        input_eval = tf.expand_dims([predicted_id], axis=0)

        text_generated.append(idx2char[predicted_id])

    return start_string + ''.join(text_generated)

# Generate text seeded with "ROMEO: " and print it
generated_text = generate_text(model, start_string="ROMEO: ")
print(generated_text)




ROMEO: to
see it die, since you now. O chargish!
Will I remanning to't. Well, ward thy me!
What is the fund with dowry present shall not,
And then we foh your throne; and so dous absence,
He will dispose these town of carlick, cholericy speak.
Thou hadst been sting forth infection.

CAPULET:
And ports of Mercutio, she may;
For purthou slew'st it in, and lies a clean.
Your loves, you at done; and to refuris,
His royal success too!

MENENIUS:
You'll tell me. Tybruit'd in post.

OUFORD:
What doth you now, who?

CAPULET:
We help on his gentle Purpures, stoice of me!
Modeous, I have spoken: if doubtless woman
The whiced is heaven, his number new up,
And fear'st a priest had been unsaul't and time by the spleen unto
a wing transfur in!

LUCIO:
He cannot lers her, that stand complaint from gross: puph mine himself, and with a counterear
to bleed his ravedue.

HORTENSIO:
When I do content the name?

JULIET:
Every pity; sirs, let's seay but beggar:
But yet the ground If,--
First Marcius!
Come u