In [1]:
import tensorflow as tf
import numpy as np
import os
import time

# Fetch the Shakespeare corpus with Keras' get_file helper and keep the path it
# returns; the next cell opens this path to read the text.
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')



In [2]:
# Read the raw bytes and decode them explicitly as UTF-8. The context manager
# closes the file handle as soon as the contents are in memory.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode('utf-8')
print('Length of text: {} characters'.format(len(text)))
print(text[:100])



Length of text: 1115394 characters
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [3]:
# Vocabulary: the sorted list of unique characters that occur in the text
vocab = sorted(set(text))
len(vocab)


65

In [4]:
# Vectorize the text: build lookup tables in both directions.
# char2idx maps a character to its position in the sorted vocabulary.
char2idx = dict(zip(vocab, range(len(vocab))))
# idx2char maps a position back to its character (array lookup by index).
idx2char = np.array(vocab)

# Encode the whole text as an array of integer ids using char2idx
text_as_int = np.array([char2idx[ch] for ch in text])
idx2char, text_as_int


(array(['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?',
        'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
        'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
        'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
        'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'],
       dtype='<U1'), array([18, 47, 56, ..., 45,  8,  0]))

In [5]:
# Training examples and targets
# Divide the text into example sequences; each input sequence
# contains seq_length characters from the text.
# For each input sequence, the target contains the same seq_length characters,
# but shifted one character to the right.

seq_length = 100
examples_per_epoch = len(text) // (seq_length + 1)

# Dataset that yields the encoded text one character id at a time
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Sanity check: decode and print the first five characters
for i in char_dataset.take(5):
    print(idx2char[i.numpy()])


F
i
r
s
t


In [6]:
# Group consecutive characters into chunks of seq_length + 1 ids (one extra id so
# each chunk can later be split into an input and a shifted target).
# drop_remainder=True discards a final chunk that is shorter than that.
sequences = char_dataset.batch(seq_length + 1, drop_remainder=True)

# Preview the first five chunks decoded back to text
for item in sequences.take(5):
    print(repr(''.join(idx2char[item.numpy()])))


'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'
"now Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us ki"
"ll him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be d"
'one: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor citi'


In [7]:
# Split a chunk into an (input, target) pair of equal length, where the
# target is the input shifted one character to the right.
def split_input_target(chunk):
    """Return (chunk without its last element, chunk without its first element)."""
    return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) pair
dataset = sequences.map(split_input_target)
dataset


<MapDataset shapes: ((100,), (100,)), types: (tf.int32, tf.int32)>

In [8]:
# Decode the first (input, target) pair back to text so the one-character shift
# between the two is visible.
for input_exp, target_exp in dataset.take(1):
    print('Input data', repr(''.join(idx2char[input_exp.numpy()])))
    print("Target data", repr(''.join(idx2char[target_exp.numpy()])))

Input data 'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'
Target data 'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '


In [9]:
"""
    Each index of these vectors is processed as a one
    time step. For the input at time step 0, the model
    receives the index for "F" and tries to predict
    the index for "i" as the next character. At the
    next timestep, it does the same thing but the RNN
    considers the previous step context in addition
    to the current input character.
"""

# Walk through the first five time steps of the example pair from the previous
# cell, printing each input id/character next to the target id/character.
for i, (input_idx, target_idx) in enumerate(zip(input_exp[:5], target_exp[:5])):
    print("Step {:4d}".format(i))
    print("\tinput: {} ({:s})".format(input_idx, repr(idx2char[input_idx])))
    print("\toutput: {} ({:s})".format(target_idx, repr(idx2char[target_idx])))


Step    0
	input: 18 ('F')
	output: 47 ('i')
Step    1
	input: 47 ('i')
	output: 56 ('r')
Step    2
	input: 56 ('r')
	output: 57 ('s')
Step    3
	input: 57 ('s')
	output: 58 ('t')
Step    4
	input: 58 ('t')
	output: 1 (' ')


In [10]:
BATCH_SIZE = 64
# Buffer size to shuffle the dataset
# (tf.data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# Build the training pipeline from `sequences` rather than from `dataset` itself,
# so re-running this cell does not batch an already-batched dataset.
dataset = (
    sequences.map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(1)
)

dataset

<PrefetchDataset shapes: ((64, 100), (64, 100)), types: (tf.int32, tf.int32)>

In [11]:
# Model
# Embedding: input layer that maps the id of each character to a vector
# of size embedding_dim
# GRU: a type of RNN with size units=rnn_units
# Dense: output layer with vocab_size outputs

# Number of distinct characters in the vocabulary
vocab_size = len(vocab)

# Size of each character embedding vector
embedding_dim = 256

# Number of units in the GRU layer
rnn_units = 1024

In [12]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """
    Build the character-level model: Embedding -> stateful GRU -> Dense.

    :param vocab_size: number of distinct characters (embedding input size and
        number of Dense outputs)
    :param embedding_dim: size of each character embedding vector
    :param rnn_units: number of GRU units
    :param batch_size: fixed batch size baked into the model's input shape
    :return: a tf.keras.Sequential model
    """
    # The batch dimension is fixed; the sequence length is left open (None).
    embedding_layer = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=(batch_size, None),
    )
    # return_sequences=True emits an output for every time step; stateful=True
    # keeps the recurrent state between calls until reset_states() is called.
    recurrent_layer = tf.keras.layers.GRU(
        units=rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
    # One output per vocabulary entry for every time step.
    output_layer = tf.keras.layers.Dense(vocab_size)

    return tf.keras.Sequential([embedding_layer, recurrent_layer, output_layer])


# Build the training model for batches of BATCH_SIZE sequences and print its layers
model = build_model(vocab_size, embedding_dim, rnn_units, BATCH_SIZE)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           16640     
_________________________________________________________________
gru (GRU)                    (64, None, 1024)          3938304   
_________________________________________________________________
dense (Dense)                (64, None, 65)            66625     
Total params: 4,021,569
Trainable params: 4,021,569
Non-trainable params: 0
_________________________________________________________________


In [16]:
# Display the dataset object restricted to its first batch
dataset.take(1)

<TakeDataset shapes: ((64, 100), (64, 100)), types: (tf.int32, tf.int32)>

In [17]:
"""
    For each character the model looks up the
    embedding, runs the GRU one timestep with
    the embedding as input, and applies the dense
    layer to generate logits predicting the log-likelihood of the next character:
"""
# Run the model on a single batch to check the shape of its output
for input_example_batch, target_example_batch in dataset.take(1):
    print(input_example_batch)
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")


tf.Tensor(
[[52  1 39 ... 57  8  0]
 [50 39 47 ...  0 23 21]
 [ 0  0 23 ... 39 49  1]
 ...
 [10  0 35 ... 53 59 50]
 [34 21 30 ... 46 43  1]
 [63  1 46 ...  0 20 43]], shape=(64, 100), dtype=int32)
(64, 100, 65) # (batch_size, sequence_length, vocab_size)


In [21]:
# Print the layer summary of the current model
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (64, None, 256)           16640     
_________________________________________________________________
gru_2 (GRU)                  (64, None, 1024)          3938304   
_________________________________________________________________
dense_2 (Dense)              (64, None, 65)            66625     
Total params: 4,021,569
Trainable params: 4,021,569
Non-trainable params: 0
_________________________________________________________________


In [22]:
# Sample one character id per time step from the logits of the first sequence
# in the example batch
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
# Drop the trailing num_samples axis and convert the result to a NumPy array
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()
sampled_indices

array([11, 53, 56,  8, 62, 53, 16, 25, 40, 17, 26, 18,  4, 52,  3, 51,  3,
       29,  4, 35, 43, 26, 10,  8, 14, 42, 24, 42, 57, 40, 39, 11,  2, 21,
       25, 54, 22, 42, 39, 52, 52,  7, 54,  7, 36, 54, 61, 17, 44,  3, 39,
       45, 63,  5, 35, 39, 37, 12, 31, 61, 41, 39, 56, 52, 39, 42, 33, 25,
       31, 33,  0,  9, 64, 25, 10, 34, 50, 25, 12,  5,  4, 51, 33, 50, 39,
       22, 22, 44,  9,  2, 59, 54, 21, 13, 42, 36,  9, 21, 31, 60],
      dtype=int64)

In [23]:
# Decode the first input sequence and the ids sampled for it back to text
print("Input: \n", repr(''.join(idx2char[input_example_batch[0]])))
print("Next Char Predictions: \n", repr(''.join(idx2char[sampled_indices])))

Input: 
 '.\nTo Bolingbroke are we sworn subjects now,\nWhose state and honour I for aye allow.\n\nDUCHESS OF YORK'
Next Char Predictions: 
 ";or.xoDMbENF&n$m$Q&WeN:.BdLdsba;!IMpJdann-p-XpwEf$agy'WaY?SwcarnadUMSU\n3zM:VlM?'&mUlaJJf3!upIAdX3ISv"


In [24]:
def loss_sparse(labels, logits):
    """
    Sparse categorical cross-entropy between integer labels and raw logits.

    :param labels: integer class ids
    :param logits: unnormalised model outputs (from_logits=True is passed)
    :return: the loss values computed by
        tf.keras.losses.sparse_categorical_crossentropy
    """
    cross_entropy = tf.keras.losses.sparse_categorical_crossentropy(
        labels,
        logits,
        from_logits=True,
    )
    return cross_entropy

# Loss of the model on the example batch; .mean() reduces the returned loss
# values to a single number for display.
example_batch_loss = loss_sparse(target_example_batch, example_batch_predictions)
example_batch_loss.numpy().mean()


4.173761

In [25]:
# model.compile(optimizer='adam', loss=loss)
#
# checkpoint_dir = './training_checkpoint'
#
# checkpoint_predix = os.path.join(checkpoint_dir, 'ckpt_{epoch}')
#
# checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
#     filepath=checkpoint_predix,
#     save_weights_only=True
# )
# Adam optimizer with its default settings, shared by every training step
optimizer = tf.keras.optimizers.Adam()

@tf.function
def train_step(inp, target):
    """
    Run one optimisation step on a single batch.

    Uses the notebook-level `model`, `optimizer` and `loss_sparse`.

    :param inp: batch of input sequences
    :param target: batch of target sequences
    :return: mean loss over the batch
    """
    # Record the forward pass so the loss can be differentiated afterwards
    with tf.GradientTape() as tape:
        logits = model(inp)
        batch_loss = tf.reduce_mean(loss_sparse(target, logits))

    trainable = model.trainable_variables
    grads = tape.gradient(batch_loss, trainable)
    optimizer.apply_gradients(zip(grads, trainable))
    return batch_loss


In [1]:
EPOCHS = 10
checkpoint_dir = './training_checkpoint'

# Checkpoint path template; '{epoch}' is filled with the 1-based epoch number so
# the file names line up with the 'Epoch N' lines printed below.
checkpoint_predix = os.path.join(checkpoint_dir, 'ckpt_{epoch}')

# This callback is not used by the custom training loop below; it is kept so the
# name remains available to the rest of the notebook.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_predix,
    save_weights_only=True
)

# Make sure the checkpoint directory exists before the first save
os.makedirs(checkpoint_dir, exist_ok=True)

for epoch in range(EPOCHS):
    start = time.time()

    # reset hidden state at the start of every epoch
    model.reset_states()

    for (batch_n, (inp, target)) in enumerate(dataset):
        loss = train_step(inp, target)

        if batch_n % 100 == 0:
            print('Epoch {} Batch {} Loss {}'.format(epoch + 1, batch_n, loss))

    # Save every 5 epochs and always after the final epoch. Doing the final save
    # here (instead of after the loop) avoids writing the same checkpoint twice
    # and avoids touching `epoch` when the loop never ran.
    is_last_epoch = (epoch + 1 == EPOCHS)
    if (epoch + 1) % 5 == 0 or is_last_epoch:
        model.save_weights(checkpoint_predix.format(epoch=epoch + 1))

    print('Epoch {} Loss {:.4f}'.format(epoch + 1, loss))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))


NameError: name 'os' is not defined

In [None]:
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
    """
    Generate text one character at a time, starting from start_string.

    The start string is converted to ids with char2idx and fed to the model as a
    batch of one. The model's state is reset first. At every step the logits are
    divided by the temperature, the next character id is sampled from the
    categorical distribution of the last time step, and that id becomes the next
    input. The model is called again with only the new character, so it relies
    on the state the model keeps between calls for context.

    :param model: model that accepts a batch containing a single sequence of
        character ids and returns logits per time step
    :param start_string: non-empty seed text; every character must be a key of
        char2idx
    :param num_generate: number of characters to generate (default 1000)
    :param temperature: positive scaling factor for the logits; a low
        temperature gives more predictable text, a high one more surprising
        text (default 1.0)
    :return: start_string followed by the generated characters
    :raises ValueError: if start_string is empty or temperature is not positive
    """
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0) # convert to 2d tensor (batch of one)

    text_generated = []

    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)

        # remove batch dimension
        predictions = tf.squeeze(predictions, axis=0)

        # Low temperature results in more predictable text,
        # high temperature in more surprising text
        predictions = predictions / temperature
        # sample for every time step, keep only the id for the last one
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # the sampled character is the only input of the next step
        input_eval = tf.expand_dims([predicted_id], axis=0)

        text_generated.append(idx2char[predicted_id])

    return start_string + ''.join(text_generated)

# The training model is stateful and was built for a fixed batch of BATCH_SIZE
# sequences, while generate_text feeds a batch containing a single sequence.
# Rebuild the same architecture for batch size 1 and restore the most recently
# saved training weights into it.
generation_model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)
generation_model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
generation_model.build(tf.TensorShape([1, None]))

print(generate_text(generation_model, start_string="ROMEO: "))


