In [37]:
import os
import time
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
from keras.layers import Dense, Embedding, LSTM, Dropout
from keras.callbacks import TensorBoard, ModelCheckpoint
from keras.utils.np_utils import to_categorical
from matplotlib import pyplot
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

## Part 1: Use existing model provided in the use-case

In [43]:
# Read the corpus as raw bytes and decode it explicitly as UTF-8
# (binary mode means no newline translation is applied).
path_to_file = "/content/wonderland.txt"
# A context manager guarantees the file handle is closed once the read finishes.
with open(path_to_file, 'rb') as corpus_file:
  text = corpus_file.read().decode(encoding='utf-8')
# length of text is the number of characters in it
print ('Length of text: {} characters'.format(len(text)))

Length of text: 166963 characters


In [44]:
# Take a look at the first 250 characters in text (quick sanity check of the decoded corpus)
print(text[:250])

﻿*** START OF THIS PROJECT GUTENBERG EBOOK ALICE'S ADVENTURES IN WONDERLAND ***
ALICE'S ADVENTURES IN WONDERLAND

Lewis Carroll

THE MILLENNIUM FULCRUM EDITION 3.0

CHAPTER I. Down the Rabbit-Hole

Alice was beginning to get very tired of si


In [45]:
# Character vocabulary: every distinct character of the corpus, in sorted order
vocab = list(set(text))
vocab.sort()
print ('{} unique characters'.format(len(vocab)))

85 unique characters


Process the text

Vectorize the text

Before training, we need to map strings to a numerical representation. Create two lookup tables: one mapping characters to numbers, and another for numbers to characters.

In [46]:
# Lookup tables between characters and their integer ids
char2idx = dict(zip(vocab, range(len(vocab))))  # character -> id
idx2char = np.array(vocab)                       # id -> character

# Encode the whole corpus as one array of character ids
text_as_int = np.array([char2idx[ch] for ch in text])

The prediction task

Given a character, or a sequence of characters, what is the most probable next character? This is the task we're training the model to perform. The input to the model will be a sequence of characters, and we train the model to predict the output—the following character at each time step.

Since RNNs maintain an internal state that depends on the previously seen elements, given all the characters computed until this moment, what is the next character?

Create training examples and targets

Next divide the text into example sequences. Each input sequence will contain seq_length characters from the text.

For each input sequence, the corresponding targets contain the same length of text, except shifted one character to the right.

So break the text into chunks of seq_length+1. For example, say seq_length is 4 and our text is "Hello". The input sequence would be "Hell", and the target sequence "ello".

To do this first use the tf.data.Dataset.from_tensor_slices function to convert the text vector into a stream of character indices.

In [47]:
# Number of characters in a single model input
seq_length = 100
# Each example consumes seq_length + 1 characters (the input plus one extra for the shifted target)
examples_per_epoch = len(text) // (seq_length + 1)

# Stream of individual character ids
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Peek at the first five ids, decoded back to characters
for char_id in char_dataset.take(5):
  print(idx2char[char_id.numpy()])

﻿
*
*
*
 


The batch method lets us easily convert these individual characters to sequences of the desired size.

In [48]:
# Group the character stream into chunks of seq_length + 1 ids;
# drop_remainder=True discards a final chunk that would be shorter.
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

# Show the first five chunks as escaped strings
for chunk in sequences.take(5):
  decoded_chunk = ''.join(idx2char[chunk.numpy()])
  print(repr(decoded_chunk))

"\ufeff*** START OF THIS PROJECT GUTENBERG EBOOK ALICE'S ADVENTURES IN WONDERLAND ***\r\nALICE'S ADVENTURES I"
'N WONDERLAND\r\n\r\nLewis Carroll\r\n\r\nTHE MILLENNIUM FULCRUM EDITION 3.0\r\n\r\nCHAPTER I. Down the Rabbit-Hol'
'e\r\n\r\nAlice was beginning to get very tired of sitting by her sister on the\r\nbank, and of having nothi'
'ng to do: once or twice she had peeped into the\r\nbook her sister was reading, but it had no pictures '
"or conversations in\r\nit, 'and what is the use of a book,' thought Alice 'without pictures or\r\nconvers"


For each sequence, duplicate and shift it to form the input and target text by using the map method to apply a simple function to each batch:

In [49]:
def split_input_target(chunk):
  """Split a chunk into an (input, target) pair offset by one position.

  The input is every element except the last and the target is every element
  except the first, so target[i] is the element that follows input[i].
  """
  return chunk[:-1], chunk[1:]

# Turn every chunk in `sequences` into an (input, target) pair
dataset = sequences.map(split_input_target)

Print the first examples input and target values:

In [50]:
# Decode and display the first (input, target) pair
for input_example, target_example in dataset.take(1):
  input_str = ''.join(idx2char[input_example.numpy()])
  target_str = ''.join(idx2char[target_example.numpy()])
  print ('Input data: ', repr(input_str))
  print ('Target data:', repr(target_str))

Input data:  "\ufeff*** START OF THIS PROJECT GUTENBERG EBOOK ALICE'S ADVENTURES IN WONDERLAND ***\r\nALICE'S ADVENTURES "
Target data: "*** START OF THIS PROJECT GUTENBERG EBOOK ALICE'S ADVENTURES IN WONDERLAND ***\r\nALICE'S ADVENTURES I"


Create training batches

We used tf.data to split the text into manageable sequences. But before feeding this data into the model, we need to shuffle the data and pack it into batches.

In [51]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# Build the batched pipeline from `sequences` instead of re-wrapping `dataset`,
# so re-running this cell is idempotent (it never batches already-batched data).
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

dataset

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

Build The Model

Use tf.keras.Sequential to define the model. For this simple example three layers are used to define our model:

tf.keras.layers.Embedding: The input layer. A trainable lookup table that will map the numbers of each character to a vector with embedding_dim dimensions;

tf.keras.layers.GRU: A type of RNN with size units=rnn_units (You can also use an LSTM layer here.)

tf.keras.layers.Dense: The output layer, with vocab_size outputs.

In [52]:
# Length of the vocabulary in chars (passed to build_model as the embedding input size and Dense output size)
vocab_size = len(vocab)

# The embedding dimension (length of the vector each character id is mapped to)
embedding_dim = 256

# Number of RNN units (passed to the GRU layer in build_model)
rnn_units = 1024

In [53]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  """Assemble the character-level model: Embedding -> stateful GRU -> Dense.

  Args:
    vocab_size: number of distinct characters; sizes both the embedding
      table and the Dense output layer.
    embedding_dim: length of the vector each character id is embedded into.
    rnn_units: number of units in the GRU layer.
    batch_size: fixed batch dimension baked into the input shape (the GRU is
      stateful, so the batch size cannot vary once the model is built).

  Returns:
    An uncompiled tf.keras.Sequential model. The Dense layer is created
    without an activation argument.
  """
  embedding = tf.keras.layers.Embedding(
      vocab_size, embedding_dim, batch_input_shape=[batch_size, None])
  recurrent = tf.keras.layers.GRU(
      rnn_units,
      return_sequences=True,
      stateful=True,
      recurrent_initializer='glorot_uniform')
  projection = tf.keras.layers.Dense(vocab_size)
  return tf.keras.Sequential([embedding, recurrent, projection])

In [54]:
# Instantiate the training model with the training batch size
model = build_model(len(vocab), embedding_dim, rnn_units, BATCH_SIZE)



Try the model

Now run the model to see that it behaves as expected.

First check the shape of the output:

In [55]:
# Run a single batch through the (still untrained) model and print the output shape.
# The loop variables and example_batch_predictions stay bound afterwards and are reused by later cells.
for input_example_batch, target_example_batch in dataset.take(1):
  example_batch_predictions = model(input_example_batch)
  print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 85) # (batch_size, sequence_length, vocab_size)


In the above example the sequence length of the input is 100 but the model can be run on inputs of any length:

In [56]:
# Print the layer-by-layer summary (output shapes and parameter counts)
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (64, None, 256)           21760     
_________________________________________________________________
gru_3 (GRU)                  (64, None, 1024)          3938304   
_________________________________________________________________
dense_3 (Dense)              (64, None, 85)            87125     
Total params: 4,047,189
Trainable params: 4,047,189
Non-trainable params: 0
_________________________________________________________________


To get actual predictions from the model we need to sample from the output distribution, to get actual character indices. This distribution is defined by the logits over the character vocabulary.

Note: It is important to sample from this distribution as taking the argmax of the distribution can easily get the model stuck in a loop.

Try it for the first example in the batch:

In [57]:
# Draw one character id per timestep from the logits of the first example in the batch,
# then drop the trailing num_samples axis and convert to a NumPy array.
sampled_indices = tf.squeeze(
    tf.random.categorical(example_batch_predictions[0], num_samples=1),
    axis=-1,
).numpy()
#This gives us, at each timestep, a prediction of the next character index:
sampled_indices

array([78, 25, 80, 71, 55, 16,  5, 54, 59, 52, 46, 45, 41, 43, 10, 55, 62,
        2, 17, 41, 11, 67,  1, 16, 58, 74, 42, 11,  2, 62,  2, 72, 48, 46,
        7, 79, 84,  8,  3, 82, 28, 28, 84, 62,  9, 77, 41, 29, 73, 19, 45,
       13, 62, 35, 84, 45,  5, 82, 65, 59, 14, 70, 33, 35, 81, 10, 21, 84,
       17,  6, 72, 61, 27, 24, 80, 23, 10, 48, 16, 69, 11,  0, 81, 84, 52,
       21, 23, 53, 31, 29, 21, 24, 45, 44, 19, 70, 56, 30, 44, 65])

Train the model

At this point the problem can be treated as a standard classification problem. Given the previous RNN state, and the input this time step, predict the class of the next character.

Attach an optimizer, and a loss function
The standard tf.keras.losses.sparse_categorical_crossentropy loss function works in this case because it is applied across the last dimension of the predictions.

Because our model returns logits, we need to set the from_logits flag.

In [58]:
def loss(labels, logits):
  """Sparse categorical cross-entropy between integer labels and raw logits."""
  return tf.keras.losses.sparse_categorical_crossentropy(
      y_true=labels, y_pred=logits, from_logits=True)

# Sanity-check the loss on the example batch produced by the untrained model
example_batch_loss = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", example_batch_loss.numpy().mean())

Prediction shape:  (64, 100, 85)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       4.441744


Configure the training procedure using the tf.keras.Model.compile method. We'll use tf.keras.optimizers.Adam with default arguments and the loss function.

In [59]:
# Attach the Adam optimizer (default settings) and the logits-aware loss defined in this notebook
model.compile(optimizer='adam', loss=loss)

Configure checkpoints

Use a tf.keras.callbacks.ModelCheckpoint to ensure that checkpoints are saved during training:

In [60]:
# Checkpoints are written under this directory, one file prefix per epoch ("ckpt_<epoch>")
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Save only the weights, not the full model
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_prefix, save_weights_only=True)

Execute the training

To keep training time reasonable, use 10 epochs to train the model. 

In [61]:
# Number of passes over the batched dataset; the checkpoint callback runs during training
EPOCHS=10
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


Generate text

Restore the latest checkpoint
To keep this prediction step simple, use a batch size of 1.

Because of the way the RNN state is passed from timestep to timestep, the model only accepts a fixed batch size once built.

To run the model with a different batch_size, we need to rebuild the model and restore the weights from the checkpoint.

In [64]:
# Resolve the newest checkpoint once. tf.train.latest_checkpoint returns None
# when the directory holds no checkpoint, so fail early with a clear message
# instead of handing None to load_weights.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
  raise FileNotFoundError(
      'No checkpoint found in {!r}; run the training cell first.'.format(checkpoint_dir))

# Rebuild the same architecture with a batch size of 1 for generation
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

model.load_weights(latest_checkpoint)

model.build(tf.TensorShape([1, None]))
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (1, None, 256)            21760     
_________________________________________________________________
gru_5 (GRU)                  (1, None, 1024)           3938304   
_________________________________________________________________
dense_5 (Dense)              (1, None, 85)             87125     
Total params: 4,047,189
Trainable params: 4,047,189
Non-trainable params: 0
_________________________________________________________________


The prediction loop

The following code block generates the text:

It starts by choosing a start string, initializing the RNN state and setting the number of characters to generate.

Get the prediction distribution of the next character using the start string and the RNN state.

Then, use a categorical distribution to calculate the index of the predicted character. Use this predicted character as our next input to the model.

The RNN state returned by the model is fed back into the model so that it now has more context, instead of only one character. After predicting the next character, the modified RNN state is again fed back into the model, which is how it accumulates context from the previously predicted characters.

Looking at the generated text, you'll see the model knows when to capitalize and make paragraphs, and it imitates the vocabulary of the training text (here, Alice's Adventures in Wonderland rather than Shakespeare). With the small number of training epochs, it has not yet learned to form coherent sentences.

In [65]:
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
  """Generate text one character at a time from the trained model.

  Args:
    model: stateful model built with batch size 1; called on a batch of
      character ids and expected to return per-timestep logits.
    start_string: non-empty seed text. Every character must be a key of
      `char2idx`, otherwise a KeyError is raised.
    num_generate: number of characters to generate after the seed.
    temperature: positive divisor applied to the logits before sampling.
      Low temperatures result in more predictable text, higher temperatures
      in more surprising text.

  Returns:
    `start_string` followed by the generated characters.

  Raises:
    ValueError: if `start_string` is empty or `temperature` is not positive.
  """
  # Validate up front: an empty seed gives the model nothing to predict from,
  # and a non-positive temperature makes the logit scaling meaningless.
  if not start_string:
    raise ValueError('start_string must contain at least one character')
  if temperature <= 0:
    raise ValueError('temperature must be positive')

  # Converting our start string to numbers (vectorizing) and adding a batch axis
  input_eval = [char2idx[s] for s in start_string]
  input_eval = tf.expand_dims(input_eval, 0)

  # Generated characters are collected here and joined at the end
  text_generated = []

  # Here batch size == 1; start from a clean recurrent state
  model.reset_states()
  for _ in range(num_generate):
    predictions = model(input_eval)
    # remove the batch dimension
    predictions = tf.squeeze(predictions, 0)

    # Sample from the categorical distribution defined by the scaled logits;
    # [-1, 0] selects the sample drawn for the last timestep.
    predictions = predictions / temperature
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

    # The sampled character becomes the next input; the stateful model keeps
    # its hidden state between calls.
    input_eval = tf.expand_dims([predicted_id], 0)

    text_generated.append(idx2char[predicted_id])

  return (start_string + ''.join(text_generated))

In [66]:
# Generate text seeded with the prompt "ROMEO: " and print it
print(generate_text(model, start_string=u"ROMEO: "))

ROMEO: RKIZDuc'Lgant: and it blise Fould gan ture adon, this liget the same of liver she this not a sreaint the turk I door than to founs for off trie Minch Hartel wentwhtrgain the was sien), of coint Thet to see tly tome toot,
  *' sowh to soy vers ins off to as turpres wivh a puston.

'fot ather sle
kight then her lofchit you mad son, you bat' panct as vouce sen it. I BEVE to she roulled
Here of the CI-comeron.

'Whon, why Hete jusk looking smiezt, 'The Fuche and onteer-thing as I myon's said; 'me fock on't the Met read.
They eas the Conte sime an
Co bote of arm it
thing tome cvosain af thes go knows!'

The a nim walast fow same, and said the tryouge I

          Ix, I shisks alang
said the MINg I'



'SE Alice reake the acrauss
of owe off, the Dore mack batne littibls uppicers,
the ghien' said that Pithed "Ho foreinl,

'Sod Alice comninl as she hind could in her was I wenf inez-E.

'What so sust cratsed in off the Marct Guronly wot on reey, I've Fourd if,' t

In [67]:
# Generate text seeded with the prompt "ALICE: " and print it
print(generate_text(model, start_string=u"ALICE: "))

ALICE: TUHINSCHONURHay un'ther,  titnout shims Turther, 'Was lect you doow the saice

Thitk tos the fros as she
ous it agr cyou.

Thet shes sheed spilu-in at the King the los look got is exlect Tertliblid to to betuling of thiss ferf llatin dige us net gome the
 it
sition yow wable you, and I poonion quent
byoje, wish moongs the Urow; and folanaid, frowt Alice and of
to to see monatagel; 'y the krown the  and upicplenging call Ix Hares the Precouse ap cares, you astliems it mest Yom could
but wind dakn you vect enterelly--- meeppliers.

'Y@6;
I'Ch you grined. 'I you achous pertonesters the--gros fact row  Gr, the cother ag the triem hol wasnings was merouid gath mareguthed the wind
cenmeme-t
LUE Meuce thath
Has ser hers ofs they, was kinus, Ix sqole sard, but in's listlaus, sund ablug the Guxe foot wnyiss!' the Kine righouse foom see dess muse of HET to kidf redo wne bede gan of t all begot
Alict:

'Whe Hat hach her simeg on
you--It I we try me tize (stiolayne said 

## Part 2: Create a model with different hyperparameters

#### Load file data

In [68]:
# load the text (decoded as UTF-8) and convert to lowercase
filename = "/content/wonderland.txt"
# A context manager guarantees the file handle is closed once the read finishes.
with open(filename, 'r', encoding='utf-8') as text_file:
	raw_text = text_file.read()
raw_text = raw_text.lower()

#### Map unique chars to integers

In [69]:
# Sorted vocabulary plus forward (char -> int) and reverse (int -> char) lookups
chars = sorted(set(raw_text))
char_to_int = {ch: idx for idx, ch in enumerate(chars)}
int_to_char = dict(enumerate(chars))

#### Find total characters and vocab used in the data

In [70]:
# Corpus size and vocabulary size, reported for reference
n_chars, n_vocab = len(raw_text), len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

Total Characters:  163260
Total Vocab:  58


#### Prepare the dataset of input to output pairs encoded as integers

In [71]:
# prepare the dataset of input to output pairs encoded as integers
seq_length = 100
# Encode the corpus once up front so that each training window is a cheap
# slice, rather than looking every character up again for every window.
encoded_text = [char_to_int[char] for char in raw_text]
# dataX[i] is the window of seq_length ids starting at position i;
# dataY[i] is the id of the character that immediately follows that window.
dataX = [encoded_text[i:i + seq_length] for i in range(n_chars - seq_length)]
dataY = encoded_text[seq_length:]
n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)

Total Patterns:  163160


#### Sample reshape, normalization, and one hot encoding

In [73]:
# Shape the windows as [samples, time steps, features] and scale the ids
# by the vocabulary size so every value lies in [0, 1)
X = np.reshape(dataX, (n_patterns, seq_length, 1)) / float(n_vocab)
# one hot encode the output variable
y = to_categorical(dataY)

#### Build the model

In [74]:
# Two stacked LSTM layers with dropout after each, ending in a softmax over the one-hot classes of y
model = Sequential([
	LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True),
	Dropout(0.2),
	LSTM(256),
	Dropout(0.2),
	Dense(y.shape[1], activation='softmax'),
])



#### ModelCheckpoint created

In [75]:
# Checkpoint whenever the training loss improves; the filename records the epoch and loss
filepath = "weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(
	filepath=filepath,
	monitor='loss',
	mode='min',
	save_best_only=True,
	verbose=1,
)
callbacks_list = [checkpoint]

#### Compile the model

In [76]:
# `metrics` is documented as a list of metrics; passing a list (rather than a
# bare string) keeps this call valid across Keras versions.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

#### Fit the model

In [77]:
# Train for 20 epochs in batches of 128; the checkpoint callback saves weights whenever the loss improves
model.fit(X, y, epochs=20, batch_size=128, callbacks=callbacks_list)

Epoch 1/20

Epoch 00001: loss improved from inf to 2.88359, saving model to weights-improvement-01-2.8836.hdf5
Epoch 2/20

Epoch 00002: loss improved from 2.88359 to 2.61854, saving model to weights-improvement-02-2.6185.hdf5
Epoch 3/20

Epoch 00003: loss improved from 2.61854 to 2.46033, saving model to weights-improvement-03-2.4603.hdf5
Epoch 4/20

Epoch 00004: loss improved from 2.46033 to 2.31349, saving model to weights-improvement-04-2.3135.hdf5
Epoch 5/20

Epoch 00005: loss improved from 2.31349 to 2.19709, saving model to weights-improvement-05-2.1971.hdf5
Epoch 6/20

Epoch 00006: loss improved from 2.19709 to 2.10840, saving model to weights-improvement-06-2.1084.hdf5
Epoch 7/20

Epoch 00007: loss improved from 2.10840 to 2.03942, saving model to weights-improvement-07-2.0394.hdf5
Epoch 8/20

Epoch 00008: loss improved from 2.03942 to 1.97414, saving model to weights-improvement-08-1.9741.hdf5
Epoch 9/20

Epoch 00009: loss improved from 1.97414 to 1.92781, saving model to weig

<keras.callbacks.History at 0x7faf3fe6ee10>

#### Load model using the best checkpoint with least loss

In [78]:
# load the network weights
# NOTE(review): the epoch number and loss embedded in this filename follow the
# ModelCheckpoint filepath pattern and come from one particular training run;
# a fresh run will likely produce a different name — confirm before re-running.
filename = "/content/weights-improvement-20-1.6002.hdf5"
model.load_weights(filename)
# Re-compile with the same loss and optimizer but without metrics
# (presumably only needed to have a compiled model for prediction — TODO confirm).
model.compile(loss='categorical_crossentropy', optimizer='adam')

#### picking the seed value

In [79]:
# pick a random seed
# np.random.randint excludes the upper bound, so len(dataX) makes every
# pattern (including the last one) eligible.
start = np.random.randint(0, len(dataX))
# Copy the chosen window so that later changes to `pattern` never alter dataX.
pattern = list(dataX[start])
print("Seed:")
print("\"", ''.join([int_to_char[value] for value in pattern]), "\"")

Seed:
"  creatures wouldn't be so easily
offended!'

'you'll get used to it in time,' said the caterpillar;  "


Generate text using model 

In [80]:
import sys

# generate characters
for i in range(1000):
	# Shape the current window as [1, time steps, 1] and scale it by the
	# vocabulary size, the same scaling applied to the training inputs
	x = np.reshape(pattern, (1, len(pattern), 1))
	x = x / float(n_vocab)
	prediction = model.predict(x, verbose=0)
	# Greedy decoding: always take the most probable next character
	index = int(np.argmax(prediction))
	result = int_to_char[index]
	sys.stdout.write(result)
	# Slide the window by building a new list, so the list `pattern` first
	# referred to is never modified in place
	pattern = pattern[1:] + [index]


'would not a courte,' she mock turtle raid to herself, 'i don't know the doomouse was the doom and then so her to the sabbit! would not a cortersation, and the pueen said to the coor and the sabbit with the way of the coort of the court, and the cook said to herself  the doomouse was not a long ar once, and the pueen said to the coor and the sabbit with the way of the coort of the court, and the cook said to herself  the doomouse was not a long ar once, and the pueen said to the coor and the sabbit with the way of the coort of the court, and the cook said to herself  the doomouse was not a long ar once, and the pueen said to the coor and the sabbit with the way of the coort of the court, and the cook said to herself  the doomouse was not a long ar once, and the pueen said to the coor and the sabbit with the way of the coort of the court, and the cook said to herself  the doomouse was not a long ar once, and the pueen said to the coor and the sabbit with the way of the coort of the cou