## Train a model that predict the next char based on the given chars sequence.

In [1]:
import tensorflow as tf
import tensorflow.keras as keras
import time, sys, os
import numpy as np
import matplotlib.pyplot as plt

### Load the data.

In [2]:
data_path = 'data/shakespeare.txt'
data = open(data_path, 'rb').read().decode(encoding='utf-8')
unique_chars = list(set(data))

print(f"Data size: {len(data)}")
print(f"Unique chars size:{len(unique_chars)}")

Data size: 1115394
Unique chars size:65


### Preprocess the data

In [3]:
char2idx = {c:i for i, c in enumerate(unique_chars)}
idx2char = np.array(unique_chars)

txt_as_idxs = np.array([char2idx[c] for c in data])
print(txt_as_idxs[:50])
print(f"chars len: {len(txt_as_idxs)}")

[58 51 56 25 19 18 53 51 19 51  1 48 35 10 26  7 48 45  5 56 48 18 32 48
 18 60 56  5 59 48 48 63 18  0 35 41 18 45 33 56 19 14 48 56 64 18 14 48
  0 56]
chars len: 1115394


In [4]:
char_dataset = tf.data.Dataset.from_tensor_slices(txt_as_idxs)

print(len(char_dataset), char_dataset)
for i in char_dataset.take(5):
    print(idx2char[i.numpy()])

1115394 <TensorSliceDataset shapes: (), types: tf.int64>
F
i
r
s
t


In [5]:
# batch the data set with (n//seq_length, seq_length)
seq_length = 100
sequences = char_dataset.batch(batch_size=seq_length+1, drop_remainder=True)

for item in sequences.take(1):
    print(repr(' '.join(idx2char[item.numpy()])))
    print("-----------------------------")

print("DATASET: (%d, %d)"%(len(sequences), seq_length+1))

'F i r s t   C i t i z e n : \n B e f o r e   w e   p r o c e e d   a n y   f u r t h e r ,   h e a r   m e   s p e a k . \n \n A l l : \n S p e a k ,   s p e a k . \n \n F i r s t   C i t i z e n : \n Y o u  '
-----------------------------
DATASET: (11043, 101)


In [10]:
# split the input sentence to (input, target). `input`: is the first :n-1 chars, `output`:
# is 1:n chars.
def split_to_input_target(chunk):
    return chunk[:-1], chunk[1:]

dataset=sequences.map(split_to_input_target)

for input, target in dataset.take(1):
    print(f"input: { repr(' '.join( idx2char[input.numpy()] )) }")
    print(f"target: { repr(' '.join( idx2char[target.numpy()] )) }")
    
print(f"DataSet: ({len(dataset)}, {dataset})")

input: 'F i r s t   C i t i z e n : \n B e f o r e   w e   p r o c e e d   a n y   f u r t h e r ,   h e a r   m e   s p e a k . \n \n A l l : \n S p e a k ,   s p e a k . \n \n F i r s t   C i t i z e n : \n Y o u'
target: 'i r s t   C i t i z e n : \n B e f o r e   w e   p r o c e e d   a n y   f u r t h e r ,   h e a r   m e   s p e a k . \n \n A l l : \n S p e a k ,   s p e a k . \n \n F i r s t   C i t i z e n : \n Y o u  '
DataSet: (11043, <MapDataset shapes: ((100,), (100,)), types: (tf.int64, tf.int64)>)


In [11]:
# each index of theses vectors is processed as one timestep
for input_exe, target_exe in dataset.take(1):
    for i, o in zip(input_exe.numpy()[:5], target_exe.numpy()[:5]):
        print(f"{i}:`{idx2char[i]}` -> {o}:`{idx2char[o]}`")

58:`F` -> 51:`i`
51:`i` -> 56:`r`
56:`r` -> 25:`s`
25:`s` -> 19:`t`
19:`t` -> 18:` `


In [12]:
# split the data into batches
BATCH_SIZE = 64
BUFFER_SIZE = 10000
dataset = dataset.shuffle(buffer_size=BUFFER_SIZE).batch(batch_size=BATCH_SIZE, drop_remainder=True)

In [21]:
# describe the dataset that we will use to train the model.
print(dataset)
print(f"Number of batches: {len(dataset)}")
for input, target in dataset.take(1):
    print(f"Every batch input: {input.shape}")
    print(f"Every batch output: {target.shape}")

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>
Number of batches: 172
Every batch input: (64, 100)
Every batch output: (64, 100)


### Define the model

In [81]:
vocab_size = len(unique_chars)
embedding_dim = 256
rnn_untis = 1024
examples_per_epoch = len(txt_as_idxs)//(seq_length+1)

In [79]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
        return keras.Sequential([
            keras.layers.Embedding(vocab_size, embedding_dim, batch_input_shape=[batch_size, None]),
            keras.layers.GRU(1024, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
            keras.layers.Dense(vocab_size),
        ])

In [23]:
model = build_model(vocab_size, embedding_dim, rnn_units, BATCH_SIZE)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           16640     
_________________________________________________________________
gru (GRU)                    (64, None, 1024)          3938304   
_________________________________________________________________
dense (Dense)                (64, None, 65)            66625     
Total params: 4,021,569
Trainable params: 4,021,569
Non-trainable params: 0
_________________________________________________________________


In [61]:
# explore the model
for batch_example_input, batch_example_output in dataset.take(1):
    print("Input Shape: ", batch_example_input.shape)
    print("Expected Output Shape: ", batch_example_output.shape)
    batch_example_predictions = model(batch_example_input)
    print("Actual Output Shape: ", batch_example_predictions.shape)

Input Shape:  (64, 100)
Expected Output Shape:  (64, 100)
Actual Output Shape:  (64, 100, 65)


In [53]:
# the model output per one example in the batch
print("OUTPUT PER EXAMPLE:", output[0].shape)
sampled_indeces = tf.random.categorical(output[0], num_samples=1)
sampled_indeces = tf.squeeze(sampled_indeces, axis=-1)
print("OUTPUT AS INDECES:", sampled_indeces)

# decode it to the actual text.
print("\nOUTPUT AS CHARS: ", repr(" ".join([idx2char[i] for i in sampled_indeces])))

OUTPUT PER EXAMPLE: (100, 65)
OUTPUT AS INDECES: tf.Tensor(
[ 2 10 61  9 14 63 34 42 27  8  6 26 35 24 13  9 59 26 29 29  5  4 56 45
 56 52  0 54 11 51 29 13 28 62 23  7 56 15  7 34 53  3 34 53  7 46 10  4
 34 19 37  2 52 50 56 15 36 14  6 16 23 46 23 31 17  7 13 64 15 32 57 58
  9 25 53  7 61 58 50 32 17 30 56  2  0 12 30 55 48  4 29 28  7  4 64 43
 62  9 46 47], shape=(100,), dtype=int64)

OUTPUT AS CHARS:  "; : k P h d g G A R M \n n 3 ! P c \n ? ? o - r f r V a Q I i ? ! L x J B r b B g C $ g C B H : - g t Z ; V ' r b v h M N J H J K . B ! , b w U F P s C B k F ' w . j r ; a W j l e - ? L B - , X x P H q"


In [74]:
# define loss function 
def loss (labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

# test the loss function
losses = loss(batch_example_output, batch_example_predictions)
print(f"Batch loss Shape: {losses.shape}")
print(f"Scaler loss {losses.numpy().mean()}")

Batch loss Shape: (64, 100)
Scaler loss 4.1747002601623535


In [75]:
model.compile(optimizer="adam", loss=loss)

In [77]:
# set some checkpoints to control the training process
checkpoints_dir = "model/checkpoints/char_based_txt_generator"

# set the name of the checkpoints files
checkpoints_file_name = os.path.join(checkpoints_dir, "chk_{epoch}")

# create checkpoint callback instance
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoints_file_name,
                                                         save_weights_only=True)

In [78]:
EPOCHS = 10
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
 34/172 [====>.........................] - ETA: 5:11 - loss: 1.7769

KeyboardInterrupt: 

In [91]:
"""
Because of the way the RNN state is passed from timestep to timestep,
the model only accepts a fixed batch size once built.
"""
print(tf.train.latest_checkpoint(checkpoints_dir))
model_ = build_model(vocab_size, embedding_dim, rnn_untis, 1)
model_.load_weights(tf.train.latest_checkpoint(checkpoints_dir))
model_.build(tf.TensorShape([1, None]))
model_.summary()

model/checkpoints/char_based_txt_generator/chk_2
Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (1, None, 256)            16640     
_________________________________________________________________
gru_8 (GRU)                  (1, None, 1024)           3938304   
_________________________________________________________________
dense_8 (Dense)              (1, None, 65)             66625     
Total params: 4,021,569
Trainable params: 4,021,569
Non-trainable params: 0
_________________________________________________________________


## Model Evaluation: Text Generation

In [139]:
def generate_text(model, start_sentence):
    num_generated_words = 1000
    
    # vectorize input.
    input = [char2idx[c] for c in start_sentence]
    input = tf.expand_dims(input, 0)
    
    # Low temperature results in more predictable text.
    # Higher temperature results in more surprising text.
    # Experiment to find the best setting.
    temperature = 1.0
    
    generated_txt = []
    model.reset_states()
    for i in range(num_generated_words):
        predictions = model(input)
        
        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)
        
        # use categorical distribution to get the character predicted by the model.
        predictions = predictions / temperature
        predictions = tf.random.categorical(predictions, num_samples=1)
        predicted_char_id = predictions[-1, 0].numpy()
        
        # feed the predicted char as the next input with the previous hidden state.
        input = tf.expand_dims([predicted_char_id], 0)
        generated_txt.append(idx2char[predicted_char_id])
    
    return start_sentence + ''.join(generated_txt)
    
    pass

In [140]:
start_sentence = u"ROMEO: "
generated_txt = generate_text(model_, start_sentence)
print(generated_txt)

ROMEO: dive O kiss, To comfin.
The eave these arm she dife nef seal my live in my,
The Romen:
Sacry as I that must groe I trunk and spolkes the king,
Hevat wit, thee died mose way in thy bese it dive cifffrne.

LUCESTER:
The eathes if he shall know, the denate aviedy;
Dow his say, no manting and proslect a mooth distain,
Whut is wind he dour of thereRonk, ore duck!
Frist trough thit himss!
And I dave ugose contin of dued. I beas pance and sta groug nown, sholl glave frebmy abon
And swave shall agaks, my they?

JUKI GRKNCEF:
Bet a bean a, sweers your king you;
for viich of here herge frem at ap loid,.

LANUS:
Your knows not, dime?

Jatbe
Deedot have fairtull not.

HARI AUNIO:
Thou bec'd pefolt out sornat the wonton, thee hance not
Fill on Rowald; leig, mave you.

SATUNT:
Bet we creen you whon youron! -hats, I may, Sice,?

LUTIO nom.

KIRG RICHARD II:
Far I net I him of eety prite;
And shall fremat if? The' foo me to like youth love? of earty:
But vitcise?
Tnough now? why love.

MAENENO:

## Training loop control

In [None]:
model = build_model(
    vocab_size=len(vocab),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE)

In [141]:
optimizer = tf.keras.optimizers.Adam()

In [142]:
@tf.function
def train_step(inp, target):
    with tf.GradientTape() as tape:
        predictions = model(inp)
        loss = tf.reduce_mean(
            tf.keras.losses.sparse_categorical_crossentropy(
                target, predictions, from_logits=True))
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    return loss

In [None]:
# Training step
EPOCHS = 10

for epoch in range(EPOCHS):
    start = time.time()

    # resetting the hidden state at the start of every epoch
    model.reset_states()

    for (batch_n, (inp, target)) in enumerate(dataset):
        loss = train_step(inp, target)

        if batch_n % 100 == 0:
            template = 'Epoch {} Batch {} Loss {}'
            print(template.format(epoch + 1, batch_n, loss))

    # saving (checkpoint) the model every 5 epochs
    if (epoch + 1) % 5 == 0:
        model.save_weights(checkpoint_prefix.format(epoch=epoch))

    print('Epoch {} Loss {:.4f}'.format(epoch + 1, loss))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

model.save_weights(checkpoint_prefix.format(epoch=epoch))