<a href="https://colab.research.google.com/github/GuyRobot/AINotesBook/blob/main/TextGenerateCustom.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
import numpy as np
import os
import time

# Download the Shakespeare corpus via Keras' get_file helper and keep the
# local path it returns.
path_to_file = tf.keras.utils.get_file(
    fname='shakespeare.txt',
    origin='https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')



Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt


In [None]:
# Read the corpus as raw bytes and decode it explicitly as UTF-8.
# The `with` block guarantees the file handle is closed after reading
# (the bare `open(...).read()` form left it to the garbage collector).
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode('utf-8')

# Quick sanity check: corpus size and the first 100 characters.
print('Length of text: {} characters'.format(len(text)))
print(text[:100])



Length of text: 1115394 characters
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [None]:
# Vocabulary: every distinct character found in the corpus, in sorted order.
vocab = sorted({ch for ch in text})
len(vocab)


65

In [None]:
# Vectorize: build the character -> index table and its inverse
# (an array indexed by id that yields the character).
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = np.array(vocab)

# Encode the whole corpus as integer ids using the char2idx table.
text_as_int = np.array([char2idx[ch] for ch in text])
idx2char, text_as_int


(array(['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?',
        'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
        'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
        'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
        'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'],
       dtype='<U1'), array([18, 47, 56, ..., 45,  8,  0]))

In [None]:
# Lookup layer that maps character strings to integer ids, built from a
# copy of the corpus vocabulary.
ids_from_chars = tf.keras.layers.experimental.preprocessing.StringLookup(
    vocabulary=[ch for ch in vocab])
ids_from_chars

<tensorflow.python.keras.layers.preprocessing.string_lookup.StringLookup at 0x7f8235c0c198>

In [None]:
# Inverse lookup (ids -> characters). It reuses the forward layer's full
# vocabulary so both layers agree on every id.
chars_from_ids = tf.keras.layers.experimental.preprocessing.StringLookup(
    invert=True,
    vocabulary=ids_from_chars.get_vocabulary())
chars_from_ids

<tensorflow.python.keras.layers.preprocessing.string_lookup.StringLookup at 0x7f8235b73e48>

In [None]:
def text_from_ids(ids):
  """Turn token ids back into text.

  Looks each id up with `chars_from_ids` and joins the resulting
  characters along the last axis into a string tensor.
  """
  chars = chars_from_ids(ids)
  return tf.strings.reduce_join(chars, axis=-1)

In [None]:
# Split the corpus into individual UTF-8 characters, then map each one to its id.
all_ids = ids_from_chars(
    tf.strings.unicode_split(input=text, input_encoding='UTF-8'))
all_ids

<tf.Tensor: shape=(1115394,), dtype=int64, numpy=array([20, 49, 58, ..., 47, 10,  2])>

In [None]:
# Stream of single character ids: one dataset element per id in all_ids.
ids_dataset = tf.data.Dataset.from_tensor_slices(tensors=all_ids)

In [None]:
# Group the id stream into chunks of seq_length + 1 ids; the extra id lets
# each chunk later be split into an input and a target of seq_length ids.
seq_length = 100
examples_per_epoch = len(text) // (seq_length + 1)
sequences = ids_dataset.batch(batch_size=seq_length + 1, drop_remainder=True)

# Peek at the first chunk, decoded back to characters.
for seq in sequences.take(1):
  print(chars_from_ids(seq))

tf.Tensor(
[b'F' b'i' b'r' b's' b't' b' ' b'C' b'i' b't' b'i' b'z' b'e' b'n' b':'
 b'\n' b'B' b'e' b'f' b'o' b'r' b'e' b' ' b'w' b'e' b' ' b'p' b'r' b'o'
 b'c' b'e' b'e' b'd' b' ' b'a' b'n' b'y' b' ' b'f' b'u' b'r' b't' b'h'
 b'e' b'r' b',' b' ' b'h' b'e' b'a' b'r' b' ' b'm' b'e' b' ' b's' b'p'
 b'e' b'a' b'k' b'.' b'\n' b'\n' b'A' b'l' b'l' b':' b'\n' b'S' b'p' b'e'
 b'a' b'k' b',' b' ' b's' b'p' b'e' b'a' b'k' b'.' b'\n' b'\n' b'F' b'i'
 b'r' b's' b't' b' ' b'C' b'i' b't' b'i' b'z' b'e' b'n' b':' b'\n' b'Y'
 b'o' b'u' b' '], shape=(101,), dtype=string)


In [None]:
# Turn each chunk into an (input, target) training pair. Both halves have
# the same length; the target is the input advanced by one position.
def split_input_target(chunk):
    """Split a chunk into an (input, target) pair offset by one element.

    Args:
        chunk: Any sliceable sequence (tensor, list, string, ...).

    Returns:
        A tuple `(chunk[:-1], chunk[1:])`: everything except the last
        element, and everything except the first element.
    """
    return chunk[:-1], chunk[1:]

# Apply the split to every chunk, producing (input, target) pairs.
dataset = sequences.map(map_func=split_input_target)
dataset


<MapDataset shapes: ((100,), (100,)), types: (tf.int64, tf.int64)>

In [None]:
# Decode the first (input, target) pair back to text to check the one-step offset.
for input_example, target_example in dataset.take(1):
    input_text = text_from_ids(input_example).numpy()
    target_text = text_from_ids(target_example).numpy()
    print("Input :", input_text)
    print("Target:", target_text)

Input : b'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'
Target: b'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '


In [None]:
# # Training examples  and targets
# # Divide text into example sequences, each input sequence will
# # contain seq_length characters from the text
# # Each sequence, the targets contain the same seq_length of text, but shifted one character to right

# seq_length = 100
# examples_per_epoch = len(text) // (seq_length + 1)

# char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# for i in char_dataset.take(5):
#     print(idx2char[i.numpy()])


In [None]:
# sequences = char_dataset.batch(seq_length + 1, drop_remainder=True)

# for item in sequences.take(5):
#     print(repr(''.join(idx2char[item.numpy()])))


In [None]:
# for input_exp, target_exp in dataset.take(1):
#     print('Input data', repr(''.join(idx2char[input_exp.numpy()])))
#     print("Target data", repr(''.join(idx2char[target_exp.numpy()])))

In [None]:
# """
#     Each index of these vectors is processed as a one
#     time step. For the input at time step 0, the model
#     receives the index for "F" and tries to predict
#     the index for "i" as the next character. At the
#     next timestep, it does the same thing but the RNN
#     considers the previous step context in addition
#     to the current input character.
# """

# for i, (input_idx, target_idx) in enumerate(zip(input_exp[:5], target_exp[:5])):
#     print("Step {:4d}".format(i))
#     print("\tinput: {} ({:s})".format(input_idx, repr(idx2char[input_idx])))
#     print("\toutput: {} ({:s})".format(target_idx, repr(idx2char[target_idx])))


In [None]:
# Number of sequences per training batch.
BATCH_SIZE = 64
# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# Build the training pipeline from `sequences` instead of re-wrapping
# `dataset` itself. The previous `dataset = dataset.shuffle(...).batch(...)`
# form was not idempotent: running the cell a second time batched the
# already-batched data. Deriving from `sequences` gives the same result
# no matter how many times the cell is executed.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(1)
)

dataset

<PrefetchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

In [None]:
# Model hyperparameters.
#   Embedding: input layer mapping each character id to a vector of
#              size embedding_dim
#   GRU:       recurrent layer with rnn_units units
#   Dense:     output layer producing one logit per vocabulary entry
vocab_size = len(vocab)  # number of distinct characters in the corpus
embedding_dim = 256
rnn_units = 1024

In [None]:
class MyModel(tf.keras.Model):
  """Character-level language model: Embedding -> GRU -> Dense.

  Args:
    vocab_size: Number of token ids; sizes both the embedding table and
      the output logits.
    embedding_dim: Width of each character embedding vector.
    rnn_units: Number of units in the GRU layer.
  """

  def __init__(self, vocab_size, embedding_dim, rnn_units):
    # `super().__init__()` already binds `self`; passing it again as a
    # positional argument is rejected by newer Keras versions.
    super().__init__()
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    """Run the model on a batch of token ids.

    Args:
      inputs: Integer token ids (used as `[batch, time]` by the callers
        in this notebook).
      states: Optional initial GRU state. When None, the GRU's own
        initial state is used.
      return_state: If True, also return the final GRU state so the
        caller can feed it back in on the next call.
      training: Forwarded to each layer.

    Returns:
      The per-timestep logits, or a `(logits, states)` tuple when
      `return_state` is True.
    """
    x = self.embedding(inputs, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    return x


# The vocabulary size comes from the StringLookup layer (not from `vocab`)
# so that the embedding and output sizes match the ids the layer produces.
model = MyModel(
    len(ids_from_chars.get_vocabulary()),
    embedding_dim,
    rnn_units,
)

In [None]:
# A one-batch view of the training pipeline (displays the dataset spec).
dataset.take(count=1)

<TakeDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

In [None]:
# For each character the model looks up the embedding, runs the GRU one
# timestep with the embedding as input, and applies the dense layer to
# generate logits predicting the log-likelihood of the next character.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

# The model's variables exist only after the first call above.
model.summary()



(64, 100, 256)
(64, 100, 67) # (batch_size, sequence_length, vocab_size)
Model: "my_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  17152     
_________________________________________________________________
gru (GRU)                    multiple                  3938304   
_________________________________________________________________
dense (Dense)                multiple                  68675     
Total params: 4,024,131
Trainable params: 4,024,131
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Sample one id per timestep from the logits of the first sequence in the
# batch, then drop the trailing sample axis and convert to a NumPy array.
sampled_indices = tf.squeeze(
    tf.random.categorical(logits=example_batch_predictions[0], num_samples=1),
    axis=-1,
).numpy()
sampled_indices

array([ 9, 13, 39, 19, 65,  3,  2, 40,  8,  5, 48, 12, 44, 57, 66,  2, 60,
       37, 40, 49, 57, 36, 49, 34, 42, 36, 64, 29, 24, 21,  5, 48,  8, 55,
       27, 63,  9, 14, 12,  3, 37, 60, 14, 26, 37, 14, 28, 20, 61, 17, 36,
       12, 33, 10,  1, 63, 59, 39, 63, 17, 24, 23, 63,  4, 46,  7, 61, 52,
       54, 42, 36,  0,  6, 18, 53, 15, 62,  2, 56, 27,  3, 24,  6, 20, 60,
       55, 42,  9, 30, 34, 35, 61, 59,  0, 31, 34, 32, 48, 30,  4])

In [None]:
# print("Input: \n", repr(''.join(idx2char[input_example_batch[0]])))
# print("Next Char Predictions: \n", repr(''.join(idx2char[sampled_indices])))

In [None]:
def loss_sparse(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    `from_logits=True` tells Keras the predictions are unnormalised logits.
    """
    per_token_loss = tf.keras.losses.sparse_categorical_crossentropy(
        y_true=labels, y_pred=logits, from_logits=True)
    return per_token_loss

# Loss object used for compilation; the model emits logits, hence from_logits=True.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

# Loss of the untrained model on the example batch.
example_batch_loss = loss(y_true=target_example_batch,
                          y_pred=example_batch_predictions)
example_batch_loss.numpy().mean()


4.2046723

In [None]:
# Configure training: Adam optimizer with the sparse cross-entropy loss defined above.
model.compile(loss=loss, optimizer='adam')

In [None]:
# Number of full passes over the training dataset.
EPOCHS = 20
history = model.fit(x=dataset, epochs=EPOCHS)


Epoch 1/20
(64, 100, 256)
(64, 100, 256)
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
class OneStep(tf.keras.Model):
  """Wraps a trained character model to sample one character per call.

  Args:
    model: Model called as
      `model(inputs=ids, states=states, return_state=True)` and returning
      `(logits, states)`.
    chars_from_ids: Lookup layer mapping token ids to characters.
    ids_from_chars: Lookup layer mapping characters to token ids.
    temperature: Divisor applied to the logits before sampling.
  """

  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Create a mask to prevent "" or "[UNK]" from being generated.
    skip_ids = self.ids_from_chars(['', '[UNK]'])[:, None]
    sparse_mask = tf.SparseTensor(
        # Put a -inf at each bad index.
        values=[-float('inf')] * len(skip_ids),
        indices=skip_ids,
        # Match the shape to the vocabulary
        dense_shape=[len(ids_from_chars.get_vocabulary())])
    self.prediction_mask = tf.sparse.to_dense(sparse_mask)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    """Sample the next character for each input string.

    Args:
      inputs: String tensor holding the text generated so far (or the
        previously sampled character when `states` is passed back in).
      states: Model state returned by the previous call, or None to
        start from the model's initial state.

    Returns:
      A `(predicted_chars, states)` tuple: the sampled characters and the
      model state to pass to the next call.
    """
    # Convert strings to token IDs. (No debug prints here: inside a
    # tf.function they run only while tracing and show symbolic tensors.)
    input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
    input_ids = self.ids_from_chars(input_chars).to_tensor()

    # Run the model.
    # predicted_logits.shape is [batch, char, next_char_logits]
    predicted_logits, states = self.model(inputs=input_ids, states=states,
                                          return_state=True)
    # Only use the last prediction.
    predicted_logits = predicted_logits[:, -1, :]
    predicted_logits = predicted_logits / self.temperature
    # Apply the prediction mask: prevent "" or "[UNK]" from being generated.
    predicted_logits = predicted_logits + self.prediction_mask

    # Sample the output logits to generate token IDs.
    predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
    predicted_ids = tf.squeeze(predicted_ids, axis=-1)

    # Convert from token ids to characters
    predicted_chars = self.chars_from_ids(predicted_ids)

    # Return the characters and model state.
    return predicted_chars, states

In [None]:
# Single-step sampler around the trained model, using the default temperature.
one_step_model = OneStep(
    model=model,
    chars_from_ids=chars_from_ids,
    ids_from_chars=ids_from_chars,
)


In [None]:
# Generate 1000 characters starting from the seed "ROMEO:", timing the run.
start = time.time()
states = None
next_char = tf.constant(['ROMEO:'])
result = [next_char]

for n in range(1000):
  # Feed the last sampled character and the carried-over state back in.
  step_output = one_step_model.generate_one_step(next_char, states=states)
  next_char, states = step_output
  result.append(next_char)

# Concatenate the seed and all sampled characters into one string tensor.
result = tf.strings.join(result)
end = time.time()

print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)

print(f"\nRun time: {end - start}")

Input chars tf.RaggedTensor(values=Tensor("UnicodeSplit/UnicodeEncode/UnicodeEncode/UnicodeEncode/UnicodeEncode:0", shape=(None,), dtype=string), row_splits=Tensor("UnicodeSplit/UnicodeDecode:0", shape=(2,), dtype=int64))
Input Shape (1, None)
(1, None, 256)
Input chars tf.RaggedTensor(values=Tensor("UnicodeSplit/UnicodeEncode/UnicodeEncode/UnicodeEncode/UnicodeEncode:0", shape=(None,), dtype=string), row_splits=Tensor("UnicodeSplit/UnicodeDecode:0", shape=(2,), dtype=int64))
Input Shape (1, None)
(1, None, 256)
ROMEO:
My lord, I purpose it.

DUKE VINCENTIO:
'Tis foe to-day, and gentlemen born.

Clown:
We cannot tell below.

LADY ANNE:
Before we could thither come athough to my death?

NORTHUMBERLAND:
My lord, have we now gone, 'tis parties: but if you
serve the ordare hath kept with me; besides and kings,
And bid her venturous to my love.

FLORD ROSS:
Why, phear his heart, and even your impression would Show her,
Are denied to look on his noble hand:
And because the idle dully unto th

In [None]:
# Export the single-step generator as a SavedModel, then load it back.
tf.saved_model.save(obj=one_step_model, export_dir='one_step')
one_step_reloaded = tf.saved_model.load(export_dir='one_step')

(None, 100, 256)
(None, 100, 256)
(None, 100, 256)
(None, 100, 256)
Input chars tf.RaggedTensor(values=Tensor("UnicodeSplit/UnicodeEncode/UnicodeEncode/UnicodeEncode/UnicodeEncode:0", shape=(None,), dtype=string), row_splits=Tensor("UnicodeSplit/UnicodeDecode:0", shape=(2,), dtype=int64))
Input Shape (1, None)
(1, None, 256)
Input chars tf.RaggedTensor(values=Tensor("UnicodeSplit/UnicodeEncode/UnicodeEncode/UnicodeEncode/UnicodeEncode:0", shape=(None,), dtype=string), row_splits=Tensor("UnicodeSplit/UnicodeDecode:0", shape=(2,), dtype=int64))
Input Shape (1, None)
(1, None, 256)
(None, 100, 256)
(None, 100, 256)
(None, 100, 256)
(None, 100, 256)




INFO:tensorflow:Assets written to: one_step/assets


INFO:tensorflow:Assets written to: one_step/assets


In [None]:
# Generate 100 characters with the reloaded SavedModel, from the same seed.
states = None
next_char = tf.constant(['ROMEO:'])
result = [next_char]

for n in range(100):
  step_output = one_step_reloaded.generate_one_step(next_char, states=states)
  next_char, states = step_output
  result.append(next_char)

generated = tf.strings.join(result)[0]
print(generated.numpy().decode("utf-8"))









ROMEO:
O, that once comes thereaf, good-song--manded; when
Is this her face-blight day and stood ugning.




In [None]:
# model.compile(optimizer='adam', loss=loss)
#
# checkpoint_dir = './training_checkpoint'
#
# checkpoint_predix = os.path.join(checkpoint_dir, 'ckpt_{epoch}')
#
# checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
#     filepath=checkpoint_predix,
#     save_weights_only=True
# )
# Optimizer used by the custom training step below.
optimizer = tf.keras.optimizers.Adam()

@tf.function
def train_step(inp, target):
    """Run one optimisation step on a single batch.

    Args:
        inp: Batch of input token ids.
        target: Batch of target token ids (the inputs advanced by one).

    Returns:
        The mean sparse categorical cross-entropy loss for the batch.
    """
    with tf.GradientTape() as tape:
        # Run the forward pass in training mode, as a training loop should.
        predictions = model(inp, training=True)
        # Named `batch_loss` so it does not shadow the module-level `loss` object.
        batch_loss = tf.reduce_mean(loss_sparse(target, predictions))

    gradients = tape.gradient(batch_loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return batch_loss


In [None]:
EPOCHS = 10
checkpoint_dir = './training_checkpoint'

# Checkpoint path template; `{epoch}` is filled in with the 1-based epoch number.
# (The variable keeps its original spelling so existing references still work.)
checkpoint_predix = os.path.join(checkpoint_dir, 'ckpt_{epoch}')

# NOTE(review): this callback is not used by the manual loop below, which
# saves weights directly; it is kept in case it is passed to `model.fit`.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_predix,
    save_weights_only=True
)

for epoch in range(EPOCHS):
    start = time.time()

    # reset hidden state
    # NOTE(review): the GRU is not created with stateful=True, so this is
    # presumably a no-op; confirm before relying on it.
    model.reset_states()

    for (batch_n, (inp, target)) in enumerate(dataset):
        # `batch_loss` (not `loss`) so the module-level loss object passed to
        # `model.compile` is not overwritten with a tensor.
        batch_loss = train_step(inp, target)

        if batch_n % 100 == 0:
            print('Epoch {} Batch {} Loss {}'.format(epoch + 1, batch_n, batch_loss))

    if (epoch + 1) % 5 == 0:
        # Name the checkpoint with the 1-based epoch number, matching the
        # logs and the save condition (previously ckpt_4 / ckpt_9).
        model.save_weights(checkpoint_predix.format(epoch=epoch + 1))

    print('Epoch {} Loss {:.4f}'.format(epoch + 1, batch_loss))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))


# Final weights after the last epoch; EPOCHS is used so this line does not
# depend on the loop variable being defined.
model.save_weights(checkpoint_predix.format(epoch=EPOCHS))


(64, 100, 256)
(64, 100, 256)
Epoch 1 Batch 0 Loss 0.6629511117935181
Epoch 1 Batch 100 Loss 0.7019443511962891
Epoch 1 Loss 0.7169
Time taken for 1 epoch 11.343119859695435 sec

Epoch 2 Batch 0 Loss 0.5859658718109131
Epoch 2 Batch 100 Loss 0.6386222243309021
Epoch 2 Loss 0.6507
Time taken for 1 epoch 10.347403287887573 sec

Epoch 3 Batch 0 Loss 0.5183229446411133
Epoch 3 Batch 100 Loss 0.5777083039283752
Epoch 3 Loss 0.6360
Time taken for 1 epoch 10.415955305099487 sec

Epoch 4 Batch 0 Loss 0.5053024888038635
Epoch 4 Batch 100 Loss 0.5534664392471313
Epoch 4 Loss 0.6094
Time taken for 1 epoch 10.40802001953125 sec

Epoch 5 Batch 0 Loss 0.4681413173675537
Epoch 5 Batch 100 Loss 0.5183385610580444
Epoch 5 Loss 0.5968
Time taken for 1 epoch 10.565070629119873 sec

Epoch 6 Batch 0 Loss 0.4594515860080719
Epoch 6 Batch 100 Loss 0.49798935651779175
Epoch 6 Loss 0.5562
Time taken for 1 epoch 10.451234340667725 sec

Epoch 7 Batch 0 Loss 0.4375821352005005
Epoch 7 Batch 100 Loss 0.49340307712

In [None]:
# Ids of the tokens that must never be sampled, as a column vector
# (one index per row) suitable for use as SparseTensor indices.
skip_ids = ids_from_chars(['', '[UNK]'])[:, tf.newaxis]
skip_ids

<tf.Tensor: shape=(2, 1), dtype=int64, numpy=
array([[0],
       [1]])>

In [None]:
# Display the lookup layer's full vocabulary; list position corresponds to token id.
ids_from_chars.get_vocabulary()

['',
 '[UNK]',
 '\n',
 ' ',
 '!',
 '$',
 '&',
 "'",
 ',',
 '-',
 '.',
 '3',
 ':',
 ';',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z']

In [None]:
# Sparse vector over the vocabulary holding -inf at every id in skip_ids
# (the same construction OneStep uses for its prediction mask).
tf.SparseTensor(
    indices=skip_ids,
    values=[-float('inf')] * len(skip_ids),
    dense_shape=[len(ids_from_chars.get_vocabulary())],
)

<tensorflow.python.framework.sparse_tensor.SparseTensor at 0x7f819abc96a0>

In [None]:
# Size of the lookup layer's vocabulary, including the '' and '[UNK]' entries.
len(ids_from_chars.get_vocabulary())

67

In [None]:
# Two seed forms: a plain Python string and a batch-of-one string tensor.
inputs = "ROMEO: "
next_char = tf.constant(['ROMEO:'])
print(next_char)

# Splitting a batch of strings gives one row of characters per string;
# round-trip it through nested Python lists back into a ragged constant.
input_chars = tf.strings.unicode_split(input=next_char, input_encoding='UTF-8')
char_rows = input_chars.to_list()
tf.ragged.constant(char_rows)


tf.Tensor([b'ROMEO:'], shape=(1,), dtype=string)


<tf.RaggedTensor [[b'R', b'O', b'M', b'E', b'O', b':']]>

In [None]:
# tf.ragged.constant needs the nested Python list to convert; rows may have
# different lengths. (The previous call had a stray leading comma and no
# argument, so the cell raised an error.)
tf.ragged.constant([[b'R', b'O', b'M'], [b'E', b'O']])

TypeError: ignored

In [None]:
# input_ids = ids_from_chars(input_chars)
# input_ids
# `inputs` is a single Python string here, so the split presumably yields a
# flat (1-D) tensor of characters rather than a batch.
input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
input_ids = ids_from_chars(input_chars)
input_ids = tf.convert_to_tensor(input_ids)
print("input_ids shape:", input_ids.shape)

# Add a leading batch dimension: the model consumes [batch, time] ids.
# tf.reshape does not accept None in `shape`; -1 lets it infer that dimension.
# NOTE(review): a batch of one sequence is assumed to be the intent - confirm.
input_ids = tf.reshape(input_ids, shape=[1, -1])
input_ids
    # Run the model.
    # predicted_logits.shape is [batch, char, next_char_logits] 
# predicted_logits, states =  model(inputs=input_ids, states=states, 
                                          # return_state=True)

In [None]:
# Start from the model's initial GRU state.
states = None
# The GRU needs [batch, time] ids; reshaping to [1, -1] adds the batch
# dimension when `input_ids` is a flat vector and leaves an already
# batched [1, time] tensor unchanged.
model(inputs=tf.reshape(input_ids, [1, -1]), states=states, return_state=True)