<a href="https://colab.research.google.com/github/GlassesNoGlasses/TFProjects/blob/main/projects/text/Harry_Potter_Text_Generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
# Clone from GitHub repository

!git clone https://github.com/GlassesNoGlasses/TFProjects.git

fatal: destination path 'TFProjects' already exists and is not an empty directory.


**Goal**: Generate text in the style of the Harry Potter books. We will use a character-level RNN built with Keras, similar to the TensorFlow text-generation tutorial.

In [22]:
# Required Imports
import tensorflow as tf

import numpy as np
import os
import time

In [23]:
# Load the preprocessed Harry Potter text (all books) as a single string.
# The file is read as raw bytes and decoded explicitly as UTF-8; the context
# manager closes the file handle as soon as the read completes.

with open('/content/TFProjects/data/texts/Harry_Potter_all_books_preprocessed.txt', 'rb') as text_file:
  text = text_file.read().decode(encoding='utf-8')

In [24]:
# Character-level vocabulary: every distinct character in the loaded text,
# in sorted order.

vocab = list(set(text))
vocab.sort()
print(f'{len(vocab)} unique characters')

71 unique characters


In [25]:
# Lookup layer that maps each vocabulary character to an integer id.
# No mask token is configured (mask_token=None).

ids_from_chars = tf.keras.layers.StringLookup(mask_token=None, vocabulary=list(vocab))

In [26]:
# Inverse lookup: maps integer ids back to characters, using exactly the
# vocabulary reported by ids_from_chars.

chars_from_ids = tf.keras.layers.StringLookup(
    invert=True,
    mask_token=None,
    vocabulary=ids_from_chars.get_vocabulary())

In [27]:
# Join ids back into the original strings

def text_from_ids(ids):
  """Look up the character for each id and join them along the last axis."""
  chars = chars_from_ids(ids)
  return tf.strings.reduce_join(chars, axis=-1)

In [28]:
# Split the text into individual UTF-8 characters and map each one to its id
all_ids = ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))


# Wrap the ids in a tf.data.Dataset sliced along the first axis of all_ids,
# i.e. a stream of character ids in original text order
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

In [29]:
# Number of characters in each training input sequence
seq_length = 120

In [30]:
# Group the id stream into chunks of seq_length + 1 ids: the extra id lets
# each chunk be split into an input and a target shifted by one position.
# A trailing chunk shorter than that is dropped.
sequences = ids_dataset.batch(batch_size=seq_length + 1, drop_remainder=True)

We are trying to predict the next character only.

In [31]:
# Split a sequence into an (input, label) pair offset by one position
# I.e. "tensorflow" -> ("tensorflo", "ensorflow")

def split_input_target(sequence):
    """Return (sequence without its last element, sequence without its first)."""
    return sequence[:-1], sequence[1:]

In [32]:
# Turn every chunk into an (input, target) training pair
dataset = sequences.map(map_func=split_input_target)

In [33]:
# Shuffle the (input, target) pairs and group them into training batches

BATCH_SIZE = 64

# Size of the buffer that shuffle() draws elements from
BUFFER_SIZE = 10000

# NOTE: this rebinds `dataset`. Re-running this cell on its own would shuffle
# and batch the already-batched dataset, so re-run the cell that creates
# `dataset` first.
dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    # tf.data.AUTOTUNE is the stable name for the deprecated
    # tf.data.experimental.AUTOTUNE alias.
    .prefetch(tf.data.AUTOTUNE))

dataset


<_PrefetchDataset element_spec=(TensorSpec(shape=(64, 120), dtype=tf.int64, name=None), TensorSpec(shape=(64, 120), dtype=tf.int64, name=None))>

In [34]:
# Number of entries in the StringLookup layer's vocabulary
vocab_size = len(ids_from_chars.get_vocabulary())

# Length of the embedding vector learned for each character id
embedding_dim = 256

# Number of units in the GRU layer
rnn_units = 1024

In [35]:
class MyModel(tf.keras.Model):
  """Character-level language model: Embedding -> GRU -> Dense logits."""

  def __init__(self, vocab_size, embedding_dim, rnn_units):
    """Create the model's layers.

    Args:
      vocab_size: number of distinct token ids; used as both the embedding
        input size and the number of output logits.
      embedding_dim: length of the embedding vector for each id.
      rnn_units: number of units in the GRU layer.
    """
    # Do not pass `self` explicitly: super() already binds it, and an extra
    # positional argument would be forwarded to tf.keras.Model.__init__.
    super().__init__()
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    # Return the output at every position plus the final state, so callers
    # can feed the state back in on the next call.
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
    # One unnormalised score (logit) per vocabulary entry
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    """Compute next-token logits for every position of `inputs`.

    Args:
      inputs: token ids to run through the model.
      states: GRU state to start from; when None the GRU's initial state
        is used.
      return_state: when True, also return the GRU state after the last step.
      training: forwarded to each layer.

    Returns:
      The logits, or a (logits, states) pair when `return_state` is True.
    """
    x = inputs
    x = self.embedding(x, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    else:
      return x

In [36]:
# Instantiate the model with the hyperparameters chosen earlier
model = MyModel(vocab_size=vocab_size, embedding_dim=embedding_dim, rnn_units=rnn_units)

In [37]:
# Loss: sparse categorical cross-entropy computed on the model's raw logits
# (from_logits=True)

loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

In [38]:
# Attach the Adam optimizer and the loss object to the model

model.compile(loss=loss, optimizer='adam')

In [39]:
# Checkpoints are written under this directory
checkpoint_dir = './training_checkpoints'
# File name template; {epoch} is filled in when a checkpoint is saved
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Keras callback that saves only the model weights to the template path
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    save_weights_only=True,
    filepath=checkpoint_prefix)

In [40]:
# Train with Keras' built-in loop, checkpointing through the callback

EPOCHS = 20

history = model.fit(
    dataset,
    callbacks=[checkpoint_callback],
    epochs=EPOCHS)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [41]:
# Text generation wrapper: samples one character at a time from a trained model

class OneStep(tf.keras.Model):
  """Wraps a trained model to generate text one character per call."""

  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    """Store the model and lookup layers and build the "[UNK]" mask.

    Args:
      model: model called as model(inputs=..., states=..., return_state=True).
      chars_from_ids: lookup layer mapping ids to characters.
      ids_from_chars: lookup layer mapping characters to ids.
      temperature: divisor applied to the logits before sampling.
    """
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Create a mask to prevent "[UNK]" from being generated.
    # [:, None] adds a trailing axis so each id becomes a one-element index row.
    skip_ids = self.ids_from_chars(['[UNK]'])[:, None]
    sparse_mask = tf.SparseTensor(
        # Put a -inf at each bad index.
        values=[-float('inf')]*len(skip_ids),
        indices=skip_ids,
        # Match the shape to the vocabulary
        dense_shape=[len(ids_from_chars.get_vocabulary())])
    # Dense vector of vocabulary length that is added to the logits in
    # generate_one_step.
    self.prediction_mask = tf.sparse.to_dense(sparse_mask)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    """Sample the next character for each input string.

    Args:
      inputs: string tensor; each string is split into UTF-8 characters.
      states: model state from a previous call, or None.

    Returns:
      A (predicted_chars, states) pair: the sampled characters and the state
      returned by the model.
    """
    # Convert strings to token IDs.
    input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
    input_ids = self.ids_from_chars(input_chars).to_tensor()

    # Run the model.
    # predicted_logits.shape is [batch, char, next_char_logits]
    predicted_logits, states = self.model(inputs=input_ids, states=states,
                                          return_state=True)
    # Only use the last prediction.
    predicted_logits = predicted_logits[:, -1, :]
    predicted_logits = predicted_logits/self.temperature
    # Apply the prediction mask: prevent "[UNK]" from being generated.
    predicted_logits = predicted_logits + self.prediction_mask

    # Sample the output logits to generate token IDs.
    predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
    predicted_ids = tf.squeeze(predicted_ids, axis=-1)

    # Convert from token ids to characters
    predicted_chars = self.chars_from_ids(predicted_ids)

    # Return the characters and model state.
    return predicted_chars, states

In [42]:
# Wrap the current model and the two lookup layers in a one-step text generator
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

In [43]:
# Generate 1000 characters after the seed text, feeding each sampled
# character and the returned state back into the next step.
start = time.time()
states = None
next_char = tf.constant(['CHAPTER'])
result = [next_char]

for _ in range(1000):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

# Concatenate the seed and all sampled characters into one string per batch entry
result = tf.strings.join(result)
end = time.time()
print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print('\nRun time:', end - start)

CHAPTER WI d HE DIDGS YOU TEOULL VIST TREAC !Ron and Hermione huddled for so long his body maiy .Two Monsters of Hogwarts crammed with a leap of memory on the nose of his face and showed that he was so he could still just stay on the house fists now in the dark ground .Are you not known know better this ?It it true however clear night concealed Malfoy and his voice pinning with to his legs again at each other into the air as the ghats rose into tears .Rabelook .well .Daylight for a wizard we came back again .And Amounted superviously bothered the wating wand still there was a slightly wash shaggy tinglegs of books on the floor let out a hand in frightly so that he broke the envelope .To Harrys old pieter .Didnt take this growl .So time you see said Harry What does that not it ?Im very proofill it it murder really lost .And when we brought this owls .Harry youve got to go to our first feagness .Be a real private hair of cheering the true had been very good to curse human life we got to 

In [44]:
class CustomTraining(MyModel):
  """MyModel variant whose train_step is written out with tf.GradientTape."""

  @tf.function
  def train_step(self, inputs):
    """Run one optimisation step on a batch.

    Args:
      inputs: an (inputs, labels) pair.

    Returns:
      A dict holding the batch loss under the key 'loss'.
    """
    inputs, labels = inputs
    with tf.GradientTape() as tape:
      predictions = self(inputs, training=True)
      loss = self.loss(labels, predictions)
    # Use this instance's own variables rather than whatever object the
    # notebook-global name `model` happens to refer to.
    grads = tape.gradient(loss, self.trainable_variables)
    self.optimizer.apply_gradients(zip(grads, self.trainable_variables))

    return {'loss': loss}

In [46]:
# Build a fresh model that uses the custom train_step, compile it, and train
# it through the regular Keras fit() loop.
model = CustomTraining(
    rnn_units=rnn_units,
    embedding_dim=embedding_dim,
    vocab_size=len(ids_from_chars.get_vocabulary()))

model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam())

model.fit(dataset, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7f747c751870>

In [47]:
# Hand-written training loop that calls train_step directly for each batch

EPOCHS = 50

# Save a checkpoint after every this many epochs
CHECKPOINT_EVERY = 5

# Running mean of the per-batch loss within an epoch
mean = tf.metrics.Mean()

for epoch in range(EPOCHS):
    start = time.time()

    # reset_state() replaces the deprecated reset_states() spelling
    mean.reset_state()
    for (batch_n, (inp, target)) in enumerate(dataset):
        logs = model.train_step([inp, target])
        mean.update_state(logs['loss'])

        if batch_n % 50 == 0:
            template = f"Epoch {epoch+1} Batch {batch_n} Loss {logs['loss']:.4f}"
            print(template)

    # Checkpoint names use the 1-based epoch number so they match the epoch
    # numbers printed by this loop.
    if (epoch + 1) % CHECKPOINT_EVERY == 0:
        model.save_weights(checkpoint_prefix.format(epoch=epoch + 1))

    print()
    print(f'Epoch {epoch+1} Loss: {mean.result().numpy():.4f}')
    print(f'Time taken for 1 epoch {time.time() - start:.2f} sec')
    print("_"*80)

# Save the final weights unless the last epoch was already checkpointed above
if EPOCHS % CHECKPOINT_EVERY != 0:
    model.save_weights(checkpoint_prefix.format(epoch=EPOCHS))

Epoch 1 Batch 0 Loss 0.9822
Epoch 1 Batch 50 Loss 0.9678
Epoch 1 Batch 100 Loss 0.9402
Epoch 1 Batch 150 Loss 0.9641
Epoch 1 Batch 200 Loss 0.9406
Epoch 1 Batch 250 Loss 0.9335
Epoch 1 Batch 300 Loss 0.9522
Epoch 1 Batch 350 Loss 0.9485
Epoch 1 Batch 400 Loss 0.9782
Epoch 1 Batch 450 Loss 0.9511
Epoch 1 Batch 500 Loss 0.9701
Epoch 1 Batch 550 Loss 0.9864
Epoch 1 Batch 600 Loss 0.9507
Epoch 1 Batch 650 Loss 0.9751
Epoch 1 Batch 700 Loss 0.9670
Epoch 1 Batch 750 Loss 0.9448

Epoch 1 Loss: 0.9545
Time taken for 1 epoch 53.83 sec
________________________________________________________________________________
Epoch 2 Batch 0 Loss 0.9777
Epoch 2 Batch 50 Loss 0.9314
Epoch 2 Batch 100 Loss 0.9465
Epoch 2 Batch 150 Loss 0.9761
Epoch 2 Batch 200 Loss 0.9513
Epoch 2 Batch 250 Loss 0.9450
Epoch 2 Batch 300 Loss 0.9426
Epoch 2 Batch 350 Loss 0.9313
Epoch 2 Batch 400 Loss 0.9598
Epoch 2 Batch 450 Loss 0.9211
Epoch 2 Batch 500 Loss 0.9366
Epoch 2 Batch 550 Loss 0.9274
Epoch 2 Batch 600 Loss 0.9573


KeyboardInterrupt: 

In [49]:
# Rebuild the one-step generator around the current model, then generate
# 1000 characters after the seed text, carrying the state between steps.
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

start = time.time()
states = None
next_char = tf.constant(['Harry Potter was dead.'])
result = [next_char]

for _ in range(1000):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

# Concatenate the seed and all sampled characters into one string per batch entry
result = tf.strings.join(result)
end = time.time()
print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print('\nRun time:', end - start)

Harry Potter was dead.Skells did everybody echoed Harry added behind them so that he could not be remonstrating to sound as though if I were all yelling in the dark end students will take plenfylusion of Sirius Black had heard .The whole farther or out for us all three of us is only said Harry quietly .Please meet these things along in the class and Cedric .Dad will corridor Miss Nor much Sirius looked down at his own last sparks .Slowly very still longly very protected and happened to help him who Still said the Malfoy was already shrugging .When he lay moving out of the way and entered .Oh ever he just unseen pures !said Bagman pounding his hand at Cedrics .Where ?Sorry hes alrowed dreaming that the curse was unhalk to him why dont you make everyone dose that the Death Eaters ahead these words will matter of what he had said about anything else ?said Harry quickly shifted .My parents !said Mr Weasley quietly .Seither than depressed it practical echoes about okay Tom sick .He took a d