# Text generation with an RNN

The aim of the project is to train a model to predict the next characters in a sequence. Also, longer sequences of text can be generated by calling the model repeatedly.

In the beginning, the model does not know what words are, or even how to spell a single English word. The model is trained on small batches of text (100 characters each) and is then made to generate longer sequences of text. The output is a block of text beginning with a speaker's name.

# 1. Importing Modules

Importing TensorFlow and other libraries

In [None]:
import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing
import numpy as np
import os
import time

# 2. Data Loading

We will be working on the dataset containing Shakespeare's writing from Andrej Karpathy's The Unreasonable Effectiveness of Recurrent Neural Networks.

In [None]:
# Fetch the Shakespeare corpus and keep the path of the local copy
load_file = tf.keras.utils.get_file(
    fname='shakespeare.txt',
    origin='https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt',
)

Reading the data

In [None]:
# Read the raw bytes and decode them as UTF-8. The context manager closes the
# file handle as soon as the read finishes instead of leaking it.
with open(load_file, 'rb') as corpus_file:
    data_text = corpus_file.read().decode(encoding='utf-8')

In [None]:
# Preview the first 200 characters of the corpus
print(data_text[:200])

In [None]:
# Report the corpus size in characters
print(f'Length of text: {len(data_text)} characters')

Unique characters in the file - creating vocabulary

In [None]:
# The vocabulary is the sorted list of distinct characters found in the corpus
vocabulary = sorted(set(data_text))
print(f'{len(vocabulary)} unique characters')

# 3. Processing the Data

Converting the strings to a numerical representation - vectorization

In [None]:
# Split each sample string into its individual UTF-8 characters and print them

sample_txt = ['abcdefg', 'xyz']
characters = tf.strings.unicode_split(sample_txt, input_encoding='UTF-8')
print(characters)

In [None]:
# Create a StringLookup layer over the vocabulary; it converts characters
# to integer ids

str_to_num = preprocessing.StringLookup(vocabulary=list(vocabulary))
# Look up the ids of the sample characters split in the previous cell
num_id = str_to_num(characters)
print(num_id)

Recover human-readable strings from numerical Id

In [None]:
# Inverse lookup layer: built from str_to_num's vocabulary with invert=True,
# so it converts integer ids back to characters.
num_to_str = tf.keras.layers.experimental.preprocessing.StringLookup(vocabulary=str_to_num.get_vocabulary(), invert=True)
# NOTE(review): this rebinds the builtin name `str`. A later cell reads this
# variable, so it is left unchanged here.
str = num_to_str(num_id)
print(str)

In [None]:
# Join the recovered characters along the last axis to get whole strings back

joined_str=tf.strings.reduce_join(str, axis=-1).numpy()
print(joined_str)

In [None]:
# creating a final function

def convert_to_text(number_ids):
    """Turn character ids back into text.

    The ids are mapped to characters with ``num_to_str`` and the characters
    are then joined along the last axis.
    """
    chars = num_to_str(number_ids)
    return tf.strings.reduce_join(chars, axis=-1)

# 4. Pre-Training Processing

The next step is to get the data ready so that the model can be trained on the prediction task.

The input to the model is a sequence of characters, and the model is trained to predict the output i.e. the next characters after the input text. And RNNs have a unique property to maintain an internal state which depends on the previously seen characters.

Divide the text into example sequences. For each input sequence, the corresponding target contains the same length of text, shifted one character to the right. The text is therefore broken into chunks of input text length + 1.

In [None]:
# Split the whole corpus into characters and convert every character to its id

all_num_ids = str_to_num(tf.strings.unicode_split(data_text, 'UTF-8'))
print(all_num_ids)

In [None]:
# Wrap the id tensor in a tf.data.Dataset that is sliced along its first axis

num_id_dataset = tf.data.Dataset.from_tensor_slices(all_num_ids)
print(num_id_dataset)

In [None]:
# Decode the first 30 character ids and print them, one character per line

# The loop variable is named `char_id` rather than `id` so that the builtin
# id() is not shadowed for the rest of the session.
for char_id in num_id_dataset.take(30):
    print(num_to_str(char_id).numpy().decode('utf-8'))

Creating batches of data to train

In [None]:
# Length of each model input, in characters
seq_length = 100
# Number of full (seq_length + 1)-character chunks that fit in the corpus
examples_per_epoch = len(data_text)//(seq_length+1)
# Group the ids into chunks of seq_length + 1; drop_remainder discards a
# final chunk that would be shorter than the others.
sequences = num_id_dataset.batch(seq_length+1, drop_remainder=True)
# Show the first chunk as individual characters
for seq in sequences.take(1):
    print(num_to_str(seq))

Converting characters to proper sentences

In [None]:
# Show the first five chunks joined back into readable text
for seq in sequences.take(5):
    print(convert_to_text(seq).numpy())

For training one needs two things: (input and labels) and at each time step the input is the current character and the label is the next character.

In [None]:
def input_label(sample_txt):
    """Split a sequence into an (input, target) pair offset by one position.

    The input is the sequence without its last element and the target is the
    sequence without its first element, so ``target[i]`` is the element that
    follows ``input[i]``.
    """
    return sample_txt[:-1], sample_txt[1:]

In [None]:
# Demonstrate input_label on a short string, passed in as a list of characters
string="Value ML"
# NOTE(review): this rebinds the builtin name `input`. The next cell reads
# this variable, so it is left unchanged here.
input,label=input_label(list(string))

In [None]:
# Print the demo string next to its input/label split
print("The string is: ",string)
print("Current Character: ",input)
print("Next Character: ",label)

In [None]:
# Apply input_label to every chunk, producing (input, target) pairs
dataset = sequences.map(input_label)

In [None]:
# Bare expression: shows the dataset's repr when run as a notebook cell
dataset

In [None]:
# Print the characters of the first (input, target) pair
for input_example, target_example in  dataset.take(1):
    print("Current characters:",num_to_str(input_example).numpy())
    print("Next characters:",num_to_str(target_example).numpy())

Create training batches

Splitting the text into manageable sequences, shuffling the data and packing it into batches

Buffer size used to shuffle the dataset

In [None]:
# Number of (input, target) pairs per training batch
BATCH_SIZE = 64
# Size of the buffer used by shuffle()
BUFFER_SIZE = 10000

# Shuffle, group into fixed-size batches (dropping a short final batch),
# and prefetch.
dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE)
)
print(dataset)

# 5. Building the model

In [None]:
# NOTE(review): vocab_len is not read by any later cell shown here; the model
# is sized from str_to_num.get_vocabulary() instead.
vocab_len=len(vocabulary)
# Passed to the model as the Embedding layer's output dimension
embedding_dim = 256
# Passed to the model as the number of GRU units
rnn_units = 1024

In [None]:
class Text_Gen_Model(tf.keras.Model):
    """Character-level model: Embedding -> GRU -> Dense.

    The final Dense layer has no activation, so the model outputs one score
    per vocabulary entry for every position of the input sequence.
    """

    def __init__(self, vocab_size, embedding_dim, rnn_units):
        """Create the three layers.

        Args:
            vocab_size: number of entries in the embedding table and number
                of outputs of the final Dense layer.
            embedding_dim: output dimension of the Embedding layer.
            rnn_units: number of units in the GRU layer.
        """
        # super() already binds the instance; passing `self` again would hand
        # the model to the base constructor as an extra positional argument.
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        # return_sequences gives an output for every timestep; return_state
        # also returns the final state so that it can be passed back in.
        self.gru = tf.keras.layers.GRU(rnn_units,
                                       return_sequences=True,
                                       return_state=True)
        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs, states=None, return_state=False, training=False):
        """Run the model on a batch of character ids.

        Args:
            inputs: character ids to feed to the Embedding layer.
            states: GRU state to start from; when None, the GRU's own
                initial state is used.
            return_state: when True, also return the final GRU state.
            training: forwarded to every layer.

        Returns:
            The Dense layer's output, or ``(output, states)`` when
            ``return_state`` is True.
        """
        x = inputs
        x = self.embedding(x, training=training)
        if states is None:
            states = self.gru.get_initial_state(x)
        x, states = self.gru(x, initial_state=states, training=training)
        x = self.dense(x, training=training)
        if return_state:
            return x, states
        else:
            return x

In [None]:
# Size the model from the lookup layer's vocabulary
model = Text_Gen_Model(
    vocab_size=len(str_to_num.get_vocabulary()),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
)

In [None]:
# Bare expression: shows the model's repr when run as a notebook cell
model

In [None]:
# A subclassed Keras model has no weights until it has been called once, and
# summary() raises ValueError on an unbuilt model. Run one batch through the
# model first so that it is built.
for warmup_batch, _ in dataset.take(1):
    model(warmup_batch)
model.summary()

# 6. Trying out the Model

Running the model to see that it behaves as expected

In [None]:
# Run the untrained model on one batch and print the shape of its output.
# The loop variables are kept and reused by later cells.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, " -> (batch_size, sequence_length, vocab_size)")

Trying the model for the first example in the batch

In [None]:
# Draw one sample per position from the predictions of the first example
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
# Remove the trailing num_samples axis and convert to a NumPy array
sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()

In [None]:
# Bare expression: shows the sampled ids when run as a notebook cell
sampled_indices

Decode to see the predicted text

In [None]:
# Print the first example's input characters, then the characters sampled
# from the model's predictions
print("Input:\n", num_to_str(input_example_batch[0]).numpy())
print()
print("Next Char Predictions:\n", num_to_str(sampled_indices).numpy())

# 7. Training the Model

Attaching an optimizer, and a loss function

In [None]:
# from_logits=True: the model's final Dense layer has no activation, so its
# outputs are raw scores rather than probabilities
cal_losses=tf.losses.SparseCategoricalCrossentropy(from_logits=True)

In [None]:
# Compute the loss of the untrained model on the example batch
example_batch_loss = cal_losses(target_example_batch, example_batch_predictions)
mean_loss = example_batch_loss.numpy().mean()
print("Prediction shape: ", example_batch_predictions.shape, " -> (batch_size, sequence_length, vocab_size)")
print("Mean loss:        ", mean_loss)

Compiling the model

In [None]:
# Configure training with the Adam optimizer and the loss defined earlier
model.compile(optimizer='adam', loss=cal_losses)

Training of the model

In [None]:
# The checkpoint callback must exist before fit() is called. It saves the
# model weights under ./training_checkpoints, with the epoch number in the
# file name.
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt_{epoch}')
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

# Number of passes over the training dataset
EPOCHS = 20
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

# 8. Generate Texts

Each time the model is called, it is passed some text and an internal state.

The model returns a prediction for the next character and its new state. 

Pass the prediction and state back in to continue generating text.

In [None]:
class OneStep(tf.keras.Model):
    """Wrap a trained model to sample one next character per call."""

    def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
        """Store the model and lookup layers and build the sampling mask.

        Args:
            model: model called with ``inputs``, ``states`` and
                ``return_state=True``; it must return ``(logits, states)``.
            chars_from_ids: layer that converts ids to characters.
            ids_from_chars: layer that converts characters to ids; it must
                provide ``get_vocabulary()``.
            temperature: the logits are divided by this value before
                sampling.
        """
        super().__init__()
        self.temperature = temperature
        self.model = model
        self.chars_from_ids = chars_from_ids
        self.ids_from_chars = ids_from_chars

        # Ids of the entries that must never be generated.
        skip_ids = self.ids_from_chars(['', '[UNK]'])[:, None]
        # Vector over the vocabulary holding -inf at every skipped id; all
        # other positions are filled with 0 by to_dense.
        sparse_mask = tf.SparseTensor(
            values=[-float('inf')]*len(skip_ids),
            indices=skip_ids,
            dense_shape=[len(ids_from_chars.get_vocabulary())])
        self.prediction_mask = tf.sparse.to_dense(sparse_mask)

    # Decorated with tf.function so that the method is traced and exported by
    # tf.saved_model.save. A plain Python method is not available on the
    # object returned by tf.saved_model.load.
    @tf.function
    def generate_one_step(self, inputs, states=None):
        """Sample the next character for every string in ``inputs``.

        Args:
            inputs: string tensor; each entry is split into UTF-8 characters.
            states: model state from the previous call, or None to start
                fresh.

        Returns:
            ``(predicted_chars, states)``: one sampled character per input
            string, and the state to pass to the next call.
        """
        input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
        input_ids = self.ids_from_chars(input_chars).to_tensor()
        predicted_logits, states = self.model(inputs=input_ids, states=states,
                                              return_state=True)
        # Only the prediction for the last position is used.
        predicted_logits = predicted_logits[:, -1, :]
        predicted_logits = predicted_logits/self.temperature
        # Adding the mask sets the logits of the skipped ids to -inf.
        predicted_logits = predicted_logits + self.prediction_mask
        predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
        predicted_ids = tf.squeeze(predicted_ids, axis=-1)
        predicted_chars = self.chars_from_ids(predicted_ids)
        return predicted_chars, states

In [None]:
# Wrap the trained model together with the two lookup layers
# (ids -> characters, characters -> ids)
one_step_model = OneStep(model, num_to_str, str_to_num)

The process is run in a loop to generate some text.

The model knows when to capitalize, make paragraphs and imitates a Shakespeare-like writing vocabulary. 

But it has not yet learned to form coherent sentences even after training.

In [None]:
# Generate 1000 characters starting from the prompt 'ROMEO:' and time the run
start = time.time()
states = None
next_char = tf.constant(['ROMEO:'])
result = [next_char]
# Each step feeds the previous output and state back into the generator
for n in range(1000):
    next_char, states = one_step_model.generate_one_step(next_char, states=states)
    result.append(next_char)
# Concatenate the prompt and all generated characters
result = tf.strings.join(result)
end = time.time()
print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print(f"\nRun time: {end - start}")

Generating 5 similar outputs

In [None]:
# Batched generation: five copies of the same prompt are extended together
start = time.time()
states = None
next_char = tf.constant(['ROMEO:'] * 5)
result = [next_char]
for n in range(1000):
    # Feed the previous characters and state back in to get the next ones
    next_char, states = one_step_model.generate_one_step(next_char, states=states)
    result.append(next_char)
result = tf.strings.join(result)
end = time.time()
print(result, '\n\n' + '_'*80)
print(f"\nRun time: {end - start}")

# 9. Exporting the results

In [None]:
# Save the one-step generator to the 'one_step' directory, then load it back
tf.saved_model.save(one_step_model, 'one_step')
one_step_reloaded = tf.saved_model.load('one_step')

In [None]:
# Generate 100 characters with the reloaded object to check the export.
# NOTE(review): this relies on generate_one_step being available on the
# loaded object - verify that the method is exported by the save step.
states = None
next_char = tf.constant(['ROMEO:'])
result = [next_char]
for n in range(100):
    next_char, states = one_step_reloaded.generate_one_step(next_char, states=states)
    result.append(next_char)
print(tf.strings.join(result)[0].numpy().decode("utf-8"))