In [34]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Embedding
import os
import time

In [35]:
# Toy corpus: a single example sentence.
data = "This is a simple example for text generation using RNN in Python. "

# Fit a word-level tokenizer on the corpus, then display the word -> id
# mapping it learned (the bare last expression is the cell's output).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=[data])
tokenizer.word_index

{'this': 1,
 'is': 2,
 'a': 3,
 'simple': 4,
 'example': 5,
 'for': 6,
 'text': 7,
 'generation': 8,
 'using': 9,
 'rnn': 10,
 'in': 11,
 'python': 12}

In [36]:
# Word ids in `word_index` start at 1, so a table indexed by id needs one
# more slot than there are distinct words.
total_words = 1 + len(tokenizer.word_index)
total_words

13

In [78]:
# --- 1. Download and Prepare the Dataset ---
# get_file returns the local path of the downloaded file.
path_to_file = tf.keras.utils.get_file(
    'shakespeare.txt',
    'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt'
)
# Read through a context manager so the file handle is always closed
# (the bare open(...).read() form left it to the garbage collector).
# Binary mode + explicit decode keeps the bytes free of newline translation.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')

# --- 2. Process the Text ---
# One vocabulary entry per distinct character, in sorted order.
vocab = sorted(set(text))
# Reserve id 0 for an explicit unknown-character token.
vocab = ['[UNK]'] + vocab  # Now vocab size is 66
print(f'{len(vocab)} unique characters')

# Forward (character -> id) and inverse (id -> character) lookup tables.
chars_to_ids = {u: i for i, u in enumerate(vocab)}
ids_to_chars = np.array(vocab)

# Encode the whole corpus as integer ids and expose it as a dataset that
# yields one id per element.
all_ids = np.array([chars_to_ids[c] for c in text])
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)


66 unique characters


In [79]:
# Total number of characters in the decoded corpus.
print(len(text))

1115394


In [80]:
# Sanity check: the first 50 encoded ids, the full character -> id table,
# and the first five elements yielded by the id dataset.
print(all_ids[:50])
print(chars_to_ids)
# Named `char_id` rather than `id`: a top-level loop variable stays in the
# kernel namespace, and `id` would shadow the built-in for all later cells.
for char_id in ids_dataset.take(5):
    print(char_id.numpy())

[19 48 57 58 59  2 16 48 59 48 65 44 53 11  1 15 44 45 54 57 44  2 62 44
  2 55 57 54 42 44 44 43  2 40 53 64  2 45 60 57 59 47 44 57  7  2 47 44
 40 57]
{'[UNK]': 0, '\n': 1, ' ': 2, '!': 3, '$': 4, '&': 5, "'": 6, ',': 7, '-': 8, '.': 9, '3': 10, ':': 11, ';': 12, '?': 13, 'A': 14, 'B': 15, 'C': 16, 'D': 17, 'E': 18, 'F': 19, 'G': 20, 'H': 21, 'I': 22, 'J': 23, 'K': 24, 'L': 25, 'M': 26, 'N': 27, 'O': 28, 'P': 29, 'Q': 30, 'R': 31, 'S': 32, 'T': 33, 'U': 34, 'V': 35, 'W': 36, 'X': 37, 'Y': 38, 'Z': 39, 'a': 40, 'b': 41, 'c': 42, 'd': 43, 'e': 44, 'f': 45, 'g': 46, 'h': 47, 'i': 48, 'j': 49, 'k': 50, 'l': 51, 'm': 52, 'n': 53, 'o': 54, 'p': 55, 'q': 56, 'r': 57, 's': 58, 't': 59, 'u': 60, 'v': 61, 'w': 62, 'x': 63, 'y': 64, 'z': 65}
19
48
57
58
59


In [81]:
# --- 3. Create Training Examples and Batches ---
# Group the id stream into fixed-size chunks of seq_length + 1 ids; the
# extra id lets each chunk be split into an input and a target shifted by
# one position. A trailing chunk that is too short is dropped.
seq_length = 100
sequences = ids_dataset.batch(batch_size=seq_length + 1, drop_remainder=True)
print(sequences)

def split_input_target(chunk):
    """Split a sequence into an (input, target) pair offset by one position.

    Args:
        chunk: Any sliceable sequence.

    Returns:
        A tuple ``(chunk[:-1], chunk[1:])``: everything except the last
        element, and everything except the first element.
    """
    return chunk[:-1], chunk[1:]


# Turn every chunk into an (input, target) pair via split_input_target.
dataset = sequences.map(split_input_target)


<_BatchDataset element_spec=TensorSpec(shape=(101,), dtype=tf.int32, name=None)>


In [82]:
# --- 4. Configure the Training Batches ---
# Number of (input, target) sequences per training batch.
BATCH_SIZE = 64
# Size of the shuffle buffer, counted in sequences.
BUFFER_SIZE = 10000

# Build the pipeline from `sequences` rather than re-wrapping `dataset`:
# `dataset = dataset.batch(...)` would batch an already-batched dataset if
# this cell were executed a second time. Starting from `sequences` keeps the
# cell idempotent while producing the same pipeline.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    # tf.data.AUTOTUNE replaces the deprecated tf.data.experimental.AUTOTUNE.
    .prefetch(tf.data.AUTOTUNE))

# --- 5. Build The RNN Model ---
# One output logit per vocabulary entry.
vocab_size = len(vocab)
# Width of the character embedding vectors.
embedding_dim = 256
# Number of units in the recurrent layer.
rnn_units = 256


class MyModel(tf.keras.Model):
    """Sequence model made of an Embedding, a GRU and a Dense output layer.

    Args:
        vocab_size: Size of the embedding table and of the Dense output.
        embedding_dim: Dimensionality of the embedding vectors.
        rnn_units: Number of units in the GRU layer.
    """

    def __init__(self, vocab_size, embedding_dim, rnn_units):
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        # The GRU returns its output at every step plus its final state, so
        # the state can be handed back in on a later call.
        self.gru = tf.keras.layers.GRU(
            rnn_units,
            return_sequences=True,
            return_state=True,
        )
        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs, states=None, return_state=False, training=False):
        """Run embedding -> GRU -> Dense over ``inputs``.

        Args:
            inputs: Input passed to the embedding layer.
            states: Optional initial state forwarded to the GRU.
            return_state: When true, also return the GRU's final state.
            training: Forwarded to every layer.

        Returns:
            The Dense layer output, or ``(output, states)`` when
            ``return_state`` is true.
        """
        embedded = self.embedding(inputs, training=training)
        rnn_out, states = self.gru(embedded, initial_state=states, training=training)
        logits = self.dense(rnn_out, training=training)
        if return_state:
            return logits, states
        return logits


# Instantiate the network from the hyperparameters defined earlier.
model = MyModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
)
# from_logits=True: the loss is given unnormalized scores, not probabilities.
model.compile(
    optimizer='adam',
    loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
)


In [91]:
# --- 6. Train the Model ---
# Number of passes over the batched dataset.
EPOCHS = 5
history = model.fit(x=dataset, epochs=EPOCHS)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [84]:
# --- 7. Generate Text ---
# Layers translating between character strings and integer ids.
# NOTE(review): both names are assigned again by the StringLookup calls that
# follow (with explicit oov_token / num_oov_indices settings), so the two
# instances created here end up replaced — confirm whether these lines are
# still needed.
ids_from_chars = tf.keras.layers.StringLookup(vocabulary=list(vocab), mask_token=None)
chars_from_ids = tf.keras.layers.StringLookup(vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None)

In [85]:
# Forward lookup (character -> id). `vocab` already starts with an explicit
# '[UNK]' entry, so no additional out-of-vocabulary slot is requested here.
# NOTE(review): with num_oov_indices=0, characters missing from `vocab`
# presumably cannot be encoded — confirm against the StringLookup docs.
ids_from_chars = tf.keras.layers.StringLookup(
    vocabulary=list(vocab),
    mask_token=None,
    oov_token=None,
    num_oov_indices=0  # No extra OOV slot on top of the '[UNK]' already in vocab
)

# Inverse lookup (id -> character), built from the forward layer's vocabulary
# so both directions share one id assignment.
chars_from_ids = tf.keras.layers.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(),
    invert=True,
    mask_token=None,
    oov_token="[UNK]"
)


In [86]:
class OneStep(tf.keras.Model):
    """Wraps a trained model to sample text one character at a time.

    Args:
        model: Model invoked as ``model(inputs=ids, states=..., return_state=True)``
            and returning ``(logits, states)``.
        chars_from_ids: Layer mapping integer ids back to characters.
        ids_from_chars: Layer mapping characters to integer ids; it must
            expose ``get_vocabulary()`` and be able to look up ``'[UNK]'``.
        temperature: Positive divisor applied to the logits before sampling.

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """

    def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
        # The logits are divided by the temperature: zero would produce
        # inf/NaN logits and a negative value would invert their ranking,
        # so reject both up front instead of sampling garbage later.
        if temperature <= 0:
            raise ValueError(
                f"temperature must be strictly positive, got {temperature!r}")

        super().__init__()
        self.temperature = temperature
        self.model = model
        self.chars_from_ids = chars_from_ids
        self.ids_from_chars = ids_from_chars

        # Build an additive mask over the vocabulary: -inf at the id of
        # '[UNK]' (so it can never be sampled) and 0 everywhere else.
        vocab_size = len(ids_from_chars.get_vocabulary())
        skip_ids = self.ids_from_chars(['[UNK]'])[:, None]
        sparse_mask = tf.SparseTensor(
            values=[-float('inf')] * len(skip_ids),
            indices=skip_ids,
            dense_shape=[vocab_size])
        self.prediction_mask = tf.sparse.to_dense(sparse_mask)

    @tf.function
    def generate_one_step(self, inputs, states=None):
        """Sample the next character for each input string.

        Args:
            inputs: String tensor; each entry is split into its UTF-8
                characters and encoded with ``ids_from_chars``.
            states: Optional model state returned by a previous call.

        Returns:
            A tuple ``(predicted_chars, states)``: the sampled next
            character for each input and the updated model state.
        """
        # Strings -> characters -> ids (ragged result converted to dense).
        input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
        input_ids = self.ids_from_chars(input_chars).to_tensor()

        predicted_logits, states = self.model(inputs=input_ids, states=states, return_state=True)
        # Only the prediction at the last position is needed.
        predicted_logits = predicted_logits[:, -1, :]
        predicted_logits = predicted_logits / self.temperature

        # Match the mask's dtype to the logits and give it a leading axis of
        # size 1 so it broadcasts across the batch.
        mask = tf.cast(self.prediction_mask, dtype=predicted_logits.dtype)
        mask = tf.reshape(mask, [1, -1])  # Reshape to [1, vocab_size]

        # Apply the mask, then draw one id per row from the resulting logits.
        predicted_logits = predicted_logits + mask
        predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
        predicted_ids = tf.squeeze(predicted_ids, axis=-1)

        predicted_chars = self.chars_from_ids(predicted_ids)
        return predicted_chars, states


In [92]:
# Create and test the generator
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

# Generate text
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() follows the wall clock and can jump if the system
# time is adjusted mid-run.
start_time = time.perf_counter()
states = None
# Seed string; `result` collects the seed and every generated character.
next_char = tf.constant(['ROMEO:'])
result = [next_char]

# Feed each sampled character (and the carried-over state) back in to
# produce the following one.
for n in range(1000):
    next_char, states = one_step_model.generate_one_step(next_char, states=states)
    result.append(next_char)

# Concatenate the collected pieces into one string per batch entry.
result = tf.strings.join(result)
end_time = time.perf_counter()

print("\n--- Generated Text ---")
print(result[0].numpy().decode('utf-8'))
print(f'\nRun time: {end_time - start_time:.2f}s')


--- Generated Text ---
ROMEO:
Pirtation I weet behere'd:
sorrow suthing
'Dilign of Luck?
Doo clood and be pine, in here--

OF YORK:

KING RICHARD II:
I would thy slight, glade you the tortion asse false,
If I dis most holt to him:
A courtion'd itswear most somell her
chance.

SEBAWISTEBOF:
Could they wrongs; when: when fair pleasurance, which of the scare come!

JULIET:
O, cull'd, gentlest blood honot is gively kinger
to you to pakuing uple aboubling
One more home! Solious lust, for thou shows, and, his woech:
Who he te have unkid.
Why, I see the cheeth once which love,
The corse, the occomant,
I'll yourse I'll not commandstick'd,
Of he innoch accouseding say moreques me
Do never effuries he, heart Master?

ARIEL:
No, new, by my nevery,--
Mess'd is the Causlan, the powle to-knep,
Antis'd anct a planted new the worther; toward
speeds in thy spirise of for other stand, the old.

ISABELIA:
O' the backly boyes asmand tempence
Edward so day.

KICHARD II:
He tife a'll smored that thengs, my