In [None]:
"""
Exercise 01: RNN Text Generation - Starter Code

Reflection Notes:
- model coherency is not great, but gets worse after about 20 epochs
- model is overfit to the sample text, so it will not generalize well
- adjusting temperature does not seem to have much effect on the quality of the generated text
- increasing sample text size did not improve the model's performance

Suggestions:
- use a much larger sample text
"""

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np

# ============================================================================
# SAMPLE TEXT (PROVIDED)
# ============================================================================

# SAMPLE_TEXT = """
# To be or not to be that is the question
# Whether tis nobler in the mind to suffer
# The slings and arrows of outrageous fortune
# Or to take arms against a sea of troubles
# And by opposing end them To die to sleep
# No more and by a sleep to say we end
# The heartache and the thousand natural shocks
# That flesh is heir to Tis a consummation
# Devoutly to be wished To die to sleep
# To sleep perchance to dream ay theres the rub
# For in that sleep of death what dreams may come
# """

SAMPLE_TEXT = """
To be or not to be that is the question
Whether tis nobler in the mind to suffer
The slings and arrows of outrageous fortune
Or to take arms against a sea of troubles
And by opposing end them To die to sleep
No more and by a sleep to say we end
The heartache and the thousand natural shocks
That flesh is heir to Tis a consummation
Devoutly to be wished To die to sleep
To sleep perchance to dream ay theres the rub
For in that sleep of death what dreams may come
When we have shuffled off this mortal coil
Must give us pause theres the respect
That makes calamity of so long life
For who would bear the whips and scorns of time
The oppressors wrong the proud mans contumely
The pangs of despised love the laws delay
The insolence of office and the spurns
That patient merit of the unworthy takes
When he himself might his quietus make
With a bare bodkin who would fardels bear
To grunt and sweat under a weary life
But that the dread of something after death
The undiscovered country from whose bourn
No traveller returns puzzles the will
And makes us rather bear those ills we have
Than fly to others that we know not of
Thus conscience does make cowards of us all
And thus the native hue of resolution
Is sicklied oer with the pale cast of thought
And enterprises of great pith and moment
With this regard their currents turn awry
And lose the name of action
""" * 3


In [38]:

# ============================================================================
# TASK 1.1: Character-Level Preprocessing
# ============================================================================

class CharacterTokenizer:
    """
    Character-level tokenizer.
    
    Unlike word tokenizers, this maps individual characters to indices.
    Vocab is small (26 letters + space + punctuation = ~40 chars).
    """
    
    def __init__(self):
        self.char_to_idx = {}
        self.idx_to_char = {}
        self.vocab_size = 0
    
    def fit(self, text):
        """
        Build character vocabulary.
        
        STEPS:
        1. Get unique characters from text.lower()
        2. Sort them for reproducibility
        3. Create char_to_idx: {char: index}
        4. Create idx_to_char: {index: char}
        """
        chars = sorted(list(set(text.lower())))
        self.char_to_idx = {c: i for i, c in enumerate(chars)}
        self.idx_to_char = {i: c for i, c in enumerate(chars)}
        self.vocab_size = len(chars)
    
    def encode(self, text):
        """Convert text to list of indices"""
        return [self.char_to_idx.get(c, 0) for c in text.lower()]
    
    def decode(self, indices):
        """Convert indices back to text"""
        return ''.join([self.idx_to_char.get(i, '?') for i in indices])


def create_sequences(text, tokenizer, seq_length=40):
    """
    Create training sequences for next-character prediction.
    
    CONCEPT: Slide a window over the text
    - Input: characters [i : i+seq_length]
    - Target: character at position i+seq_length
    
    EXAMPLE with seq_length=5:
    Text: "hello world"
    Sequence 1: Input="hello", Target=" "
    Sequence 2: Input="ello ", Target="w"
    ...
    
    Returns:
        X: array of shape (num_sequences, seq_length)
        y: array of shape (num_sequences,)
    """
    encoded = tokenizer.encode(text)
    sequences = []
    targets = []
    
    for i in range(len(encoded) - seq_length):
        seq = encoded[i:i + seq_length]
        target = encoded[i + seq_length]
        sequences.append(seq)
        targets.append(target)
    
    return np.array(sequences), np.array(targets)

In [39]:

# ============================================================================
# TASK 1.2: Build RNN Generator
# ============================================================================

def build_text_generator(vocab_size, embedding_dim=64, rnn_units=128, seq_length=40):
    """
    Build character-level text generation model.
    
    ARCHITECTURE:
    1. Embedding(vocab_size, embedding_dim) - learn char representations
    2. LSTM(rnn_units, return_sequences=False) - process sequence
    3. Dense(vocab_size, softmax) - predict next character
    
    WHY LSTM OVER SIMPLERNN:
    - Text generation needs long-term memory (sentence structure)
    - LSTM handles this better than SimpleRNN
    
    SEE: demo_01_simple_rnn.py for RNN layer usage
    """
    model = keras.Sequential([
        layers.Embedding(vocab_size, embedding_dim, input_length=seq_length),
        layers.LSTM(rnn_units, return_sequences=False), 
        layers.Dense(vocab_size, activation='softmax')
    ])
    
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    
    return model

In [40]:

# ============================================================================
# TASK 1.3: Training
# ============================================================================

def train_generator():
    """
    Train the text generator.
    
    STEPS:
    1. Create tokenizer, fit on SAMPLE_TEXT
    2. Create sequences with seq_length=40
    3. Build model
    4. Train for 50 epochs
    
    EXPECTED: Loss should decrease from ~3.5 to ~1.5
    """
    print("Training Text Generator")
    
    tokenizer = CharacterTokenizer()
    tokenizer.fit(SAMPLE_TEXT)
    print(f"Vocabulary size: {tokenizer.vocab_size}")
    
    seq_length = 20
    X, y = create_sequences(SAMPLE_TEXT, tokenizer, seq_length=seq_length)
    print(f"Total sequences: {len(X)}")
    print(f"X shape: {X.shape}, y shape: {y.shape}")
    
    model = build_text_generator(tokenizer.vocab_size, seq_length=seq_length)
    model.summary()
    
    history = model.fit(
        X, y,
        epochs=20,
        batch_size=25,
        validation_split=0.1,
        verbose=1
    )
    
    return model, tokenizer, history


In [41]:

# ============================================================================
# TASK 1.4: Text Generation
# ============================================================================

def sample_with_temperature(predictions, temperature=1.0):
    """
    Sample from prediction distribution with temperature.
    
    Temperature controls randomness:
    - temperature = 1.0: Sample according to model's probabilities
    - temperature < 1.0: More deterministic (picks high-probability characters)
    - temperature > 1.0: More random (flattens probability distribution)
    """
    predictions = np.asarray(predictions).astype('float64')
    
    predictions = np.log(predictions + 1e-10) / temperature
    exp_preds = np.exp(predictions)
    predictions = exp_preds / np.sum(exp_preds)
    
    probas = np.random.multinomial(1, predictions, 1)
    return np.argmax(probas)


def generate_text(model, tokenizer, seed_text, length=100, temperature=1.0):
    """
    Generate text starting from seed_text.
    
    ALGORITHM (autoregressive generation):
    1. Encode seed_text to indices
    2. For each new character:
       a. Take last seq_length characters as input
       b. Predict probability distribution over next char
       c. Sample from distribution (using temperature)
       d. Append sampled char to sequence
    3. Decode and return generated text
    
    TEMPERATURE:
    - 1.0 = sample from model's distribution
    - <1.0 = more confident (less random)
    - >1.0 = more creative (more random)
    
    SAMPLING WITH TEMPERATURE:
    logits = model.predict(input)  # raw scores
    logits = logits / temperature
    probs = softmax(logits)
    next_char = np.random.choice(vocab_size, p=probs)
    """
    seq_length = model.input_shape[1]
    generated = seed_text.lower()
    
    if len(seed_text) < seq_length:
        seed_text = ' ' * (seq_length - len(seed_text)) + seed_text.lower()
    else:
        seed_text = seed_text[-seq_length:].lower()
    
    for _ in range(length):
        x = np.array([tokenizer.encode(seed_text[-seq_length:])])
        
        predictions = model.predict(x, verbose=0)[0]
        next_idx = sample_with_temperature(predictions, temperature)
        next_char = tokenizer.idx_to_char[next_idx]
        
        generated += next_char
        seed_text = seed_text[1:] + next_char
    
    return generated


def experiment_temperature():
    """
    Generate text at different temperatures: 0.5, 1.0, 1.5
    
    Observe:
    - Low temp: Repetitive but coherent
    - High temp: Creative but may be nonsense
    """
    model, tokenizer, _ = train_generator()
    temperatures = [0.5, 1.5]
    seed = "to be or not"
    
    for temp in temperatures:
        print("=" * 60)
        print(f"Temperature = {temp}")
        print("=" * 60)
        generated = generate_text(model, tokenizer, seed, length=200, temperature=temp)
        print(generated)
        print()


In [42]:
# ============================================================================
# MAIN
# ============================================================================

if __name__ == "__main__":
    print("=" * 60)
    print("Exercise 01: RNN Text Generation")
    print("=" * 60)
    
    # Uncomment as you complete:
    train_generator()
    experiment_temperature()

Exercise 01: RNN Text Generation
Training Text Generator
Vocabulary size: 26
Total sequences: 4069
X shape: (4069, 20), y shape: (4069,)




Epoch 1/20
[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.1682 - loss: 2.9008 - val_accuracy: 0.1843 - val_loss: 2.8413
Epoch 2/20
[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.2925 - loss: 2.5448 - val_accuracy: 0.3587 - val_loss: 2.3749
Epoch 3/20
[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.3667 - loss: 2.2372 - val_accuracy: 0.3710 - val_loss: 2.1938
Epoch 4/20
[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.3730 - loss: 2.0824 - val_accuracy: 0.3759 - val_loss: 2.0949
Epoch 5/20
[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.4115 - loss: 1.9345 - val_accuracy: 0.4005 - val_loss: 1.9710
Epoch 6/20
[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.4514 - loss: 1.7827 - val_accuracy: 0.4300 - val_loss: 1.8641
Epoch 7/20
[1m147/147[0m 

Epoch 1/20
[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.1753 - loss: 2.8995 - val_accuracy: 0.2015 - val_loss: 2.7555
Epoch 2/20
[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.3020 - loss: 2.5500 - val_accuracy: 0.3366 - val_loss: 2.3815
Epoch 3/20
[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.3583 - loss: 2.2753 - val_accuracy: 0.3563 - val_loss: 2.2361
Epoch 4/20
[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.3760 - loss: 2.1212 - val_accuracy: 0.3735 - val_loss: 2.1296
Epoch 5/20
[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.4107 - loss: 1.9776 - val_accuracy: 0.3980 - val_loss: 1.9967
Epoch 6/20
[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.4500 - loss: 1.8185 - val_accuracy: 0.4324 - val_loss: 1.8560
Epoch 7/20
[1m147/147[0m 