# Week 6: LSTM Text Generator

## Objective
Build a character-level LSTM text generator using Shakespeare's works.

**Target**:
- Generate coherent text after training
- Understand the LSTM gates (forget, input, output) and the cell state
- Implement sampling strategies (greedy, temperature, top-k)
- Visualize hidden states and gate activations

---

## Why LSTMs?

**Problem with Vanilla RNNs**: Vanishing gradients over long sequences

**LSTM Solution**: Gated memory cells that learn what to remember/forget

---

In [None]:
# Imports
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import sys
import os

sys.path.append(os.path.abspath('../../'))

from src.ml.deep_learning import LSTM, Dense, Activation, NeuralNetwork

sns.set_style('whitegrid')
print("✓ Imports successful")

## Step 1: Load Shakespeare Text Corpus

In [None]:
# Download Shakespeare corpus
url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
# requests applies no timeout by default; set one so a stalled connection
# cannot hang the cell forever.
response = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx reply instead of tokenizing an error page as the corpus.
response.raise_for_status()
text = response.text

print(f"Corpus length: {len(text):,} characters")
print(f"\nFirst 500 characters:\n{text[:500]}")

# Get unique characters; sorting makes the character -> index assignment
# reproducible from run to run.
chars = sorted(set(text))
vocab_size = len(chars)
print(f"\nVocabulary size: {vocab_size} unique characters")
print(f"Characters: {''.join(chars[:50])}...")

## Step 2: Character Encoding

In [None]:
# Build the two lookup tables from the sorted vocabulary: position -> character
# first, then its inverse character -> position.
idx_to_char = dict(enumerate(chars))
char_to_idx = {ch: i for i, ch in idx_to_char.items()}

def encode(text):
    """Convert text to an integer index sequence.

    Args:
        text: String whose characters are all keys of ``char_to_idx``.

    Returns:
        1-D integer ``np.ndarray`` holding one vocabulary index per character.
        An empty string yields an empty integer array.

    Raises:
        KeyError: If a character is not present in ``char_to_idx``.
    """
    # Pin the dtype: NumPy infers float64 for an empty list, and a float array
    # cannot be used as an index array downstream.
    return np.array([char_to_idx[ch] for ch in text], dtype=int)

def decode(indices):
    """Map a sequence of vocabulary indices back to the string they spell."""
    pieces = []
    for position in indices:
        pieces.append(idx_to_char[position])
    return ''.join(pieces)

# Test encoding/decoding
# Round-trip a short string through encode() and decode(). Every character of
# the sample must be a key of char_to_idx, otherwise encode() raises KeyError
# before the assertion is reached.
sample = "Hello, World!"
encoded = encode(sample)
decoded = decode(encoded)
print(f"Original: {sample}")
print(f"Encoded: {encoded}")
print(f"Decoded: {decoded}")
# The mappings are inverses of each other, so the decoded text must equal the input.
assert sample == decoded, "Encoding/decoding failed!"
print("✓ Encoding/decoding works correctly")

## Step 3: Create Training Sequences

In [None]:
# Hyperparameters
SEQ_LENGTH = 100  # Number of characters in each input window
STEP = 3          # Offset between the starts of consecutive windows

# Slide a SEQ_LENGTH-wide window over the corpus; every window is paired with
# the single character that immediately follows it (the prediction target).
window_starts = range(0, len(text) - SEQ_LENGTH, STEP)
sequences = [text[start:start + SEQ_LENGTH] for start in window_starts]
next_chars = [text[start + SEQ_LENGTH] for start in window_starts]

print(f"Number of sequences: {len(sequences):,}")
print(f"\nExample sequence:")
print(f"Input:  '{sequences[0]}'")
print(f"Target: '{next_chars[0]}'")

## Step 4: One-Hot Encoding

In [None]:
def one_hot_encode(sequences, vocab_size):
    """Convert character sequences to one-hot encoded arrays.

    Args:
        sequences: Sequence of strings. The first string fixes the time
            dimension; shorter strings leave their trailing steps all-zero.
        vocab_size: Length of the one-hot axis.

    Returns:
        float32 array of shape ``(len(sequences), len(sequences[0]), vocab_size)``,
        or ``(0, 0, vocab_size)`` when ``sequences`` is empty.

    Raises:
        KeyError: If a character is not present in ``char_to_idx``.
        IndexError: If a sequence is longer than the first one, or a mapped
            index is not smaller than ``vocab_size``.
    """
    n_samples = len(sequences)
    # Guard the empty batch: sequences[0] would raise IndexError otherwise.
    seq_len = len(sequences[0]) if n_samples else 0

    X = np.zeros((n_samples, seq_len, vocab_size), dtype=np.float32)

    for i, seq in enumerate(sequences):
        # Map the whole sequence to indices first, then set all of its ones
        # with one indexed assignment instead of a Python-level write per char.
        indices = np.array([char_to_idx[ch] for ch in seq], dtype=np.intp)
        X[i, np.arange(indices.size), indices] = 1

    return X

# Encode inputs
# NOTE(review): X is a dense float32 array of shape
# (len(sequences), SEQ_LENGTH, vocab_size). For a corpus of around a million
# characters this is presumably several GB — check the "Memory" line printed
# below before training, and raise STEP or subsample if it does not fit.
print("Encoding input sequences...")
X = one_hot_encode(sequences, vocab_size)

# Encode targets
# Targets are kept as integer class indices (one per sequence), not one-hot.
y = np.array([char_to_idx[ch] for ch in next_chars])

print(f"X shape: {X.shape}  (samples, seq_length, vocab_size)")
print(f"y shape: {y.shape}  (samples,)")
# nbytes is reported in MiB (bytes / 1024**2).
print(f"Memory: {X.nbytes / 1024**2:.1f} MB")

## Step 5: Build LSTM Model

### LSTM Architecture

```
Input (seq_len, vocab_size)
    ↓
LSTM(128 units)
    ↓
Dense(vocab_size)
    ↓
Softmax
    ↓
Next character probabilities
```

In [None]:
# Build model
hidden_size = 128  # LSTM hidden width; also passed as the first Dense argument

model = NeuralNetwork()
# NOTE(review): return_sequences=False presumably makes the layer emit only the
# final hidden state, matching the single next-character target per sequence —
# confirm against the LSTM implementation in src.ml.deep_learning.
model.add(LSTM(input_size=vocab_size, hidden_size=hidden_size, return_sequences=False))
# Project the hidden state onto one score per vocabulary entry.
model.add(Dense(hidden_size, vocab_size))
model.add(Activation('softmax'))

# Compile
# y holds integer class indices.
# NOTE(review): confirm CrossEntropyLoss accepts index targets rather than one-hot vectors.
from src.ml.deep_learning import CrossEntropyLoss
model.compile(loss=CrossEntropyLoss(), learning_rate=0.001)

model.summary()
print("\n✓ Model built successfully")

## Step 6: Training

In [None]:
# Train model
EPOCHS = 20
BATCH_SIZE = 128

print("Starting training...")
print("="*70)

# The returned history is indexed later as history['loss'] and
# history['val_loss'], so fit() is expected to return a mapping of per-epoch
# values.
# NOTE(review): consecutive windows overlap heavily (STEP < SEQ_LENGTH), so
# unless fit() holds out a contiguous tail the validation windows share text
# with training windows — confirm how validation_split selects samples.
history = model.fit(
    X, y,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=0.1,
    verbose=True
)

print("\n✓ Training complete!")

## Step 7: Text Generation

### Sampling Strategies

In [None]:
def sample_greedy(preds):
    """Return the index of the largest entry of ``preds`` (first one on ties)."""
    scores = np.asarray(preds)
    return scores.argmax()

def sample_temperature(preds, temperature=1.0):
    """
    Sample with temperature control.

    temperature > 1: More random (exploration)
    temperature < 1: More deterministic (exploitation)
    temperature = 1: Standard sampling

    Args:
        preds: 1-D array-like of non-negative probabilities.
        temperature: Strictly positive scaling factor applied to the
            log-probabilities.

    Returns:
        Index drawn at random from the temperature-rescaled distribution.

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """
    # `not (t > 0)` also rejects NaN, which would otherwise poison the output.
    if not temperature > 0:
        raise ValueError(f"temperature must be > 0, got {temperature!r}")

    # Work in float64 so the renormalization does not accumulate float32
    # rounding error; asarray also lets plain lists through.
    probs = np.asarray(preds, dtype=np.float64)
    logits = np.log(probs + 1e-10) / temperature
    # Shift by the maximum before exponentiating so at least one term is
    # exp(0) == 1. Without the shift a small temperature underflows every term
    # to 0 and the normalization below divides by zero. The shift does not
    # change the resulting distribution.
    logits -= np.max(logits)
    weights = np.exp(logits)
    return np.random.choice(len(weights), p=weights / np.sum(weights))

def sample_top_k(preds, k=5):
    """Sample from top-k most likely characters.

    Args:
        preds: 1-D array-like of non-negative probabilities.
        k: Number of highest-probability candidates to keep (at least 1).
            Values larger than ``len(preds)`` keep every entry.

    Returns:
        Index into ``preds`` drawn from the renormalized top-k distribution.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    # k == 0 must be rejected explicitly: the slice [-0:] is the *whole* array,
    # so it would silently disable the top-k filter.
    if k < 1:
        raise ValueError(f"k must be >= 1, got {k!r}")

    probs = np.asarray(preds, dtype=np.float64)
    # argsort is ascending, so the last k positions hold the k largest entries;
    # the slice caps k at len(probs) on its own.
    top_k_indices = np.argsort(probs)[-k:]
    top_k_probs = probs[top_k_indices]
    return np.random.choice(top_k_indices, p=top_k_probs / np.sum(top_k_probs))

def generate_text(model, seed_text, length=500, strategy='temperature', temperature=0.8, top_k=5):
    """Generate text using trained model.

    Args:
        model: Object exposing ``predict_proba(x)``; the first row of its
            result is used as the next-character distribution.
        seed_text: Starting text; its last ``SEQ_LENGTH`` characters form the
            first model input.
        length: Number of characters to generate.
        strategy: One of ``'greedy'``, ``'temperature'`` or ``'top_k'``.
        temperature: Passed to ``sample_temperature`` for that strategy.
        top_k: Passed to ``sample_top_k`` as ``k`` for that strategy.

    Returns:
        ``seed_text`` followed by the ``length`` generated characters.

    Raises:
        ValueError: If ``strategy`` is not one of the supported names.
    """
    # Validate up front: an unknown name would otherwise leave next_idx unbound
    # on the first step (or silently reuse a stale value).
    if strategy not in ('greedy', 'temperature', 'top_k'):
        raise ValueError(
            f"Unknown strategy {strategy!r}; expected 'greedy', 'temperature' or 'top_k'"
        )

    new_chars = []
    context = seed_text

    for _ in range(length):
        # Only the most recent SEQ_LENGTH characters are fed to the model.
        context = context[-SEQ_LENGTH:]
        x = one_hot_encode([context], vocab_size)

        # Predict next character
        preds = model.predict_proba(x)[0]

        # Sample based on strategy
        if strategy == 'greedy':
            next_idx = sample_greedy(preds)
        elif strategy == 'temperature':
            next_idx = sample_temperature(preds, temperature)
        else:
            next_idx = sample_top_k(preds, k=top_k)

        next_char = idx_to_char[next_idx]
        new_chars.append(next_char)
        context += next_char

    # Join once at the end instead of growing the full output string each step.
    return seed_text + ''.join(new_chars)

In [None]:
# Generate text with different strategies
# The seed is the first SEQ_LENGTH characters of the corpus, so every result
# below starts with the same SEQ_LENGTH-character prefix; slicing with
# [SEQ_LENGTH:] prints only the newly generated part.
seed = text[:SEQ_LENGTH]

print("="*70)
print("TEXT GENERATION RESULTS")
print("="*70)

print(f"\nSeed text:\n{seed}\n")

# Greedy decoding always takes the argmax.
print("\n--- Strategy: Greedy (deterministic) ---")
generated_greedy = generate_text(model, seed, length=300, strategy='greedy')
print(generated_greedy[SEQ_LENGTH:])

# The three temperature runs differ only in the temperature argument.
print("\n--- Strategy: Temperature=0.5 (conservative) ---")
generated_low_temp = generate_text(model, seed, length=300, strategy='temperature', temperature=0.5)
print(generated_low_temp[SEQ_LENGTH:])

print("\n--- Strategy: Temperature=1.0 (balanced) ---")
generated_mid_temp = generate_text(model, seed, length=300, strategy='temperature', temperature=1.0)
print(generated_mid_temp[SEQ_LENGTH:])

print("\n--- Strategy: Temperature=1.5 (creative) ---")
generated_high_temp = generate_text(model, seed, length=300, strategy='temperature', temperature=1.5)
print(generated_high_temp[SEQ_LENGTH:])

# Top-k restricts sampling to the 10 most likely characters at each step.
print("\n--- Strategy: Top-k (k=10) ---")
generated_topk = generate_text(model, seed, length=300, strategy='top_k', top_k=10)
print(generated_topk[SEQ_LENGTH:])

## Step 8: Visualize LSTM Internals

### Gate Activations

In [None]:
# Extract LSTM layer
# The LSTM was the first layer added to the model.
# NOTE(review): assumes NeuralNetwork exposes its layers as model.layers in
# insertion order — confirm.
lstm_layer = model.layers[0]

# Forward pass to get gates
sample_input = X[0:1]  # Single sequence (slice keeps the leading batch axis)

# Get gate values (modify LSTM to return gates)
# This assumes LSTM layer has been modified to store gate activations
# NOTE(review): forward_with_gates is not defined anywhere in this notebook; the
# code below expects it to return (hidden, gates) with gates[0..3] ordered like
# gate_names and each entry being a 2-D array — presumably (time, hidden_units),
# so that the transpose puts time on the x-axis. TODO confirm.
hidden, gates = lstm_layer.forward_with_gates(sample_input)

# Plot gate activations over time
fig, axes = plt.subplots(4, 1, figsize=(15, 12))

gate_names = ['Forget Gate', 'Input Gate', 'Output Gate', 'Cell State']
for i, (ax, name) in enumerate(zip(axes, gate_names)):
    ax.imshow(gates[i].T, cmap='viridis', aspect='auto')
    ax.set_title(f'{name} Activations', fontsize=14, fontweight='bold')
    ax.set_xlabel('Time Step', fontsize=12)
    ax.set_ylabel('Hidden Unit', fontsize=12)
    # ax.images[0] is the image just drawn by imshow on this axes.
    plt.colorbar(ax.images[0], ax=ax)

plt.tight_layout()
plt.show()

print("\n✓ Gate visualizations complete")

## Step 9: Analysis & Insights

### LSTM Gate Behavior

**Forget Gate** (f_t):
- Decides what to discard from cell state
- Dark regions (activation near 0) = forgetting information
- Light regions (activation near 1) = retaining information

**Input Gate** (i_t):
- Decides what new information to store
- Works with candidate cell state (c̃_t)

**Output Gate** (o_t):
- Decides what to output based on cell state
- Controls information flow to next layer

**Cell State** (c_t):
- Long-term memory
- Modified by forget and input gates

### Temperature Effects

- **Low (0.5)**: Conservative, coherent but repetitive
- **Medium (1.0)**: Balanced creativity and coherence
- **High (1.5)**: Creative but may lose coherence

---

## Step 10: Comparison with Vanilla RNN

### Why LSTM is Better

In [None]:
# Build vanilla RNN for comparison
from src.ml.deep_learning import SimpleRNN

# Reuse the LSTM run's hyperparameters (hidden_size, EPOCHS, BATCH_SIZE) rather
# than repeating the literals, so the two models stay comparable if those
# values are changed in the earlier cells.
rnn_model = NeuralNetwork()
# NOTE(review): the LSTM was built with return_sequences=False; confirm that
# SimpleRNN also emits only the last hidden state by default.
rnn_model.add(SimpleRNN(input_size=vocab_size, hidden_size=hidden_size))
rnn_model.add(Dense(hidden_size, vocab_size))
rnn_model.add(Activation('softmax'))
rnn_model.compile(loss=CrossEntropyLoss(), learning_rate=0.001)

print("Training vanilla RNN for comparison...")
rnn_history = rnn_model.fit(X, y, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_split=0.1, verbose=False)

# Compare learning curves
plt.figure(figsize=(14, 5))

# Left panel: training loss per epoch for both models.
plt.subplot(1, 2, 1)
plt.plot(history['loss'], label='LSTM Train', linewidth=2)
plt.plot(rnn_history['loss'], label='RNN Train', linewidth=2, linestyle='--')
plt.xlabel('Epoch', fontsize=12)
plt.ylabel('Loss', fontsize=12)
plt.title('Training Loss: LSTM vs RNN', fontsize=14, fontweight='bold')
plt.legend()
plt.grid(alpha=0.3)

# Right panel: validation loss per epoch for both models.
plt.subplot(1, 2, 2)
plt.plot(history['val_loss'], label='LSTM Val', linewidth=2)
plt.plot(rnn_history['val_loss'], label='RNN Val', linewidth=2, linestyle='--')
plt.xlabel('Epoch', fontsize=12)
plt.ylabel('Loss', fontsize=12)
plt.title('Validation Loss: LSTM vs RNN', fontsize=14, fontweight='bold')
plt.legend()
plt.grid(alpha=0.3)

plt.tight_layout()
plt.show()

print(f"\nFinal Training Loss:")
print(f"  LSTM: {history['loss'][-1]:.4f}")
print(f"  RNN:  {rnn_history['loss'][-1]:.4f}")
print(f"\nFinal Validation Loss:")
print(f"  LSTM: {history['val_loss'][-1]:.4f}")
print(f"  RNN:  {rnn_history['val_loss'][-1]:.4f}")

# Relative improvement is measured against the RNN's final validation loss.
if history['val_loss'][-1] < rnn_history['val_loss'][-1]:
    improvement = (rnn_history['val_loss'][-1] - history['val_loss'][-1]) / rnn_history['val_loss'][-1] * 100
    print(f"\n✅ LSTM outperforms RNN by {improvement:.1f}%!")
else:
    print("\n⚠️ RNN performed better (may need more training)")

## Conclusion

### Key Learnings

1. **LSTM gates mitigate vanishing gradients**
   - Forget gate controls what to discard
   - Input gate controls what to add
   - Output gate controls what to output

2. **Sampling strategies matter**
   - Greedy: Deterministic but boring
   - Temperature: Controls randomness
   - Top-k: Balances quality and diversity

3. **LSTMs outperform vanilla RNNs**
   - Better long-term dependencies
   - More stable training
   - Higher quality text generation

### Interview Points

- "I implemented a character-level LSTM for text generation from scratch"
- "Achieved coherent Shakespeare-style text with temperature sampling"
- "Visualized gate activations to understand LSTM internals"
- "Compared LSTM vs RNN, demonstrating LSTM's advantage in long sequences"

---

**✅ Week 6 Complete**: LSTM text generator with deep understanding of recurrent architectures!

---

*Next: Week 7 - Build BERT from Scratch*