# ✍️ GPT-2 for Text Generation

Welcome to **GPT-2 text generation**! In this notebook, we'll explore autoregressive language modeling and create an AI that can write stories, poems, code, and more with remarkable creativity and coherence.

## What you'll learn:
- Autoregressive language modeling
- GPT-2 architecture and decoder-only Transformers
- Text generation strategies and sampling
- Preparing a custom dataset for fine-tuning on creative writing

Let's generate some amazing text! 🚀

In [None]:
# Install required packages (run once)
# !pip install transformers torch datasets gradio

# Import libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn.functional as F
from transformers import (
    GPT2LMHeadModel, GPT2Tokenizer, GPT2Config,
    TextDataset, DataCollatorForLanguageModeling,
    Trainer, TrainingArguments, pipeline
)
import warnings
# Suppress every Python warning for the rest of the session
warnings.filterwarnings('ignore')

# Select the matplotlib style and fix the NumPy and PyTorch random seeds to 42
plt.style.use('seaborn-v0_8')
np.random.seed(42)
torch.manual_seed(42)

# Report the PyTorch build and pick the CUDA device when one is available,
# otherwise fall back to the CPU
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

In [None]:
# Load pre-trained GPT-2 model and tokenizer
# MODEL_NAME is the checkpoint identifier handed to both from_pretrained calls.
MODEL_NAME = "gpt2"  # Can also use "gpt2-medium", "gpt2-large", "gpt2-xl"

# Load tokenizer and model from the same checkpoint name
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)

# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token

# Summarize what was loaded
print(f"✅ Loaded {MODEL_NAME}")
print(f"Model parameters: {model.num_parameters():,}")
print(f"Vocabulary size: {tokenizer.vocab_size:,}")

# Test tokenization: split a sample sentence into tokens, encode it to ids,
# then decode the ids back to text
sample_text = "Once upon a time, in a land far away,"
tokens = tokenizer.tokenize(sample_text)
token_ids = tokenizer.encode(sample_text)

print(f"\n🔤 Tokenization Example:")
print(f"Original: {sample_text}")
print(f"Tokens: {tokens}")
print(f"Token IDs: {token_ids}")
print(f"Decoded: {tokenizer.decode(token_ids)}")

In [None]:
# Create text generation pipeline around the model and tokenizer loaded above.
# device=0 selects the first CUDA device when CUDA is available; -1 is passed
# otherwise.
generator = pipeline(
    'text-generation',
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1
)

# Test basic generation: one sampled continuation of a fixed prompt.
# pad_token_id is set to the tokenizer's end-of-sequence id, matching the
# pad token configured on the tokenizer.
prompt = "The future of artificial intelligence is"
generated = generator(
    prompt,
    max_length=100,
    num_return_sequences=1,
    temperature=0.8,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id
)

# The pipeline result is indexed as a list of dicts; print the first text
print("🤖 Basic Text Generation:")
print("=" * 50)
print(generated[0]['generated_text'])
print("=" * 50)

In [None]:
# Advanced text generation with different sampling strategies
def generate_text_advanced(prompt, strategy='top_p', **kwargs):
    """Generate text from ``prompt`` using a named decoding strategy.

    Args:
        prompt: Text passed to the module-level ``generator`` pipeline.
        strategy: One of ``'greedy'``, ``'top_k'``, ``'top_p'`` or
            ``'beam_search'``.
        **kwargs: Extra generation parameters forwarded to ``generator``.
            They override both the base defaults and the strategy's tunable
            presets (e.g. ``temperature``, ``top_p``, ``top_k``,
            ``num_beams``). Only the setting that defines a strategy
            (``do_sample=False`` for greedy and beam search) cannot be
            overridden.

    Returns:
        Whatever ``generator`` returns; callers in this notebook read
        ``result[0]['generated_text']``.

    Raises:
        ValueError: If ``strategy`` is not one of the supported names.
    """
    # Each strategy maps to (overridable presets, forced settings).
    strategy_settings = {
        'greedy': ({}, {'do_sample': False}),
        'top_k': ({'top_k': 50, 'temperature': 0.8}, {}),
        'top_p': ({'top_p': 0.9, 'temperature': 0.8}, {}),
        'beam_search': (
            {'num_beams': 5, 'early_stopping': True},
            {'do_sample': False},
        ),
    }

    # Fail loudly on a typo instead of silently falling back to plain sampling.
    if strategy not in strategy_settings:
        valid = ', '.join(strategy_settings)
        raise ValueError(
            f"Unknown strategy {strategy!r}; expected one of: {valid}"
        )

    presets, forced = strategy_settings[strategy]

    # Later entries win: base defaults < strategy presets < caller kwargs
    # < settings that define the strategy itself.
    params = {
        'max_length': 150,
        'num_return_sequences': 1,
        'pad_token_id': tokenizer.eos_token_id,
        'do_sample': True,
        **presets,
        **kwargs,
        **forced,
    }

    return generator(prompt, **params)

# Test different generation strategies on the same prompt
prompt = "In the year 2050, technology will"
strategies = ['greedy', 'top_k', 'top_p', 'beam_search']

print("🎯 Different Sampling Strategies:")
print("=" * 60)

# One generation per strategy, each capped at max_length=120; only the first
# returned text is printed
for strategy in strategies:
    generated = generate_text_advanced(prompt, strategy=strategy, max_length=120)
    print(f"\n{strategy.upper()} SAMPLING:")
    print("-" * 30)
    print(generated[0]['generated_text'])
    print("-" * 30)

In [None]:
# Creative text generation examples
# Maps a category label (printed as a heading) to the prompt to continue.
creative_prompts = {
    "Story": "Once upon a time, in a magical forest where trees could talk,",
    "Poem": "Roses are red, violets are blue,",
    "Science Fiction": "The spaceship landed on the mysterious planet, and the crew discovered",
    "Recipe": "To make the perfect chocolate cake, you will need",
    "News Article": "Breaking news: Scientists have just announced a groundbreaking discovery"
}

print("🎨 Creative Text Generation Examples:")
print("=" * 70)

# Generate one continuation per category with the 'top_p' strategy.
# The explicit temperature/top_p values equal the ones
# generate_text_advanced uses for 'top_p'.
# Note: the loop variable rebinds the module-level name `prompt`.
for category, prompt in creative_prompts.items():
    generated = generate_text_advanced(
        prompt, 
        strategy='top_p',
        max_length=200,
        temperature=0.8,
        top_p=0.9
    )
    
    print(f"\n📝 {category.upper()}:")
    print("-" * 40)
    print(generated[0]['generated_text'])
    print("-" * 40)

In [None]:
# Temperature analysis - creativity vs coherence
def analyze_temperature_effects(prompt, temperatures=(0.1, 0.5, 1.0, 1.5, 2.0)):
    """Print one sampled continuation of ``prompt`` for each temperature.

    Args:
        prompt: Text passed to the module-level ``generator`` pipeline.
        temperatures: Iterable of temperature values to try, in order.
            The default is an immutable tuple so it cannot be modified
            between calls.

    Returns:
        None. Results are printed, not returned.
    """
    print(f"🌡️ Temperature Analysis for prompt: '{prompt}'")
    print("=" * 80)

    for temp in temperatures:
        # Only the temperature varies; the other sampling settings are fixed.
        generated = generator(
            prompt,
            max_length=100,
            temperature=temp,
            do_sample=True,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id
        )

        print(f"\nTemperature {temp}:")
        print("-" * 20)
        print(generated[0]['generated_text'])
        print("-" * 20)

# Analyze temperature effects
analyze_temperature_effects("The secret to happiness is")

In [None]:
# Create custom training data for fine-tuning
def create_custom_dataset(repeats=20):
    """Build a small creative-writing corpus for fine-tuning.

    Args:
        repeats: How many times the five base stories are repeated, in
            order, to enlarge the corpus. Defaults to 20, giving 100
            entries. A value of 0 or less yields an empty list.

    Returns:
        A new list of story strings: the five base stories repeated
        ``repeats`` times.
    """
    # Sample creative writing dataset
    base_stories = [
        "The old lighthouse keeper had seen many storms, but none like this one. The waves crashed against the rocks with unprecedented fury, and the wind howled like a banshee. Yet, he remained at his post, knowing that ships depended on his light to guide them safely to shore.",
        "In the heart of the ancient library, between dusty tomes and forgotten scrolls, lived a small dragon named Ember. Unlike his fierce relatives, Ember preferred reading to roaring, and his favorite pastime was organizing books by their magical properties.",
        "The time traveler checked her pocket watch one last time before stepping into the swirling portal. She had one chance to prevent the catastrophe that would reshape history, but changing the past always came with unexpected consequences.",
        "Professor Chen's latest invention hummed quietly in the corner of her laboratory. The device could translate any language in the universe, but she had just discovered it was also picking up signals from civilizations that shouldn't exist.",
        "The garden grew in impossible ways - flowers bloomed in winter, trees bore fruit out of season, and the paths rearranged themselves when no one was looking. The gardener smiled, knowing that magic required patience and understanding.",
    ]

    # Repeat for more training data
    return base_stories * repeats

# Build the story corpus and preview the start of the first entry
custom_stories = create_custom_dataset()
print(f"Created custom dataset with {len(custom_stories)} stories")
print(f"\nSample story:")
print(custom_stories[0][:200] + "...")

# Write the corpus to disk, each story followed by a blank line
with open('custom_stories.txt', 'w', encoding='utf-8') as f:
    f.writelines(story + '\n\n' for story in custom_stories)

print("\n💾 Custom dataset saved to 'custom_stories.txt'")

In [None]:
# Interactive text generation function
def interactive_generation():
    """Run a prompt/response loop that completes user prompts with GPT-2.

    Reads a prompt from standard input, asks for one of three sampling
    presets, generates a continuation with the module-level ``generator``
    and prints it. The loop ends when the user types ``quit`` (case and
    surrounding whitespace are ignored), presses Ctrl-C, or input reaches
    end-of-file. Blank prompts are skipped.

    Returns:
        None.
    """
    # (temperature, top_p) per menu choice; any other answer means "balanced".
    presets = {
        '1': (1.2, 0.95),  # creative
        '3': (0.5, 0.8),   # conservative
    }
    balanced = (0.8, 0.9)

    print("🎮 Interactive Text Generation")
    print("Enter prompts and see GPT-2 complete them!")
    print("Type 'quit' to exit\n")

    while True:
        try:
            prompt = input("Enter your prompt: ")
            cleaned = prompt.strip()

            if cleaned.lower() == 'quit':
                break

            if not cleaned:
                continue

            # Get generation parameters
            print("\nGeneration options:")
            print("1. Creative (high temperature)")
            print("2. Balanced (medium temperature)")
            print("3. Conservative (low temperature)")

            choice = input("Choose option (1-3, default=2): ").strip() or '2'
            temp, top_p = presets.get(choice, balanced)

            # Generate text
            generated = generator(
                prompt,
                max_length=200,
                temperature=temp,
                top_p=top_p,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id
            )

            print("\n" + "="*60)
            print("GENERATED TEXT:")
            print("="*60)
            print(generated[0]['generated_text'])
            print("="*60 + "\n")

        except (KeyboardInterrupt, EOFError):
            # EOFError must end the loop: once stdin is exhausted every later
            # input() call fails the same way, so retrying would spin forever.
            break
        except Exception as e:
            # Keep the session alive after a failed generation.
            print(f"Error: {e}")

    print("Thanks for using the interactive generator!")

# Note: Uncomment the line below to run interactive mode
# interactive_generation()

In [None]:
# Analyze model performance and generation quality
def analyze_generation_quality(prompts, num_samples=3):
    """Sample several continuations per prompt and score them.

    For each prompt, ``num_samples`` texts are generated with the
    module-level ``generator``. Two metrics are printed and recorded:
    diversity (distinct lower-cased whitespace-separated words divided by
    total words across all samples, or 0 when there are no words) and the
    mean word count per sample.

    Returns:
        A list with one dict per prompt, holding the keys ``'prompt'``,
        ``'diversity'``, ``'avg_length'`` and ``'generations'``.
    """
    report = []

    for prompt in prompts:
        print(f"\n📊 Analyzing prompt: '{prompt[:50]}...'")

        # Draw the samples one call at a time and keep only the text
        samples = [
            generator(
                prompt,
                max_length=150,
                temperature=0.8,
                top_p=0.9,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id
            )[0]['generated_text']
            for _ in range(num_samples)
        ]

        # Pool the lower-cased words of every sample for the diversity ratio
        pooled_words = [
            word for text in samples for word in text.lower().split()
        ]
        word_count = len(pooled_words)
        diversity = len(set(pooled_words)) / word_count if word_count else 0

        mean_words = np.mean([len(text.split()) for text in samples])

        report.append({
            'prompt': prompt,
            'diversity': diversity,
            'avg_length': mean_words,
            'generations': samples
        })

        print(f"Diversity score: {diversity:.3f}")
        print(f"Average length: {mean_words:.1f} words")

    return report

# Test prompts for analysis
test_prompts = [
    "The future of space exploration",
    "A day in the life of a robot",
    "The most important lesson I learned"
]

quality_results = analyze_generation_quality(test_prompts)

# Collect the per-prompt metrics before the plotting guard so the summary
# below can always refer to them, even when there are no results.
diversities = [r['diversity'] for r in quality_results]
lengths = [r['avg_length'] for r in quality_results]

# Visualize results (skipped when there is nothing to plot)
if quality_results:
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    # Diversity scores, one bar per prompt in test_prompts order
    ax1.bar(range(len(diversities)), diversities)
    ax1.set_title('📊 Generation Diversity Scores')
    ax1.set_xlabel('Prompt Index')
    ax1.set_ylabel('Diversity Score')
    ax1.set_xticks(range(len(diversities)))

    # Average lengths, one bar per prompt in test_prompts order
    ax2.bar(range(len(lengths)), lengths, color='orange')
    ax2.set_title('📏 Average Generation Length')
    ax2.set_xlabel('Prompt Index')
    ax2.set_ylabel('Average Words')
    ax2.set_xticks(range(len(lengths)))

    plt.tight_layout()
    plt.show()

print(f"\n📊 GPT-2 Analysis Summary:")
print(f"Model: {MODEL_NAME}")
print(f"Parameters: {model.num_parameters():,}")
print(f"Vocabulary: {tokenizer.vocab_size:,} tokens")
# Averages are only meaningful when at least one prompt was analyzed
if quality_results:
    print(f"Average diversity: {np.mean(diversities):.3f}")
    print(f"Average length: {np.mean(lengths):.1f} words")
else:
    print("No generation results to summarize.")

## 🎉 Congratulations!

You've successfully explored GPT-2 for text generation! Here's what you've accomplished:

✅ **GPT-2 Usage**: Loaded and used pre-trained language models  
✅ **Generation Strategies**: Explored different sampling methods  
✅ **Creative Applications**: Generated stories, poems, and more  
✅ **Parameter Tuning**: Controlled creativity with temperature  
✅ **Quality Analysis**: Measured generation diversity and length  

### 🚀 Next Steps:
1. Fine-tune GPT-2 on your custom dataset
2. Try larger models (GPT-2 Medium/Large)
3. Implement controllable generation
4. Move on to **Project 12: Diffusion Models for Image Generation**

Ready for the final project? Let's generate images with diffusion! 🎨