In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import random

# Set random seeds for reproducibility.
# One constant feeds every RNG so the seed can be changed in a single place.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

print("PyTorch Version:", torch.__version__)
print("CUDA Available:", torch.cuda.is_available())

# Set style for plots.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*";
# earlier releases only know "seaborn". Pick whichever this installation
# provides and fall back to the default style rather than failing the cell.
_plot_style = next(
    (name for name in ('seaborn-v0_8', 'seaborn') if name in plt.style.available),
    'default',
)
plt.style.use(_plot_style)
sns.set_palette("husl")


In [None]:
# Example: Traditional neural network with fixed dimensions
class TraditionalNN(nn.Module):
    """Two-layer feed-forward network with layer widths fixed at construction.

    Args:
        input_size: Number of features the first linear layer accepts.
        hidden_size: Width of the hidden representation.
        output_size: Number of features the second linear layer produces.
    """

    def __init__(self, input_size=10, hidden_size=20, output_size=5):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return ``fc2(relu(fc1(x)))`` for the input tensor ``x``."""
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        return self.fc2(activated)

# A fixed-width network is happy as long as every input has exactly 10 features.
traditional_model = TraditionalNN()
fixed_input = torch.randn(1, 10)  # one sample, ten features
output = traditional_model(fixed_input)
for label, tensor in (("Input", fixed_input), ("Output", output)):
    print(f"{label} shape: {tensor.shape}")

# Natural-language inputs, by contrast, come in arbitrary lengths.
sentences = [
    "Hello",
    "How are you today?",
    "This is a much longer sentence with many words",
    "Hi"
]

print("\nVariable-length sentence examples:")
for number, sentence in enumerate(sentences, start=1):
    word_count = len(sentence.split())
    print(f"Sentence {number}: '{sentence}' (length: {word_count} words)")

# Spell out why the fixed-width model cannot consume these sentences directly.
for message in (
    "\nProblem: How do we feed these into a traditional neural network?",
    "- Different input sizes: Can't use fixed Linear layers",
    "- Different output needs: Translations have different lengths",
    "- Information loss: How to represent variable content in fixed size?",
):
    print(message)


In [None]:
# Real-world sequence-to-sequence examples
seq2seq_examples = {
    "Machine Translation": {
        "input": "Hello, how are you?",
        "output": "Bonjour, comment allez-vous?",
        "challenge": "Different languages have different word orders and lengths"
    },
    "Text Summarization": {
        "input": "The quick brown fox jumps over the lazy dog. This sentence contains every letter of the alphabet.",
        "output": "Sentence with all alphabet letters.",
        "challenge": "Output much shorter than input, must capture key information"
    },
    "Question Answering": {
        "input": "What is the capital of France? Context: France is a country in Europe with Paris as its capital.",
        "output": "Paris",
        "challenge": "Extract specific information from longer context"
    },
    "Code Generation": {
        "input": "Create a function that adds two numbers",
        "output": "def add_numbers(a, b):\n    return a + b",
        "challenge": "Natural language to structured code with syntax"
    },
    "Chatbot Response": {
        "input": "I'm feeling sad today",
        "output": "I'm sorry to hear that. What's been bothering you?",
        "challenge": "Context-aware, empathetic response generation"
    }
}


def _preview(text, limit=30):
    """Return `text` cut to `limit` characters, adding '...' only if it was cut."""
    return text if len(text) <= limit else text[:limit] + "..."


# Whitespace-delimited word counts, computed once and shared by the figure
# and the printed summary below.
word_counts = {
    task: (len(example["input"].split()), len(example["output"].split()))
    for task, example in seq2seq_examples.items()
}

# Visualize the examples
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

# zip() pairs each example with one subplot and stops when either runs out.
for ax, (task, example) in zip(axes, seq2seq_examples.items()):
    input_len, output_len = word_counts[task]

    # Create bar chart
    ax.bar(['Input', 'Output'], [input_len, output_len],
           color=['skyblue', 'lightcoral'], alpha=0.7)
    ax.set_title(f'{task}\nLength Comparison', fontsize=10, fontweight='bold')
    ax.set_ylabel('Word Count')

    # Add example text as annotation (x=0.5 sits between the two bars)
    ax.text(0.5, max(input_len, output_len) * 0.8,
            f'Input: "{_preview(example["input"])}"\nOutput: "{_preview(example["output"])}"',
            ha='center', va='top', fontsize=8,
            bbox=dict(boxstyle="round,pad=0.3", facecolor="wheat", alpha=0.7))

# Remove every subplot that did not receive an example, not just the last one
for unused_ax in axes[len(seq2seq_examples):]:
    unused_ax.remove()

# Add the figure title first, then let tight_layout pack the subplots into the
# area below it; `rect` reserves the top strip so the titles do not overlap.
fig.suptitle('Sequence-to-Sequence Applications: Input vs Output Lengths',
             fontsize=14, fontweight='bold', y=0.98)
fig.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()

# Print detailed examples
print("Detailed Sequence-to-Sequence Examples:\n")
for task, example in seq2seq_examples.items():
    input_len, output_len = word_counts[task]
    print(f"=== {task} ===")
    print(f"Input:  '{example['input']}'")
    print(f"Output: '{example['output']}'")
    print(f"Challenge: {example['challenge']}")
    print(f"Input length: {input_len} words")
    print(f"Output length: {output_len} words")
    print()


In [None]:
# Visualize the encoder-decoder concept
def create_encoder_decoder_diagram(input_words=None, output_words=None):
    """Draw and show a schematic of the encoder-decoder idea.

    Input tokens are drawn as blue boxes at x = 0..n-1, a single yellow
    "Context Vector" box is placed one slot after the last input token, and
    the output tokens are drawn as green boxes starting two slots after the
    context box. Red arrows run from every input box to the context box and
    green arrows from the context box to every output box.

    Args:
        input_words: Tokens shown on the encoder side. Defaults to
            ``["Hello", "world", "!"]``.
        output_words: Tokens shown on the decoder side. Defaults to
            ``["Bonjour", "le", "monde", "!"]``.

    Returns:
        None. The figure is displayed with ``plt.show()``.

    Raises:
        ValueError: If either token sequence is empty.
    """
    input_words = ["Hello", "world", "!"] if input_words is None else list(input_words)
    output_words = (["Bonjour", "le", "monde", "!"] if output_words is None
                    else list(output_words))
    # Check before creating the figure so a bad call does not leave an empty one open.
    if not input_words or not output_words:
        raise ValueError("input_words and output_words must each contain at least one token")

    fig, ax = plt.subplots(1, 1, figsize=(14, 8))

    # Context vector position: one empty slot after the last input token
    context_pos = len(input_words) + 1

    # Output tokens start two slots to the right of the context vector
    output_positions = np.arange(context_pos + 2, context_pos + 2 + len(output_words))

    # Draw input sequence
    for i, word in enumerate(input_words):
        rect = plt.Rectangle((i - 0.4, 0.5), 0.8, 0.8,
                             facecolor='lightblue', edgecolor='blue', linewidth=2)
        ax.add_patch(rect)
        ax.text(i, 0.9, word, ha='center', va='center', fontweight='bold')

    # Draw arrows from the bottom of each input box towards the context box
    for i in range(len(input_words)):
        ax.arrow(i, 0.5, context_pos - i - 0.1, -0.3,
                 head_width=0.1, head_length=0.1, fc='red', ec='red')

    # Draw context vector
    rect = plt.Rectangle((context_pos - 0.6, -0.4), 1.2, 1.2,
                         facecolor='yellow', edgecolor='orange', linewidth=3)
    ax.add_patch(rect)
    ax.text(context_pos, 0.2, 'Context\nVector', ha='center', va='center',
            fontweight='bold', fontsize=12)

    # Draw arrows from the context box towards each output box
    for pos in output_positions:
        ax.arrow(context_pos + 0.1, 0.2, pos - context_pos - 0.2, 0.6,
                 head_width=0.1, head_length=0.1, fc='green', ec='green')

    # Draw output sequence
    for word, pos in zip(output_words, output_positions):
        rect = plt.Rectangle((pos - 0.4, 0.9), 0.8, 0.8,
                             facecolor='lightgreen', edgecolor='green', linewidth=2)
        ax.add_patch(rect)
        ax.text(pos, 1.3, word, ha='center', va='center', fontweight='bold')

    # Labels: centred under the input boxes and above the output boxes
    ax.text(len(input_words) / 2 - 0.5, -0.8, 'ENCODER', ha='center', va='center',
            fontsize=14, fontweight='bold', color='blue')
    ax.text(np.mean(output_positions), 2.1, 'DECODER', ha='center', va='center',
            fontsize=14, fontweight='bold', color='green')

    # Set limits and remove axes
    ax.set_xlim(-1, max(output_positions) + 1)
    ax.set_ylim(-1.2, 2.5)
    ax.set_aspect('equal')
    ax.axis('off')

    ax.set_title('Encoder-Decoder Architecture Concept', fontsize=16, fontweight='bold', pad=20)
    fig.tight_layout()
    plt.show()

create_encoder_decoder_diagram()

# Walk through the process shown in the diagram, one printed line per step.
process_lines = (
    "Encoder-Decoder Process:",
    "1. ENCODER reads input sequence word by word",
    "2. ENCODER creates a fixed-size context vector containing sequence information",
    "3. DECODER takes context vector and generates output sequence word by word",
    "4. Each decoder step uses previous word and context to predict next word",
    "\nKey Innovation: Fixed-size context vector bridges variable-length sequences!",
)
for process_line in process_lines:
    print(process_line)


In [None]:
# Simple seq2seq task: reverse a sequence of numbers
def generate_reverse_data(num_samples=1000, max_length=10, min_length=3, value_range=(1, 9)):
    """Generate (sequence, reversed sequence) pairs for the reversal task.

    Randomness comes from the module-level ``random`` generator, so calling
    ``random.seed(...)`` beforehand makes the output reproducible. For each
    sample the length is drawn first and then the values, one at a time.

    Args:
        num_samples: Number of pairs to generate.
        max_length: Largest sequence length (inclusive).
        min_length: Smallest sequence length (inclusive). Defaults to 3.
        value_range: ``(low, high)`` inclusive bounds for each integer in a
            sequence. Defaults to ``(1, 9)``.

    Returns:
        A list of ``(sequence, target)`` tuples where both items are lists of
        ints and ``target`` is ``sequence`` reversed.

    Raises:
        ValueError: If ``min_length`` is negative, ``max_length`` is smaller
            than ``min_length``, or ``value_range`` has ``low > high``.
    """
    low, high = value_range
    if min_length < 0:
        raise ValueError(f"min_length must be non-negative, got {min_length}")
    if max_length < min_length:
        # Without this check random.randint fails with an opaque "empty range" error.
        raise ValueError(
            f"max_length ({max_length}) must be >= min_length ({min_length})"
        )
    if low > high:
        raise ValueError(f"value_range must satisfy low <= high, got {value_range}")

    data = []
    for _ in range(num_samples):
        # Random length between min_length and max_length (both inclusive)
        length = random.randint(min_length, max_length)

        # Random sequence of integers within value_range
        sequence = [random.randint(low, high) for _ in range(length)]

        # Target is the reversed sequence (slicing makes an independent list)
        data.append((sequence, sequence[::-1]))

    return data

# Generate sample data
sample_data = generate_reverse_data(10, 6)

print("Sample Sequence Reversal Data:")
print("Input -> Target")
print("-" * 30)
for input_seq, target_seq in sample_data:
    print(f"{input_seq} -> {target_seq}")

# Analyze the data characteristics
all_input_lengths = [len(source) for source, _ in sample_data]
all_output_lengths = [len(target) for _, target in sample_data]

print("\nData Characteristics:")
print(f"Input lengths: {sorted(set(all_input_lengths))}")
print(f"Output lengths: {sorted(set(all_output_lengths))}")
print(f"Same length? {all_input_lengths == all_output_lengths}")

# Visualize length distribution
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Length distribution, drawn in ascending length order with one tick per length
length_counts = Counter(all_input_lengths)
observed_lengths = sorted(length_counts)
ax1.bar(observed_lengths, [length_counts[n] for n in observed_lengths],
        color='skyblue', alpha=0.7)
ax1.set_xticks(observed_lengths)
ax1.set_xlabel('Sequence Length')
ax1.set_ylabel('Count')
ax1.set_title('Sequence Length Distribution')

# Sample sequences visualization.
# Input and output have equal lengths in this task, so bars drawn at the same
# y position would cover each other exactly; offset them into grouped pairs.
first_samples = sample_data[:5]
sample_index = np.arange(len(first_samples))
bar_height = 0.4
ax2.barh(sample_index - bar_height / 2, [len(source) for source, _ in first_samples],
         height=bar_height, color='lightgreen', alpha=0.7, label='Input')
ax2.barh(sample_index + bar_height / 2, [len(target) for _, target in first_samples],
         height=bar_height, color='lightcoral', alpha=0.7, label='Output')
ax2.set_yticks(sample_index)
ax2.set_xlabel('Length')
ax2.set_ylabel('Sample Index')
ax2.set_title('Input vs Output Lengths (First 5 samples)')
ax2.legend()

plt.tight_layout()
plt.show()

print("\nKey Observations:")
print("1. Input and output have same length (for reversal task)")
print("2. But in general, seq2seq models handle different input/output lengths")
print("3. This is a simple task to understand the encoder-decoder concept")
print("4. Encoder must capture the entire sequence information")
print("5. Decoder must generate sequence in reverse order")


In [None]:
# Demonstrate the context vector bottleneck problem
def simulate_context_vector_limitation(sequences=None, context_size=128, units_per_word=50):
    """Print a toy estimate of how much a fixed-size context vector compresses text.

    This is an illustrative heuristic, not a measurement: every
    whitespace-delimited word is assumed to carry ``units_per_word`` units of
    information, and the context vector is assumed to hold ``context_size``
    units. For each sentence the function prints the word count, estimated
    information, compression ratio, retained percentage (capped at 100) and a
    HIGH / MODERATE / LOW compression verdict.

    Args:
        sequences: Iterable of sentences to analyse. Defaults to four example
            sentences of increasing length.
        context_size: Assumed capacity of the context vector, in units.
        units_per_word: Assumed information carried by each word, in units.

    Returns:
        None. All results are written to standard output.

    Raises:
        ValueError: If ``context_size`` is not positive.
    """
    if context_size <= 0:
        raise ValueError(f"context_size must be positive, got {context_size}")

    if sequences is None:
        # Different length sequences
        sequences = [
            "Hi",
            "Hello there",
            "How are you doing today my friend",
            "This is a very long sentence with lots of important information that needs to be preserved"
        ]

    print("Context Vector Information Compression:")
    print("=" * 60)

    for i, seq in enumerate(sequences):
        words = seq.split()
        input_info = len(words) * units_per_word

        # Information loss calculation (simplified)
        compression_ratio = input_info / context_size
        if compression_ratio > 0:
            information_retained = min(100, 100 / compression_ratio)
        else:
            # A sentence with no words has nothing to lose; this also avoids
            # dividing by a zero compression ratio.
            information_retained = 100

        print(f"\nSequence {i+1}: '{seq}'")
        print(f"  Input length: {len(words)} words")
        print(f"  Estimated input information: {input_info} units")
        print(f"  Context vector size: {context_size} units")
        print(f"  Compression ratio: {compression_ratio:.2f}x")
        print(f"  Information retained: {information_retained:.1f}%")

        if compression_ratio > 2:
            print("  ⚠️  HIGH COMPRESSION - Potential information loss!")
        elif compression_ratio > 1.5:
            print("  ⚠️  MODERATE COMPRESSION - Some information loss")
        else:
            print("  ✅ LOW COMPRESSION - Most information preserved")

simulate_context_vector_limitation()

# Two side-by-side panels: raw information vs capacity, and retention percentage.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Left panel inputs: each word is assumed to contribute 50 information units.
seq_lengths = [2, 3, 8, 16, 32, 64]
context_size = 128
information_units = [50 * n_words for n_words in seq_lengths]

ax1.plot(seq_lengths, information_units, 'bo-',
         linewidth=2, markersize=8, label='Input Information')
ax1.axhline(y=context_size, color='red', linestyle='--',
            linewidth=2, label='Context Vector Capacity')
ax1.fill_between(seq_lengths, 0, context_size,
                 alpha=0.3, color='red', label='Context Vector Limit')
ax1.set(xlabel='Sequence Length (words)',
        ylabel='Information Units',
        title='Information Bottleneck Problem')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Right panel: share of the input that fits in the context vector, capped at 100%.
retention = []
for info in information_units:
    retention.append(min(100, 100 * context_size / info))

ax2.plot(seq_lengths, retention, 'ro-', linewidth=2, markersize=8)
for level, colour, caption in ((100, 'green', 'Perfect Retention'),
                               (50, 'orange', '50% Retention')):
    ax2.axhline(y=level, color=colour, linestyle='--', alpha=0.7, label=caption)
ax2.set(xlabel='Sequence Length (words)',
        ylabel='Information Retained (%)',
        title='Information Retention vs Sequence Length')
ax2.legend()
ax2.grid(True, alpha=0.3)
ax2.set_ylim(0, 105)

plt.tight_layout()
plt.show()

# Closing takeaways for this section.
for insight in (
    "\n🔍 Key Insights:",
    "1. Fixed-size context vector creates an information bottleneck",
    "2. Longer sequences suffer more information loss",
    "3. This is why basic seq2seq models struggle with long sequences",
    "4. Tomorrow we'll learn about attention mechanisms to solve this!",
    "\n💡 This limitation led to the invention of attention mechanisms!",
):
    print(insight)
