In [None]:
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import random

# Reproducibility: a single named seed shared by every RNG seeded here, so the
# value can be changed in one place. The three calls seed PyTorch, NumPy's
# legacy global RNG, and Python's stdlib `random` module respectively.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*';
# older releases only register the un-prefixed 'seaborn' name. Pick whichever
# this installation provides instead of raising on an unknown style name.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
sns.set_palette("husl")


In [None]:
# Common problems in sequence-to-sequence models
def analyze_seq2seq_problems():
    """Print a catalogue of common sequence-to-sequence failure modes.

    For every failure mode the report contains, in order: a numbered title,
    an '=' underline, a one-line description, an illustrative example, the
    listed causes (bulleted with '•'), the listed solutions (marked with
    '✓') and a trailing blank line.

    Takes no arguments and returns None; everything is written to stdout.
    """

    # Insertion order of this mapping is the order of the printed report.
    catalogue = {
        "Repetitive Outputs": dict(
            description="Model generates the same token repeatedly",
            example="Input: 'Hello' -> Output: 'the the the the the'",
            causes=[
                "Poor context vector representation",
                "Insufficient training data",
                "Local minima in training",
                "Vanishing gradients",
            ],
            solutions=[
                "Add attention mechanisms",
                "Increase model capacity",
                "Better initialization",
                "Gradient clipping",
            ],
        ),
        "Short Sequences": dict(
            description="Model terminates sequences too early",
            example="Input: 'How are you today?' -> Output: 'Fine.'",
            causes=[
                "Biased training data (short targets)",
                "EOS token learned too aggressively",
                "Information loss in context vector",
            ],
            solutions=[
                "Balanced sequence lengths in training",
                "Sequence length penalties",
                "Better context representation",
            ],
        ),
        "Nonsensical Results": dict(
            description="Grammatically incorrect or meaningless output",
            example="Input: 'Good morning' -> Output: 'Car blue elephant'",
            causes=[
                "Insufficient training",
                "Poor vocabulary handling",
                "Context vector bottleneck",
                "Out-of-vocabulary words",
            ],
            solutions=[
                "More training data",
                "Better preprocessing",
                "Attention mechanisms",
                "Subword tokenization",
            ],
        ),
        "Exposure Bias": dict(
            description="Performance gap between training and inference",
            example="Good training loss but poor inference quality",
            causes=[
                "Teacher forcing during training",
                "Model never sees its own errors",
                "Different input distributions",
            ],
            solutions=[
                "Scheduled sampling",
                "Curriculum learning",
                "Inference-time training",
            ],
        ),
    }

    # Assemble each entry's lines first, then emit them with a single print.
    for number, title in enumerate(catalogue, start=1):
        entry = catalogue[title]
        report_lines = [
            f"{number}. {title}",
            # Underline width is the title length plus three characters.
            "=" * (len(title) + 3),
            f"Description: {entry['description']}",
            f"Example: {entry['example']}",
            "Causes:",
            *(f"  • {reason}" for reason in entry['causes']),
            "Solutions:",
            *(f"  ✓ {remedy}" for remedy in entry['solutions']),
            # Empty last element + print's newline = blank separator line.
            "",
        ]
        print("\n".join(report_lines))

# Run the report. The function only prints and has no return statement,
# so this call evaluates to None.
analyze_seq2seq_problems()
