In [None]:
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from collections import defaultdict
import re
import random
from typing import List, Dict, Tuple

# Plot appearance: seaborn-style dark grid with the "husl" colour cycle
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Seed PyTorch, NumPy and the stdlib RNG (in that order) for reproducible runs
for _seed_rng in (torch.manual_seed, np.random.seed, random.seed):
    _seed_rng(42)

# Resolve the compute device once; the banner below reports the same choice
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print("Libraries imported successfully!")
print(f"PyTorch version: {torch.__version__}")
print(f"Device available: {'GPU' if device.type == 'cuda' else 'CPU'}")


In [None]:
# Let's create examples of how different NLP tasks are converted to text-to-text format

def create_task_examples():
    """Return sample input/output pairs contrasting task-specific and T5 formats.

    Returns:
        dict: Maps a task name to ``{"traditional": {...}, "t5": {...}}``,
        where each inner dict has an ``"input"`` and an ``"output"`` string.
        Tasks appear in the order they are listed in the table below.
    """

    def _pair(source_text, target_text):
        # One formatted example: what goes into the model and what comes out.
        return {"input": source_text, "output": target_text}

    # (task name, (traditional input, traditional output), (t5 input, t5 output))
    rows = [
        (
            "Sentiment Classification",
            ("I love this movie! It's amazing.", "positive (probability: 0.95)"),
            ("sentiment: I love this movie! It's amazing.", "positive"),
        ),
        (
            "Translation",
            ("Hello world", "Bonjour le monde"),
            ("translate English to French: Hello world", "Bonjour le monde"),
        ),
        (
            "Question Answering",
            (
                "Context: Paris is the capital of France. Question: What is the capital of France?",
                "span_start: 0, span_end: 5 (Paris)",
            ),
            (
                "question: What is the capital of France? context: Paris is the capital of France.",
                "Paris",
            ),
        ),
        (
            "Summarization",
            (
                "Long article about AI development spanning multiple paragraphs...",
                "Generated summary tokens with attention weights",
            ),
            (
                "summarize: Long article about AI development spanning multiple paragraphs...",
                "AI development has made significant progress",
            ),
        ),
        (
            "Text Generation",
            ("The weather today is", "Next token probabilities"),
            ("generate: The weather today is", "sunny and warm with clear skies"),
        ),
    ]

    return {
        task_name: {"traditional": _pair(*classic), "t5": _pair(*text_to_text)}
        for task_name, classic, text_to_text in rows
    }

# Display the examples: one section per task, traditional format first, then T5
examples = create_task_examples()

print("=" * 80)
print("TRADITIONAL APPROACH vs T5 TEXT-TO-TEXT APPROACH")
print("=" * 80)

for task, formats in examples.items():
    # Section bullet is U+1F539 (small blue diamond). It is written as an
    # escape because the literal character had been mis-decoded into mojibake.
    print(f"\n\U0001F539 {task.upper()}")
    print("-" * 60)

    print("Traditional Approach:")
    print(f"  Input:  {formats['traditional']['input']}")
    print(f"  Output: {formats['traditional']['output']}")

    print("\nT5 Text-to-Text Approach:")
    print(f"  Input:  {formats['t5']['input']}")
    print(f"  Output: {formats['t5']['output']}")
    print()

print("=" * 80)
print("KEY INSIGHT: T5 converts ALL tasks to the same input-output format!")
print("This allows one model to handle multiple tasks through task prefixes.")
print("=" * 80)


In [None]:
# Let's implement a simplified T5-style task formatter

class T5TaskFormatter:
    """Turns raw task inputs into prefix-tagged strings in T5's text-to-text style.

    ``task_prefixes`` maps an internal task key to the literal prefix that is
    placed in front of the text, separated from it by a single space.
    """

    # Language-code pairs that have a spelled-out prefix in ``task_prefixes``.
    _TRANSLATION_KEYS = {
        ('en', 'fr'): 'translate_en_fr',
        ('fr', 'en'): 'translate_fr_en',
    }

    def __init__(self):
        self.task_prefixes = {
            'sentiment': 'sentiment:',
            'translate_en_fr': 'translate English to French:',
            'translate_fr_en': 'translate French to English:',
            'summarize': 'summarize:',
            'question': 'question:',
            'generate': 'generate:',
            'classify': 'classify:',
            'paraphrase': 'paraphrase:'
        }

    def _prefixed(self, task_key: str, payload: str) -> str:
        """Return ``payload`` preceded by the prefix registered under ``task_key``."""
        return f"{self.task_prefixes[task_key]} {payload}"

    def format_sentiment(self, text: str) -> str:
        """Build the sentiment-analysis input for ``text``."""
        return self._prefixed('sentiment', text)

    def format_translation(self, text: str, src_lang: str = 'en', tgt_lang: str = 'fr') -> str:
        """Build the translation input for ``text``.

        The en->fr and fr->en directions use the spelled-out prefixes from
        ``task_prefixes``; any other pair falls back to
        ``"translate <src_lang> to <tgt_lang>:"`` using the codes as given.
        """
        known_key = self._TRANSLATION_KEYS.get((src_lang, tgt_lang))
        if known_key is None:
            return f"translate {src_lang} to {tgt_lang}: {text}"
        return self._prefixed(known_key, text)

    def format_qa(self, question: str, context: str) -> str:
        """Build the question-answering input: question first, then its context."""
        question_part = self._prefixed('question', question)
        return f"{question_part} context: {context}"

    def format_summarization(self, text: str) -> str:
        """Build the summarization input for ``text``."""
        return self._prefixed('summarize', text)

    def format_generation(self, prompt: str) -> str:
        """Build the free-form generation input for ``prompt``."""
        return self._prefixed('generate', prompt)

# Create formatter instance
formatter = T5TaskFormatter()

# Walk through one sample per task: raw text, its prefixed T5 input, and the
# target string a model would be expected to emit.
print("T5 TASK FORMATTING EXAMPLES", "=" * 50, sep="\n")

# Sentiment Analysis
sentiment_text = "This movie was absolutely fantastic!"
formatted_sentiment = formatter.format_sentiment(sentiment_text)
print(
    f"Original: {sentiment_text}",
    f"T5 Format: {formatted_sentiment}",
    "Expected Output: positive\n",
    sep="\n",
)

# Translation (defaults to English -> French)
english_text = "Good morning, how are you?"
formatted_translation = formatter.format_translation(english_text)
print(
    f"Original: {english_text}",
    f"T5 Format: {formatted_translation}",
    "Expected Output: Bonjour, comment allez-vous?\n",
    sep="\n",
)

# Question Answering
question = "What is the capital of Japan?"
context = "Tokyo is the capital and largest city of Japan."
formatted_qa = formatter.format_qa(question, context)
print(
    f"Question: {question}",
    f"Context: {context}",
    f"T5 Format: {formatted_qa}",
    "Expected Output: Tokyo\n",
    sep="\n",
)

# Summarization
long_text = "Machine learning is a subset of artificial intelligence that focuses on algorithms that can learn from data. It has applications in many fields including computer vision, natural language processing, and robotics."
formatted_summary = formatter.format_summarization(long_text)
print(
    f"Original: {long_text}",
    f"T5 Format: {formatted_summary}",
    "Expected Output: ML is AI that learns from data for vision, NLP, and robotics.",
    sep="\n",
)

print("\n" + "=" * 50)
print("Notice how all tasks now have the same input-output structure!")
print("This is the power of the text-to-text paradigm.")


In [None]:
# Create a multi-task dataset for T5 training

class MultiTaskDataset:
    """Builds a small in-memory multi-task dataset in T5 text-to-text format.

    Every record is a dict with the keys ``'task'``, ``'input'`` (the
    prefix-formatted prompt) and ``'output'`` (the target text).

    Attributes:
        formatter: ``T5TaskFormatter`` used to build the ``'input'`` strings.
        data: List of records added so far.
        task_counts: Number of records per task name.
    """

    def __init__(self):
        self.formatter = T5TaskFormatter()
        self.data = []
        self.task_counts = defaultdict(int)

    def _add_example(self, task: str, input_text: str, output_text: str) -> None:
        """Append one record to ``data`` and bump the counter for ``task``."""
        self.data.append({
            'task': task,
            'input': input_text,
            'output': output_text
        })
        self.task_counts[task] += 1

    def add_sentiment_data(self):
        """Append the ten sentiment examples (task name ``'sentiment'``)."""
        sentiment_examples = [
            ("I love this product! It's amazing.", "positive"),
            ("This is the worst thing I've ever bought.", "negative"),
            ("The movie was okay, nothing special.", "neutral"),
            ("Absolutely fantastic experience!", "positive"),
            ("Terrible customer service.", "negative"),
            ("It's an average product.", "neutral"),
            ("Best purchase ever!", "positive"),
            ("Complete waste of money.", "negative"),
            ("Pretty good, would recommend.", "positive"),
            ("Not worth the price.", "negative")
        ]

        for text, label in sentiment_examples:
            self._add_example('sentiment', self.formatter.format_sentiment(text), label)

    def add_translation_data(self):
        """Append translation examples in both directions.

        Each English/French pair yields two records: one under
        ``'translation_en_fr'`` and one under ``'translation_fr_en'``.
        """
        # The accented characters below had been mis-decoded into mojibake
        # (e.g. "Enchant√©"); they are restored to the intended French text.
        translation_examples = [
            ("Hello", "Bonjour"),
            ("Thank you", "Merci"),
            ("Good morning", "Bonjour"),
            ("How are you?", "Comment allez-vous?"),
            ("I am fine", "Je vais bien"),
            ("What is your name?", "Comment vous appelez-vous?"),
            ("Nice to meet you", "Enchanté de vous rencontrer"),
            ("See you later", "À bientôt"),
            ("Have a good day", "Bonne journée"),
            ("Where is the library?", "Où est la bibliothèque?")
        ]

        for en, fr in translation_examples:
            # English to French
            self._add_example(
                'translation_en_fr',
                self.formatter.format_translation(en, 'en', 'fr'),
                fr,
            )
            # French to English
            self._add_example(
                'translation_fr_en',
                self.formatter.format_translation(fr, 'fr', 'en'),
                en,
            )

    def add_qa_data(self):
        """Append the five question-answering examples (task name ``'qa'``)."""
        qa_examples = [
            ("What is the capital of France?", "Paris is the capital of France.", "Paris"),
            ("Who invented the telephone?", "Alexander Graham Bell invented the telephone.", "Alexander Graham Bell"),
            ("What is the largest planet?", "Jupiter is the largest planet in our solar system.", "Jupiter"),
            ("When was Python created?", "Python was created by Guido van Rossum in 1991.", "1991"),
            ("What is machine learning?", "Machine learning is a subset of AI that learns from data.", "A subset of AI that learns from data")
        ]

        for question, context, answer in qa_examples:
            self._add_example('qa', self.formatter.format_qa(question, context), answer)

    def add_summarization_data(self):
        """Append the five summarization examples (task name ``'summarization'``)."""
        summarization_examples = [
            ("Machine learning is a branch of artificial intelligence that uses statistical techniques to give computers the ability to learn from data without being explicitly programmed.", "ML is AI that learns from data"),
            ("Deep learning is a subset of machine learning that uses neural networks with multiple layers to model and understand complex patterns in data.", "Deep learning uses multi-layer neural networks for complex patterns"),
            ("Natural language processing enables computers to understand, interpret, and generate human language in a valuable way.", "NLP helps computers understand and generate human language"),
            ("Computer vision is a field of artificial intelligence that trains computers to interpret and understand the visual world from digital images and videos.", "Computer vision teaches AI to understand visual data"),
            ("Transformers are a type of neural network architecture that has revolutionized natural language processing with their attention mechanism.", "Transformers revolutionized NLP with attention mechanisms")
        ]

        for text, summary in summarization_examples:
            self._add_example('summarization', self.formatter.format_summarization(text), summary)

    def create_dataset(self):
        """Build the full dataset, shuffle it in place, and return it.

        Returns:
            list: ``self.data`` holding every sentiment, translation, QA and
            summarization record in shuffled order. The shuffle uses the
            module-level ``random`` generator, so it follows the global seed.
        """
        # Start from a clean slate so that calling this again on the same
        # instance (e.g. re-running a notebook cell) does not append a second
        # copy of every example or double the counters.
        self.data = []
        self.task_counts = defaultdict(int)

        self.add_sentiment_data()
        self.add_translation_data()
        self.add_qa_data()
        self.add_summarization_data()

        # Shuffle the dataset
        random.shuffle(self.data)

        return self.data

    def get_statistics(self):
        """Summarise the dataset.

        Returns:
            dict: ``'total_examples'`` (int), ``'tasks'`` (task name -> count)
            and ``'task_distribution'`` (task name -> fraction of the total).
        """
        total_examples = len(self.data)
        stats = {
            'total_examples': total_examples,
            'tasks': dict(self.task_counts),
            # Guard the division in case counters exist while ``data`` is empty.
            'task_distribution': {
                task: (count / total_examples if total_examples else 0.0)
                for task, count in self.task_counts.items()
            }
        }
        return stats

# Build the shuffled multi-task dataset and collect its summary statistics
dataset_creator = MultiTaskDataset()
multi_task_data = dataset_creator.create_dataset()
stats = dataset_creator.get_statistics()

print("MULTI-TASK T5 DATASET CREATED", "=" * 40, sep="\n")
print(f"Total Examples: {stats['total_examples']}")

# Per-task record count and its share of the whole dataset
print("\nTask Distribution:")
for task_name in stats['tasks']:
    n_examples = stats['tasks'][task_name]
    share_pct = stats['task_distribution'][task_name] * 100
    print(f"  {task_name}: {n_examples} examples ({share_pct:.1f}%)")

# Preview the first five records of the shuffled data
print("\nSample Examples:", "-" * 40, sep="\n")
for position, record in enumerate(multi_task_data[:5], start=1):
    print(
        f"Example {position} ({record['task']}):",
        f"  Input:  {record['input']}",
        f"  Output: {record['output']}",
        "",
        sep="\n",
    )


In [None]:
# Visualize the multi-task dataset distribution

# Task names and their record counts, in the order the tasks were first added
tasks = list(stats['tasks'])
counts = [stats['tasks'][task_name] for task_name in tasks]
colors = sns.color_palette("husl", len(tasks))

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Left panel: share of each task as a pie chart
ax1.pie(counts, labels=tasks, autopct='%1.1f%%', colors=colors, startangle=90)
ax1.set_title('Task Distribution in Multi-Task Dataset', fontsize=14, fontweight='bold')

# Right panel: absolute number of examples per task
ax2.bar(tasks, counts, color=colors)
ax2.set_title('Number of Examples per Task', fontsize=14, fontweight='bold')
ax2.set_ylabel('Number of Examples')
ax2.set_xlabel('Task Type')
# Tilt the bar-chart tick labels (ax2 is the current axes at this point)
plt.setp(ax2.get_xticklabels(), rotation=45, ha='right')

plt.tight_layout()
plt.show()


def _plot_length_histogram(ax, lengths, title, color):
    """Draw a 20-bin word-count histogram on ``ax`` with a dashed line at the mean."""
    mean_length = np.mean(lengths)
    ax.hist(lengths, bins=20, alpha=0.7, color=color, edgecolor='black')
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel('Number of Words')
    ax.set_ylabel('Frequency')
    ax.axvline(mean_length, color='red', linestyle='--',
               label=f'Mean: {mean_length:.1f}')
    ax.legend()


# Analyze input/output length distributions (lengths are whitespace-split word counts)
input_lengths = [len(record['input'].split()) for record in multi_task_data]
output_lengths = [len(record['output'].split()) for record in multi_task_data]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
_plot_length_histogram(ax1, input_lengths, 'Input Length Distribution', 'skyblue')
_plot_length_histogram(ax2, output_lengths, 'Output Length Distribution', 'lightcoral')

plt.tight_layout()
plt.show()

print("DATASET ANALYSIS", "=" * 30, sep="\n")
# Average / max / min, each reported for inputs first and then outputs
for stat_label, stat_fn, number_format in (
    ("Average", np.mean, ".1f"),
    ("Max", max, ""),
    ("Min", min, ""),
):
    for side, lengths in (("input", input_lengths), ("output", output_lengths)):
        print(f"{stat_label} {side} length: {stat_fn(lengths):{number_format}} words")


In [None]:
# Simulate performance comparison between traditional and T5 approaches

def simulate_performance_comparison():
    """Return made-up metrics contrasting task-specific models with a single T5.

    All numbers are hard-coded illustrative values, not measurements.

    Returns:
        tuple: ``(tasks, traditional_performance, t5_performance,
        traditional_complexity, t5_complexity)`` where ``tasks`` is a list of
        task names and each of the other four is a dict keyed by those names.
        Performance values are scores in [0, 1]; complexity values are
        parameter counts in millions.
    """
    tasks = ['Sentiment', 'Translation', 'QA', 'Summarization', 'Classification']

    def _per_task(values):
        # Pair each value with the task at the same position in ``tasks``.
        return dict(zip(tasks, values))

    # Simulated accuracy/BLEU-style scores, one dedicated model per task
    traditional_performance = _per_task((0.85, 0.72, 0.78, 0.68, 0.82))

    # Simulated scores for the single shared T5 model
    t5_performance = _per_task((0.83, 0.74, 0.80, 0.71, 0.81))

    # Simulated size of each dedicated model, in millions of parameters
    traditional_complexity = _per_task((50, 120, 110, 130, 60))

    # The same 220M-parameter model serves every task
    t5_complexity = dict.fromkeys(tasks, 220)

    return tasks, traditional_performance, t5_performance, traditional_complexity, t5_complexity

# Get simulation data
tasks, trad_perf, t5_perf, trad_complex, t5_complex = simulate_performance_comparison()

# Create comparison visualizations (2x2 grid)
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))

# Performance comparison: paired bars per task
x = np.arange(len(tasks))
width = 0.35

ax1.bar(x - width/2, [trad_perf[task] for task in tasks], width,
        label='Traditional Models', alpha=0.8, color='lightblue')
ax1.bar(x + width/2, [t5_perf[task] for task in tasks], width,
        label='T5', alpha=0.8, color='lightcoral')

ax1.set_xlabel('Tasks')
ax1.set_ylabel('Performance Score')
ax1.set_title('Performance Comparison: Traditional vs T5', fontweight='bold')
ax1.set_xticks(x)
ax1.set_xticklabels(tasks, rotation=45)
ax1.legend()
ax1.grid(True, alpha=0.3)

# Model complexity comparison.
# The traditional setup deploys one model per task, so its sizes add up.
total_trad_params = sum(trad_complex.values())
# T5 is ONE model shared by every task: its per-task entries all describe the
# same network, so the total is that single size, not the sum over tasks
# (summing would count the same 220M parameters five times).
total_t5_params = max(t5_complex.values())

ax2.bar(['Traditional\n(5 models)', 'T5\n(1 model)'],
        [total_trad_params, total_t5_params],
        color=['lightblue', 'lightcoral'], alpha=0.8)
ax2.set_ylabel('Total Parameters (Millions)')
ax2.set_title('Model Complexity Comparison', fontweight='bold')
ax2.grid(True, alpha=0.3)

# Add value labels just above each bar
for i, v in enumerate([total_trad_params, total_t5_params]):
    ax2.text(i, v + 10, f'{v}M', ha='center', va='bottom', fontweight='bold')

# Task-specific complexity: size of the model that serves each task
ax3.bar(x - width/2, [trad_complex[task] for task in tasks], width,
        label='Traditional Models', alpha=0.8, color='lightblue')
ax3.bar(x + width/2, [t5_complex[task] for task in tasks], width,
        label='T5', alpha=0.8, color='lightcoral')

ax3.set_xlabel('Tasks')
ax3.set_ylabel('Parameters (Millions)')
ax3.set_title('Per-Task Model Complexity', fontweight='bold')
ax3.set_xticks(x)
ax3.set_xticklabels(tasks, rotation=45)
ax3.legend()
ax3.grid(True, alpha=0.3)

# Performance variance
trad_scores = list(trad_perf.values())
t5_scores = list(t5_perf.values())

# Tick labels are set explicitly instead of via boxplot(labels=...): that
# keyword is deprecated in newer Matplotlib releases (renamed `tick_labels`),
# while the explicit calls work on old and new versions alike.
ax4.boxplot([trad_scores, t5_scores])
ax4.set_xticks([1, 2])  # boxplot places the boxes at positions 1..N by default
ax4.set_xticklabels(['Traditional', 'T5'])
ax4.set_ylabel('Performance Score')
ax4.set_title('Performance Consistency Across Tasks', fontweight='bold')
ax4.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Print detailed comparison
print("TRADITIONAL vs T5 COMPARISON")
print("=" * 50)
print(f"Traditional approach - Total parameters: {total_trad_params}M")
print(f"T5 approach - Total parameters: {total_t5_params}M")
print(f"Parameter efficiency: T5 is {total_trad_params/total_t5_params:.1f}x more efficient")
print()

print("Performance Analysis:")
trad_mean = np.mean(trad_scores)
t5_mean = np.mean(t5_scores)
trad_std = np.std(trad_scores)
t5_std = np.std(t5_scores)

print(f"Traditional - Mean: {trad_mean:.3f}, Std: {trad_std:.3f}")
print(f"T5 - Mean: {t5_mean:.3f}, Std: {t5_std:.3f}")
# A lower standard deviation across tasks is read as "more consistent"
print(f"T5 is {'more' if t5_std < trad_std else 'less'} consistent across tasks")

print("\nKey Insights:")
print("- T5 uses one model for all tasks vs. multiple specialized models")
print("- T5 shows more consistent performance across different tasks")
print("- Traditional models may excel in specific tasks but lack generalization")
print("- T5 enables easier deployment and maintenance with unified architecture")


In [None]:
# Final demonstration: Interactive T5-style task processor

class InteractiveT5Demo:
    """Rule-based stand-in for a T5 model that answers prefix-tagged inputs.

    No neural network is involved: each task is served by a keyword table or
    a simple heuristic. The point is only to show a single ``process_task``
    entry point dispatching on the task prefix.
    """

    # Task prefixes recognised by ``process_task``. Payload offsets are derived
    # from these strings so they cannot drift out of sync with the slicing.
    _SENTIMENT_PREFIX = 'sentiment:'
    _TRANSLATE_PREFIX = 'translate English to French:'
    _QUESTION_PREFIX = 'question:'
    _SUMMARIZE_PREFIX = 'summarize:'
    # Separator between the question and its context inside a QA input.
    _CONTEXT_MARKER = ' context: '
    # Number of leading words kept by the toy summarizer.
    _SUMMARY_WORDS = 5

    def __init__(self):
        self.formatter = T5TaskFormatter()

        # Simple rule-based responses for demonstration
        self.responses = {
            'sentiment': {
                'good': 'positive', 'great': 'positive', 'excellent': 'positive',
                'love': 'positive', 'amazing': 'positive', 'fantastic': 'positive',
                'bad': 'negative', 'terrible': 'negative', 'awful': 'negative',
                'hate': 'negative', 'horrible': 'negative', 'worst': 'negative',
                'okay': 'neutral', 'average': 'neutral', 'fine': 'neutral'
            },
            'translate_en_fr': {
                # "plaît" had been mis-decoded into mojibake; restored here.
                'hello': 'bonjour', 'goodbye': 'au revoir', 'thank you': 'merci',
                'yes': 'oui', 'no': 'non', 'please': 's\'il vous plaît',
                'how are you': 'comment allez-vous', 'good morning': 'bonjour'
            }
        }

    def process_sentiment(self, text: str) -> str:
        """Return the label of the first sentiment keyword found in ``text``.

        Matching is case-insensitive and substring-based, and keywords are
        tried in the table's insertion order, so the first hit wins (a keyword
        embedded in a longer word also counts). Returns ``'neutral'`` when no
        keyword occurs.
        """
        text_lower = text.lower()
        for word, sentiment in self.responses['sentiment'].items():
            if word in text_lower:
                return sentiment
        return 'neutral'

    def process_translation(self, text: str) -> str:
        """Look up the whole phrase (lower-cased, stripped) in the en->fr table.

        Returns ``'translation not available'`` when the phrase is not an
        exact key of the table.
        """
        text_lower = text.lower().strip()
        return self.responses['translate_en_fr'].get(text_lower, 'translation not available')

    def process_qa(self, question: str, context: str) -> str:
        """Answer ``question`` from ``context`` using a few keyword rules.

        Handles "capital" questions about France or Japan and "who invented"
        questions whose context mentions Bell; anything else yields
        ``'answer not found'``.
        """
        # Simple keyword-based QA
        q_lower = question.lower()
        c_lower = context.lower()

        if 'capital' in q_lower:
            if 'france' in c_lower and 'paris' in c_lower:
                return 'Paris'
            elif 'japan' in c_lower and 'tokyo' in c_lower:
                return 'Tokyo'

        if 'who' in q_lower and 'invented' in q_lower:
            if 'bell' in c_lower:
                return 'Alexander Graham Bell'

        return 'answer not found'

    def process_task(self, task_input: str) -> str:
        """Dispatch a prefix-tagged input to the matching handler.

        Args:
            task_input: Text beginning with one of the supported prefixes
                (``sentiment:``, ``translate English to French:``,
                ``question:``, ``summarize:``).

        Returns:
            The handler's answer, or ``'Task not supported in this demo'``
            when the prefix is not recognised.
        """
        task_input = task_input.strip()

        if task_input.startswith(self._SENTIMENT_PREFIX):
            text = task_input[len(self._SENTIMENT_PREFIX):].strip()
            return self.process_sentiment(text)

        if task_input.startswith(self._TRANSLATE_PREFIX):
            text = task_input[len(self._TRANSLATE_PREFIX):].strip()
            return self.process_translation(text)

        if task_input.startswith(self._QUESTION_PREFIX):
            payload = task_input[len(self._QUESTION_PREFIX):]
            # Split on the FIRST marker only, so a context that itself contains
            # " context: " stays intact. A missing context becomes '' and is
            # still answered by the QA handler instead of being misreported
            # as an unsupported task.
            question, _, context = payload.partition(self._CONTEXT_MARKER)
            return self.process_qa(question.strip(), context.strip())

        if task_input.startswith(self._SUMMARIZE_PREFIX):
            words = task_input[len(self._SUMMARIZE_PREFIX):].split()
            # Simple summarization - keep the first few words; the ellipsis is
            # added only when something was actually cut off.
            if len(words) <= self._SUMMARY_WORDS:
                return ' '.join(words)
            return ' '.join(words[:self._SUMMARY_WORDS]) + '...'

        return 'Task not supported in this demo'

# Instantiate the rule-based demo processor
demo = InteractiveT5Demo()

# One prefixed input per supported task, plus an unsupported prefix at the end
test_inputs = [
    "sentiment: I love this new phone!",
    "sentiment: This product is terrible.",
    "translate English to French: hello",
    "translate English to French: thank you",
    "question: What is the capital of France? context: Paris is the capital of France.",
    "summarize: Machine learning is a powerful technology that enables computers to learn from data and make predictions.",
    "generate: The weather today is"  # Not implemented
]

print("T5 INTERACTIVE DEMO", "=" * 40, "Testing various T5-formatted tasks:", "", sep="\n")

# Feed every input through the single entry point and show what comes back
for test_number, prompt in enumerate(test_inputs, start=1):
    answer = demo.process_task(prompt)
    print(
        f"Test {test_number}:",
        f"  Input:  {prompt}",
        f"  Output: {answer}",
        "",
        sep="\n",
    )

print(
    "=" * 40,
    "This demonstrates how T5 processes different tasks",
    "using the same input-output format!",
    "In a real T5 model, all these tasks would be",
    "handled by the same neural network architecture.",
    sep="\n",
)
