In [None]:
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from collections import defaultdict, Counter
import re
import random
from typing import List, Dict, Tuple, Any
import math

# --- Notebook configuration ---
# Single source of truth for every random-number seed set below.
SEED = 42

# Set style for better visualizations.
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*'; older
# releases only know the legacy 'seaborn-*' names. Prefer the current name and
# fall back to the legacy one so this cell also runs on older installs.
DARKGRID_STYLE = 'seaborn-v0_8-darkgrid'
if DARKGRID_STYLE not in plt.style.available:
    DARKGRID_STYLE = 'seaborn-darkgrid'
plt.style.use(DARKGRID_STYLE)
sns.set_palette("husl")

# Set random seeds for reproducibility (PyTorch, NumPy and the stdlib RNG).
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Resolve the compute device once; the banner below is derived from it so the
# printed message and the actual device can never disagree.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print("Libraries imported successfully!")
print(f"PyTorch version: {torch.__version__}")
print(f"Device available: {'GPU' if device.type == 'cuda' else 'CPU'}")


In [None]:
# Overview of the T5 evaluation framework followed by the Week 5 recap.
# Emitted with a single print call: one argument per output line, and the
# empty string produces the blank separator line between the two sections.
print(
    "T5 EVALUATION FRAMEWORK",
    "=" * 40,
    "Key evaluation challenges:",
    "1. Multiple tasks require different metrics",
    "2. Text generation makes evaluation complex",
    "3. Need to balance task-specific vs overall performance",
    "4. Transfer effects between tasks must be measured",
    "",
    "Week 5 T5 learning complete! 🎉",
    "You've learned:",
    "- Text-to-text paradigm fundamentals",
    "- Multi-task learning strategies",
    "- Task balancing and sampling techniques",
    "- Evaluation approaches for complex models",
    sep="\n",
)


In [None]:
class T5EvaluationMetrics:
    """Lightweight evaluation metrics for T5 multi-task text outputs.

    Every public metric takes two parallel lists of strings (model
    predictions and reference targets), scores each pair, and returns the
    mean score over ``len(predictions)``. All public metrics:

    * raise ``ValueError`` when the two lists differ in length, and
    * return ``0.0`` when the lists are empty.
    """

    def __init__(self):
        pass

    @staticmethod
    def _check_lengths(predictions: List[str], targets: List[str]) -> None:
        """Raise ``ValueError`` unless both lists have the same length.

        Without this check ``zip`` would silently drop the unmatched tail
        while the mean is still taken over ``len(predictions)``.
        """
        if len(predictions) != len(targets):
            raise ValueError("Predictions and targets must have same length")

    # Classification/Sentiment Analysis Metrics
    def exact_match_accuracy(self, predictions: List[str], targets: List[str]) -> float:
        """Fraction of pairs that are equal after ``strip()`` and ``lower()``.

        Args:
            predictions: Predicted labels.
            targets: Reference labels, parallel to ``predictions``.

        Returns:
            Accuracy in ``[0, 1]``; ``0.0`` for empty input.

        Raises:
            ValueError: If the lists differ in length.
        """
        self._check_lengths(predictions, targets)
        if len(predictions) == 0:
            return 0.0

        correct = sum(
            pred.strip().lower() == target.strip().lower()
            for pred, target in zip(predictions, targets)
        )
        return correct / len(predictions)

    def fuzzy_match_accuracy(self, predictions: List[str], targets: List[str], threshold: float = 0.8) -> float:
        """Fraction of pairs whose character-set similarity reaches ``threshold``.

        Both strings are stripped and lowercased, then compared with
        ``_string_similarity``.

        Args:
            predictions: Predicted strings.
            targets: Reference strings, parallel to ``predictions``.
            threshold: Minimum similarity (inclusive) that counts as a match.

        Returns:
            Accuracy in ``[0, 1]``; ``0.0`` for empty input.

        Raises:
            ValueError: If the lists differ in length.
        """
        self._check_lengths(predictions, targets)
        if len(predictions) == 0:
            return 0.0

        correct = sum(
            self._string_similarity(pred.strip().lower(), target.strip().lower()) >= threshold
            for pred, target in zip(predictions, targets)
        )
        return correct / len(predictions)

    # Translation Metrics
    def bleu_score_simple(self, predictions: List[str], targets: List[str]) -> float:
        """Simplified BLEU: clipped unigram precision times a length penalty.

        Tokens are lowercase, whitespace-split. For each pair:

        * precision = clipped matches / number of predicted tokens, where a
          predicted token is credited at most as many times as it occurs in
          the target (so repeating a correct word does not inflate the score);
        * penalty = ``min(1, len(pred_tokens) / len(target_tokens))``.

        A pair with an empty prediction or an empty target scores 0.

        Args:
            predictions: Predicted sentences.
            targets: Reference sentences, parallel to ``predictions``.

        Returns:
            Mean per-pair score in ``[0, 1]``; ``0.0`` for empty input.

        Raises:
            ValueError: If the lists differ in length.
        """
        self._check_lengths(predictions, targets)
        if len(predictions) == 0:
            return 0.0

        total_score = 0.0
        for pred, target in zip(predictions, targets):
            pred_tokens = pred.lower().split()
            target_tokens = target.lower().split()

            # Either side empty -> this pair contributes 0 to the mean.
            if not pred_tokens or not target_tokens:
                continue

            # Counter intersection keeps min(count_in_pred, count_in_target)
            # per token, which is exactly the clipped match count.
            clipped = Counter(pred_tokens) & Counter(target_tokens)
            precision = sum(clipped.values()) / len(pred_tokens)

            # Length penalty: only predictions shorter than the target are penalised.
            bp = min(1.0, len(pred_tokens) / len(target_tokens))

            total_score += bp * precision

        return total_score / len(predictions)

    # Question Answering Metrics
    def f1_score_qa(self, predictions: List[str], targets: List[str]) -> float:
        """Mean token-set F1 between predicted and reference answers.

        Each string is lowercased and whitespace-split into a *set* of
        tokens, so repeated tokens count once. Two empty answers score 1.0;
        exactly one empty answer scores 0.0.

        Args:
            predictions: Predicted answers.
            targets: Reference answers, parallel to ``predictions``.

        Returns:
            Mean F1 in ``[0, 1]``; ``0.0`` for empty input.

        Raises:
            ValueError: If the lists differ in length.
        """
        self._check_lengths(predictions, targets)
        if len(predictions) == 0:
            return 0.0

        total_f1 = 0.0
        for pred, target in zip(predictions, targets):
            pred_tokens = set(pred.lower().split())
            target_tokens = set(target.lower().split())

            if not pred_tokens and not target_tokens:
                f1 = 1.0
            elif not pred_tokens or not target_tokens:
                f1 = 0.0
            else:
                common = pred_tokens & target_tokens
                precision = len(common) / len(pred_tokens)
                recall = len(common) / len(target_tokens)

                # No shared tokens -> both terms are 0; avoid 0/0.
                if precision + recall == 0:
                    f1 = 0.0
                else:
                    f1 = 2 * precision * recall / (precision + recall)

            total_f1 += f1

        return total_f1 / len(predictions)

    # Summarization Metrics
    def rouge_l_simple(self, predictions: List[str], targets: List[str]) -> float:
        """Simplified ROUGE-L: LCS length divided by target length (recall only).

        Tokens are lowercase, whitespace-split. A pair whose target has no
        tokens scores 0.

        Args:
            predictions: Predicted summaries.
            targets: Reference summaries, parallel to ``predictions``.

        Returns:
            Mean per-pair recall in ``[0, 1]``; ``0.0`` for empty input.

        Raises:
            ValueError: If the lists differ in length.
        """
        self._check_lengths(predictions, targets)
        if len(predictions) == 0:
            return 0.0

        total_rouge = 0.0
        for pred, target in zip(predictions, targets):
            pred_tokens = pred.lower().split()
            target_tokens = target.lower().split()

            if not target_tokens:
                continue  # empty reference contributes 0

            # Longest common subsequence of tokens, normalised by reference length.
            total_rouge += self._lcs_length(pred_tokens, target_tokens) / len(target_tokens)

        return total_rouge / len(predictions)

    def _string_similarity(self, s1: str, s2: str) -> float:
        """Jaccard similarity of the two strings' sets of distinct characters.

        Character order and repetition are ignored, so anagrams score 1.0.
        Two empty strings score 1.0; exactly one empty string scores 0.0.
        """
        if len(s1) == 0 and len(s2) == 0:
            return 1.0
        if len(s1) == 0 or len(s2) == 0:
            return 0.0

        chars1, chars2 = set(s1), set(s2)
        # Both strings are non-empty here, so the union is never empty.
        return len(chars1 & chars2) / len(chars1 | chars2)

    def _lcs_length(self, seq1: List[str], seq2: List[str]) -> int:
        """Length of the longest common subsequence of two token lists.

        Standard dynamic programme, keeping only the previous and current
        rows of the table instead of the full ``(m+1) x (n+1)`` grid.
        """
        prev_row = [0] * (len(seq2) + 1)

        for item1 in seq1:
            curr_row = [0]
            for j, item2 in enumerate(seq2, start=1):
                if item1 == item2:
                    # Extend the LCS of both prefixes by the matching token.
                    curr_row.append(prev_row[j - 1] + 1)
                else:
                    # Best of dropping the last token from either sequence.
                    curr_row.append(max(prev_row[j], curr_row[j - 1]))
            prev_row = curr_row

        return prev_row[-1]

# Build the metrics helper shared by all four demos below
evaluator = T5EvaluationMetrics()

# Banner for the smoke-test output
print("T5 EVALUATION METRICS TESTING", "=" * 40, sep="\n")

# --- Classification: exact vs. fuzzy accuracy on sentiment labels ---
sentiment_preds = ["positive", "negative", "positive", "neutral", "positive"]
sentiment_targets = ["positive", "negative", "neutral", "neutral", "positive"]

exact_acc = evaluator.exact_match_accuracy(sentiment_preds, sentiment_targets)
fuzzy_acc = evaluator.fuzzy_match_accuracy(sentiment_preds, sentiment_targets)

print(
    "Sentiment Analysis:",
    f"  Predictions: {sentiment_preds}",
    f"  Targets:     {sentiment_targets}",
    f"  Exact Accuracy: {exact_acc:.3f}",
    f"  Fuzzy Accuracy: {fuzzy_acc:.3f}",
    "",  # blank separator line
    sep="\n",
)

# --- Translation: simplified BLEU ---
translation_preds = ["bonjour le monde", "au revoir", "comment allez vous"]
translation_targets = ["bonjour monde", "au revoir", "comment allez-vous"]

bleu = evaluator.bleu_score_simple(translation_preds, translation_targets)
print("Translation:", f"  BLEU Score: {bleu:.3f}", "", sep="\n")

# --- Question answering: token-level F1 ---
qa_preds = ["Paris", "Alexander Bell", "Jupiter is largest"]
qa_targets = ["Paris", "Alexander Graham Bell", "Jupiter"]

f1 = evaluator.f1_score_qa(qa_preds, qa_targets)
print("Question Answering:", f"  F1 Score: {f1:.3f}", "", sep="\n")

# --- Summarization: simplified ROUGE-L ---
summ_preds = ["AI learns from data", "Deep learning uses neural networks", "NLP processes language"]
summ_targets = ["AI learns from data automatically", "Deep learning uses multiple neural network layers", "NLP helps computers understand language"]

rouge = evaluator.rouge_l_simple(summ_preds, summ_targets)
print("Summarization:", f"  ROUGE-L Score: {rouge:.3f}", sep="\n")
