In [None]:
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter, defaultdict
import re
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import warnings
warnings.filterwarnings('ignore')

# Seed the PyTorch and NumPy random number generators with a fixed value.
torch.manual_seed(42)
np.random.seed(42)

# Select the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Plot styling: matplotlib style sheet 'seaborn-v0_8' and seaborn's "husl" palette.
# NOTE(review): the 'seaborn-v0_8' style name appears to exist only in newer
# matplotlib releases (3.6+) — confirm against the pinned matplotlib version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("Ready to evaluate translation models!")


In [None]:
def calculate_bleu_score(reference, candidate):
    """
    Calculate the sentence-level BLEU score for a single translation.

    Args:
        reference: List of reference words (tokens).
        candidate: List of candidate words (tokens).

    Returns:
        BLEU score (0-1). Returns 0.0 when either token list is empty or
        when NLTK fails to score the pair.
    """
    # An empty side has no n-grams to overlap with the other, so score it
    # as 0.0 directly instead of relying on NLTK's handling of empty input.
    if not reference or not candidate:
        return 0.0

    # Use smoothing to handle short sentences (few or no higher-order n-grams).
    smoothing = SmoothingFunction().method1
    try:
        return sentence_bleu([reference], candidate, smoothing_function=smoothing)
    except Exception:
        # Deliberate best-effort: a pair that cannot be scored counts as 0.0.
        # `Exception` (not a bare `except:`) lets KeyboardInterrupt and
        # SystemExit propagate so a long evaluation run can still be aborted.
        return 0.0

def _quality_label(bleu):
    """Map a BLEU score to the quality label shown in the per-translation report."""
    if bleu > 0.8:
        return "Excellent ✅"
    if bleu > 0.5:
        return "Good 👍"
    if bleu > 0.3:
        return "Fair 😐"
    return "Poor ❌"


def evaluate_translations(translations):
    """
    Evaluate a list of translation pairs and print a per-item and overall report.

    Each reference/candidate string is split on whitespace and scored with
    calculate_bleu_score. The absolute word-count difference and
    case-insensitive exact matches are tracked alongside the BLEU scores.

    Args:
        translations: List of (reference, candidate, attention_weights) tuples.
            The attention weights are unpacked but not used here.

    Returns:
        Dictionary with evaluation metrics:
            'avg_bleu': mean BLEU score over all pairs
            'bleu_scores': list of per-pair BLEU scores
            'avg_length_diff': mean absolute word-count difference
            'perfect_rate': fraction of case-insensitive exact matches
        For an empty input list all averages are 0.0 and 'bleu_scores' is [].
    """
    bleu_scores = []
    total_length_diff = 0
    perfect_matches = 0

    print("TRANSLATION EVALUATION RESULTS")
    print("=" * 50)

    for number, (reference, candidate, _attention) in enumerate(translations, start=1):
        # Whitespace tokenization for BLEU and for the length comparison
        ref_words = reference.split()
        cand_words = candidate.split()

        bleu = calculate_bleu_score(ref_words, cand_words)
        bleu_scores.append(bleu)

        length_diff = abs(len(ref_words) - len(cand_words))
        total_length_diff += length_diff

        # A perfect match compares the raw strings, ignoring case only
        if reference.lower() == candidate.lower():
            perfect_matches += 1

        # Print individual results
        print(f"\nTranslation {number}:")
        print(f"  Reference: {reference}")
        print(f"  Candidate: {candidate}")
        print(f"  BLEU Score: {bleu:.3f}")
        print(f"  Length Diff: {length_diff} words")
        print(f"  Quality: {_quality_label(bleu)}")

    # Calculate overall metrics. With no translations there is nothing to
    # average, so report zeros instead of dividing by zero (and instead of the
    # NaN that np.mean returns for an empty list).
    total = len(translations)
    if total:
        avg_bleu = np.mean(bleu_scores)
        avg_length_diff = total_length_diff / total
        perfect_rate = perfect_matches / total
    else:
        avg_bleu = 0.0
        avg_length_diff = 0.0
        perfect_rate = 0.0

    print("\nOVERALL EVALUATION METRICS:")
    print(f"  Average BLEU Score: {avg_bleu:.3f}")
    print(f"  Average Length Difference: {avg_length_diff:.1f} words")
    print(f"  Perfect Match Rate: {perfect_rate:.1%}")
    print(f"  Total Translations: {total}")

    return {
        'avg_bleu': avg_bleu,
        'bleu_scores': bleu_scores,
        'avg_length_diff': avg_length_diff,
        'perfect_rate': perfect_rate
    }

# Sample input for evaluate_translations: (reference, candidate, attention_weights)
# tuples, with the attention slot left as None.
# NOTE(review): the first element of each tuple is English and the second is
# French, but evaluate_translations scores the second string against the first
# using BLEU (word overlap). These pairs share almost no tokens, so they will
# presumably all score near zero and be rated "Poor". If the goal is to score
# model output, the first element should probably be a French reference
# translation — confirm the intended data layout.
sample_translations = [
    ("Hello", "Bonjour", None),
    ("How are you", "Comment allez-vous", None),
    ("I love you", "Je t'aime", None),
    ("The cat is sleeping", "Le chat dort", None),
    ("What time is it", "Quelle heure est-il", None),
    ("Good morning my friend", "Bonjour mon ami", None),
    ("I am learning French", "J'apprends le français", None),
    ("The weather is beautiful today", "Il fait beau aujourd'hui", None),
]

# Run the evaluation on the samples; this prints the report and keeps the
# returned metrics dictionary in evaluation_results.
evaluation_results = evaluate_translations(sample_translations)
