In [2]:
%%capture
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("HF_TOKEN")
from huggingface_hub import login
login(token=f"{secret_value_0}",write_permission=True)  # Enter your HF token when prompted
!pip install sacrebleu pandas tqdm

In [7]:
import torch
from transformers import MarianMTModel, MarianTokenizer
from tqdm.auto import tqdm
import pandas as pd
from sacrebleu.metrics import BLEU
from typing import List, Dict, Tuple
import re
import warnings
warnings.filterwarnings('ignore')

class LojbanTranslationEvaluator:
    def __init__(self):
        self.model_name = "woctordho/lojban-translation"
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.bleu = BLEU()
        
        # Lojban normalization rules
        self.normalization_rules = {
            'lo lo': 'le',
            'la la': 'le',
            'zvati': 'cpana',  # Common spatial relationship equivalents
            'se bangu': 'bangu',  # Language usage equivalents
        }
        
    def normalize_lojban(self, text: str) -> str:
        """Enhanced Lojban text normalization."""
        text = re.sub(r'\s+', ' ', text.strip())
        text = re.sub(r'\.+', '.', text)  # Normalize multiple dots
        text = re.sub(r'\s*\.\s*', ' . ', text)  # Normalize spacing around dots
        
        # Apply normalization rules
        for old, new in self.normalization_rules.items():
            text = text.replace(old, new)
            
        # Remove unnecessary particles in certain contexts
        text = re.sub(r'\.i\s+', '', text)  # Remove .i at start
        
        return text.strip()

    def semantic_similarity(self, translation: str, reference: str) -> float:
        """Calculate semantic similarity between translation and reference."""
        trans_words = set(translation.split())
        ref_words = set(reference.split())
        
        intersection = len(trans_words.intersection(ref_words))
        union = len(trans_words.union(ref_words))
        
        return intersection / union if union > 0 else 0.0

    def load_test_data(self) -> List[Dict[str, str]]:
        """Load test data with multiple reference translations."""
        return [
            {
                "english": "Hello, how are you?",
                "references": ["coi do mo", "coi .i do mo", "coi pei"],
                "category": "greetings",
                "complexity": "simple"
            },
            {
                "english": "The red book is on the big table",
                "references": ["le xunre cukta cu cpana le barda jubme", 
                             "lo xunre cukta cu zvati lo barda jubme"],
                "category": "spatial",
                "complexity": "complex"
            },
            {
                "english": "I really love learning Lojban",
                "references": ["mi mutce nelci lo nu cilre la lojban",
                             "mi nelci lo nu cilre la lojban"],
                "category": "emotions",
                "complexity": "complex"
            },
            {
                "english": "What time is it?",
                "references": ["ma tcika", "lo tcika cu mo"],
                "category": "questions",
                "complexity": "simple"
            },
            {
                "english": "The weather is nice today",
                "references": ["le temci cu xamgu ca lo cabdei",
                             "lo temci be ca lo cabdei cu xamgu"],
                "category": "descriptions",
                "complexity": "medium"
            }
        ]

    def evaluate_translation(self, 
                           translation: str, 
                           references: List[str], 
                           category: str) -> Dict[str, float]:
        """Evaluate a single translation against multiple references."""
        # Normalize all texts
        norm_translation = self.normalize_lojban(translation)
        norm_references = [self.normalize_lojban(ref) for ref in references]
        
        # Calculate metrics
        metrics = {
            "bleu": max(self.bleu.corpus_score([norm_translation], [[ref]]).score 
                       for ref in norm_references),
            "exact_match": any(norm_translation == ref for ref in norm_references),
            "semantic_similarity": max(self.semantic_similarity(norm_translation, ref) 
                                    for ref in norm_references),
            "valid_lojban": bool(re.search(r'\b(cu|lo|le|la)\b', translation))
        }
        
        return metrics

    def translate_text(self, 
                      model: MarianMTModel, 
                      tokenizer: MarianTokenizer, 
                      text: str) -> List[str]:
        """Generate translation variants using different beam search parameters."""
        inputs = tokenizer(text, return_tensors="pt", padding=True).to(self.device)
        
        # Generate with different beam widths
        translations = []
        
        # Standard beam search
        outputs = model.generate(
            **inputs,
            max_length=50,
            num_beams=5,
            length_penalty=1.0,
            no_repeat_ngram_size=2,
            early_stopping=True
        )
        translations.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
        
        # Wider beam search
        outputs = model.generate(
            **inputs,
            max_length=50,
            num_beams=8,
            length_penalty=0.8,
            no_repeat_ngram_size=3,
            early_stopping=True
        )
        translations.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
        
        # More conservative beam search
        outputs = model.generate(
            **inputs,
            max_length=50,
            num_beams=4,
            length_penalty=1.2,
            no_repeat_ngram_size=2,
            early_stopping=True
        )
        translations.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
        
        return list(set(translations))  # Remove duplicates

    def run_evaluation(self) -> pd.DataFrame:
        """Run comprehensive evaluation."""
        print("Loading model...")
        model = MarianMTModel.from_pretrained(self.model_name).to(self.device)
        tokenizer = MarianTokenizer.from_pretrained(self.model_name)
        
        results = []
        test_data = self.load_test_data()
        
        print("Running translations...")
        for item in tqdm(test_data):
            translations = self.translate_text(model, tokenizer, item["english"])
            
            # Evaluate each variant
            variant_metrics = []
            for trans in translations:
                metrics = self.evaluate_translation(
                    trans, item["references"], item["category"])
                variant_metrics.append(metrics)
            
            # Select best variant based on BLEU score
            best_variant_idx = max(range(len(variant_metrics)), 
                                 key=lambda i: variant_metrics[i]["bleu"])
            
            results.append({
                "english": item["english"],
                "best_translation": translations[best_variant_idx],
                "references": item["references"],
                "category": item["category"],
                "complexity": item["complexity"],
                "metrics": variant_metrics[best_variant_idx],
                "all_variants": translations
            })
        
        return pd.DataFrame(results)

def main():
    evaluator = LojbanTranslationEvaluator()
    results = evaluator.run_evaluation()
    
    # Generate detailed report
    print("\nDetailed Translation Analysis:")
    print("=" * 80)
    
    # Overall statistics
    print("\nOverall Performance:")
    avg_bleu = results["metrics"].apply(lambda x: x["bleu"]).mean()
    avg_semantic = results["metrics"].apply(lambda x: x["semantic_similarity"]).mean()
    print(f"Average BLEU Score: {avg_bleu:.2f}")
    print(f"Average Semantic Similarity: {avg_semantic:.2f}")
    
    # Performance by category and complexity
    print("\nPerformance by Category:")
    for category in results["category"].unique():
        cat_results = results[results["category"] == category]
        avg_cat_bleu = cat_results["metrics"].apply(lambda x: x["bleu"]).mean()
        print(f"{category:15} - BLEU: {avg_cat_bleu:.2f}")
    
    print("\nPerformance by Complexity:")
    for complexity in results["complexity"].unique():
        comp_results = results[results["complexity"] == complexity]
        avg_comp_bleu = comp_results["metrics"].apply(lambda x: x["bleu"]).mean()
        print(f"{complexity:15} - BLEU: {avg_comp_bleu:.2f}")
    
    # Detailed examples
    print("\nDetailed Examples:")
    for _, row in results.iterrows():
        print(f"\nCategory: {row['category']} (Complexity: {row['complexity']})")
        print(f"English: {row['english']}")
        print("Translation Variants:")
        for i, variant in enumerate(row['all_variants'], 1):
            print(f"{i}. {variant}")
        print("References:")
        for i, ref in enumerate(row['references'], 1):
            print(f"{i}. {ref}")
        print("-" * 40)
    
    # Save results
    results.to_csv("lojban_translation_detailed_analysis.csv", index=False)
    print("\nDetailed results saved to lojban_translation_detailed_analysis.csv")

if __name__ == "__main__":
    main()

Loading model...
Running translations...


  0%|          | 0/5 [00:00<?, ?it/s]


Detailed Translation Analysis:

Overall Performance:
Average BLEU Score: 45.85
Average Semantic Similarity: 0.56

Performance by Category:
greetings       - BLEU: 46.31
spatial         - BLEU: 100.00
emotions        - BLEU: 61.05
questions       - BLEU: 0.00
descriptions    - BLEU: 21.87

Performance by Complexity:
simple          - BLEU: 23.15
complex         - BLEU: 80.52
medium          - BLEU: 21.87

Detailed Examples:

Category: greetings (Complexity: simple)
English: Hello, how are you?
Translation Variants:
1. coi do pei
2. coi pei pei
3. coi .i pei
References:
1. coi do mo
2. coi .i do mo
3. coi pei
----------------------------------------

Category: spatial (Complexity: complex)
English: The red book is on the big table
Translation Variants:
1. le xunre cukta cu zvati lo barda jubme
2. lo xunre cukta cu zvati lo barda jubme
References:
1. le xunre cukta cu cpana le barda jubme
2. lo xunre cukta cu zvati lo barda jubme
----------------------------------------

Category: emotio