In [1]:
import torch
from transformers import MarianMTModel, MarianTokenizer
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
from nltk.translate.meteor_score import meteor_score
from rouge import Rouge

# Load model and tokenizer
device = torch.device("cpu")
model_name = "Helsinki-NLP/opus-mt-en-vi"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name, use_safetensors=True).to(device)

In [None]:
def translate_en_to_vi(sentence: str) -> str:
    inputs = tokenizer(sentence, return_tensors="pt", padding=True)
    translated = model.generate(**inputs)
    return tokenizer.decode(translated[0], skip_special_tokens=True)

In [None]:
def evaluate(sentences, translate_func):
    """
    Evaluate translation quality with BLEU, METEOR, and ROUGE.
    
    Parameters
    ----------
    sentences: list of (english_sentence, reference_vietnamese_sentence)
    translate_func: the translate function
    """
    rouge = Rouge()
    all_references, all_predictions = [], []
    bleu_scores, meteor_scores, rouge_scores = [], [], []

    for en_sentence, ref_vi in sentences:
        pred_vi = translate_func(en_sentence)
        
        # BLEU
        # Collect for corpus BLEU
        all_references.append([ref_vi.split()])  # BLEU expects list of list
        all_predictions.append(pred_vi.split())

        # Sentence-level BLEU
        bleu = sentence_bleu([ref_vi.split()], pred_vi.split())
        bleu_scores.append(bleu)
        
        # METEOR
        meteor = meteor_score([ref_vi.split()], pred_vi.split())
        meteor_scores.append(meteor)
        
        # ROUGE-L F1
        rouge_result = rouge.get_scores(pred_vi, ref_vi)[0]["rouge-l"]["f"]
        rouge_scores.append(rouge_result)
        
        print(f"\nEN: {en_sentence}")
        print(f"Reference VI: {ref_vi}")
        print(f"Predicted VI: {pred_vi}")
        print(f"BLEU: {bleu:.4f}, METEOR: {meteor:.4f}, ROUGE-L F1: {rouge_result:.4f}")

    # Corpus BLEU
    corpus_bleu_score = corpus_bleu(all_references, all_predictions)
    
    print("\n--- AVERAGE METRICS ---")
    print(f"Sentence-level BLEU (avg): {sum(bleu_scores)/len(bleu_scores):.4f}")
    print(f"Corpus BLEU: {corpus_bleu_score:.4f}")
    print(f"METEOR: {sum(meteor_scores)/len(meteor_scores):.4f}")
    print(f"ROUGE-L F1: {sum(rouge_scores)/len(rouge_scores):.4f}")

In [None]:
dataset = [
    ("The weather is nice today.", "Thời tiết hôm nay thật đẹp."),
    ("I love learning new languages.", "Tôi thích học những ngôn ngữ mới."),
    ("This book is very interesting.", "Cuốn sách này rất thú vị."),
    ("Can you help me with my homework?", "Bạn có thể giúp tôi làm bài tập về nhà không?"),
]

evaluate(dataset, translate_func=translate_en_to_vi)
    

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()



EN: The weather is nice today.
Reference VI: Thời tiết hôm nay thật đẹp.
Predicted VI: Hôm nay thời tiết đẹp quá.
BLEU: 0.0000, METEOR: 0.6250, ROUGE-L F1: 0.3333

EN: I love learning new languages.
Reference VI: Tôi thích học những ngôn ngữ mới.
Predicted VI: Tôi thích học ngoại ngữ mới.
BLEU: 0.0000, METEOR: 0.7014, ROUGE-L F1: 0.7692

EN: This book is very interesting.
Reference VI: Cuốn sách này rất thú vị.
Predicted VI: Cuốn sách này rất thú vị.
BLEU: 1.0000, METEOR: 0.9977, ROUGE-L F1: 1.0000

EN: Can you help me with my homework?
Reference VI: Bạn có thể giúp tôi làm bài tập về nhà không?
Predicted VI: Cậu giúp tớ làm bài tập được không?
BLEU: 0.0000, METEOR: 0.4168, ROUGE-L F1: 0.5263

--- AVERAGE METRICS ---
Sentence-level BLEU (avg): 0.2500
Corpus BLEU: 0.3320
METEOR: 0.6852
ROUGE-L F1: 0.6572
