In [None]:
!pip install nltk rouge-score bert-score numpy torch transformers underthesea

In [None]:
from typing import Dict, List

import numpy as np
from bert_score import BERTScorer, score
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
from rouge_score import rouge_scorer
from tqdm import tqdm
from underthesea import word_tokenize

class _WhitespaceTokenizer:
    """Tokenizer for ``RougeScorer`` that splits already-segmented text on whitespace.

    rouge_score's default tokenizer keeps only ``[a-z0-9]`` characters, which
    breaks Vietnamese words containing diacritics into meaningless fragments.
    """

    def tokenize(self, text: str) -> List[str]:
        return text.split()


def _drop_punctuation(tokens: List[str]) -> List[str]:
    """Return only the tokens that contain at least one letter or digit (Unicode-aware)."""
    return [tok for tok in tokens if any(ch.isalnum() for ch in tok)]


def calculate_vqa_metrics(references: List[str], hypotheses: List[str], bert_batch_size: int = 16) -> Dict[str, float]:
    """
    Calculate BLEU, ROUGE-1, ROUGE-2, ROUGE-L, and BERTScore metrics for VQA model evaluation in Vietnamese.

    BLEU and ROUGE are computed per sentence pair on lower-cased underthesea
    word-segmented tokens and then averaged. ROUGE additionally ignores
    punctuation-only tokens. BERTScore is computed with ``xlm-roberta-base``.

    Args:
        references: List of reference answers (ground truth) for each question in Vietnamese.
        hypotheses: List of predicted answers from the VQA model in Vietnamese.
        bert_batch_size: Batch size for BERTScore computation (default 16).

    Returns:
        Dictionary with the mean of each metric under the keys 'BLEU',
        'ROUGE-1', 'ROUGE-2', 'ROUGE-L' (F-measures), 'BERTScore_Precision',
        'BERTScore_Recall' and 'BERTScore_F1'. If BERTScore fails for any
        reason, a warning is printed and the three BERTScore entries are 0.0.

    Raises:
        ValueError: If the two lists differ in length, are empty, or if
            ``bert_batch_size`` is not positive.
    """
    if len(references) != len(hypotheses):
        raise ValueError("Number of references and hypotheses must match.")
    if not references or not hypotheses:
        raise ValueError("Input lists cannot be empty.")
    if bert_batch_size < 1:
        # Previously 0 was swallowed by the BERTScore fallback and negatives produced NaN means.
        raise ValueError("bert_batch_size must be a positive integer.")

    bleu_scores = []
    rouge_1_scores = []
    rouge_2_scores = []
    rouge_l_scores = []
    bert_p_scores = []
    bert_r_scores = []
    bert_f1_scores = []

    # Score ROUGE on the underthesea tokens rather than on raw text: the
    # default rouge_score tokenizer is ASCII-only and mangles Vietnamese.
    rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=False,
                                     tokenizer=_WhitespaceTokenizer())
    smoothing = SmoothingFunction().method1  # loop-invariant, build once

    # Calculate BLEU and ROUGE
    for ref, hyp in tqdm(zip(references, hypotheses), desc = "Calculating BLEU & ROUGE", total=len(references)):
        ref_tokens = word_tokenize(ref.lower(), format = "text").split()
        hyp_tokens = word_tokenize(hyp.lower(), format = "text").split()

        try:
            bleu = sentence_bleu([ref_tokens], hyp_tokens, smoothing_function=smoothing)
        except ZeroDivisionError:
            bleu = 0.0
        bleu_scores.append(bleu)

        # Punctuation is excluded for ROUGE, matching the usual ROUGE convention.
        rouge_scores = rouge.score(" ".join(_drop_punctuation(ref_tokens)),
                                   " ".join(_drop_punctuation(hyp_tokens)))
        rouge_1_scores.append(rouge_scores['rouge1'].fmeasure)
        rouge_2_scores.append(rouge_scores['rouge2'].fmeasure)
        rouge_l_scores.append(rouge_scores['rougeL'].fmeasure)

    # Calculate BERTScore in batches with tqdm
    print("Calculating BERTScore...")
    try:
        # Build the scorer once so the model is loaded a single time instead
        # of once per batch.
        bert_scorer = BERTScorer(model_type="xlm-roberta-base", lang="vi",
                                 batch_size=bert_batch_size)
        for i in tqdm(range(0, len(hypotheses), bert_batch_size), desc="BERTScore batches"):
            batch_hyps = hypotheses[i:i+bert_batch_size]
            batch_refs = references[i:i+bert_batch_size]
            P, R, F1 = bert_scorer.score(batch_hyps, batch_refs,
                                         verbose=False, batch_size=bert_batch_size)
            bert_p_scores.extend(P.tolist())
            bert_r_scores.extend(R.tolist())
            bert_f1_scores.extend(F1.tolist())
    except Exception as e:
        # Best-effort: keep the lexical metrics usable even when the model
        # cannot be loaded or run; partial BERTScore results are discarded.
        print(f"Warning: BERTScore calculation failed: {e}")
        bert_p_scores = [0.0] * len(hypotheses)
        bert_r_scores = [0.0] * len(hypotheses)
        bert_f1_scores = [0.0] * len(hypotheses)

    # Compute average scores
    metrics = {
        'BLEU': float(np.mean(bleu_scores)),
        'ROUGE-1': float(np.mean(rouge_1_scores)),
        'ROUGE-2': float(np.mean(rouge_2_scores)),
        'ROUGE-L': float(np.mean(rouge_l_scores)),
        'BERTScore_Precision': float(np.mean(bert_p_scores)),
        'BERTScore_Recall': float(np.mean(bert_r_scores)),
        'BERTScore_F1': float(np.mean(bert_f1_scores))
    }

    return metrics

In [None]:
# Example usage
if __name__ == "__main__":
    # Two toy Vietnamese pairs: an exact match and a shorter prediction.
    refs = ["Con mèo nằm trên thảm.", "Bầu trời màu xanh."]
    hyps = ["Con mèo nằm trên thảm.", "Trời màu xanh."]

    try:
        results = calculate_vqa_metrics(refs, hyps)
        # Print every averaged metric with four decimal places.
        for metric_name in results:
            metric_value = results[metric_name]
            print(f"{metric_name}: {metric_value:.4f}")
    except Exception as e:
        print(f"Error in metric calculation: {e}")