In [None]:
!pip install rouge-score==0.0.4

Collecting rouge-score==0.0.4
  Downloading rouge_score-0.0.4-py2.py3-none-any.whl.metadata (3.8 kB)
Downloading rouge_score-0.0.4-py2.py3-none-any.whl (22 kB)
Installing collected packages: rouge-score
Successfully installed rouge-score-0.0.4


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import json
from tqdm  import tqdm
import nltk
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction
import os
import tensorflow as tf
from rouge_score import rouge_scorer
# import bleurt.score

# Fetch NLTK data packages when the cell runs.
# NOTE(review): the code visible in this cell tokenizes answers with
# str.split() and does not appear to call WordNet ('omw-1.4') or the punkt
# tokenizer directly — confirm whether another cell or a dependency needs
# these packages before removing the downloads.
nltk.download('omw-1.4')
nltk.download('punkt')


def calculate_vqa_score(standard_answer, generated_answer):
    """
    Compute the "VQA score" between a reference answer and a generated answer.

    The two answers are vectorized together with a freshly fitted
    TfidfVectorizer and the cosine similarity of the two TF-IDF rows is
    returned.

    Args:
        standard_answer: reference (ground-truth) answer string.
        generated_answer: answer string produced by the model.

    Returns:
        Similarity as a float between 0 and 1. 0.0 is returned when neither
        answer contains a token the vectorizer can extract.
    """
    vectorizer = TfidfVectorizer()

    try:
        # Row 0 is the reference answer, row 1 the generated answer.
        tfidf_matrix = vectorizer.fit_transform([standard_answer, generated_answer])
    except ValueError as err:
        # The vectorizer refuses to fit when no document yields a token
        # (e.g. both answers are empty or only single characters). Two
        # answers without any comparable term share nothing, so score 0
        # instead of aborting the whole evaluation.
        if "empty vocabulary" not in str(err):
            raise
        return 0.0

    score = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]

    # Plain float so both return paths have the same type.
    return float(score)

def calculate_bleu_score(reference_answer, candidate_answer):
    """
    Compute the sentence-level BLEU score of a candidate against references.

    Args:
        reference_answer: a list of reference answer strings, or a single
            reference string (treated as a one-element list).
        candidate_answer: answer string produced by the model.

    Returns:
        The BLEU score returned by nltk's sentence_bleu.
    """
    # A bare string must be wrapped: iterating it directly would yield single
    # characters, and every character would be scored as its own "reference".
    if isinstance(reference_answer, str):
        reference_answer = [reference_answer]

    # Normalize: lowercase, trim surrounding whitespace, split on whitespace.
    references = [ref.lower().strip().split() for ref in reference_answer]
    hypothesis = candidate_answer.lower().strip().split()

    # method1 smoothing replaces zero n-gram match counts with a small
    # epsilon, so a missing higher-order n-gram does not zero the whole score.
    smoothing = SmoothingFunction().method1
    bleu_score = sentence_bleu(references, hypothesis, smoothing_function=smoothing)

    return bleu_score


def calculate_rouge_l(standard_answer, generated_answer):
    """
    Score a generated answer against the reference answer with ROUGE-L.

    Args:
        standard_answer: reference (ground-truth) answer string, passed as
            the first argument to the scorer.
        generated_answer: answer string produced by the model, passed as the
            second argument to the scorer.

    Returns:
        The mapping returned by RougeScorer.score; its only key is 'rougeL'.
    """
    # A new, non-stemming ROUGE-L scorer is built on every call.
    rouge_l = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=False)
    return rouge_l.score(standard_answer, generated_answer)




# Evaluate every *.json result file in `directory` and print, per file, the
# average TF-IDF cosine ("VQA") score, BLEU score and ROUGE-L
# precision/recall/F-measure over all of its Q&A pairs.
directory = "data"
# NOTE(review): this stemming scorer is not used by the loop below —
# calculate_rouge_l builds its own non-stemming scorer. It is kept only so the
# module-level name keeps existing for any other cell that may rely on it.
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
scores = []
# Sorted so the report order is reproducible between runs.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith(".json"):
        continue

    filepath = os.path.join(directory, filename)
    # Close the file as soon as it is parsed; scoring does not need it open.
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Reset all accumulators per file. Keeping one ROUGE list across files
    # would mix earlier files' pairs into every later file's averages.
    scores = []
    vqa_score = 0
    bleu_score = 0
    for item in tqdm(data):
        for item2 in item["Q&A"]:
            standard_answer = item2["ground_truth"]
            generated_answer = item2["answer"]
            vqa_score += calculate_vqa_score(standard_answer, generated_answer)
            # BLEU takes a list of references; wrap the single ground truth.
            bleu_score += calculate_bleu_score([standard_answer], generated_answer)
            score = calculate_rouge_l(standard_answer, generated_answer)
            scores.append(score['rougeL'])

    # One ROUGE entry is appended per Q&A pair, so this is the pair count.
    n = len(scores)
    if n == 0:
        # Nothing to average; avoid dividing by zero.
        print(f"Filename: {filename}")
        print("No Q&A pairs found; skipping.")
        continue

    average_scores = {
        'rougeL': {
            'precision': sum(s.precision for s in scores) / n,
            'recall': sum(s.recall for s in scores) / n,
            'fmeasure': sum(s.fmeasure for s in scores) / n,
        }
    }

    fin_vqa_score = vqa_score / n
    fin_bleu_score = bleu_score / n

    print(f"Filename: {filename}")
    print(f"VQA Score: {fin_vqa_score:.4f}")
    print(f"bleu Score: {fin_bleu_score:.4f}")
    for key, value in average_scores.items():
        print(f"{key}: Precision={value['precision']:.4f}, Recall={value['recall']:.4f}, F-Measure={value['fmeasure']:.4f}")

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
100%|██████████| 1016/1016 [00:31<00:00, 32.59it/s]


Filename: eval_qwen2-2B_95.json
VQA Score: 0.1381
bleu Score: 0.0333
rougeL: Precision=0.1302, Recall=0.1968, F-Measure=0.1365


100%|██████████| 1016/1016 [00:19<00:00, 53.28it/s]


Filename: eval_geo_llava95.json
VQA Score: 0.0484
bleu Score: 0.0502
rougeL: Precision=0.0869, Recall=0.1323, F-Measure=0.0883


100%|██████████| 1040/1040 [00:19<00:00, 53.72it/s]


Filename: eval_geo_RAG_20llava7B.json
VQA Score: 0.1287
bleu Score: 0.0407
rougeL: Precision=0.1000, Recall=0.1464, F-Measure=0.0980


100%|██████████| 1016/1016 [00:18<00:00, 54.82it/s]


Filename: eval_qwen2-7B_95.json
VQA Score: 0.1560
bleu Score: 0.0361
rougeL: Precision=0.1209, Recall=0.1576, F-Measure=0.1103


100%|██████████| 1016/1016 [00:18<00:00, 55.13it/s]


Filename: eval_qwen2-7B_RAG.json
VQA Score: 0.3144
bleu Score: 0.0261
rougeL: Precision=0.1707, Recall=0.1981, F-Measure=0.1509


100%|██████████| 1016/1016 [00:17<00:00, 57.47it/s]


Filename: eval_qwen2-7B_20RAG.json
VQA Score: 0.2657
bleu Score: 0.0299
rougeL: Precision=0.1932, Recall=0.2171, F-Measure=0.1695


100%|██████████| 1016/1016 [00:29<00:00, 34.65it/s]


Filename: eval_geo_minicpm95.json
VQA Score: 0.1673
bleu Score: 0.0092
rougeL: Precision=0.1776, Recall=0.2328, F-Measure=0.1619


100%|██████████| 1016/1016 [00:26<00:00, 37.91it/s]


Filename: eval_geochat_95.json
VQA Score: 0.1574
bleu Score: 0.0031
rougeL: Precision=0.1715, Recall=0.2315, F-Measure=0.1570


100%|██████████| 1016/1016 [00:21<00:00, 47.93it/s]

Filename: eval_geo_RAG_llava7B.json
VQA Score: 0.1785
bleu Score: 0.0362
rougeL: Precision=0.1727, Recall=0.2323, F-Measure=0.1580



