### 模型评估脚本

这个脚本用于评估基于transformers库的模型在古文翻译任务上的性能。

- **模型加载**: 加载预训练模型和分词器。
- **文本翻译**: 通过模型将文言文翻译为白话文。
- **评估过程**: 对模型的翻译结果逐样本计算字符级BLEU分数并取平均，以评估翻译质量。

In [1]:
import json
import os
import time
import random
from typing import List, Dict

from transformers import AutoModel, AutoTokenizer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import torch

In [2]:
# Global constants
# model_name_or_path: location the model and tokenizer are loaded from

# Hub-style model identifier; immediately overridden by the assignment below.
model_name_or_path = 'THUDM/chatglm3-6b'
# Local Hugging Face cache path; this is the value that takes effect.
model_name_or_path = '/root/.cache/huggingface/hub/models--THUDM--chatglm3-6b'

In [3]:
# Load the model and tokenizer.
model = AutoModel.from_pretrained(model_name_or_path, trust_remote_code=True)
if torch.cuda.is_available():
    # GPU available: run in half precision on the GPU, as before.
    model = model.half().cuda()
else:
    # No GPU: stay on the CPU in float32 instead of crashing on .cuda().
    model = model.float()
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
# Switch to inference mode (disables dropout and similar training behaviour).
model = model.eval()
# Report the numeric precision the model ended up in.
print(f"model dtype: {model.dtype}")

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

model dtype: torch.float16


In [4]:
# Prompt template used to ask the model for a translation.
PROMPT_DICT = {
    "prompt_input": (
        "下面是一段文言文文本，请直接将它翻译成白话文。\n"
        "{terms}"  # slot for domain terms or background notes, when provided
        "#文言文文本:\n{input}\n"
        "#白话文文本:\n"
    )
}

# Prefer the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Token ids that end generation: the tokenizer's EOS token plus the ids the
# tokenizer reports for the "<|user|>" and "<|observation|>" commands.
eos_token_id = [tokenizer.eos_token_id]
eos_token_id += [
    tokenizer.get_command(command)
    for command in ("<|user|>", "<|observation|>")
]

# Generation settings.
# NOTE(review): neither do_sample nor num_beams is set here. In transformers,
# top_p and temperature normally only matter for sampling-based decoding;
# confirm whether the model's own generation config enables sampling.
max_length = 1024
top_p = 0.8
temperature = 0.1
gen_kwargs = dict(max_length=max_length, top_p=top_p, temperature=temperature)

In [5]:
def generate_output_from_text_batch(text_list: List[str]) -> List[str]:
    """
    Translate a batch of classical Chinese texts with the loaded model.

    Each text is wrapped in the ``PROMPT_DICT["prompt_input"]`` template (with
    an empty ``terms`` slot), the prompts are tokenized as one padded batch
    (truncated to 512 tokens), and the model generates a continuation for each.

    :param text_list: Classical Chinese texts to translate.
    :return: One translation per input text, in the same order. An empty
        input list yields an empty list.
    """
    if not text_list:
        # Nothing to translate: skip tokenization and generation entirely.
        return []

    # Build one prompt per input text.
    batch_prompts = [
        PROMPT_DICT["prompt_input"].format(input=text, terms="")
        for text in text_list
    ]
    inputs = tokenizer.batch_encode_plus(
        batch_prompts,
        return_tensors='pt',
        padding=True,
        max_length=512,
        truncation=True,
    )
    # Move every encoded tensor to the target device.
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Forward everything the tokenizer produced (not only input_ids), so the
    # padding positions of a padded batch are masked out during generation.
    outputs = model.generate(
        **inputs,
        **gen_kwargs,
        eos_token_id=eos_token_id,
    )

    # Each returned sequence starts with the (padded) prompt tokens. Dropping
    # them by position is reliable even when the prompt was truncated or its
    # decoded text would not match the original prompt string exactly, and it
    # also removes any prompt-prefix tokens, so no string replacement is needed.
    prompt_length = inputs['input_ids'].shape[1]
    return [
        tokenizer.decode(output[prompt_length:], skip_special_tokens=True).strip()
        for output in outputs
    ]

In [6]:
# Example usage: translate two sentences in a single batch.
text_list = ["老吾老，以及人之老；幼吾幼，以及人之幼", "古之学者必有师。师者， 所以传道受业解惑也。"]
results = generate_output_from_text_batch(text_list)
print(f"results={results}")
# print(results[0]['text'])

results=['要尊重老人，也要尊重别人家的老人；要关心孩子，也要关心别人家的孩子。', '古代的学者一定有老师。老师，是为了传承道理、教授学问、解答疑惑。']


In [7]:
# Example usage: translate a single sentence.
text_list = ["青，取之于蓝，而青于蓝；冰，水为之，而寒于水。"]
results = generate_output_from_text_batch(text_list)
print(f"results={results}")

results=['青色是从蓝色中提取的，但青色比蓝色更鲜艳；冰是由水制成的，但冰的寒冷程度超过了水。']


In [8]:
# Example usage: translate a longer historical passage.
text_list = ["十三年，署武昌知府。吴三桂犯湖南，师方攻岳州，檄成龙造浮桥济师，甫成，山水发，桥圮，坐夺官。"]
results = generate_output_from_text_batch(text_list)
print(f"results={results}")

results=['十三年，我担任武昌知府。那时吴三桂侵犯湖南，我们的军队正攻打岳州。我发布文书请求成龙制造浮桥来帮助军队渡河，浮桥刚刚建成，山水爆发，桥毁了，因此我失去了官职。']


进行评估

In [9]:
import time
from tqdm import tqdm

In [10]:
# Global constants
# eva_file_path: JSON file containing the evaluation samples
# output_file_path: directory the evaluation results are written to

eva_file_path = r"../data/version4/sampled_1000_merged_output_20230812_190843.json"
output_file_path = "./data/eval_results/"

# Create the output directory when missing. exist_ok=True replaces the
# separate existence check and avoids failing if the directory appears
# between the check and the creation.
os.makedirs(output_file_path, exist_ok=True)

# Load the evaluation data.
with open(eva_file_path, "r", encoding="utf-8") as f:
    data = json.load(f)

In [11]:
def calculate_bleu(reference, candidate):
    """
    Score a candidate translation against one reference with smoothed BLEU.

    Both arguments are split with ``list(...)``, so string inputs are compared
    character by character.

    :param reference: Reference translation.
    :param candidate: Translation to score.
    :return: The value returned by ``sentence_bleu`` with ``method1`` smoothing.
    """
    reference_tokens = list(reference)
    candidate_tokens = list(candidate)
    smoothing = SmoothingFunction().method1
    return sentence_bleu(
        [reference_tokens],
        candidate_tokens,
        smoothing_function=smoothing,
    )

In [12]:
def seconds_to_hms(seconds):
    """
    Format a duration in seconds as hours, minutes and seconds.

    The duration is truncated to whole seconds first, so a fractional
    remainder never produces a spurious trailing "0秒" (60.5 formats the same
    as 60). Negative durations are treated as zero. Components equal to zero
    are omitted; a zero-length duration is rendered as "0秒".

    :param seconds: Duration in seconds (int or float).
    :return: A string such as "1小时2分钟3秒".
    """
    # Work on whole, non-negative seconds so every component is an int.
    total = max(int(seconds), 0)
    hours, remainder = divmod(total, 3600)
    minutes, secs = divmod(remainder, 60)

    parts = []
    if hours:
        parts.append(f"{hours}小时")
    if minutes:
        parts.append(f"{minutes}分钟")
    # Always emit something: show seconds when non-zero or when nothing else was added.
    if secs or not parts:
        parts.append(f"{secs}秒")
    return ''.join(parts)

def evaluate_in_batches(data, batch_size, evaluation_results):
    """
    Translate the evaluation samples batch by batch and score them with BLEU.

    Every sample's "input" is translated via ``generate_output_from_text_batch``
    and compared with its "output" using ``calculate_bleu``. Per-sample records
    are appended to ``evaluation_results["samples"]``; the average score and
    the elapsed time are stored under ``evaluation_results["scores"]`` and
    ``evaluation_results["infos"]``.

    :param data: Sequence of dicts with "input" and "output" keys.
    :param batch_size: Number of samples translated per model call; must be >= 1.
    :param evaluation_results: Dict with "scores", "infos" and "samples"
        entries; it is updated in place.
    :return: The same ``evaluation_results`` dict.
    :raises ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A non-positive step would either fail inside range() or silently
        # skip every sample and report a misleading score of 0.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    total_samples = len(data)
    total_bleu_score = 0
    start_time = time.time()

    print(f"开始评估，共 {total_samples} 个样本.")

    for i in tqdm(range(0, total_samples, batch_size), desc="评估进度"):
        # Slice out the current batch.
        batch_data = data[i:i + batch_size]
        batch_inputs = [example["input"] for example in batch_data]
        # Each sample's reference is wrapped in a list so that several
        # references per sample could be scored the same way.
        batch_truths = [[example["output"]] for example in batch_data]
        batch_results = generate_output_from_text_batch(batch_inputs)

        for source_text, truths, result_str in zip(batch_inputs, batch_truths, batch_results):
            # With several references, keep the best-scoring one.
            max_bleu_score = max(calculate_bleu(truth, result_str) for truth in truths)
            total_bleu_score += max_bleu_score

            # Record the per-sample outcome.
            evaluation_results["samples"].append({
                "inputs": source_text,
                "truth": truths[0],
                "results": result_str,
                "BLEU": max_bleu_score
            })

    # Average score and total duration; an empty dataset scores 0.0 instead
    # of raising ZeroDivisionError.
    average_bleu_score = total_bleu_score / total_samples if total_samples else 0.0
    total_time = time.time() - start_time
    duration = seconds_to_hms(total_time)
    evaluation_results["scores"]["average_BLEU"] = average_bleu_score
    evaluation_results["infos"]["evaluation_duration"] = duration
    print(f"\nBatch={batch_size}")
    print(f"\n评估完成，平均BLEU分数为: {average_bleu_score:.3f}, 总耗时: {duration}")
    return evaluation_results

In [13]:
# Batch size
batch_size = 1  # adjust as needed
total_samples = len(data)

# Initialise the results dictionary. The run metadata records the actual
# model path and evaluation file used, not placeholder strings, so a saved
# result file can be traced back to its inputs.
evaluation_results = {
    "scores": {"average_BLEU": 0},
    "infos": {
        "evaluation_time": time.strftime("%Y/%m/%d %H:%M:%S", time.localtime()),
        "model_name_or_path": model_name_or_path,
        "eva_file_path": eva_file_path,
        "total_samples": total_samples
    },
    "samples": []
}

# Run the evaluation.
evaluation_results = evaluate_in_batches(data, batch_size, evaluation_results)

开始评估，共 1000 个样本.


评估进度: 100%|██████████| 1000/1000 [16:19<00:00,  1.02it/s] 


Batch=1

评估完成，平均BLEU分数为: 0.175, 总耗时: 16分钟19秒





In [14]:
# Name the result file after the current local time.
timestamp = time.strftime('%Y%m%d%H%M%S', time.localtime())
result_file = os.path.join(output_file_path, f"eval_resu_{timestamp}.json")

# Write the results as indented JSON, keeping non-ASCII text unescaped.
with open(result_file, "w", encoding="utf-8") as f:
    json.dump(evaluation_results, f, ensure_ascii=False, indent=4)

average_bleu = evaluation_results['scores']['average_BLEU']
print(f"评估完成，平均BLEU分数为: {average_bleu:.2f}")
print(f"评估结果已保存到 {result_file}")

评估完成，平均BLEU分数为: 0.18
评估结果已保存到 ./data/eval_results/eval_resu_20231224170900.json
