In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Location where the model was saved
model_path = "./mengzi_t5_qa_model"

# Load the tokenizer first, then the model, both from the same directory
tokenizer, model = (
    T5Tokenizer.from_pretrained(model_path),
    T5ForConditionalGeneration.from_pretrained(model_path),
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
import json
from datasets import Dataset

test_data_path = "data/dev.json"

# The file is read line by line: each non-blank line is parsed as one JSON record
with open(test_data_path, 'r', encoding='utf-8') as f:
    test_data = [json.loads(stripped) for stripped in map(str.strip, f) if stripped]

# Build the input_text / target_text pair for every record
test_samples = []
for record in test_data:
    test_samples.append({
        "input_text": f"问题：{record['question']} 文本：{record['context']}",
        "target_text": record['answer'],
    })

test_dataset = Dataset.from_list(test_samples)

In [7]:
from tqdm import tqdm
import torch
import jieba
from datasets import Dataset
from torch.utils.data import DataLoader
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction


def preprocess_function(examples, tokenizer):
    """Tokenize a batch of examples and attach the target ids as ``labels``.

    Args:
        examples: Mapping with "input_text" and "target_text" entries.
        tokenizer: Callable tokenizer; invoked once for the inputs
            (max_length=512) and once for the targets (max_length=64).

    Returns:
        The tokenizer output for the inputs, with an added "labels" entry
        holding the "input_ids" of the tokenized targets.
    """
    # Options common to both tokenizer calls; only max_length differs.
    shared_kwargs = dict(truncation=True, padding="max_length", return_tensors="pt")

    model_inputs = tokenizer(examples["input_text"], max_length=512, **shared_kwargs)
    label_encoding = tokenizer(examples["target_text"], max_length=64, **shared_kwargs)

    model_inputs["labels"] = label_encoding["input_ids"]
    return model_inputs

def preprocess_and_predict(dataset, model, tokenizer, batch_size=2, max_length=64):
    """Tokenize ``dataset``, generate an answer per example and collect the references.

    Args:
        dataset: Dataset with "input_text" and "target_text" columns; it must
            support ``.map(..., batched=True)`` and slice indexing.
        model: Model exposing ``eval()``, ``generate()`` and ``device``.
        tokenizer: Tokenizer used for preprocessing and for ``batch_decode``.
        batch_size: Number of examples per generation batch (default 2).
        max_length: ``max_length`` passed to ``model.generate`` (default 64).

    Returns:
        ``(predictions, references)`` where ``predictions`` is a list of decoded
        strings and ``references`` is a list of single-element lists, one per
        example, in dataset order.
    """
    print("Preprocessing dataset...")
    tokenized_dataset = dataset.map(lambda x: preprocess_function(x, tokenizer), batched=True)
    tokenized_dataset.set_format(type='torch')
    print("Dataset preprocessing complete.")
    print("Starting prediction...")
    model.eval()
    predictions = []
    references = []
    # shuffle=False keeps batches aligned with the row order of `dataset`,
    # which the offset-based reference lookup below relies on.
    data_loader = DataLoader(tokenized_dataset, batch_size=batch_size, shuffle=False)

    offset = 0  # index in `dataset` of the first example of the current batch
    with torch.no_grad():
        for batch in tqdm(data_loader, total=len(data_loader)):
            input_ids = batch["input_ids"].to(model.device)
            attention_mask = batch["attention_mask"].to(model.device)
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=max_length,
            )
            decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)

            # Read references from the dataset that was passed in (not a
            # notebook global) so the function works for any dataset.
            batch_len = input_ids.shape[0]
            decoded_labels = dataset[offset:offset + batch_len]["target_text"]
            offset += batch_len

            predictions.extend(decoded_preds)
            references.extend([[label] for label in decoded_labels])
    print("Prediction complete.")
    return predictions, references

def evaluate_bleu(predictions, references):
    """Print and return corpus-averaged sentence-level BLEU-1..4 scores.

    Each prediction and its first reference are segmented with ``jieba.cut``
    and scored with ``sentence_bleu`` using smoothing method 4; the per-example
    scores are averaged over all examples.

    Args:
        predictions: List of predicted strings.
        references: List parallel to ``predictions``; each item is a sequence
            whose first element is the reference string.

    Returns:
        Dict with keys "bleu1".."bleu4" mapping to the averaged scores
        (all 0.0 when there are no predictions).

    Raises:
        ValueError: If ``predictions`` and ``references`` differ in length.
    """
    print("Evaluating BLEU scores...")
    if len(predictions) != len(references):
        # zip() would silently truncate and skew the average otherwise.
        raise ValueError(
            f"predictions ({len(predictions)}) and references ({len(references)}) "
            "must have the same length"
        )

    # n-gram weights per metric; each tuple sums to 1 (BLEU-3 uses exact thirds).
    weight_sets = {
        "bleu1": (1, 0, 0, 0),
        "bleu2": (0.5, 0.5, 0, 0),
        "bleu3": (1 / 3, 1 / 3, 1 / 3, 0),
        "bleu4": (0.25, 0.25, 0.25, 0.25),
    }
    sums = dict.fromkeys(weight_sets, 0.0)
    total = len(predictions)

    if total == 0:
        # Avoid dividing by zero; report zeros for an empty evaluation set.
        print("No predictions to evaluate.")
    else:
        smoothie = SmoothingFunction().method4
        for pred, ref in zip(predictions, references):
            pred_tokens = list(jieba.cut(pred))  # segmented prediction
            ref_tokens = [list(jieba.cut(ref[0]))]  # segmented reference (must be a nested list)
            for key, weights in weight_sets.items():
                sums[key] += sentence_bleu(
                    ref_tokens, pred_tokens, weights=weights, smoothing_function=smoothie
                )

    scores = {key: (value / total if total else 0.0) for key, value in sums.items()}
    print(f"BLEU-1: {scores['bleu1']:.4f}")
    print(f"BLEU-2: {scores['bleu2']:.4f}")
    print(f"BLEU-3: {scores['bleu3']:.4f}")
    print(f"BLEU-4: {scores['bleu4']:.4f}")
    return scores

def evaluation_pipline(dataset,model,tokenizer):
    """Generate predictions for ``dataset``, print their BLEU scores and return them.

    Returns:
        The ``(predictions, references)`` pair produced by
        ``preprocess_and_predict``.
    """
    model_answers, gold_answers = preprocess_and_predict(
        dataset=dataset, model=model, tokenizer=tokenizer
    )
    evaluate_bleu(predictions=model_answers, references=gold_answers)
    return model_answers, gold_answers
    

In [8]:
print("Evaluating trained model...")
predictions, references = evaluation_pipline(test_dataset, model, tokenizer)

# Preview at most the first five examples. Slicing plus zip caps the loop at
# the available count, so a dataset with fewer than five rows no longer
# raises IndexError.
for i, (sample, prediction) in enumerate(zip(test_samples[:5], predictions[:5])):
    print(f"Question {i+1}: {sample['input_text']}")
    print(f"Reference Answer {i+1}: {sample['target_text']}")
    print(f"Model Answer {i+1}: {prediction}")

Evaluating trained model...
Preprocessing dataset...


Map: 100%|██████████| 984/984 [00:01<00:00, 937.86 examples/s]


Dataset preprocessing complete.
Starting prediction...


100%|██████████| 492/492 [03:08<00:00,  2.61it/s]
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache


Prediction complete.
Evaluating BLEU scores...


Loading model cost 0.548 seconds.
Prefix dict has been built successfully.


BLEU-1: 0.6338
BLEU-2: 0.6027
BLEU-3: 0.4925
BLEU-4: 0.4057
Question 1: 问题：2017年银行贷款基准利率 文本：年基准利率4.35%。 从实际看,贷款的基本条件是: 一是中国大陆居民,年龄在60岁以下; 二是有稳定的住址和工作或经营地点; 三是有稳定的收入来源; 四是无不良信用记录,贷款用途不能作为炒股,赌博等行为; 五是具有完全民事行为能力。
Reference Answer 1: 年基准利率4.35%
Model Answer 1: 4.35%
Question 2: 问题：2017年银行贷款基准利率 文本：年基准利率4.35%。 从实际看,贷款的基本条件是: 一是中国大陆居民,年龄在60岁以下; 二是有稳定的住址和工作或经营地点; 三是有稳定的收入来源; 四是无不良信用记录,贷款用途不能作为炒股,赌博等行为; 五是具有完全民事行为能力。
Reference Answer 2: 4.35%
Model Answer 2: 4.35%
Question 3: 问题：格力空调哪个系列好 文本：U系列是最好的，采用国际顶尖技术（由格力自主研发）双级变频压缩机，提高压缩机运转效率，制冷制热能力更强劲；1赫兹变频技术，使空调相当于一个15 W电灯泡，更加节能省电；送风面积广，风力大；生态风，净化空气。非常不错，现在国美在做活动，可以了解一下。
Reference Answer 3: U系列
Model Answer 3: U系列
Question 4: 问题：橱柜宽度 文本：平面操作区域进深（即宽度）以40至60厘米为宜；要充分考虑洗菜盆的宽度。以标准洗菜盆来算，应选择550－－600MM的宽度为好。另：在高度方面，根据我国人体高度测算，掌握以下尺寸为宜：操作台高度在89至92厘米为宜；平面操作区域进深以40至60厘米为宜；抽油烟机与灶台的距离掌握在60至80厘米为宜；操作台上方的吊柜要能使主人操作时不碰头为宜，它距地面最小距离不应小于145厘米，进深尺寸为25至35厘米，吊柜与操作台之间的距离应在55厘米以上。
Reference Answer 4: 以40至60厘米为宜
Model Answer 4: 40至60厘米
Question 5: 问题：橱柜宽度 文本：平面操作区域进深（即宽度）以40至

In [10]:
import pandas as pd

# Assemble one column per field, in the order the examples were evaluated
questions = [sample["input_text"] for sample in test_samples]
reference_answers = [refs[0] for refs in references]  # each entry is a one-item list

df = pd.DataFrame({
    "question": questions,
    "reference_answer": reference_answers,
    "predicted_answer": predictions,
})

# Written with the utf-8-sig encoding and without the index column
df.to_csv("qa_predictions.csv", index=False, encoding="utf-8-sig")
print("预测结果已保存到 qa_predictions.csv")

预测结果已保存到 qa_predictions.csv
