In [1]:
import json

train_data_path = "data/train.json"

train_data = []
with open(train_data_path, 'r', encoding='utf-8') as f:
    for line in f:
        if line.strip():  # 忽略空行
            train_data.append(json.loads(line.strip()))

print(f"共加载训练样本数：{len(train_data)}")
print("样本:\ncontext:",train_data[0]['context'],"\nquestion:",train_data[0]['question'],"\nanswer:",train_data[0]['answer'])

共加载训练样本数：14520
样本:
context: 第35集雪见缓缓张开眼睛，景天又惊又喜之际，长卿和紫萱的仙船驶至，见众人无恙，也十分高兴。众人登船，用尽合力把自身的真气和水分输给她。雪见终于醒过来了，但却一脸木然，全无反应。众人向常胤求助，却发现人世界竟没有雪见的身世纪录。长卿询问清微的身世，清微语带双关说一切上了天界便有答案。长卿驾驶仙船，众人决定立马动身，往天界而去。众人来到一荒山，长卿指出，魔界和天界相连。由魔界进入通过神魔之井，便可登天。众人至魔界入口，仿若一黑色的蝙蝠洞，但始终无法进入。后来花楹发现只要有翅膀便能飞入。于是景天等人打下许多乌鸦，模仿重楼的翅膀，制作数对翅膀状巨物。刚佩戴在身，便被吸入洞口。众人摔落在地，抬头发现魔界守卫。景天和众魔套交情，自称和魔尊重楼相熟，众魔不理，打了起来。 
question: 仙剑奇侠传3第几集上天界 
answer: 第35集


In [2]:
from datasets import Dataset

# 构造 input_text 和 target_text
samples = []
for item in train_data:
    input_text = f"问题：{item['question']} 文本：{item['context']}"
    target_text = item['answer']
    samples.append({
        "input_text": input_text,
        "target_text": target_text
    })

# 转换为 HuggingFace Dataset 并划分训练/验证集
dataset = Dataset.from_list(samples)
dataset = dataset.train_test_split(test_size=0.1, seed=42)

train_dataset = dataset["train"]
eval_dataset = dataset["test"]

print("训练集样本数：", len(train_dataset))
print("验证集样本数：", len(eval_dataset))

  from .autonotebook import tqdm as notebook_tqdm


训练集样本数： 13068
验证集样本数： 1452


In [14]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# model_name = "Langboat/mengzi-t5-base"
model_path = "/root/.cache/modelscope/hub/models/langboat/mengzi-t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

max_input_len = 512
max_target_len = 64

def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["input_text"],
        max_length=max_input_len,
        padding="max_length",
        truncation=True
    )

    labels = tokenizer(
        examples["target_text"],
        max_length=max_target_len,
        padding="max_length",
        truncation=True
    )["input_ids"]

    # 避免对 PAD 部分计算损失
    labels = [
        [(l if l != tokenizer.pad_token_id else -100) for l in label]
        for label in labels
    ]

    model_inputs["labels"] = labels
    return model_inputs

In [4]:
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_eval = eval_dataset.map(preprocess_function, batched=True)

tokenized_train.set_format("torch")
tokenized_eval.set_format("torch")

Map:   0%|          | 0/13068 [00:00<?, ? examples/s]

Map: 100%|██████████| 13068/13068 [00:12<00:00, 1073.64 examples/s]
Map: 100%|██████████| 1452/1452 [00:00<00:00, 1547.09 examples/s]


In [5]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./mengzi_t5_qa_output",
    run_name="mengzi-t5-qa-run",
    num_train_epochs=6,                        
    per_device_train_batch_size = 4,
    per_device_eval_batch_size = 4, 
    eval_accumulation_steps=4,                 
    gradient_accumulation_steps=1,
    eval_strategy="epoch",                     
    save_strategy="epoch",
    learning_rate=3e-5,
    warmup_steps=500,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,    
    metric_for_best_model="eval_loss",      
    greater_is_better=False,                  
    logging_dir="./logs",
    logging_steps=50,
    report_to="none",
    fp16=True,
    dataloader_num_workers=4,
    seed=42
)
from transformers import Trainer, EarlyStoppingCallback

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

In [6]:
trainer.train()
# 保存模型和分词器
output_dir = "./mengzi_t5_qa_model"
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

Epoch,Training Loss,Validation Loss
1,0.586,0.582998
2,0.36,0.608305
3,0.2,0.70742


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


('./mengzi_t5_qa_model/tokenizer_config.json',
 './mengzi_t5_qa_model/special_tokens_map.json',
 './mengzi_t5_qa_model/spiece.model',
 './mengzi_t5_qa_model/added_tokens.json')

In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# 你保存模型的位置
trained_model_path = "./mengzi_t5_qa_model"

trained_tokenizer = T5Tokenizer.from_pretrained(trained_model_path)
train_model = T5ForConditionalGeneration.from_pretrained(trained_model_path)

origin_model_path = "/root/.cache/modelscope/hub/models/langboat/mengzi-t5-base"
origin_tokenizer = T5Tokenizer.from_pretrained(model_path)
origin_model = T5ForConditionalGeneration.from_pretrained(model_path)


  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


NameError: name 'model_path' is not defined

In [8]:
from datasets import Dataset

test_data_path = "data/dev.json"
test_data = []
with open(test_data_path, 'r', encoding='utf-8') as f:
    for line in f:
        if line.strip():  # 忽略空行
            test_data.append(json.loads(line.strip()))
# 构造 input_text 和 target_text
test_samples = [
    {
        "input_text": f"问题：{item['question']} 文本：{item['context']}",
        "target_text": item['answer']
    }
    for item in test_data
]

test_dataset = Dataset.from_list(test_samples)

In [9]:
max_input_len = 512
max_target_len = 64

def preprocess_function(examples):
    inputs = tokenizer(
        examples["input_text"],
        max_length=max_input_len,
        padding="max_length",
        truncation=True,
        return_tensors=None
    )
    return inputs

tokenized_test = test_dataset.map(preprocess_function, batched=True)
tokenized_test.set_format(type="torch")


Map: 100%|██████████| 984/984 [00:00<00:00, 1181.53 examples/s]


In [None]:
import torch
from torch.utils.data import DataLoader

predictions = []
references = []

test_loader = DataLoader(tokenized_test, batch_size=2, shuffle=False)

for i, batch in enumerate(test_loader):
    input_ids = batch["input_ids"].to(model.device)
    attention_mask = batch["attention_mask"].to(model.device)

    with torch.no_grad():
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=max_target_len)

    # 解码预测
    decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    # 找到对应的原始 label（不能用 i * N 固定映射）
    # 获取当前 batch 的原始样本索引
    batch_indices = batch["input_ids"].shape[0]
    start_idx = i * test_loader.batch_size
    end_idx = start_idx + batch_indices
    decoded_labels = test_dataset[start_idx:end_idx]["target_text"]

    # 扩展预测与参考答案
    predictions.extend(decoded_preds)
    references.extend([[label] for label in decoded_labels])  # BLEU 要求每个参考是一个列表



In [None]:
import jieba
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

bleu1 = bleu2 = bleu3 = bleu4 = 0
smoothie = SmoothingFunction().method4

for pred, ref in zip(predictions, references):
    pred_tokens = list(jieba.cut(pred))  # 分词后的预测
    ref_tokens = [list(jieba.cut(ref[0]))]  # 分词后的参考答案（必须是嵌套列表）

    bleu1 += sentence_bleu(ref_tokens, pred_tokens, weights=(1, 0, 0, 0), smoothing_function=smoothie)
    bleu2 += sentence_bleu(ref_tokens, pred_tokens, weights=(0.5, 0.5, 0, 0), smoothing_function=smoothie)
    bleu3 += sentence_bleu(ref_tokens, pred_tokens, weights=(0.33, 0.33, 0.33, 0), smoothing_function=smoothie)
    bleu4 += sentence_bleu(ref_tokens, pred_tokens, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothie)

total = len(predictions)
print(f"BLEU-1: {bleu1 / total:.4f}")
print(f"BLEU-2: {bleu2 / total:.4f}")
print(f"BLEU-3: {bleu3 / total:.4f}")
print(f"BLEU-4: {bleu4 / total:.4f}")


Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.540 seconds.
Prefix dict has been built successfully.


BLEU-1: 0.6338
BLEU-2: 0.6027
BLEU-3: 0.4925
BLEU-4: 0.4057


In [12]:
import pandas as pd

df = pd.DataFrame({
    "question": [item["input_text"] for item in test_samples],
    "reference_answer": [ref[0] for ref in references],
    "predicted_answer": predictions
})

df.to_csv("qa_predictions.csv", index=False, encoding="utf-8-sig")
print("预测结果已保存到 qa_predictions.csv")

预测结果已保存到 qa_predictions.csv
