In [2]:
from transformers import BartForConditionalGeneration, PreTrainedTokenizerFast
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from peft import get_peft_model, LoraConfig
from datasets import load_from_disk
import os


# 1. Load the tokenizer
tokenizer_dir = "../user_data/bart_tokenizer"
tokenizer = PreTrainedTokenizerFast.from_pretrained(tokenizer_dir)
print("tokenizer is done!")

# 2. Load the model, initialised from the `en` step-1 checkpoint
init_checkpoint = "../user_data/step1/en/results/checkpoint-154690"
model = BartForConditionalGeneration.from_pretrained(init_checkpoint)
print("model is done!")


# 3. Load the pre-tokenized train / validation splits from disk
dataset_dir = "../user_data/step1/de/dataset"
tokenized_train_dataset = load_from_disk(dataset_dir + "/train")
tokenized_val_dataset = load_from_disk(dataset_dir + "/val")
# Expose only the columns handed to the trainer, formatted as torch tensors.
for split_dataset in (tokenized_train_dataset, tokenized_val_dataset):
    split_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
print("data is done!")

# 4. Configure the training arguments
output_dir = "../user_data/step1/de"
results_dir = os.path.join(output_dir, "results")   # checkpoints are written here
logs_dir = os.path.join(output_dir, "logs")         # training logs are written here
step_interval = 500                                 # shared cadence for logging / eval / save

training_args = Seq2SeqTrainingArguments(
    # --- where things are written ---
    output_dir=results_dir,
    logging_dir=logs_dir,
    # --- step-based logging / evaluation / checkpointing ---
    eval_strategy="steps",
    save_strategy="steps",
    logging_steps=step_interval,
    eval_steps=step_interval,
    save_steps=step_interval,
    save_total_limit=3,                  # upper bound on the number of checkpoints kept
    # --- optimisation ---
    learning_rate=1e-4,
    weight_decay=0.01,
    num_train_epochs=5,
    per_device_train_batch_size=64,      # training batch size per device
    per_device_eval_batch_size=256,      # evaluation batch size per device
    bf16=True,                           # train in bf16 precision
    # --- evaluation / model selection ---
    predict_with_generate=True,          # evaluate in generation mode
    load_best_model_at_end=True,         # reload the best checkpoint when training ends
    metric_for_best_model="eval_loss",   # "best" is judged by validation loss
    greater_is_better=False,             # for a loss, lower is better
)

print("训练参数已设置完成！")


# 5. Build the Seq2SeqTrainer used for fine-tuning
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    tokenizer=tokenizer,
)

# 6. Start training (kept as the last expression so the cell displays the TrainOutput)
trainer.train()

2024-10-09 15:57:51.181963: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-09 15:57:51.200788: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-09 15:57:51.206743: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-09 15:57:51.223809: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
The tokenizer class you load from this checkpoint is 

tokenizer is done!
model is done!
data is done!
训练参数已设置完成！


Step,Training Loss,Validation Loss
500,1.4395,1.086582
1000,1.072,0.935476
1500,0.9588,0.858483
2000,0.899,0.810336
2500,0.8516,0.77607
3000,0.8228,0.749787
3500,0.7581,0.731573
4000,0.744,0.716576
4500,0.7259,0.703194
5000,0.7143,0.689517


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=15470, training_loss=0.6935355543396746, metrics={'train_runtime': 1749.039, 'train_samples_per_second': 566.025, 'train_steps_per_second': 8.845, 'total_flos': 5.271370334208e+16, 'train_loss': 0.6935355543396746, 'epoch': 5.0})

# 评估

In [1]:
import torch
from transformers import BartForConditionalGeneration, PreTrainedTokenizerFast
import sacrebleu
from datasets import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm 


# Inference settings: `batch_size` is the number of sentences per DataLoader batch,
# `beam_size` is passed to `model.generate` as `num_beams`.
batch_size = 32
beam_size = 8


# 读取source和target文件
def load_txt_data(source_path, target_path):
    """Load a line-aligned parallel corpus from two UTF-8 text files.

    Line ``i`` of ``source_path`` is paired with line ``i`` of ``target_path``.
    Every line is stripped of surrounding whitespace; each source line is
    additionally wrapped as ``"<zh> {line} </s>"``.

    Args:
        source_path: Path to the text file holding one source sentence per line.
        target_path: Path to the text file holding one target sentence per line.

    Returns:
        A ``(source_sentences, target_sentences)`` tuple of equally long lists
        of strings.

    Raises:
        ValueError: If the two files do not contain the same number of lines,
            i.e. the corpus is not line-aligned.
    """
    with open(source_path, "r", encoding="utf-8") as src_file, open(target_path, "r", encoding="utf-8") as tgt_file:
        # Iterate the file handles directly; no need to materialise readlines() first.
        source_sentences = [f"<zh> {line.strip()} </s>" for line in src_file]
        target_sentences = [line.strip() for line in tgt_file]

    # Fail here, next to the cause, instead of letting a misaligned pair
    # surface later as an opaque downstream error or a meaningless score.
    if len(source_sentences) != len(target_sentences):
        raise ValueError(
            f"Parallel files are not line-aligned: {source_path!r} has "
            f"{len(source_sentences)} lines but {target_path!r} has "
            f"{len(target_sentences)} lines."
        )
    return source_sentences, target_sentences

# Validation pair: "source" is the file under the `中文` directory,
# "target" is the file under the `其他语言` directory.
data_files = dict(
    source="../xfdata/多语言机器翻译挑战赛数据集更新（以此测试集提交得分为准）/val/中文/de-zh.txt",
    target="../xfdata/多语言机器翻译挑战赛数据集更新（以此测试集提交得分为准）/val/其他语言/de-zh.txt",
)

# Read the sentences from the two text files
source_sentences, target_sentences = load_txt_data(
    source_path=data_files["source"],
    target_path=data_files["target"],
)

# Wrap the sentence lists in a `datasets.Dataset` and batch it in file order
dataset_dict = dict(source=source_sentences, target=target_sentences)
dataset = Dataset.from_dict(dataset_dict)
dataloader = DataLoader(dataset, shuffle=False, batch_size=batch_size)
print("Data loaded.")

# Checkpoint directory from which both the tokenizer and the model are restored
model_output_dir = "../user_data/step1/de/results/checkpoint-15470"

# Run on the GPU when one is available, otherwise on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

tokenizer = PreTrainedTokenizerFast.from_pretrained(model_output_dir)
print("Tokenizer loaded.")

# Load the weights, switch to eval mode, then move the model to the chosen device
model = BartForConditionalGeneration.from_pretrained(model_output_dir)
model = model.eval()
model = model.to(device)
print("Model loaded.")

  from .autonotebook import tqdm as notebook_tqdm


Data loaded.
Tokenizer loaded.
Model loaded.


In [2]:
# 定义翻译函数
def translate_batch(batch):
    """Translate one batch of source sentences with beam search.

    Relies on the module-level ``tokenizer``, ``model``, ``device`` and
    ``beam_size``.

    Args:
        batch: Mapping whose ``'source'`` entry holds the source sentences
            of the batch.

    Returns:
        The decoded translations, one per generated sequence, with special
        tokens removed.
    """
    encoded = tokenizer(
        batch['source'],
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=128,
    ).to(device)

    # Only input_ids and attention_mask are forwarded to generate(); anything
    # else the tokenizer returned (e.g. token_type_ids) is left out.
    # NOTE(review): decoder_start_token_id is forced to the config's bos_token_id —
    # confirm this matches the start token used when the training labels were built.
    generated_ids = model.generate(
        input_ids=encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        max_length=128,
        num_beams=beam_size,
        early_stopping=True,
        decoder_start_token_id=model.config.bos_token_id,
    )
    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

# Reference translations come straight from the dataset; predictions are
# accumulated batch by batch in the same order.
references = dataset['target']
predictions = []

# Translate every batch and collect the whitespace-stripped hypotheses
for batch in tqdm(dataloader):
    batch_predictions = translate_batch(batch)
    predictions += (text.strip() for text in batch_predictions)

# Corpus-level BLEU against the single reference stream
bleu = sacrebleu.corpus_bleu(predictions, [references])
print(f"BLEU-4 score: {bleu.score:.2f}")

100%|██████████| 16/16 [00:46<00:00,  2.90s/it]

BLEU-4 score: 9.34



