In [1]:
#####################################################################################################################
###                                             Step2: kk                                                         ###
#####################################################################################################################
from transformers import PreTrainedTokenizerFast
from datasets import DatasetDict, Dataset
import os
from tqdm import tqdm 


# 加载分词器
tokenizer = PreTrainedTokenizerFast.from_pretrained("../user_data/bart_tokenizer")

# 构建双语数据集
# 读取文件，构造数据列表
source_data = []
with open("../xfdata/多语言机器翻译挑战赛数据集更新（以此测试集提交得分为准）/val/中文/kk-zh.txt", "r", encoding="utf-8") as f:
    for line in f:
        source_text = line.strip()
        # 添加特殊token
        source_text = f"<zh> {source_text} </s>"
        source_data.append(source_text)
target_data = []
with open("../xfdata/多语言机器翻译挑战赛数据集更新（以此测试集提交得分为准）/val/其他语言/kk-zh.txt", "r", encoding="utf-8") as f:
    for line in f:
        target_text = line.strip()
        # 添加特殊token
        target_text = f"<kk> {target_text} </s>"
        target_data.append(target_text)

# Tokenize 函数
def tokenize_function(examples):
    source_texts = examples["source"]
    target_texts = examples["target"]

    # Tokenize source texts
    model_inputs = tokenizer(source_texts, max_length=128, truncation=True, padding="max_length", return_tensors="pt")
    
    # Tokenize target texts without using as_target_tokenizer context
    labels = tokenizer(target_texts, max_length=128, truncation=True, padding="max_length", return_tensors="pt")

    # 将 labels 直接添加到 model_inputs
    model_inputs["labels"] = labels["input_ids"]

    # 转换成字典格式，便于 datasets 库使用
    return {key: value.tolist() for key, value in model_inputs.items()}


data = []
for source_text, target_text in zip(source_data, target_data):
    data.append({"source": source_text, "target": target_text})
dataset = Dataset.from_list(data)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
# 划分数据集
tokenized_train_dataset, tokenized_val_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42).values()
# 定义保存路径
output_dir = "../user_data/step1/kk/dataset"
os.makedirs(output_dir, exist_ok=True)

# 保存 tokenized 数据集
tokenized_train_dataset.save_to_disk(os.path.join(output_dir, "train"))
tokenized_val_dataset.save_to_disk(os.path.join(output_dir, "val"))

print("数据集已保存到 ../user_data/step1/kk/dataset 中")

  from .autonotebook import tqdm as notebook_tqdm
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BartTokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.
Map: 100%|██████████| 500/500 [00:00<00:00, 3549.23 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 400/400 [00:00<00:00, 52280.13 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 100/100 [00:00<00:00, 21565.65 examples/s]

数据集已保存到 ../user_data/step1/kk/dataset 中





In [1]:
from transformers import BartForConditionalGeneration, PreTrainedTokenizerFast
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from peft import get_peft_model, LoraConfig
from datasets import load_from_disk
import os


# 1. 加载tokenizer
tokenizer = PreTrainedTokenizerFast.from_pretrained("../user_data/bart_tokenizer")
print("tokenizer is done!")

# 2. 加载模型
model = BartForConditionalGeneration.from_pretrained("../user_data/step1/ru/results/checkpoint-30940")
print("model is done!")


# 5. 加载数据
dataset_dir = "../user_data/step1/kk/dataset"
tokenized_train_dataset = load_from_disk(f"{dataset_dir}/train")
tokenized_val_dataset = load_from_disk(f"{dataset_dir}/val")
tokenized_train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
tokenized_val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
print("data is done!")

# 6. 设置训练参数
output_dir = "../user_data/step1/kk"
training_args = Seq2SeqTrainingArguments(
    output_dir=os.path.join(output_dir, "results"),         # 训练结果保存路径
    eval_strategy="epoch",                            # 按步数进行评估
    save_strategy="epoch",                                   # 按步数进行保存
    logging_dir=os.path.join(output_dir, "logs"),           # 日志保存路径
    logging_strategy="epoch",                                     # 日志打印间隔
    learning_rate=1e-4,                                     # 学习率
    per_device_train_batch_size=64,                         # 每个设备的训练批次大小
    per_device_eval_batch_size=256,                         # 每个设备的验证批次大小
    weight_decay=0.01,                                      # 权重衰减
    save_total_limit=3,                                    # 保存的 checkpoint 数量上限
    num_train_epochs=10,                                     # 训练 epoch 数
    predict_with_generate=True,                             # 使用生成模式进行评估
    bf16=True,                                              # 使用 bf16 精度
    load_best_model_at_end=True,                            # 训练结束后加载最好的模型
    metric_for_best_model="eval_loss",                      # 最好模型的评估指标
    greater_is_better=False,                                # 对于 Loss，越小越好
)

print("训练参数已设置完成！")


# 7. 使用 Seq2SeqTrainer 进行 微调
trainer = Seq2SeqTrainer(
    model=model,                                     # 模型
    args=training_args,                              # 训练参数
    train_dataset=tokenized_train_dataset,           # 训练数据集
    eval_dataset=tokenized_val_dataset,              # 验证数据集
    tokenizer=tokenizer,                             # 分词器
)

# 8. 开始训练
trainer.train()

  from .autonotebook import tqdm as notebook_tqdm
2024-10-09 17:11:09.053332: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-09 17:11:09.075870: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-09 17:11:09.082865: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-09 17:11:09.102673: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
The

tokenizer is done!
model is done!
data is done!
训练参数已设置完成！


Epoch,Training Loss,Validation Loss
1,3.1584,2.256412
2,2.2041,1.933548
3,1.8301,1.738075
4,1.6674,1.642027
5,1.4815,1.591474
6,1.3837,1.563963
7,1.2666,1.54486
8,1.2538,1.539731
9,1.1631,1.533865
10,1.1724,1.532473


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=70, training_loss=1.6581157548086984, metrics={'train_runtime': 37.3639, 'train_samples_per_second': 107.055, 'train_steps_per_second': 1.873, 'total_flos': 212984659968000.0, 'train_loss': 1.6581157548086984, 'epoch': 10.0})

# 评估

In [2]:
import torch
from transformers import BartForConditionalGeneration, PreTrainedTokenizerFast
import sacrebleu
from datasets import Dataset, load_from_disk
from torch.utils.data import DataLoader
from tqdm import tqdm 


# 设置参数
batch_size = 32
beam_size = 8


# 5. 加载数据
dataset_dir = "../user_data/step1/kk/dataset"
tokenized_train_dataset = load_from_disk(f"{dataset_dir}/train")
tokenized_val_dataset = load_from_disk(f"{dataset_dir}/val")
print("data is done!")
model_output_dir = "../user_data/bart_tokenizer"
# 加载tokenizer和model
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_output_dir)
print("Tokenizer loaded.")
model = trainer.model
print("Model loaded.")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BartTokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


data is done!




Tokenizer loaded.
Model loaded.


In [4]:
# 定义翻译函数
def translate_batch(batch):
    inputs = tokenizer(batch['source'], return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
    # 去掉token_type_ids，只保留input_ids和attention_mask
    inputs = {key: inputs[key] for key in ['input_ids', 'attention_mask']}
    translated_tokens = model.generate(
        **inputs,
        max_length=128,
        num_beams=beam_size,
        early_stopping=True,
        decoder_start_token_id=model.config.bos_token_id,  # 使用起始标记
    )
    translated_texts = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
    return translated_texts

# 创建保存预测结果和真实结果的列表
predictions = []
references = tokenized_val_dataset['target']
dataset = DataLoader(tokenized_val_dataset, batch_size=50)
# 分批处理并翻译
for batch in tqdm(dataset):
    batch_predictions = translate_batch(batch)
    predictions.extend([i.strip() for i in batch_predictions])

# 计算BLEU分数
bleu = sacrebleu.corpus_bleu(predictions, [references])
print(f"BLEU-4 score: {bleu.score:.2f}")

100%|██████████| 2/2 [00:08<00:00,  4.37s/it]

BLEU-4 score: 0.52





# 全量数据微调15个epoch

In [7]:
from transformers import BartForConditionalGeneration, PreTrainedTokenizerFast
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from peft import get_peft_model, LoraConfig
from datasets import load_from_disk, concatenate_datasets
import os




# 1. 加载tokenizer
tokenizer = PreTrainedTokenizerFast.from_pretrained("../user_data/bart_tokenizer")
print("tokenizer is done!")

# 2. 加载模型
model = BartForConditionalGeneration.from_pretrained("../user_data/step1/ru/results/checkpoint-30940")
print("model is done!")


# 5. 加载数据
dataset_dir = "../user_data/step1/kk/dataset"
tokenized_train_dataset = load_from_disk(f"{dataset_dir}/train")
tokenized_val_dataset = load_from_disk(f"{dataset_dir}/val")
tokenized_train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
tokenized_val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
# 拼接两个数据集
tokenized_dataset = concatenate_datasets([tokenized_train_dataset, tokenized_val_dataset])

print("data is done!")

# 6. 设置训练参数
output_dir = "../user_data/step1/kk"
training_args = Seq2SeqTrainingArguments(
    output_dir=os.path.join(output_dir, "results"),         # 训练结果保存路径
    save_strategy="epoch",                                   # 按步数进行保存
    logging_dir=os.path.join(output_dir, "logs"),           # 日志保存路径
    logging_strategy="epoch",                                     # 日志打印间隔
    learning_rate=1e-4,                                     # 学习率
    per_device_train_batch_size=64,                         # 每个设备的训练批次大小
    weight_decay=0.01,                                      # 权重衰减
    save_total_limit=3,                                    # 保存的 checkpoint 数量上限
    num_train_epochs=15,                                     # 训练 epoch 数
    bf16=True,                                              # 使用 bf16 精度
)

print("训练参数已设置完成！")


# 7. 使用 Seq2SeqTrainer 进行 微调
trainer = Seq2SeqTrainer(
    model=model,                                     # 模型
    args=training_args,                              # 训练参数
    train_dataset=tokenized_dataset,           # 训练数据集
    tokenizer=tokenizer,                             # 分词器
)

# 8. 开始训练
trainer.train()

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BartTokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


tokenizer is done!
model is done!
data is done!
训练参数已设置完成！


Step,Training Loss
8,3.1529
16,2.0443
24,1.7022
32,1.4913
40,1.3407
48,1.2253
56,1.1238
64,1.0485
72,0.9836
80,0.93


TrainOutput(global_step=120, training_loss=1.2806284546852111, metrics={'train_runtime': 55.6214, 'train_samples_per_second': 134.84, 'train_steps_per_second': 2.157, 'total_flos': 399346237440000.0, 'train_loss': 1.2806284546852111, 'epoch': 15.0})