In [30]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [31]:
import json

def load_data(file_path, max_lines=float("inf")):
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        n = 0
        for line in f:
            n += 1
            if n > max_lines: break

            try:
                item = json.loads(line.strip())
                data.append(item)
            except json.JSONDecodeError as e:
                print(f"JSON解码错误：{e}，跳过该行")
                continue
    return data

train_data = load_data('translation2019zh_train.json', 10000)
valid_data = load_data('translation2019zh_valid.json', 100)

print("Data loaded successfully!")
print(f"Number of training samples: {len(train_data)}")
print(f"Number of validation samples: {len(valid_data)}")

Data loaded successfully!
Number of training samples: 10000
Number of validation samples: 100


In [32]:
def clean_data(data):
    cleaned_data = []
    seen = set()
    for item in data:
        en = item['english'].strip()
        zh = item['chinese'].strip()
        if (en, zh) not in seen:
            cleaned_data.append({'english': en, 'chinese': zh})
            seen.add((en, zh))
    return cleaned_data

train_data = clean_data(train_data)
valid_data = clean_data(valid_data)

print(f"训练数据清洗完毕，剩余 {len(train_data)} 条。")
print(f"验证数据清洗完毕，剩余 {len(valid_data)} 条。")


训练数据清洗完毕，剩余 10000 条。
验证数据清洗完毕，剩余 100 条。


In [33]:
from transformers import AutoTokenizer

tokenizer_name = "bert-base-multilingual-cased" # 多语言BERT分词器
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

def tokenize_data(data, tokenizer, max_length=128):
    source_texts = [item['english'] for item in data]
    target_texts = [item['chinese'] for item in data]

    tokenized_data = tokenizer(source_texts, text_target=target_texts, 
                              max_length=max_length, truncation=True, padding='max_length')
    return tokenized_data

train_data = tokenize_data(train_data, tokenizer)
valid_data = tokenize_data(valid_data, tokenizer)

print("训练数据分词完毕。")
print("验证数据分词完毕。")


训练数据分词完毕。
验证数据分词完毕。


In [34]:
import torch

class TranslationDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = TranslationDataset(train_data)
valid_dataset = TranslationDataset(valid_data)

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True)
valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=8)

print("训练数据加载器创建完毕。")
print("验证数据加载器创建完毕。")


训练数据加载器创建完毕。
验证数据加载器创建完毕。


In [None]:
from transformers import AutoModelForSeq2SeqLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType
import numpy as np

# 选择一个合适的预训练模型
model_name = "Helsinki-NLP/opus-mt-en-zh" # 示例：一个小型NMT模型
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# 配置LoRA（如果使用PEFT）
peft_config = LoraConfig(
    r=8, # LoRA 秩
    lora_alpha=16, # LoRA 缩放因子
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM, # 序列到序列语言模型任务
    target_modules=["q_proj", "v_proj"] # 目标模块，通常是注意力机制中的查询和值投影层
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters() # 打印可训练参数

model.to('cpu')

# 定义训练参数
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="bleu",
    greater_is_better=True,
    use_cpu=True,
)

# 定义评估指标计算函数
from evaluate import load
bleu_metric = load("bleu")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_labels = [[label] for label in decoded_labels] # 调整references格式
    bleu_result = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": bleu_result["bleu"]}

# 创建Trainer实例并进行训练
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

trainable params: 294,912 || all params: 78,238,208 || trainable%: 0.3769




RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
from evaluate import load
import numpy as np

def evaluate_model(model, tokenizer, test_dataset, max_length=128):
    """
    评估模型性能。

    Args:
        model: 训练好的模型。
        tokenizer: 分词器。
        test_dataset: 测试数据集。
        max_length (int, optional): 最大生成长度。

    Returns:
        dict: 包含评估指标的字典。
    """

    # 生成翻译结果
    predictions = []
    references = []
    model.eval() # 设置模型为评估模式
    for batch in test_dataset:
        with torch.no_grad():
            input_ids = batch['input_ids'].unsqueeze(0).to(model.device)
            generated_ids = model.generate(input_ids, max_length=max_length)
            decoded_preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
            predictions.extend(decoded_preds)

        labels = batch['labels'].numpy()
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)  # Replace masked label ids
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        references.extend(decoded_labels)

    # 计算评估指标
    bleu = load('bleu')
    chrf = load('chrf')
    comet = load('comet') # 需要安装 'unbabel-comet'

    bleu_score = bleu.compute(predictions=predictions, references=[[ref] for ref in references])['bleu']
    chrf_score = chrf.compute(predictions=predictions, references=[[ref] for ref in references])['score']
    comet_score = comet.compute(predictions=predictions, references=[[ref] for ref in references], sources=[[src] for src in source_texts])['mean_score']

    return {'bleu': bleu_score, 'chrf': chrf_score, 'comet': comet_score}