In [24]:
import torch
from transformers import MarianMTModel, MarianTokenizer, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
import numpy as np
from evaluate import load
import re

# 指定模型名称
model_name = "Helsinki-NLP/opus-mt-en-zh"  # 英文到中文的翻译模型

# 加载模型和分词器
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

def clean_translation(text):
    text = re.sub(r"\s+", " ", text)  # 去除多余的空格
    text = re.sub(r"\s([?.!\"'])", r"\1", text)  # 去除句末的多余空格
    return text

def translate(text, tokenizer, model):
    inputs = tokenizer.encode(text, return_tensors="pt", truncation=True)
    outputs = model.generate(inputs, max_length=50, num_beams=8, early_stopping=True)
    translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return clean_translation(translated_text)

print(translate("The patient was prescribed PO metformin 500mg BID for HbA1c control.", tokenizer, model))


为HbA1c控制,向病人开具了PO 500mg BID的PO imforkin 500mg BID。


In [20]:
data = {
    "translation": [
        {"en": "This is a small example.", "zh": "这是一个小例子。"},
        {"en": "The cat is on the mat.", "zh": "猫在垫子上。"},
        {"en": "Please pass the salt.", "zh": "请递一下盐。"},
        {"en": "Where is the library?", "zh": "图书馆在哪里？"},
    ]
}
from datasets import Dataset
dataset = Dataset.from_dict(data)

def tokenize_function(examples):
    inputs = [ex["en"] for ex in examples["translation"]]
    targets = [ex["zh"] for ex in examples["translation"]]
    inputs = tokenizer(inputs, return_tensors="pt", padding="max_length", truncation=True, max_length=128)
    targets = tokenizer(targets, return_tensors="pt", padding="max_length", truncation=True, max_length=128)
    return {
        "input_ids": inputs.input_ids,
        "attention_mask": inputs.attention_mask,
        "labels": targets.input_ids,
    }


tokenized_datasets = dataset.map(tokenize_function, batched=True)
train_dataset = tokenized_datasets
eval_dataset = tokenized_datasets

print(tokenized_datasets[0])

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

{'translation': {'en': 'This is a small example.', 'zh': '这是一个小例子。'}, 'input_ids': [208, 32, 13, 1037, 1146, 6, 0, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000], 'attention_mask': [1, 1, 1, 1, 1, 1, 

In [17]:
metric = load("sacrebleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    preds = np.array(preds)  # 转换为 numpy 数组
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

# 4. 完全微调
training_args_full = TrainingArguments(
    output_dir="./full-tuned-marian-en-zh",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=2e-5,
    num_train_epochs=20,  # 可以调整
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="bleu",
    report_to="none" #  避免与 LoRA 的回调冲突
)

trainer_full = Trainer(
    model=model,
    args=training_args_full,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer_full.train()
full_tuned_results = trainer_full.evaluate()

  trainer_full = Trainer(


Epoch,Training Loss,Validation Loss


TypeError: int() argument must be a string, a bytes-like object or a real number, not 'list'