# 翻译任务

翻译属于Seq 2 Seq 任务，类似文本摘要

### 预处理

In [2]:
from datasets import load_dataset
# 这里我选择了中文-日文
raw_datasets  = load_dataset("kde4", lang1="ja", lang2="zh_CN",trust_remote_code=True)
print(raw_datasets ['train']['translation'][:3])
print(raw_datasets )
# 切割数据为训练集和测试集
split_datasets = raw_datasets['train'].train_test_split(train_size=0.9,seed=20)
split_datasets['validation'] = split_datasets.pop('test')
print(split_datasets)

[{'ja': '日本語翻訳：しのぱ', 'zh_CN': '开源软件国际化之简体中文组'}, {'ja': '日本語翻訳：しのぱ', 'zh_CN': '开源软件国际化之简体中文组'}, {'ja': '日本語訳：Sinopa sinohara@kde.gr.jp', 'zh_CN': 'Funda Wang'}]
DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 118258
    })
})
DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 106432
    })
    validation: Dataset({
        features: ['id', 'translation'],
        num_rows: 11826
    })
})


In [2]:
# !pip install sentencepiece

In [3]:
from transformers import AutoModelForSeq2SeqLM,AutoTokenizer

model_checkpoint = 'Helsinki-NLP/opus-mt-tc-big-zh-ja'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)



In [4]:
print(dir(tokenizer))# 查看所有属性和方法
print(tokenizer.source_lang)
print(tokenizer.target_lang)
print(split_datasets['train'][0]['translation']['ja'])
print(split_datasets['train'][0]['translation']['zh_CN'])

# 尝试tokenize
inputs = tokenizer(split_datasets['train'][0]['translation']['zh_CN'],text_target=split_datasets['train'][0]['translation']['ja'])
print(inputs)
print(tokenizer.convert_ids_to_tokens(inputs['input_ids']))

zh
ja
完全に拡大したときに描画する最も暗い恒星の等級を指定します。
最小视野时， 显示的最弱星等 。
{'input_ids': [3, 5143, 0, 31527, 19, 0, 0, 3352, 899, 703, 3, 4, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [15693, 1561, 20283, 17900, 35, 388, 6089, 16781, 6, 26173, 9825, 4, 2]}
['▁', '最小', '<unk>', '时', ',', '<unk>', '<unk>', '弱', '星', '等', '▁', '。', '</s>']


In [5]:
# 预处理函数
max_length = 128
# 处理两种语言的句子放入tokenizer
def preprocess_function(examples):
    inputs = [ex["zh_CN"] for ex in examples["translation"]]
    targets = [ex["ja"] for ex in examples["translation"]]
    model_inputs = tokenizer(
        inputs, text_target=targets, max_length=max_length, truncation=True
    )
    return model_inputs

tokenized_datasets = split_datasets.map(
    preprocess_function,batched=True,remove_columns=split_datasets['train'].column_names
)
print(tokenized_datasets)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 106432
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 11826
    })
})


### 模型和数据整理器

In [13]:
from transformers import AutoModelForSeq2SeqLM,DataCollatorForSeq2Seq

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer,model=model)


In [14]:
# 测试
batch = data_collator([tokenized_datasets['train'][i] for i in range(1,3)])
print(batch.keys())
print(batch['labels'])
print(batch['decoder_input_ids']) # 解码器输入的是目标语言的label
print(tokenizer.decode(batch['input_ids'][0]))
print(tokenizer.decode(batch['labels'][0]))            # 解码器输出对比
print(tokenizer.decode(batch['decoder_input_ids'][0])) # 解码器输入的是目标语言的label

dict_keys(['input_ids', 'attention_mask', 'labels', 'decoder_input_ids'])
tensor([[11244,  3832, 13541,    18,   342,   422,  6615,     3,  5404, 25778,
             2],
        [ 6071,   996, 23722,     4,     2,  -100,  -100,  -100,  -100,  -100,
          -100]])
tensor([[32000, 11244,  3832, 13541,    18,   342,   422,  6615,     3,  5404,
         25778],
        [32000,  6071,   996, 23722,     4,     2, 32000, 32000, 32000, 32000,
         32000]])
未<unk> <unk> : compaction <unk> </s> <pad> <pad>
未読件数:compaction status</s>
<pad> 未読件数:compaction status


### BLEU评估

In [None]:
# !pip install sacrebleu
# !pip install mecab-python3 unidic-lite  # 专门分词日文的工具

import evaluate
import MeCab
metric = evaluate.load("sacrebleu")
# 初始化mecab分词器
tagger = MeCab.Tagger("-Owakati")

In [84]:
# 评估指标使用BLEU，但是一个句子可能有多种翻译方式，所以预测结果应该是一个句子列表，而参考应该是一个句子列表的列表
print(tagger.parse('実を言うと私はコンピュータ, 関連の仕事がとても好きです。').strip())
predictions = [
    tagger.parse('実を言うと私はコンピュータ, 関連の仕事がとても好きです。').strip()
]
references = [
    [[tagger.parse('実はコンピュータ関連の仕事が本当に好きです。').strip()]]
]
metric.compute(predictions=predictions,references=references)

実 を 言う と 私 は コンピュータ , 関連 の 仕事 が とても 好き です 。


{'score': 20.68720601025941,
 'counts': [8, 5, 2, 1],
 'totals': [16, 15, 14, 13],
 'precisions': [50.0,
  33.333333333333336,
  14.285714285714286,
  7.6923076923076925],
 'bp': 1.0,
 'sys_len': 16,
 'ref_len': 14}

In [87]:
import numpy as np

# 接收预测的字符id序列和真实标签的id序列，评估结果
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # 如果模型返回的内容超过了预测的logits
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # 由于我们无法解码 -100,因此将标签中的 -100 替换掉
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # 一些简单的后处理
    decoded_preds = [tagger.parse(pred).strip() for pred in decoded_preds]
    decoded_labels = [[tagger.parse(label).strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

### Trainer微调

In [88]:
from transformers import Seq2SeqTrainingArguments,Seq2SeqTrainer

args = Seq2SeqTrainingArguments(
    f"./my_model/marian-finetuned-kde4-zh_CN-to-ja",
    evaluation_strategy='no',
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True, # 可以自动调用Generate方法生成句子而不是词表概率分布序列，能增强训练
    fp16=True,
    push_to_hub=False,
)

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


  trainer = Seq2SeqTrainer(


In [89]:
# 目前分数
trainer.evaluate(max_length=32) # 控制生成文本的最大长度

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


  0%|          | 0/185 [00:00<?, ?it/s]

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

{'eval_loss': 6.214041233062744,
 'eval_model_preparation_time': 0.0037,
 'eval_bleu': 0.06925426983533732,
 'eval_runtime': 1546.9731,
 'eval_samples_per_second': 7.645,
 'eval_steps_per_second': 0.12}

In [90]:
# 训练
trainer.train()

  0%|          | 0/9978 [00:00<?, ?it/s]

{'loss': 4.0193, 'grad_norm': 6.290567874908447, 'learning_rate': 1.8997795149328523e-05, 'epoch': 0.15}
{'loss': 3.3635, 'grad_norm': 6.268918037414551, 'learning_rate': 1.7995590298657048e-05, 'epoch': 0.3}
{'loss': 3.1024, 'grad_norm': 5.958572864532471, 'learning_rate': 1.699338544798557e-05, 'epoch': 0.45}
{'loss': 2.8858, 'grad_norm': 6.385309219360352, 'learning_rate': 1.5991180597314094e-05, 'epoch': 0.6}
{'loss': 2.748, 'grad_norm': 7.46329402923584, 'learning_rate': 1.4988975746642616e-05, 'epoch': 0.75}
{'loss': 2.6116, 'grad_norm': 5.865606784820557, 'learning_rate': 1.3986770895971137e-05, 'epoch': 0.9}




{'loss': 2.462, 'grad_norm': 5.529031753540039, 'learning_rate': 1.298456604529966e-05, 'epoch': 1.05}
{'loss': 2.2769, 'grad_norm': 4.721032619476318, 'learning_rate': 1.1982361194628182e-05, 'epoch': 1.2}
{'loss': 2.195, 'grad_norm': 6.690873622894287, 'learning_rate': 1.098216075365805e-05, 'epoch': 1.35}
{'loss': 2.1695, 'grad_norm': 5.7417683601379395, 'learning_rate': 9.979955902986571e-06, 'epoch': 1.5}
{'loss': 2.1156, 'grad_norm': 5.667165279388428, 'learning_rate': 8.977751052315094e-06, 'epoch': 1.65}
{'loss': 2.1042, 'grad_norm': 6.3196001052856445, 'learning_rate': 7.975546201643617e-06, 'epoch': 1.8}
{'loss': 2.0832, 'grad_norm': 6.4377570152282715, 'learning_rate': 6.975345760673482e-06, 'epoch': 1.95}
{'loss': 1.9526, 'grad_norm': 5.914865016937256, 'learning_rate': 5.973140910002004e-06, 'epoch': 2.1}
{'loss': 1.8991, 'grad_norm': 6.0430402755737305, 'learning_rate': 4.970936059330528e-06, 'epoch': 2.25}
{'loss': 1.8788, 'grad_norm': 6.473100185394287, 'learning_rate':

TrainOutput(global_step=9978, training_loss=2.3710779315079114, metrics={'train_runtime': 2064.6036, 'train_samples_per_second': 154.652, 'train_steps_per_second': 4.833, 'total_flos': 1.6868377028984832e+16, 'train_loss': 2.3710779315079114, 'epoch': 3.0})

In [91]:
trainer.evaluate(max_length=32) # 控制生成文本的最大长度

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


  0%|          | 0/185 [00:00<?, ?it/s]

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

{'eval_loss': 2.0260016918182373,
 'eval_model_preparation_time': 0.0037,
 'eval_bleu': 21.811598109485114,
 'eval_runtime': 432.9502,
 'eval_samples_per_second': 27.315,
 'eval_steps_per_second': 0.427,
 'epoch': 3.0}

In [None]:
model.save_pretrained('./my_model/translation')

: 

In [6]:
from transformers import AutoTokenizer,AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained('./my_model/translation')


In [33]:
import torch
outputs = model(**batch)
outputs = torch.argmax(outputs.logits,dim=-1)
print(tokenizer.decode(batch['input_ids'][0]))
print(tokenizer.decode(batch['labels'][0]))
print(tokenizer.decode(outputs[0]))

未<unk> <unk> : compaction <unk> </s> <pad> <pad>
未読件数:compaction status</s>
読の 1ments</s> </s> 読みtus</s>


In [35]:
from transformers import pipeline
tokenizer.save_pretrained("./my_model/translation")
translator = pipeline('translation',model='./my_model/translation')

Device set to use cuda:0


In [43]:
translator("开源软件国际化之简体中文组")

[{'translation_text': '日本語翻訳:しのぱ'}]

### pytorch训练

In [None]:
from torch.utils.data import DataLoader
from torch.optim import AdamW
from accelerate import Accelerator
from transformers import get_scheduler
from tqdm.auto import tqdm
import torch

tokenized_datasets.set_format("torch")
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], collate_fn=data_collator, batch_size=8
)

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

optimizer = AdamW(model.parameters(), lr=2e-5)

accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
output_dir = "./my_model/marian-finetuned-kde4-zh_CN-to-ja"

# 新增：使用MeCab分词
def tokenize_ja(text):
    return tagger.parse(text).strip()

def postprocess(predictions, labels):
    predictions = predictions.cpu().numpy()
    labels = labels.cpu().numpy()

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # 替换标签中的 -100,因为我们无法解码它们。
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # 一些简单的后处理
    decoded_preds = [tokenize_ja(pred) for pred in decoded_preds]
    decoded_labels = [[tokenize_ja(label)] for label in decoded_labels]
    return decoded_preds, decoded_labels

progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # 训练
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # 评估
    model.eval()
    for batch in tqdm(eval_dataloader):
        with torch.no_grad():
            generated_tokens = accelerator.unwrap_model(model).generate(
                batch["input_ids"],
                attention_mask=batch["attention_mask"],
                max_length=128,
            )
        labels = batch["labels"]

        # 需要填充预测和标签才能调用gather()
        generated_tokens = accelerator.pad_across_processes(
            generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
        )
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(generated_tokens)
        labels_gathered = accelerator.gather(labels)

        decoded_preds, decoded_labels = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=decoded_preds, references=decoded_labels)

    results = metric.compute()
    print(f"epoch {epoch}, BLEU score: {results['score']:.2f}")

    # 保存和上传
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
