# 1 数据预处理

In [1]:
import os
import torch
from torch.utils.data import TensorDataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [2]:
data_path = "./data/hsk_1_4.txt"

def read_data(path):
    with open(path, "r", encoding="utf-8") as f:
        lines = f.readlines()
    return lines

def handle_data(path):
    lines = read_data(path)
    '''
    english:  A gust of wind blew the door shut.
    hsk: 1
    mandarin: 一阵大风吹来，把门关上了。
    pinyin: yī zhèn dà fēng chuī lái， bǎ mén guān shàng le。
    --
    '''
    input = []
    target = []
    for line in lines:
        if "english:" in line:
            input.append("translate English to Chinese: " + line[8:].strip())
        elif "mandarin:" in line:
            target.append(line[9:].strip())
    return input, target

tokenizer_name = "google/mt5-small"
model_name = "google/mt5-small"
tokenizer = T5Tokenizer.from_pretrained(tokenizer_name, local_files_only=True)

input, target = handle_data(data_path)
encoded_input = tokenizer.batch_encode_plus(input, max_length=128, padding="max_length", truncation=True, return_tensors="pt")
encoded_target = tokenizer.batch_encode_plus(target, max_length=128, padding="max_length", truncation=True, return_tensors="pt").input_ids
dataset = TensorDataset(encoded_input["input_ids"], encoded_input["attention_mask"], encoded_target)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


# 2 模型训练

In [3]:
epochs, batch_size = 5, 16
lr = 5e-5
num_workers = 8
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
from tqdm import tqdm

model = T5ForConditionalGeneration.from_pretrained(model_name, num_labels=len(tokenizer.get_vocab()), local_files_only=True)
model.to(device)

train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)

loss = torch.nn.CrossEntropyLoss()
trainer = torch.optim.Adam(model.parameters(), lr=lr)

max_norm = 1

for epoch in range(epochs):
    model.train()
    bar = tqdm(train_loader)
    bar.set_description(f"epoch: {epoch + 1}")
    for i, batch in enumerate(bar):
        input_ids, attention_mask, labels = batch[0].to(device), batch[1].to(device), batch[2].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        trainer.zero_grad()
        loss_value = outputs.loss
        loss_value.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
        trainer.step()
        bar.set_postfix(loss=loss_value.item())


You are using a model of type mt5 to instantiate a model of type t5. This is not supported for all configurations of models and can yield errors.
epoch: 1:   0%|          | 0/1398 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
epoch: 1: 100%|██████████| 1398/1398 [03:06<00:00,  7.50it/s, loss=0.896]
epoch: 2: 100%|██████████| 1398/1398 [03:08<00:00,  7.43it/s, loss=0.461]
epoch: 3: 100%|██████████| 1398/1398 [03:11<00:00,  7.29it/s, loss=0.721]
epoch: 4: 100%|██████████| 1398/1398 [03:12<00:00,  7.27it/s, loss=0.618]
epoch: 5: 100%|██████████| 1398/1398 [03:10<00:00,  7.35it/s, loss=0.384]


# 3 效果验证

In [16]:
from  transformers import T5Config

sentences = ["A black swan is rare.",
             "A breeder means a person who breeds animals.",
             "A German scientist interrupted me and asked if I came from China.",
             "How old are you."
             ]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 使用训练完毕的模型参数进行预测输出结果
model_saved = "models/en_zh_translation-mt5-epoch5.pth"
config = T5Config.from_pretrained(model_name, num_labels=len(tokenizer.get_vocab()), local_files_only=True)
model = T5ForConditionalGeneration(config)
# model = T5ForConditionalGeneration.from_pretrained(model_saved, local_model_only=True)
model.load_state_dict(torch.load(model_saved, weights_only= True))
model.eval()
model.to(device)

for i, sentence in enumerate(sentences):
    print(f"en: {sentence}")
    input_ids = tokenizer("translate English to Chinese: " + sentence, return_tensors="pt", max_length=128, padding="max_length", truncation=True).input_ids
    input_ids = input_ids.to(device)

    outputs = model.generate(input_ids)
    # print(outputs)
    print(f"zh: {tokenizer.decode(outputs[0], skip_special_tokens=True)}\n")


You are using a model of type mt5 to instantiate a model of type t5. This is not supported for all configurations of models and can yield errors.


en: A black swan is rare.
zh: 黑白的斑斓是稀罕的。

en: A breeder means a person who breeds animals.
zh: 驯养者是驯养动物的驯养者

en: A German scientist interrupted me and asked if I came from China.
zh: 一位德国科学家向我问我是否来自中国。

en: How old are you.
zh: 你怎样老?

