In [1]:
import os
from datasets import load_dataset
path = os.path.join("..","dataset","tiny_stories","en-zh")
data = load_dataset("json",data_files={"train":os.path.join(path,"train.jsonl"),"test":os.path.join(path,"valid.jsonl")})

In [2]:
from transformers import AutoTokenizer

In [3]:
tokenizer = AutoTokenizer.from_pretrained("google/mt5-base")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [4]:
tokenizer.pad_token
tokenizer.padding_side

'right'

In [5]:
tokenizer.model_max_length = 256

In [6]:
from transformers import AutoModelForSeq2SeqLM,Seq2SeqTrainingArguments,Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-base")

In [7]:
def preprocess_function(examples):
    inputs = ["Translate English into  Chinese:" + example['text'] for example in examples['src_text']]
    tgts = [ example for example in examples['tgt_text'] ]
    model_inputs = tokenizer(inputs,text_target=tgts,padding = True,truncation=True)  
    return model_inputs

In [8]:
tokenzied_data = data.map(preprocess_function,batched=True,remove_columns=['src_text','tgt_text'])

Map:   0%|          | 0/500000 [00:00<?, ? examples/s]

In [9]:
tokenzied_data

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 500000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2000
    })
})

In [15]:
tokenizer.decode(tokenzied_data['train'][0]['input_ids'])

'Translate English into Chinese:Once upon a time, there was a little girl named Lily. She loved to talk to her friends, and she loved to play with her toys. One day, she found a lemon in her kitchen. Lily thought the lemon was a toy, so she took it to her room to play with it. Lily started to feel helpless because the lemon was not fun like her other toys. She tried to talk to the lemon, but it did not talk back. Lily was sad and didn\'t know what to do. Then, something unexpected happened. The lemon started to grow bigger and bigger! Suddenly, the lemon turned into a big, friendly lemon man. He could talk, and he was not helpless at all! He said, "Hi Lily, I am Mr. Lemon. I was hiding in your kitchen, and now I am here to play with you." Lily was so happy and surprised. They played together all day long, and Lily had a new, fun friend.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'

In [16]:
tokenizer.decode(tokenzied_data['train'][0]['labels'])

'从前,有一个叫莉莉的小女孩。她喜欢和她的朋友聊天,她也喜欢玩她的玩具。一天,她在厨房里发现了一个柠檬。莉莉以为柠檬是玩具,所以她把它带到房间里玩。莉莉开始感到无助,因为柠檬不像她的其他玩具那样有趣。她试着和柠檬说话,但它不顶嘴。莉莉很难过,不知道该怎么办。然后,意想不到的事情发生了。柠檬开始变得越来越大!突然,柠檬变成了一个又大又友好的柠檬人。他会说话,一点也不无助!他说:“嗨,莉莉,我是柠檬先生。我躲在你的厨房里,现在我来和你一起玩。”莉莉既高兴又惊讶。他们整天都在一起玩,莉莉有了一个有趣的新朋友。</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'

In [17]:
tokenzied_data['test']

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 2000
})

In [18]:
import bleu
import numpy as np
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()for label in labels]]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    
    if isinstance(preds, tuple):
        preds = preds[0]
        
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    hypothesis, pre_references = postprocess_text(decoded_preds, decoded_labels)
    for i in range(len(pre_references)):
        assert len(hypothesis) == len(pre_references[i])

    references = []
    for i in range(len(hypothesis)):
        ref_for_instance = []
        for j in range(len(pre_references)):
            ref_for_instance.append(pre_references[j][i])
        references.append(ref_for_instance)
    assert len(references) == len(pre_references)*len(hypothesis)

    tokenized_hyps = [x.split() for x in hypothesis]
    tokenized_refs = [[x.split() for x in reference] for reference in references]
    
    result = {}
    result['BLEU'] = bleu.corpus_bleu(tokenized_refs,tokenized_hyps)
    return result

In [19]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer,model = model)

In [20]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [23]:
epoch_nums = 2
training_args = Seq2SeqTrainingArguments(
    output_dir="mT5ForTranslate_English2Chinese",
    save_strategy="epoch",
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    weight_decay=0.01,
    num_train_epochs=epoch_nums,
    predict_with_generate=True,
    warmup_steps=0.06 * (len(tokenzied_data['train']) * epoch_nums),
    fp16=True,
    logging_strategy="steps",
    logging_steps=100,
    save_total_limit=1,
    load_best_model_at_end=True,
    push_to_hub=True,
)

In [24]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenzied_data["train"],
    eval_dataset=tokenzied_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss
