In [1]:
import os
from datasets import load_dataset
path = os.path.join("..","dataset","tiny_stories","en-zh")
data = load_dataset("json",data_files={"train":os.path.join(path,"train.jsonl"),"test":os.path.join(path,"valid.jsonl")})

In [2]:
from transformers import MT5Tokenizer
from transformers import RobertaForSequenceClassification,RobertaTokenizer,get_linear_schedule_with_warmup
from datasets import load_dataset
from torch.utils.data import Dataset
import os
import random
import numpy as np
import evaluate
import torch
import argparse
from torch.utils.data import DataLoader
from tqdm import tqdm
from torch.functional import F
from torch.cuda.amp import autocast as autocast,GradScaler
import pandas as pd
from sklearn.metrics import f1_score
from torch.optim import AdamW
import bleu


In [3]:
tokenizer = MT5Tokenizer.from_pretrained("google/mt5-small")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [4]:
tokenizer.model_max_length = 512

In [5]:
from transformers import MT5ForConditionalGeneration

model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")

In [6]:
def preprocess_function(examples):
    inputs = ["Translate English into Chinese:" + example['text'] for example in examples['src_text']]
    tgts = [ example for example in examples['tgt_text'] ]
    model_inputs = tokenizer(inputs,text_target=tgts,padding="max_length",truncation=True)  
    return model_inputs

In [7]:
tokenzied_data = data.load_from_disk('tokenzied_data')
#tokenzied_data = data.map(preprocess_function,batched=True,remove_columns=['src_text','tgt_text'])

In [8]:
tokenzied_data

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 500000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2000
    })
})

In [9]:
tokenizer.decode(tokenzied_data['train'][0]['input_ids'])

'Translate English into Chinese:Once upon a time, there was a little girl named Lily. She loved to talk to her friends, and she loved to play with her toys. One day, she found a lemon in her kitchen. Lily thought the lemon was a toy, so she took it to her room to play with it. Lily started to feel helpless because the lemon was not fun like her other toys. She tried to talk to the lemon, but it did not talk back. Lily was sad and didn\'t know what to do. Then, something unexpected happened. The lemon started to grow bigger and bigger! Suddenly, the lemon turned into a big, friendly lemon man. He could talk, and he was not helpless at all! He said, "Hi Lily, I am Mr. Lemon. I was hiding in your kitchen, and now I am here to play with you." Lily was so happy and surprised. They played together all day long, and Lily had a new, fun friend.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pa

In [10]:
tokenizer.decode(tokenzied_data['train'][0]['labels'])

'从前,有一个叫莉莉的小女孩。她喜欢和她的朋友聊天,她也喜欢玩她的玩具。一天,她在厨房里发现了一个柠檬。莉莉以为柠檬是玩具,所以她把它带到房间里玩。莉莉开始感到无助,因为柠檬不像她的其他玩具那样有趣。她试着和柠檬说话,但它不顶嘴。莉莉很难过,不知道该怎么办。然后,意想不到的事情发生了。柠檬开始变得越来越大!突然,柠檬变成了一个又大又友好的柠檬人。他会说话,一点也不无助!他说:“嗨,莉莉,我是柠檬先生。我躲在你的厨房里,现在我来和你一起玩。”莉莉既高兴又惊讶。他们整天都在一起玩,莉莉有了一个有趣的新朋友。</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><p

In [11]:
def collate_fn(examples):
    return tokenizer.pad(examples, padding="max_length", return_tensors="pt")

In [12]:
train_batch_size = 4
eval_batch_size = 2
train_dataloader = DataLoader(tokenzied_data['train'],collate_fn=collate_fn,batch_size = train_batch_size)
eval_dataloader = DataLoader(tokenzied_data['test'] , collate_fn=collate_fn,batch_size = eval_batch_size)

In [13]:
num_epochs = 1
lr = 1e-5
optimizer = AdamW(params=model.parameters(), lr=lr)

# Instantiate scheduler

lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0.06 * (len(train_dataloader) * num_epochs),
    num_training_steps=(len(train_dataloader) * num_epochs),
)


In [17]:
device = "cuda"
model.to(device)
max_eval_codebleu = 0
iter_to_accumlate = 4
epochloss = []
for epoch in range(num_epochs):
    model.train()
    allloss = 0
    for step,batch in enumerate(tqdm(train_dataloader)):
        batch.to(device)
        outputs = model(**batch)
        loss = outputs.loss/iter_to_accumlate
        loss.backward()
        allloss += loss.item()
        epochloss.append(loss.item())
        if (step+1)%iter_to_accumlate==0:
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
        if (step+1)%(4*iter_to_accumlate) == 0:
            print("epoch",epoch,"step",step,"loss",loss,sep=" ")
    print("epoch",epoch,"trainLoss:",allloss/(len(train_dataloader)*train_batch_size))
    
    count = 0
    model.eval()
    validloss = []
    all_me = []
    for step,batch in enumerate(tqdm(eval_dataloader)):
        batch.to(device)
        with torch.no_grad():
            output = model.generate(**batch)
        # validloss += [output.loss.item()]
        references = []
        hypothesis = []
        label_strs = tokenizer.batch_decode(batch['labels'].cpu(),skip_special_tokens = True)
        output_strs = tokenizer.batch_decode(output.cpu())
        for i in range(len(label_strs)):
            all_me.append(bleu.sentence_bleu(references= [label_strs[i].split()], hypothesis=output_strs[i].split()))
        print(label_strs,output_strs,all_me)
    eval_bleu = sum(all_me)/len(all_me)
    print("epoch ",epoch,"bleu ",eval_bleu)
    if eval_bleu > max_eval_codebleu:
        min_eval_codebleu = eval_bleu
        model.save_pretrained("Jchew/mT5ForTranslate_English2Chinese")
        model.push_to_hub("Jchew/mT5ForTranslate_English2Chinese")
        tokenizer.push_to_hub("Jchew/mT5ForTranslate_English2Chinese")

  0%|          | 0/125000 [00:00<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.91 GiB. GPU 0 has a total capacity of 23.67 GiB of which 1.49 GiB is free. Process 21164 has 2.04 GiB memory in use. Including non-PyTorch memory, this process has 20.01 GiB memory in use. Of the allocated memory 18.03 GiB is allocated by PyTorch, and 887.81 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)