<a href="https://colab.research.google.com/github/JieShenAI/torch/blob/main/huggingface/example/T5/t5_math_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install SentencePiece

In [2]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer, AdamW, get_linear_schedule_with_warmup
import random

In [6]:
import time

In [3]:
config = {
    # "lr": 5e-5,
    "batch_size": 256,
    "max_length": 8,
    "model_name": "t5-base",
    "epochs": 60,
}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
def generate_data():
    # 构造训练数据
    data = []
    for i in range(2048):
        a = random.randint(0, 9)
        b = random.randint(0, 9)
        op = random.choice(["+", "-"])
        if op == "+":
            result = a + b
        else:
            result = a - b
        data.append((f"{a} {op} {b}", str(result)))

    return data

In [5]:
# 自己定义输入的数据
class MyDataset(Dataset):
    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
 
    def __len__(self):
        return len(self.data)
 
    def __getitem__(self, idx):
        input_text, target_text = self.data[idx]
        inputs = self.tokenizer(input_text,
                      return_tensors="pt",
                      padding="max_length",
                      max_length=self.max_length,
                      truncation=True)
        
        targets = self.tokenizer(target_text,
                      return_tensors="pt",
                      padding="max_length",
                      max_length=self.max_length,
                      truncation=True)
        
        return {
            "input_ids": inputs["input_ids"].squeeze(),
            "attention_mask": inputs["attention_mask"].squeeze(),
            "labels": targets["input_ids"].squeeze()
        }

In [7]:
def train(model,
          tokenizer,
          train_loader,
          optimizer,
          scheduler,
          device=device,
          epochs=config["epochs"],
          save=False):
    model.train()
    T = 0
    for epoch in range(epochs):
        epoch_loss = 0
        start = time.time()
        for batch in train_loader:
            for k, v in batch.items():
                batch[k] = v.to(device)

            optimizer.zero_grad()
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            epoch_loss += loss.item()
            optimizer.step()
            scheduler.step()
            # torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        end = time.time()
        T += end - start
        print(f"Epoch: {epoch}, Loss: {epoch_loss / len(train_loader):.6f}, Time: {end-start:.2f}")
    print("总计耗时:", T)

In [8]:
# 使用 T5-small 模型。 如果想用更大的模型，可以将 "t5-small" 替换为例如 "t5-base" 或 "t5-large"
model_name = config['model_name']
model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)
tokenizer = T5Tokenizer.from_pretrained(model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [9]:
max_length = config['max_length']
train_dataset = MyDataset(generate_data(), tokenizer, max_length)
train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True)

In [10]:
optimizer = AdamW(model.parameters(),
                  # lr = 5e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )
total_steps = config['epochs'] * len(train_loader)
# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                        num_warmup_steps = 0, # Default value in run_glue.py
                        num_training_steps = total_steps)



In [None]:
train(model, tokenizer, train_loader, optimizer, scheduler)

```
Epoch: 0, Loss: 3.292939, Time: 6.602124214172363:.2f
Epoch: 1, Loss: 0.476708, Time: 4.942367792129517:.2f
Epoch: 2, Loss: 0.351704, Time: 4.672595739364624:.2f
Epoch: 3, Loss: 0.310444, Time: 4.753790616989136:.2f
Epoch: 4, Loss: 0.265826, Time: 4.661877632141113:.2f
Epoch: 5, Loss: 0.226169, Time: 4.674436092376709:.2f
Epoch: 6, Loss: 0.190767, Time: 4.6789538860321045:.2f
Epoch: 7, Loss: 0.165914, Time: 4.8687825202941895:.2f
Epoch: 8, Loss: 0.145451, Time: 4.667200803756714:.2f
Epoch: 9, Loss: 0.123473, Time: 4.660913467407227:.2f
Epoch: 10, Loss: 0.109209, Time: 4.689663887023926:.2f
Epoch: 11, Loss: 0.098137, Time: 4.678196907043457:.2f
Epoch: 12, Loss: 0.089378, Time: 4.6722846031188965:.2f
Epoch: 13, Loss: 0.081416, Time: 4.676730155944824:.2f
Epoch: 14, Loss: 0.072970, Time: 4.975130319595337:.2f
Epoch: 15, Loss: 0.067008, Time: 4.663740634918213:.2f
Epoch: 16, Loss: 0.055806, Time: 4.665766477584839:.2f
Epoch: 17, Loss: 0.050228, Time: 4.696298837661743:.2f
Epoch: 18, Loss: 0.043638, Time: 4.654682397842407:.2f
Epoch: 19, Loss: 0.038652, Time: 4.673689842224121:.2f
Epoch: 20, Loss: 0.029480, Time: 4.843160152435303:.2f
Epoch: 21, Loss: 0.021585, Time: 4.701387405395508:.2f
Epoch: 22, Loss: 0.015126, Time: 4.660534620285034:.2f
Epoch: 23, Loss: 0.011189, Time: 4.653157472610474:.2f
Epoch: 24, Loss: 0.008453, Time: 4.74365234375:.2f
Epoch: 25, Loss: 0.005319, Time: 4.697196960449219:.2f
Epoch: 26, Loss: 0.003726, Time: 4.678369522094727:.2f
Epoch: 27, Loss: 0.003522, Time: 4.810733318328857:.2f
Epoch: 28, Loss: 0.003389, Time: 4.682125806808472:.2f
Epoch: 29, Loss: 0.003568, Time: 4.673954725265503:.2f
Epoch: 30, Loss: 0.002386, Time: 4.6613709926605225:.2f
Epoch: 31, Loss: 0.002095, Time: 4.779157638549805:.2f
Epoch: 32, Loss: 0.001470, Time: 4.681389808654785:.2f
Epoch: 33, Loss: 0.001789, Time: 4.681319236755371:.2f
Epoch: 34, Loss: 0.002169, Time: 4.770907402038574:.2f
Epoch: 35, Loss: 0.001334, Time: 4.661845922470093:.2f
Epoch: 36, Loss: 0.001482, Time: 4.654255151748657:.2f
Epoch: 37, Loss: 0.001460, Time: 4.685880899429321:.2f
Epoch: 38, Loss: 0.001086, Time: 4.755070686340332:.2f
Epoch: 39, Loss: 0.000825, Time: 4.674823045730591:.2f
Epoch: 40, Loss: 0.001633, Time: 4.810059309005737:.2f
Epoch: 41, Loss: 0.001298, Time: 4.726668834686279:.2f
Epoch: 42, Loss: 0.000710, Time: 4.674699783325195:.2f
Epoch: 43, Loss: 0.000646, Time: 4.750946044921875:.2f
Epoch: 44, Loss: 0.001006, Time: 4.669639825820923:.2f
Epoch: 45, Loss: 0.000591, Time: 4.77149224281311:.2f
Epoch: 46, Loss: 0.000799, Time: 4.6659533977508545:.2f
Epoch: 47, Loss: 0.000553, Time: 4.829089164733887:.2f
Epoch: 48, Loss: 0.000549, Time: 4.681228160858154:.2f
Epoch: 49, Loss: 0.000702, Time: 4.673107862472534:.2f
Epoch: 50, Loss: 0.000487, Time: 4.700106143951416:.2f
```

In [13]:
@torch.no_grad()
def predict_demo(text):
    model.eval()
    inputs = tokenizer(text,
                return_tensors="pt",
                padding="max_length",
                max_length=config["max_length"],
                truncation=True)       
    inputs = inputs.to(device)
    outputs = model.generate(**inputs)
    # print(outputs)
    return tokenizer.decode(outputs[0])
predict_demo("1 - 2")



'<pad> -1</s>'

In [15]:
predict_demo("6 + 9")

'<pad> 15</s>'

In [16]:
predict_demo("6 - 9")

'<pad> -3</s>'

In [17]:
predict_demo("7 + 9")

'<pad> 16</s>'

In [None]:
# 保存模型
# model.save_pretrained("trained_t5")
# tokenizer.save_pretrained("trained_t5")