In [1]:
from datasets import load_dataset, load_metric
from transformers import (T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer,  DataCollatorForSeq2Seq)
import torch
import numpy as np
import matplotlib

Get Data

In [2]:
data_files = {
    'train': 't5_datasets/train.jsonl',
    'test': 't5_datasets/test.jsonl',
    'validation': 't5_datasets/validation.jsonl'
}

dataset = load_dataset('json', data_files=data_files)
train_dataset = dataset['train']
test_dataset = dataset['test']
validation_dataset = dataset['validation']

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'output'],
        num_rows: 3584
    })
    test: Dataset({
        features: ['input', 'output'],
        num_rows: 768
    })
    validation: Dataset({
        features: ['input', 'output'],
        num_rows: 768
    })
})

Get the token and the T5 model

In [4]:
model_name = 'google-t5/t5-base'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


preprocess the data

In [5]:
def preprocess_function(data_p):
    prefix = "complete: "
    max_length = 512
    inputs = [prefix + d for d in data_p['input']]
    targets = [d for d in data_p['output']]
    model_input = tokenizer(inputs, max_length=max_length, padding="max_length", truncation=True)
    
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_length, padding="max_length", truncation=True)
        
    model_input['labels'] = labels['input_ids']  
    return model_input

In [6]:
trained_data = train_dataset.map(preprocess_function, batched=True)
validation_data = validation_dataset.map(preprocess_function, batched=True)
test_data = test_dataset.map(preprocess_function, batched=True)

In [7]:
type(validation_data)

datasets.arrow_dataset.Dataset

training arguments

In [8]:
batch_size = 16
epochs = 5
max_length = 512
output_dir = 't5_data/results'
logs_dir = 't5_data/logs'



args = Seq2SeqTrainingArguments(
    output_dir = output_dir,
    evaluation_strategy='steps',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=3e-5,
    num_train_epochs=epochs,
    logging_dir=logs_dir,
    eval_steps=200,
    logging_steps=200,
    save_steps=200,
    save_strategy="steps",
    weight_decay=0.01,
    load_best_model_at_end=True,
    fp16=True,
    #predict_with_generate=True,
    warmup_steps=500
)

computing the metrics

In [9]:
metric1 = load_metric("bleu")


data_collator = DataCollatorForSeq2Seq(tokenizer)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    decoded_labels = [[label] for label in decoded_labels]
    
    result = metric1.compute(predictions=predictions, references=decoded_labels)
    return {"bleu": result["score"]}
    

  metric = load_metric("bleu")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


The trainer

In [10]:
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset= trained_data,
    eval_dataset= validation_data,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


GPU

In [11]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

Train the model

In [12]:
trainer.train()

Step,Training Loss,Validation Loss
200,4.3016,0.289887
400,0.1481,0.065977
600,0.0734,0.049841
800,0.0541,0.040206
1000,0.0511,0.03499


Step,Training Loss,Validation Loss
200,4.3016,0.289887
400,0.1481,0.065977
600,0.0734,0.049841
800,0.0541,0.040206
1000,0.0511,0.03499


TrainOutput(global_step=1120, training_loss=0.831597021647862, metrics={'train_runtime': 21514.5043, 'train_samples_per_second': 0.833, 'train_steps_per_second': 0.052, 'total_flos': 1.09125253988352e+16, 'train_loss': 0.831597021647862, 'epoch': 5.0})

save model  and tokenizer

In [14]:
model_path = 't5_data/model'
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('t5_data/model\\tokenizer_config.json',
 't5_data/model\\special_tokens_map.json',
 't5_data/model\\spiece.model',
 't5_data/model\\added_tokens.json')

inferencing on test data