In [1]:
from datasets import load_dataset

# Keep the downloaded dataset under its own name so this cell can be re-run
# without splitting an already-split object.
raw_books = load_dataset("opus_books", "en-ru")

# Hold out 20% of the "train" split for evaluation. The seed is fixed so that
# the train/test partition is the same after a kernel restart.
books = raw_books["train"].train_test_split(test_size=0.2, seed=42)

# Show one training example as a sanity check.
books["train"][0]

Reusing dataset opus_books (C:\Users\chris\.cache\huggingface\datasets\opus_books\en-ru\1.0.0\e8f950a4f32dc39b7f9088908216cd2d7e21ac35f893d04d39eb594746af2daf)
Loading cached split indices for dataset at C:\Users\chris\.cache\huggingface\datasets\opus_books\en-ru\1.0.0\e8f950a4f32dc39b7f9088908216cd2d7e21ac35f893d04d39eb594746af2daf\cache-e5dd5cfa27ad0d1b.arrow and C:\Users\chris\.cache\huggingface\datasets\opus_books\en-ru\1.0.0\e8f950a4f32dc39b7f9088908216cd2d7e21ac35f893d04d39eb594746af2daf\cache-f3fe11a8a7d1888b.arrow


{'id': '11639',
 'translation': {'en': "'He wrote with chalk.", 'ru': '-- Он писал мелом.'}}

In [2]:
from transformers import AutoTokenizer

# Slow (non-fast) tokenizer for the same checkpoint that is fine-tuned below.
# NOTE(review): confirm that the t5-base vocabulary covers Cyrillic; if Russian
# targets mostly tokenize to the unknown token, a multilingual checkpoint is needed.
tokenizer = AutoTokenizer.from_pretrained("t5-base", use_fast=False)

# Keys into each example's "translation" dict.
source_lang = "en"
target_lang = "ru"

# Task instruction prepended to every source sentence. It must end with ": "
# because preprocess_function concatenates it directly with the sentence text;
# without the separator the instruction and the first word run together.
prefix = "translate English to Russian: "

def preprocess_function(examples):
    """Tokenize a batch of translation pairs for seq2seq training.

    Args:
        examples: A batch whose "translation" entry is a list of dicts keyed
            by language code (e.g. {"en": ..., "ru": ...}).

    Returns:
        The tokenizer encoding of the prefixed source sentences, with the
        target token ids stored under the "labels" key.
    """
    pairs = examples["translation"]
    inputs = [prefix + pair[source_lang] for pair in pairs]
    targets = [pair[target_lang] for pair in pairs]

    # Do not pad here. Padding at tokenization time fills the labels with the
    # pad token id, and those positions would then count toward the loss.
    # The DataCollatorForSeq2Seq used for training pads each batch dynamically
    # and pads labels with -100 so they are ignored by the loss.
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)

    # Encode the targets in target-tokenizer mode.
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=512, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Run the preprocessing over the split dataset, passing examples to
# preprocess_function in batches rather than one at a time.
tokenized_books = books.map(
    preprocess_function,
    batched=True,
)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Loading cached processed dataset at C:\Users\chris\.cache\huggingface\datasets\opus_books\en-ru\1.0.0\e8f950a4f32dc39b7f9088908216cd2d7e21ac35f893d04d39eb594746af2daf\cache-e7b436c41f8f9531.arrow
Loading cached processed dataset at C:\Users\chris\.cache\huggingface\datasets\opus_books\en-ru\1.0.0\e8f950a4f32dc39b7f9088908216cd2d7e21ac35f893d04d39eb594746af2daf\cache-bc05bfae7ba16dc1.arrow


In [3]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the pretrained seq2seq checkpoint that will be fine-tuned; it is the
# same "t5-base" checkpoint the tokenizer was loaded from.
model = AutoModelForSeq2SeqLM.from_pretrained(
    "t5-base"
)

In [4]:
from transformers import DataCollatorForSeq2Seq

# Collator that assembles tokenized examples into training batches; it is
# given both the tokenizer and the model being trained.
data_collator = DataCollatorForSeq2Seq(
    model=model,
    tokenizer=tokenizer,
)

In [5]:
# `os` is not imported by any earlier cell, so import it here; without this
# the cell raises NameError on a fresh kernel.
import os

# Turn off Weights & Biases logging for this run.
# NOTE(review): the library reports this environment variable as deprecated
# and suggests the `report_to` training argument instead; consider migrating.
os.environ["WANDB_DISABLED"] = "true"

# Hyperparameters for fine-tuning: evaluate once per epoch, keep at most
# three checkpoints under ./results, and train with mixed precision.
# NOTE(review): confirm the loss stays finite with fp16 for this checkpoint.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    fp16=True,
)

# Wire the model, hyperparameters, tokenized splits, tokenizer and collator
# into a single trainer object.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_books["train"],
    eval_dataset=tokenized_books["test"],
)

# Start fine-tuning; as the last expression of the cell, the call's return
# value is displayed when training finishes.
trainer.train()

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using amp half precision backend
The following columns in the training set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: id, translation. If id, translation are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 13996
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 10497
  0%|          | 16/10497 [00:20<5:00:13,  1.72s/it]

In [None]:
# Persist the fine-tuned model to the local "en-ru.t5" directory.
trainer.save_model("en-ru.t5")

# Uploading to the Hugging Face Hub is intentionally left disabled:
# trainer.push_to_hub()