In [1]:
from datasets import load_from_disk
from transformers import (
    DataCollatorForSeq2Seq,
    RobertaTokenizer,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    T5ForConditionalGeneration,
)


In [2]:
# The tokenizer and the model weights are both loaded from the same
# pretrained checkpoint, so the identifier is spelled out only once.
MODEL_CHECKPOINT = 'Salesforce/codet5-small'

tokenizer = RobertaTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = T5ForConditionalGeneration.from_pretrained(MODEL_CHECKPOINT)

In [3]:
# Load the pre-split dataset saved on disk and drop the "idx" column, leaving
# the remaining columns untouched.
# NOTE: the torch output format (`with_format("torch")`) is left disabled.
DATASET_PATH = "/data/nicolasmaier/dataset/hf_split_dataset"

dataset = load_from_disk(DATASET_PATH).remove_columns(["idx"])
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 7931293
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 255994
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 476050
    })
})


In [9]:
# --- Training configuration --------------------------------------------------
# Tunable knobs are named here instead of repeating bare literals below.
BATCH_SIZE = 8            # per-device batch size, used for training and evaluation
STEPS_PER_ROUND = 3000    # interval (in optimizer steps) for eval, logging and saving
EVAL_SUBSET_SIZE = 3000   # validation rows scored at each evaluation round
EVAL_SUBSET_SEED = 42     # fixed seed so every round scores the same subset

args = Seq2SeqTrainingArguments(
    output_dir="/data/nicolasmaier/model/codet5-finetuned-split",
    # Evaluate, log and checkpoint on the same step cadence.
    evaluation_strategy="steps",
    eval_steps=STEPS_PER_ROUND,
    logging_strategy="steps",
    logging_steps=STEPS_PER_ROUND,
    save_strategy="steps",
    save_steps=STEPS_PER_ROUND,
    learning_rate=5e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=0.01,
    # Keeps up to 1000 checkpoints on disk before older ones are rotated out.
    save_total_limit=1000,
    num_train_epochs=3,
    predict_with_generate=True,
    # TODO: enable load_best_model_at_end / metric_for_best_model ("EM", or BLEU?)
    # once a metric is actually computed during evaluation.
    report_to="tensorboard",
    # Mixed precision to train faster.
    # NOTE(review): T5-family models have a history of fp16 overflow; watch the
    # logged loss for NaN/inf values -- TODO confirm this checkpoint is stable.
    fp16=True,
)

data_collator = DataCollatorForSeq2Seq(tokenizer)

# Evaluate on a fixed random subset of the validation split rather than the
# whole split, to keep each periodic evaluation round short. The subset size
# is clamped to the split length so a smaller split does not make select()
# fail with out-of-range indices.
valid_split = dataset["valid"]
eval_subset = valid_split.shuffle(seed=EVAL_SUBSET_SEED).select(
    range(min(EVAL_SUBSET_SIZE, len(valid_split)))
)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=eval_subset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)


Using cuda_amp half precision backend


In [10]:
# Start fine-tuning with the trainer built in the previous cell. Evaluation,
# logging and checkpointing run at the step intervals set in the training
# arguments. As the last expression of the cell, the value returned by
# train() is displayed once the run finishes.
trainer.train()

***** Running training *****
  Num examples = 7931293
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2974236
  Number of trainable parameters = 60492288


Step,Training Loss,Validation Loss


***** Running Evaluation *****
  Num examples = 255994
  Batch size = 8


KeyboardInterrupt: 