In [2]:
from datasets import load_from_disk
from transformers import (
    DataCollatorForSeq2Seq,
    RobertaTokenizer,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    T5ForConditionalGeneration,
)


In [3]:
# Pretrained checkpoint identifier shared by the tokenizer and the model.
# Keeping it in one place guarantees both are always loaded from the same
# checkpoint (a mismatch would silently break the vocabulary <-> embedding mapping).
MODEL_CHECKPOINT = 'Salesforce/codet5-small'

# The tokenizer is loaded through RobertaTokenizer and the network through
# T5ForConditionalGeneration, both from MODEL_CHECKPOINT.
tokenizer = RobertaTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = T5ForConditionalGeneration.from_pretrained(MODEL_CHECKPOINT)

In [5]:
# Load the pre-built DatasetDict from disk and, in the same expression, drop
# the four columns that are not fed to the model. The printed summary shows
# what remains in each split (input_ids, attention_mask, labels).
dataset = load_from_disk(
    "/data/nicolasmaier/dataset/hf_cropped_dataset"
).remove_columns(
    ["code", "contents", "xmi", "originalLine"]
)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 425631
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 14634
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 25156
    })
})


In [16]:
# --- Training configuration --------------------------------------------------
# Per-device batch size used for both training and evaluation.
BATCH_SIZE = 8

# One interval (in optimizer steps) shared by evaluation, logging and
# checkpointing, so the three always stay aligned.
STEPS_PER_CHECKPOINT = 500

# NOTE(review): every evaluation pass runs over the whole "valid" split
# (14,634 rows in the recorded output, i.e. ~1,830 batches at BATCH_SIZE=8)
# once per 500 training steps -- consider a larger interval or a validation
# subset if evaluation dominates wall-clock time.
args = Seq2SeqTrainingArguments(
    output_dir="../models/codet5-finetuned-2",
    evaluation_strategy="steps",
    eval_steps=STEPS_PER_CHECKPOINT,
    logging_strategy="steps",
    logging_steps=STEPS_PER_CHECKPOINT,
    save_strategy="steps",
    save_steps=STEPS_PER_CHECKPOINT,
    learning_rate=5e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=0.01,
    save_total_limit=10,  # cap on the number of checkpoints kept in output_dir
    num_train_epochs=3,
    # NOTE(review): no compute_metrics is passed to the trainer below and the
    # recorded log table only contains losses, so this flag presumably has no
    # visible effect yet -- confirm once a metric function is added.
    predict_with_generate=True,
    # TODO: enable best-checkpoint selection once a metric is computed:
    #   load_best_model_at_end=True,
    #   metric_for_best_model="EM",  # or BLEU?
    report_to="tensorboard",
    fp16=True,  # mixed precision to train faster
)

# Pads inputs and labels per batch. Passing the model lets the collator build
# `decoder_input_ids` from the labels via the model's own
# `prepare_decoder_input_ids_from_labels`, as recommended by the library docs.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Keyword arguments make it explicit which object fills which role.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["valid"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)


PyTorch: setting up devices
Using cuda_amp half precision backend


In [17]:
# Launch fine-tuning with the trainer configured in the previous cell.
# NOTE(review): the output recorded below looks stale/incomplete -- it saves
# checkpoints under "../models/codet5-finetuned/" while the current config uses
# "../models/codet5-finetuned-2", and it ends in a KeyboardInterrupt after the
# step-5000 evaluation. Re-run from a fresh kernel before relying on it.
trainer.train()

***** Running training *****
  Num examples = 425631
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 159612
  Number of trainable parameters = 60492288


Step,Training Loss,Validation Loss
500,0.1884,0.096273
1000,0.0967,0.06449
1500,0.0712,0.05297
2000,0.0591,0.043582
2500,0.0502,0.03837
3000,0.0441,0.031894
3500,0.0381,0.029232
4000,0.034,0.026426
4500,0.0303,0.022283
5000,0.0262,0.020855


***** Running Evaluation *****
  Num examples = 14634
  Batch size = 8
Saving model checkpoint to ../models/codet5-finetuned/checkpoint-500
Configuration saved in ../models/codet5-finetuned/checkpoint-500/config.json
Model weights saved in ../models/codet5-finetuned/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ../models/codet5-finetuned/checkpoint-500/tokenizer_config.json
Special tokens file saved in ../models/codet5-finetuned/checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 14634
  Batch size = 8
Saving model checkpoint to ../models/codet5-finetuned/checkpoint-1000
Configuration saved in ../models/codet5-finetuned/checkpoint-1000/config.json
Model weights saved in ../models/codet5-finetuned/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ../models/codet5-finetuned/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ../models/codet5-finetuned/checkpoint-1000/special_tokens_map.json
***** Running E

KeyboardInterrupt: 