In [1]:
from datasets import load_from_disk
from transformers import (
    DataCollatorForSeq2Seq,
    RobertaTokenizer,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    T5ForConditionalGeneration,
)


In [2]:
tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-small')
model = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-small')

In [3]:
dataset = load_from_disk("../dataset/hf_dataset")
dataset = dataset.remove_columns(["code", "contents", "xmi", "originalLine"])
#dataset = dataset.with_format("torch")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 425631
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 14634
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 25156
    })
})


In [4]:
BATCH_SIZE = 4

args = Seq2SeqTrainingArguments(
    output_dir="../models/codet5-finetuned",
    evaluation_strategy="steps",
    eval_steps=100,
    logging_strategy="steps",
    logging_steps=100,
    save_strategy="steps",
    save_steps=100,
    learning_rate=5e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=0.01,
    save_total_limit=10,
    num_train_epochs=3,
    predict_with_generate=True,
    load_best_model_at_end=True,
    metric_for_best_model="EM", # or BLEU?
    report_to="tensorboard",
    fp16=True, # train faster
)

data_collator = DataCollatorForSeq2Seq(tokenizer)

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=dataset["train"].select(range(10, 20)),
    eval_dataset=dataset["valid"].select(range(2)),
    data_collator=data_collator,
    tokenizer=tokenizer,
)


Using cuda_amp half precision backend


In [5]:
for example in dataset["train"].select(range(20)):
    print(len(example["input_ids"]), len(example["labels"]))

512 2754
512 1880
512 1501
512 2260
512 2176
512 2260
512 2061
512 1272
512 6891
512 3380
512 3703
512 3888
512 2002
512 1810
512 3818
512 2530
512 2675
512 2122
512 2133
512 3339


In [6]:
trainer.train()

***** Running training *****
  Num examples = 10
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 9
  Number of trainable parameters = 60492288


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.74 GiB (GPU 0; 10.76 GiB total capacity; 8.50 GiB already allocated; 1.35 GiB free; 8.59 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF