In [26]:
from datasets import load_dataset, list_metrics, load_metric
from transformers import AutoTokenizer,T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments

In [27]:
SQUAD_PATH = r"./data/squad-v1.1-t5-question-generation/squad_modified_for_t5_qg.py"

In [28]:
dataset = load_dataset("derek-thomas/squad-v1.1-t5-question-generation")
dataset

Found cached dataset parquet (C:/Users/ManuV/.cache/huggingface/datasets/derek-thomas___parquet/derek-thomas--squad-v1.1-t5-question-generation-5bdfd922a02702a2/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████| 2/2 [00:00<00:00, 248.51it/s]


DatasetDict({
    validation: Dataset({
        features: ['context', 'questions'],
        num_rows: 2067
    })
    train: Dataset({
        features: ['context', 'questions'],
        num_rows: 18896
    })
})

In [29]:
tokenizer = AutoTokenizer.from_pretrained("t5-base")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [30]:
# Dataset is dict with keys: {id,title,context,answers: {text,answer_start}}
train = dataset['train']
val = dataset['validation']
print(train[0])

{'context': 'generate questions: Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', 'questions': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France? {sep_token} What is in front of the Notre Dame Main Building? {sep_token} The Basilica of the Sacred heart at Notre Dame is beside to which structure? {sep_token} What is the Grotto 

In [31]:
bleu_squad_metric = load_metric('bleu','squad')

model = T5ForConditionalGeneration.from_pretrained("t5-base")

Downloading (…)"pytorch_model.bin";: 100%|██████████| 892M/892M [02:01<00:00, 7.33MB/s] 
Downloading (…)neration_config.json: 100%|██████████| 147/147 [00:00<00:00, 8.38kB/s]


In [49]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=8,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=500,               # log after every X steps
    evaluation_strategy='steps',     # evaluate every eval_steps
    eval_steps=500,                  # evaluate after every X steps
    save_total_limit=5,              # number of checkpoints to save
    save_steps=500,                  # save checkpoint after every X steps
    remove_unused_columns=False      # Very necessary
)

trainer = Trainer(
    model=model,                     # the instantiated T5 model to be trained
    args=training_args,              # training arguments, defined above
    train_dataset=train,        # training dataset
    eval_dataset=val,            # evaluation dataset
    
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [50]:
print(len(train))
print(len(val))

18896
2067


In [51]:
trainer.train()

***** Running training *****
  Num examples = 18896
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 7086
  Number of trainable parameters = 222903552


TypeError: can only join an iterable

In [None]:
prompt = "The quick brown fox jumps over the lazy dog."
inputs = tokenizer.encode(prompt, return_tensors='pt')
outputs = model.generate(inputs)
questions = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print(questions)