## Requirements
`pip install pytorch transformers datasets ntlk`
## Source
[source](https://github.com/huggingface/notebooks/blob/main/examples/summarization.ipynb)

# Load and install

In [29]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import list_datasets, load_dataset
from nltk.translate.bleu_score import sentence_bleu
import nltk
import numpy as np
import torch

In [30]:
model_name = "t5-small"
prefix = "context: "

In [31]:
# with open('datasets.txt','w') as f:
#     [f.write(d+'\n') for d in list_datasets()]

# with open('datasets.txt','r') as f:
#     lines = f.readlines()

# t5_qg_datasets = [line for line in lines if 't5' in line.lower() and ('qg' in line.lower() or 'question_generation' in line.lower())]
# t5_qg_datasets

In [32]:
dataset_squad = load_dataset('wiselinjayajos/squad_modified_for_t5_qg')

Found cached dataset parquet (C:/Users/ManuV/.cache/huggingface/datasets/wiselinjayajos___parquet/wiselinjayajos--squad_modified_for_t5_qg-a090cde2e8fceb0a/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████| 2/2 [00:00<00:00, 326.67it/s]


In [33]:
dataset_squad

DatasetDict({
    train: Dataset({
        features: ['context', 'questions'],
        num_rows: 18896
    })
    validation: Dataset({
        features: ['context', 'questions'],
        num_rows: 2067
    })
})

In [34]:
# Dict with keys ['context','questions']
# 'context': 'generate questions: The Louvre ...'
# 'questions': 'Where ... ? {sep_token} How ... ? {sep_token} ...? {sep_token}'
dataset_squad["train"][0]

{'context': 'generate questions: Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'questions': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France? {sep_token} What is in front of the Notre Dame Main Building? {sep_token} The Basilica of the Sacred heart at Notre Dame is beside to which structure? {sep_token} What is the Grotto

# Preprocessing

In [35]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [36]:
max_input_length = 1024
max_target_length = 128

def preprocess(examples):
    inputs = [prefix + doc for doc in examples["context"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["questions"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [37]:
preprocess(dataset_squad['train'][:2])# dict with keys ['input_ids','attention_mask','labels'] met inputs_ids=context en labels=questions



{'input_ids': [[2625, 10, 3806, 746, 10, 30797, 120, 6, 8, 496, 65, 3, 9, 6502, 1848, 5, 71, 2916, 8, 5140, 5450, 31, 7, 2045, 22161, 19, 3, 9, 7069, 12647, 13, 8, 16823, 3790, 5, 3, 29167, 16, 851, 13, 8, 5140, 5450, 11, 5008, 34, 6, 19, 3, 9, 8658, 12647, 13, 2144, 28, 6026, 3, 76, 24266, 28, 8, 9503, 96, 553, 15, 7980, 1980, 1212, 13285, 1496, 1280, 3021, 12, 8, 5140, 5450, 19, 8, 23711, 2617, 13, 8, 3, 24756, 6219, 5, 3, 29167, 1187, 8, 20605, 2617, 19, 8, 8554, 17, 235, 6, 3, 9, 17535, 286, 13, 7029, 11, 9619, 5, 94, 19, 3, 9, 16455, 13, 8, 3, 3844, 17, 235, 44, 301, 1211, 1395, 6, 1410, 213, 8, 16823, 3790, 3, 28285, 26, 120, 4283, 12, 2788, 8942, 9, 26, 1954, 264, 8371, 8283, 16, 507, 3449, 5, 486, 8, 414, 13, 8, 711, 1262, 41, 232, 16, 3, 9, 1223, 689, 24, 1979, 7, 190, 220, 12647, 7, 11, 8, 2540, 10576, 15, 201, 19, 3, 9, 650, 6, 941, 3372, 12647, 13, 3790, 5, 1], [2625, 10, 3806, 746, 10, 282, 44, 167, 119, 8278, 6, 7711, 3, 17084, 31, 7, 481, 661, 3, 9, 381, 13, 1506, 783, 1

In [38]:
tokenized_squad = dataset_squad.map(preprocess, batched=True)

Loading cached processed dataset at C:\Users\ManuV\.cache\huggingface\datasets\wiselinjayajos___parquet\wiselinjayajos--squad_modified_for_t5_qg-a090cde2e8fceb0a\0.0.0\2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec\cache-76d040fbf063f08b.arrow
Loading cached processed dataset at C:\Users\ManuV\.cache\huggingface\datasets\wiselinjayajos___parquet\wiselinjayajos--squad_modified_for_t5_qg-a090cde2e8fceb0a\0.0.0\2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec\cache-0043e0412c06186c.arrow


# Fine-tuning

In [39]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [40]:
batch_size = 16
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-squad",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    fp16=False,# Change to True if you train on GPU
    push_to_hub=False,
)

In [41]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [42]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    # Removing the sep_tokens so metric can be properly used
    decoded_preds = [decoded_pred.split('\n')[0] for decoded_pred in decoded_preds]
    decoded_labels = [decoded_label.split('\nsep_token') for decoded_label in decoded_labels]

    # Compute BLEU score
    bleu_scores = []
    for pred, label in zip(decoded_preds, decoded_labels):
        bleu_score = sentence_bleu([label], pred)
        bleu_scores.append(bleu_score)

    result = {"bleu": np.mean(bleu_scores)}
    # result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Extract a few results
    # result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length
    # prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    # result["gen_len"] = np.mean(prediction_lens)
    
    # return {k: round(v, 4) for k, v in result.items()}
    return result

In [43]:
import datasets

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=datasets.Dataset.from_dict(tokenized_squad["train"][:50]),
    eval_dataset=datasets.Dataset.from_dict(tokenized_squad["validation"][:30]),
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [44]:
if model.device.type == 'cuda':
    print('Model is on GPU')

    # Set model to train on CPU
    device = torch.device('cpu')
    model.to(device)
    trainer.device = device
else:
    print('Model is on CPU')

Model is on CPU


In [45]:
trainer.train()

  0%|          | 0/4 [00:00<?, ?it/s]You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
                                             
100%|██████████| 4/4 [00:28<00:00,  7.02s/it]

{'eval_loss': 4.268968105316162, 'eval_bleu': 0.0, 'eval_runtime': 10.6552, 'eval_samples_per_second': 2.816, 'eval_steps_per_second': 0.188, 'epoch': 1.0}
{'train_runtime': 28.0694, 'train_samples_per_second': 1.781, 'train_steps_per_second': 0.143, 'train_loss': 4.160477638244629, 'epoch': 1.0}





TrainOutput(global_step=4, training_loss=4.160477638244629, metrics={'train_runtime': 28.0694, 'train_samples_per_second': 1.781, 'train_steps_per_second': 0.143, 'train_loss': 4.160477638244629, 'epoch': 1.0})

# Inference

In [79]:
# Generate predictions on some input text
input_text = "generate question: Napoleon is een generaal van Frankrijk. Hij heeft 4 prijzen gewonnen in Duitsland. Hij kan over 3 boomstammen springen."
input_text = torch.tensor(tokenizer(input_text)['input_ids']).long().unsqueeze(0)
generated = model.generate(input_text, max_length=50, num_beams=4, early_stopping=True)

# Decode the generated output
output_text = tokenizer.decode(generated[0], skip_special_tokens=True)
print(output_text)

Hij kan over 3 boomstammen springen.


# Test

In [80]:
# Evaluate the model on the validation dataset
eval_results = trainer.evaluate()

# Print the evaluation results
print(eval_results)

100%|██████████| 2/2 [00:02<00:00,  1.44s/it]

{'eval_loss': 4.268968105316162, 'eval_bleu': 0.0, 'eval_runtime': 8.4671, 'eval_samples_per_second': 3.543, 'eval_steps_per_second': 0.236, 'epoch': 1.0}





# Evaluate

In [90]:
import matplotlib.pyplot as plt

# Get the training and validation loss from the trainer
print(trainer.log_metrics("train",metrics="bleu"))
train_loss = trainer.state.log_history["loss"]
val_loss = trainer.state.log_history["eval_loss"]

# Plot the training and validation loss
plt.plot(train_loss, label="Training loss")
plt.plot(val_loss, label="Validation loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.show()

***** train metrics *****


AttributeError: 'str' object has no attribute 'copy'