In [None]:
! pip install -q peft transformers datasets evaluate peft accelerate
! pip install -q huggingface_hub
! pip install -q bert_score

In [None]:
from datasets import load_dataset
import torch
import os
from evaluate import load
import evaluate
import numpy as np
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
from peft import (
    get_peft_config,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
    PeftType,
    PromptEncoderConfig,
    PromptTuningInit,
    PromptTuningConfig,
    TaskType,
    PeftModel,
    PeftConfig,
)
from huggingface_hub import notebook_login
from torch.utils.data import DataLoader
from tqdm import tqdm

In [None]:
# Authenticate against the Hugging Face Hub. The training section below pushes
# the adapter to the Hub, so the token presumably needs write access - confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Experiment configuration used by the cells below.
dataset = load_dataset("minh21/cpgQA-v1.0-unique-context-for-flan-t5")
# Fall back to CPU so the notebook still runs on a machine without a GPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "google/flan-t5-large"
lr = 3e-5  # learning rate handed to Seq2SeqTrainingArguments

In [None]:
# Keep direct handles on the two splits that the rest of the notebook uses.
train_dataset, test_dataset = dataset["train"], dataset["test"]

In [None]:
# Load the fast tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    model_name, use_fast=True
)  # Turns text into token ids (the "input_ids" consumed by the model)
# If the checkpoint defines no padding token, reuse the EOS token for padding.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Zero shot

In [None]:
# Untuned base checkpoint, used by the zero-shot baseline below.
bf_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
def generate_predictions(example):
    """Answer a single dataset row zero-shot with the untuned `bf_model`.

    Args:
        example: Row providing the "question", "context" and "id" fields.

    Returns:
        dict with "prediction_text" (the decoded generation),
        "no_answer_probability" (always 0) and "id" (as a string) - the
        fields later passed to the squad_v2 metric.
    """
    question, context, example_id = (
        example["question"],
        example["context"],
        example["id"],
    )
    # NOTE(review): the template is reproduced verbatim. The stray quote
    # characters, the trailing comma and the leading indentation are all part
    # of the text the model receives - confirm whether that is intended.
    prompt = f"""
    Read this and answer the question. If the question is unanswerable, "
    say \"unanswerable\".\n\n{context}\n\n{question}",
    """

    encoded = tokenizer(prompt, return_tensors="pt")
    generated = bf_model.generate(encoded["input_ids"], max_new_tokens=50)
    # Only one sequence is generated; decode it without special tokens.
    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
    return {
        "prediction_text": decoded,
        "no_answer_probability": 0,
        "id": str(example_id),
    }


# Run the zero-shot baseline over every row of the test split.
predictions = [generate_predictions(row) for row in test_dataset]


len(predictions)

144

In [None]:
# Display the raw zero-shot predictions for a manual sanity check.
predictions

In [None]:
# Gold answers, one entry per test row, keyed by the same string id that the
# prediction dicts carry. The enumerate() index was unused and shadowed the
# builtin `id`, so the rows are iterated directly.
references_for_squad_v2 = [
    {
        "answers": {"answer_start": [ds["answer_start"]], "text": [ds["answer_text"]]},
        "id": str(ds["id"]),
    }
    for ds in test_dataset
]

In [None]:
# Score the zero-shot predictions. The gold answers live in
# `references_for_squad_v2`; no variable named `references` is defined in the
# notebook, so the original call failed on a fresh kernel.
squad_metric = load("squad_v2")
results = squad_metric.compute(
    predictions=predictions, references=references_for_squad_v2
)

results

{'exact': 59.72222222222222,
 'f1': 82.08038451752687,
 'total': 144,
 'HasAns_exact': 59.72222222222222,
 'HasAns_f1': 82.08038451752687,
 'HasAns_total': 144,
 'best_exact': 59.72222222222222,
 'best_exact_thresh': 0.0,
 'best_f1': 82.08038451752687,
 'best_f1_thresh': 0.0}

# One shot

In [None]:
def generate_predictions_one_shot(example, example2):
    """Answer `example` with one solved demonstration prepended to the prompt.

    Generation uses `bf_model`, the untuned checkpoint loaded for the baseline
    experiments; `model` is only created later, in the prompt-tuning section,
    so referring to it here breaks a top-to-bottom run.

    Args:
        example: Row to answer; provides "question", "context" and "id".
        example2: Demonstration row; provides "question", "context" and
            "answer_text".

    Returns:
        dict with "prediction_text" (the decoded generation),
        "no_answer_probability" (always 0) and "id" (as a string).
    """
    question_train = example2["question"]
    context_train = example2["context"]
    answer_text_train = example2["answer_text"]

    question = example["question"]
    context = example["context"]
    example_id = example["id"]
    # NOTE(review): template kept verbatim; the stray quote characters, commas
    # and indentation are part of the text sent to the model.
    prompt = f"""
    Using this example:
    [EXAMPLE]Read this and answer the question. If the question is unanswerable, "
    say \"unanswerable\".\n\n{context_train}\n\n{question_train}", "{answer_text_train}"

    To do
    Read this and answer the question. If the question is unanswerable, "
    say \"unanswerable\".\n\n{context}\n\n{question}",
    """

    # Tokenize the assembled one-shot prompt and generate a single answer.
    inputs = tokenizer(prompt, return_tensors="pt")
    output = tokenizer.decode(
        bf_model.generate(
            inputs["input_ids"],
            max_new_tokens=50,
        )[0],
        skip_special_tokens=True,
    )
    return {
        "prediction_text": output,
        "no_answer_probability": 0,
        "id": str(example_id),
    }


# One-shot predictions for the whole test split. Each row uses the row just
# before it as its demonstration; the first row wraps around to the last one.
predictions_one_shot = [
    generate_predictions_one_shot(
        test_dataset[position],
        test_dataset[(position - 1) % len(test_dataset)],
    )
    for position in range(len(test_dataset))
]


predictions_one_shot

In [None]:
# Score the one-shot predictions. `load` is already imported in the notebook's
# import cell, and the gold answers are stored in `references_for_squad_v2`
# (no variable named `references` exists).
squad_metric = load("squad_v2")
results = squad_metric.compute(
    predictions=predictions_one_shot, references=references_for_squad_v2
)

results

{'exact': 51.81818181818182,
 'f1': 75.3573430317572,
 'total': 110,
 'HasAns_exact': 51.81818181818182,
 'HasAns_f1': 75.3573430317572,
 'HasAns_total': 110,
 'best_exact': 51.81818181818182,
 'best_exact_thresh': 0.0,
 'best_f1': 75.3573430317572,
 'best_f1_thresh': 0.0}

In [None]:
def generate_predictions_one_shot(example, example2):
    """Answer `example` using a tagged [CONTEXT]/[QUESTION]/[ANSWER] one-shot prompt.

    NOTE(review): this redefines the function of the same name from the
    previous experiment; only the prompt layout differs.

    Generation uses `bf_model`, the untuned checkpoint loaded for the baseline
    experiments; `model` is only created later, in the prompt-tuning section,
    so referring to it here breaks a top-to-bottom run.

    Args:
        example: Row to answer; provides "question", "context" and "id".
        example2: Demonstration row; provides "question", "context" and
            "answer_text".

    Returns:
        dict with "prediction_text" (the decoded generation),
        "no_answer_probability" (always 0) and "id" (as a string).
    """
    question_train = example2["question"]
    context_train = example2["context"]
    answer_text_train = example2["answer_text"]

    question = example["question"]
    context = example["context"]
    example_id = example["id"]
    prompt = f"""
    Using this example:
    [CONTEXT]: {context_train}\n
    [QUESTION]: {question_train} \n
    [ANSWER]: {answer_text_train} \n

    [CONTEXT]: {context}\n
    [QUESTION]: {question} \n
    [ANSWER]:
    """

    # Tokenize the assembled one-shot prompt and generate a single answer.
    inputs = tokenizer(prompt, return_tensors="pt")
    output = tokenizer.decode(
        bf_model.generate(
            inputs["input_ids"],
            max_new_tokens=50,
        )[0],
        skip_special_tokens=True,
    )
    return {
        "prediction_text": output,
        "no_answer_probability": 0,
        "id": str(example_id),
    }


# One-shot predictions with the tagged prompt. Each row uses the row just
# before it as its demonstration; the first row wraps around to the last one.
predictions_one_shot = [
    generate_predictions_one_shot(
        test_dataset[position],
        test_dataset[(position - 1) % len(test_dataset)],
    )
    for position in range(len(test_dataset))
]


predictions_one_shot

In [None]:
# Score the tagged one-shot predictions. `load` is already imported in the
# notebook's import cell, and the gold answers are stored in
# `references_for_squad_v2` (no variable named `references` exists).
squad_metric = load("squad_v2")
results = squad_metric.compute(
    predictions=predictions_one_shot, references=references_for_squad_v2
)

results

{'exact': 55.45454545454545,
 'f1': 79.78866711083056,
 'total': 110,
 'HasAns_exact': 55.45454545454545,
 'HasAns_f1': 79.78866711083056,
 'HasAns_total': 110,
 'best_exact': 55.45454545454545,
 'best_exact_thresh': 0.0,
 'best_f1': 79.78866711083056,
 'best_f1_thresh': 0.0}

# Few-shot (three examples)

In [None]:
def generate_predictions_several_shots(example, example2, example3, example4):
    """Answer `example` with three solved demonstrations prepended to the prompt.

    Generation uses `bf_model`, the untuned checkpoint loaded for the baseline
    experiments; `model` is only created later, in the prompt-tuning section,
    so referring to it here breaks a top-to-bottom run.

    Args:
        example: Row to answer; provides "question", "context" and "id".
        example2: First demonstration row ("question", "context", "answer_text").
        example3: Second demonstration row (same fields).
        example4: Third demonstration row (same fields).

    Returns:
        dict with "prediction_text" (the decoded generation),
        "no_answer_probability" (always 0) and "id" (as a string).
    """
    question_train_2 = example2["question"]
    context_train_2 = example2["context"]
    answer_text_train_2 = example2["answer_text"]

    question_train_3 = example3["question"]
    context_train_3 = example3["context"]
    answer_text_train_3 = example3["answer_text"]

    question_train_4 = example4["question"]
    context_train_4 = example4["context"]
    answer_text_train_4 = example4["answer_text"]

    question = example["question"]
    context = example["context"]
    example_id = example["id"]
    prompt = f"""
    Using this example:
    [CONTEXT]: {context_train_2}\n
    [QUESTION]: {question_train_2}\n
    [ANSWER]: {answer_text_train_2}\n

    [CONTEXT]: {context_train_3}\n
    [QUESTION]: {question_train_3}\n
    [ANSWER]: {answer_text_train_3}\n

    [CONTEXT]: {context_train_4}\n
    [QUESTION]: {question_train_4}\n
    [ANSWER]: {answer_text_train_4}\n

    [CONTEXT]: {context}\n
    [QUESTION]: {question} \n
    [ANSWER]:
    """

    # Tokenize the assembled few-shot prompt and generate a single answer.
    # NOTE(review): no truncation is requested, so a prompt holding four
    # contexts may exceed the length the checkpoint was trained on - confirm.
    inputs = tokenizer(prompt, return_tensors="pt")
    output = tokenizer.decode(
        bf_model.generate(
            inputs["input_ids"],
            max_new_tokens=50,
        )[0],
        skip_special_tokens=True,
    )
    return {
        "prediction_text": output,
        "no_answer_probability": 0,
        "id": str(example_id),
    }


# Three-shot predictions for the whole test split.
predictions_several_shots = []
for position in range(len(test_dataset)):
    if position > 3:
        # Demonstrations are the three rows immediately before the target row.
        shot_positions = [position - offset for offset in (1, 2, 3)]
    else:
        # The first four rows take their demonstrations counted back from the
        # end of the split instead.
        shot_positions = [
            len(test_dataset) - position - offset for offset in (1, 2, 3)
        ]
    predictions_several_shots.append(
        generate_predictions_several_shots(
            test_dataset[position],
            *(test_dataset[shot] for shot in shot_positions),
        )
    )


predictions_several_shots

In [None]:
# Score the few-shot predictions. `load` is already imported in the notebook's
# import cell, and the gold answers are stored in `references_for_squad_v2`
# (no variable named `references` exists).
squad_metric = load("squad_v2")
results = squad_metric.compute(
    predictions=predictions_several_shots, references=references_for_squad_v2
)

results

{'exact': 55.45454545454545,
 'f1': 79.22830556919165,
 'total': 110,
 'HasAns_exact': 55.45454545454545,
 'HasAns_f1': 79.22830556919165,
 'HasAns_total': 110,
 'best_exact': 55.45454545454545,
 'best_exact_thresh': 0.0,
 'best_f1': 79.22830556919165,
 'best_f1_thresh': 0.0}

# Prompt Tuning

In [None]:
# Prompt-tuning adapter configuration for a seq2seq language-modelling task,
# using 8 virtual tokens.
peft_config = PromptTuningConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    num_virtual_tokens=8,
    tokenizer_name_or_path=model_name,
)

In [None]:
# Maximum input length reported by the tokenizer.
# NOTE(review): no visible cell reads this value; the cells below hard-code 512.
model_max_length = tokenizer.model_max_length

In [None]:
# Measure how long the tokenized "context + question" text gets on the training
# split, and collect every row that reaches the 512-token limit.
# NOTE(review): the measured text omits the instruction template that
# tokenize_function wraps around it, so real inputs are somewhat longer.
max_token = 0
item_with_tokens_size_larger_than_limit = []
for record in train_dataset:
    token_count = len(
        tokenizer(record["context"] + "\n" + record["question"])["input_ids"]
    )
    if token_count >= 512:
        item_with_tokens_size_larger_than_limit.append(record)
    max_token = max(max_token, token_count)

print(max_token)

[]
481


In [None]:
def tokenize_function(data):
    """Turn one dataset row into model inputs plus label ids for training.

    The original also wrote the row id into the tokenized labels object, which
    was then discarded; that dead write and the unused `id` local are removed,
    and the locals no longer shadow the builtins `input` and `id`.

    Args:
        data: Row providing the "context", "question" and "answer_text" fields.

    Returns:
        The tokenizer output for the prompt (padded/truncated to 512 tokens)
        with an extra "labels" entry holding the token ids of the answer.
    """
    context = data["context"]
    question = data["question"]
    # NOTE(review): template kept verbatim; the stray quote characters, the
    # trailing comma and the indentation are part of the training input.
    prompt = f"""
    Read this and answer the question. If the question is unanswerable, "
    say \"unanswerable\".\n\n{context}\n\n{question}",
    """
    model_inputs = tokenizer(
        prompt, padding="max_length", max_length=512, truncation=True
    )

    # Labels are left unpadded and untruncated here.
    model_inputs["labels"] = tokenizer(data["answer_text"])["input_ids"]
    return model_inputs

In [None]:
# Tokenize both splits row by row, dropping the raw text columns so that only
# the tokenizer outputs remain. Each split removes its own columns; the
# original passed the train split's column names for the test split as well.
processed_datasets = {
    "train": train_dataset.map(
        tokenize_function,
        batched=False,
        num_proc=1,
        remove_columns=train_dataset.column_names,
        load_from_cache_file=False,
        desc="Running tokenizer on train_dataset",
    ),
    "test": test_dataset.map(
        tokenize_function,
        batched=False,
        num_proc=1,
        remove_columns=test_dataset.column_names,
        load_from_cache_file=False,
        desc="Running tokenizer on test_dataset",
    ),
}

Running tokenizer on train_dataset:   0%|          | 0/860 [00:00<?, ? examples/s]

Running tokenizer on test_dataset:   0%|          | 0/144 [00:00<?, ? examples/s]

In [None]:
# Wrap a freshly loaded base checkpoint with the prompt-tuning adapter.
model = get_peft_model(
    AutoModelForSeq2SeqLM.from_pretrained(model_name), peft_config
)
label_pad_token_id = -100
# Batch collator: labels are padded with `label_pad_token_id`, and padded
# lengths are rounded up to a multiple of 8.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8,
)


def print_number_of_trainable_model_parameters(model):
    """Build a three-line summary of trainable versus total parameter counts.

    Despite the name, nothing is printed: the summary is returned as a string.

    Args:
        model: Object exposing `named_parameters()`; each parameter must offer
            `numel()` and `requires_grad`.

    Returns:
        str with the trainable count, the total count and the trainable
        percentage (two decimals), separated by newlines.
    """
    sizes = [
        (param.numel(), param.requires_grad)
        for _, param in model.named_parameters()
    ]
    all_model_params = sum(count for count, _ in sizes)
    trainable_model_params = sum(count for count, trainable in sizes if trainable)
    percentage = 100 * trainable_model_params / all_model_params
    return "\n".join(
        [
            f"trainable model parameters: {trainable_model_params}",
            f"all model parameters: {all_model_params}",
            f"percentage of trainable model parameters: {percentage:.2f}%",
        ]
    )


# Show how many parameters remain trainable once the PEFT wrapper is applied.
print(print_number_of_trainable_model_parameters(model))

trainable model parameters: 65536
all model parameters: 783215616
percentage of trainable model parameters: 0.01%


In [None]:
# Training configuration for the prompt-tuning run.
# NOTE(review): `lr` is 3e-5 and the run lasts 2 epochs; prompt tuning is often
# run with a much larger learning rate - confirm this is intentional.
training_args = Seq2SeqTrainingArguments(
    evaluation_strategy="epoch",
    learning_rate=lr,
    # Evaluation is requested every epoch above, so the flag is set to match;
    # the original `do_eval=False` contradicted `evaluation_strategy="epoch"`.
    do_eval=True,
    output_dir="./flan-t5-large-prompt-tuning-cpgQA",
    auto_find_batch_size=True,
    num_train_epochs=2,
    weight_decay=0.01,
    lr_scheduler_type="linear",
    push_to_hub=True,
    save_strategy="no",
    logging_steps=200,
)

In [None]:
# Assemble the trainer: the PEFT-wrapped model, the tokenized train split for
# training and the tokenized test split for evaluation.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=processed_datasets["train"],
    eval_dataset=processed_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [None]:
# Run the prompt-tuning training loop.
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


In [None]:
# Upload the trained adapter to the Hugging Face Hub.
trainer.push_to_hub()

## Load model from hub

In [None]:
# Rebuild the tuned model from the Hub: read the adapter config, load the base
# checkpoint it names, then attach the adapter weights.
peft_model_id = "minh21/flan-t5-large-prompt-tuning-cpgQA"
config = PeftConfig.from_pretrained(peft_model_id)
base_checkpoint = config.base_model_name_or_path

tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)
model = PeftModel.from_pretrained(
    AutoModelForSeq2SeqLM.from_pretrained(base_checkpoint), peft_model_id
)

print("Peft model loaded")

## Evaluate the model

In [None]:
# Generate an answer for every test row with the tuned model and collect the
# predictions in the layout each metric expects.
#
# Changes from the original cell:
#   * generation is capped with max_new_tokens=50, as in the baseline sections;
#     the original call passed no limit and relied on the library's default
#     generation length (presumably too short for the longer answers - the
#     recorded BLEU brevity penalty points that way; confirm)
#   * the `device` setting from the configuration cell replaces the hard-coded
#     "cuda" strings
#   * locals no longer rebind the builtins `id` and `input` at notebook scope
model.to(device)
predictions_for_squad = []
predictions_for_squad_v2 = []
predictions_for_bert_score = []
references_for_bert_score = []
for data in test_dataset:
    context = data["context"]
    question = data["question"]
    answer = data["answer_text"]
    example_id = data["id"]
    # NOTE(review): this template differs from the one tokenize_function used
    # for training, and its stray quote characters and indentation are part of
    # the text the model sees. It is kept verbatim here - confirm the intent.
    prompt = f"""
    "{context}\nTry to answer this question if possible (otherwise reply "
        "\"unanswerable\"): {question}"
    """
    model_inputs = tokenizer(
        prompt,
        padding="max_length",
        max_length=512,
        truncation=True,
        return_tensors="pt",
    ).to(device)

    generated = model.generate(
        input_ids=model_inputs["input_ids"],
        attention_mask=model_inputs["attention_mask"],
        max_new_tokens=50,
    )
    model_output = tokenizer.decode(generated[0], skip_special_tokens=True)

    predictions_for_squad.append(
        {
            "prediction_text": model_output,
            "id": str(example_id),
        }
    )

    predictions_for_squad_v2.append(
        {
            "prediction_text": model_output,
            "no_answer_probability": 0,
            "id": str(example_id),
        }
    )

    predictions_for_bert_score.append(model_output)
    references_for_bert_score.append(answer)

In [None]:
# Gold answers for the squad / squad_v2 metrics: one entry per test row, keyed
# by the same string id that the prediction dicts carry.
references_for_squad_v2 = []
for record in test_dataset:
    references_for_squad_v2.append(
        {
            "answers": {
                "answer_start": [record["answer_start"]],
                "text": [record["answer_text"]],
            },
            "id": str(record["id"]),
        }
    )

In [None]:
# Score the tuned model with four metrics and gather everything in `results`.
results = {}

# Exact match / F1 in both the SQuAD v2 and SQuAD v1 flavours.
squad_metric = load("squad_v2")
results["squad_v2"] = squad_metric.compute(
    predictions=predictions_for_squad_v2,
    references=references_for_squad_v2,
)

squad_metric = load("squad")
results["squad"] = squad_metric.compute(
    predictions=predictions_for_squad,
    references=references_for_squad_v2,
)

# BLEU on the raw answer strings.
bleu_metrics = load("bleu")
results["bleu"] = bleu_metrics.compute(
    predictions=predictions_for_bert_score,
    references=references_for_bert_score,
)

# BERTScore returns one value per example; report the mean of each component.
bertscore_metric = load("bertscore")
berscore = bertscore_metric.compute(
    predictions=predictions_for_bert_score,
    references=references_for_bert_score,
    lang="en",
)
results["bertscore"] = {
    component: sum(berscore[component]) / len(berscore[component])
    for component in ("precision", "recall", "f1")
}

results

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'squad_v2': {'exact': 50.0,
  'f1': 79.33653204790353,
  'total': 144,
  'HasAns_exact': 50.0,
  'HasAns_f1': 79.33653204790353,
  'HasAns_total': 144,
  'best_exact': 50.0,
  'best_exact_thresh': 0.0,
  'best_f1': 79.33653204790353,
  'best_f1_thresh': 0.0},
 'squad': {'exact_match': 50.0, 'f1': 79.33653204790353},
 'bleu': {'bleu': 0.5240908153729836,
  'precisions': [0.939453125,
   0.9238636363636363,
   0.910547396528705,
   0.9022082018927445],
  'brevity_penalty': 0.5703395250527857,
  'length_ratio': 0.6404002501563477,
  'translation_length': 1024,
  'reference_length': 1599},
 'bertscore': {'precision': 0.9678248527149359,
  'recall': 0.9443863949014081,
  'f1': 0.9554663168059455}}