# Install packages

In [None]:
!pip install -q peft transformers datasets evaluate peft -q accelerate
!pip install huggingface_hub
!pip install bert_score

# Load packages and variables

In [None]:
from datasets import load_dataset
import evaluate
from evaluate import load
from transformers.data.metrics.squad_metrics import compute_predictions_logits
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)
from peft import (
    TaskType,
    PromptEncoderConfig,
    get_peft_config,
    get_peft_model,
)
import torch
from huggingface_hub import notebook_login
from torch.utils.data import DataLoader

In [None]:
# Run configuration shared by the cells below.
# Fall back to the CPU when no GPU is visible so the notebook still runs.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "google/flan-t5-large"  # checkpoint id used for the tokenizer and both models
lr = 3e-5  # learning rate handed to the training arguments

In [None]:
# Open the Hugging Face login widget; the training run below is configured
# with push_to_hub=True, which needs an authenticated session.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Load and process the dataset

In [None]:
# Fetch the cpgQA dataset and keep handles on its two splits.
dataset = load_dataset("minh21/cpgQA-v1.0-unique-context-for-flan-t5")
train_dataset, test_dataset = dataset["train"], dataset["test"]

Downloading readme:   0%|          | 0.00/767 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/29.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/145k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating test split:   0%|          | 0/144 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/860 [00:00<?, ? examples/s]

In [None]:
# The tokenizer turns text into token ids (and back) for the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Reuse the end-of-sequence token for padding when no pad token is defined.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

model_max_length = tokenizer.model_max_length

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

# Zero shot

In [None]:
# Baseline copy of the pretrained checkpoint, used for the zero-shot run.
# It is kept separate from `model`, which is wrapped with PEFT further down.
bf_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
def generate_predictions(example):
    """Answer one dataset example with the un-tuned baseline model.

    Args:
        example: A dataset row providing the ``question``, ``context`` and
            ``id`` fields.

    Returns:
        A dict with the keys ``prediction_text`` (decoded model output),
        ``no_answer_probability`` (always 0) and ``id`` (as a string); it is
        later passed as a prediction to the ``squad_v2`` metric.
    """
    question = example["question"]
    context = example["context"]
    example_id = example["id"]  # renamed so the built-in ``id`` is not shadowed
    # NOTE(review): the stray `"` and `",` inside this template look like
    # copy-paste residue. They are kept byte-for-byte because the fine-tuning
    # and evaluation cells build their inputs from the same template.
    prompt = f"""
    Read this and answer the question. If the question is unanswerable, "
    say \"unanswerable\".\n\n{context}\n\n{question}",
    """

    # Feed the constructed prompt to the model and decode the first sequence.
    inputs = tokenizer(prompt, return_tensors="pt")
    generated_ids = bf_model.generate(inputs["input_ids"], max_new_tokens=50)
    output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return {
        "prediction_text": output,
        "no_answer_probability": 0,
        "id": str(example_id),
    }


# The loop index was never used, so iterate over the rows directly.
predictions = [generate_predictions(example) for example in test_dataset]


len(predictions)

144

In [None]:
# Ground-truth answers in the SQuAD reference layout: one answer per example,
# with the answer text and its start offset each wrapped in a list.
references = []
for row in test_dataset:
    gold_answer = {
        "answer_start": [row["answer_start"]],
        "text": [row["answer_text"]],
    }
    references.append({"answers": gold_answer, "id": str(row["id"])})

In [None]:
# Score the zero-shot predictions against the references with SQuAD v2.
squad_metric = evaluate.load("squad_v2")
results = squad_metric.compute(references=references, predictions=predictions)

results

{'exact': 59.72222222222222,
 'f1': 82.08038451752687,
 'total': 144,
 'HasAns_exact': 59.72222222222222,
 'HasAns_f1': 82.08038451752687,
 'HasAns_total': 144,
 'best_exact': 59.72222222222222,
 'best_exact_thresh': 0.0,
 'best_f1': 82.08038451752687,
 'best_f1_thresh': 0.0}

# P-Tuning

In [None]:
# P-tuning setup: a prompt encoder with 32 virtual tokens for a
# sequence-to-sequence language model.
peft_config = PromptEncoderConfig(
    num_virtual_tokens=32,
    encoder_hidden_size=1024,
    task_type=TaskType.SEQ_2_SEQ_LM,
)

In [None]:
# Load a fresh copy of the checkpoint, wrap it with the P-tuning config and
# report how many parameters remain trainable.
model = get_peft_model(
    AutoModelForSeq2SeqLM.from_pretrained(model_name),
    peft_config,
)
model.print_trainable_parameters()

Downloading (…)lve/main/config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

trainable params: 3,214,336 || all params: 786,364,416 || trainable%: 0.4087590860672922




---
Check the maximum token length of the training inputs


In [None]:
# Find the longest tokenized "context + question" in the training split and
# collect every example that reaches the 512-token limit.
# NOTE(review): only the raw context and question are measured here; the
# instruction template that the tokenize function wraps around them is not
# counted, so the real model inputs are somewhat longer than this reports.
token_limit = 512  # same value the tokenize function passes as max_length
max_token = 0
item_with_tokens_size_larger_than_limit = []
for data in train_dataset:
    text = data["context"] + "\n" + data["question"]
    token_count = len(tokenizer(text)["input_ids"])
    if token_count >= token_limit:
        item_with_tokens_size_larger_than_limit.append(data)
    max_token = max(max_token, token_count)

print(item_with_tokens_size_larger_than_limit)
print(max_token)

[]
481




---

Tokenize function

In [None]:
def tokenize_function(data):
    """Turn one dataset row into tokenized model inputs with labels.

    Args:
        data: A dataset row providing ``context``, ``question`` and
            ``answer_text``.

    Returns:
        The tokenizer output for the prompt (padded/truncated to 512 tokens)
        with an added ``labels`` entry holding the token ids of the answer.
    """
    context = data["context"]
    question = data["question"]
    answer = data["answer_text"]
    # NOTE(review): the stray `"` and `",` inside this template look like
    # copy-paste residue. They are kept byte-for-byte so the inputs match the
    # template used by the zero-shot and evaluation cells.
    prompt = f"""
    Read this and answer the question. If the question is unanswerable, "
    say \"unanswerable\".\n\n{context}\n\n{question}",
    """
    model_inputs = tokenizer(
        prompt, padding="max_length", max_length=512, truncation=True
    )

    # Labels are left unpadded here; the data collator pads them per batch.
    labels = tokenizer(answer)
    model_inputs["labels"] = labels["input_ids"]
    # The original also wrote the row id onto `labels`, which is discarded
    # when this function returns, so that dead assignment was removed.
    return model_inputs

In [None]:
# Tokenize both splits. Each split drops *its own* original columns; the
# original cell removed the train split's column names from the test split,
# which only works while both splits share the exact same schema.
processed_datasets = {
    split_name: split.map(
        tokenize_function,
        batched=False,
        num_proc=1,
        remove_columns=split.column_names,
        load_from_cache_file=False,
        desc=f"Running tokenizer on {split_name}_dataset",
    )
    for split_name, split in (("train", train_dataset), ("test", test_dataset))
}

Running tokenizer on train_dataset:   0%|          | 0/860 [00:00<?, ? examples/s]

Running tokenizer on test_dataset:   0%|          | 0/144 [00:00<?, ? examples/s]



---
Data collator



*   The data collator pads the tokenized examples and batches them into tensors that are fed to the model




In [None]:
# Padding value used for label positions when the collator pads a batch.
label_pad_token_id = -100

# Collator that assembles tokenized examples into padded batches.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    pad_to_multiple_of=8,
    label_pad_token_id=label_pad_token_id,
)

## Train

In [None]:
# Training configuration for the P-tuning run.
training_args = Seq2SeqTrainingArguments(
    output_dir="./flan-t5-large-P-tuning-cpgQA",
    # Evaluate at the end of every epoch. The original passed do_eval=False
    # alongside this setting, yet validation loss was still reported each
    # epoch, so the flag now states what actually happens.
    evaluation_strategy="epoch",
    do_eval=True,
    learning_rate=lr,
    lr_scheduler_type="linear",
    weight_decay=0.01,
    num_train_epochs=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    logging_steps=200,
    # No intermediate checkpoints are written; the result is pushed to the Hub.
    save_strategy="no",
    push_to_hub=True,
)

In [None]:
# Wire the PEFT model, the tokenized splits and the collator into a trainer.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=processed_datasets["train"],
    eval_dataset=processed_datasets["test"],
)

In [None]:
# Run the training loop configured in `training_args`.
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.1733,0.121997
2,0.1577,0.121945


TrainOutput(global_step=430, training_loss=0.1638088403746139, metrics={'train_runtime': 164.4243, 'train_samples_per_second': 10.461, 'train_steps_per_second': 2.615, 'total_flos': 3964200875458560.0, 'train_loss': 0.1638088403746139, 'epoch': 2.0})

In [None]:
# Upload the training result to the Hugging Face Hub.
trainer.push_to_hub()

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.bin:   0%|          | 0.00/263k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.22k [00:00<?, ?B/s]

'https://huggingface.co/minh21/flan-t5-large-P-tuning-cpgQA/tree/main/'

## Load check point from hub

## Evaluation

In [None]:
# Generate an answer for every test example with the P-tuned model and collect
# the outputs in the formats expected by the metrics computed afterwards.
model.to(device)
model.eval()  # make sure dropout is disabled while generating
predictions_for_squad = []
predictions_for_squad_v2 = []
predictions_for_bert_score = []
references_for_bert_score = []
for data in test_dataset:
    context = data["context"]
    question = data["question"]
    answer = data["answer_text"]
    example_id = str(data["id"])  # renamed so the built-in ``id`` is not shadowed
    # NOTE(review): the stray `"` and `",` inside this template look like
    # copy-paste residue. They are kept byte-for-byte so the inputs match the
    # template the model was fine-tuned on.
    prompt = f"""
    Read this and answer the question. If the question is unanswerable, "
    say \"unanswerable\".\n\n{context}\n\n{question}",
    """
    model_inputs = tokenizer(
        prompt,
        padding="max_length",
        max_length=512,
        truncation=True,
        return_tensors="pt",
    ).to(device)

    # Allow up to 50 new tokens, matching the zero-shot baseline. Without an
    # explicit limit, generation stops at the library's default length, which
    # cuts long answers short and makes the two runs incomparable.
    generated_ids = model.generate(
        input_ids=model_inputs["input_ids"],
        attention_mask=model_inputs["attention_mask"],
        max_new_tokens=50,
    )
    model_output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    predictions_for_squad.append(
        {
            "prediction_text": model_output,
            "id": example_id,
        }
    )

    predictions_for_squad_v2.append(
        {
            "prediction_text": model_output,
            "no_answer_probability": 0,
            "id": example_id,
        }
    )

    predictions_for_bert_score.append(model_output)
    references_for_bert_score.append(answer)

In [None]:
# Ground-truth answers in the SQuAD reference layout. The original wrapped
# the dataset in enumerate() and bound the unused index to `id`, shadowing
# the built-in; plain iteration is enough.
references_for_squad_v2 = [
    {
        "answers": {"answer_start": [ds["answer_start"]], "text": [ds["answer_text"]]},
        "id": str(ds["id"]),
    }
    for ds in test_dataset
]

In [None]:
# Collect every metric for the P-tuned model in one dictionary.
results = {}

# SQuAD v2 scoring (predictions carry a no-answer probability).
squad_metric = evaluate.load("squad_v2")
results["squad_v2"] = squad_metric.compute(
    references=references_for_squad_v2,
    predictions=predictions_for_squad_v2,
)

# SQuAD v1 scoring on the same references.
squad_metric = evaluate.load("squad")
results["squad"] = squad_metric.compute(
    references=references_for_squad_v2,
    predictions=predictions_for_squad,
)

# BLEU over the raw prediction and reference strings.
bleu_metrics = evaluate.load("bleu")
results["bleu"] = bleu_metrics.compute(
    references=references_for_bert_score,
    predictions=predictions_for_bert_score,
)

# BERTScore returns one value per example; report the mean of each component.
bertscore_metric = evaluate.load("bertscore")
berscore = bertscore_metric.compute(
    references=references_for_bert_score,
    predictions=predictions_for_bert_score,
    lang="en",
)
results["bertscore"] = {
    component: sum(berscore[component]) / len(berscore[component])
    for component in ("precision", "recall", "f1")
}

results

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'squad_v2': {'exact': 52.083333333333336,
  'f1': 81.16092345632813,
  'total': 144,
  'HasAns_exact': 52.083333333333336,
  'HasAns_f1': 81.16092345632813,
  'HasAns_total': 144,
  'best_exact': 52.083333333333336,
  'best_exact_thresh': 0.0,
  'best_f1': 81.16092345632813,
  'best_f1_thresh': 0.0},
 'squad': {'exact_match': 52.083333333333336, 'f1': 81.16092345632813},
 'bleu': {'bleu': 0.5759934232570879,
  'precisions': [0.9298085688240656,
   0.912906610703043,
   0.9024390243902439,
   0.8961593172119487],
  'brevity_penalty': 0.6327931622170238,
  'length_ratio': 0.6860537836147592,
  'translation_length': 1097,
  'reference_length': 1599},
 'bertscore': {'precision': 0.9670373934010664,
  'recall': 0.9496991361180941,
  'f1': 0.9578633184234301}}