# Package install

In [None]:
# install Hugging Face Libraries
!pip install "peft==0.2.0"
!pip install "transformers==4.27.2" "datasets==2.9.0" "accelerate==0.17.1" "evaluate==0.4.0" "bitsandbytes==0.37.1" loralib --upgrade --quiet
# install additional dependencies needed for training
!pip install datasets
!pip install bert-score

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub; the push_to_hub cell further down is called
# with use_auth_token=True.
notebook_login()


# Process Dataset




In [None]:
# NOTE(review): this id is not the one passed to load_dataset() in the next
# cell (that one ends in "-for-flan-t5"); in the visible notebook dataset_id is
# only referenced from commented-out code — confirm which id is intended.
dataset_id = "minh21/cpgQA-v1.0-unique-context"

In [None]:
# Load the QA dataset
from datasets import load_dataset

# The "train" and "test" splits of the returned object are used below.
dataset = load_dataset("minh21/cpgQA-v1.0-unique-context-for-flan-t5")

In [None]:
# Pull the two splits used for fine-tuning and for evaluation out of the loaded dataset.
train_dataset, test_dataset = dataset["train"], dataset["test"]

In [None]:
# Show the training split's columns and row count.
train_dataset

Dataset({
    features: ['title', 'id', 'question', 'answer_text', 'answer_start', 'context'],
    num_rows: 860
})

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Checkpoint whose tokenizer is loaded here; the same id is used for the base model later.
model_id = "google/flan-t5-large"
# model_id="facebook/bart-large"
# Load tokenizer of FLAN-t5-large
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Truncation limit passed to the tokenizer for both prompts and answers in tokenize().
max_length = 512

In [None]:
# def tokenize(batch):
#     inputs = tokenizer(f"[CONTEXT]: {batch['context']} \n [QUESTION]: {batch['question']}" , padding= True, truncation = True, max_length = max_length, return_tensors='pt')
#     targets = tokenizer(batch['answer_text'], padding= True, truncation = True, max_length = max_length, return_tensors='pt')
#     inputs['labels'] = targets['input_ids']
#     return inputs
def tokenize(batch):
    """Build the prompt for one example and tokenize it together with its answer.

    ``batch`` is indexed with the ``context``, ``question`` and ``answer_text``
    keys. The returned encoding is the tokenized prompt with the answer's token
    ids stored under ``labels``. Prompt and answer are both truncated to the
    notebook-level ``max_length``.
    """
    # The template (including its stray quotes and surrounding whitespace) is
    # the same one the evaluation cells use, so it is kept verbatim here.
    prompt = f"""
    Read this and answer the question. If the question is unanswerable, "say \"unanswerable\".\n\n{batch['context']}\n\n{batch['question']}"
    """
    encoding = tokenizer(prompt, padding=True, truncation=True, max_length=max_length)
    answer_encoding = tokenizer(
        batch["answer_text"], padding=True, truncation=True, max_length=max_length
    )
    encoding["labels"] = answer_encoding["input_ids"]
    return encoding

In [None]:
# Tokenize each split one example at a time (batched=False) and drop the raw
# columns so that only the tokenizer outputs and labels remain.
train_raw_columns = train_dataset.column_names
tokenized_train_dataset = train_dataset.map(
    tokenize, remove_columns=train_raw_columns, batched=False
)

test_raw_columns = test_dataset.column_names
tokenized_test_dataset = test_dataset.map(
    tokenize, remove_columns=test_raw_columns, batched=False
)



In [None]:
# Inspect the first tokenized training example.
tokenized_train_dataset[0]

In [None]:
# Decode the first training prompt back to text to check the template by eye.
first_input_ids = tokenized_train_dataset[0]["input_ids"]
first_context_decoded = tokenizer.decode(first_input_ids, skip_special_tokens=True)
print(first_context_decoded)

Read this and answer the question. If the question is unanswerable, "say "unanswerable". The Opioid Taper Decision Tool is designed to assist Primary Care providers in determining if an opioid taper is necessary for a specific patient, in performing the taper, and in providing follow-up and support during the taper. What is the purpose of Opioid Taper Decision Tool?" 


In [None]:
# Show the raw (untokenized) first training example for comparison.
train_dataset[0]

# Fine-tune and evaluate FLAN-T5

In [None]:
from transformers import AutoModelForSeq2SeqLM

model_id = "google/flan-t5-large"

# Load the base model with default settings; the 8-bit variant below is left
# commented out.
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
# model = AutoModelForSeq2SeqLM.from_pretrained(model_id, load_in_8bit=True, device_map="auto")

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType

# Define LoRA Config
lora_config = LoraConfig(
    r=39,  # same value that is recorded in the push_to_hub commit message ("r = 39")
    lora_alpha=32,
    target_modules=["q", "v"],  # presumably T5's attention query/value projections — confirm
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,
)
# prepare int-8 model for training
# (left disabled: the base model in the previous cell is loaded without load_in_8bit)
# model = prepare_model_for_int8_training(model)

# Wrap the base model with the LoRA adapter and print the trainable-parameter summary
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [None]:
from transformers import DataCollatorForSeq2Seq

# Padding value for labels: -100, so that padded label positions are ignored in the loss
label_pad_token_id = -100
# Data collator used by the trainer; pads to a multiple of 8
data_collator = DataCollatorForSeq2Seq(
    tokenizer, model=model, label_pad_token_id=label_pad_token_id, pad_to_multiple_of=8
)

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Directory for trainer output; TensorBoard logs go to its logs/ subdirectory
output_dir = "lora-flan-t5-large"

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3,  # higher learning rate
    num_train_epochs=5,
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=1,  # log every step
    save_strategy="no",  # no checkpoints are written; the adapter is pushed to the Hub after training
    report_to="tensorboard",
)

# Create Trainer instance
# No eval_dataset / compute_metrics are passed (both are commented out below);
# evaluation is done by hand in the "Evaluate" section.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,
    # eval_dataset = tokenized_test_dataset,
    # compute_metrics = compute_metrics
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

In [None]:
# Run LoRA fine-tuning with the trainer configured in the previous cell
trainer.train()

In [None]:
# Upload the fine-tuned PEFT model to a private Hub repository, submitting the
# commit as a pull request. Uses the token stored by notebook_login().
model.push_to_hub(
    "espiusedwards/flant5-large-lora",
    use_auth_token=True,
    commit_message="not 8 bit, r = 39",
    private=True,
    create_pr=True,  # boolean flag (previously the integer 1)
)

In [None]:
# trainer.evaluate()

In [None]:
# # Save our LoRA model & tokenizer results
# peft_model_id="results"
# trainer.model.save_pretrained(peft_model_id)
# tokenizer.save_pretrained(peft_model_id)
# # if you want to save the base model to call
# trainer.model.base_model.save_pretrained(peft_model_id)

# Evaluate

In [None]:
# Load adapters from hub
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load peft config for pre-trained checkpoint etc.
peft_model_id = "espiusedwards/flant5-large-lora"
# NOTE(review): the push cell above uploads with create_pr set, so the adapter
# may live in a pull request rather than on the repo's default branch —
# confirm it has been merged before loading it from here.
config = PeftConfig.from_pretrained(peft_model_id)

# load base LLM model and tokenizer
# This rebinds `model` and `tokenizer`, replacing the objects used for training.
# NOTE(review): the base model is loaded in 8-bit here, whereas the training
# cells load it without load_in_8bit (commit message: "not 8 bit") — confirm
# the mismatch is intended.
model = AutoModelForSeq2SeqLM.from_pretrained(
    config.base_model_name_or_path, load_in_8bit=True, device_map={"": 0}
)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the Lora model
model = PeftModel.from_pretrained(model, peft_model_id, device_map={"": 0})
model.eval()

print("Peft model loaded")

In [None]:
# Generate an answer for every test example and collect it in the shapes the
# metric cells below consume:
#   predictions_for_squad       -> {"prediction_text", "id"}
#   predictions_for_squad_v2    -> same plus "no_answer_probability"
#   predictions_for_bert_score / references_for_bert_score -> plain strings

# Without an explicit limit generate() stops at its default max_length of 20
# tokens and silently cuts longer answers short.
# Assumes reference answers fit in 128 tokens — TODO confirm against the dataset.
MAX_NEW_TOKENS = 128

model.to("cuda")
predictions_for_squad = []
predictions_for_squad_v2 = []
predictions_for_bert_score = []
references_for_bert_score = []
# Loop variables are named so that the builtins `input` and `id` are not
# overwritten at notebook scope.
for example in test_dataset:
    context = example["context"]
    question = example["question"]
    answer = example["answer_text"]
    example_id = str(example["id"])
    # Same template as tokenize() uses for training; keep the two byte-identical.
    prompt = f"""
    Read this and answer the question. If the question is unanswerable, "say \"unanswerable\".\n\n{context}\n\n{question}"
    """
    model_inputs = tokenizer(
        prompt,
        padding="max_length",
        max_length=512,
        truncation=True,
        return_tensors="pt",
    ).to(torch.device("cuda"))

    generated_ids = model.generate(
        input_ids=model_inputs["input_ids"],
        attention_mask=model_inputs["attention_mask"],
        max_new_tokens=MAX_NEW_TOKENS,
    )
    model_output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    predictions_for_squad.append(
        {
            "prediction_text": model_output,
            "id": example_id,
        }
    )

    predictions_for_squad_v2.append(
        {
            "prediction_text": model_output,
            "no_answer_probability": 0,
            "id": example_id,
        }
    )

    predictions_for_bert_score.append(model_output)
    references_for_bert_score.append(answer)

In [None]:
# Gold answers for the SQuAD / SQuAD v2 metrics: one entry per test example
# holding its answer text, answer offset and stringified id.
references_for_squad_v2 = []
for example in test_dataset:
    gold_answers = {
        "answer_start": [example["answer_start"]],
        "text": [example["answer_text"]],
    }
    references_for_squad_v2.append(
        {"answers": gold_answers, "id": str(example["id"])}
    )

In [None]:
from evaluate import load

results = {}

# Both SQuAD flavours are scored against the same reference list; only the
# prediction payload differs (the v2 one carries "no_answer_probability").
for metric_name, metric_predictions in (
    ("squad_v2", predictions_for_squad_v2),
    ("squad", predictions_for_squad),
):
    squad_metric = load(metric_name)
    results[metric_name] = squad_metric.compute(
        predictions=metric_predictions, references=references_for_squad_v2
    )

# BLEU and BERTScore compare the plain prediction strings with the plain answers.
bleu_metrics = load("bleu")
results["bleu"] = bleu_metrics.compute(
    predictions=predictions_for_bert_score, references=references_for_bert_score
)

bertscore_metric = load("bertscore")
berscore = bertscore_metric.compute(
    predictions=predictions_for_bert_score,
    references=references_for_bert_score,
    lang="en",
)

# BERTScore returns one value per example; report the mean of each component.
results["bertscore"] = {
    component: sum(berscore[component]) / len(berscore[component])
    for component in ("precision", "recall", "f1")
}
results

# Training loss looks too small => sanity-check predictions on a slice of the training set

In [None]:
# "Validation" set: the first 100 rows of the training split, i.e. examples
# taken from the data the trainer was given (a sanity check, not a held-out set).
validation_dataset = train_dataset.select(range(100))
validation_dataset

In [None]:
# Generate an answer for every example of validation_dataset and collect it in
# the shapes the metric cells below consume:
#   predictions_for_squad       -> {"prediction_text", "id"}
#   predictions_for_squad_v2    -> same plus "no_answer_probability"
#   predictions_for_bert_score / references_for_bert_score -> plain strings
# These lists overwrite the ones produced for the test split.

# Without an explicit limit generate() stops at its default max_length of 20
# tokens and silently cuts longer answers short.
# Assumes reference answers fit in 128 tokens — TODO confirm against the dataset.
MAX_NEW_TOKENS = 128

model.to("cuda")
predictions_for_squad = []
predictions_for_squad_v2 = []
predictions_for_bert_score = []
references_for_bert_score = []
# Loop variables are named so that the builtins `input` and `id` are not
# overwritten at notebook scope.
for example in validation_dataset:
    context = example["context"]
    question = example["question"]
    answer = example["answer_text"]
    example_id = str(example["id"])
    # Same template as tokenize() uses for training; keep the two byte-identical.
    prompt = f"""
    Read this and answer the question. If the question is unanswerable, "say \"unanswerable\".\n\n{context}\n\n{question}"
    """
    model_inputs = tokenizer(
        prompt,
        padding="max_length",
        max_length=512,
        truncation=True,
        return_tensors="pt",
    ).to(torch.device("cuda"))

    generated_ids = model.generate(
        input_ids=model_inputs["input_ids"],
        attention_mask=model_inputs["attention_mask"],
        max_new_tokens=MAX_NEW_TOKENS,
    )
    model_output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    predictions_for_squad.append(
        {
            "prediction_text": model_output,
            "id": example_id,
        }
    )

    predictions_for_squad_v2.append(
        {
            "prediction_text": model_output,
            "no_answer_probability": 0,
            "id": example_id,
        }
    )

    predictions_for_bert_score.append(model_output)
    references_for_bert_score.append(answer)

In [None]:
# Gold answers for the SQuAD / SQuAD v2 metrics: one entry per example of
# validation_dataset holding its answer text, answer offset and stringified id.
references_for_squad_v2 = []
for example in validation_dataset:
    gold_answers = {
        "answer_start": [example["answer_start"]],
        "text": [example["answer_text"]],
    }
    references_for_squad_v2.append(
        {"answers": gold_answers, "id": str(example["id"])}
    )

In [None]:
from evaluate import load

results = {}

# Score the validation-slice predictions with both SQuAD flavours; they share
# one reference list and differ only in the prediction payload.
for metric_name, metric_predictions in (
    ("squad_v2", predictions_for_squad_v2),
    ("squad", predictions_for_squad),
):
    squad_metric = load(metric_name)
    results[metric_name] = squad_metric.compute(
        predictions=metric_predictions, references=references_for_squad_v2
    )

# BLEU and BERTScore compare the plain prediction strings with the plain answers.
bleu_metrics = load("bleu")
results["bleu"] = bleu_metrics.compute(
    predictions=predictions_for_bert_score, references=references_for_bert_score
)

bertscore_metric = load("bertscore")
berscore = bertscore_metric.compute(
    predictions=predictions_for_bert_score,
    references=references_for_bert_score,
    lang="en",
)

# BERTScore returns one value per example; report the mean of each component.
results["bertscore"] = {
    component: sum(berscore[component]) / len(berscore[component])
    for component in ("precision", "recall", "f1")
}
results

# Draft: experimental code (commented out)

In [None]:
# import evaluate
# from transformers import T5ForConditionalGeneration, T5Tokenizer
# # Load the model and tokenizer
# model = T5ForConditionalGeneration.from_pretrained('espiusedwards/flant5-large-lora')
# tokenizer = T5Tokenizer.from_pretrained('t5-small') # adjust model size if necessary
# model.eval()

# # Load the metric
# metric = evaluate.load("squad_v2")

# # Assuming `test_data` is your test data
# # and test_data is a list of dictionaries with 'question', 'context', and 'id' keys

# # Get predictions
# predictions = []
# for i in range(len(tokenized_test_dataset)):
#     item = tokenized_test_dataset[i]
#     inputs = {'input_ids': item['input_ids'], 'attention_mask': item['attention_mask']}
#     outputs = model.generate(**inputs, max_length=512, num_beams=4, length_penalty=2.0, early_stopping=True)
#     prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
#     predictions.append({'prediction_text': prediction, 'id': item['id']})

# references = [{'answers': {'answer_start': [0], 'text': [item['answer']]}, 'id': item['id']} for item in tokenized_test_dataset]

# # Compute the metric
# result = metric.compute(predictions=predictions, references=references)

# # Display the result
# print(result)

In [None]:
# # evaluate
# import evaluate
# import numpy as np
# from datasets import load_from_disk
# from tqdm import tqdm

# # Metric
# metric = evaluate.load("squad_v2")

# def evaluate_peft_model(sample,max_target_length=50):
#     # generate summary
#     outputs = model.generate(input_ids=sample["input_ids"].unsqueeze(0).cuda(), do_sample=True, top_p=0.9, max_new_tokens=max_target_length)
#     prediction = tokenizer.decode(outputs[0].detach().cpu().numpy(), skip_special_tokens=True)
#     # decode eval sample
#     # Replace -100 in the labels as we can't decode them.
#     labels = np.where(sample['labels'] != -100, sample['labels'], tokenizer.pad_token_id)
#     labels = tokenizer.decode(labels, skip_special_tokens=True)

#     # Some simple post-processing
#     return prediction, labels

# # dataset: test_dataset

# # run predictions
# # this can take ~45 minutes
# predictions, references = [] , []
# for sample in tqdm(test_dataset):
#     p,l = evaluate_peft_model(sample)
#     predictions.append(p)
#     references.append(l)

# # compute metric
# squad_v2 = metric.compute(predictions=predictions, references=references)

# # print results
# print(f"Exact: {squad_v2['exact']}")
# print(f"f1: {squad_v2['f1']}")

In [None]:
# from huggingface_hub import HfFolder
# from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
# # Hugging Face repository id
# repository_id = f"{model_id.split('/')[1]}-{dataset_id}"
# # Define training args
# training_args = Seq2SeqTrainingArguments(
#     output_dir=repository_id,
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
#     predict_with_generate=True,
#     fp16=False, # Overflows with fp16
#     learning_rate=5e-5,
#     num_train_epochs=5,
#     # logging & evaluation strategies
#     logging_dir=f"{repository_id}/logs",
#     logging_strategy="steps",
#     logging_steps=500,
#     evaluation_strategy="epoch",
#     save_strategy="epoch",
#     save_total_limit=2,
#     load_best_model_at_end=True,
#     # metric_for_best_model="overall_f1",
#     # push to hub parameters
#     report_to="tensorboard",
#     push_to_hub=False,
#     hub_strategy="every_save",
#     #hub_model_id=repository_id,
#     hub_token=HfFolder.get_token(),
# )

In [None]:
# Compute metric
# import evaluate
# import numpy as np

# # Metric
# metric = evaluate.load("squad_v2")

# def compute_metrics(eval_preds):
#     preds, labels = eval_preds
#     if isinstance(preds, tuple):
#         preds = preds[0]

#     # Convert predictions and labels to the format expected by squad_v2 metric
#     predictions = [{'prediction_text': pred, 'id': i} for i, pred in enumerate(preds)]
#     references = [{'answers': {'answer_start': [0], 'text': [label]}, 'id': i} for i, label in enumerate(labels)]

#     result = metric.compute(predictions=predictions, references=references)
#     return result

In [None]:
# Compute metric
# import evaluate
# import numpy as np
# import nltk
# from nltk.tokenize import sent_tokenize
# nltk.download("punkt")

# # Metric
# metric = evaluate.load("squad")

# def compute_metrics(eval_preds):
#     preds, labels = eval_preds
#     if isinstance(preds, tuple):
#         preds = preds[0]
#     decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
#     # Replace -100 in the labels as we can't decode them.
#     labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
#     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

#     result = metric.compute(predictions=decoded_preds, references=decoded_labels)
#     result = {k: round(v * 100, 4) for k, v in result.items()}
#     prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
#     result["gen_len"] = np.mean(prediction_lens)
#     return result

In [None]:
# # Create Trainer instance
# trainer = Seq2SeqTrainer(
#     model=model,
#     args=training_args,
#     data_collator=data_collator,
#     train_dataset=tokenized_train_dataset,
#     eval_dataset=tokenized_test_dataset,
#     compute_metrics=compute_metrics,
# )