In [None]:
!pip install transformers[SentencePiece] datasets evaluate rouge-score SentencePiece accelerate sentence_transformers faiss-cpu

In [None]:
# Colab-only step: mount Google Drive at /content/drive. The QASPER JSON paths
# used by a later cell live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import torch
from datasets import load_dataset, concatenate_datasets 
import evaluate
from transformers import (
    T5Tokenizer, 
    T5ForConditionalGeneration, 
    DataCollatorForSeq2Seq, 
    Seq2SeqTrainingArguments, 
    Seq2SeqTrainer
    ) 
import nltk
# Fetch NLTK's 'punkt' resource. No visible cell uses nltk after this download
# (presumably intended for a ROUGE compute_metrics step — TODO confirm or drop).
nltk.download('punkt')

In [None]:
from qasper_utils import get_QAE2
import json

In [None]:
# ROUGE scorer loaded through the `evaluate` library.
metric = evaluate.load("rouge")

# Use the GPU when torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Load the "squad" dataset's train and validation splits, in that order.
squad_train, squad_dev = (
    load_dataset("squad", split=split_name)
    for split_name in ("train", "validation")
)

In [None]:
# Locations of the QASPER v0.3 JSON files on the mounted Drive.
train_path = "/content/drive/MyDrive/qasper-train-v0.3.json"
dev_path = "/content/drive/MyDrive/qasper-dev-v0.3.json"
test_path = "/content/drive/MyDrive/qasper-test-v0.3.json"


def _read_json(path):
    """Parse the JSON document stored at `path` and return the decoded object."""
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # default, which is not UTF-8 everywhere.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


train_data = _read_json(train_path)
dev_data = _read_json(dev_path)
test_data = _read_json(test_path)

In [None]:
# Run get_QAE2 (from qasper_utils) over each raw split, in train/dev/test order.
qasper_train, qasper_dev, qasper_test = (
    get_QAE2(raw_split) for raw_split in (train_data, dev_data, test_data)
)

In [None]:
import pandas as pd

# Wrap the extracted train/dev records in DataFrames ...
qasper_train_df = pd.DataFrame(qasper_train)
qasper_dev_df = pd.DataFrame(qasper_dev)

# ... and write each one to a CSV in the working directory, without the index column.
for frame, csv_name in (
    (qasper_train_df, "qasper_train_QAE.csv"),
    (qasper_dev_df, "qasper_dev_QAE.csv"),
):
    frame.to_csv(csv_name, index=False)

In [None]:
# Re-load the exported CSVs as `datasets` objects. The "csv" builder exposes a
# single file under the split name "train", hence split="train" for both files.
# NOTE: this rebinds qasper_train / qasper_dev from the get_QAE2 output to datasets.
qasper_train, qasper_dev = (
    load_dataset("csv", data_files=csv_name, split="train")
    for csv_name in ("qasper_train_QAE.csv", "qasper_dev_QAE.csv")
)

In [None]:
# Hub id of the pretrained checkpoint that the cells below fine-tune.
model_checkpoint = "google/flan-t5-base"
# Tokenizer and seq2seq model are both loaded from that same checkpoint.
tokenizer = T5Tokenizer.from_pretrained(model_checkpoint)
model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)

In [None]:
def _build_reader_texts(question, context, answer):
    """Return the ``(input_text, target_text)`` pair in the reader's prompt format.

    Missing values (``None``, e.g. an empty CSV cell read back as null) are
    rendered as empty strings so the literal text "None" never leaks into a
    prompt or a training target.
    """
    question, context, answer = (
        "" if value is None else value for value in (question, context, answer)
    )
    input_text = f"question: {question}  context: {context} </s>"
    target_text = f"{answer} </s>"
    return input_text, target_text


def add_eos_to_examples_squad(example):
    """Add ``input_text`` / ``target_text`` to a SQuAD-style example.

    Args:
        example: mapping with ``question``, ``context`` and
            ``answers['text']`` (a sequence of answer strings).

    Returns:
        The same mapping, with the two text fields added. The first listed
        answer is the target; an example with no answers gets an empty target
        instead of raising ``IndexError``.
    """
    answer_texts = example['answers']['text']
    first_answer = answer_texts[0] if len(answer_texts) > 0 else None
    example['input_text'], example['target_text'] = _build_reader_texts(
        example['question'], example['context'], first_answer
    )
    return example


def add_eos_to_examples_qasper(example):
    """Add ``input_text`` / ``target_text`` to a QASPER example.

    Args:
        example: mapping with ``question``, ``evidence`` (used as the context)
            and ``answer``.

    Returns:
        The same mapping, with the two text fields added. ``None`` fields are
        treated as empty strings.
    """
    example['input_text'], example['target_text'] = _build_reader_texts(
        example['question'], example['evidence'], example['answer']
    )
    return example

In [None]:
def convert_to_features(example_batch, tok=None, max_input_length=1024, max_target_length=128):
    """Tokenise a batch of ``input_text`` / ``target_text`` pairs for seq2seq training.

    Args:
        example_batch: mapping with ``input_text`` and ``target_text`` lists of
            equal length (the shape ``Dataset.map(..., batched=True)`` passes in).
        tok: tokenizer to call; defaults to the notebook-level ``tokenizer``.
            It is called as ``tok(texts, padding=..., truncation=..., max_length=...)``
            and must return ``input_ids`` and ``attention_mask`` lists.
        max_input_length: fixed length the inputs are padded/truncated to.
        max_target_length: fixed length the targets are padded/truncated to.

    Returns:
        dict with ``input_ids``, ``attention_mask``, ``labels`` and
        ``decoder_attention_mask``. Padding positions in ``labels`` are set to
        -100, the index the cross-entropy loss ignores, so the model is not
        trained to predict padding.
    """
    if tok is None:
        tok = tokenizer  # notebook-level tokenizer

    # Padding and truncation are requested explicitly instead of through the
    # deprecated `pad_to_max_length` flag, so over-long texts are always cut
    # to the fixed length.
    input_encodings = tok(
        example_batch['input_text'],
        padding="max_length",
        truncation=True,
        max_length=max_input_length,
    )
    target_encodings = tok(
        example_batch['target_text'],
        padding="max_length",
        truncation=True,
        max_length=max_target_length,
    )

    # Mask label padding via the attention mask (0 marks a padded position).
    labels = [
        [token_id if is_real else -100 for token_id, is_real in zip(ids, mask)]
        for ids, mask in zip(
            target_encodings['input_ids'], target_encodings['attention_mask']
        )
    ]

    return {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': labels,
        'decoder_attention_mask': target_encodings['attention_mask'],
    }

In [None]:
# Add the 'input_text' / 'target_text' columns to every training example,
# using the formatter that matches each dataset's field names.
squad_train_cleaned = squad_train.map(add_eos_to_examples_squad)
qasper_train_cleaned = qasper_train.map(add_eos_to_examples_qasper)

In [None]:
# Both training sets are tokenised the same way: in batches, and always
# recomputed rather than read back from the datasets cache.
_tokenise_kwargs = {"batched": True, "load_from_cache_file": False}

squad_train_tokenised = squad_train_cleaned.map(convert_to_features, **_tokenise_kwargs)
qasper_train_tokenised = qasper_train_cleaned.map(convert_to_features, **_tokenise_kwargs)

In [None]:
# Drop the raw SQuAD columns so the schema lines up with the QASPER set for
# concatenation. Only columns that are still present are removed, so this
# self-reassigning cell can be re-run without raising on missing columns.
_squad_raw_columns = ['id', 'title', 'context', 'question', 'answers']
squad_train_tokenised = squad_train_tokenised.remove_columns(
    [col for col in _squad_raw_columns if col in squad_train_tokenised.column_names]
)

In [None]:
# Drop the raw QASPER columns so the schema lines up with the SQuAD set for
# concatenation. Only columns that are still present are removed, so this
# self-reassigning cell can be re-run without raising on missing columns.
_qasper_raw_columns = ['data_id', 'paper_id', 'question', 'answer', 'evidence', 'type']
qasper_train_tokenised = qasper_train_tokenised.remove_columns(
    [col for col in _qasper_raw_columns if col in qasper_train_tokenised.column_names]
)

In [None]:
# Single training set: the SQuAD examples followed by the QASPER examples.
concat_data = concatenate_datasets([squad_train_tokenised, qasper_train_tokenised])

In [None]:
# torch is already imported in the first import cell; the re-import is harmless.
import torch

# Release cached, currently unused GPU memory held by PyTorch before training starts.
torch.cuda.empty_cache()  # clear GPU

In [None]:
import inspect

batch_size = 8
# Last path component of the hub id, e.g. "google/flan-t5-base" -> "flan-t5-base".
model_name = model_checkpoint.split("/")[-1]

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases. The pip install above is unpinned, so pass whichever keyword the
# installed version accepts.
_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(Seq2SeqTrainingArguments).parameters
    else "evaluation_strategy"
)

args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-reader",  # output directory for checkpoints
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    # Mixed precision only when a CUDA device exists; requesting fp16 on a
    # CPU-only runtime makes the arguments fail validation.
    # NOTE(review): T5-family checkpoints have been reported to overflow under
    # fp16 — consider bf16 on hardware that supports it.
    fp16=torch.cuda.is_available(),
    push_to_hub=False,
    # Effective batch per device = batch_size * gradient_accumulation_steps.
    gradient_accumulation_steps=4,
    optim="adafactor",
    **{_eval_strategy_kwarg: "epoch"},
)

In [None]:
# Batch collator for seq2seq training, built from the tokenizer and model loaded above.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
# Prepare the QASPER dev split exactly like the training data: add the text
# columns, then tokenise in batches without reading from the datasets cache.
# Unlike the training sets, the raw columns are not removed here.
qasper_dev_cleaned = qasper_dev.map(add_eos_to_examples_qasper)
qasper_dev_tokenised = qasper_dev_cleaned.map(convert_to_features, batched=True, load_from_cache_file=False)

In [None]:
import inspect

# Newer transformers releases take the tokenizer through `processing_class`;
# older ones only know `tokenizer`. The pip install above is unpinned, so pass
# whichever keyword the installed version declares.
_tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Seq2SeqTrainer.__init__).parameters
    else "tokenizer"
)

# Train on SQuAD + QASPER, evaluate on the QASPER dev split.
# No compute_metrics callback is passed.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=concat_data,
    eval_dataset=qasper_dev_tokenised,
    data_collator=data_collator,
    **{_tokenizer_kwarg: tokenizer},
)

In [None]:
# Run fine-tuning with the trainer configured above.
trainer.train()

In [None]:
# Save the model to the relative directory "flant5_reader". The zip cell reads
# /content/flant5_reader, so this assumes the working directory is /content — TODO confirm.
trainer.save_model("flant5_reader")

In [None]:
# Archive the saved model directory into a single zip file under /content.
!zip -r /content/flant5_reader.zip /content/flant5_reader