In [None]:
# !pip install transformers[SentencePiece] datasets evaluate rouge-score SentencePiece accelerate

In [None]:
import torch
from datasets import load_dataset
import evaluate
from transformers import (
    T5Tokenizer, 
    T5ForConditionalGeneration, 
    DataCollatorForSeq2Seq, 
    Seq2SeqTrainingArguments, 
    Seq2SeqTrainer
    ) 
import nltk
# Fetch NLTK's "punkt" data package as soon as the import cell runs.
# NOTE(review): no visible cell calls into nltk afterwards; presumably this was meant for
# sentence splitting inside a ROUGE `compute_metrics` hook — confirm it is still needed.
nltk.download('punkt')

In [None]:
# Run on the GPU when one is visible to torch, otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# ROUGE scorer; only the commented-out `compute_metrics` hook of the trainer would use it.
metric = evaluate.load("rouge")

# SQuAD train / validation splits, loaded in that order.
squad_train, squad_dev = (
    load_dataset("squad", split=split_name) for split_name in ("train", "validation")
)

# Base checkpoint to fine-tune, with its tokenizer; the model is moved to `device`.
model_checkpoint = "google/flan-t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_checkpoint)
model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
model = model.to(device)

In [None]:
# Token budget for the encoder input (prefix + context); longer inputs are truncated.
max_input_length = 512
# Token budget for the target question; longer targets are truncated.
max_target_length = 64
# Task prefix prepended to every context before tokenization.
prefix = "generate question: "

def add_eos_examples(example):
  """Append the literal " </s>" marker to a record's context and question.

  The record is updated in place and the very same object is returned, so the
  function can be applied one example at a time through `.map`.

  NOTE(review): T5 tokenizers usually add the end-of-sequence token on their
  own, so this may produce a doubled </s> — confirm that is intended.
  """
  for field in ("context", "question"):
    example[field] = example[field] + " </s>"
  return example

def preprocess_function(examples):
    """Tokenize a batch of records into encoder inputs and decoder labels.

    Args:
        examples: batch mapping with a "context" list (source passages) and a
            "question" list (target questions) of equal length.

    Returns:
        The tokenizer output for the prefixed contexts, with an extra "labels"
        entry holding the token ids of the questions.
    """
    # The task prefix tells the model which task the passage belongs to.
    inputs = [prefix + doc for doc in examples["context"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Targets go through `text_target=`; the `as_target_tokenizer()` context manager
    # used previously is deprecated. A separate call is needed because the targets
    # have their own (shorter) length budget.
    labels = tokenizer(
        text_target=examples["question"],
        max_length=max_target_length,
        truncation=True,
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Step 1: append the " </s>" marker to every context/question.
# The marked-up copies get their own names instead of overwriting `squad_train` /
# `squad_dev`, so re-running this cell cannot stack a second " </s>" onto text
# that was already processed.
squad_train_eos = squad_train.map(add_eos_examples)
squad_dev_eos = squad_dev.map(add_eos_examples)

# Step 2: tokenize in batches, producing the input ids and labels used for training.
tokenized_datasets_train = squad_train_eos.map(preprocess_function, batched=True)
tokenized_datasets_dev = squad_dev_eos.map(preprocess_function, batched=True)

In [None]:
# Per-device batch size, used for both training and evaluation.
batch_size = 16
# Last path component of the checkpoint id ("flan-t5-base"); names the output directory.
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-qg",
    evaluation_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    # Mixed precision is only requested when CUDA is present, mirroring the check that
    # selects `device`; an unconditional fp16=True breaks the CPU fallback.
    # NOTE(review): T5-family checkpoints are reported to overflow in fp16 — consider
    # bf16 on hardware that supports it.
    fp16=torch.cuda.is_available(),
    push_to_hub=False,
    # 4 accumulation steps x batch_size 16 = 64 examples per optimizer step and device.
    gradient_accumulation_steps=4,
    optim="adafactor",
)

In [None]:
# Collator that batches the tokenized examples for the seq2seq trainer.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
)

In [None]:
# Wire model, arguments, data and collator together.
# No metric hook is attached: `compute_metrics` stays disabled below.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets_train,
    eval_dataset=tokenized_datasets_dev,
    tokenizer=tokenizer,
    # compute_metrics=compute_metrics
)

In [None]:
# Run fine-tuning with the arguments configured above (one epoch, evaluated per epoch).
trainer.train()

In [None]:
# Sample passage (contains LaTeX fragments and BIBREF markers) used below as a quick
# smoke test of the fine-tuned question generator.
context = "Although it is challenging for one model to translate all zero-shot directions between multiple distant language pairs of MultiUN, MLM+BRLM-SA still achieves better performances on Es $\\rightarrow $ Ar and Es $\\rightarrow $ Ru than strong pivoting$_{\\rm m}$, which uses MNMT to translate source to pivot then to target in two separate steps with each step receiving supervised signal of parallel corpora. Our approaches surpass pivoting$_{\\rm m}$ in all zero-shot directions by adding back translation BIBREF33 to generate pseudo parallel sentences for all zero-shot directions based on our pretrained models such as MLM+BRLM-SA, and further training our universal encoder-decoder model with these pseudo data. BIBREF22 gu2019improved introduces back translation into MNMT, while we adopt it in our transfer approaches. Finally, our best MLM+BRLM-SA with back translation outperforms pivoting$_{\\rm m}$ by 2.4 BLEU points averagely, and outperforms MNMT BIBREF22 by 4.6 BLEU points averagely. Again, in supervised translation directions, MLM+BRLM-SA with back translation also achieves better performance than the original supervised Transformer"

In [None]:
# Switch the model to evaluation mode before sampling from it.
model.eval()

In [None]:
# Tabs are replaced by spaces before the passage is handed to the tokenizer.
context = context.replace('\t', ' ')

# Nucleus sampling: three candidate questions of at most 64 tokens each.
sampling_kwargs = dict(
    max_length=64,
    do_sample=True,
    top_p=0.95,
    num_return_sequences=3,
)

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
  input_ids = tokenizer.encode(prefix + context + "</s>", return_tensors='pt').to(device)
  outputs = model.generate(input_ids=input_ids, **sampling_kwargs)

# Turn each sampled sequence back into text (special tokens stripped) and show it.
for query in tokenizer.batch_decode(outputs, skip_special_tokens=True):
  print(query)

In [None]:
# Persist the fine-tuned model into ./flant5_squad_qg_finetune (relative to the working directory).
trainer.save_model("flant5_squad_qg_finetune")

In [None]:
# !zip -r /content/flant5_squad_qg_finetune.zip /content/flant5_squad_qg_finetune

### Generate Questions for all paragraphs

In [None]:
import pandas as pd

In [None]:
# Load the paragraph table from a tab-separated file in the working directory.
# Later cells read its "paper_id" and "paragraph" columns.
all_paragraphs_qasper = pd.read_csv("all_paragraphs_train.tsv", sep="\t")

In [None]:
# Preview the first rows to check that the file parsed as expected.
all_paragraphs_qasper.head()

In [None]:
# Peek at the first per-paper group only: its shape shows how many paragraph rows
# one paper contributes. Nothing is printed when the table has no groups.
first_paper = next(iter(all_paragraphs_qasper.groupby("paper_id")), None)
if first_paper is not None:
  name, group = first_paper
  print(group.shape)

In [None]:
# Pull the raw paragraph texts out of the table; the cell output shows how many there are.
paragraphs = all_paragraphs_qasper["paragraph"].values
len(paragraphs)

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
from tqdm.auto import tqdm

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Reload the fine-tuned checkpoint from the directory it was saved to. The path is
# relative, exactly as passed to `trainer.save_model`, so the notebook no longer
# depends on the working directory being /content.
finetuned_model_dir = 'flant5_squad_qg_finetune'
tokenizer = T5Tokenizer.from_pretrained(finetuned_model_dir)
model = T5ForConditionalGeneration.from_pretrained(finetuned_model_dir).to(device)
# Evaluation mode for inference.
model.eval()

In [None]:
import os

pairs = []
file_count = 0
prefix = "generate question: "
# A new output file is started once the buffer holds more than this many lines.
max_buffered_pairs = 2048
# Input token cap, matching the 512-token limit applied when fine-tuning.
max_input_tokens = 512

# All output goes to data/; create it up front so the first flush cannot fail
# after a long stretch of generation.
os.makedirs('data', exist_ok=True)


def _one_line(text):
    """Replace tabs and line breaks with spaces so `text` fits in a single TSV cell."""
    for separator in ('\t', '\r', '\n'):
        text = text.replace(separator, ' ')
    return text


def _write_pairs(lines, index):
    """Write `lines` to data/pairs_<index>.tsv, one "query<TAB>passage" pair per line."""
    with open(f'data/pairs_{index}.tsv', 'w', encoding='utf-8') as fp:
        fp.write('\n'.join(lines))


# set to no_grad as we don't need to calculate gradients for back prop
with torch.no_grad():
    # loop through each passage individually
    for p in tqdm(paragraphs):
        # Missing values typically arrive from pandas as float NaN; skip anything
        # that is not text instead of crashing on `.replace`.
        if not isinstance(p, str):
            continue
        # A tab or newline inside the passage would break the one-pair-per-line format.
        p = _one_line(p)
        # create input tokens, truncated to the length the model was trained with
        input_ids = tokenizer.encode(
            prefix + p + "</s>",
            return_tensors='pt',
            truncation=True,
            max_length=max_input_tokens,
        ).to(device)
        # generate output tokens (query generation): two sampled questions per passage
        outputs = model.generate(
            input_ids=input_ids,
            max_length=64,
            do_sample=True,
            top_p=0.95,
            num_return_sequences=2
        )
        # decode output tokens to human-readable language
        for output in outputs:
            query = tokenizer.decode(output, skip_special_tokens=True)
            # append (query, passage) pair to pairs list, separate by \t
            pairs.append(_one_line(query) + '\t' + p)

        # once the buffer exceeds the limit, write it out and start a new file
        if len(pairs) > max_buffered_pairs:
            _write_pairs(pairs, file_count)
            file_count += 1
            pairs = []


# Save the final, partially filled buffer. An empty buffer is skipped: the previous
# `is not None` check was always true and could leave a zero-byte file behind.
if pairs:
    _write_pairs(pairs, file_count)


In [None]:
import csv
import glob
import re

# Discover the pair files actually on disk instead of assuming exactly 42 of them,
# and read them in numeric order (pairs_2 before pairs_10).
_pairs_file_pattern = re.compile(r'pairs_(\d+)\.tsv$')
_indexed_pair_files = []
for _path in glob.glob('data/pairs_*.tsv'):
    _match = _pairs_file_pattern.search(_path)
    if _match:
        _indexed_pair_files.append((int(_match.group(1)), _path))
pair_files = [_path for _, _path in sorted(_indexed_pair_files)]

if not pair_files:
    raise FileNotFoundError("no data/pairs_<n>.tsv files found; run the generation cell first")

# The files were written as raw "query<TAB>passage" lines without any quoting, so
# quote characters in the text must be treated as ordinary characters when reading.
dfs = [
    pd.read_csv(path, sep='\t', header=None, quoting=csv.QUOTE_NONE)
    for path in pair_files
]

In [None]:
# Stack the per-file frames row-wise: column 0 holds the query, column 1 the passage.
all_pairs = pd.concat(objs=dfs, axis="index")

### Cleaning

In [None]:
# Drop every row that has a missing query or a missing passage.
all_pairs_na = all_pairs.dropna()

In [None]:
# Remove passages containing "$$" (display-math delimiters).
# The pattern is a raw string: "\$" in a normal string literal is an invalid escape
# sequence. Non-string cells count as a match and are dropped, as before.
all_pairs_na_no_eq = all_pairs_na[~all_pairs_na[1].str.contains(r"\$\$", na=True)]

In [None]:
# Keep only rows whose generated query contains a question mark.
# The pattern is a raw string: "\?" in a normal string literal is an invalid escape
# sequence. Non-string cells count as "no match" and are dropped, as before.
all_pairs_na_no_eq_only_questions = all_pairs_na_no_eq[
    all_pairs_na_no_eq[0].str.contains(r"\?", na=False)
]

In [None]:
# Remove passages containing a "$" immediately followed by a backslash (inline LaTeX).
_has_inline_latex = all_pairs_na_no_eq_only_questions[1].str.contains(r'\$\\', na=True)
all_pairs_na_no_eq2_only_questions = all_pairs_na_no_eq_only_questions[~_has_inline_latex]

In [None]:
# Remove passages containing a LaTeX "\begin{" environment opener.
_has_latex_env = all_pairs_na_no_eq2_only_questions[1].str.contains(r'\\begin{', na=True)
df2 = all_pairs_na_no_eq2_only_questions[~_has_latex_env]

In [None]:
# Remove passages mentioning "hline" (LaTeX table rules).
_has_hline = df2[1].str.contains(r'hline', na=True)
df3 = df2[~_has_hline]

In [None]:
# Keep only passages with at least 30 whitespace-separated tokens.
# The previous chained assignment (`df4 = df2 = ...`) also overwrote `df2`, the input
# of the preceding filter step; only `df4` is assigned now.
df4 = df3[df3[1].str.split().str.len() >= 30]

In [None]:
# Export the lightly cleaned pairs as TSV without the index column.
# NOTE(review): this writes the intermediate frame (missing-value, "$$" and question-mark
# filters only), not the later LaTeX/length-filtered ones — confirm that is intended.
all_pairs_na_no_eq_only_questions.to_csv("qasper-flant5-genq-pairs-2-cleaned.tsv", sep='\t', index=False)

In [None]:
# Export the pairs that also passed the LaTeX and minimum-length filters (TSV, no index column).
df4.to_csv("qasper-flant5-genq-pairs-2-supercleaned.tsv", sep='\t', index=False)

In [None]:
# Remove passages containing a LaTeX "\text" command, then summarize what is left.
_has_text_cmd = df4[1].str.contains(r'\\text', na=True)
df5 = df4[~_has_text_cmd]
df5.info()

In [None]:
# Remove passages containing the sequence "${\" (braced inline math), then summarize what is left.
_has_braced_math = df5[1].str.contains(r'\$\{\\', na=True)
df6 = df5[~_has_braced_math]
df6.info()

In [None]:
# Remove passages containing the lowercase substring "thank", then summarize what is left.
_has_thanks = df6[1].str.contains("thank", na=True)
df7 = df6[~_has_thanks]
df7.info()

In [None]:
# Export the fully filtered pairs as the final dataset (TSV, no index column).
df7.to_csv("qasper-flant5-genq-pairs-2-final-cleaned.tsv", sep='\t', index=False)