In [None]:
!pip install datasets transformers nltk apache_beam rouge_score sacrebleu optuna

In [None]:
# Computing metrics 

import numpy as np
from rouge_score import rouge_scorer
from nltk.translate import meteor_score
import sacrebleu
from nltk import word_tokenize 
import nltk
# METEOR relies on WordNet; word_tokenize (used inside compute_metrics) relies
# on the Punkt sentence tokenizer models, so fetch both in this cell.
nltk.download('wordnet')
nltk.download('punkt')
from transformers import AutoTokenizer

model_name = 't5-squad'
model_dir = f"gdrive/MyDrive/Models/{model_name}"
# NOTE(review): this loads the tokenizer from the fine-tuned model directory,
# which presumably only exists once Drive is mounted and a model has been
# saved there - confirm the intended cell order.
tokeniser = AutoTokenizer.from_pretrained(model_dir)

def _as_padded_id_arrays(sequences, pad_id):
    """Return every sequence as a NumPy id array with -100 replaced by ``pad_id``."""
    cleaned = []
    for seq in sequences:
        if hasattr(seq, 'detach'):  # torch tensor, possibly living on the GPU
            seq = seq.detach().cpu().numpy()
        ids = np.asarray(seq)
        # -100 is the ignore index used to pad labels; it is not a valid
        # vocabulary id and cannot be decoded.
        cleaned.append(np.where(ids == -100, pad_id, ids))
    return cleaned


def compute_metrics(predictions_labels):
    """Compute ROUGE-1/2/L, METEOR, BLEU and mean generated length.

    Args:
        predictions_labels: pair ``(predictions, labels)`` of token-id
            sequences. Each side may be a 2-D array or a list of
            (possibly different-length) arrays, lists or tensors. Entries
            equal to -100 are treated as padding.

    Returns:
        dict with keys 'rouge1', 'rouge2', 'rougeL' (mean F-measure),
        'meteor' (mean sentence score), 'bleu' (corpus BLEU scaled to 0-1)
        and 'gen_len' (mean number of non-pad tokens per prediction).
    """
    predictions, labels = predictions_labels
    pad_id = tokeniser.pad_token_id

    predictions = _as_padded_id_arrays(predictions, pad_id)
    labels = _as_padded_id_arrays(labels, pad_id)

    decoded_predictions = tokeniser.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokeniser.batch_decode(labels, skip_special_tokens=True)

    result = {}

    # ROUGE: RougeScorer.score takes (target, prediction) in that order.
    rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = [rouge.score(ref, pred) for ref, pred in zip(decoded_labels, decoded_predictions)]
    for rouge_type in ('rouge1', 'rouge2', 'rougeL'):
        result[rouge_type] = np.mean([score[rouge_type].fmeasure for score in rouge_scores])

    # METEOR works on pre-tokenised text.
    word_tokenized_predictions = [word_tokenize(pred) for pred in decoded_predictions]
    word_tokenized_labels = [word_tokenize(label) for label in decoded_labels]
    meteor_scores = [
        meteor_score.single_meteor_score(ref, pred)
        for ref, pred in zip(word_tokenized_labels, word_tokenized_predictions)
    ]
    result['meteor'] = np.mean(meteor_scores)

    # sacrebleu expects a list of reference *streams*, each stream holding one
    # reference per hypothesis. With a single reference per example that is
    # one stream containing all labels, not one single-item list per label.
    result['bleu'] = sacrebleu.corpus_bleu(decoded_predictions, [decoded_labels]).score / 100

    # Average number of non-padding tokens in each prediction.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return result

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Preprocessing 

from transformers import AutoTokenizer

# Tokenizer used by preprocess_squad / preprocess_nq, which read the
# module-level `tokeniser` when they are called. The training and testing
# cells rebind `tokeniser` before calling them, and also assign the
# `max_input_length` those functions read (it is not set in this cell).
model_name = 't5-squad'
model_dir = f"gdrive/MyDrive/Models/{model_name}"
# NOTE(review): presumably requires Drive to be mounted and a saved model at
# model_dir - confirm this cell is meant to run before training.
tokeniser = AutoTokenizer.from_pretrained(model_dir)

def preprocess_squad(data):
    """Turn a batch of SQuAD rows into tokenised seq2seq features.

    The model input is each stripped context prefixed with 'summarize: ';
    the target is '<question>~<first answer text>'. Both are tokenised with
    the module-level ``tokeniser`` and truncated to ``max_input_length``.

    Args:
        data: batch mapping with 'context', 'question' and 'answers'
            columns; every 'answers' entry is indexed as entry['text'][0].

    Returns:
        dict with 'input_ids' and 'attention_mask' of the inputs and
        'labels' holding the target token ids.
    """
    prompts = ['summarize: ' + passage.strip() for passage in data['context']]
    question_texts = list(data['question'])
    first_answers = [entry['text'][0] for entry in data['answers']]
    targets = [
        question_texts[idx] + '~' + first_answers[idx]
        for idx in range(len(question_texts))
    ]

    encoded_inputs = tokeniser(prompts, truncation=True, max_length=max_input_length)
    encoded_targets = tokeniser(targets, truncation=True, max_length=max_input_length)

    features = {key: encoded_inputs[key] for key in ('input_ids', 'attention_mask')}
    features['labels'] = encoded_targets['input_ids']
    return features
  
def preprocess_nq(data):
    """Turn a batch of NQ rows into tokenised seq2seq features.

    The model input is each stripped context prefixed with 'summarize: ';
    the target is '<question>~<answer>'. Both are tokenised with the
    module-level ``tokeniser`` and truncated to ``max_input_length``.

    Args:
        data: batch mapping with 'context', 'question' and 'answer' columns
            (answers are concatenated as-is, so they must be strings).

    Returns:
        dict with 'input_ids' and 'attention_mask' of the inputs and
        'labels' holding the target token ids.
    """
    prompts = ['summarize: ' + passage.strip() for passage in data['context']]
    question_texts = list(data['question'])
    answer_texts = list(data['answer'])
    targets = [
        question_texts[idx] + '~' + answer_texts[idx]
        for idx in range(len(question_texts))
    ]

    encoded_inputs = tokeniser(prompts, truncation=True, max_length=max_input_length)
    encoded_targets = tokeniser(targets, truncation=True, max_length=max_input_length)

    features = {key: encoded_inputs[key] for key in ('input_ids', 'attention_mask')}
    features['labels'] = encoded_targets['input_ids']
    return features

In [None]:
# Training

from google.colab import drive
from datasets import load_dataset, load_metric, Dataset
import nltk
nltk.download('punkt')
import string
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from sklearn.model_selection import train_test_split
import json

# Mount Google Drive; model_dir and the NQ dataset path below are relative
# paths that start with 'gdrive/'.
drive.mount('/content/gdrive')
# Selects both the dataset branch below and the output model name.
dataset_name = 'squad'
if dataset_name == 'squad':
    model_name = 't5-squad'
else:
    model_name = 't5-nq'
model_dir = f"gdrive/MyDrive/Models/{model_name}"

# Base checkpoint to fine-tune. This rebinds the module-level `tokeniser`
# that the preprocess_* functions and compute_metrics read.
model_checkpoint = 't5-base'
tokeniser = AutoTokenizer.from_pretrained(model_checkpoint)

# Truncation length read by preprocess_squad / preprocess_nq (applied to both
# inputs and targets there).
max_input_length = 512

if dataset_name == 'squad':
    dataset = load_dataset('squad')
    dataset_to_split = dataset['train']
    # 80/20 split of the SQuAD *train* split with a fixed seed.
    # NOTE(review): the results are passed to Dataset.from_dict below, so
    # sklearn's split is presumably returning column dicts here - confirm.
    train_dataset, val_dataset = train_test_split(dataset_to_split, test_size=0.2, random_state=42)
    tokenised_dataset_train = Dataset.from_dict(train_dataset).map(preprocess_squad, batched=True)
    tokenised_dataset_validation = Dataset.from_dict(val_dataset).map(preprocess_squad, batched=True)
else:
    with open('gdrive/MyDrive/Datasets/nq-train.json', 'r') as f:
        data = json.load(f)

    # Re-shape the list of records into column lists for Dataset.from_dict.
    features = {
        "context": [item["context"] for item in data],
        "question": [item["question"] for item in data],
        "answer": [item["answer"] for item in data],
    }

    dataset = Dataset.from_dict(features)
    # Same 80/20 seeded split as the SQuAD branch.
    train_dataset, val_dataset = train_test_split(dataset, test_size=0.2, random_state=42)
    tokenised_dataset_train = Dataset.from_dict(train_dataset).map(preprocess_nq, batched=True)
    tokenised_dataset_validation = Dataset.from_dict(val_dataset).map(preprocess_nq, batched=True)

def create_t5_trainer(trial, tokenised_dataset_train, tokenised_dataset_validation):
    """Build a Seq2SeqTrainer that fine-tunes ``model_checkpoint``.

    Reads the module-level ``model_dir``, ``model_checkpoint``, ``tokeniser``
    and ``compute_metrics``.

    Args:
        trial: kept for interface compatibility; not used by this function.
        tokenised_dataset_train: tokenised training dataset.
        tokenised_dataset_validation: tokenised dataset evaluated every epoch.

    Returns:
        A configured ``Seq2SeqTrainer`` (training is not started here).
    """
    training_args = Seq2SeqTrainingArguments(
        output_dir=model_dir,
        num_train_epochs=5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        evaluation_strategy='epoch',
        logging_strategy='epoch',
        save_strategy='epoch',
        learning_rate=3e-5,
        weight_decay=0.01,
        fp16=True,
        # Evaluate with generate() so compute_metrics receives token ids.
        predict_with_generate=True,
    )

    data_collator = DataCollatorForSeq2Seq(tokeniser)

    trainer = Seq2SeqTrainer(
        model=AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint),
        args=training_args,
        train_dataset=tokenised_dataset_train,
        eval_dataset=tokenised_dataset_validation,
        data_collator=data_collator,
        tokenizer=tokeniser,
        compute_metrics=compute_metrics,
    )

    return trainer

# No hyper-parameter search is driving this run, so there is no trial object;
# create_t5_trainer does not use its `trial` argument.
trainer = create_t5_trainer(None, tokenised_dataset_train, tokenised_dataset_validation)
trainer.train()
# Epoch checkpoints land in checkpoint-* sub-folders of the output directory.
# Write the final model and tokenizer to model_dir itself, which is the path
# the other cells load from.
trainer.save_model(model_dir)

In [None]:
# Testing

from torch.utils.data import DataLoader
from google.colab import drive
from datasets import load_dataset, Dataset
import nltk
nltk.download('punkt')
import string
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
import json

# model_dir and the NQ path below are relative paths under the Drive mount.
drive.mount('/content/gdrive')

dataset_name = 'squad'
if dataset_name == 'squad':
    model_name = 't5-squad'
else:
    model_name = 't5-nq'

model_dir = f"gdrive/MyDrive/Models/{model_name}"
tokeniser = AutoTokenizer.from_pretrained(model_dir)
# Load the fine-tuned model that generate() is called on below.
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()
max_input_length = 512

if dataset_name == 'squad':
    dataset = load_dataset('squad')
    test_dataset = dataset['validation']
    # Drop the raw text columns so only tensor-compatible features remain.
    tokenised_dataset_test = test_dataset.map(
        preprocess_squad, batched=True, remove_columns=test_dataset.column_names
    )
else:
    with open('gdrive/MyDrive/Datasets/nq-dev.json', 'r') as f:
        data = json.load(f)

    features = {
        "context": [item["context"] for item in data],
        "question": [item["question"] for item in data],
        "answer": [item["answer"] for item in data],
    }

    test_dataset = Dataset.from_dict(features)
    tokenised_dataset_test = test_dataset.map(
        preprocess_nq, batched=True, remove_columns=test_dataset.column_names
    )

# Examples have different lengths, so pad each batch dynamically; the collator
# pads labels with -100.
data_collator = DataCollatorForSeq2Seq(tokeniser)
dataloader = DataLoader(tokenised_dataset_test, batch_size=16, collate_fn=data_collator)

all_predictions = []
all_labels = []
for batch in dataloader:
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    label_ids = batch["labels"]

    with torch.no_grad():
        predictions = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            num_beams=8,
            max_length=64,
        )

    # Replace the -100 label padding with the pad id so labels can be decoded.
    label_ids = label_ids.masked_fill(label_ids == -100, tokeniser.pad_token_id)

    # Store one 1-D id array per example (batches differ in padded width).
    all_predictions.extend(predictions.cpu().numpy())
    all_labels.extend(label_ids.numpy())

predictions_labels = [all_predictions, all_labels]
metrics = compute_metrics(predictions_labels)
print(metrics)