# Prepare SQuAD_tiny Dataset for Assignment 2

This code prepare SQuAD_tiny from the SQuAD dataset. 

# 0. Import libraries

In [1]:
import os
import torch
import numpy as np
from datasets import load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer,AutoTokenizer, TrainingArguments, DataCollatorForSeq2Seq, EarlyStoppingCallback, AutoModelForSeq2SeqLM,TrainerCallback
from rouge_score import rouge_scorer
from tqdm import tqdm
from transformers import logging as transformers_logging

In [2]:
# Set seed for reproducibility
torch.manual_seed(42)

<torch._C.Generator at 0x139175a10>

# 1. Load and preprocess SQuAD dataset

In [3]:
# 1. Load and preprocess SQuAD dataset
dataset = load_dataset("squad")

In [4]:
# Take subsets to avoid overload
train_dataset = dataset["train"].select(range(10000,11000))
val_dataset = dataset["validation"].select(range(3000,3100))
test_dataset = dataset["validation"].select(range(3100, 3200))  # No official SQuAD test set

In [5]:
print("Size of training set:", len(train_dataset))
print("Size of validation set:", len(val_dataset))
print("Size of testing set:", len(test_dataset))

Size of training set: 1000
Size of validation set: 100
Size of testing set: 100


In [6]:
MODEL_NAME = "t5-small"
#MODEL_NAME = "t5-base"
MAX_INPUT_LENGTH = 512
MAX_OUTPUT_LENGTH = 128
# Load tokenizer and model
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


# Preprocessing data before put into t-5

In [7]:
def encode_question_and_context(question, context):
    return f"question: {question}  context: {context}"

# Obtains the context, question and answer from a given sample.
def extract_sample_parts(sample):
    context = sample["context"]
    question = sample["question"]
    answer = sample["answers"]['text'][0]
    question_with_context = encode_question_and_context(question, context)
    return (question_with_context, question, answer)

# Encodes the sample, returning token IDs.
def preprocess(sample):
    # Extract data from sample.
    question_with_context, question, answer = extract_sample_parts(sample)

    # Using truncation causes the tokenizer to emit a warning for every sample.
    # This will generate a significant amount of messages, and likely crash
    # your browser tab. We temporarily disable log messages to work around this.
    # See https://github.com/huggingface/transformers/issues/14285
    old_level = transformers_logging.get_verbosity()
    transformers_logging.set_verbosity_error()
    
    # Generate tokens for the input.
    # We include both the context and the question (first two parameters).
    input_tokens = tokenizer(question_with_context, question, padding="max_length",
                             truncation=True, max_length=MAX_INPUT_LENGTH)

    # Generate tokens for the expected answer. There is no need to include the 
    output_tokens = tokenizer(answer, padding="max_length", truncation=True,
                              max_length=MAX_OUTPUT_LENGTH)

    # Restore old logging level, see above.
    transformers_logging.set_verbosity(old_level)

    # The output of the tokenizer is a map containing {input_ids, attention_mask}.
    # For trianing, we need to add the labels (answer/output tokens) to the map.
    input_tokens["labels"] = np.array(output_tokens["input_ids"])

    return input_tokens

In [8]:
# Preprocess the datasets
training_set_enc = train_dataset.map(preprocess, batched=False)
validation_set_enc = val_dataset.map(preprocess, batched=False)
testing_set_enc = test_dataset.map(preprocess, batched=False)

In [9]:
# Prepare 20 data points for qualitative analysis
q_data = test_dataset.select(range(20))
q_data

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 20
})

# Fine tuning the model

In [10]:
columns = ["input_ids", "attention_mask", "labels"]
training_set_enc.set_format(type="torch", columns=columns)
validation_set_enc.set_format(type="torch", columns=columns)
testing_set_enc.set_format(type="torch", columns=columns)

In [11]:
training_args = TrainingArguments(
output_dir="./results",
num_train_epochs=3,
per_device_train_batch_size=8,
per_device_eval_batch_size=8,
eval_strategy="epoch",
save_strategy="epoch",
learning_rate=2e-5,
weight_decay=0.01,
save_total_limit=2,
logging_dir="./logs",
logging_steps=10,
load_best_model_at_end=True,
)

In [12]:
# Switch the model to training mode, enabling dropout etc layers.
model.train()
trainer = Trainer(
model=model,
args=training_args,
train_dataset=training_set_enc,
eval_dataset=validation_set_enc,
processing_class=tokenizer,
data_collator=DataCollatorForSeq2Seq(tokenizer),
callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)
trainer.train()

  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.6253,0.200828
2,0.231,0.183631
3,0.1938,0.169238


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=375, training_loss=2.217956089655558, metrics={'train_runtime': 541.0495, 'train_samples_per_second': 5.545, 'train_steps_per_second': 0.693, 'total_flos': 406025404416000.0, 'train_loss': 2.217956089655558, 'epoch': 3.0})

In [13]:
trainer.save_model("t5_pretrained")

# Model Evaluation

In [14]:
def display_evaluation(setname, results):
    print(f"{setname} Set Loss:", round(results["eval_loss"], 3))
# Switch the model to evaluation mode, disabling dropout etc layers.
model.eval()
# Evaluate the datasets.
display_evaluation("Training", trainer.evaluate(training_set_enc))
display_evaluation("Testing", trainer.evaluate(testing_set_enc))



Training Set Loss: 0.17
Testing Set Loss: 0.17


In [15]:
from itertools import batched
# Generates a response for a single input/question.
def generate_response(tokenizer, model, question):
# Convert the sentences into a list of numeric tokens. We instruct the tokenizer
# to return PyTorch tensors ("pt") so that we can feed them directly into the model.
    tokenized = tokenizer(question, return_tensors="pt", padding=True, truncation=True,max_length=MAX_OUTPUT_LENGTH).to(model.device)
    # Generate outputs using the model.
    with torch.no_grad():
        outputs = model.generate(**tokenized)
    # The model outputs a list of numeric tokens. To convert these tokens back to
    # sentences, we can use the batch_decode function from the tokenizer.
    outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return outputs
# Generates a list of responses from the specified model, optionally including
# the context in the prompt. If limit is set, then answers will only be generated
# for the first N questions of the dataset.
def generate_answers(tokenizer, model, dataset, use_context=True, limit=None):
# Subsampling if requested.
    if limit is not None:
        dataset = dataset.select(range(limit))
# Create list of encoded tokens, similarly to how we preprocessed the data for
# training. We do this so we can use batch processing to speed up inference.
    questions = []
    inputs = []
    references = []
    for sample in dataset:
        question_with_context, question, answer = extract_sample_parts(sample)
# Only include the context if the caller requested it.
        if use_context:
            inputs.append(question_with_context)
        else:
            inputs.append(question)
# Include the original question/answer.
        questions.append(question)
        references.append(answer)
# Generate responses for each of the prompts/inputs.
# Submitting each question to the model separately would significantly
# increase processing time, especially if the model is located on the GPU.
# Instead, we group questions together in the same batch size that we used
# for training.
    outputs = []
    for samples in batched(inputs, 128):
        # Python's batched() function returns a tuple of the batch
        # size, which we have to first convert to a list.
        responses = generate_response(tokenizer, model, list(samples))
        # generate_responses() returns an equal-sized list of responses.
        outputs.extend(responses)
    # The length of the reference responses should equal the length of the
    # generated responses.
    assert (len(outputs) == len(references))
    return outputs, references, questions

In [16]:
answers_ctx, refs_ctx, questions_ctx = generate_answers(tokenizer, model, test_dataset, True, 100)
answers_noctx, refs_noctx, questions_noctx = generate_answers(tokenizer, model, test_dataset, False, 100)

In [17]:
def display_answer_and_references(question, answer, reference):
    print("*** Without context ***")
    print("Question:", question)
    print("Generated answer:", answer)
    print("Reference answer:", reference)
    print()
# for i in range(5):
#     display_answer_and_references(questions_ctx[i], answers_ctx[i],refs_ctx[i])
# print("*** Without context ***")
for i in range(5):
    display_answer_and_references(questions_noctx[i],answers_noctx[i], refs_noctx[i])

*** Without context ***
Question: What country initially received the largest number of Huguenot refugees?
Generated answer: 
Reference answer: the Dutch Republic

*** Without context ***
Question: How many refugees emigrated to the Dutch Republic?
Generated answer: Wie viele Flüchtlinge migrierten in die Niederlande?
Reference answer: an estimated total of 75,000 to 100,000 people

*** Without context ***
Question: What was the population of the Dutch Republic before this emigration?
Generated answer: 
Reference answer: ca. 2 million

*** Without context ***
Question: What two areas in the Republic were first to grant rights to the Huguenots?
Generated answer: Welche beide Gebiete in der Republik waren die Ersten, die die Huguenots
Reference answer: Amsterdam and the area of West Frisia

*** Without context ***
Question: What declaration predicated the emigration of Huguenot refugees?
Generated answer: Quelle déclaration a proclamé l'émigration des Huguenot refugees?
Reference answer:

# Rouge

In [18]:
# Computes the average score of a given metric from a list of ROUGE scores.
def compute_average_score(scores, metric, key):
    total = 0
    for i in range(len(scores)):
    # Since it's not a map, we have to manually read the attribute.
        total += getattr(scores[i][metric], key)
    return total / len(scores)
    # Computes ROGUE-1, ROGUE-2 and ROGUE-L scores for the given generated
    # answers and reference answers.
def compute_rouge(predictions, references):
    # Compute ROUGE-1, ROGUE-2 and ROUGE-L.
    metrics = ["rouge1", "rouge2", "rougeL"]
    # Use Porter stemmer to strip word suffixes to improve matching.
    scorer = rouge_scorer.RougeScorer(metrics, use_stemmer=True)
    # For each answer/reference pair, compute the ROUGE metrics.
    scores = []
    for prediction, reference in zip(predictions, references):
        scores.append(scorer.score(reference, prediction))
    # Compute the average precision, recall and F1 score for each metric.
    results = {}
    for metric in metrics:
        for k in ["precision", "recall", "fmeasure"]:
            results[f"{metric}_{k}"] = compute_average_score(
                scores, metric, k)
    return results

In [19]:
print("ROUGE with context:", compute_rouge(answers_ctx, refs_ctx))
print()
print("ROUGE without context:", compute_rouge(answers_noctx, refs_noctx))

ROUGE with context: {'rouge1_precision': 0.32556060606060605, 'rouge1_recall': 0.2933791208791209, 'rouge1_fmeasure': 0.29371724431399354, 'rouge2_precision': 0.19666666666666668, 'rouge2_recall': 0.16175, 'rouge2_fmeasure': 0.17185714285714287, 'rougeL_precision': 0.3246515151515152, 'rougeL_recall': 0.29212912087912085, 'rougeL_fmeasure': 0.2926646127350462}

ROUGE without context: {'rouge1_precision': 0.0019090909090909091, 'rouge1_recall': 0.0044444444444444444, 'rouge1_fmeasure': 0.0025384615384615385, 'rouge2_precision': 0.0, 'rouge2_recall': 0.0, 'rouge2_fmeasure': 0.0, 'rougeL_precision': 0.0019090909090909091, 'rougeL_recall': 0.0044444444444444444, 'rougeL_fmeasure': 0.0025384615384615385}


# mrm8488/t5-base-finetuned-squadv2

In [20]:
tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-squadv2")
model = AutoModelForSeq2SeqLM.from_pretrained("mrm8488/t5-base-finetuned-squadv2")

def get_answer(question, context):
  input_text = "question: %s  context: %s" % (question, context)
  features = tokenizer([input_text], return_tensors='pt')

  output = model.generate(input_ids=features['input_ids'], 
               attention_mask=features['attention_mask'])
  
  return tokenizer.decode(output[0])

context = "Manuel have created RuPERTa-base with the support of HF-Transformers and Google"
question = "Who has supported Manuel?"

get_answer(question, context)

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


'<pad> HF-Transformers and Google</s>'

In [21]:
# Generates a response for a single input/question.
def generate_response(tokenizer, model, question):
# Convert the sentences into a list of numeric tokens. We instruct the tokenizer
# to return PyTorch tensors ("pt") so that we can feed them directly into the model.
    tokenized = tokenizer(question, return_tensors="pt", padding=True, truncation=True,max_length=MAX_OUTPUT_LENGTH).to(model.device)
    # Generate outputs using the model.
    with torch.no_grad():
        outputs = model.generate(**tokenized)
    # The model outputs a list of numeric tokens. To convert these tokens back to
    # sentences, we can use the batch_decode function from the tokenizer.
    outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return outputs
# Generates a list of responses from the specified model, optionally including
# the context in the prompt. If limit is set, then answers will only be generated
# for the first N questions of the dataset.
def generate_answers(tokenizer, model, dataset, use_context=True, limit=None):
# Subsampling if requested.
    if limit is not None:
        dataset = dataset.select(range(limit))
# Create list of encoded tokens, similarly to how we preprocessed the data for
# training. We do this so we can use batch processing to speed up inference.
    questions = []
    inputs = []
    references = []
    for sample in dataset:
        question_with_context, question, answer = extract_sample_parts(sample)
# Only include the context if the caller requested it.
        if use_context:
            inputs.append(question_with_context)
        else:
            inputs.append(question)
# Include the original question/answer.
        questions.append(question)
        references.append(answer)
# Generate responses for each of the prompts/inputs.
# Submitting each question to the model separately would significantly
# increase processing time, especially if the model is located on the GPU.
# Instead, we group questions together in the same batch size that we used
# for training.
    outputs = []
    for samples in batched(inputs, 128):
        # Python's batched() function returns a tuple of the batch
        # size, which we have to first convert to a list.
        responses = generate_response(tokenizer, model, list(samples))
        # generate_responses() returns an equal-sized list of responses.
        outputs.extend(responses)
    # The length of the reference responses should equal the length of the
    # generated responses.
    assert (len(outputs) == len(references))
    return outputs, references, questions

In [22]:
answers_ctx, refs_ctx, questions_ctx = generate_answers(tokenizer, model, test_dataset, True, 100)
answers_noctx, refs_noctx, questions_noctx = generate_answers(tokenizer, model, test_dataset, False, 100)

In [23]:
def display_answer_and_references(question, answer, reference):
    print("*** Without context ***")
    print("Question:", question)
    print("Generated answer:", answer)
    print("Reference answer:", reference)
    print()
# for i in range(5):
#     display_answer_and_references(questions_ctx[i], answers_ctx[i],refs_ctx[i])
# print("*** Without context ***")
for i in range(5):
    display_answer_and_references(questions_noctx[i],answers_noctx[i], refs_noctx[i])

*** Without context ***
Question: What country initially received the largest number of Huguenot refugees?
Generated answer: Lieu initial de réfugié(n0,n1)|
Reference answer: the Dutch Republic

*** Without context ***
Question: How many refugees emigrated to the Dutch Republic?
Generated answer: divide(n0,const_100)|divide(#0,const_
Reference answer: an estimated total of 75,000 to 100,000 people

*** Without context ***
Question: What was the population of the Dutch Republic before this emigration?
Generated answer: add(const_1,const_4)|population_year(n0)|
Reference answer: ca. 2 million

*** Without context ***
Question: What two areas in the Republic were first to grant rights to the Huguenots?
Generated answer: 
Reference answer: Amsterdam and the area of West Frisia

*** Without context ***
Question: What declaration predicated the emigration of Huguenot refugees?
Generated answer:          
Reference answer: the revocation of the Edict of Nantes



In [24]:
# Computes the average score of a given metric from a list of ROUGE scores.
def compute_average_score(scores, metric, key):
    total = 0
    for i in range(len(scores)):
    # Since it's not a map, we have to manually read the attribute.
        total += getattr(scores[i][metric], key)
    return total / len(scores)
    # Computes ROGUE-1, ROGUE-2 and ROGUE-L scores for the given generated
    # answers and reference answers.
def compute_rouge(predictions, references):
    # Compute ROUGE-1, ROGUE-2 and ROUGE-L.
    metrics = ["rouge1", "rouge2", "rougeL"]
    # Use Porter stemmer to strip word suffixes to improve matching.
    scorer = rouge_scorer.RougeScorer(metrics, use_stemmer=True)
    # For each answer/reference pair, compute the ROUGE metrics.
    scores = []
    for prediction, reference in zip(predictions, references):
        scores.append(scorer.score(reference, prediction))
    # Compute the average precision, recall and F1 score for each metric.
    results = {}
    for metric in metrics:
        for k in ["precision", "recall", "fmeasure"]:
            results[f"{metric}_{k}"] = compute_average_score(
                scores, metric, k)
    return results

In [25]:
print("ROUGE with context:", compute_rouge(answers_ctx, refs_ctx))
print()
print("ROUGE without context:", compute_rouge(answers_noctx, refs_noctx))

ROUGE with context: {'rouge1_precision': 0.5609801587301587, 'rouge1_recall': 0.5676111111111111, 'rouge1_fmeasure': 0.5423138626079801, 'rouge2_precision': 0.3511666666666666, 'rouge2_recall': 0.35424999999999995, 'rouge2_fmeasure': 0.3412698412698413, 'rougeL_precision': 0.5609801587301587, 'rougeL_recall': 0.5676111111111111, 'rougeL_fmeasure': 0.5423138626079801}

ROUGE without context: {'rouge1_precision': 0.0057619047619047615, 'rouge1_recall': 0.0037777777777777775, 'rouge1_fmeasure': 0.004451324389404885, 'rouge2_precision': 0.0, 'rouge2_recall': 0.0, 'rouge2_fmeasure': 0.0, 'rougeL_precision': 0.0057619047619047615, 'rougeL_recall': 0.0037777777777777775, 'rougeL_fmeasure': 0.004451324389404885}
