In [1]:
! pip install datasets pandas transformers evaluate tqdm numpy optuna accelerate nltk
import pandas as pd



In [2]:
from datasets import load_dataset, concatenate_datasets
from datasets import DatasetDict

# Load SQuAD 2.0 dataset
squad2 = load_dataset("squad_v2")

def split_dataset(dataset: DatasetDict, train_prop, val_prop, test_prop):
    """Splits a HuggingFace dataset into train, validation, and test sets while maintaining the ratio of impossible and possible answers."""

    train_data = dataset["train"]

    def is_impossible(example):
        return len(example["answers"]["text"]) == 0

    # Separate possible and impossible answers
    possible_answers = train_data.filter(lambda x: not is_impossible(x))
    impossible_answers = train_data.filter(is_impossible)

    # Calculate sizes for train and validation sets
    train_size = int(len(train_data) * train_prop)
    val_size = int(len(train_data) * val_prop)

    # Calculate proportions for possible and impossible answers
    possible_prop = len(possible_answers) / len(train_data)
    impossible_prop = len(impossible_answers) / len(train_data)

    # Create train and validation sets
    train_possible = possible_answers.select(range(int(train_size * possible_prop)))
    train_impossible = impossible_answers.select(range(int(train_size * impossible_prop)))
    train_dataset = concatenate_datasets([train_possible, train_impossible]).shuffle(seed=42)

    remaining_possible = possible_answers.select(range(int(train_size * possible_prop), len(possible_answers)))
    remaining_impossible = impossible_answers.select(range(int(train_size * impossible_prop), len(impossible_answers)))
    remaining_dataset = concatenate_datasets([remaining_possible, remaining_impossible]).shuffle(seed=42)

    val_possible = remaining_dataset.filter(lambda x: not is_impossible(x)).select(range(int(val_size * possible_prop)))
    val_impossible = remaining_dataset.filter(is_impossible).select(range(int(val_size * impossible_prop)))
    val_dataset = concatenate_datasets([val_possible, val_impossible]).shuffle(seed=42)

    test_dataset = dataset["validation"]

    return DatasetDict({
        "train": train_dataset,
        "validation": val_dataset,
        "test": test_dataset,
    })

# Calculate target train/val/test proportions (adjust as needed)
target_train_prop = 0.7001
target_val_prop = 0.2399
target_test_prop = 0.06

# Split the datasets
squad2_split = split_dataset(squad2, target_train_prop, target_val_prop, target_test_prop)
print(len(squad2_split["train"]))
print(len(squad2_split["validation"]))
print(len(squad2_split["test"]))

# Sample 5% of the data from each split
sample_prop = 0.05
squad2_split_sampled = DatasetDict({
    "train": squad2_split["train"].shuffle(seed=42).select(range(int(len(squad2_split["train"]) * sample_prop))),
    "validation": squad2_split["validation"].shuffle(seed=42).select(range(int(len(squad2_split["validation"]) * sample_prop))),
    "test": squad2_split["test"].shuffle(seed=42).select(range(int(len(squad2_split["test"]) * sample_prop))),
})

# Print the resulting sizes for verification
print("Train size:", len(squad2_split_sampled["train"]))
print("Validation size:", len(squad2_split_sampled["validation"]))
print("Test size:", len(squad2_split_sampled["test"]))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


91235
31262
11873
Train size: 4561
Validation size: 1563
Test size: 593


In [3]:
from tqdm.auto import tqdm
import collections
import numpy as np
from datasets import load_metric
from nltk.translate.bleu_score import sentence_bleu

squad_metric = load_metric("squad_v2", trust_remote_code=True)
n_best = 20
max_answer_length = 30

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def compute_metrics(start_logits, end_logits, features, examples):
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answer = {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                    answers.append(answer)

        # Handle no answer possibility
        no_answer_score = start_logit[0] + end_logit[0]

        # Select the answer with the best score or no answer
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            if best_answer["logit_score"] > no_answer_score:
                predicted_answers.append(
                    {
                        "id": example_id,
                        "prediction_text": best_answer["text"],
                        "no_answer_probability": 1 - sigmoid(best_answer["logit_score"] - no_answer_score)
                    }
                )
            else:
                predicted_answers.append(
                    {
                        "id": example_id,
                        "prediction_text": "",
                        "no_answer_probability": sigmoid(no_answer_score - best_answer["logit_score"])
                    }
                )
        else:
            predicted_answers.append(
                {
                    "id": example_id,
                    "prediction_text": "",
                    "no_answer_probability": 1.0
                }
            )

    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]

    # Calculate BLEU score
    bleu_scores = []
    for pred, ref in zip(predicted_answers, theoretical_answers):
        pred_text = pred["prediction_text"]
        ref_texts = ref["answers"]["text"]  # SQuAD v2 structure
        if ref_texts and pred_text:  # Only compute BLEU if there are reference answers and a prediction
            bleu_score = sentence_bleu([text.split() for text in ref_texts], pred_text.split())
            bleu_scores.append(bleu_score)

    # Compute SQuAD metrics
    squad_results = squad_metric.compute(predictions=predicted_answers, references=theoretical_answers)

    # Add BLEU score to the results
    if bleu_scores:
        squad_results["bleu"] = sum(bleu_scores) / len(bleu_scores)  # Average BLEU score
    else:
        squad_results["bleu"] = 0.0

    return squad_results

  squad_metric = load_metric("squad_v2", trust_remote_code=True)


# Roberta-Base

In [4]:
from transformers import AutoTokenizer

roberta_model_checkpoint = "roberta-base"
roberta_tokenizer = AutoTokenizer.from_pretrained(roberta_model_checkpoint)
roberta_tokenizer.is_fast

True

In [5]:
import torch


max_length = 512  # Increased from 384
stride = 128

def roberta_preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = roberta_tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0] if answer["answer_start"] else None
        end_char = start_char + len(answer["text"][0]) if answer["text"] else None

        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if start_char is None or end_char is None:
            start_positions.append(0)
            end_positions.append(0)
        elif offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [6]:
max_length = 512  # Increased from 384
stride = 128

import torch

def roberta_preprocess_eval_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = roberta_tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]

        # Modify the offset mapping
        offset_mapping = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]
        inputs["offset_mapping"][i] = offset_mapping

    inputs["example_id"] = example_ids
    return inputs

In [7]:
roberta_train_dataset = squad2_split_sampled["train"].map(
    roberta_preprocess_training_examples,
    batched=True,
    remove_columns=squad2_split_sampled["train"].column_names,
)
len(squad2_split_sampled["train"]), len(roberta_train_dataset)

(4561, 4572)

In [8]:
roberta_validation_dataset = squad2_split_sampled["validation"].map(
    roberta_preprocess_eval_examples,
    batched=True,
    remove_columns=squad2_split_sampled["validation"].column_names,
)
len(squad2_split_sampled["validation"]), len(roberta_validation_dataset)

(1563, 1565)

In [9]:
roberta_test_dataset = squad2_split_sampled["test"].map(
    roberta_preprocess_eval_examples,
    batched=True,
    remove_columns=squad2_split_sampled["test"].column_names,
)
len(squad2_split_sampled["test"]), len(roberta_test_dataset)

Map:   0%|          | 0/593 [00:00<?, ? examples/s]

(593, 596)

In [10]:
from transformers import AutoModelForQuestionAnswering
roberta_model = AutoModelForQuestionAnswering.from_pretrained(roberta_model_checkpoint)

Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
from transformers import TrainingArguments
from transformers import Trainer
import time

args = TrainingArguments(
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,  # Increased learning rate
    num_train_epochs=3,  # Increased number of epochs
    weight_decay=0.01,
    output_dir="my_awesome_qa_model",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    report_to="none",
    fp16=True,  # Enable mixed precision training for fair comparison
    optim="adamw_torch",  # Explicitly use AdamW
)

trainer = Trainer(
    model=roberta_model,
    args=args,
    train_dataset=roberta_train_dataset,
    eval_dataset=roberta_validation_dataset,
    tokenizer=roberta_tokenizer,
    compute_metrics=compute_metrics
)

start_time = time.time()

trainer.train()

end_time = time.time()
elapsed_time = end_time - start_time

print(f"Training took {elapsed_time:.2f} seconds")



Epoch,Training Loss,Validation Loss
1,No log,No log
2,1.858800,No log
3,1.858800,No log


Training took 579.45 seconds


In [12]:
predictions, _, _ = trainer.predict(roberta_test_dataset)
start_logits, end_logits = predictions
squad_results = compute_metrics(start_logits, end_logits, roberta_test_dataset, squad2_split_sampled["test"])
print(squad_results)

  0%|          | 0/593 [00:00<?, ?it/s]

{'exact': 59.02192242833052, 'f1': 63.645406193200635, 'total': 593, 'HasAns_exact': 57.394366197183096, 'HasAns_f1': 67.04833053721119, 'HasAns_total': 284, 'NoAns_exact': 60.51779935275081, 'NoAns_f1': 60.51779935275081, 'NoAns_total': 309, 'best_exact': 63.57504215851602, 'best_exact_thresh': 0.006277706008404493, 'best_f1': 66.20342152382626, 'best_f1_thresh': 0.006602569483220577, 'bleu': 0.1455383555895281}


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


# Albert-large-v2

In [13]:
from transformers import AutoTokenizer

albert_large_model_checkpoint = "albert-large-v2"
albert_large_tokenizer = AutoTokenizer.from_pretrained(albert_large_model_checkpoint)
albert_large_tokenizer.is_fast

True

In [14]:
import torch

max_length = 512  # Increased from 384
stride = 128

def albert_large_preprocess_eval_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = albert_large_tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
        return_token_type_ids=True
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]

        # Modify the offset mapping
        offset_mapping = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]
        inputs["offset_mapping"][i] = offset_mapping

    inputs["example_id"] = example_ids
    return inputs

In [15]:
import torch

def albert_large_preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = albert_large_tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        return_token_type_ids=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0] if answer["answer_start"] else None
        end_char = start_char + len(answer["text"][0]) if answer["text"] else None

        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if start_char is None or end_char is None:
            start_positions.append(0)
            end_positions.append(0)
        elif offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [16]:
albert_large_train_dataset = squad2_split_sampled["train"].map(
    albert_large_preprocess_training_examples,
    batched=True,
    remove_columns=squad2_split_sampled["train"].column_names,
)
len(squad2_split_sampled["train"]), len(albert_large_train_dataset)

(4561, 4574)

In [17]:
albert_large_test_dataset = squad2_split_sampled["test"].map(
    albert_large_preprocess_eval_examples,
    batched=True,
    remove_columns=squad2_split_sampled["test"].column_names,
)
len(squad2_split_sampled["test"]), len(albert_large_test_dataset)

(593, 596)

In [18]:
albert_large_validation_dataset = squad2_split_sampled["validation"].map(
    albert_large_preprocess_eval_examples,
    batched=True,
    remove_columns=squad2_split_sampled["validation"].column_names,
)
len(squad2_split_sampled["validation"]), len(albert_large_validation_dataset)

Map:   0%|          | 0/1563 [00:00<?, ? examples/s]

(1563, 1565)

In [19]:
from transformers import AutoModelForQuestionAnswering
albert_large_model = AutoModelForQuestionAnswering.from_pretrained(albert_large_model_checkpoint)

Some weights of AlbertForQuestionAnswering were not initialized from the model checkpoint at albert-large-v2 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
from transformers import TrainingArguments
from transformers import Trainer
import time

args = TrainingArguments(
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    output_dir="my_awesome_qa_model",
    per_device_train_batch_size=2,  # Reduced batch size
    per_device_eval_batch_size=2,  # Reduced batch size
    gradient_accumulation_steps=8, # Increased gradient accumulation
    fp16=True, # Enable mixed precision training (if supported)
    report_to="none",
    optim="adamw_torch",  # Explicitly use AdamW
)

trainer = Trainer(
    model=albert_large_model,
    args=args,
    train_dataset=albert_large_train_dataset,
    eval_dataset=albert_large_validation_dataset,
    tokenizer=albert_large_tokenizer,
    compute_metrics=compute_metrics
)

start_time = time.time()

trainer.train()

end_time = time.time()
elapsed_time = end_time - start_time

print(f"Training took {elapsed_time:.2f} seconds")



Epoch,Training Loss,Validation Loss
0,No log,No log
1,1.541400,No log
2,1.541400,No log


Training took 2042.20 seconds


In [21]:
predictions, _, _ = trainer.predict(albert_large_test_dataset)
start_logits, end_logits = predictions
squad_results = compute_metrics(start_logits, end_logits, albert_large_test_dataset, squad2_split_sampled["test"])
print(squad_results)

  0%|          | 0/593 [00:00<?, ?it/s]

{'exact': 73.69308600337268, 'f1': 77.53975109507422, 'total': 593, 'HasAns_exact': 64.78873239436619, 'HasAns_f1': 72.82067746260216, 'HasAns_total': 284, 'NoAns_exact': 81.87702265372168, 'NoAns_f1': 81.87702265372168, 'NoAns_total': 309, 'best_exact': 74.19898819561551, 'best_exact_thresh': 0.07369626313447952, 'best_f1': 77.70838515915516, 'best_f1_thresh': 0.4941408932209015, 'bleu': 0.19201939296258064}


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


# Roberta-large

In [22]:
from transformers import AutoTokenizer
roberta_large_model_checkpoint = "roberta-large"
roberta_large_tokenizer = AutoTokenizer.from_pretrained(roberta_large_model_checkpoint)
roberta_large_tokenizer.is_fast

True

In [23]:
import torch

def roberta_large_preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = roberta_large_tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0] if answer["answer_start"] else None
        end_char = start_char + len(answer["text"][0]) if answer["text"] else None

        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if start_char is None or end_char is None:
            start_positions.append(0)
            end_positions.append(0)
        elif offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [24]:
def roberta_large_preprocess_eval_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = roberta_large_tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]

        # Modify the offset mapping
        offset_mapping = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]
        inputs["offset_mapping"][i] = offset_mapping

    inputs["example_id"] = example_ids
    return inputs

In [25]:
roberta_large_train_dataset = squad2_split_sampled["train"].map(
    roberta_large_preprocess_training_examples,
    batched=True,
    remove_columns=squad2_split_sampled["train"].column_names,
)
len(squad2_split_sampled["train"]), len(roberta_large_train_dataset)

(4561, 4572)

In [26]:
roberta_large_test_dataset = squad2_split_sampled["test"].map(
    roberta_large_preprocess_eval_examples,
    batched=True,
    remove_columns=squad2_split_sampled["test"].column_names,
)
len(squad2_split_sampled["test"]), len(roberta_large_test_dataset)

Map:   0%|          | 0/593 [00:00<?, ? examples/s]

(593, 596)

In [27]:
roberta_large_validation_dataset = squad2_split_sampled["validation"].map(
    roberta_large_preprocess_eval_examples,
    batched=True,
    remove_columns=squad2_split_sampled["validation"].column_names,
)
len(squad2_split_sampled["validation"]), len(roberta_large_validation_dataset)

(1563, 1565)

In [28]:
from transformers import AutoModelForQuestionAnswering
roberta_large_model = AutoModelForQuestionAnswering.from_pretrained(roberta_large_model_checkpoint)

Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
from transformers import TrainingArguments
from transformers import Trainer
import time

args = TrainingArguments(
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    output_dir="my_awesome_qa_model",
    per_device_train_batch_size=4,  # Further reduce batch size
    per_device_eval_batch_size=4,  # Further reduce batch size
    gradient_accumulation_steps=4,  # Use gradient accumulation to increase effective batch size
    report_to="none",
    fp16=True,  # Enable mixed precision training for fair comparison
    optim="adamw_torch",  # Explicitly use AdamW
)

trainer = Trainer(
    model=roberta_large_model,
    args=args,
    train_dataset=roberta_large_train_dataset,
    eval_dataset=roberta_large_validation_dataset,
    tokenizer=roberta_large_tokenizer,
    compute_metrics=compute_metrics
)

start_time = time.time()

trainer.train()

end_time = time.time()
elapsed_time = end_time - start_time

print(f"Training took {elapsed_time:.2f} seconds")



Epoch,Training Loss,Validation Loss
0,No log,No log
1,1.402100,No log
2,1.402100,No log


Training took 2058.54 seconds


In [30]:
predictions, _, _ = trainer.predict(roberta_large_test_dataset)
start_logits, end_logits = predictions
squad_results = compute_metrics(start_logits, end_logits, roberta_large_test_dataset, squad2_split_sampled["test"])
print(squad_results)

  0%|          | 0/593 [00:00<?, ?it/s]

{'exact': 78.58347386172007, 'f1': 82.7091501849123, 'total': 593, 'HasAns_exact': 72.88732394366197, 'HasAns_f1': 81.5018523227218, 'HasAns_total': 284, 'NoAns_exact': 83.81877022653721, 'NoAns_f1': 83.81877022653721, 'NoAns_total': 309, 'best_exact': 78.92074198988196, 'best_exact_thresh': 0.17412921786308289, 'best_f1': 82.93399560368695, 'best_f1_thresh': 0.3402658998966217, 'bleu': 0.18077179444984273}


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


# Distilbert-base-uncased

In [31]:
from transformers import AutoTokenizer

distilbert_model_checkpoint = "distilbert-base-uncased"
distilbert_tokenizer = AutoTokenizer.from_pretrained(distilbert_model_checkpoint)
distilbert_tokenizer.is_fast

True

In [32]:
def distilbert_preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = distilbert_tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0] if answer["answer_start"] else None
        end_char = start_char + len(answer["text"][0]) if answer["text"] else None

        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if start_char is None or end_char is None:
            start_positions.append(0)
            end_positions.append(0)
        elif offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [33]:
def distilbert_preprocess_eval_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = distilbert_tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]

        # Modify the offset mapping
        offset_mapping = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]
        inputs["offset_mapping"][i] = offset_mapping

    inputs["example_id"] = example_ids
    return inputs

In [34]:
distilbert_train_dataset = squad2_split_sampled["train"].map(
    distilbert_preprocess_training_examples,
    batched=True,
    remove_columns=squad2_split_sampled["train"].column_names,
)
len(squad2_split_sampled["train"]), len(distilbert_train_dataset)

(4561, 4571)

In [35]:
distilbert_test_dataset = squad2_split_sampled["test"].map(
    distilbert_preprocess_eval_examples,
    batched=True,
    remove_columns=squad2_split_sampled["test"].column_names,
)
len(squad2_split_sampled["test"]), len(distilbert_test_dataset)

Map:   0%|          | 0/593 [00:00<?, ? examples/s]

(593, 596)

In [36]:
distilbert_validation_dataset = squad2_split_sampled["validation"].map(
    distilbert_preprocess_eval_examples,
    batched=True,
    remove_columns=squad2_split_sampled["validation"].column_names,
)
len(squad2_split_sampled["validation"]), len(distilbert_validation_dataset)

(1563, 1564)

In [37]:
from transformers import AutoModelForQuestionAnswering
distilbert_model = AutoModelForQuestionAnswering.from_pretrained(distilbert_model_checkpoint)

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
from transformers import TrainingArguments
from transformers import Trainer
import time

args = TrainingArguments(
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,  # Increased learning rate
    num_train_epochs=3,  # Increased number of epochs
    weight_decay=0.01,
    output_dir="my_awesome_qa_model",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    report_to="none",
    fp16=True,  # Enable mixed precision training for fair comparison
    optim="adamw_torch",  # Explicitly use AdamW
)

trainer = Trainer(
    model=distilbert_model,
    args=args,
    train_dataset=distilbert_train_dataset,
    eval_dataset=distilbert_validation_dataset,
    tokenizer=distilbert_tokenizer,
    compute_metrics=compute_metrics
)

start_time = time.time()

trainer.train()

end_time = time.time()
elapsed_time = end_time - start_time

print(f"Training took {elapsed_time:.2f} seconds")



Epoch,Training Loss,Validation Loss
1,No log,No log
2,2.668300,No log
3,2.668300,No log


Training took 294.78 seconds


In [39]:
predictions, _, _ = trainer.predict(distilbert_test_dataset)
start_logits, end_logits = predictions
squad_results = compute_metrics(start_logits, end_logits, distilbert_test_dataset, squad2_split_sampled["test"])
print(squad_results)

  0%|          | 0/593 [00:00<?, ?it/s]

{'exact': 46.03709949409781, 'f1': 47.921865000279844, 'total': 593, 'HasAns_exact': 25.704225352112676, 'HasAns_f1': 29.63966882100685, 'HasAns_total': 284, 'NoAns_exact': 64.72491909385113, 'NoAns_f1': 64.72491909385113, 'NoAns_total': 309, 'best_exact': 53.6256323777403, 'best_exact_thresh': 0.01279651839286089, 'best_f1': 53.77017586123825, 'best_f1_thresh': 0.013070785440504551, 'bleu': 0.04168191400717958}


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
