In [25]:
! pip install datasets pandas transformers evaluate tqdm numpy optuna accelerate nltk
import pandas as pd



In [26]:
from datasets import load_dataset
from datasets import DatasetDict

# Load SQuAD datasets
squad1 = load_dataset("squad")


def split_dataset(dataset: DatasetDict, train_prop, val_prop, test_prop):
    """Splits a HuggingFace dataset into train, validation, and test sets."""
    total_size = len(dataset["train"])
    train_size = int(total_size * train_prop)
    val_size = int(total_size * val_prop)

    # Randomly select subsets for each split
    train_dataset = dataset["train"].shuffle(seed=42).select(range(train_size))
    remaining_dataset = dataset["train"].shuffle(seed=42).select(range(train_size, len(dataset["train"])))
    val_dataset = remaining_dataset.select(range(val_size))
    test_dataset = dataset["validation"]

    return DatasetDict({
        "train": train_dataset,
        "validation": val_dataset,
        "test": test_dataset,
    })

# Calculate target train/val/test proportions (adjust as needed)
target_train_prop = 0.78
target_val_prop = 0.22
target_test_prop = 0.1

# Split the datasets
squad1_split = split_dataset(squad1, target_train_prop, target_val_prop, target_test_prop)

print(len(squad1_split["train"]))
print(len(squad1_split["validation"]))
print(len(squad1_split["test"]))

# Sample 5% of the data from each split
sample_prop = 0.05
squad1_split_sampled = DatasetDict({
    "train": squad1_split["train"].shuffle(seed=42).select(range(int(len(squad1_split["train"]) * sample_prop))),
    "validation": squad1_split["validation"].shuffle(seed=42).select(range(int(len(squad1_split["validation"]) * sample_prop))),
    "test": squad1_split["test"].shuffle(seed=42).select(range(int(len(squad1_split["test"]) * sample_prop))),
})

# Print the resulting sizes for verification
print("Train size:", len(squad1_split_sampled["train"]))
print("Validation size:", len(squad1_split_sampled["validation"]))
print("Test size:", len(squad1_split_sampled["test"]))

68327
19271
10570
Train size: 3416
Validation size: 963
Test size: 528


In [27]:
squad1_split_sampled["train"].filter(lambda x: len(x["answers"]["text"]) != 1)

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 0
})

# Roberta-Base

In [28]:
from transformers import AutoTokenizer

roberta_model_checkpoint = "roberta-base"
roberta_tokenizer = AutoTokenizer.from_pretrained(roberta_model_checkpoint)
roberta_tokenizer.is_fast

True

In [29]:
def roberta_preprocess_eval_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = roberta_tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])
        sequence_ids = inputs.sequence_ids(i)

        # Modify the offset mapping
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(inputs["offset_mapping"][i])
        ]

    inputs["example_id"] = example_ids
    return inputs

In [30]:
max_length = 512  # Increased from 384
stride = 128

import torch

def roberta_preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = roberta_tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        context_start = 0
        while sequence_ids[context_start] != 1:
            context_start += 1
        context_end = len(sequence_ids) - 1
        while sequence_ids[context_end] != 1:
            context_end -= 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [31]:
roberta_train_dataset = squad1_split_sampled["train"].map(
    roberta_preprocess_training_examples,
    batched=True,
    remove_columns=squad1_split_sampled["train"].column_names,
)
len(squad1_split_sampled["train"]), len(roberta_train_dataset)

(3416, 3422)

In [32]:
roberta_validation_dataset = squad1_split_sampled["validation"].map(
    roberta_preprocess_eval_examples,
    batched=True,
    remove_columns=squad1_split_sampled["validation"].column_names,
)
len(squad1_split_sampled["validation"]), len(roberta_validation_dataset)

Map:   0%|          | 0/963 [00:00<?, ? examples/s]

(963, 965)

In [33]:
roberta_test_dataset = squad1_split_sampled["test"].map(
    roberta_preprocess_eval_examples,
    batched=True,
    remove_columns=squad1_split_sampled["test"].column_names,
)
len(squad1_split_sampled["test"]), len(roberta_test_dataset)

(528, 528)

In [34]:
import collections
from tqdm import tqdm
from nltk.translate.bleu_score import sentence_bleu
import evaluate
import numpy as np

squad_metric = evaluate.load("squad")

def compute_metrics(start_logits, end_logits, inputs, examples):
    n_best = 20
    max_answer_length = 30

    example_to_features = collections.defaultdict(list)
    for idx, example_id in enumerate(inputs["example_id"]):
        example_to_features[example_id].append(idx)

    predicted_answers = []
    for example in tqdm(examples, desc="Computing metrics"):
        example_id = example["id"]
        context = example["context"]
        answers = []

        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = inputs["offset_mapping"][feature_index]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answer = {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                    answers.append(answer)

        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})

    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]

    # Calculate BLEU score
    bleu_scores = []
    for pred, ref in zip(predicted_answers, theoretical_answers):
        pred_text = pred["prediction_text"]
        ref_texts = ref["answers"]["text"]
        if ref_texts and pred_text:
            bleu_score = sentence_bleu([text.split() for text in ref_texts], pred_text.split())
            bleu_scores.append(bleu_score)

    # Compute SQuAD metrics
    squad_results = squad_metric.compute(predictions=predicted_answers, references=theoretical_answers)

    # Add BLEU score to the results
    if bleu_scores:
        squad_results["bleu"] = sum(bleu_scores) / len(bleu_scores)  # Average BLEU score
    else:
        squad_results["bleu"] = 0.0

    return squad_results

In [35]:
from transformers import AutoModelForQuestionAnswering
roberta_model = AutoModelForQuestionAnswering.from_pretrained(roberta_model_checkpoint)

Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
from transformers import TrainingArguments
from transformers import Trainer
import time

args = TrainingArguments(
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,  # Increased learning rate
    num_train_epochs=3,  # Increased number of epochs
    weight_decay=0.01,
    output_dir="my_awesome_qa_model",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    report_to="none",
    fp16=True,  # Enable mixed precision training for fair comparison
    optim="adamw_torch",  # Explicitly use AdamW
)

trainer = Trainer(
    model=roberta_model,
    args=args,
    train_dataset=roberta_train_dataset,
    eval_dataset=roberta_validation_dataset,
    tokenizer=roberta_tokenizer,
    compute_metrics=compute_metrics
)

start_time = time.time()

trainer.train()

end_time = time.time()
elapsed_time = end_time - start_time

print(f"Training took {elapsed_time:.2f} seconds")



Epoch,Training Loss,Validation Loss
1,No log,No log
2,No log,No log
3,1.536300,No log


Training took 435.82 seconds


In [37]:
predictions, _, _ = trainer.predict(roberta_test_dataset)
start_logits, end_logits = predictions
squad_results = compute_metrics(start_logits, end_logits, roberta_test_dataset, squad1_split_sampled["test"])
print(squad_results)

Computing metrics: 100%|██████████| 528/528 [04:23<00:00,  2.00it/s]

{'exact_match': 75.0, 'f1': 85.10238982344521, 'bleu': 0.14592521034638078}



The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


# Albert-large-v2

In [38]:
from transformers import AutoTokenizer

albert_large_model_checkpoint = "albert-large-v2"
albert_large_tokenizer = AutoTokenizer.from_pretrained(albert_large_model_checkpoint)
albert_large_tokenizer.is_fast

True

In [39]:
import torch

max_length = 512  # Increased from 384
stride = 128

def albert_large_preprocess_eval_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = albert_large_tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs


In [40]:
import torch

def albert_large_preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = albert_large_tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [41]:
albert_large_train_dataset = squad1_split_sampled["train"].map(
    albert_large_preprocess_training_examples,
    batched=True,
    remove_columns=squad1_split_sampled["train"].column_names,
)
len(squad1_split_sampled["train"]), len(albert_large_train_dataset)

(3416, 3423)

In [42]:
albert_large_test_dataset = squad1_split_sampled["test"].map(
    albert_large_preprocess_eval_examples,
    batched=True,
    remove_columns=squad1_split_sampled["test"].column_names,
)
len(squad1_split_sampled["test"]), len(albert_large_test_dataset)

Map:   0%|          | 0/528 [00:00<?, ? examples/s]

(528, 529)

In [43]:
albert_large_validation_dataset = squad1_split_sampled["validation"].map(
    albert_large_preprocess_eval_examples,
    batched=True,
    remove_columns=squad1_split_sampled["validation"].column_names,
)
len(squad1_split_sampled["validation"]), len(albert_large_validation_dataset)

(963, 965)

In [44]:
import collections
from tqdm import tqdm
from nltk.translate.bleu_score import sentence_bleu
import evaluate
import numpy as np

squad_metric = evaluate.load("squad")

def compute_metrics(start_logits, end_logits, inputs, examples):
    n_best = 20
    max_answer_length = 30

    example_to_features = collections.defaultdict(list)
    for idx, example_id in enumerate(inputs["example_id"]):
        example_to_features[example_id].append(idx)

    predicted_answers = []
    for example in tqdm(examples, desc="Computing metrics"):
        example_id = example["id"]
        context = example["context"]
        answers = []

        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = inputs["offset_mapping"][feature_index]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answer = {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                    answers.append(answer)

        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})

    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]

    # Calculate BLEU score
    bleu_scores = []
    for pred, ref in zip(predicted_answers, theoretical_answers):
        pred_text = pred["prediction_text"]
        ref_texts = ref["answers"]["text"]
        if ref_texts and pred_text:
            bleu_score = sentence_bleu([text.split() for text in ref_texts], pred_text.split())
            bleu_scores.append(bleu_score)

    # Compute SQuAD metrics
    squad_results = squad_metric.compute(predictions=predicted_answers, references=theoretical_answers)

    # Add BLEU score to the results
    if bleu_scores:
        squad_results["bleu"] = sum(bleu_scores) / len(bleu_scores)  # Average BLEU score
    else:
        squad_results["bleu"] = 0.0

    return squad_results

In [45]:
from transformers import AutoModelForQuestionAnswering
albert_large_model = AutoModelForQuestionAnswering.from_pretrained(albert_large_model_checkpoint)

Some weights of AlbertForQuestionAnswering were not initialized from the model checkpoint at albert-large-v2 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [46]:
from transformers import TrainingArguments
from transformers import Trainer
import time

args = TrainingArguments(
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    output_dir="my_awesome_qa_model",
    per_device_train_batch_size=2,  # Reduced batch size
    per_device_eval_batch_size=2,  # Reduced batch size
    gradient_accumulation_steps=8, # Increased gradient accumulation
    report_to="none",
    fp16=True,  # Enable mixed precision training for fair comparison
    optim="adamw_torch",  # Explicitly use AdamW
)

trainer = Trainer(
    model=albert_large_model,
    args=args,
    train_dataset=albert_large_train_dataset,
    eval_dataset=albert_large_validation_dataset,
    tokenizer=albert_large_tokenizer,
    compute_metrics=compute_metrics
)

start_time = time.time()

trainer.train()

end_time = time.time()
elapsed_time = end_time - start_time

print(f"Training took {elapsed_time:.2f} seconds")



Epoch,Training Loss,Validation Loss
1,No log,No log
2,No log,No log
3,1.368200,No log


Training took 1516.42 seconds


In [47]:
predictions, _, _ = trainer.predict(albert_large_test_dataset)
start_logits, end_logits = predictions
squad_results = compute_metrics(start_logits, end_logits, albert_large_test_dataset, squad1_split_sampled["test"])
print(squad_results)

Computing metrics: 100%|██████████| 528/528 [04:23<00:00,  2.01it/s]

{'exact_match': 77.46212121212122, 'f1': 87.30480161424889, 'bleu': 0.16383815222511844}



The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


# Roberta-large

In [48]:
from transformers import AutoTokenizer
roberta_large_model_checkpoint = "roberta-large"
roberta_large_tokenizer = AutoTokenizer.from_pretrained(roberta_large_model_checkpoint)
roberta_large_tokenizer.is_fast

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

True

In [49]:
import torch

def roberta_large_preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = roberta_large_tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [50]:
def roberta_large_preprocess_eval_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = roberta_large_tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs



In [51]:
roberta_large_train_dataset = squad1_split_sampled["train"].map(
    roberta_large_preprocess_training_examples,
    batched=True,
    remove_columns=squad1_split_sampled["train"].column_names,
)
len(squad1_split_sampled["train"]), len(roberta_large_train_dataset)

Map:   0%|          | 0/3416 [00:00<?, ? examples/s]

(3416, 3422)

In [52]:
roberta_large_test_dataset = squad1_split_sampled["test"].map(
    roberta_large_preprocess_eval_examples,
    batched=True,
    remove_columns=squad1_split_sampled["test"].column_names,
)
len(squad1_split_sampled["test"]), len(roberta_large_test_dataset)

Map:   0%|          | 0/528 [00:00<?, ? examples/s]

(528, 528)

In [53]:
roberta_large_validation_dataset = squad1_split_sampled["validation"].map(
    roberta_large_preprocess_eval_examples,
    batched=True,
    remove_columns=squad1_split_sampled["validation"].column_names,
)
len(squad1_split_sampled["validation"]), len(roberta_large_validation_dataset)

Map:   0%|          | 0/963 [00:00<?, ? examples/s]

(963, 965)

In [54]:
import collections
from tqdm import tqdm
from nltk.translate.bleu_score import sentence_bleu
import evaluate
import numpy as np

squad_metric = evaluate.load("squad")

def compute_metrics(start_logits, end_logits, inputs, examples):
    n_best = 20
    max_answer_length = 30

    example_to_features = collections.defaultdict(list)
    for idx, example_id in enumerate(inputs["example_id"]):
        example_to_features[example_id].append(idx)

    predicted_answers = []
    for example in tqdm(examples, desc="Computing metrics"):
        example_id = example["id"]
        context = example["context"]
        answers = []

        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = inputs["offset_mapping"][feature_index]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answer = {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                    answers.append(answer)

        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})

    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]

    # Calculate BLEU score
    bleu_scores = []
    for pred, ref in zip(predicted_answers, theoretical_answers):
        pred_text = pred["prediction_text"]
        ref_texts = ref["answers"]["text"]
        if ref_texts and pred_text:
            bleu_score = sentence_bleu([text.split() for text in ref_texts], pred_text.split())
            bleu_scores.append(bleu_score)

    # Compute SQuAD metrics
    squad_results = squad_metric.compute(predictions=predicted_answers, references=theoretical_answers)

    # Add BLEU score to the results
    if bleu_scores:
        squad_results["bleu"] = sum(bleu_scores) / len(bleu_scores)  # Average BLEU score
    else:
        squad_results["bleu"] = 0.0

    return squad_results

In [55]:
from transformers import AutoModelForQuestionAnswering
roberta_large_model = AutoModelForQuestionAnswering.from_pretrained(roberta_large_model_checkpoint)

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [56]:
from transformers import TrainingArguments
from transformers import Trainer
import time

args = TrainingArguments(
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    output_dir="my_awesome_qa_model",
    per_device_train_batch_size=4,  # Further reduce batch size
    per_device_eval_batch_size=4,  # Further reduce batch size
    gradient_accumulation_steps=4,  # Use gradient accumulation to increase effective batch size
    report_to="none",
    fp16=True,  # Enable mixed precision training for fair comparison
    optim="adamw_torch",  # Explicitly use AdamW
)

trainer = Trainer(
    model=roberta_large_model,
    args=args,
    train_dataset=roberta_large_train_dataset,
    eval_dataset=roberta_large_validation_dataset,
    tokenizer=roberta_large_tokenizer,
    compute_metrics=compute_metrics
)

start_time = time.time()

trainer.train()

end_time = time.time()
elapsed_time = end_time - start_time

print(f"Training took {elapsed_time:.2f} seconds")



Epoch,Training Loss,Validation Loss
1,No log,No log
2,No log,No log
3,1.231000,No log


Training took 1626.63 seconds


In [57]:
predictions, _, _ = trainer.predict(roberta_large_test_dataset)
start_logits, end_logits = predictions
squad_results = compute_metrics(start_logits, end_logits, roberta_large_test_dataset, squad1_split_sampled["test"])
print(squad_results)

Computing metrics: 100%|██████████| 528/528 [04:22<00:00,  2.01it/s]

{'exact_match': 81.62878787878788, 'f1': 90.36575396280833, 'bleu': 0.17048226776322115}



The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


# Distilbert-base-uncased

In [58]:
from transformers import AutoTokenizer

distilbert_model_checkpoint = "distilbert-base-uncased"
distilbert_tokenizer = AutoTokenizer.from_pretrained(distilbert_model_checkpoint)
distilbert_tokenizer.is_fast

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

True

In [59]:
def distilbert_preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = distilbert_tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        context_start = sequence_ids.index(1)
        context_end = len(sequence_ids) - 1
        while sequence_ids[context_end] != 1:
            context_end -= 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [60]:
def distilbert_preprocess_eval_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = distilbert_tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs

In [61]:
distilbert_train_dataset = squad1_split_sampled["train"].map(
    distilbert_preprocess_training_examples,
    batched=True,
    remove_columns=squad1_split_sampled["train"].column_names,
)
len(squad1_split_sampled["train"]), len(distilbert_train_dataset)

Map:   0%|          | 0/3416 [00:00<?, ? examples/s]

(3416, 3422)

In [62]:
distilbert_test_dataset = squad1_split_sampled["test"].map(
    distilbert_preprocess_eval_examples,
    batched=True,
    remove_columns=squad1_split_sampled["test"].column_names,
)
len(squad1_split_sampled["test"]), len(distilbert_test_dataset)

Map:   0%|          | 0/528 [00:00<?, ? examples/s]

(528, 529)

In [63]:
distilbert_validation_dataset = squad1_split_sampled["validation"].map(
    distilbert_preprocess_eval_examples,
    batched=True,
    remove_columns=squad1_split_sampled["validation"].column_names,
)
len(squad1_split_sampled["validation"]), len(distilbert_validation_dataset)

Map:   0%|          | 0/963 [00:00<?, ? examples/s]

(963, 965)

In [64]:
import collections
from tqdm.auto import tqdm
import numpy as np
from nltk.translate.bleu_score import sentence_bleu
import evaluate

squad_metric = evaluate.load("squad")
n_best = 20
max_answer_length = 30

def compute_metrics(start_logits, end_logits, features, examples):
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answer = {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                    answers.append(answer)

        # Select the answer with the best score
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})

    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]

    # Calculate BLEU score
    bleu_scores = []
    for pred, ref in zip(predicted_answers, theoretical_answers):
        pred_text = pred["prediction_text"]
        ref_texts = ref["answers"]["text"]  # Assuming this is a list of reference texts
        if ref_texts and pred_text:
            bleu_score = sentence_bleu([text.split() for text in ref_texts], pred_text.split())
            bleu_scores.append(bleu_score)

    # Compute SQuAD metrics
    squad_results = squad_metric.compute(predictions=predicted_answers, references=theoretical_answers)

    # Add BLEU score to the results
    if bleu_scores:
        squad_results["bleu"] = sum(bleu_scores) / len(bleu_scores)  # Average BLEU score
    else:
        squad_results["bleu"] = 0.0

    return squad_results

In [65]:
from transformers import AutoModelForQuestionAnswering
distilbert_model = AutoModelForQuestionAnswering.from_pretrained(distilbert_model_checkpoint)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [66]:
from transformers import TrainingArguments
from transformers import Trainer
import time

args = TrainingArguments(
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,  # Increased learning rate
    num_train_epochs=3,  # Increased number of epochs
    weight_decay=0.01,
    output_dir="my_awesome_qa_model",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    report_to="none",
    fp16=True,  # Enable mixed precision training for fair comparison
    optim="adamw_torch",  # Explicitly use AdamW
)

trainer = Trainer(
    model=distilbert_model,
    args=args,
    train_dataset=distilbert_train_dataset,
    eval_dataset=distilbert_validation_dataset,
    tokenizer=distilbert_tokenizer,
    compute_metrics=compute_metrics
)

start_time = time.time()

trainer.train()

end_time = time.time()
elapsed_time = end_time - start_time

print(f"Training took {elapsed_time:.2f} seconds")



Epoch,Training Loss,Validation Loss
1,No log,No log
2,No log,No log
3,2.656600,No log


Training took 213.46 seconds


In [67]:
predictions, _, _ = trainer.predict(distilbert_test_dataset)
start_logits, end_logits = predictions
squad_results = compute_metrics(start_logits, end_logits, distilbert_test_dataset, squad1_split_sampled["test"])
print(squad_results)

  0%|          | 0/528 [00:00<?, ?it/s]

{'exact_match': 53.97727272727273, 'f1': 66.29326103467068, 'bleu': 0.09222208490505376}


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
