In [1]:
! pip install datasets pandas transformers evaluate tqdm numpy optuna accelerate
import pandas as pd

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/547.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.1/547.8 kB[0m [31m3.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl (380 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.1/380.1 kB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from datasets import load_dataset, concatenate_datasets
from datasets import DatasetDict

# Load SQuAD 2.0 dataset
squad2 = load_dataset("squad_v2")

def split_dataset(dataset: DatasetDict, train_prop, val_prop, test_prop):
    """Splits a HuggingFace dataset into train, validation, and test sets while maintaining the ratio of impossible and possible answers."""

    train_data = dataset["train"]

    def is_impossible(example):
        return len(example["answers"]["text"]) == 0

    # Separate possible and impossible answers
    possible_answers = train_data.filter(lambda x: not is_impossible(x))
    impossible_answers = train_data.filter(is_impossible)

    # Calculate sizes for train and validation sets
    train_size = int(len(train_data) * train_prop)
    val_size = int(len(train_data) * val_prop)

    # Calculate proportions for possible and impossible answers
    possible_prop = len(possible_answers) / len(train_data)
    impossible_prop = len(impossible_answers) / len(train_data)

    # Create train and validation sets
    train_possible = possible_answers.select(range(int(train_size * possible_prop)))
    train_impossible = impossible_answers.select(range(int(train_size * impossible_prop)))
    train_dataset = concatenate_datasets([train_possible, train_impossible]).shuffle(seed=42)

    remaining_possible = possible_answers.select(range(int(train_size * possible_prop), len(possible_answers)))
    remaining_impossible = impossible_answers.select(range(int(train_size * impossible_prop), len(impossible_answers)))
    remaining_dataset = concatenate_datasets([remaining_possible, remaining_impossible]).shuffle(seed=42)

    val_possible = remaining_dataset.filter(lambda x: not is_impossible(x)).select(range(int(val_size * possible_prop)))
    val_impossible = remaining_dataset.filter(is_impossible).select(range(int(val_size * impossible_prop)))
    val_dataset = concatenate_datasets([val_possible, val_impossible]).shuffle(seed=42)

    test_dataset = dataset["validation"]

    return DatasetDict({
        "train": train_dataset,
        "validation": val_dataset,
        "test": test_dataset,
    })

# Calculate target train/val/test proportions (adjust as needed)
target_train_prop = 0.7001
target_val_prop = 0.2399
target_test_prop = 0.06

# Split the datasets
squad2_split = split_dataset(squad2, target_train_prop, target_val_prop, target_test_prop)
print(len(squad2_split["train"]))
print(len(squad2_split["validation"]))
print(len(squad2_split["test"]))

# Sample 5% of the data from each split
sample_prop = 0.05
squad2_split_sampled = DatasetDict({
    "train": squad2_split["train"].shuffle(seed=42).select(range(int(len(squad2_split["train"]) * sample_prop))),
    "validation": squad2_split["validation"].shuffle(seed=42).select(range(int(len(squad2_split["validation"]) * sample_prop))),
    "test": squad2_split["test"].shuffle(seed=42).select(range(int(len(squad2_split["test"]) * sample_prop))),
})

# Print the resulting sizes for verification
print("Train size:", len(squad2_split_sampled["train"]))
print("Validation size:", len(squad2_split_sampled["validation"]))
print("Test size:", len(squad2_split_sampled["test"]))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/8.92k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

Filter:   0%|          | 0/130319 [00:00<?, ? examples/s]

Filter:   0%|          | 0/130319 [00:00<?, ? examples/s]

Filter:   0%|          | 0/39084 [00:00<?, ? examples/s]

Filter:   0%|          | 0/39084 [00:00<?, ? examples/s]

91235
31262
11873
Train size: 4561
Validation size: 1563
Test size: 593


In [3]:
from transformers import AutoTokenizer

model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [4]:
tokenizer.is_fast

True

In [4]:
import torch


max_length = 512  # Increased from 384
stride = 128

def preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0] if answer["answer_start"] else None
        end_char = start_char + len(answer["text"][0]) if answer["text"] else None

        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if start_char is None or end_char is None:
            start_positions.append(0)
            end_positions.append(0)
        elif offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs


In [5]:
train_dataset = squad2_split_sampled["train"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=squad2_split_sampled["train"].column_names,
)
len(squad2_split_sampled["train"]), len(train_dataset)

Map:   0%|          | 0/4561 [00:00<?, ? examples/s]

(4561, 4573)

In [6]:
import torch

def preprocess_eval_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]

        # Modify the offset mapping
        offset_mapping = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]
        inputs["offset_mapping"][i] = offset_mapping

    inputs["example_id"] = example_ids
    return inputs

In [7]:
validation_dataset = squad2_split_sampled["validation"].map(
    preprocess_eval_examples,
    batched=True,
    remove_columns=squad2_split_sampled["validation"].column_names,
)
len(squad2_split_sampled["validation"]), len(validation_dataset)

Map:   0%|          | 0/1563 [00:00<?, ? examples/s]

(1563, 1566)

In [8]:
test_dataset = squad2_split_sampled["test"].map(
    preprocess_eval_examples,
    batched=True,
    remove_columns=squad2_split_sampled["test"].column_names,
)
len(squad2_split_sampled["test"]), len(test_dataset)

Map:   0%|          | 0/593 [00:00<?, ? examples/s]

(593, 596)

In [9]:

from tqdm.auto import tqdm
import collections
import numpy as np
from datasets import load_metric
from nltk.translate.bleu_score import sentence_bleu

squad_metric = load_metric("squad_v2", trust_remote_code=True)
n_best = 20
max_answer_length = 30

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def compute_metrics(start_logits, end_logits, features, examples):
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answer = {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                    answers.append(answer)

        # Handle no answer possibility
        no_answer_score = start_logit[0] + end_logit[0]

        # Select the answer with the best score or no answer
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            if best_answer["logit_score"] > no_answer_score:
                predicted_answers.append(
                    {
                        "id": example_id,
                        "prediction_text": best_answer["text"],
                        "no_answer_probability": 1 - sigmoid(best_answer["logit_score"] - no_answer_score)
                    }
                )
            else:
                predicted_answers.append(
                    {
                        "id": example_id,
                        "prediction_text": "",
                        "no_answer_probability": sigmoid(no_answer_score - best_answer["logit_score"])
                    }
                )
        else:
            predicted_answers.append(
                {
                    "id": example_id,
                    "prediction_text": "",
                    "no_answer_probability": 1.0
                }
            )

    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]

    # Calculate BLEU score
    bleu_scores = []
    for pred, ref in zip(predicted_answers, theoretical_answers):
        pred_text = pred["prediction_text"]
        ref_texts = ref["answers"]["text"]  # SQuAD v2 structure
        if ref_texts and pred_text:  # Only compute BLEU if there are reference answers and a prediction
            bleu_score = sentence_bleu([text.split() for text in ref_texts], pred_text.split())
            bleu_scores.append(bleu_score)

    # Compute SQuAD metrics
    squad_results = squad_metric.compute(predictions=predicted_answers, references=theoretical_answers)

    # Add BLEU score to the results
    if bleu_scores:
        squad_results["bleu"] = sum(bleu_scores) / len(bleu_scores)  # Average BLEU score
    else:
        squad_results["bleu"] = 0.0

    return squad_results

  squad_metric = load_metric("squad_v2", trust_remote_code=True)


Downloading builder script:   0%|          | 0.00/2.25k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.19k [00:00<?, ?B/s]

In [10]:
from transformers import AutoModelForQuestionAnswering
def model_init():
    return AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

# Raw performance on squad2.0/ without Hyperparameter Optimization

In [12]:
from transformers import TrainingArguments
from transformers import Trainer
import time
#wandb.init(mode="offline")

args = TrainingArguments(
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,  # Increased learning rate
    num_train_epochs=3,  # Increased number of epochs
    weight_decay=0.01,
    output_dir="my_awesome_qa_model",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    report_to="none",
    fp16=True,  # Enable mixed precision training for fair comparison
    optim="adamw_torch",  # Explicitly use AdamW
)

trainer = Trainer(
    model=None,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    model_init=model_init,
)

start_time = time.time()
trainer.train()
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Training took {elapsed_time:.2f} seconds")



model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,No log
2,2.143100,No log
3,2.143100,No log


Training took 419.25 seconds


In [13]:
predictions, _, _ = trainer.predict(test_dataset)
start_logits, end_logits = predictions
squad_results = compute_metrics(start_logits, end_logits, test_dataset, squad2_split_sampled["test"])
print(squad_results)

  0%|          | 0/593 [00:00<?, ?it/s]

{'exact': 50.42158516020236, 'f1': 53.41100338520661, 'total': 593, 'HasAns_exact': 45.42253521126761, 'HasAns_f1': 51.664524674040585, 'HasAns_total': 284, 'NoAns_exact': 55.016181229773466, 'NoAns_f1': 55.016181229773466, 'NoAns_total': 309, 'best_exact': 55.31197301854975, 'best_exact_thresh': 0.051082734018564224, 'best_f1': 56.537826908320184, 'best_f1_thresh': 0.051082734018564224, 'bleu': 0.09367478224119868}


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


# Random Search

In [14]:
import optuna
import time
import torch

# Objective function for Optuna
def objective(trial):
    # Define the hyperparameter search space
    num_train_epochs = trial.suggest_categorical("num_train_epochs", [2, 3, 4, 5])
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
    weight_decay = trial.suggest_uniform("weight_decay", 0.005, 0.02)
    per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32])
    per_device_eval_batch_size = trial.suggest_categorical("per_device_eval_batch_size", [8, 16, 32])

    # Update training arguments with the sampled hyperparameters
    random_trainer.args.num_train_epochs = num_train_epochs
    random_trainer.args.learning_rate = learning_rate
    random_trainer.args.weight_decay = weight_decay
    random_trainer.args.per_device_train_batch_size = per_device_train_batch_size
    random_trainer.args.per_device_eval_batch_size = per_device_eval_batch_size

    # Train the model with the sampled hyperparameters
    random_trainer.train()

    # Evaluate the model on the validation set
    predictions, _, _ = random_trainer.predict(validation_dataset)
    start_logits, end_logits = predictions
    val_metrics = compute_metrics(start_logits, end_logits, validation_dataset, squad2_split_sampled["validation"])

    # Return the validation F1 score as the objective value
    return val_metrics["f1"]

# Set the initial hyperparameters
random_args = TrainingArguments(
    output_dir="./results_lora_random",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    report_to="none",
    fp16=True,
    optim="adamw_torch",  # Standard optimizer for LoRA
)

random_trainer = Trainer(
    model=None,
    args=random_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    model_init=model_init,
)

# Create an Optuna study with the random sampler
random_study = optuna.create_study(direction="maximize", sampler=optuna.samplers.RandomSampler())

# Optimize the hyperparameters using Optuna with random sampler
random_study.optimize(objective, n_trials=3)

# Print the best hyperparameters and best trial for random sampler
print("Best Hyperparameters (Random Sampler):", random_study.best_params)
print("Best Trial (Random Sampler):", random_study.best_trial)

# Create a new TrainingArguments with the best hyperparameters from random sampling
best_random_args = TrainingArguments(
    output_dir="./results_lora_best_random",
    evaluation_strategy=random_args.evaluation_strategy,
    save_strategy=random_args.save_strategy,
    learning_rate=random_study.best_params['learning_rate'],
    num_train_epochs=random_study.best_params['num_train_epochs'],
    weight_decay=random_study.best_params['weight_decay'],
    per_device_train_batch_size=random_study.best_params['per_device_train_batch_size'],
    per_device_eval_batch_size=random_study.best_params['per_device_eval_batch_size'],
    report_to=random_args.report_to,
    fp16=random_args.fp16,
    optim=random_args.optim,
)

# Create a new trainer with the best arguments from random sampling
best_random_trainer = Trainer(
    model=model_init(),
    args=best_random_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Retrain the model on the full training set with the best hyperparameters from random sampling
print("Retraining with Random Sampler's Best Hyperparameters")
start_time = time.time()
best_random_trainer.train()
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Training took {elapsed_time:.2f} seconds")

# Predict on the test set using the best model from random sampling
predictions, _, _ = best_random_trainer.predict(test_dataset)
start_logits, end_logits = predictions
best_random_metrics = compute_metrics(start_logits, end_logits, test_dataset, squad2_split_sampled["test"])
print("Best Test Metrics (Random Sampler):", best_random_metrics)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2024-07-21 22:05:09,999] A new study created in memory with name: no-name-eca67cd8-2541-4713-bf7b-eb54dc4bdb6c
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform("weight_decay", 0.005, 0.02)
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,No log
2,No log,No log
3,No log,No log
4,2.549400,No log


  0%|          | 0/1563 [00:00<?, ?it/s]

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
[I 2024-07-21 22:14:17,552] Trial 0 finished with value: 47.08766916677423 and parameters: {'num_train_epochs': 4, 'learning_rate': 1.0429187523691336e-05, 'weight_decay': 0.015987670383637645, 'per_device_train_batch_size': 32, 'per_device_eval_batch_size': 8}. Best is trial 0 with value: 47.08766916677423.
  learning_rate

Epoch,Training Loss,Validation Loss
1,No log,No log
2,No log,No log
3,No log,No log


  0%|          | 0/1563 [00:00<?, ?it/s]

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
[I 2024-07-21 22:21:09,006] Trial 1 finished with value: 50.64754541932925 and parameters: {'num_train_epochs': 3, 'learning_rate': 3.961643050241437e-05, 'weight_decay': 0.008761877917209594, 'per_device_train_batch_size': 32, 'per_device_eval_batch_size': 16}. Best is trial 1 with value: 50.64754541932925.
  learning_rate

Epoch,Training Loss,Validation Loss
1,No log,No log
2,No log,No log


  0%|          | 0/1563 [00:00<?, ?it/s]

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
[I 2024-07-21 22:25:51,236] Trial 2 finished with value: 48.29431376248598 and parameters: {'num_train_epochs': 2, 'learning_rate': 3.9281686208593703e-05, 'weight_decay': 0.017809919432482838, 'per_device_train_batch_size': 32, 'per_device_eval_batch_size': 32}. Best is trial 1 with value: 50.64754541932925.


Best Hyperparameters (Random Sampler): {'num_train_epochs': 3, 'learning_rate': 3.961643050241437e-05, 'weight_decay': 0.008761877917209594, 'per_device_train_batch_size': 32, 'per_device_eval_batch_size': 16}
Best Trial (Random Sampler): FrozenTrial(number=1, state=TrialState.COMPLETE, values=[50.64754541932925], datetime_start=datetime.datetime(2024, 7, 21, 22, 14, 17, 553722), datetime_complete=datetime.datetime(2024, 7, 21, 22, 21, 9, 6159), params={'num_train_epochs': 3, 'learning_rate': 3.961643050241437e-05, 'weight_decay': 0.008761877917209594, 'per_device_train_batch_size': 32, 'per_device_eval_batch_size': 16}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'num_train_epochs': CategoricalDistribution(choices=(2, 3, 4, 5)), 'learning_rate': FloatDistribution(high=5e-05, log=True, low=1e-05, step=None), 'weight_decay': FloatDistribution(high=0.02, log=False, low=0.005, step=None), 'per_device_train_batch_size': CategoricalDistribution(choices=(8, 16, 32)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Retraining with Random Sampler's Best Hyperparameters


Epoch,Training Loss,Validation Loss
1,No log,No log
2,No log,No log
3,No log,No log


Training took 391.78 seconds


  0%|          | 0/593 [00:00<?, ?it/s]

Best Test Metrics (Random Sampler): {'exact': 50.42158516020236, 'f1': 53.532863961047845, 'total': 593, 'HasAns_exact': 40.49295774647887, 'HasAns_f1': 46.98939552430061, 'HasAns_total': 284, 'NoAns_exact': 59.54692556634304, 'NoAns_f1': 59.54692556634304, 'NoAns_total': 309, 'best_exact': 53.6256323777403, 'best_exact_thresh': 0.2091464400291443, 'best_f1': 56.021402867945866, 'best_f1_thresh': 0.2091464400291443, 'bleu': 0.09434410433040467}


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


# Grid Search

In [15]:
import optuna
import time
import torch

def create_grid_search_space():
    # Define the hyperparameter search space for grid search
    search_space = {
        "num_train_epochs": [2, 3, 4, 5],
        "learning_rate": [1e-5, 3e-5, 5e-5],
        "weight_decay": [0.005, 0.01, 0.02],
        "per_device_train_batch_size": [8, 16, 32],
        "per_device_eval_batch_size": [8, 16, 32]
    }
    return search_space

def objective(trial):
    # Retrieve the hyperparameters for the current trial
    num_train_epochs = trial.suggest_categorical("num_train_epochs", grid_search_space["num_train_epochs"])
    learning_rate = trial.suggest_categorical("learning_rate", grid_search_space["learning_rate"])
    weight_decay = trial.suggest_categorical("weight_decay", grid_search_space["weight_decay"])
    per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", grid_search_space["per_device_train_batch_size"])
    per_device_eval_batch_size = trial.suggest_categorical("per_device_eval_batch_size", grid_search_space["per_device_eval_batch_size"])

    # Update training arguments with the hyperparameters
    grid_trainer.args.num_train_epochs = num_train_epochs
    grid_trainer.args.learning_rate = learning_rate
    grid_trainer.args.weight_decay = weight_decay
    grid_trainer.args.per_device_train_batch_size = per_device_train_batch_size
    grid_trainer.args.per_device_eval_batch_size = per_device_eval_batch_size

    # Train the model with the hyperparameters
    grid_trainer.train()

    # Evaluate the model on the validation set
    predictions, _, _ = grid_trainer.predict(validation_dataset)
    start_logits, end_logits = predictions
    val_metrics = compute_metrics(start_logits, end_logits, validation_dataset, squad2_split_sampled["validation"])

    # Return the validation F1 score as the objective value
    return val_metrics["f1"]

# Set the initial hyperparameters
grid_args = TrainingArguments(
    output_dir="./results_lora_grid",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    report_to="none",
    fp16=True,
    optim="adamw_torch",  # Standard optimizer for LoRA
)

grid_trainer = Trainer(
    model=None,
    args=grid_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    model_init=model_init,
)

# Create the grid search space
grid_search_space = create_grid_search_space()

# Create an Optuna study with the GridSampler
grid_study = optuna.create_study(direction="maximize", sampler=optuna.samplers.GridSampler(grid_search_space))

# Optimize the hyperparameters using Optuna with the grid search space
grid_study.optimize(objective, n_trials=3)

# Print the best hyperparameters and best trial for grid sampler
print("Best Hyperparameters (Grid Sampler):", grid_study.best_params)
print("Best Trial (Grid Sampler):", grid_study.best_trial)

# Create a new TrainingArguments with the best hyperparameters from grid sampling
best_grid_args = TrainingArguments(
    output_dir="./results_lora_best_grid",
    evaluation_strategy=grid_args.evaluation_strategy,
    save_strategy=grid_args.save_strategy,
    learning_rate=grid_study.best_params['learning_rate'],
    num_train_epochs=grid_study.best_params['num_train_epochs'],
    weight_decay=grid_study.best_params['weight_decay'],
    per_device_train_batch_size=grid_study.best_params['per_device_train_batch_size'],
    per_device_eval_batch_size=grid_study.best_params['per_device_eval_batch_size'],
    report_to=grid_args.report_to,
    fp16=grid_args.fp16,
    optim=grid_args.optim,
)

# Create a new trainer with the best arguments from grid sampling
best_grid_trainer = Trainer(
    model=model_init(),
    args=best_grid_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Retrain the model on the full training set with the best hyperparameters from grid sampling
print("Retraining with Grid Sampler's Best Hyperparameters")
start_time = time.time()
best_grid_trainer.train()
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Training took {elapsed_time:.2f} seconds")

# Predict on the test set using the best model from grid sampling
predictions, _, _ = best_grid_trainer.predict(test_dataset)
start_logits, end_logits = predictions
best_grid_metrics = compute_metrics(start_logits, end_logits, test_dataset, squad2_split_sampled["test"])
print("Best Test Metrics (Grid Sampler):", best_grid_metrics)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2024-07-21 22:32:30,673] A new study created in memory with name: no-name-2b215d4e-dc33-49e1-ad84-187da7ab8dbc
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,No log
2,No log,No log
3,No log,No log


  0%|          | 0/1563 [00:00<?, ?it/s]

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
[I 2024-07-21 22:39:17,540] Trial 0 finished with value: 41.8697626734346 and parameters: {'num_train_epochs': 3, 'learning_rate': 1e-05, 'weight_decay': 0.01, 'per_device_train_batch_size': 32, 'per_device_eval_batch_size': 8}. Best is trial 0 with value: 41.8697626734346.
Some weights of BertForQuestionAnswering were not 

Epoch,Training Loss,Validation Loss
1,2.3357,No log
2,1.3279,No log
3,0.6715,No log
4,0.3345,No log
5,0.1712,No log


  0%|          | 0/1563 [00:00<?, ?it/s]

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
[I 2024-07-21 22:52:18,486] Trial 1 finished with value: 51.84829798172436 and parameters: {'num_train_epochs': 5, 'learning_rate': 5e-05, 'weight_decay': 0.005, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 32}. Best is trial 1 with value: 51.84829798172436.
Some weights of BertForQuestionAnswering were n

Epoch,Training Loss,Validation Loss
1,No log,No log
2,No log,No log


  0%|          | 0/1563 [00:00<?, ?it/s]

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
[I 2024-07-21 22:56:58,607] Trial 2 finished with value: 46.26821233951971 and parameters: {'num_train_epochs': 2, 'learning_rate': 3e-05, 'weight_decay': 0.02, 'per_device_train_batch_size': 32, 'per_device_eval_batch_size': 32}. Best is trial 1 with value: 51.84829798172436.


Best Hyperparameters (Grid Sampler): {'num_train_epochs': 5, 'learning_rate': 5e-05, 'weight_decay': 0.005, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 32}
Best Trial (Grid Sampler): FrozenTrial(number=1, state=TrialState.COMPLETE, values=[51.84829798172436], datetime_start=datetime.datetime(2024, 7, 21, 22, 39, 17, 541675), datetime_complete=datetime.datetime(2024, 7, 21, 22, 52, 18, 485857), params={'num_train_epochs': 5, 'learning_rate': 5e-05, 'weight_decay': 0.005, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 32}, user_attrs={}, system_attrs={'search_space': {'learning_rate': [1e-05, 3e-05, 5e-05], 'num_train_epochs': [2, 3, 4, 5], 'per_device_eval_batch_size': [8, 16, 32], 'per_device_train_batch_size': [8, 16, 32], 'weight_decay': [0.005, 0.01, 0.02]}, 'grid_id': 1}, intermediate_values={}, distributions={'num_train_epochs': CategoricalDistribution(choices=(2, 3, 4, 5)), 'learning_rate': CategoricalDistribution(choices=(1e-05, 3e-05, 5e-05)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Retraining with Grid Sampler's Best Hyperparameters


Epoch,Training Loss,Validation Loss
1,2.339,No log
2,1.3186,No log
3,0.6874,No log
4,0.3232,No log
5,0.1615,No log


Training took 767.00 seconds


  0%|          | 0/593 [00:00<?, ?it/s]

Best Test Metrics (Grid Sampler): {'exact': 54.30016863406408, 'f1': 57.27179609473031, 'total': 593, 'HasAns_exact': 38.38028169014085, 'HasAns_f1': 44.58512353582777, 'HasAns_total': 284, 'NoAns_exact': 68.93203883495146, 'NoAns_f1': 68.93203883495146, 'NoAns_total': 309, 'best_exact': 54.974704890387855, 'best_exact_thresh': 1.8959251974592917e-05, 'best_f1': 57.339249720362716, 'best_f1_thresh': 0.4532618522644043, 'bleu': 0.11017605802559273}


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


# TPE

In [11]:
import optuna
from transformers import TrainingArguments, Trainer
import time
import torch

def objective(trial):
    # Define the hyperparameter search space
    num_train_epochs = trial.suggest_categorical("num_train_epochs", [2, 3, 4, 5])
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
    weight_decay = trial.suggest_uniform("weight_decay", 0.005, 0.02)
    per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32])
    per_device_eval_batch_size = trial.suggest_categorical("per_device_eval_batch_size", [8, 16, 32])

    # Update training arguments with the sampled hyperparameters
    tpe_trainer.args.num_train_epochs = num_train_epochs
    tpe_trainer.args.learning_rate = learning_rate
    tpe_trainer.args.weight_decay = weight_decay
    tpe_trainer.args.per_device_train_batch_size = per_device_train_batch_size
    tpe_trainer.args.per_device_eval_batch_size = per_device_eval_batch_size

    # Train the model with the sampled hyperparameters
    tpe_trainer.train()

    # Evaluate the model on the validation set
    predictions, _, _ = tpe_trainer.predict(validation_dataset)
    start_logits, end_logits = predictions
    val_metrics = compute_metrics(start_logits, end_logits, validation_dataset, squad2_split_sampled["validation"])

    # Return the validation F1 score as the objective value
    return val_metrics["f1"]

# Set the initial hyperparameters
tpe_args = TrainingArguments(
    output_dir="./results_lora_tpe",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    report_to="none",
    fp16=True,
    optim="adamw_torch",  # Standard optimizer for LoRA
)

tpe_trainer = Trainer(
    model=None,
    args=tpe_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    model_init=model_init,
)

# Create an Optuna study with the TPE sampler
tpe_study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler())

# Optimize the hyperparameters using Optuna with TPE sampler
tpe_study.optimize(objective, n_trials=3)

# Print the best hyperparameters and best trial for TPE sampler
print("Best Hyperparameters (TPE Sampler):", tpe_study.best_params)
print("Best Trial (TPE Sampler):", tpe_study.best_trial)

# Create a new TrainingArguments with the best hyperparameters from TPE sampling
best_tpe_args = TrainingArguments(
    output_dir="./results_lora_best_tpe",
    evaluation_strategy=tpe_args.evaluation_strategy,
    save_strategy=tpe_args.save_strategy,
    learning_rate=tpe_study.best_params['learning_rate'],
    num_train_epochs=tpe_study.best_params['num_train_epochs'],
    weight_decay=tpe_study.best_params['weight_decay'],
    per_device_train_batch_size=tpe_study.best_params['per_device_train_batch_size'],
    per_device_eval_batch_size=tpe_study.best_params['per_device_eval_batch_size'],
    report_to=tpe_args.report_to,
    fp16=tpe_args.fp16,
    optim=tpe_args.optim,
)

# Create a new trainer with the best arguments from TPE sampling
best_tpe_trainer = Trainer(
    model=model_init(),
    args=best_tpe_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Retrain the model on the full training set with the best hyperparameters from TPE sampling
print("Retraining with TPE Sampler's Best Hyperparameters")
start_time = time.time()
best_tpe_trainer.train()
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Training took {elapsed_time:.2f} seconds")

# Predict on the test set using the best model from TPE sampling
predictions, _, _ = best_tpe_trainer.predict(test_dataset)
start_logits, end_logits = predictions
best_tpe_metrics = compute_metrics(start_logits, end_logits, test_dataset, squad2_split_sampled["test"])
print("Best Test Metrics (TPE Sampler):", best_tpe_metrics)



model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2024-07-22 17:29:56,491] A new study created in memory with name: no-name-4cedc01b-b428-4b04-8e27-1e1530dbdd39
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform("weight_decay", 0.005, 0.02)
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,No log
2,No log,No log


  0%|          | 0/1563 [00:00<?, ?it/s]

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
[I 2024-07-22 17:34:45,627] Trial 0 finished with value: 47.96739549170168 and parameters: {'num_train_epochs': 2, 'learning_rate': 3.8266926953592744e-05, 'weight_decay': 0.01371571597679647, 'per_device_train_batch_size': 32, 'per_device_eval_batch_size': 16}. Best is trial 0 with value: 47.96739549170168.
  learning_rate

Epoch,Training Loss,Validation Loss
1,No log,No log
2,1.985700,No log
3,1.985700,No log
4,0.623800,No log
5,0.623800,No log


  0%|          | 0/1563 [00:00<?, ?it/s]

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
[I 2024-07-22 17:47:04,587] Trial 1 finished with value: 51.630108292115736 and parameters: {'num_train_epochs': 5, 'learning_rate': 4.972502894910213e-05, 'weight_decay': 0.007717792652707877, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 16}. Best is trial 1 with value: 51.630108292115736.
  learning_ra

Epoch,Training Loss,Validation Loss
1,No log,No log
2,No log,No log


  0%|          | 0/1563 [00:00<?, ?it/s]

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
[I 2024-07-22 17:52:01,424] Trial 2 finished with value: 48.05130981909277 and parameters: {'num_train_epochs': 2, 'learning_rate': 3.677530475809141e-05, 'weight_decay': 0.013671394945872127, 'per_device_train_batch_size': 32, 'per_device_eval_batch_size': 32}. Best is trial 1 with value: 51.630108292115736.


Best Hyperparameters (TPE Sampler): {'num_train_epochs': 5, 'learning_rate': 4.972502894910213e-05, 'weight_decay': 0.007717792652707877, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 16}
Best Trial (TPE Sampler): FrozenTrial(number=1, state=TrialState.COMPLETE, values=[51.630108292115736], datetime_start=datetime.datetime(2024, 7, 22, 17, 34, 45, 628173), datetime_complete=datetime.datetime(2024, 7, 22, 17, 47, 4, 587162), params={'num_train_epochs': 5, 'learning_rate': 4.972502894910213e-05, 'weight_decay': 0.007717792652707877, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 16}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'num_train_epochs': CategoricalDistribution(choices=(2, 3, 4, 5)), 'learning_rate': FloatDistribution(high=5e-05, log=True, low=1e-05, step=None), 'weight_decay': FloatDistribution(high=0.02, log=False, low=0.005, step=None), 'per_device_train_batch_size': CategoricalDistribution(choices=(8, 16, 32)), 

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Retraining with TPE Sampler's Best Hyperparameters


Epoch,Training Loss,Validation Loss
1,No log,No log
2,1.941600,No log
3,1.941600,No log
4,0.635200,No log
5,0.635200,No log


Training took 721.94 seconds


  0%|          | 0/593 [00:00<?, ?it/s]

Best Test Metrics (TPE Sampler): {'exact': 52.44519392917369, 'f1': 55.22346630831064, 'total': 593, 'HasAns_exact': 34.859154929577464, 'HasAns_f1': 40.66026591840921, 'HasAns_total': 284, 'NoAns_exact': 68.6084142394822, 'NoAns_f1': 68.6084142394822, 'NoAns_total': 309, 'best_exact': 55.98650927487353, 'best_exact_thresh': 0.0046636550687253475, 'best_f1': 57.978167845740536, 'best_f1_thresh': 0.0046636550687253475, 'bleu': 0.11356018470010991}


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
