# QLORA

In [1]:
! pip install datasets pandas transformers evaluate tqdm numpy optuna accelerate peft bitsandbytes
import pandas as pd

Collecting datasets
  Using cached datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Using cached evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting optuna
  Using cached optuna-3.6.1-py3-none-any.whl.metadata (17 kB)
Collecting peft
  Using cached peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.2-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Using cached pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Using cached xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Using cached multiprocess-0.70.16-py3

In [2]:
from datasets import load_dataset, concatenate_datasets
from datasets import DatasetDict

# Load SQuAD 2.0 dataset
squad2 = load_dataset("squad_v2")

def split_dataset(dataset: DatasetDict, train_prop, val_prop, test_prop):
    """Splits a HuggingFace dataset into train, validation, and test sets while maintaining the ratio of impossible and possible answers."""

    train_data = dataset["train"]

    def is_impossible(example):
        return len(example["answers"]["text"]) == 0

    # Separate possible and impossible answers
    possible_answers = train_data.filter(lambda x: not is_impossible(x))
    impossible_answers = train_data.filter(is_impossible)

    # Calculate sizes for train and validation sets
    train_size = int(len(train_data) * train_prop)
    val_size = int(len(train_data) * val_prop)

    # Calculate proportions for possible and impossible answers
    possible_prop = len(possible_answers) / len(train_data)
    impossible_prop = len(impossible_answers) / len(train_data)

    # Create train and validation sets
    train_possible = possible_answers.select(range(int(train_size * possible_prop)))
    train_impossible = impossible_answers.select(range(int(train_size * impossible_prop)))
    train_dataset = concatenate_datasets([train_possible, train_impossible]).shuffle(seed=42)

    remaining_possible = possible_answers.select(range(int(train_size * possible_prop), len(possible_answers)))
    remaining_impossible = impossible_answers.select(range(int(train_size * impossible_prop), len(impossible_answers)))
    remaining_dataset = concatenate_datasets([remaining_possible, remaining_impossible]).shuffle(seed=42)

    val_possible = remaining_dataset.filter(lambda x: not is_impossible(x)).select(range(int(val_size * possible_prop)))
    val_impossible = remaining_dataset.filter(is_impossible).select(range(int(val_size * impossible_prop)))
    val_dataset = concatenate_datasets([val_possible, val_impossible]).shuffle(seed=42)

    test_dataset = dataset["validation"]

    return DatasetDict({
        "train": train_dataset,
        "validation": val_dataset,
        "test": test_dataset,
    })

# Calculate target train/val/test proportions (adjust as needed)
target_train_prop = 0.7001
target_val_prop = 0.2399
target_test_prop = 0.06

# Split the datasets
squad2_split = split_dataset(squad2, target_train_prop, target_val_prop, target_test_prop)
print(len(squad2_split["train"]))
print(len(squad2_split["validation"]))
print(len(squad2_split["test"]))

# Sample 5% of the data from each split
sample_prop = 0.05
squad2_split_sampled = DatasetDict({
    "train": squad2_split["train"].shuffle(seed=42).select(range(int(len(squad2_split["train"]) * sample_prop))),
    "validation": squad2_split["validation"].shuffle(seed=42).select(range(int(len(squad2_split["validation"]) * sample_prop))),
    "test": squad2_split["test"].shuffle(seed=42).select(range(int(len(squad2_split["test"]) * sample_prop))),
})

# Print the resulting sizes for verification
print("Train size:", len(squad2_split_sampled["train"]))
print("Validation size:", len(squad2_split_sampled["validation"]))
print("Test size:", len(squad2_split_sampled["test"]))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/8.92k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

Filter:   0%|          | 0/130319 [00:00<?, ? examples/s]

Filter:   0%|          | 0/130319 [00:00<?, ? examples/s]

Filter:   0%|          | 0/39084 [00:00<?, ? examples/s]

Filter:   0%|          | 0/39084 [00:00<?, ? examples/s]

91235
31262
11873
Train size: 4561
Validation size: 1563
Test size: 593


In [3]:
from transformers import AutoTokenizer

model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [4]:
tokenizer.is_fast

True

In [5]:
import torch


max_length = 512  # Increased from 384
stride = 128

def preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0] if answer["answer_start"] else None
        end_char = start_char + len(answer["text"][0]) if answer["text"] else None

        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if start_char is None or end_char is None:
            start_positions.append(0)
            end_positions.append(0)
        elif offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs


In [6]:
train_dataset = squad2_split_sampled["train"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=squad2_split_sampled["train"].column_names,
)
len(squad2_split_sampled["train"]), len(train_dataset)

Map:   0%|          | 0/4561 [00:00<?, ? examples/s]

(4561, 4573)

In [7]:
import torch

def preprocess_eval_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]

        # Modify the offset mapping
        offset_mapping = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]
        inputs["offset_mapping"][i] = offset_mapping

    inputs["example_id"] = example_ids
    return inputs

In [8]:
validation_dataset = squad2_split_sampled["validation"].map(
    preprocess_eval_examples,
    batched=True,
    remove_columns=squad2_split_sampled["validation"].column_names,
)
len(squad2_split_sampled["validation"]), len(validation_dataset)

Map:   0%|          | 0/1563 [00:00<?, ? examples/s]

(1563, 1566)

In [9]:
test_dataset = squad2_split_sampled["test"].map(
    preprocess_eval_examples,
    batched=True,
    remove_columns=squad2_split_sampled["test"].column_names,
)
len(squad2_split_sampled["test"]), len(test_dataset)

Map:   0%|          | 0/593 [00:00<?, ? examples/s]

(593, 596)

In [10]:

from tqdm.auto import tqdm
import collections
import numpy as np
from datasets import load_metric
from nltk.translate.bleu_score import sentence_bleu

squad_metric = load_metric("squad_v2", trust_remote_code=True)
n_best = 20
max_answer_length = 30

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def compute_metrics(start_logits, end_logits, features, examples):
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answer = {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                    answers.append(answer)

        # Handle no answer possibility
        no_answer_score = start_logit[0] + end_logit[0]

        # Select the answer with the best score or no answer
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            if best_answer["logit_score"] > no_answer_score:
                predicted_answers.append(
                    {
                        "id": example_id,
                        "prediction_text": best_answer["text"],
                        "no_answer_probability": 1 - sigmoid(best_answer["logit_score"] - no_answer_score)
                    }
                )
            else:
                predicted_answers.append(
                    {
                        "id": example_id,
                        "prediction_text": "",
                        "no_answer_probability": sigmoid(no_answer_score - best_answer["logit_score"])
                    }
                )
        else:
            predicted_answers.append(
                {
                    "id": example_id,
                    "prediction_text": "",
                    "no_answer_probability": 1.0
                }
            )

    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]

    # Calculate BLEU score
    bleu_scores = []
    for pred, ref in zip(predicted_answers, theoretical_answers):
        pred_text = pred["prediction_text"]
        ref_texts = ref["answers"]["text"]  # SQuAD v2 structure
        if ref_texts and pred_text:  # Only compute BLEU if there are reference answers and a prediction
            bleu_score = sentence_bleu([text.split() for text in ref_texts], pred_text.split())
            bleu_scores.append(bleu_score)

    # Compute SQuAD metrics
    squad_results = squad_metric.compute(predictions=predicted_answers, references=theoretical_answers)

    # Add BLEU score to the results
    if bleu_scores:
        squad_results["bleu"] = sum(bleu_scores) / len(bleu_scores)  # Average BLEU score
    else:
        squad_results["bleu"] = 0.0

    return squad_results

  squad_metric = load_metric("squad_v2", trust_remote_code=True)


Downloading builder script:   0%|          | 0.00/2.25k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.19k [00:00<?, ?B/s]

In [11]:
import torch
from transformers import AutoModelForQuestionAnswering, BitsAndBytesConfig
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training

def create_qlora_model(model_name="bert-base-uncased"):
    # BitsAndBytesConfig for 4-bit quantization
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )

    # Load the model with quantization config
    base_model = AutoModelForQuestionAnswering.from_pretrained(
        model_name,
        quantization_config=bnb_config
    )

    # Prepare the model for k-bit training
    base_model = prepare_model_for_kbit_training(base_model)

    # LoRA config
    lora_config = LoraConfig(
      r=16,  # rank
      lora_alpha=32,
      target_modules=["query", "key", "value"],
      lora_dropout=0.05,
      bias="none",
      task_type=TaskType.CAUSAL_LM
    )

    # Get the PEFT model
    model = get_peft_model(base_model, lora_config)
    # Ignore 'labels' in the forward pass
    def forward(self, **kwargs):
        if 'labels' in kwargs:
            kwargs.pop('labels')
        return self.base_model(**kwargs)

    model.forward = forward.__get__(model)

    return model

In [12]:
from transformers import AutoModelForQuestionAnswering
def model_init():
    return create_qlora_model()

# Raw performance on squad2.0/ without Hyperparameter Optimization

In [13]:
from transformers import TrainingArguments
from transformers import Trainer
import time
#wandb.init(mode="offline")

args = TrainingArguments(
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,  # Increased learning rate
    num_train_epochs=3,  # Increased number of epochs
    weight_decay=0.01,
    output_dir="my_awesome_qa_model",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    report_to="none",
    fp16=True,  # Enable mixed precision training for fair comparison
    optim="adamw_bnb_8bit"
)

trainer = Trainer(
    model=None,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    model_init=model_init,
)

start_time = time.time()
trainer.train()
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Training took {elapsed_time:.2f} seconds")



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
`low_cpu_mem_usage` was None, now set to True since model is quantized.
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,No log
2,4.772900,No log
3,4.772900,No log




Training took 625.45 seconds


In [14]:
predictions, _, _ = trainer.predict(test_dataset)
start_logits, end_logits = predictions
squad_results = compute_metrics(start_logits, end_logits, test_dataset, squad2_split_sampled["test"])
print(squad_results)

  0%|          | 0/593 [00:00<?, ?it/s]

{'exact': 52.1079258010118, 'f1': 52.1079258010118, 'total': 593, 'HasAns_exact': 0.0, 'HasAns_f1': 0.0, 'HasAns_total': 284, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 309, 'best_exact': 52.1079258010118, 'best_exact_thresh': 0.0, 'best_f1': 52.1079258010118, 'best_f1_thresh': 0.0, 'bleu': 0.0}


# Random Search

In [15]:
import optuna
import time
import torch

# Objective function for Optuna
def objective(trial):
    # Define the hyperparameter search space
    num_train_epochs = trial.suggest_categorical("num_train_epochs", [2, 3, 4, 5])
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
    weight_decay = trial.suggest_uniform("weight_decay", 0.005, 0.02)
    per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32])
    per_device_eval_batch_size = trial.suggest_categorical("per_device_eval_batch_size", [8, 16, 32])

    # Update training arguments with the sampled hyperparameters
    random_trainer.args.num_train_epochs = num_train_epochs
    random_trainer.args.learning_rate = learning_rate
    random_trainer.args.weight_decay = weight_decay
    random_trainer.args.per_device_train_batch_size = per_device_train_batch_size
    random_trainer.args.per_device_eval_batch_size = per_device_eval_batch_size

    # Train the model with the sampled hyperparameters
    random_trainer.train()

    # Evaluate the model on the validation set
    predictions, _, _ = random_trainer.predict(validation_dataset)
    start_logits, end_logits = predictions
    val_metrics = compute_metrics(start_logits, end_logits, validation_dataset, squad2_split_sampled["validation"])

    # Return the validation F1 score as the objective value
    return val_metrics["f1"]

# Set the initial hyperparameters
random_args = TrainingArguments(
    output_dir="./results_lora_random",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    report_to="none",
    fp16=True,
    optim="adamw_bnb_8bit"
)

random_trainer = Trainer(
    model=None,
    args=random_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    model_init=model_init,
)

# Create an Optuna study with the random sampler
random_study = optuna.create_study(direction="maximize", sampler=optuna.samplers.RandomSampler())

# Optimize the hyperparameters using Optuna with random sampler
random_study.optimize(objective, n_trials=3)

# Print the best hyperparameters and best trial for random sampler
print("Best Hyperparameters (Random Sampler):", random_study.best_params)
print("Best Trial (Random Sampler):", random_study.best_trial)

# Create a new TrainingArguments with the best hyperparameters from random sampling
best_random_args = TrainingArguments(
    output_dir="./results_lora_best_random",
    evaluation_strategy=random_args.evaluation_strategy,
    save_strategy=random_args.save_strategy,
    learning_rate=random_study.best_params['learning_rate'],
    num_train_epochs=random_study.best_params['num_train_epochs'],
    weight_decay=random_study.best_params['weight_decay'],
    per_device_train_batch_size=random_study.best_params['per_device_train_batch_size'],
    per_device_eval_batch_size=random_study.best_params['per_device_eval_batch_size'],
    report_to=random_args.report_to,
    fp16=random_args.fp16,
    optim=random_args.optim,
)

# Create a new trainer with the best arguments from random sampling
best_random_trainer = Trainer(
    model=model_init(),
    args=best_random_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Retrain the model on the full training set with the best hyperparameters from random sampling
print("Retraining with Random Sampler's Best Hyperparameters")
start_time = time.time()
best_random_trainer.train()
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Training took {elapsed_time:.2f} seconds")

# Predict on the test set using the best model from random sampling
predictions, _, _ = best_random_trainer.predict(test_dataset)
start_logits, end_logits = predictions
best_random_metrics = compute_metrics(start_logits, end_logits, test_dataset, squad2_split_sampled["test"])
print("Best Test Metrics (Random Sampler):", best_random_metrics)

`low_cpu_mem_usage` was None, now set to True since model is quantized.
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2024-07-25 09:01:19,347] A new study created in memory with name: no-name-ef034840-2e56-4922-9869-57d225565d6a
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform("weight_decay", 0.005, 0.02)
`low_cpu_mem_usage` was None, now set to True since model is quantized.
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,5.5007,No log
2,4.3665,No log
3,4.1546,No log




  0%|          | 0/1563 [00:00<?, ?it/s]

[I 2024-07-25 09:12:47,416] Trial 0 finished with value: 33.141394753678824 and parameters: {'num_train_epochs': 3, 'learning_rate': 1.3480432520411644e-05, 'weight_decay': 0.01123461238586223, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 16}. Best is trial 0 with value: 33.141394753678824.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform("weight_decay", 0.005, 0.02)
`low_cpu_mem_usage` was None, now set to True since model is quantized.
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,No log
2,4.925700,No log
3,4.925700,No log
4,3.932800,No log




  0%|          | 0/1563 [00:00<?, ?it/s]

[I 2024-07-25 09:27:05,513] Trial 1 finished with value: 33.141394753678824 and parameters: {'num_train_epochs': 4, 'learning_rate': 2.3609375937082575e-05, 'weight_decay': 0.018156323297160864, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 32}. Best is trial 0 with value: 33.141394753678824.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform("weight_decay", 0.005, 0.02)
`low_cpu_mem_usage` was None, now set to True since model is quantized.
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,No log
2,No log,No log




  0%|          | 0/1563 [00:00<?, ?it/s]

[I 2024-07-25 09:34:16,214] Trial 2 finished with value: 33.141394753678824 and parameters: {'num_train_epochs': 2, 'learning_rate': 1.9102365903626952e-05, 'weight_decay': 0.014939225269813974, 'per_device_train_batch_size': 32, 'per_device_eval_batch_size': 16}. Best is trial 0 with value: 33.141394753678824.


Best Hyperparameters (Random Sampler): {'num_train_epochs': 3, 'learning_rate': 1.3480432520411644e-05, 'weight_decay': 0.01123461238586223, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 16}
Best Trial (Random Sampler): FrozenTrial(number=0, state=TrialState.COMPLETE, values=[33.141394753678824], datetime_start=datetime.datetime(2024, 7, 25, 9, 1, 19, 348700), datetime_complete=datetime.datetime(2024, 7, 25, 9, 12, 47, 416587), params={'num_train_epochs': 3, 'learning_rate': 1.3480432520411644e-05, 'weight_decay': 0.01123461238586223, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 16}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'num_train_epochs': CategoricalDistribution(choices=(2, 3, 4, 5)), 'learning_rate': FloatDistribution(high=5e-05, log=True, low=1e-05, step=None), 'weight_decay': FloatDistribution(high=0.02, log=False, low=0.005, step=None), 'per_device_train_batch_size': CategoricalDistribution(choices=(8, 16, 32))

`low_cpu_mem_usage` was None, now set to True since model is quantized.
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Retraining with Random Sampler's Best Hyperparameters




Epoch,Training Loss,Validation Loss
1,5.4016,No log
2,4.1871,No log
3,4.0714,No log




Training took 663.95 seconds


  0%|          | 0/593 [00:00<?, ?it/s]

Best Test Metrics (Random Sampler): {'exact': 52.1079258010118, 'f1': 52.1079258010118, 'total': 593, 'HasAns_exact': 0.0, 'HasAns_f1': 0.0, 'HasAns_total': 284, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 309, 'best_exact': 52.1079258010118, 'best_exact_thresh': 0.0, 'best_f1': 52.1079258010118, 'best_f1_thresh': 0.0, 'bleu': 0.0}


# Grid Search

In [16]:
import optuna
import time
import torch

def create_grid_search_space():
    # Define the hyperparameter search space for grid search
    search_space = {
        "num_train_epochs": [2, 3, 4, 5],
        "learning_rate": [1e-5, 3e-5, 5e-5],
        "weight_decay": [0.005, 0.01, 0.02],
        "per_device_train_batch_size": [8, 16, 32],
        "per_device_eval_batch_size": [8, 16, 32]
    }
    return search_space

def objective(trial):
    # Retrieve the hyperparameters for the current trial
    num_train_epochs = trial.suggest_categorical("num_train_epochs", grid_search_space["num_train_epochs"])
    learning_rate = trial.suggest_categorical("learning_rate", grid_search_space["learning_rate"])
    weight_decay = trial.suggest_categorical("weight_decay", grid_search_space["weight_decay"])
    per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", grid_search_space["per_device_train_batch_size"])
    per_device_eval_batch_size = trial.suggest_categorical("per_device_eval_batch_size", grid_search_space["per_device_eval_batch_size"])

    # Update training arguments with the hyperparameters
    grid_trainer.args.num_train_epochs = num_train_epochs
    grid_trainer.args.learning_rate = learning_rate
    grid_trainer.args.weight_decay = weight_decay
    grid_trainer.args.per_device_train_batch_size = per_device_train_batch_size
    grid_trainer.args.per_device_eval_batch_size = per_device_eval_batch_size

    # Train the model with the hyperparameters
    grid_trainer.train()

    # Evaluate the model on the validation set
    predictions, _, _ = grid_trainer.predict(validation_dataset)
    start_logits, end_logits = predictions
    val_metrics = compute_metrics(start_logits, end_logits, validation_dataset, squad2_split_sampled["validation"])

    # Return the validation F1 score as the objective value
    return val_metrics["f1"]

# Set the initial hyperparameters
grid_args = TrainingArguments(
    output_dir="./results_lora_grid",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    report_to="none",
    fp16=True,
    optim="adamw_bnb_8bit"
)

grid_trainer = Trainer(
    model=None,
    args=grid_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    model_init=model_init,
)

# Create the grid search space
grid_search_space = create_grid_search_space()

# Create an Optuna study with the GridSampler
grid_study = optuna.create_study(direction="maximize", sampler=optuna.samplers.GridSampler(grid_search_space))

# Optimize the hyperparameters using Optuna with the grid search space
grid_study.optimize(objective, n_trials=3)

# Print the best hyperparameters and best trial for grid sampler
print("Best Hyperparameters (Grid Sampler):", grid_study.best_params)
print("Best Trial (Grid Sampler):", grid_study.best_trial)

# Create a new TrainingArguments with the best hyperparameters from grid sampling
best_grid_args = TrainingArguments(
    output_dir="./results_lora_best_grid",
    evaluation_strategy=grid_args.evaluation_strategy,
    save_strategy=grid_args.save_strategy,
    learning_rate=grid_study.best_params['learning_rate'],
    num_train_epochs=grid_study.best_params['num_train_epochs'],
    weight_decay=grid_study.best_params['weight_decay'],
    per_device_train_batch_size=grid_study.best_params['per_device_train_batch_size'],
    per_device_eval_batch_size=grid_study.best_params['per_device_eval_batch_size'],
    report_to=grid_args.report_to,
    fp16=grid_args.fp16,
    optim=grid_args.optim,
)

# Create a new trainer with the best arguments from grid sampling
best_grid_trainer = Trainer(
    model=model_init(),
    args=best_grid_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Retrain the model on the full training set with the best hyperparameters from grid sampling
print("Retraining with Grid Sampler's Best Hyperparameters")
start_time = time.time()
best_grid_trainer.train()
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Training took {elapsed_time:.2f} seconds")

# Predict on the test set using the best model from grid sampling
predictions, _, _ = best_grid_trainer.predict(test_dataset)
start_logits, end_logits = predictions
best_grid_metrics = compute_metrics(start_logits, end_logits, test_dataset, squad2_split_sampled["test"])
print("Best Test Metrics (Grid Sampler):", best_grid_metrics)

`low_cpu_mem_usage` was None, now set to True since model is quantized.
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2024-07-25 09:45:31,509] A new study created in memory with name: no-name-c7fa08b2-6234-4b52-9278-3dd0247d1c94
`low_cpu_mem_usage` was None, now set to True since model is quantized.
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,No log
2,No log,No log




  0%|          | 0/1563 [00:00<?, ?it/s]

[I 2024-07-25 09:52:44,731] Trial 0 finished with value: 33.141394753678824 and parameters: {'num_train_epochs': 2, 'learning_rate': 3e-05, 'weight_decay': 0.02, 'per_device_train_batch_size': 32, 'per_device_eval_batch_size': 8}. Best is trial 0 with value: 33.141394753678824.
`low_cpu_mem_usage` was None, now set to True since model is quantized.
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,No log
2,5.642000,No log
3,5.642000,No log
4,4.632200,No log




  0%|          | 0/1563 [00:00<?, ?it/s]

[I 2024-07-25 10:07:03,890] Trial 1 finished with value: 33.141394753678824 and parameters: {'num_train_epochs': 4, 'learning_rate': 1e-05, 'weight_decay': 0.005, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 32}. Best is trial 0 with value: 33.141394753678824.
`low_cpu_mem_usage` was None, now set to True since model is quantized.
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,4.5349,No log
2,3.5948,No log
3,3.5192,No log




  0%|          | 0/1563 [00:00<?, ?it/s]

[I 2024-07-25 10:18:32,760] Trial 2 finished with value: 33.141394753678824 and parameters: {'num_train_epochs': 3, 'learning_rate': 5e-05, 'weight_decay': 0.02, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 16}. Best is trial 0 with value: 33.141394753678824.


Best Hyperparameters (Grid Sampler): {'num_train_epochs': 2, 'learning_rate': 3e-05, 'weight_decay': 0.02, 'per_device_train_batch_size': 32, 'per_device_eval_batch_size': 8}
Best Trial (Grid Sampler): FrozenTrial(number=0, state=TrialState.COMPLETE, values=[33.141394753678824], datetime_start=datetime.datetime(2024, 7, 25, 9, 45, 31, 511547), datetime_complete=datetime.datetime(2024, 7, 25, 9, 52, 44, 731087), params={'num_train_epochs': 2, 'learning_rate': 3e-05, 'weight_decay': 0.02, 'per_device_train_batch_size': 32, 'per_device_eval_batch_size': 8}, user_attrs={}, system_attrs={'search_space': {'learning_rate': [1e-05, 3e-05, 5e-05], 'num_train_epochs': [2, 3, 4, 5], 'per_device_eval_batch_size': [8, 16, 32], 'per_device_train_batch_size': [8, 16, 32], 'weight_decay': [0.005, 0.01, 0.02]}, 'grid_id': 0}, intermediate_values={}, distributions={'num_train_epochs': CategoricalDistribution(choices=(2, 3, 4, 5)), 'learning_rate': CategoricalDistribution(choices=(1e-05, 3e-05, 5e-05)), 

`low_cpu_mem_usage` was None, now set to True since model is quantized.
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Retraining with Grid Sampler's Best Hyperparameters




Epoch,Training Loss,Validation Loss
1,No log,No log
2,No log,No log




Training took 407.12 seconds


  0%|          | 0/593 [00:00<?, ?it/s]

Best Test Metrics (Grid Sampler): {'exact': 52.1079258010118, 'f1': 52.1079258010118, 'total': 593, 'HasAns_exact': 0.0, 'HasAns_f1': 0.0, 'HasAns_total': 284, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 309, 'best_exact': 52.1079258010118, 'best_exact_thresh': 0.0, 'best_f1': 52.1079258010118, 'best_f1_thresh': 0.0, 'bleu': 0.0}


# TPE

In [17]:
import optuna
from transformers import TrainingArguments, Trainer
import time
import torch

def objective(trial):
    # Define the hyperparameter search space
    num_train_epochs = trial.suggest_categorical("num_train_epochs", [2, 3, 4, 5])
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
    weight_decay = trial.suggest_uniform("weight_decay", 0.005, 0.02)
    per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32])
    per_device_eval_batch_size = trial.suggest_categorical("per_device_eval_batch_size", [8, 16, 32])

    # Update training arguments with the sampled hyperparameters
    tpe_trainer.args.num_train_epochs = num_train_epochs
    tpe_trainer.args.learning_rate = learning_rate
    tpe_trainer.args.weight_decay = weight_decay
    tpe_trainer.args.per_device_train_batch_size = per_device_train_batch_size
    tpe_trainer.args.per_device_eval_batch_size = per_device_eval_batch_size

    # Train the model with the sampled hyperparameters
    tpe_trainer.train()

    # Evaluate the model on the validation set
    predictions, _, _ = tpe_trainer.predict(validation_dataset)
    start_logits, end_logits = predictions
    val_metrics = compute_metrics(start_logits, end_logits, validation_dataset, squad2_split_sampled["validation"])

    # Return the validation F1 score as the objective value
    return val_metrics["f1"]

# Set the initial hyperparameters
tpe_args = TrainingArguments(
    output_dir="./results_lora_tpe",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    report_to="none",
    fp16=True,
    optim="adamw_bnb_8bit"
)

tpe_trainer = Trainer(
    model=None,
    args=tpe_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    model_init=model_init,
)

# Create an Optuna study with the TPE sampler
tpe_study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler())

# Optimize the hyperparameters using Optuna with TPE sampler
tpe_study.optimize(objective, n_trials=3)

# Print the best hyperparameters and best trial for TPE sampler
print("Best Hyperparameters (TPE Sampler):", tpe_study.best_params)
print("Best Trial (TPE Sampler):", tpe_study.best_trial)

# Create a new TrainingArguments with the best hyperparameters from TPE sampling
best_tpe_args = TrainingArguments(
    output_dir="./results_lora_best_tpe",
    evaluation_strategy=tpe_args.evaluation_strategy,
    save_strategy=tpe_args.save_strategy,
    learning_rate=tpe_study.best_params['learning_rate'],
    num_train_epochs=tpe_study.best_params['num_train_epochs'],
    weight_decay=tpe_study.best_params['weight_decay'],
    per_device_train_batch_size=tpe_study.best_params['per_device_train_batch_size'],
    per_device_eval_batch_size=tpe_study.best_params['per_device_eval_batch_size'],
    report_to=tpe_args.report_to,
    fp16=tpe_args.fp16,
    optim=tpe_args.optim,
)

# Create a new trainer with the best arguments from TPE sampling
best_tpe_trainer = Trainer(
    model=model_init(),
    args=best_tpe_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Retrain the model on the full training set with the best hyperparameters from TPE sampling
print("Retraining with TPE Sampler's Best Hyperparameters")
start_time = time.time()
best_tpe_trainer.train()
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Training took {elapsed_time:.2f} seconds")

# Predict on the test set using the best model from TPE sampling
predictions, _, _ = best_tpe_trainer.predict(test_dataset)
start_logits, end_logits = predictions
best_tpe_metrics = compute_metrics(start_logits, end_logits, test_dataset, squad2_split_sampled["test"])
print("Best Test Metrics (TPE Sampler):", best_tpe_metrics)

`low_cpu_mem_usage` was None, now set to True since model is quantized.
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2024-07-25 10:25:31,626] A new study created in memory with name: no-name-784c9697-f40c-4605-8124-b8d3bcc3bdcf
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform("weight_decay", 0.005, 0.02)
`low_cpu_mem_usage` was None, now set to True since model is quantized.
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,No log
2,4.708100,No log
3,4.708100,No log




Epoch,Training Loss,Validation Loss
1,No log,No log
2,4.708100,No log
3,4.708100,No log
4,3.777400,No log


  0%|          | 0/1563 [00:00<?, ?it/s]

[I 2024-07-25 10:39:54,774] Trial 0 finished with value: 33.141394753678824 and parameters: {'num_train_epochs': 4, 'learning_rate': 3.110632363329061e-05, 'weight_decay': 0.006306182416947088, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 16}. Best is trial 0 with value: 33.141394753678824.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform("weight_decay", 0.005, 0.02)
`low_cpu_mem_usage` was None, now set to True since model is quantized.
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,5.1723,No log
2,4.037,No log
3,3.8517,No log
4,3.7081,No log




  0%|          | 0/1563 [00:00<?, ?it/s]

[I 2024-07-25 10:54:59,695] Trial 1 finished with value: 33.141394753678824 and parameters: {'num_train_epochs': 4, 'learning_rate': 1.9886452118302058e-05, 'weight_decay': 0.011337622805106622, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 32}. Best is trial 0 with value: 33.141394753678824.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform("weight_decay", 0.005, 0.02)
`low_cpu_mem_usage` was None, now set to True since model is quantized.
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,4.5899,No log
2,3.631,No log
3,3.5462,No log




  0%|          | 0/1563 [00:00<?, ?it/s]

[I 2024-07-25 11:06:30,634] Trial 2 finished with value: 33.141394753678824 and parameters: {'num_train_epochs': 3, 'learning_rate': 4.562695245136106e-05, 'weight_decay': 0.011301617376262844, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8}. Best is trial 0 with value: 33.141394753678824.


Best Hyperparameters (TPE Sampler): {'num_train_epochs': 4, 'learning_rate': 3.110632363329061e-05, 'weight_decay': 0.006306182416947088, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 16}
Best Trial (TPE Sampler): FrozenTrial(number=0, state=TrialState.COMPLETE, values=[33.141394753678824], datetime_start=datetime.datetime(2024, 7, 25, 10, 25, 31, 627653), datetime_complete=datetime.datetime(2024, 7, 25, 10, 39, 54, 774658), params={'num_train_epochs': 4, 'learning_rate': 3.110632363329061e-05, 'weight_decay': 0.006306182416947088, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 16}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'num_train_epochs': CategoricalDistribution(choices=(2, 3, 4, 5)), 'learning_rate': FloatDistribution(high=5e-05, log=True, low=1e-05, step=None), 'weight_decay': FloatDistribution(high=0.02, log=False, low=0.005, step=None), 'per_device_train_batch_size': CategoricalDistribution(choices=(8, 16, 32)),

`low_cpu_mem_usage` was None, now set to True since model is quantized.
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Retraining with TPE Sampler's Best Hyperparameters




Epoch,Training Loss,Validation Loss
1,No log,No log
2,4.514200,No log
3,4.514200,No log
4,3.851800,No log




Training took 836.84 seconds


  0%|          | 0/593 [00:00<?, ?it/s]

Best Test Metrics (TPE Sampler): {'exact': 52.1079258010118, 'f1': 52.1079258010118, 'total': 593, 'HasAns_exact': 0.0, 'HasAns_f1': 0.0, 'HasAns_total': 284, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 309, 'best_exact': 52.1079258010118, 'best_exact_thresh': 0.0, 'best_f1': 52.1079258010118, 'best_f1_thresh': 0.0, 'bleu': 0.0}
