# LORA

In [2]:
!pip install datasets pandas transformers evaluate tqdm numpy optuna accelerate peft bitsandbytes
import pandas as pd


Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl.metadata (17 kB)
Collecting peft
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.2-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-a

In [3]:
from datasets import load_dataset
from datasets import DatasetDict

# Load SQuAD datasets
squad1 = load_dataset("squad")

def split_dataset(dataset: DatasetDict, train_prop, val_prop, test_prop):
    """Splits a HuggingFace dataset into train, validation, and test sets."""
    total_size = len(dataset["train"])
    train_size = int(total_size * train_prop)
    val_size = int(total_size * val_prop)

    # Randomly select subsets for each split
    train_dataset = dataset["train"].shuffle(seed=42).select(range(train_size))
    remaining_dataset = dataset["train"].shuffle(seed=42).select(range(train_size, len(dataset["train"])))
    val_dataset = remaining_dataset.select(range(val_size))
    test_dataset = dataset["validation"]

    return DatasetDict({
        "train": train_dataset,
        "validation": val_dataset,
        "test": test_dataset,
    })

# Calculate target train/val/test proportions (adjust as needed)
target_train_prop = 0.78
target_val_prop = 0.22
target_test_prop = 0.1

# Split the datasets
squad1_split = split_dataset(squad1, target_train_prop, target_val_prop, target_test_prop)

print(len(squad1_split["train"]))
print(len(squad1_split["validation"]))
print(len(squad1_split["test"]))

# Sample 5% of the data from each split
sample_prop = 0.05
squad1_split_sampled = DatasetDict({
    "train": squad1_split["train"].shuffle(seed=42).select(range(int(len(squad1_split["train"]) * sample_prop))),
    "validation": squad1_split["validation"].shuffle(seed=42).select(range(int(len(squad1_split["validation"]) * sample_prop))),
    "test": squad1_split["test"].shuffle(seed=42).select(range(int(len(squad1_split["test"]) * sample_prop))),
})

# Print the resulting sizes for verification
print("Train size:", len(squad1_split_sampled["train"]))
print("Validation size:", len(squad1_split_sampled["validation"]))
print("Test size:", len(squad1_split_sampled["test"]))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

68327
19271
10570
Train size: 3416
Validation size: 963
Test size: 528


In [4]:
from transformers import AutoTokenizer

model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [5]:
max_length = 512  # Increased from 384
stride = 128
def preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
        return_tensors="pt"
    )
    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []
    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)
        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1
        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)
    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs


In [6]:
train_dataset = squad1_split_sampled["train"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=squad1_split_sampled["train"].column_names,
)
len(squad1_split_sampled["train"]), len(train_dataset)

Map:   0%|          | 0/3416 [00:00<?, ? examples/s]

(3416, 3422)

In [7]:
import torch

def preprocess_eval_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
        return_tensors="pt"
    )
    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []
    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])
        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        # Convert the list comprehension result to a tensor
        new_offset = torch.tensor([
            o if sequence_ids[k] == 1 else (-1, -1) for k, o in enumerate(offset)
        ], dtype=torch.long)
        inputs["offset_mapping"][i] = new_offset
    inputs["example_id"] = example_ids
    return inputs

In [8]:
validation_dataset = squad1_split_sampled["validation"].map(
    preprocess_eval_examples,
    batched=True,
    remove_columns=squad1_split_sampled["validation"].column_names,
)
len(squad1_split_sampled["validation"]), len(validation_dataset)

Map:   0%|          | 0/963 [00:00<?, ? examples/s]

(963, 965)

In [9]:
test_dataset = squad1_split_sampled["test"].map(
    preprocess_eval_examples,
    batched=True,
    remove_columns=squad1_split_sampled["test"].column_names,
)
len(squad1_split_sampled["test"]), len(test_dataset)

Map:   0%|          | 0/528 [00:00<?, ? examples/s]

(528, 529)

In [10]:
import collections
from tqdm import tqdm
from nltk.translate.bleu_score import sentence_bleu
import evaluate
import numpy as np

squad_metric = evaluate.load("squad")

def compute_metrics(start_logits, end_logits, inputs, examples):
    n_best = 20
    max_answer_length = 30
    example_to_features = collections.defaultdict(list)
    for idx, example_id in enumerate(inputs["example_id"]):
        example_to_features[example_id].append(idx)

    predicted_answers = []
    for example in tqdm(examples, desc="Computing metrics"):
        example_id = example["id"]
        context = example["context"]
        answers = []
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = inputs["offset_mapping"][feature_index]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()

            for start_index in start_indexes:
                for end_index in end_indexes:
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answer = {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                    answers.append(answer)

        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})

    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]

    # Calculate BLEU score
    bleu_scores = []
    for pred, ref in zip(predicted_answers, theoretical_answers):
        pred_text = pred["prediction_text"]
        # Handle different possible structures of ref["answers"]
        if isinstance(ref["answers"], str):
            ref_texts = [ref["answers"]]
        elif isinstance(ref["answers"], list):
            if all(isinstance(ans, str) for ans in ref["answers"]):
                ref_texts = ref["answers"]
            elif all(isinstance(ans, dict) for ans in ref["answers"]):
                ref_texts = [ans.get("text", "") for ans in ref["answers"]]
            else:
                ref_texts = [str(ans) for ans in ref["answers"]]
        else:
            ref_texts = [str(ref["answers"])]

        bleu_score = sentence_bleu(ref_texts, pred_text)
        bleu_scores.append(bleu_score)

    # Compute SQuAD metrics
    squad_results = squad_metric.compute(predictions=predicted_answers, references=theoretical_answers)

    # Add BLEU score to the results
    squad_results["bleu"] = sum(bleu_scores) / len(bleu_scores)  # Average BLEU score

    return squad_results

Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

In [11]:
from peft import get_peft_model, LoraConfig, TaskType
import bitsandbytes as bnb

def create_lora_model(base_model):
    lora_config = LoraConfig(
        r=16,  # rank
        lora_alpha=32,
        target_modules=["query", "key", "value"],
        lora_dropout=0.05,
        bias="none",
        task_type=TaskType.CAUSAL_LM
    )
    model = get_peft_model(base_model, lora_config)

    # Ignore 'labels' in the forward pass
    def forward(self, **kwargs):
        if 'labels' in kwargs:
            kwargs.pop('labels')
        return self.base_model(**kwargs)

    model.forward = forward.__get__(model)  # Bind the modified forward to the model

    return model

In [12]:
from transformers import AutoModelForQuestionAnswering
def model_init():
    base_model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
    return create_lora_model(base_model)

# Raw performance on squad1.1/ without Hyperparameter Optimization

In [12]:
from transformers import TrainingArguments
from transformers import Trainer
import time
#wandb.init(mode="offline")

args = TrainingArguments(
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,  # Increased learning rate
    num_train_epochs=3,  # Increased number of epochs
    weight_decay=0.01,
    output_dir="my_awesome_qa_model",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    report_to="none",
    fp16=True,  # Enable mixed precision training for fair comparison
    optim="adamw_bnb_8bit"
)

trainer = Trainer(
    model=None,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    model_init=model_init,
)

start_time = time.time()
trainer.train()
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Training took {elapsed_time:.2f} seconds")



model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,No log
2,No log,No log
3,4.991000,No log


Training took 268.46 seconds


In [13]:
predictions, _, _ = trainer.predict(test_dataset)
start_logits, end_logits = predictions
squad_results = compute_metrics(start_logits, end_logits, test_dataset, squad1_split_sampled["test"])
print(squad_results)

Computing metrics: 100%|██████████| 528/528 [08:41<00:00,  1.01it/s]
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


{'exact_match': 4.166666666666667, 'f1': 10.244636331165001, 'bleu': 0.035186037189333254}


# Random Search

In [14]:
import optuna
import time
import torch
from transformers import TrainingArguments
from transformers import Trainer


# Objective function for Optuna
def objective(trial):
    # Define the hyperparameter search space
    num_train_epochs = trial.suggest_categorical("num_train_epochs", [2, 3, 4, 5])
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
    weight_decay = trial.suggest_uniform("weight_decay", 0.005, 0.02)
    per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32])
    per_device_eval_batch_size = trial.suggest_categorical("per_device_eval_batch_size", [8, 16, 32])

    # Update training arguments with the sampled hyperparameters
    random_trainer.args.num_train_epochs = num_train_epochs
    random_trainer.args.learning_rate = learning_rate
    random_trainer.args.weight_decay = weight_decay
    random_trainer.args.per_device_train_batch_size = per_device_train_batch_size
    random_trainer.args.per_device_eval_batch_size = per_device_eval_batch_size

    # Train the model with the sampled hyperparameters
    random_trainer.train()

    # Evaluate the model on the validation set
    predictions, _, _ = random_trainer.predict(validation_dataset)
    start_logits, end_logits = predictions
    val_metrics = compute_metrics(start_logits, end_logits, validation_dataset, squad1_split_sampled["validation"])

    # Return the validation F1 score as the objective value
    return val_metrics["f1"]

# Set the initial hyperparameters
random_args = TrainingArguments(
    output_dir="./results_lora_random",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    report_to="none",
    fp16=True,
    optim="adamw_bnb_8bit"
)

random_trainer = Trainer(
    model=None,
    args=random_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    model_init=model_init,
)

# Create an Optuna study with the random sampler
random_study = optuna.create_study(direction="maximize", sampler=optuna.samplers.RandomSampler())

# Optimize the hyperparameters using Optuna with random sampler
random_study.optimize(objective, n_trials=3)

# Print the best hyperparameters and best trial for random sampler
print("Best Hyperparameters (Random Sampler):", random_study.best_params)
print("Best Trial (Random Sampler):", random_study.best_trial)

# Create a new TrainingArguments with the best hyperparameters from random sampling
best_random_args = TrainingArguments(
    output_dir="./results_lora_best_random",
    evaluation_strategy=random_args.evaluation_strategy,
    save_strategy=random_args.save_strategy,
    learning_rate=random_study.best_params['learning_rate'],
    num_train_epochs=random_study.best_params['num_train_epochs'],
    weight_decay=random_study.best_params['weight_decay'],
    per_device_train_batch_size=random_study.best_params['per_device_train_batch_size'],
    per_device_eval_batch_size=random_study.best_params['per_device_eval_batch_size'],
    report_to=random_args.report_to,
    fp16=random_args.fp16,
    optim=random_args.optim,
)

# Create a new trainer with the best arguments from random sampling
best_random_trainer = Trainer(
    model=model_init(),
    args=best_random_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Retrain the model on the full training set with the best hyperparameters from random sampling
print("Retraining with Random Sampler's Best Hyperparameters")
start_time = time.time()
best_random_trainer.train()
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Training took {elapsed_time:.2f} seconds")

# Predict on the test set using the best model from random sampling
predictions, _, _ = best_random_trainer.predict(test_dataset)
start_logits, end_logits = predictions
best_random_metrics = compute_metrics(start_logits, end_logits, test_dataset, squad1_split_sampled["test"])
print("Best Test Metrics (Random Sampler):", best_random_metrics)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2024-07-25 14:27:11,987] A new study created in memory with name: no-name-6ba1b654-6d98-4b9e-a63b-3dc591e3b005
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform("weight_decay", 0.005, 0.02)
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,No log
2,6.036100,No log


Computing metrics: 100%|██████████| 963/963 [28:59<00:00,  1.81s/it]
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
[I 2024-07-25 14:59:30,229] Trial 0 finished with value: 6.763477211447924 and parameters: {'num_train_epochs': 2, 'learning_rate': 1.0477533599595964e-05, 'weight_decay': 0.01522456347545029, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size':

Epoch,Training Loss,Validation Loss
1,No log,No log
2,No log,No log
3,5.022000,No log


Computing metrics: 100%|██████████| 963/963 [29:21<00:00,  1.83s/it]
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
[I 2024-07-25 15:33:32,173] Trial 1 finished with value: 10.250354606155378 and parameters: {'num_train_epochs': 3, 'learning_rate': 2.9135114771536494e-05, 'weight_decay': 0.012920128801145112, 'per_device_train_batch_size': 16, 'per_device_eval_batch_siz

Epoch,Training Loss,Validation Loss
1,No log,No log
2,5.833400,No log
3,4.628400,No log
4,4.242200,No log


Computing metrics: 100%|██████████| 963/963 [29:25<00:00,  1.83s/it]
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
[I 2024-07-25 16:09:22,023] Trial 2 finished with value: 9.854651876755579 and parameters: {'num_train_epochs': 4, 'learning_rate': 1.2611299036641317e-05, 'weight_decay': 0.010294486209690031, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size'

Best Hyperparameters (Random Sampler): {'num_train_epochs': 3, 'learning_rate': 2.9135114771536494e-05, 'weight_decay': 0.012920128801145112, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 16}
Best Trial (Random Sampler): FrozenTrial(number=1, state=TrialState.COMPLETE, values=[10.250354606155378], datetime_start=datetime.datetime(2024, 7, 25, 14, 59, 30, 230064), datetime_complete=datetime.datetime(2024, 7, 25, 15, 33, 32, 173696), params={'num_train_epochs': 3, 'learning_rate': 2.9135114771536494e-05, 'weight_decay': 0.012920128801145112, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 16}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'num_train_epochs': CategoricalDistribution(choices=(2, 3, 4, 5)), 'learning_rate': FloatDistribution(high=5e-05, log=True, low=1e-05, step=None), 'weight_decay': FloatDistribution(high=0.02, log=False, low=0.005, step=None), 'per_device_train_batch_size': CategoricalDistribution(choices=(8, 1

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Retraining with Random Sampler's Best Hyperparameters


Epoch,Training Loss,Validation Loss
1,No log,No log
2,No log,No log
3,5.050900,No log


Training took 271.93 seconds


Computing metrics: 100%|██████████| 528/528 [08:53<00:00,  1.01s/it]
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Best Test Metrics (Random Sampler): {'exact_match': 3.787878787878788, 'f1': 13.699972598109298, 'bleu': 0.04689054566908976}


# Grid Search

In [15]:
import optuna
import time
import torch

def create_grid_search_space():
    # Define the hyperparameter search space for grid search
    search_space = {
        "num_train_epochs": [2, 3, 4, 5],
        "learning_rate": [1e-5, 3e-5, 5e-5],
        "weight_decay": [0.005, 0.01, 0.02],
        "per_device_train_batch_size": [8, 16, 32],
        "per_device_eval_batch_size": [8, 16, 32]
    }
    return search_space

def objective(trial):
    # Retrieve the hyperparameters for the current trial
    num_train_epochs = trial.suggest_categorical("num_train_epochs", grid_search_space["num_train_epochs"])
    learning_rate = trial.suggest_categorical("learning_rate", grid_search_space["learning_rate"])
    weight_decay = trial.suggest_categorical("weight_decay", grid_search_space["weight_decay"])
    per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", grid_search_space["per_device_train_batch_size"])
    per_device_eval_batch_size = trial.suggest_categorical("per_device_eval_batch_size", grid_search_space["per_device_eval_batch_size"])

    # Update training arguments with the hyperparameters
    grid_trainer.args.num_train_epochs = num_train_epochs
    grid_trainer.args.learning_rate = learning_rate
    grid_trainer.args.weight_decay = weight_decay
    grid_trainer.args.per_device_train_batch_size = per_device_train_batch_size
    grid_trainer.args.per_device_eval_batch_size = per_device_eval_batch_size

    # Train the model with the hyperparameters
    grid_trainer.train()

    # Evaluate the model on the validation set
    predictions, _, _ = grid_trainer.predict(validation_dataset)
    start_logits, end_logits = predictions
    val_metrics = compute_metrics(start_logits, end_logits, validation_dataset, squad1_split_sampled["validation"])

    # Return the validation F1 score as the objective value
    return val_metrics["f1"]

# Set the initial hyperparameters
grid_args = TrainingArguments(
    output_dir="./results_lora_grid",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    report_to="none",
    fp16=True,
    optim="adamw_bnb_8bit"
)

grid_trainer = Trainer(
    model=None,
    args=grid_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    model_init=model_init,
)

# Create the grid search space
grid_search_space = create_grid_search_space()

# Create an Optuna study with the GridSampler
grid_study = optuna.create_study(direction="maximize", sampler=optuna.samplers.GridSampler(grid_search_space))

# Optimize the hyperparameters using Optuna with the grid search space
grid_study.optimize(objective, n_trials=3)

# Print the best hyperparameters and best trial for grid sampler
print("Best Hyperparameters (Grid Sampler):", grid_study.best_params)
print("Best Trial (Grid Sampler):", grid_study.best_trial)

# Create a new TrainingArguments with the best hyperparameters from grid sampling
best_grid_args = TrainingArguments(
    output_dir="./results_lora_best_grid",
    evaluation_strategy=grid_args.evaluation_strategy,
    save_strategy=grid_args.save_strategy,
    learning_rate=grid_study.best_params['learning_rate'],
    num_train_epochs=grid_study.best_params['num_train_epochs'],
    weight_decay=grid_study.best_params['weight_decay'],
    per_device_train_batch_size=grid_study.best_params['per_device_train_batch_size'],
    per_device_eval_batch_size=grid_study.best_params['per_device_eval_batch_size'],
    report_to=grid_args.report_to,
    fp16=grid_args.fp16,
    optim=grid_args.optim,
)

# Create a new trainer with the best arguments from grid sampling
best_grid_trainer = Trainer(
    model=model_init(),
    args=best_grid_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Retrain the model on the full training set with the best hyperparameters from grid sampling
print("Retraining with Grid Sampler's Best Hyperparameters")
start_time = time.time()
best_grid_trainer.train()
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Training took {elapsed_time:.2f} seconds")

# Predict on the test set using the best model from grid sampling
predictions, _, _ = best_grid_trainer.predict(test_dataset)
start_logits, end_logits = predictions
best_grid_metrics = compute_metrics(start_logits, end_logits, test_dataset, squad1_split_sampled["test"])
print("Best Test Metrics (Grid Sampler):", best_grid_metrics)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2024-07-25 16:22:53,793] A new study created in memory with name: no-name-fb7fb692-d3ce-4acf-ac04-d7922fe82e23
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,No log
2,4.638500,No log


Computing metrics: 100%|██████████| 963/963 [29:14<00:00,  1.82s/it]
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
[I 2024-07-25 16:55:25,786] Trial 0 finished with value: 15.934282717384749 and parameters: {'num_train_epochs': 2, 'learning_rate': 5e-05, 'weight_decay': 0.01, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 32}. Best is trial 0 with valu

Epoch,Training Loss,Validation Loss
1,No log,No log
2,5.989700,No log
3,4.946800,No log
4,4.501000,No log


Computing metrics: 100%|██████████| 963/963 [29:28<00:00,  1.84s/it]
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
[I 2024-07-25 17:31:20,666] Trial 1 finished with value: 9.098275463606734 and parameters: {'num_train_epochs': 4, 'learning_rate': 1e-05, 'weight_decay': 0.02, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 16}. Best is trial 0 with value

Epoch,Training Loss,Validation Loss
1,No log,No log
2,No log,No log
3,No log,No log
4,No log,No log


Computing metrics: 100%|██████████| 963/963 [29:13<00:00,  1.82s/it]
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
[I 2024-07-25 18:06:34,493] Trial 2 finished with value: 12.732595329713531 and parameters: {'num_train_epochs': 4, 'learning_rate': 5e-05, 'weight_decay': 0.02, 'per_device_train_batch_size': 32, 'per_device_eval_batch_size': 8}. Best is trial 0 with valu

Best Hyperparameters (Grid Sampler): {'num_train_epochs': 2, 'learning_rate': 5e-05, 'weight_decay': 0.01, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 32}
Best Trial (Grid Sampler): FrozenTrial(number=0, state=TrialState.COMPLETE, values=[15.934282717384749], datetime_start=datetime.datetime(2024, 7, 25, 16, 22, 53, 794698), datetime_complete=datetime.datetime(2024, 7, 25, 16, 55, 25, 785723), params={'num_train_epochs': 2, 'learning_rate': 5e-05, 'weight_decay': 0.01, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 32}, user_attrs={}, system_attrs={'search_space': {'learning_rate': [1e-05, 3e-05, 5e-05], 'num_train_epochs': [2, 3, 4, 5], 'per_device_eval_batch_size': [8, 16, 32], 'per_device_train_batch_size': [8, 16, 32], 'weight_decay': [0.005, 0.01, 0.02]}, 'grid_id': 0}, intermediate_values={}, distributions={'num_train_epochs': CategoricalDistribution(choices=(2, 3, 4, 5)), 'learning_rate': CategoricalDistribution(choices=(1e-05, 3e-05, 5e-05))

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Retraining with Grid Sampler's Best Hyperparameters


Epoch,Training Loss,Validation Loss
1,No log,No log
2,4.640200,No log


Training took 186.67 seconds


Computing metrics: 100%|██████████| 528/528 [08:46<00:00,  1.00it/s]
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Best Test Metrics (Grid Sampler): {'exact_match': 17.234848484848484, 'f1': 31.524370945520925, 'bleu': 0.03820073493233467}


# TPE

In [13]:
import optuna
from transformers import TrainingArguments, Trainer
import time
import torch

def objective(trial):
    # Define the hyperparameter search space
    num_train_epochs = trial.suggest_categorical("num_train_epochs", [2, 3, 4, 5])
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
    weight_decay = trial.suggest_uniform("weight_decay", 0.005, 0.02)
    per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32])
    per_device_eval_batch_size = trial.suggest_categorical("per_device_eval_batch_size", [8, 16, 32])

    # Update training arguments with the sampled hyperparameters
    tpe_trainer.args.num_train_epochs = num_train_epochs
    tpe_trainer.args.learning_rate = learning_rate
    tpe_trainer.args.weight_decay = weight_decay
    tpe_trainer.args.per_device_train_batch_size = per_device_train_batch_size
    tpe_trainer.args.per_device_eval_batch_size = per_device_eval_batch_size

    # Train the model with the sampled hyperparameters
    tpe_trainer.train()

    # Evaluate the model on the validation set
    predictions, _, _ = tpe_trainer.predict(validation_dataset)
    start_logits, end_logits = predictions
    val_metrics = compute_metrics(start_logits, end_logits, validation_dataset, squad1_split_sampled["validation"])

    # Return the validation F1 score as the objective value
    return val_metrics["f1"]

# Set the initial hyperparameters
tpe_args = TrainingArguments(
    output_dir="./results_lora_tpe",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    report_to="none",
    fp16=True,
    optim="adamw_bnb_8bit"
)

tpe_trainer = Trainer(
    model=None,
    args=tpe_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    model_init=model_init,
)

# Create an Optuna study with the TPE sampler
tpe_study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler())

# Optimize the hyperparameters using Optuna with TPE sampler
tpe_study.optimize(objective, n_trials=3)

# Print the best hyperparameters and best trial for TPE sampler
print("Best Hyperparameters (TPE Sampler):", tpe_study.best_params)
print("Best Trial (TPE Sampler):", tpe_study.best_trial)

# Create a new TrainingArguments with the best hyperparameters from TPE sampling
best_tpe_args = TrainingArguments(
    output_dir="./results_lora_best_tpe",
    evaluation_strategy=tpe_args.evaluation_strategy,
    save_strategy=tpe_args.save_strategy,
    learning_rate=tpe_study.best_params['learning_rate'],
    num_train_epochs=tpe_study.best_params['num_train_epochs'],
    weight_decay=tpe_study.best_params['weight_decay'],
    per_device_train_batch_size=tpe_study.best_params['per_device_train_batch_size'],
    per_device_eval_batch_size=tpe_study.best_params['per_device_eval_batch_size'],
    report_to=tpe_args.report_to,
    fp16=tpe_args.fp16,
    optim=tpe_args.optim,
)

# Create a new trainer with the best arguments from TPE sampling
best_tpe_trainer = Trainer(
    model=model_init(),
    args=best_tpe_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Retrain the model on the full training set with the best hyperparameters from TPE sampling
print("Retraining with TPE Sampler's Best Hyperparameters")
start_time = time.time()
best_tpe_trainer.train()
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Training took {elapsed_time:.2f} seconds")

# Predict on the test set using the best model from TPE sampling
predictions, _, _ = best_tpe_trainer.predict(test_dataset)
start_logits, end_logits = predictions
best_tpe_metrics = compute_metrics(start_logits, end_logits, test_dataset, squad1_split_sampled["test"])
print("Best Test Metrics (TPE Sampler):", best_tpe_metrics)



model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2024-07-25 20:43:25,812] A new study created in memory with name: no-name-6f677196-c438-4033-b7eb-d2c432762b64
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform("weight_decay", 0.005, 0.02)
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,No log
2,No log,No log
3,No log,No log
4,No log,No log
5,4.520700,No log


Computing metrics: 100%|██████████| 963/963 [29:25<00:00,  1.83s/it]
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
[I 2024-07-25 21:20:15,769] Trial 0 finished with value: 13.48320660164526 and parameters: {'num_train_epochs': 5, 'learning_rate': 4.2815281172458345e-05, 'weight_decay': 0.016319639425340707, 'per_device_train_batch_size': 32, 'per_device_eval_batch_size

Epoch,Training Loss,Validation Loss
1,No log,No log
2,4.560300,No log
3,3.075100,No log
4,2.663600,No log
5,2.549100,No log


Computing metrics: 100%|██████████| 963/963 [29:43<00:00,  1.85s/it]
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
[I 2024-07-25 21:58:01,444] Trial 1 finished with value: 41.782467811700805 and parameters: {'num_train_epochs': 5, 'learning_rate': 4.984094890338228e-05, 'weight_decay': 0.007466352710332346, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size'

Epoch,Training Loss,Validation Loss
1,No log,No log
2,No log,No log
3,No log,No log
4,No log,No log
5,4.526300,No log


Computing metrics: 100%|██████████| 963/963 [30:05<00:00,  1.87s/it]
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
[I 2024-07-25 22:35:33,003] Trial 2 finished with value: 13.385350436226394 and parameters: {'num_train_epochs': 5, 'learning_rate': 4.258845084891667e-05, 'weight_decay': 0.011918800379818911, 'per_device_train_batch_size': 32, 'per_device_eval_batch_size

Best Hyperparameters (TPE Sampler): {'num_train_epochs': 5, 'learning_rate': 4.984094890338228e-05, 'weight_decay': 0.007466352710332346, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 32}
Best Trial (TPE Sampler): FrozenTrial(number=1, state=TrialState.COMPLETE, values=[41.782467811700805], datetime_start=datetime.datetime(2024, 7, 25, 21, 20, 15, 770697), datetime_complete=datetime.datetime(2024, 7, 25, 21, 58, 1, 443851), params={'num_train_epochs': 5, 'learning_rate': 4.984094890338228e-05, 'weight_decay': 0.007466352710332346, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 32}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'num_train_epochs': CategoricalDistribution(choices=(2, 3, 4, 5)), 'learning_rate': FloatDistribution(high=5e-05, log=True, low=1e-05, step=None), 'weight_decay': FloatDistribution(high=0.02, log=False, low=0.005, step=None), 'per_device_train_batch_size': CategoricalDistribution(choices=(8, 16, 32)), 'p

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Retraining with TPE Sampler's Best Hyperparameters


Epoch,Training Loss,Validation Loss
1,No log,No log
2,4.513200,No log
3,2.918100,No log
4,2.633600,No log
5,2.496400,No log


Training took 473.38 seconds


Computing metrics: 100%|██████████| 528/528 [08:59<00:00,  1.02s/it]
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Best Test Metrics (TPE Sampler): {'exact_match': 33.333333333333336, 'f1': 47.274909656547024, 'bleu': 0.03275244174172243}
