In [5]:
# !pip install transformers
# !pip install datasets==2.21.0
# !pip install wandb

In [9]:
import torch
from transformers import GPT2ForQuestionAnswering, GPT2TokenizerFast, Trainer, TrainingArguments
from datasets import load_dataset, load_metric
from torch.nn.utils.rnn import pad_sequence
import os
import wandb
from transformers.integrations import WandbCallback
import numpy as np

from datasets import load_metric # used in compute_metrics
from transformers.trainer_utils import EvalPrediction
from typing import Dict, List


def freeze_layers(model, variant_type):
    if variant_type == "noNorm":
        for name, param in model.named_parameters():
            if "ln" in name:
                param.requires_grad = False
    elif variant_type == "AttnOnly":
        for name, param in model.named_parameters():
            if "ln_2" in name:  # Freeze FFN layer norm
                param.requires_grad = False
    elif variant_type == "FFOnly":
        for name, param in model.named_parameters():
            if "ln_1" in name:  # Freeze attention layer norm
                param.requires_grad = False
    # For baseModel, we don't freeze any layers

def prepare_squad_dataset(tokenizer):
    dataset = load_dataset("squad")

    def preprocess_function(examples):
        questions = [q.strip() for q in examples["question"]]
        contexts = [c.strip() for c in examples["context"]]

        # Tokenize questions and contexts together
        tokenized_examples = tokenizer(
            [tokenizer.cls_token + q for q in questions], # new addition
            contexts,
            max_length=384,
            truncation="only_second",
            stride=128,
            return_overflowing_tokens=True,
            padding="max_length",
            return_offsets_mapping=True,
        )

        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
        offset_mapping = tokenized_examples["offset_mapping"]

        tokenized_examples["start_positions"] = []
        tokenized_examples["end_positions"] = []

        for i, offsets in enumerate(offset_mapping):
            input_ids = tokenized_examples["input_ids"][i]
            cls_index = 0

            sequence_ids = tokenized_examples.sequence_ids(i)

            sample_index = sample_mapping[i]
            answers = examples["answers"][sample_index]

            if len(answers["answer_start"]) == 0:
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                start_char = answers["answer_start"][0]
                end_char = start_char + len(answers["text"][0])

                token_start_index = 0
                while sequence_ids[token_start_index] != 1:
                    token_start_index += 1

                token_end_index = len(input_ids) - 1
                while sequence_ids[token_end_index] != 1:
                    token_end_index -= 1

                if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                    tokenized_examples["start_positions"].append(cls_index)
                    tokenized_examples["end_positions"].append(cls_index)
                else:
                    while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                        token_start_index += 1
                    tokenized_examples["start_positions"].append(token_start_index - 1)
                    while offsets[token_end_index][1] >= end_char:
                        token_end_index -= 1
                    tokenized_examples["end_positions"].append(token_end_index + 1)

        return tokenized_examples

    tokenized_datasets = dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=dataset["train"].column_names,
    )

    return tokenized_datasets


def fine_tune_model(model, tokenizer, dataset, output_dir, variant, num_train_epochs=3):
    # Initialize wandb run
    wandb.init(project=f"GPT-Valkyrie_LN-124m__{variant}__SQuAD_Simple", reinit=True)
    run_name = wandb.run.name

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=num_train_epochs,  # Changed from max_steps to num_train_epochs
        per_device_train_batch_size=24,
        per_device_eval_batch_size=24,
        warmup_ratio=0.1,  # Changed from warmup_steps to warmup_ratio
        weight_decay=0.01,
        logging_dir="./logs",
        logging_strategy="steps",
        logging_steps=100,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        report_to="wandb",
        run_name=run_name,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["validation"],
        tokenizer=tokenizer,
        callbacks=[WandbCallback()],
    )

    trainer.train()
    wandb.finish()
    return trainer.model, run_name

In [12]:
# MAIN LOOP
wandb.login()

variants = ["noNorm", "AttnOnly", "FFNonly", "baseModel"]

from transformers import GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.cls_token = "[CLS]"
tokenizer.add_special_tokens({'cls_token': '[CLS]'})

dataset = prepare_squad_dataset(tokenizer)



In [11]:
for variant in variants:
    print(f"Processing {variant} model...")

    # Use the correct base model for each variant
    model_path = f"shng2025/GPT-Valkyrie_LN-124m__{variant}__"
    model = GPT2ForQuestionAnswering.from_pretrained(model_path)

    freeze_layers(model, variant)

    output_dir = f"./results/{variant}"
    fine_tuned_model, run_name = fine_tune_model(model, tokenizer, dataset, output_dir, variant)

    # Save the model locally
    local_save_dir = f"./local_models/GPT-Valkyrie_LN-124m__{variant}__SQuAD"
    fine_tuned_model.save_pretrained(local_save_dir)
    tokenizer.save_pretrained(local_save_dir)
    print(f"Model saved locally to {local_save_dir}")

    # Push the model to your HuggingFace Hub repository
    new_repo_name = f"shng2025/GPT-Valkyrie_LN-124m__{variant}__SQuAD"
    fine_tuned_model.push_to_hub(new_repo_name, branch=run_name)
    tokenizer.push_to_hub(new_repo_name, branch=run_name)
    print(f"Model pushed to HuggingFace Hub: {new_repo_name}, branch: {run_name}")

Processing noNorm model...


Some weights of GPT2ForQuestionAnswering were not initialized from the model checkpoint at shng2025/GPT-Valkyrie_LN-124m__noNorm__ and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


VBox(children=(Label(value='0.020 MB of 0.020 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train/epoch,▁▁▃▃▄▄▆▆██
train/global_step,▁▁▃▃▅▅▆▆██
train/grad_norm,██▃▃▂▂▃▃▁▁
train/learning_rate,▁▁▃▃▅▅▆▆██
train/loss,██▆▆▂▂▂▂▁▁

0,1
train/epoch,0.00452
train/global_step,50.0
train/grad_norm,32.81399
train/learning_rate,5e-05
train/loss,4.9877


You are adding a <class 'transformers.integrations.integration_utils.WandbCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
WandbCallback
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss,Validation Loss
200,4.5675,4.35315


VBox(children=(Label(value='0.019 MB of 0.019 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/loss,▁▁
eval/runtime,▁▁
eval/samples_per_second,▁▁
eval/steps_per_second,▁▁
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇███████
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇███████
train/grad_norm,████▇▇▇▇▅▅▄▄▄▄▃▃▁▁▁▁▂▂▁▁▁▁▂▂▁▁▂▂▃▃▁▁▄▄▃▃
train/learning_rate,▂▂▄▄▅▅▇▇████▇▇▇▇▆▆▆▆▅▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▁▁▁▁
train/loss,██▆▆▄▄▅▅▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▂▂▁▁▂▂▂▂▂▂▁▁▁▁▂▂

0,1
eval/loss,4.35315
eval/runtime,123.7931
eval/samples_per_second,87.113
eval/steps_per_second,10.889
total_flos,313556108083200.0
train/epoch,0.01808
train/global_step,200.0
train/grad_norm,22.55183
train/learning_rate,0.0
train/loss,4.5675


Model saved locally to ./local_models/GPT-Valkyrie_LN-124m__noNorm__SQuAD


README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Model pushed to HuggingFace Hub: shng2025/GPT-Valkyrie_LN-124m__noNorm__SQuAD, branch: proud-galaxy-2
Processing AttnOnly model...


config.json:   0%|          | 0.00/844 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of GPT2ForQuestionAnswering were not initialized from the model checkpoint at shng2025/GPT-Valkyrie_LN-124m__AttnOnly__ and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


You are adding a <class 'transformers.integrations.integration_utils.WandbCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
WandbCallback
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss,Validation Loss
200,4.4022,4.137598


VBox(children=(Label(value='0.019 MB of 0.019 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/loss,▁▁
eval/runtime,▁▁
eval/samples_per_second,▁▁
eval/steps_per_second,▁▁
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇███████
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇███████
train/grad_norm,██▆▆▄▄▄▄▃▃▄▄▂▂▂▂▂▂▂▂▃▃▁▁▂▂▂▂▂▂▂▂▂▂▁▁▂▂▂▂
train/learning_rate,▂▂▄▄▅▅▇▇████▇▇▇▇▆▆▆▆▅▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▁▁▁▁
train/loss,██▇▇▄▄▄▄▃▃▂▂▃▃▂▂▂▂▂▂▂▂▁▁▂▂▁▁▂▂▂▂▂▂▁▁▁▁▂▂

0,1
eval/loss,4.1376
eval/runtime,123.919
eval/samples_per_second,87.025
eval/steps_per_second,10.878
total_flos,313556108083200.0
train/epoch,0.01808
train/global_step,200.0
train/grad_norm,12.97218
train/learning_rate,0.0
train/loss,4.4022


Model saved locally to ./local_models/GPT-Valkyrie_LN-124m__AttnOnly__SQuAD


README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Model pushed to HuggingFace Hub: shng2025/GPT-Valkyrie_LN-124m__AttnOnly__SQuAD, branch: young-frost-1
Processing FFNonly model...


config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of GPT2ForQuestionAnswering were not initialized from the model checkpoint at shng2025/GPT-Valkyrie_LN-124m__FFNonly__ and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


You are adding a <class 'transformers.integrations.integration_utils.WandbCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
WandbCallback
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss,Validation Loss
200,4.5659,4.339179


VBox(children=(Label(value='0.019 MB of 0.019 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/loss,▁▁
eval/runtime,▁▁
eval/samples_per_second,▁▁
eval/steps_per_second,▁▁
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇███████
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇███████
train/grad_norm,██▅▅▄▄▃▃▂▂▄▄▁▁▂▂▁▁▁▁▁▁▁▁▂▂▂▂▁▁▁▁▁▁▁▁▂▂▂▂
train/learning_rate,▂▂▄▄▅▅▇▇████▇▇▇▇▆▆▆▆▅▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▁▁▁▁
train/loss,██▆▆▄▄▄▄▄▄▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▂▂

0,1
eval/loss,4.33918
eval/runtime,123.989
eval/samples_per_second,86.975
eval/steps_per_second,10.872
total_flos,313556108083200.0
train/epoch,0.01808
train/global_step,200.0
train/grad_norm,29.25665
train/learning_rate,0.0
train/loss,4.5659


Model saved locally to ./local_models/GPT-Valkyrie_LN-124m__FFNonly__SQuAD


README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Model pushed to HuggingFace Hub: shng2025/GPT-Valkyrie_LN-124m__FFNonly__SQuAD, branch: clear-planet-1
Processing baseModel model...


config.json:   0%|          | 0.00/751 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of GPT2ForQuestionAnswering were not initialized from the model checkpoint at shng2025/GPT-Valkyrie_LN-124m__baseModel__ and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


You are adding a <class 'transformers.integrations.integration_utils.WandbCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
WandbCallback
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss,Validation Loss
200,4.3632,3.861568


VBox(children=(Label(value='0.019 MB of 0.019 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/loss,▁▁
eval/runtime,▁▁
eval/samples_per_second,▁▁
eval/steps_per_second,▁▁
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇███████
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇███████
train/grad_norm,██▅▅▄▄▅▅▃▃▃▃▆▆▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▂▂
train/learning_rate,▂▂▄▄▅▅▇▇████▇▇▇▇▆▆▆▆▅▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▁▁▁▁
train/loss,██▆▆▄▄▄▄▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▂▂▁▁▂▂▂▂▂▂▁▁▁▁▂▂

0,1
eval/loss,3.86157
eval/runtime,124.0682
eval/samples_per_second,86.92
eval/steps_per_second,10.865
total_flos,313556108083200.0
train/epoch,0.01808
train/global_step,200.0
train/grad_norm,18.29948
train/learning_rate,0.0
train/loss,4.3632


Model saved locally to ./local_models/GPT-Valkyrie_LN-124m__baseModel__SQuAD


README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Model pushed to HuggingFace Hub: shng2025/GPT-Valkyrie_LN-124m__baseModel__SQuAD, branch: genial-sound-1


In [None]:
for variant in variants:
    print(f"Processing {variant} model...")

    # Use the correct base model for each variant
    model_path = f"shng2025/GPT-Valkyrie_RMSN-124m__{variant}__"
    model = GPT2ForQuestionAnswering.from_pretrained(model_path)

    freeze_layers(model, variant)

    output_dir = f"./results/{variant}"
    fine_tuned_model, run_name = fine_tune_model(model, tokenizer, dataset, output_dir, variant)

    # Save the model locally
    local_save_dir = f"./local_models/GPT-Valkyrie_RMSN-124m__{variant}__SQuAD"
    fine_tuned_model.save_pretrained(local_save_dir)
    tokenizer.save_pretrained(local_save_dir)
    print(f"Model saved locally to {local_save_dir}")

    # Push the model to your HuggingFace Hub repository
    new_repo_name = f"shng2025/GPT-Valkyrie_RMSN-124m__{variant}__SQuAD"
    fine_tuned_model.push_to_hub(new_repo_name, branch=run_name)
    tokenizer.push_to_hub(new_repo_name, branch=run_name)
    print(f"Model pushed to HuggingFace Hub: {new_repo_name}, branch: {run_name}")