In [4]:
import torch
from transformers import GPT2ForQuestionAnswering, GPT2TokenizerFast, Trainer, TrainingArguments
from datasets import load_dataset, load_metric
from torch.nn.utils.rnn import pad_sequence
import os
import wandb
from transformers.integrations import WandbCallback
import numpy as np

from datasets import load_metric # used in compute_metrics

def freeze_layers(model, variant_type):
    if variant_type == "noNorm":
        for name, param in model.named_parameters():
            if "ln" in name:
                param.requires_grad = False
    elif variant_type == "AttnOnly":
        for name, param in model.named_parameters():
            if "ln_2" in name:  # Freeze FFN layer norm
                param.requires_grad = False
    elif variant_type == "FFOnly":
        for name, param in model.named_parameters():
            if "ln_1" in name:  # Freeze attention layer norm
                param.requires_grad = False
    # For baseModel, we don't freeze any layers

def prepare_squad_dataset(tokenizer):
    dataset = load_dataset("squad")
    
    def preprocess_function(examples):
        questions = [q.strip() for q in examples["question"]]
        contexts = [c.strip() for c in examples["context"]]
        
        # Tokenize questions and contexts together
        tokenized_examples = tokenizer(
            questions,
            contexts,
            max_length=384,
            truncation="only_second",
            stride=128,
            return_overflowing_tokens=True,
            padding="max_length",
            return_offsets_mapping=True,
        )

        # Since one example might give us several features if it has a long context, we need a map from a feature to
        # its corresponding example. This key gives us just that.
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping", None)
        if sample_mapping is None:
            sample_mapping = list(range(len(questions)))

        # The offset mappings will give us a map from token to character position in the original context. This will
        # help us compute the start_positions and end_positions.
        offset_mapping = tokenized_examples["offset_mapping"]

        # Let's label those examples!
        tokenized_examples["start_positions"] = []
        tokenized_examples["end_positions"] = []

        for i, offsets in enumerate(offset_mapping):
            input_ids = tokenized_examples["input_ids"][i]
            
            # For GPT-2, we'll use the first token as our "impossible answer" token
            impossible_answer_index = 0

            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
            sequence_ids = tokenized_examples.sequence_ids(i) if hasattr(tokenized_examples, "sequence_ids") else None

            # One example can give several spans, this is the index of the example containing this span of text.
            sample_index = sample_mapping[i]
            answers = examples["answers"][sample_index]
            
            # If no answers are given, set the impossible_answer_index as answer.
            if len(answers["answer_start"]) == 0:
                tokenized_examples["start_positions"].append(impossible_answer_index)
                tokenized_examples["end_positions"].append(impossible_answer_index)
            else:
                # Start/end character index of the answer in the text.
                start_char = answers["answer_start"][0]
                end_char = start_char + len(answers["text"][0])

                # Start token index of the current span in the text.
                token_start_index = 0
                while sequence_ids is not None and token_start_index < len(sequence_ids) and sequence_ids[token_start_index] != 1:
                    token_start_index += 1

                # End token index of the current span in the text.
                token_end_index = len(input_ids) - 1
                while sequence_ids is not None and token_end_index >= 0 and sequence_ids[token_end_index] != 1:
                    token_end_index -= 1

                # Detect if the answer is out of the span (in which case this feature is labeled with the impossible_answer_index).
                if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                    tokenized_examples["start_positions"].append(impossible_answer_index)
                    tokenized_examples["end_positions"].append(impossible_answer_index)
                else:
                    # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                    # Note: we could go after the last offset if the answer is the last word (edge case).
                    while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                        token_start_index += 1
                    tokenized_examples["start_positions"].append(token_start_index - 1)
                    while token_end_index >= 0 and offsets[token_end_index][1] >= end_char:
                        token_end_index -= 1
                    tokenized_examples["end_positions"].append(token_end_index + 1)

        return tokenized_examples

    tokenized_datasets = dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=dataset["train"].column_names,
    )

    return tokenized_datasets


def compute_metrics(eval_pred):
    metric = load_metric("squad")
    logits, labels = eval_pred
    
    start_logits, end_logits = logits
    
    start_predictions = np.argmax(start_logits, axis=-1)
    end_predictions = np.argmax(end_logits, axis=-1)
    
    predictions = {
        'start_logits': start_logits,
        'end_logits': end_logits,
        'start_predictions': start_predictions,
        'end_predictions': end_predictions
    }
    
    # Ensure labels is a numpy array
    if isinstance(labels, tuple):
        start_positions, end_positions = labels
    else:
        start_positions = labels[:, 0]
        end_positions = labels[:, 1]
    
    references = {
        'start_positions': start_positions,
        'end_positions': end_positions
    }
    
    # Add error handling
    try:
        results = metric.compute(predictions=predictions, references=references)
        return results
    except Exception as e:
        print(f"Error in compute_metrics: {e}")
        print(f"predictions: {predictions}")
        print(f"references: {references}")
        return {'error': str(e)}


def fine_tune_model(model, tokenizer, dataset, output_dir, variant, num_train_epochs=2):
    # Initialize wandb run
    wandb.init(project=f"GPT-Valkyrie_LN-124m__{variant}__SQuAD", reinit=True)
    run_name = wandb.run.name

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=86,
        per_device_eval_batch_size=86,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=10,
        evaluation_strategy="steps",
        eval_steps=5,
        save_steps=10,
        load_best_model_at_end=True,
        report_to="wandb",
        run_name=run_name,
    )
    
    from transformers import default_data_collator

    # Use default_data_collator instead of a custom one
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["validation"],
        tokenizer=tokenizer,
        data_collator=default_data_collator,
        compute_metrics=compute_metrics,
        callbacks=[WandbCallback()],
    )

    trainer.train()
    wandb.finish()
    return trainer.model, run_name

In [5]:
# MAIN LOOP
wandb.login()

variants = ["noNorm", "AttnOnly", "FFNonly", "baseModel"]
base_model_path = "shng2025/GPT-Valkyrie_LN-124m__baseModel__"  # Changed to LN model

from transformers import GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

dataset = prepare_squad_dataset(tokenizer)



In [6]:
for variant in variants:
    print(f"Processing {variant} model...")
        
    # Use the correct base model for each variant
    model_path = f"shng2025/GPT-Valkyrie_LN-124m__{variant}__"    
    model = GPT2ForQuestionAnswering.from_pretrained(model_path)
    
    freeze_layers(model, variant)
    
    output_dir = f"./results/{variant}"
    fine_tuned_model, run_name = fine_tune_model(model, tokenizer, dataset, output_dir, variant)
    
    # Save the model locally
    local_save_dir = f"./local_models/GPT-Valkyrie_LN-124m__{variant}__SQuAD"
    fine_tuned_model.save_pretrained(local_save_dir)
    tokenizer.save_pretrained(local_save_dir)
    print(f"Model saved locally to {local_save_dir}")
    
    # Push the model to your HuggingFace Hub repository
    new_repo_name = f"shng2025/GPT-Valkyrie_LN-124m__{variant}__SQuAD"
    fine_tuned_model.push_to_hub(new_repo_name, branch=run_name)
    tokenizer.push_to_hub(new_repo_name, branch=run_name)
    print(f"Model pushed to HuggingFace Hub: {new_repo_name}, branch: {run_name}")

Some weights of GPT2ForQuestionAnswering were not initialized from the model checkpoint at shng2025/GPT-Valkyrie_LN-124m__noNorm__ and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing noNorm model...


VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112049744446672, max=1.0…

You are adding a <class 'transformers.integrations.integration_utils.WandbCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
WandbCallback


Step,Training Loss,Validation Loss


AttributeError: 'str' object has no attribute 'get'