In [1]:
!pip install torch transformers datasets tqdm pandas
!pip install evaluate

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K  

In [2]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
from datasets import load_dataset
from evaluate import load as load_metric
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm

def load_model_and_tokenizer(model_name):
    """Load the pretrained question-answering model and tokenizer for ``model_name``.

    Args:
        model_name: Identifier passed unchanged to both ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer)`` tuple -- note that the model comes first.
    """
    qa_tokenizer = AutoTokenizer.from_pretrained(model_name)
    qa_model = AutoModelForQuestionAnswering.from_pretrained(model_name)
    return qa_model, qa_tokenizer

def _best_non_null_span(start_logits, end_logits):
    """Return the highest-scoring span ``(start, end, score)`` with ``1 <= start <= end``.

    Position 0 is excluded because the caller scores it separately as the
    "no answer" option. Returns ``(None, None, -inf)`` when the sequence has no
    position other than 0.
    """
    if start_logits.shape[0] < 2:
        return None, None, float("-inf")

    span_starts = start_logits[1:]
    span_ends = end_logits[1:]
    # For each candidate end, the best start at or before it is a running max,
    # so the best valid span is found in a single linear pass.
    best_start_so_far = np.maximum.accumulate(span_starts)
    end_offset = int(np.argmax(best_start_so_far + span_ends))
    start_offset = int(np.argmax(span_starts[: end_offset + 1]))
    score = float(span_starts[start_offset]) + float(span_ends[end_offset])
    # Offsets are relative to position 1; shift back to full-sequence indices.
    return start_offset + 1, end_offset + 1, score


def _stable_sigmoid(x):
    """Logistic function that never calls ``exp`` on a positive argument (no overflow)."""
    if x >= 0:
        return float(1.0 / (1.0 + np.exp(-x)))
    exp_x = np.exp(x)
    return float(exp_x / (1.0 + exp_x))


def evaluate_model(model_name, dataset, max_samples=100):
    """Evaluate one extractive-QA checkpoint with the ``squad_v2`` metric.

    Each example is tokenized as a (question, context) pair, run through the
    model once, and turned into a prediction by comparing the best answer span
    against the "no answer" score.

    The "no answer" score is ``start_logits[0] + end_logits[0]``; the answer
    span is the best-scoring span with ``1 <= start <= end``. Searching only
    over valid, non-null spans is what makes the comparison meaningful: the sum
    of the two global maxima can never be lower than the position-0 sum, so
    comparing against it would never select "no answer".
    NOTE(review): treating position 0 as the null slot is the convention this
    code relies on; whether the checkpoints were fine-tuned that way is not
    visible here -- confirm against the training script.

    Args:
        model_name: Identifier handed to ``load_model_and_tokenizer``.
        dataset: Iterable of examples exposing ``"id"``, ``"question"``,
            ``"context"`` and ``"answers"``.
        max_samples: Maximum number of examples to score; ``None`` scores all.

    Returns:
        The result of ``squad_metric.compute`` for the collected predictions.
    """
    model, tokenizer = load_model_and_tokenizer(model_name)
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    all_predictions = []
    all_references = []

    for i, example in enumerate(tqdm(dataset, desc=f"Evaluating {model_name}")):
        if max_samples is not None and i >= max_samples:
            break

        inputs = tokenizer(
            example["question"],
            example["context"],
            return_tensors="pt",
            truncation=True,
            max_length=512,
            padding=True
        ).to(device)

        with torch.no_grad():
            outputs = model(**inputs)

        start_logits = outputs.start_logits[0].cpu().numpy()
        end_logits = outputs.end_logits[0].cpu().numpy()

        # Score of the "no answer" option (position 0 by convention).
        no_answer_score = float(start_logits[0]) + float(end_logits[0])
        # Best span that is actually a valid answer (ordered, excludes position 0).
        start_index, end_index, best_answer_score = _best_non_null_span(start_logits, end_logits)

        if start_index is None or no_answer_score > best_answer_score:
            answer = ""
        else:
            answer_tokens = inputs["input_ids"][0][start_index:end_index+1]
            answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)

        # Collect predictions and references. The probability is derived from
        # the same two scores that decided the prediction, so it is > 0.5
        # exactly when "no answer" won.
        prediction = {
            "id": example["id"],
            "prediction_text": answer,
            "no_answer_probability": _stable_sigmoid(no_answer_score - best_answer_score),
        }
        reference = {"id": example["id"], "answers": example["answers"]}

        all_predictions.append(prediction)
        all_references.append(reference)

    squad_metric = load_metric("squad_v2")
    results = squad_metric.compute(predictions=all_predictions, references=all_references)
    return results

def main():
    """Evaluate every GPT-Valkyrie SQuAD checkpoint and write the metrics to a CSV.

    The first 100 examples of the ``squad_v2`` validation split are scored for
    each (norm type, variant) combination. A model that raises during
    evaluation is reported and skipped; the remaining models still run. One row
    per successfully evaluated model is saved to
    ``squad_evaluation_results.csv``.
    """
    # Load dataset
    validation_split = load_dataset("squad_v2", split="validation")
    # Limit to first 100 examples
    eval_subset = validation_split.select(range(100))

    # Norm type is the outer dimension, so all "LN" variants run before "RMSN".
    model_grid = [
        (norm_type, variant)
        for norm_type in ["LN", "RMSN"]
        for variant in ["baseModel", "noNorm", "AttnOnly", "FFNonly"]
    ]

    collected_rows = []
    for norm_type, variant in model_grid:
        model_name = f"shng2025/GPT-Valkyrie_{norm_type}-124m__{variant}__SQuAD"
        print(f"\nEvaluating model {model_name}")
        try:
            metrics = evaluate_model(model_name, eval_subset, max_samples=100)
            # Tag the metric dict so each CSV row identifies its model.
            metrics['model_name'] = model_name
            metrics['norm_type'] = norm_type
            metrics['variant'] = variant
            collected_rows.append(metrics)
        except Exception as e:
            # Best-effort sweep: report the failure and move on to the next model.
            print(f"Error evaluating model {model_name}: {e}")

    # Save results to CSV
    summary_frame = pd.DataFrame(collected_rows)
    summary_frame.to_csv('squad_evaluation_results.csv', index=False)
    print("\nEvaluation results saved to squad_evaluation_results.csv")

# Entry point: run the full evaluation sweep when this code is executed
# directly (a notebook cell also runs with __name__ == "__main__").
if __name__ == "__main__":
    main()


README.md:   0%|          | 0.00/8.92k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]


Evaluating model shng2025/GPT-Valkyrie_LN-124m__baseModel__SQuAD


tokenizer_config.json:   0%|          | 0.00/671 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/269 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/825 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Evaluating shng2025/GPT-Valkyrie_LN-124m__baseModel__SQuAD: 100%|██████████| 100/100 [00:22<00:00,  4.42it/s]


Downloading builder script:   0%|          | 0.00/6.47k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/11.3k [00:00<?, ?B/s]


Evaluating model shng2025/GPT-Valkyrie_LN-124m__noNorm__SQuAD


tokenizer_config.json:   0%|          | 0.00/671 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/269 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/850 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Evaluating shng2025/GPT-Valkyrie_LN-124m__noNorm__SQuAD: 100%|██████████| 100/100 [00:21<00:00,  4.76it/s]



Evaluating model shng2025/GPT-Valkyrie_LN-124m__AttnOnly__SQuAD


tokenizer_config.json:   0%|          | 0.00/671 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/269 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/854 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Evaluating shng2025/GPT-Valkyrie_LN-124m__AttnOnly__SQuAD: 100%|██████████| 100/100 [00:21<00:00,  4.76it/s]



Evaluating model shng2025/GPT-Valkyrie_LN-124m__FFNonly__SQuAD


tokenizer_config.json:   0%|          | 0.00/671 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/269 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/852 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Evaluating shng2025/GPT-Valkyrie_LN-124m__FFNonly__SQuAD: 100%|██████████| 100/100 [00:21<00:00,  4.75it/s]



Evaluating model shng2025/GPT-Valkyrie_RMSN-124m__baseModel__SQuAD


tokenizer_config.json:   0%|          | 0.00/671 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/269 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/827 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Evaluating shng2025/GPT-Valkyrie_RMSN-124m__baseModel__SQuAD: 100%|██████████| 100/100 [00:21<00:00,  4.73it/s]



Evaluating model shng2025/GPT-Valkyrie_RMSN-124m__noNorm__SQuAD


tokenizer_config.json:   0%|          | 0.00/671 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/269 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/852 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Evaluating shng2025/GPT-Valkyrie_RMSN-124m__noNorm__SQuAD: 100%|██████████| 100/100 [00:21<00:00,  4.70it/s]



Evaluating model shng2025/GPT-Valkyrie_RMSN-124m__AttnOnly__SQuAD


tokenizer_config.json:   0%|          | 0.00/671 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/269 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/856 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Evaluating shng2025/GPT-Valkyrie_RMSN-124m__AttnOnly__SQuAD: 100%|██████████| 100/100 [00:21<00:00,  4.72it/s]



Evaluating model shng2025/GPT-Valkyrie_RMSN-124m__FFNonly__SQuAD


tokenizer_config.json:   0%|          | 0.00/671 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/269 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/854 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Evaluating shng2025/GPT-Valkyrie_RMSN-124m__FFNonly__SQuAD: 100%|██████████| 100/100 [00:21<00:00,  4.64it/s]



Evaluation results saved to squad_evaluation_results.csv
