In [1]:
!pip install torch transformers datasets tqdm pandas evaluate

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[

In [3]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
from datasets import load_dataset
from evaluate import load as load_metric
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm

def load_model_and_tokenizer(model_name):
    """Load the tokenizer and the question-answering model for ``model_name``.

    Both objects are created with ``from_pretrained(model_name)``; the
    tokenizer is loaded first, then the model.

    Args:
        model_name: Identifier passed unchanged to both ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer)`` tuple -- note the model comes first.
    """
    qa_tokenizer = AutoTokenizer.from_pretrained(model_name)
    qa_model = AutoModelForQuestionAnswering.from_pretrained(model_name)
    return (qa_model, qa_tokenizer)

def _select_answer_span(start_logits, end_logits, null_index=0):
    """Pick the best candidate span and score it against the "no answer" option.

    The "no answer" score is the sum of the start and end logits at
    ``null_index``. The candidate span uses the highest start logit and the
    highest end logit over every *other* position; if the null position were
    allowed to compete, the candidate score could never be lower than the
    "no answer" score and the comparison would be meaningless.

    Args:
        start_logits: 1-D array of per-token start scores.
        end_logits: 1-D array of per-token end scores.
        null_index: Position whose logits represent "no answer".

    Returns:
        ``(start_index, end_index, best_answer_score, no_answer_score)``.
        ``start_index`` and ``end_index`` are ``None`` when there is no
        position other than ``null_index`` to choose from.
    """
    no_answer_score = float(start_logits[null_index]) + float(end_logits[null_index])
    if len(start_logits) <= 1:
        return None, None, no_answer_score, no_answer_score

    # Work on float copies so masking the null position never touches the inputs.
    candidate_start = np.array(start_logits, dtype=np.float64)
    candidate_end = np.array(end_logits, dtype=np.float64)
    candidate_start[null_index] = -np.inf
    candidate_end[null_index] = -np.inf

    start_index = int(np.argmax(candidate_start))
    end_index = int(np.argmax(candidate_end))
    best_answer_score = float(candidate_start[start_index] + candidate_end[end_index])
    return start_index, end_index, best_answer_score, no_answer_score


def evaluate_model(model_name, dataset, norm_type, max_samples=100):
    """Run extractive QA over ``dataset`` with one model and score it with ``squad_v2``.

    For each example the question/context pair is tokenized (truncated to 512
    tokens), the model is run without gradients, and an answer span is decoded
    from the start/end logits. An empty answer is predicted when the
    "no answer" score (logits at position 0) beats the best candidate span, or
    when the chosen end precedes the chosen start.

    Side effects:
        Writes ``{norm_type}_{variant}_gpt4_evaluation_data.csv`` to the current
        directory, holding question, context, reference answers and generated
        answer for every evaluated example. ``variant`` is the second
        ``"__"``-separated field of ``model_name``; if the name has no such
        field, the model name with ``/`` replaced by ``_`` is used instead.

    Args:
        model_name: Identifier passed to ``load_model_and_tokenizer``.
        dataset: Iterable of examples with ``"id"``, ``"question"``,
            ``"context"`` and ``"answers"`` (containing ``"text"``) entries.
        norm_type: Label used as the prefix of the output CSV file name.
        max_samples: Maximum number of examples to evaluate.

    Returns:
        Whatever ``squad_v2``'s ``compute`` returns for the collected
        predictions and references.
    """
    model, tokenizer = load_model_and_tokenizer(model_name)
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    all_predictions = []
    all_references = []
    data_for_gpt4 = []

    for i, example in enumerate(tqdm(dataset, desc=f"Evaluating {model_name}")):
        if i >= max_samples:
            break

        question = example["question"]
        context = example["context"]
        reference_answers = example["answers"]["text"]

        inputs = tokenizer(
            question,
            context,
            return_tensors="pt",
            truncation=True,
            max_length=512,
            padding=True
        ).to(device)

        with torch.no_grad():
            outputs = model(**inputs)

        start_logits = outputs.start_logits[0].cpu().numpy()
        end_logits = outputs.end_logits[0].cpu().numpy()

        # Best candidate span (null position excluded) versus the "no answer" score.
        start_index, end_index, best_answer_score, no_answer_score = _select_answer_span(
            start_logits, end_logits
        )

        if (
            start_index is None
            or no_answer_score > best_answer_score
            or end_index < start_index
        ):
            answer = ""
        else:
            answer_tokens = inputs["input_ids"][0][start_index:end_index + 1]
            answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)

        # Sigmoid of (no_answer_score - best_answer_score). A very large gap
        # overflows exp() to inf, which still yields the correct limit of 0.0,
        # so the overflow warning is suppressed.
        with np.errstate(over="ignore"):
            no_answer_probability = float(
                1.0 / (1.0 + np.exp(best_answer_score - no_answer_score))
            )

        # Collect predictions and references
        prediction = {
            "id": example["id"],
            "prediction_text": answer,
            "no_answer_probability": no_answer_probability,
        }
        reference = {"id": example["id"], "answers": example["answers"]}

        all_predictions.append(prediction)
        all_references.append(reference)

        # Collect data for GPT-4 evaluation
        data_for_gpt4.append({
            "model_name": model_name,
            "question": question,
            "context": context,
            "reference_answers": reference_answers,
            "generated_answer": answer
        })

    # Save data for GPT-4 evaluation
    df = pd.DataFrame(data_for_gpt4)
    # Derive the file name from norm_type and the variant field of the model
    # name. Fall back to a sanitised model name so an unexpected naming scheme
    # cannot raise here and discard everything collected above.
    name_parts = model_name.split("__")
    variant = name_parts[1] if len(name_parts) > 1 else model_name.replace("/", "_")
    filename = f"{norm_type}_{variant}_gpt4_evaluation_data.csv"
    df.to_csv(filename, index=False)
    print(f"Data for GPT-4 evaluation saved to {filename}")

    # Compute evaluation metrics
    squad_metric = load_metric("squad_v2")
    results = squad_metric.compute(predictions=all_predictions, references=all_references)
    return results

def main(max_samples=100):
    """Evaluate every norm-type/variant model on SQuAD v2 and save a summary CSV.

    Loads the ``squad_v2`` validation split, keeps at most ``max_samples``
    leading examples, and calls ``evaluate_model`` for each combination of
    norm type (``LN``, ``RMSN``) and variant (``baseModel``, ``noNorm``,
    ``AttnOnly``, ``FFNonly``). A model that fails to evaluate is reported on
    stdout and skipped so the remaining models still run.

    Side effects:
        Writes ``squad_evaluation_results.csv`` to the current directory with
        one row per successfully evaluated model.

    Args:
        max_samples: Upper bound on the number of validation examples used per
            model. Defaults to 100.
    """
    variants = ["baseModel", "noNorm", "AttnOnly", "FFNonly"]
    norm_types = ["LN", "RMSN"]
    results = []
    # Load dataset
    dataset = load_dataset("squad_v2", split="validation")
    # Limit to the first `max_samples` examples, clamped to the split size so
    # the selection never indexes past the end of a smaller dataset.
    sample_count = min(max_samples, len(dataset))
    dataset = dataset.select(range(sample_count))
    for norm_type in norm_types:
        for variant in variants:
            model_name = f"shng2025/GPT-Valkyrie_{norm_type}-124m__{variant}__SQuAD"
            print(f"\nEvaluating model {model_name}")
            try:
                metrics = evaluate_model(model_name, dataset, norm_type, max_samples=sample_count)
                metrics['model_name'] = model_name
                metrics['norm_type'] = norm_type
                metrics['variant'] = variant
                results.append(metrics)
            except Exception as e:
                # Deliberate best-effort: one failing model must not abort the sweep.
                print(f"Error evaluating model {model_name}: {e}")
    # Save results to CSV
    df = pd.DataFrame(results)
    df.to_csv('squad_evaluation_results.csv', index=False)
    print("\nEvaluation results saved to squad_evaluation_results.csv")

# Entry point: run the full evaluation sweep when this code executes as the
# main module.
if __name__ == "__main__":
    main()



Evaluating model shng2025/GPT-Valkyrie_LN-124m__baseModel__SQuAD


Evaluating shng2025/GPT-Valkyrie_LN-124m__baseModel__SQuAD: 100%|██████████| 100/100 [00:01<00:00, 74.95it/s]


Data for GPT-4 evaluation saved to LN_baseModel_gpt4_evaluation_data.csv

Evaluating model shng2025/GPT-Valkyrie_LN-124m__noNorm__SQuAD


Evaluating shng2025/GPT-Valkyrie_LN-124m__noNorm__SQuAD: 100%|██████████| 100/100 [00:01<00:00, 77.06it/s]


Data for GPT-4 evaluation saved to LN_noNorm_gpt4_evaluation_data.csv

Evaluating model shng2025/GPT-Valkyrie_LN-124m__AttnOnly__SQuAD


Evaluating shng2025/GPT-Valkyrie_LN-124m__AttnOnly__SQuAD: 100%|██████████| 100/100 [00:01<00:00, 77.95it/s]


Data for GPT-4 evaluation saved to LN_AttnOnly_gpt4_evaluation_data.csv

Evaluating model shng2025/GPT-Valkyrie_LN-124m__FFNonly__SQuAD


Evaluating shng2025/GPT-Valkyrie_LN-124m__FFNonly__SQuAD: 100%|██████████| 100/100 [00:01<00:00, 77.77it/s]


Data for GPT-4 evaluation saved to LN_FFNonly_gpt4_evaluation_data.csv

Evaluating model shng2025/GPT-Valkyrie_RMSN-124m__baseModel__SQuAD


Evaluating shng2025/GPT-Valkyrie_RMSN-124m__baseModel__SQuAD: 100%|██████████| 100/100 [00:01<00:00, 76.59it/s]


Data for GPT-4 evaluation saved to RMSN_baseModel_gpt4_evaluation_data.csv

Evaluating model shng2025/GPT-Valkyrie_RMSN-124m__noNorm__SQuAD


Evaluating shng2025/GPT-Valkyrie_RMSN-124m__noNorm__SQuAD: 100%|██████████| 100/100 [00:01<00:00, 76.30it/s]


Data for GPT-4 evaluation saved to RMSN_noNorm_gpt4_evaluation_data.csv

Evaluating model shng2025/GPT-Valkyrie_RMSN-124m__AttnOnly__SQuAD


Evaluating shng2025/GPT-Valkyrie_RMSN-124m__AttnOnly__SQuAD: 100%|██████████| 100/100 [00:01<00:00, 77.03it/s]


Data for GPT-4 evaluation saved to RMSN_AttnOnly_gpt4_evaluation_data.csv

Evaluating model shng2025/GPT-Valkyrie_RMSN-124m__FFNonly__SQuAD


Evaluating shng2025/GPT-Valkyrie_RMSN-124m__FFNonly__SQuAD: 100%|██████████| 100/100 [00:01<00:00, 75.76it/s]


Data for GPT-4 evaluation saved to RMSN_FFNonly_gpt4_evaluation_data.csv

Evaluation results saved to squad_evaluation_results.csv


# GPT-4o SQuAD prompt

As an expert evaluator, your task is to assess the quality of an answer generated by a question-answering system based on the provided context. Please focus on the following criteria:

1. **Correctness**: Is the answer correct based on the context?
2. **Completeness**: Does the answer fully address the question?
3. **Relevance**: Is the answer relevant to the question and context?
4. **Fluency**: Is the answer well-written with proper grammar and style?
5. **Conciseness**: Is the answer concise and to the point?

For each criterion, provide:

- **Score**: A number from 1 to 5 (where 1 is poor and 5 is excellent).
- **Explanation**: A brief justification for the score.

After evaluating each criterion, provide an **overall score** (on the same 1–5 scale) and **brief overall feedback**.

---

**Model Name:**

*Insert the model name here*

---

**Question:**

*Insert the question here*

---

**Context:**

*Insert the context here*

---

**Reference Answers:**

*Insert the reference answers here*

---

**Generated Answer:**

*Insert the generated answer here*

---

**Your Evaluation:**

*GPT-4o will fill in this part*
