In [1]:
!pip install torch transformers datasets tqdm pandas

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K 

In [4]:
import torch
from transformers import GPT2TokenizerFast, GPT2LMHeadModel
from datasets import load_dataset
from tqdm import tqdm
import pandas as pd

# Function to load model and tokenizer
def load_model_and_tokenizer(model_name):
    """
    Fetch a GPT-2 LM-head model and its fast tokenizer from a HuggingFace repo.

    The end-of-sequence token is reused as the padding token, both on the
    tokenizer and in the model config, so that a pad token is always defined.

    Args:
        model_name: Repository id (or path) handed to ``from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    gpt2_tokenizer = GPT2TokenizerFast.from_pretrained(model_name)
    gpt2_model = GPT2LMHeadModel.from_pretrained(model_name)

    # Reuse EOS as the pad token on both objects so they stay in agreement.
    eos_text, eos_id = gpt2_tokenizer.eos_token, gpt2_tokenizer.eos_token_id
    gpt2_tokenizer.pad_token = eos_text
    gpt2_model.config.pad_token_id = eos_id

    return gpt2_model, gpt2_tokenizer

# Function to generate summary
def generate_summary(model, tokenizer, text, max_input_length=874, max_summary_length=150):
    """
    Generates a summary for the given text using the fine-tuned model.

    The prompt has the form ``summarize: {text} summary:``. When the prompt
    would exceed ``max_input_length`` tokens, only the document part is cut,
    so the trailing `` summary:`` cue always reaches the model. (Truncating
    the whole prompt from the right would silently drop the cue for every
    long document.)

    Args:
        model: Causal language model exposing ``.device`` and ``.generate``.
        tokenizer: Tokenizer matching ``model``; must be callable and expose
            ``decode`` and ``eos_token_id``.
        text: Document to summarize.
        max_input_length: Maximum prompt length in tokens, cue included.
        max_summary_length: Maximum number of newly generated tokens.
            ``max_input_length + max_summary_length`` should not exceed the
            model's context window (the defaults sum to 1024).

    Returns:
        tuple[str, str]: ``(generated_summary, full_output)`` where
        ``generated_summary`` is the decoded, whitespace-stripped continuation
        only and ``full_output`` is the decoded prompt plus continuation.

    Raises:
        ValueError: If ``max_input_length`` is too small to hold the cue plus
            at least one token of the document part.
    """
    prompt_head = f"summarize: {text}"
    prompt_tail = " summary:"

    # Tokenize the cue on its own so it can be re-attached after truncation.
    tail_ids = list(tokenizer(prompt_tail, add_special_tokens=False)["input_ids"])
    head_budget = max_input_length - len(tail_ids)
    if head_budget < 1:
        raise ValueError(
            f"max_input_length={max_input_length} leaves no room for the input text "
            f"(the prompt suffix alone needs {len(tail_ids)} tokens)"
        )

    # Truncate only the document part; no padding is needed for a single sequence.
    head_ids = list(
        tokenizer(
            prompt_head,
            max_length=head_budget,
            truncation=True,
            padding=False,
        )["input_ids"]
    )

    input_ids = torch.tensor([head_ids + tail_ids], dtype=torch.long, device=model.device)
    # Every position is a real token (nothing is padded), so the mask is all ones.
    attention_mask = torch.ones_like(input_ids)

    # Record the length of the prompt so the continuation can be sliced off later.
    prompt_length = input_ids.size(1)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_summary_length,
            num_beams=5,                # Beam search for better quality
            no_repeat_ngram_size=3,     # Prevent repetition
            early_stopping=True,
            do_sample=False,            # Disable sampling for determinism
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Best sequence returned by generate: prompt tokens followed by the continuation.
    generated_ids = outputs[0]

    # Tokens beyond the prompt are the summary. max_new_tokens already bounds
    # their number; the slice keeps that limit explicit without a
    # decode -> re-tokenize round trip.
    summary_ids = generated_ids[prompt_length:prompt_length + max_summary_length]
    generated_summary = tokenizer.decode(summary_ids, skip_special_tokens=True).strip()

    # Full decoded output (prompt + summary) for manual inspection.
    full_output = tokenizer.decode(generated_ids, skip_special_tokens=True)

    return generated_summary, full_output

# Helper shared by the input-text and reference-summary truncation steps
def _truncate_to_token_limit(tokenizer, text, max_tokens):
    """
    Returns ``text`` cut to at most ``max_tokens`` tokenizer tokens.

    Text that already fits is returned unchanged (no tokenize/detokenize
    round trip); otherwise the first ``max_tokens`` tokens are converted
    back to a string.
    """
    tokens = tokenizer.tokenize(text)
    if len(tokens) <= max_tokens:
        return text
    return tokenizer.convert_tokens_to_string(tokens[:max_tokens])


# Main execution block
if __name__ == "__main__":
    # Evaluation settings, defined once instead of being repeated in the loop.
    NUM_EXAMPLES = 100          # number of leading ca_test examples to evaluate
    MAX_INPUT_TOKENS = 874      # prompt budget; 874 + 150 = 1024 tokens in total
    MAX_SUMMARY_TOKENS = 150    # generation / reference-summary budget

    # Load the BillSum dataset (ca_test split)
    print("Loading the BillSum dataset...")
    dataset = load_dataset("billsum", split="ca_test")
    print(f"Loaded dataset with {len(dataset)} examples.")

    # Use only the first NUM_EXAMPLES examples (fewer if the split is smaller)
    dataset = dataset.select(range(min(NUM_EXAMPLES, len(dataset))))
    print(f"Selected first {len(dataset)} examples for evaluation.")

    # Define the variants and norm types
    variants = ["baseModel", "noNorm", "AttnOnly", "FFNonly"]
    norm_types = ["LN", "RMSN"]

    # The device does not change between models, so pick it once.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Loop over norm types and variants
    for norm_type in norm_types:
        for variant in variants:
            model_name = f"shng2025/GPT-Valkyrie_{norm_type}-124m__{variant}__Billsum"
            print(f"\nProcessing model: {model_name}")

            # Load model and tokenizer; a checkpoint that fails to load is
            # reported and skipped so the remaining models still run.
            try:
                model, tokenizer = load_model_and_tokenizer(model_name)
            except Exception as e:
                print(f"Error loading model {model_name}: {e}")
                continue

            model.to(device)
            model.eval()

            # One record per example for this model
            rows = []

            for example in tqdm(dataset, desc=f"Generating summaries for {model_name}"):
                original_text = example['text']
                original_reference = example['summary']

                # Truncate the input text and the reference summary to their token budgets
                truncated_text = _truncate_to_token_limit(tokenizer, original_text, MAX_INPUT_TOKENS)
                truncated_reference = _truncate_to_token_limit(tokenizer, original_reference, MAX_SUMMARY_TOKENS)

                generated_summary, _ = generate_summary(
                    model,
                    tokenizer,
                    truncated_text,
                    max_input_length=MAX_INPUT_TOKENS,
                    max_summary_length=MAX_SUMMARY_TOKENS,
                )

                rows.append({
                    'model_name': model_name,
                    'norm_type': norm_type,
                    'variant': variant,
                    'input_text': original_text,
                    'truncated_input': truncated_text,
                    'reference_summary': original_reference,
                    'truncated_reference_summary': truncated_reference,
                    'generated_summary': generated_summary
                })

            # Save the data for this model to a separate CSV file
            df = pd.DataFrame(rows)
            # Create a filename based on norm_type and variant
            filename = f"{norm_type}_{variant}_evaluation_data.csv"
            df.to_csv(filename, index=False)
            print(f"Data for model {model_name} saved to {filename}")


Loading the BillSum dataset...
Loaded dataset with 1237 examples.
Selected first 100 examples for evaluation.

Processing model: shng2025/GPT-Valkyrie_LN-124m__baseModel__Billsum


Generating summaries for shng2025/GPT-Valkyrie_LN-124m__baseModel__Billsum:   0%|          | 0/100 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1643 > 1024). Running this sequence through the model will result in indexing errors
Generating summaries for shng2025/GPT-Valkyrie_LN-124m__baseModel__Billsum: 100%|██████████| 100/100 [04:55<00:00,  2.95s/it]


Data for model shng2025/GPT-Valkyrie_LN-124m__baseModel__Billsum saved to LN_baseModel_evaluation_data.csv

Processing model: shng2025/GPT-Valkyrie_LN-124m__noNorm__Billsum


Generating summaries for shng2025/GPT-Valkyrie_LN-124m__noNorm__Billsum:   0%|          | 0/100 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1643 > 1024). Running this sequence through the model will result in indexing errors
Generating summaries for shng2025/GPT-Valkyrie_LN-124m__noNorm__Billsum: 100%|██████████| 100/100 [04:53<00:00,  2.94s/it]


Data for model shng2025/GPT-Valkyrie_LN-124m__noNorm__Billsum saved to LN_noNorm_evaluation_data.csv

Processing model: shng2025/GPT-Valkyrie_LN-124m__AttnOnly__Billsum


Generating summaries for shng2025/GPT-Valkyrie_LN-124m__AttnOnly__Billsum:   0%|          | 0/100 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1643 > 1024). Running this sequence through the model will result in indexing errors
Generating summaries for shng2025/GPT-Valkyrie_LN-124m__AttnOnly__Billsum: 100%|██████████| 100/100 [04:54<00:00,  2.95s/it]


Data for model shng2025/GPT-Valkyrie_LN-124m__AttnOnly__Billsum saved to LN_AttnOnly_evaluation_data.csv

Processing model: shng2025/GPT-Valkyrie_LN-124m__FFNonly__Billsum


Generating summaries for shng2025/GPT-Valkyrie_LN-124m__FFNonly__Billsum:   0%|          | 0/100 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1643 > 1024). Running this sequence through the model will result in indexing errors
Generating summaries for shng2025/GPT-Valkyrie_LN-124m__FFNonly__Billsum: 100%|██████████| 100/100 [04:55<00:00,  2.96s/it]


Data for model shng2025/GPT-Valkyrie_LN-124m__FFNonly__Billsum saved to LN_FFNonly_evaluation_data.csv

Processing model: shng2025/GPT-Valkyrie_RMSN-124m__baseModel__Billsum


tokenizer_config.json:   0%|          | 0.00/476 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/131 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Generating summaries for shng2025/GPT-Valkyrie_RMSN-124m__baseModel__Billsum:   0%|          | 0/100 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1643 > 1024). Running this sequence through the model will result in indexing errors
Generating summaries for shng2025/GPT-Valkyrie_RMSN-124m__baseModel__Billsum: 100%|██████████| 100/100 [04:54<00:00,  2.94s/it]


Data for model shng2025/GPT-Valkyrie_RMSN-124m__baseModel__Billsum saved to RMSN_baseModel_evaluation_data.csv

Processing model: shng2025/GPT-Valkyrie_RMSN-124m__noNorm__Billsum


tokenizer_config.json:   0%|          | 0.00/476 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/131 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/868 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Generating summaries for shng2025/GPT-Valkyrie_RMSN-124m__noNorm__Billsum:   0%|          | 0/100 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1643 > 1024). Running this sequence through the model will result in indexing errors
Generating summaries for shng2025/GPT-Valkyrie_RMSN-124m__noNorm__Billsum: 100%|██████████| 100/100 [04:53<00:00,  2.94s/it]


Data for model shng2025/GPT-Valkyrie_RMSN-124m__noNorm__Billsum saved to RMSN_noNorm_evaluation_data.csv

Processing model: shng2025/GPT-Valkyrie_RMSN-124m__AttnOnly__Billsum


tokenizer_config.json:   0%|          | 0.00/476 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/131 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/872 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Generating summaries for shng2025/GPT-Valkyrie_RMSN-124m__AttnOnly__Billsum:   0%|          | 0/100 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1643 > 1024). Running this sequence through the model will result in indexing errors
Generating summaries for shng2025/GPT-Valkyrie_RMSN-124m__AttnOnly__Billsum: 100%|██████████| 100/100 [04:53<00:00,  2.93s/it]


Data for model shng2025/GPT-Valkyrie_RMSN-124m__AttnOnly__Billsum saved to RMSN_AttnOnly_evaluation_data.csv

Processing model: shng2025/GPT-Valkyrie_RMSN-124m__FFNonly__Billsum


tokenizer_config.json:   0%|          | 0.00/476 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/131 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/870 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Generating summaries for shng2025/GPT-Valkyrie_RMSN-124m__FFNonly__Billsum:   0%|          | 0/100 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1643 > 1024). Running this sequence through the model will result in indexing errors
Generating summaries for shng2025/GPT-Valkyrie_RMSN-124m__FFNonly__Billsum: 100%|██████████| 100/100 [04:55<00:00,  2.96s/it]

Data for model shng2025/GPT-Valkyrie_RMSN-124m__FFNonly__Billsum saved to RMSN_FFNonly_evaluation_data.csv





# GPT-4 Evaluation Prompt Template


As an expert evaluator, your task is to assess the quality of a generated summary based on the truncated input text and the truncated reference summary. Please focus on the truncated input when evaluating the generated summary. Rate the generated summary according to the following criteria, on a scale from 1 to 5 (where 1 is poor and 5 is excellent):

1. **Relevance**: Does the summary capture the main points of the **truncated input text**?
2. **Conciseness**: Is the summary succinct without unnecessary details?
3. **Fluency**: Is the summary well-written with correct grammar and style?
4. **Accuracy**: Does the summary accurately represent the content of the **truncated input text** without errors?
5. **Coherence**: Is the summary logically organized and easy to understand?

For each criterion, provide:

- **Score**: A number from 1 to 5.
- **Explanation**: A brief justification for the score.

After evaluating each criterion, provide an **overall score** and **brief overall feedback**.

---
**Truncated Input Text:**

*Insert the truncated input text here*

---
**Truncated Reference Summary:**

*Insert the truncated reference summary here*

---
**Generated Summary:**

*Insert the generated summary here*

---
**Your Evaluation:**

*GPT-4 will fill in this part*
