In [3]:
# !pip install torch transformers datasets tqdm evaluate sacrebleu numpy pandas
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=546abe8bce071b2d1d21e78577a3717fc4b8d95414702b8dcc8316ce1e19ed36
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [8]:
import torch
from transformers import GPT2TokenizerFast, GPT2LMHeadModel
from datasets import load_dataset
from tqdm import tqdm
import evaluate
import sacrebleu
import numpy as np
import pandas as pd

# Function to load model and tokenizer
def load_model_and_tokenizer(model_name):
    tokenizer = GPT2TokenizerFast.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token  # Ensure pad_token is set
    model.config.pad_token_id = tokenizer.eos_token_id  # Set pad_token_id in model config
    return model, tokenizer

# Updated function to generate summary
def generate_summary(model, tokenizer, text, max_input_length=874, max_summary_length=150):
    """
    Generates a summary for the given text using the fine-tuned model.
    """
    # Prepare the prompt
    prompt = f"summarize: {text} summary:"

    # Tokenize the prompt
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True,
        padding=False  # Do not pad here
    )

    input_ids = inputs.input_ids.to(model.device)
    attention_mask = inputs.attention_mask.to(model.device)

    # Record the length of the prompt
    prompt_length = input_ids.size(1)

    # Generate summary
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_summary_length,
            num_beams=5,
            no_repeat_ngram_size=3,
            early_stopping=True,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Extract generated token IDs
    generated_ids = outputs[0]

    # Extract summary token IDs (tokens beyond the prompt)
    summary_ids = generated_ids[prompt_length:]

    # Decode the summary tokens
    generated_summary = tokenizer.decode(summary_ids, skip_special_tokens=True).strip()

    # Truncate the summary if it exceeds max_summary_length
    generated_summary_tokens = tokenizer.tokenize(generated_summary)
    if len(generated_summary_tokens) > max_summary_length:
        generated_summary_tokens = generated_summary_tokens[:max_summary_length]
        generated_summary = tokenizer.convert_tokens_to_string(generated_summary_tokens)

    # Decode the full generated output for manual inspection (optional)
    full_output = tokenizer.decode(generated_ids, skip_special_tokens=True)

    return generated_summary, full_output

# Function to compute metrics
def compute_metrics(predictions, references):
    """
    Compute ROUGE and BLEU metrics for summarization.
    """
    rouge = evaluate.load("rouge")

    # Compute ROUGE scores
    rouge_result = rouge.compute(
        predictions=predictions,
        references=references,
        use_stemmer=True,
        use_aggregator=True  # Ensure aggregated scores
    )

    # Compute BLEU scores using SacreBLEU with smoothing
    bleu_scores = sacrebleu.corpus_bleu(
        predictions,
        [references],
        smooth_method='exp',       # Exponential smoothing
        smooth_value=0.1,
        force=True,                # Force compute even if length mismatch
        lowercase=True,            # Normalize case
        tokenize='13a'             # Tokenizer type (SacreBLEU default)
    )
    bleu_score = bleu_scores.score

    # Aggregate the results
    result = {
        "rouge1": round(rouge_result["rouge1"], 4),
        "rouge2": round(rouge_result["rouge2"], 4),
        "rougeL": round(rouge_result["rougeL"], 4),
        "bleu": round(bleu_score, 4)
    }

    # Optional: Calculate average prediction length
    prediction_lens = [len(pred.split()) for pred in predictions]
    result["gen_len"] = round(np.mean(prediction_lens), 4)

    return result

# Function to evaluate the model
def evaluate_model(model_name, dataset, max_input_length=874, max_summary_length=150):
    """
    Evaluates the fine-tuned model on the provided dataset.
    """
    print(f"Evaluating model: {model_name}")
    try:
        model, tokenizer = load_model_and_tokenizer(model_name)
    except Exception as e:
        print(f"Error loading model {model_name}: {e}")
        return None

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    model.eval()

    generated_summaries = []
    reference_summaries_full = []
    reference_summaries_truncated = []

    for i, example in enumerate(tqdm(dataset, desc=f"Generating summaries for {model_name}")):
        text = example['text']
        reference_full = example['summary']  # Original full-length reference summary

        # Truncate the input text
        max_input_tokens = max_input_length
        text_tokens = tokenizer.tokenize(text)
        if len(text_tokens) > max_input_tokens:
            text_tokens = text_tokens[:max_input_tokens]
            text = tokenizer.convert_tokens_to_string(text_tokens)

        # Truncate the reference summary
        reference_tokens = tokenizer.tokenize(reference_full)
        if len(reference_tokens) > max_summary_length:
            reference_tokens_truncated = reference_tokens[:max_summary_length]
            reference_truncated = tokenizer.convert_tokens_to_string(reference_tokens_truncated)
        else:
            reference_truncated = reference_full

        generated_summary, _ = generate_summary(model, tokenizer, text, max_input_length, max_summary_length)

        if generated_summary:
            generated_summaries.append(generated_summary)
            reference_summaries_full.append(reference_full)
            reference_summaries_truncated.append(reference_truncated)
        else:
            print(f"Warning: Empty summary generated for example {i+1}")

    if not generated_summaries:
        print(f"Warning: No valid summaries generated for {model_name}")
        return None

    # Compute metrics using full reference summaries
    scores_full = compute_metrics(generated_summaries, reference_summaries_full)
    # Compute metrics using truncated reference summaries
    scores_truncated = compute_metrics(generated_summaries, reference_summaries_truncated)

    # Combine the scores into a single dictionary
    scores = {
        'rouge1_full': scores_full['rouge1'],
        'rouge2_full': scores_full['rouge2'],
        'rougeL_full': scores_full['rougeL'],
        'bleu_full': scores_full['bleu'],
        'rouge1_truncated': scores_truncated['rouge1'],
        'rouge2_truncated': scores_truncated['rouge2'],
        'rougeL_truncated': scores_truncated['rougeL'],
        'bleu_truncated': scores_truncated['bleu'],
        'gen_len': scores_full['gen_len']  # Generated summary length is the same
    }

    print(f"\nScores for {model_name} (using full reference summaries):")
    for metric in ['rouge1_full', 'rouge2_full', 'rougeL_full', 'bleu_full']:
        print(f"{metric}: {scores[metric]:.4f}")

    print(f"\nScores for {model_name} (using truncated reference summaries):")
    for metric in ['rouge1_truncated', 'rouge2_truncated', 'rougeL_truncated', 'bleu_truncated']:
        print(f"{metric}: {scores[metric]:.4f}")

    return scores


# Main execution block
if __name__ == "__main__":
    # Load the BillSum dataset (ca_test split)
    print("Loading the BillSum dataset...")
    dataset = load_dataset("billsum", split="ca_test")
    print(f"Loaded dataset with {len(dataset)} examples.")

    # Use only the first 100 examples
    dataset = dataset.select(range(100))
    print(f"Selected first {len(dataset)} examples for evaluation.")

    # Define the variants and norm types
    variants = ["baseModel", "noNorm", "AttnOnly", "FFNonly"]
    norm_types = ["LN", "RMSN"]

    # Initialize a list to store results
    results = []

    # Loop over norm types and variants
    for norm_type in norm_types:
        for variant in variants:
            model_name = f"shng2025/GPT-Valkyrie_{norm_type}-124m__{variant}__Billsum"
            print(f"\nProcessing model: {model_name}")

            # Evaluate the model
            scores = evaluate_model(
                model_name,
                dataset,
                max_input_length=874,
                max_summary_length=150
            )

            if scores:
                # Add model information to the scores
                scores['model_name'] = model_name
                scores['norm_type'] = norm_type
                scores['variant'] = variant

                # Append to the results
                results.append(scores)
            else:
                print(f"Skipping model {model_name} due to errors.")

    # Save the results to a CSV file
    df = pd.DataFrame(results)
    df.to_csv('evaluation_results.csv', index=False)
    print("\nEvaluation results saved to evaluation_results.csv")



Loading the BillSum dataset...
Loaded dataset with 1237 examples.
Selected first 100 examples for evaluation.

Processing model: shng2025/GPT-Valkyrie_LN-124m__baseModel__Billsum
Evaluating model: shng2025/GPT-Valkyrie_LN-124m__baseModel__Billsum


Generating summaries for shng2025/GPT-Valkyrie_LN-124m__baseModel__Billsum:   0%|          | 0/100 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1643 > 1024). Running this sequence through the model will result in indexing errors
Generating summaries for shng2025/GPT-Valkyrie_LN-124m__baseModel__Billsum: 100%|██████████| 100/100 [04:55<00:00,  2.96s/it]



Scores for shng2025/GPT-Valkyrie_LN-124m__baseModel__Billsum (using full reference summaries):
rouge1_full: 0.2735
rouge2_full: 0.0593
rougeL_full: 0.1613
bleu_full: 0.5215

Scores for shng2025/GPT-Valkyrie_LN-124m__baseModel__Billsum (using truncated reference summaries):
rouge1_truncated: 0.3354
rouge2_truncated: 0.0607
rougeL_truncated: 0.1926
bleu_truncated: 2.1973

Processing model: shng2025/GPT-Valkyrie_LN-124m__noNorm__Billsum
Evaluating model: shng2025/GPT-Valkyrie_LN-124m__noNorm__Billsum


Generating summaries for shng2025/GPT-Valkyrie_LN-124m__noNorm__Billsum:   0%|          | 0/100 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1643 > 1024). Running this sequence through the model will result in indexing errors
Generating summaries for shng2025/GPT-Valkyrie_LN-124m__noNorm__Billsum: 100%|██████████| 100/100 [04:54<00:00,  2.94s/it]



Scores for shng2025/GPT-Valkyrie_LN-124m__noNorm__Billsum (using full reference summaries):
rouge1_full: 0.2669
rouge2_full: 0.0565
rougeL_full: 0.1550
bleu_full: 0.6288

Scores for shng2025/GPT-Valkyrie_LN-124m__noNorm__Billsum (using truncated reference summaries):
rouge1_truncated: 0.3213
rouge2_truncated: 0.0597
rougeL_truncated: 0.1845
bleu_truncated: 2.3720

Processing model: shng2025/GPT-Valkyrie_LN-124m__AttnOnly__Billsum
Evaluating model: shng2025/GPT-Valkyrie_LN-124m__AttnOnly__Billsum


Generating summaries for shng2025/GPT-Valkyrie_LN-124m__AttnOnly__Billsum:   0%|          | 0/100 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1643 > 1024). Running this sequence through the model will result in indexing errors
Generating summaries for shng2025/GPT-Valkyrie_LN-124m__AttnOnly__Billsum: 100%|██████████| 100/100 [04:53<00:00,  2.94s/it]



Scores for shng2025/GPT-Valkyrie_LN-124m__AttnOnly__Billsum (using full reference summaries):
rouge1_full: 0.2738
rouge2_full: 0.0595
rougeL_full: 0.1605
bleu_full: 0.6690

Scores for shng2025/GPT-Valkyrie_LN-124m__AttnOnly__Billsum (using truncated reference summaries):
rouge1_truncated: 0.3306
rouge2_truncated: 0.0602
rougeL_truncated: 0.1894
bleu_truncated: 2.5516

Processing model: shng2025/GPT-Valkyrie_LN-124m__FFNonly__Billsum
Evaluating model: shng2025/GPT-Valkyrie_LN-124m__FFNonly__Billsum


Generating summaries for shng2025/GPT-Valkyrie_LN-124m__FFNonly__Billsum:   0%|          | 0/100 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1643 > 1024). Running this sequence through the model will result in indexing errors
Generating summaries for shng2025/GPT-Valkyrie_LN-124m__FFNonly__Billsum: 100%|██████████| 100/100 [04:53<00:00,  2.94s/it]



Scores for shng2025/GPT-Valkyrie_LN-124m__FFNonly__Billsum (using full reference summaries):
rouge1_full: 0.2652
rouge2_full: 0.0541
rougeL_full: 0.1559
bleu_full: 0.5657

Scores for shng2025/GPT-Valkyrie_LN-124m__FFNonly__Billsum (using truncated reference summaries):
rouge1_truncated: 0.3208
rouge2_truncated: 0.0576
rougeL_truncated: 0.1849
bleu_truncated: 2.0730

Processing model: shng2025/GPT-Valkyrie_RMSN-124m__baseModel__Billsum
Evaluating model: shng2025/GPT-Valkyrie_RMSN-124m__baseModel__Billsum


Generating summaries for shng2025/GPT-Valkyrie_RMSN-124m__baseModel__Billsum:   0%|          | 0/100 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1643 > 1024). Running this sequence through the model will result in indexing errors
Generating summaries for shng2025/GPT-Valkyrie_RMSN-124m__baseModel__Billsum: 100%|██████████| 100/100 [04:54<00:00,  2.94s/it]



Scores for shng2025/GPT-Valkyrie_RMSN-124m__baseModel__Billsum (using full reference summaries):
rouge1_full: 0.2697
rouge2_full: 0.0624
rougeL_full: 0.1550
bleu_full: 0.7039

Scores for shng2025/GPT-Valkyrie_RMSN-124m__baseModel__Billsum (using truncated reference summaries):
rouge1_truncated: 0.3272
rouge2_truncated: 0.0617
rougeL_truncated: 0.1814
bleu_truncated: 2.3624

Processing model: shng2025/GPT-Valkyrie_RMSN-124m__noNorm__Billsum
Evaluating model: shng2025/GPT-Valkyrie_RMSN-124m__noNorm__Billsum


tokenizer_config.json:   0%|          | 0.00/476 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/131 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/868 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Generating summaries for shng2025/GPT-Valkyrie_RMSN-124m__noNorm__Billsum:   0%|          | 0/100 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1643 > 1024). Running this sequence through the model will result in indexing errors
Generating summaries for shng2025/GPT-Valkyrie_RMSN-124m__noNorm__Billsum: 100%|██████████| 100/100 [04:53<00:00,  2.94s/it]



Scores for shng2025/GPT-Valkyrie_RMSN-124m__noNorm__Billsum (using full reference summaries):
rouge1_full: 0.2708
rouge2_full: 0.0620
rougeL_full: 0.1539
bleu_full: 0.6267

Scores for shng2025/GPT-Valkyrie_RMSN-124m__noNorm__Billsum (using truncated reference summaries):
rouge1_truncated: 0.3302
rouge2_truncated: 0.0597
rougeL_truncated: 0.1831
bleu_truncated: 2.1903

Processing model: shng2025/GPT-Valkyrie_RMSN-124m__AttnOnly__Billsum
Evaluating model: shng2025/GPT-Valkyrie_RMSN-124m__AttnOnly__Billsum


tokenizer_config.json:   0%|          | 0.00/476 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/131 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/872 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Generating summaries for shng2025/GPT-Valkyrie_RMSN-124m__AttnOnly__Billsum:   0%|          | 0/100 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1643 > 1024). Running this sequence through the model will result in indexing errors
Generating summaries for shng2025/GPT-Valkyrie_RMSN-124m__AttnOnly__Billsum: 100%|██████████| 100/100 [04:52<00:00,  2.92s/it]



Scores for shng2025/GPT-Valkyrie_RMSN-124m__AttnOnly__Billsum (using full reference summaries):
rouge1_full: 0.2788
rouge2_full: 0.0667
rougeL_full: 0.1598
bleu_full: 0.7991

Scores for shng2025/GPT-Valkyrie_RMSN-124m__AttnOnly__Billsum (using truncated reference summaries):
rouge1_truncated: 0.3407
rouge2_truncated: 0.0681
rougeL_truncated: 0.1909
bleu_truncated: 2.6970

Processing model: shng2025/GPT-Valkyrie_RMSN-124m__FFNonly__Billsum
Evaluating model: shng2025/GPT-Valkyrie_RMSN-124m__FFNonly__Billsum


tokenizer_config.json:   0%|          | 0.00/476 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/131 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/870 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Generating summaries for shng2025/GPT-Valkyrie_RMSN-124m__FFNonly__Billsum:   0%|          | 0/100 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1643 > 1024). Running this sequence through the model will result in indexing errors
Generating summaries for shng2025/GPT-Valkyrie_RMSN-124m__FFNonly__Billsum: 100%|██████████| 100/100 [04:53<00:00,  2.94s/it]



Scores for shng2025/GPT-Valkyrie_RMSN-124m__FFNonly__Billsum (using full reference summaries):
rouge1_full: 0.2756
rouge2_full: 0.0641
rougeL_full: 0.1570
bleu_full: 0.7096

Scores for shng2025/GPT-Valkyrie_RMSN-124m__FFNonly__Billsum (using truncated reference summaries):
rouge1_truncated: 0.3312
rouge2_truncated: 0.0637
rougeL_truncated: 0.1855
bleu_truncated: 2.4964

Evaluation results saved to evaluation_results.csv
