In [1]:
# Cell 1: Install (run once)
!pip install datasets



In [2]:
# Load the SUMPUBMED dataset
from datasets import load_dataset
dataset = load_dataset("Blaise-g/SumPubmed")
print("Dataset loaded!")
print(dataset)

  from .autonotebook import tqdm as notebook_tqdm


Dataset loaded!
DatasetDict({
    train: Dataset({
        features: ['line_text', 'filename_text', 'text', 'shorter_abstract', 'abstract'],
        num_rows: 26147
    })
    test: Dataset({
        features: ['line_text', 'filename_text', 'text', 'shorter_abstract', 'abstract'],
        num_rows: 3269
    })
    dev: Dataset({
        features: ['line_text', 'filename_text', 'text', 'shorter_abstract', 'abstract'],
        num_rows: 3268
    })
})


In [3]:
# Look at one example
example = dataset["train"][0]
print("PAPER TEXT (first 500 chars):")
print(example["text"][:500])
print("\n" + "="*50 + "\n")
print("ABSTRACT:")
print(example["abstract"])

PAPER TEXT (first 500 chars):
 thioredoxins are widely distributed in nature from prokaryotes to eukaryotes. these proteins, which belong to the oxidoreductase thiol:disulfide superfamily, are characterized by the active site signature sequence wcxxc. this sequence motif constitutes the redox center mediating the isomerization of specific disulfide bridges on trx target proteins. in yeasts and mammals, the cytoplasmic trx redox system is complemented by a second trx system within mitochondria. in plants, the system is more i


ABSTRACT:
 natrxh, a thioredoxin type h, shows differential expression between self-incompatible and self-compatible nicotiana species. natrxh interacts in vitro with s-rnase and co-localizes with it in the extracellular matrix of the stylar transmitting tissue. natrxh contains n- and c-terminal extensions, a feature shared by thioredoxin h proteins of subgroup to ascertain the function of these extensions in natrxh secretion and protein-protein interaction, we p

In [4]:
# Install PyTorch (required for running models)
!pip install torch



In [5]:
!pip install transformers==4.40.0



In [6]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
# Load pre-trained GPT-2
model_name = "gpt2"
print("Loading tokenizer...")
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
print("Loading model...")
model = GPT2LMHeadModel.from_pretrained(model_name)
# GPT-2 doesn't have a pad token by default, set it to eos_token
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id
print("Model loaded!")
print(f"Model: {model_name}")
print(f"Parameters: {model.num_parameters():,}")

Loading tokenizer...




Loading model...
Model loaded!
Model: gpt2
Parameters: 124,439,808


In [7]:
# Task 3: Generate Baseline Summaries
# Test on a single example first
def generate_summary(text, prompt_template="Summarize the following medical research:\n\n{text}\n\nSummary:", max_new_tokens=150):
    """Generate a summary using pre-trained GPT-2."""
    # Truncate text to fit within GPT-2's context window (1024 tokens)
    # Use conservative char limit since medical text has complex tokens
    max_input_chars = 1500
    truncated_text = text[:max_input_chars]
    
    # Format prompt
    prompt = prompt_template.format(text=truncated_text)
    
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=800  # Leave room for generation
    )
    
    output = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=max_new_tokens,
        num_beams=4,
        early_stopping=True,
        no_repeat_ngram_size=3,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=False
    )
    
    # Decode and extract only the generated part (after the prompt)
    full_output = tokenizer.decode(output[0], skip_special_tokens=True)
    summary = full_output[len(prompt):].strip()
    
    return summary
# Test on first test example
test_example = dataset["test"][0]
print("=" * 60)
print("INPUT (first 500 chars):")
print(test_example["text"][:500])
print("\n" + "=" * 60)
print("\nGROUND TRUTH ABSTRACT:")
print(test_example["abstract"][:500] + "...")
print("\n" + "=" * 60)
print("\nBASELINE GENERATED SUMMARY:")
baseline_summary = generate_summary(test_example["text"])
print(baseline_summary)

INPUT (first 500 chars):
 evolutionary molecular biology is mostly concerned with the forces affecting individual genes. however, observations of variable proportions of guanine and cytosine in different species and in different genomic regions of vertebrates have prompted the analysis of forces that may affect the evolution of complete genomes. one particular hypothesis concerns adaptation to high temperatures, proposing that high gc content results from selection favouring g:c pairs over less stable a:t pairs. against


GROUND TRUTH ABSTRACT:
 among bacteria and archaea, amino acid usage is correlated with habitat temperatures. in particular, protein surfaces in species thriving at higher temperatures appear to be enriched in amino acids that stabilize protein structure and depleted in amino acids that decrease thermostability. does this observation reflect a causal relationship, or could the apparent trend be caused by phylogenetic relatedness among sampled organisms living at diffe

In [8]:
# Task 3: Generate Baseline Summaries with Multiple Prompts
# ==========================================================
# Define baseline prompts
BASELINE_PROMPTS = {
    "prompt_1": "Summarize this text:\n\n{text}\n\nSummary:",
    "prompt_2": "Summarize the following medical research:\n\n{text}\n\nSummary:",
    "prompt_3": "Provide a brief summary:\n\n{text}\n\nSummary:",
    "prompt_4": "TL;DR:\n\n{text}\n\nTL;DR:"
}
# Fixed sample indices for reproducibility
sample_indices = [0, 100, 500, 1000, 1500, 2000, 2500, 3000]
# Assign 2 samples per prompt
prompt_sample_mapping = {
    "prompt_1": [0, 100],
    "prompt_2": [500, 1000],
    "prompt_3": [1500, 2000],
    "prompt_4": [2500, 3000]
}
# Document configuration
print("=" * 60)
print("BASELINE CONFIGURATION")
print("=" * 60)
print(f"Model: gpt2")
print(f"Parameters: 124M")
print(f"Transformers version: 4.40.0")
print(f"Max input tokens: 800")
print(f"Max new tokens: 150")
print(f"Num beams: 4")
print(f"No repeat ngram size: 3")
print(f"Do sample: False (deterministic)")
print("=" * 60)
# Generate summaries
baseline_results = []
output_count = 0
print("\nGENERATING BASELINE SUMMARIES...\n")
for prompt_name, indices in prompt_sample_mapping.items():
    prompt_template = BASELINE_PROMPTS[prompt_name]
    
    for idx in indices:
        output_count += 1
        example = dataset["test"][idx]
        generated = generate_summary(example["text"], prompt_template=prompt_template)
        
        baseline_results.append({
            "output_num": output_count,
            "index": idx,
            "prompt_name": prompt_name,
            "prompt_template": prompt_template,
            "input_text": example["text"],
            "ground_truth": example["abstract"],
            "generated": generated
        })
        
        print(f"{'='*60}")
        print(f"OUTPUT {output_count} | {prompt_name.upper()} | Sample index {idx}")
        print(f"{'='*60}")
        print(f"\nPROMPT: {prompt_template[:60]}...")
        print(f"\nINPUT (first 300 chars):\n{example['text'][:300]}...")
        print(f"\nGROUND TRUTH (first 200 chars):\n{example['abstract'][:200]}...")
        print(f"\nGENERATED SUMMARY:\n{generated}")
        print()
print(f"\n{'='*60}")
print(f"Generated {len(baseline_results)} baseline outputs using {len(BASELINE_PROMPTS)} prompts")
print(f"{'='*60}")

BASELINE CONFIGURATION
Model: gpt2
Parameters: 124M
Transformers version: 4.40.0
Max input tokens: 800
Max new tokens: 150
Num beams: 4
No repeat ngram size: 3
Do sample: False (deterministic)

GENERATING BASELINE SUMMARIES...

OUTPUT 1 | PROMPT_1 | Sample index 0

PROMPT: Summarize this text:

{text}

Summary:...

INPUT (first 300 chars):
 evolutionary molecular biology is mostly concerned with the forces affecting individual genes. however, observations of variable proportions of guanine and cytosine in different species and in different genomic regions of vertebrates have prompted the analysis of forces that may affect the evolutio...

GROUND TRUTH (first 200 chars):
 among bacteria and archaea, amino acid usage is correlated with habitat temperatures. in particular, protein surfaces in species thriving at higher temperatures appear to be enriched in amino acids t...

GENERATED SUMMARY:
In this paper, we show that gc is an important determinant of the evolutionary history of vertebr

# Task 4 - Baseline Evaluation


In [9]:
# Install evaluation libraries
!pip install rouge-score bert-score



In [10]:
# Task 4: Evaluate Baseline - ROUGE Scores
# =========================================
from rouge_score import rouge_scorer
import numpy as np
# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
# Store scores
rouge1_scores = []
rouge2_scores = []
rougeL_scores = []
print("=" * 60)
print("ROUGE EVALUATION")
print("=" * 60 + "\n")
for result in baseline_results:
    scores = scorer.score(result["ground_truth"], result["generated"])
    
    rouge1_scores.append(scores["rouge1"].fmeasure)
    rouge2_scores.append(scores["rouge2"].fmeasure)
    rougeL_scores.append(scores["rougeL"].fmeasure)
    
    print(f"Sample {result['output_num']} ({result['prompt_name']}): "
          f"R1={scores['rouge1'].fmeasure:.3f}, "
          f"R2={scores['rouge2'].fmeasure:.3f}, "
          f"RL={scores['rougeL'].fmeasure:.3f}")
# Averages
avg_rouge1 = np.mean(rouge1_scores)
avg_rouge2 = np.mean(rouge2_scores)
avg_rougeL = np.mean(rougeL_scores)
print("\n" + "-" * 60)
print("ROUGE AVERAGES:")
print(f"  ROUGE-1: {avg_rouge1:.4f}")
print(f"  ROUGE-2: {avg_rouge2:.4f}")
print(f"  ROUGE-L: {avg_rougeL:.4f}")

ROUGE EVALUATION

Sample 1 (prompt_1): R1=0.301, R2=0.025, RL=0.150
Sample 2 (prompt_1): R1=0.290, R2=0.061, RL=0.148
Sample 3 (prompt_2): R1=0.186, R2=0.005, RL=0.115
Sample 4 (prompt_2): R1=0.306, R2=0.015, RL=0.138
Sample 5 (prompt_3): R1=0.350, R2=0.059, RL=0.162
Sample 6 (prompt_3): R1=0.262, R2=0.025, RL=0.131
Sample 7 (prompt_4): R1=0.297, R2=0.068, RL=0.171
Sample 8 (prompt_4): R1=0.333, R2=0.091, RL=0.209

------------------------------------------------------------
ROUGE AVERAGES:
  ROUGE-1: 0.2907
  ROUGE-2: 0.0437
  ROUGE-L: 0.1529


In [11]:
# Task 4: Evaluate Baseline - BERTScore
# =====================================
from bert_score import score as bert_score
print("=" * 60)
print("BERTSCORE EVALUATION")
print("=" * 60)
print("\nCalculating BERTScore (this may take a minute)...\n")
generated_texts = [r["generated"] for r in baseline_results]
reference_texts = [r["ground_truth"] for r in baseline_results]
P, R, F1 = bert_score(generated_texts, reference_texts, lang="en", verbose=True)
# Individual scores
for i, result in enumerate(baseline_results):
    print(f"Sample {result['output_num']} ({result['prompt_name']}): "
          f"P={P[i]:.3f}, R={R[i]:.3f}, F1={F1[i]:.3f}")
# Averages
avg_bert_p = P.mean().item()
avg_bert_r = R.mean().item()
avg_bert_f1 = F1.mean().item()
print("\n" + "-" * 60)
print("BERTSCORE AVERAGES:")
print(f"  Precision: {avg_bert_p:.4f}")
print(f"  Recall:    {avg_bert_r:.4f}")
print(f"  F1:        {avg_bert_f1:.4f}")

BERTSCORE EVALUATION

Calculating BERTScore (this may take a minute)...



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:05<00:00,  5.04s/it]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 100.13it/s]

done in 5.06 seconds, 1.58 sentences/sec
Sample 1 (prompt_1): P=0.833, R=0.809, F1=0.821
Sample 2 (prompt_1): P=0.816, R=0.816, F1=0.816
Sample 3 (prompt_2): P=0.814, R=0.770, F1=0.792
Sample 4 (prompt_2): P=0.833, R=0.812, F1=0.822
Sample 5 (prompt_3): P=0.845, R=0.802, F1=0.823
Sample 6 (prompt_3): P=0.813, R=0.775, F1=0.793
Sample 7 (prompt_4): P=0.864, R=0.802, F1=0.832
Sample 8 (prompt_4): P=0.834, R=0.817, F1=0.825

------------------------------------------------------------
BERTSCORE AVERAGES:
  Precision: 0.8315
  Recall:    0.8004
  F1:        0.8156





In [12]:
# Task 4: Baseline Evaluation Summary
# ====================================
print("=" * 60)
print("BASELINE EVALUATION SUMMARY")
print("=" * 60)
print(f"""
┌─────────────────────────────────────────┐
│  BASELINE RESULTS                       │
├─────────────────────────────────────────┤
│  ROUGE-1:        {avg_rouge1:.4f}                   │
│  ROUGE-2:        {avg_rouge2:.4f}                   │
│  ROUGE-L:        {avg_rougeL:.4f}                   │
│  BERTScore F1:   {avg_bert_f1:.4f}                   │
└─────────────────────────────────────────┘
""")
# Store for later comparison (Task 9)
baseline_metrics = {
    "rouge1": avg_rouge1,
    "rouge2": avg_rouge2,
    "rougeL": avg_rougeL,
    "bertscore_p": avg_bert_p,
    "bertscore_r": avg_bert_r,
    "bertscore_f1": avg_bert_f1
}
print("Baseline metrics stored in 'baseline_metrics' for later comparison")

BASELINE EVALUATION SUMMARY

┌─────────────────────────────────────────┐
│  BASELINE RESULTS                       │
├─────────────────────────────────────────┤
│  ROUGE-1:        0.2907                   │
│  ROUGE-2:        0.0437                   │
│  ROUGE-L:        0.1529                   │
│  BERTScore F1:   0.8156                   │
└─────────────────────────────────────────┘

Baseline metrics stored in 'baseline_metrics' for later comparison


# TASK 5 - PROMPT Engineering 

In [13]:
# Task 5: Prompt Engineering - Versioned Prompts with Ablation
# ============================================================
PROMPT_VERSIONS = {
    "v1_role": {
        "template": "You are a medical researcher. Summarize this research:\n\n{text}\n\nSummary:",
        "change": "Role framing (starting point)"
    },
    
    "v2_audience": {
        "template": "You are a medical researcher. Summarize this research for a patient with no medical background:\n\n{text}\n\nSummary:",
        "change": "+ Audience targeting"
    },
    
    "v3_constraints": {
        "template": "You are a medical researcher. Summarize this research for a patient with no medical background. Only include factual claims from the text. Do not include acknowledgments or references.\n\n{text}\n\nSummary:",
        "change": "+ Constraints"
    },
    
    "v4_structure": {
        "template": "You are a medical researcher. Summarize this research for a patient with no medical background. Only include factual claims from the text. Do not include acknowledgments or references.\n\nSummarize in 3 points:\n1) What was studied\n2) Key findings\n3) Implications\n\n{text}\n\nSummary:",
        "change": "+ Structured output"
    },
    
    "v5_fewshot": {
        "template": """You are a medical researcher. Summarize this research for a patient with no medical background. Only include factual claims from the text. Do not include acknowledgments or references.
Example:
Text: "This study examined the effects of vitamin D supplementation on bone density in elderly patients. 200 participants received either vitamin D or placebo for 12 months. Results showed a 15% improvement in bone density in the treatment group."
Summary: 1) Studied vitamin D supplements and bone health in elderly patients. 2) Found 15% improvement in bone density with vitamin D. 3) Suggests vitamin D may help prevent bone loss in older adults.
Now summarize in 3 points:
{text}
Summary:""",
        "change": "+ Few-shot example"
    }
}
# Same indices as baseline for fair comparison
sample_indices = [0, 100, 500, 1000, 1500, 2000, 2500, 3000]
# Print summary
print("=" * 60)
print("PROMPT ENGINEERING: VERSION OVERVIEW")
print("=" * 60)
for version, info in PROMPT_VERSIONS.items():
    print(f"\n{version.upper()}")
    print(f"  Change: {info['change']}")
    print(f"  Template preview: {info['template'][:60]}...")
print("\n" + "=" * 60)
print(f"Testing on {len(sample_indices)} samples: {sample_indices}")
print("=" * 60)

PROMPT ENGINEERING: VERSION OVERVIEW

V1_ROLE
  Change: Role framing (starting point)
  Template preview: You are a medical researcher. Summarize this research:

{tex...

V2_AUDIENCE
  Change: + Audience targeting
  Template preview: You are a medical researcher. Summarize this research for a ...

V3_CONSTRAINTS
  Change: + Constraints
  Template preview: You are a medical researcher. Summarize this research for a ...

V4_STRUCTURE
  Change: + Structured output
  Template preview: You are a medical researcher. Summarize this research for a ...

V5_FEWSHOT
  Change: + Few-shot example
  Template preview: You are a medical researcher. Summarize this research for a ...

Testing on 8 samples: [0, 100, 500, 1000, 1500, 2000, 2500, 3000]


In [14]:
# Task 5: Generate Summaries for Each Prompt Version
# ===================================================
prompt_engineering_results = {}
print("=" * 60)
print("GENERATING SUMMARIES FOR EACH PROMPT VERSION")
print("=" * 60)
for version_name, version_info in PROMPT_VERSIONS.items():
    print(f"\n{'='*60}")
    print(f"Processing: {version_name.upper()}")
    print(f"Change: {version_info['change']}")
    print(f"{'='*60}")
    
    version_results = []
    
    for i, idx in enumerate(sample_indices):
        example = dataset["test"][idx]
        generated = generate_summary(
            example["text"], 
            prompt_template=version_info["template"]
        )
        
        version_results.append({
            "index": idx,
            "ground_truth": example["abstract"],
            "generated": generated
        })
        
        print(f"\nSample {i+1} (index {idx}):")
        print(f"Generated: {generated[:150]}...")
    
    prompt_engineering_results[version_name] = version_results
    print(f"\n {version_name}: Generated {len(version_results)} summaries")
print("\n" + "=" * 60)
print(f" All {len(PROMPT_VERSIONS)} prompt versions processed")
print(f" Results stored in 'prompt_engineering_results'")
print("=" * 60)

GENERATING SUMMARIES FOR EACH PROMPT VERSION

Processing: V1_ROLE
Change: Role framing (starting point)

Sample 1 (index 0):
Generated: In this paper, we present the results of a large-scale comparative study of the effects of high temperature and low gc on the development of a complet...

Sample 2 (index 100):
Generated: In this paper, we present the results of a large-scale phylogenetic analysis of a wide range of pathogenic organisms. We show that the phylogenetic re...

Sample 3 (index 500):
Generated: Phytoplosan is a highly toxic and highly toxic plant organism. It has been shown that it can cause serious health problems in humans and animals. It i...

Sample 4 (index 1000):
Generated: In this paper, we present the results of a large genome-wide association study (GWAS) of the human genome. The results of the study are summarized in ...

Sample 5 (index 1500):
Generated: In this paper, we present the results of a systematic review and meta-analysis of genome-wide association stud

# Evaluate All Prompt Versions (ROUGE)

In [15]:
# Task 5: Evaluate Prompt Versions - ROUGE Scores
# ================================================
from rouge_score import rouge_scorer
import numpy as np
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
# Store metrics for each version
prompt_engineering_metrics = {}
print("=" * 60)
print("EVALUATING PROMPT VERSIONS - ROUGE SCORES")
print("=" * 60)
for version_name, results in prompt_engineering_results.items():
    rouge1_scores = []
    rouge2_scores = []
    rougeL_scores = []
    
    for result in results:
        scores = scorer.score(result["ground_truth"], result["generated"])
        rouge1_scores.append(scores["rouge1"].fmeasure)
        rouge2_scores.append(scores["rouge2"].fmeasure)
        rougeL_scores.append(scores["rougeL"].fmeasure)
    
    prompt_engineering_metrics[version_name] = {
        "rouge1": np.mean(rouge1_scores),
        "rouge2": np.mean(rouge2_scores),
        "rougeL": np.mean(rougeL_scores)
    }
    
    print(f"\n{version_name.upper()}")
    print(f"  ROUGE-1: {np.mean(rouge1_scores):.4f}")
    print(f"  ROUGE-2: {np.mean(rouge2_scores):.4f}")
    print(f"  ROUGE-L: {np.mean(rougeL_scores):.4f}")
print("\n" + "=" * 60)
print("ROUGE evaluation complete")
print("=" * 60)

EVALUATING PROMPT VERSIONS - ROUGE SCORES

V1_ROLE
  ROUGE-1: 0.2503
  ROUGE-2: 0.0389
  ROUGE-L: 0.1424

V2_AUDIENCE
  ROUGE-1: 0.2588
  ROUGE-2: 0.0360
  ROUGE-L: 0.1442

V3_CONSTRAINTS
  ROUGE-1: 0.2608
  ROUGE-2: 0.0305
  ROUGE-L: 0.1392

V4_STRUCTURE
  ROUGE-1: 0.2917
  ROUGE-2: 0.0416
  ROUGE-L: 0.1539

V5_FEWSHOT
  ROUGE-1: 0.1966
  ROUGE-2: 0.0194
  ROUGE-L: 0.1229

ROUGE evaluation complete


# Evaluate All Prompt Versions (BERTScore)

In [16]:
# Task 5: Evaluate Prompt Versions - BERTScore
# =============================================
from bert_score import score as bert_score
print("=" * 60)
print("EVALUATING PROMPT VERSIONS - BERTSCORE")
print("=" * 60)
print("\nThis may take a few minutes...\n")
for version_name, results in prompt_engineering_results.items():
    generated_texts = [r["generated"] for r in results]
    reference_texts = [r["ground_truth"] for r in results]
    
    P, R, F1 = bert_score(generated_texts, reference_texts, lang="en", verbose=False)
    
    prompt_engineering_metrics[version_name]["bertscore_p"] = P.mean().item()
    prompt_engineering_metrics[version_name]["bertscore_r"] = R.mean().item()
    prompt_engineering_metrics[version_name]["bertscore_f1"] = F1.mean().item()
    
    print(f"{version_name.upper()}: BERTScore F1 = {F1.mean().item():.4f}")
print("\n" + "=" * 60)
print("BERTScore evaluation complete")
print("=" * 60)

EVALUATING PROMPT VERSIONS - BERTSCORE

This may take a few minutes...



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


V1_ROLE: BERTScore F1 = 0.8071


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


V2_AUDIENCE: BERTScore F1 = 0.8102


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


V3_CONSTRAINTS: BERTScore F1 = 0.8122


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


V4_STRUCTURE: BERTScore F1 = 0.8167


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


V5_FEWSHOT: BERTScore F1 = 0.7932

BERTScore evaluation complete


 # Before/After Comparison Grid (Ablation Table)

In [17]:
# Task 5: Ablation Analysis - Before/After Grid
# ==============================================
print("=" * 60)
print("ABLATION ANALYSIS: BEFORE/AFTER GRID")
print("=" * 60)
# Header
print(f"\n{'Version':<15} {'Change':<30} {'R-1':<8} {'R-2':<8} {'R-L':<8} {'BERT-F1':<8} {'Δ R-L':<8}")
print("-" * 95)
# Baseline reference
baseline_rougeL = baseline_metrics["rougeL"]
print(f"{'Baseline':<15} {'(Task 3 best)':<30} {baseline_metrics['rouge1']:.4f}   {baseline_metrics['rouge2']:.4f}   {baseline_rougeL:.4f}   {baseline_metrics['bertscore_f1']:.4f}   {'--':<8}")
# Previous version for delta calculation
prev_rougeL = baseline_rougeL
# Each prompt engineering version
for version_name, version_info in PROMPT_VERSIONS.items():
    metrics = prompt_engineering_metrics[version_name]
    delta = metrics["rougeL"] - prev_rougeL
    delta_str = f"{delta:+.4f}"
    
    print(f"{version_name:<15} {version_info['change']:<30} {metrics['rouge1']:.4f}   {metrics['rouge2']:.4f}   {metrics['rougeL']:.4f}   {metrics['bertscore_f1']:.4f}   {delta_str:<8}")
    
    prev_rougeL = metrics["rougeL"]
print("-" * 95)
# Overall improvement
best_version = max(prompt_engineering_metrics.keys(), key=lambda x: prompt_engineering_metrics[x]["rougeL"])
best_rougeL = prompt_engineering_metrics[best_version]["rougeL"]
total_improvement = best_rougeL - baseline_rougeL
print(f"\n{'BEST VERSION:':<15} {best_version}")
print(f"{'TOTAL Δ R-L:':<15} {total_improvement:+.4f} (from {baseline_rougeL:.4f} to {best_rougeL:.4f})")
print(f"{'IMPROVEMENT:':<15} {(total_improvement/baseline_rougeL)*100:.1f}%")
print("\n" + "=" * 60)
print("Ablation analysis complete")
print("=" * 60)

ABLATION ANALYSIS: BEFORE/AFTER GRID

Version         Change                         R-1      R-2      R-L      BERT-F1  Δ R-L   
-----------------------------------------------------------------------------------------------
Baseline        (Task 3 best)                  0.2907   0.0437   0.1529   0.8156   --      
v1_role         Role framing (starting point)  0.2503   0.0389   0.1424   0.8071   -0.0105 
v2_audience     + Audience targeting           0.2588   0.0360   0.1442   0.8102   +0.0019 
v3_constraints  + Constraints                  0.2608   0.0305   0.1392   0.8122   -0.0050 
v4_structure    + Structured output            0.2917   0.0416   0.1539   0.8167   +0.0146 
v5_fewshot      + Few-shot example             0.1966   0.0194   0.1229   0.7932   -0.0310 
-----------------------------------------------------------------------------------------------

BEST VERSION:   v4_structure
TOTAL Δ R-L:    +0.0010 (from 0.1529 to 0.1539)
IMPROVEMENT:    0.6%

Ablation analysis complete

In [18]:
# Task 5: Temperature Variation Test
# ===================================
# Test on best performing prompt (v4_structure)
TEMPERATURE_VALUES = [0.0, 0.3, 0.7, 1.0]
temperature_results = {}
# Use v4_structure template (best performer)
best_template = PROMPT_VERSIONS["v4_structure"]["template"]
print("=" * 60)
print("TEMPERATURE VARIATION TEST (using v4_structure)")
print("=" * 60)
for temp in TEMPERATURE_VALUES:
    print(f"\nTesting temperature = {temp}...")
    
    temp_outputs = []
    for idx in sample_indices[:4]:  # Test on first 4 samples for speed
        example = dataset["test"][idx]
        
        # Modify generate_summary to use temperature
        truncated_text = example["text"][:1500]
        prompt = best_template.format(text=truncated_text)
        
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=800)
        
        output = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=150,
            num_beams=1,  # Beam search doesn't work with temperature
            do_sample=True if temp > 0 else False,
            temperature=temp if temp > 0 else 1.0,
            pad_token_id=tokenizer.eos_token_id
        )
        
        full_output = tokenizer.decode(output[0], skip_special_tokens=True)
        generated = full_output[len(prompt):].strip()
        temp_outputs.append({"generated": generated, "ground_truth": example["abstract"]})
    
    # Calculate ROUGE
    scores = [scorer.score(r["ground_truth"], r["generated"])["rougeL"].fmeasure for r in temp_outputs]
    avg_rougeL = np.mean(scores)
    
    temperature_results[temp] = avg_rougeL
    print(f"  Temperature {temp}: ROUGE-L = {avg_rougeL:.4f}")
print("\n" + "=" * 60)
print("TEMPERATURE COMPARISON")
print("=" * 60)
for temp, score in temperature_results.items():
    print(f"  temp={temp}: {score:.4f}")

TEMPERATURE VARIATION TEST (using v4_structure)

Testing temperature = 0.0...
  Temperature 0.0: ROUGE-L = 0.1010

Testing temperature = 0.3...
  Temperature 0.3: ROUGE-L = 0.1513

Testing temperature = 0.7...
  Temperature 0.7: ROUGE-L = 0.1396

Testing temperature = 1.0...
  Temperature 1.0: ROUGE-L = 0.1223

TEMPERATURE COMPARISON
  temp=0.0: 0.1010
  temp=0.3: 0.1513
  temp=0.7: 0.1396
  temp=1.0: 0.1223


# Further Testing 

In [19]:
# Experiment: Non-Stacked Prompt - Constraints Only
# ==================================================
# Testing if constraints ALONE perform better than stacked prompts
# (removes v1_role which hurt performance)
constraints_only_template = "Only include factual claims from the text. Do not include acknowledgments or references.\n\n{text}\n\nSummary:"
print("="*60)
print("TESTING: CONSTRAINTS ONLY (Non-Stacked)")
print("="*60)
constraints_only_results = []
for idx in sample_indices:
    example = dataset["test"][idx]
    generated = generate_summary(example["text"], prompt_template=constraints_only_template)
    
    constraints_only_results.append({
        "index": idx,
        "ground_truth": example["abstract"],
        "generated": generated
    })
# Evaluate
rouge_scores = []
for result in constraints_only_results:
    scores = scorer.score(result["ground_truth"], result["generated"])
    rouge_scores.append(scores["rougeL"].fmeasure)
avg_rougeL = np.mean(rouge_scores)
print(f"\nConstraints Only ROUGE-L: {avg_rougeL:.4f}")
print(f"Baseline ROUGE-L:         {baseline_metrics['rougeL']:.4f}")
print(f"v3_constraints (stacked): {prompt_engineering_metrics['v3_constraints']['rougeL']:.4f}")
print(f"\nDifference vs baseline:   {avg_rougeL - baseline_metrics['rougeL']:+.4f}")

TESTING: CONSTRAINTS ONLY (Non-Stacked)

Constraints Only ROUGE-L: 0.1423
Baseline ROUGE-L:         0.1529
v3_constraints (stacked): 0.1392

Difference vs baseline:   -0.0106


In [20]:
# Experiment: Non-Stacked Prompt - Structure Only
# ================================================
# Testing if structured output format helps on its own
structure_only_template = "Summarize in 3 points:\n1) What was studied\n2) Key findings\n3) Implications\n\n{text}\n\nSummary:"
print("="*60)
print("TESTING: STRUCTURE ONLY (Non-Stacked)")
print("="*60)
structure_only_results = []
for idx in sample_indices:
    example = dataset["test"][idx]
    generated = generate_summary(example["text"], prompt_template=structure_only_template)
    
    structure_only_results.append({
        "index": idx,
        "ground_truth": example["abstract"],
        "generated": generated
    })
# Evaluate
rouge_scores = []
for result in structure_only_results:
    scores = scorer.score(result["ground_truth"], result["generated"])
    rouge_scores.append(scores["rougeL"].fmeasure)
avg_rougeL = np.mean(rouge_scores)
print(f"\nStructure Only ROUGE-L:   {avg_rougeL:.4f}")
print(f"Baseline ROUGE-L:         {baseline_metrics['rougeL']:.4f}")
print(f"v4_structure (stacked):   {prompt_engineering_metrics['v4_structure']['rougeL']:.4f}")
print(f"\nDifference vs baseline:   {avg_rougeL - baseline_metrics['rougeL']:+.4f}")

TESTING: STRUCTURE ONLY (Non-Stacked)

Structure Only ROUGE-L:   0.1472
Baseline ROUGE-L:         0.1529
v4_structure (stacked):   0.1539

Difference vs baseline:   -0.0058


# TASK 7 - Fine-Tune Generator with LoRA

In [21]:
# Install PEFT (Parameter-Efficient Fine-Tuning) library
!pip install peft==0.11.0 accelerate



In [22]:
# Task 7: Configure LoRA for GPT-2 Fine-Tuning
# ============================================
from peft import LoraConfig, get_peft_model, TaskType
import torch

# LoRA Configuration
# Based on Hu et al. (2022) recommendations
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,  # GPT-2 is a causal language model
    r=8,                            # Rank of low-rank matrices
    lora_alpha=32,                  # Scaling factor
    lora_dropout=0.1,               # Dropout for regularization
    target_modules=["c_attn"],      # GPT-2 attention projection
)

print("="*60)
print("LoRA CONFIGURATION")
print("="*60)
print(f"Rank (r): {lora_config.r}")
print(f"Alpha: {lora_config.lora_alpha}")
print(f"Dropout: {lora_config.lora_dropout}")
print(f"Target modules: {lora_config.target_modules}")
print("="*60)

LoRA CONFIGURATION
Rank (r): 8
Alpha: 32
Dropout: 0.1
Target modules: {'c_attn'}


In [23]:
# Apply LoRA to GPT-2
# ===================
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load fresh model for fine-tuning
print("Loading fresh GPT-2 model for fine-tuning...")
base_model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
base_model.config.pad_token_id = tokenizer.eos_token_id

# Apply LoRA
print("Applying LoRA adapters...")
lora_model = get_peft_model(base_model, lora_config)

# Print trainable parameters
trainable_params = sum(p.numel() for p in lora_model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in lora_model.parameters())
print(f"\n{'='*60}")
print("PARAMETER EFFICIENCY")
print(f"{'='*60}")
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print(f"Trainable %: {100 * trainable_params / total_params:.4f}%")
print(f"{'='*60}")

Loading fresh GPT-2 model for fine-tuning...
Applying LoRA adapters...

PARAMETER EFFICIENCY
Total parameters: 124,734,720
Trainable parameters: 294,912
Trainable %: 0.2364%




In [24]:
# Prepare Training Data
# =====================
from torch.utils.data import Dataset, DataLoader

class SummarizationDataset(Dataset):
    """Dataset for summarization fine-tuning."""
    
    def __init__(self, dataset_split, tokenizer, max_length=512, num_samples=500):
        self.examples = []
        self.tokenizer = tokenizer
        self.max_length = max_length
        
        # Use subset for training (laptop-friendly)
        for i in range(min(num_samples, len(dataset_split))):
            example = dataset_split[i]
            # Format: input text -> summary
            text = example["text"][:1000]  # Truncate input
            summary = example["abstract"][:300]  # Truncate summary
            
            # Create training format
            prompt = f"Summarize:\n{text}\n\nSummary: {summary}"
            self.examples.append(prompt)
    
    def __len__(self):
        return len(self.examples)
    
    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.examples[idx],
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt"
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(),
            "attention_mask": encoding["attention_mask"].squeeze(),
            "labels": encoding["input_ids"].squeeze()  # For causal LM, labels = input_ids
        }

# Create datasets
print("Preparing training data...")
train_dataset = SummarizationDataset(dataset["train"], tokenizer, num_samples=500)
print(f"Training samples: {len(train_dataset)}")

# Create DataLoader
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
print(f"Batches per epoch: {len(train_loader)}")

Preparing training data...
Training samples: 500
Batches per epoch: 250


In [25]:
# Task 7: Fine-Tune with LoRA
# ===========================
from torch.optim import AdamW
from tqdm import tqdm

# Training configuration
EPOCHS = 3
LEARNING_RATE = 5e-4
DEVICE = "mps" if torch.backends.mps.is_available() else "cpu"

print(f"\n{'='*60}")
print("FINE-TUNING CONFIGURATION")
print(f"{'='*60}")
print(f"Device: {DEVICE}")
print(f"Epochs: {EPOCHS}")
print(f"Learning rate: {LEARNING_RATE}")
print(f"Batch size: 2")
print(f"Training samples: {len(train_dataset)}")
print(f"{'='*60}\n")

# Move model to device
lora_model = lora_model.to(DEVICE)

# Optimizer
optimizer = AdamW(lora_model.parameters(), lr=LEARNING_RATE)

# Training loop
lora_model.train()
training_losses = []

for epoch in range(EPOCHS):
    epoch_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}")
    
    for batch in progress_bar:
        # Move batch to device
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)
        
        # Forward pass
        outputs = lora_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        loss = outputs.loss
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        progress_bar.set_postfix({"loss": f"{loss.item():.4f}"})
    
    avg_loss = epoch_loss / len(train_loader)
    training_losses.append(avg_loss)
    print(f"Epoch {epoch+1} Average Loss: {avg_loss:.4f}")

print(f"\n{'='*60}")
print("TRAINING COMPLETE")
print(f"{'='*60}")
print(f"Final loss: {training_losses[-1]:.4f}")
print(f"Loss reduction: {training_losses[0] - training_losses[-1]:.4f}")


FINE-TUNING CONFIGURATION
Device: mps
Epochs: 3
Learning rate: 0.0005
Batch size: 2
Training samples: 500



Epoch 1/3: 100%|██████████| 250/250 [01:51<00:00,  2.24it/s, loss=1.8618]


Epoch 1 Average Loss: 1.9764


Epoch 2/3: 100%|██████████| 250/250 [01:46<00:00,  2.34it/s, loss=1.7919]


Epoch 2 Average Loss: 1.7532


Epoch 3/3: 100%|██████████| 250/250 [01:45<00:00,  2.37it/s, loss=1.7109]

Epoch 3 Average Loss: 1.7284

TRAINING COMPLETE
Final loss: 1.7284
Loss reduction: 0.2480





In [26]:
# Save the fine-tuned LoRA adapters
# =================================
import os

output_dir = "./lora_gpt2_medical"
os.makedirs(output_dir, exist_ok=True)

lora_model.save_pretrained(output_dir)
print(f"LoRA adapters saved to: {output_dir}")
print(f"Files saved: {os.listdir(output_dir)}")

LoRA adapters saved to: ./lora_gpt2_medical
Files saved: ['adapter_model.safetensors', 'README.md', 'adapter_config.json']


# TASK 8 - Evaluate Fine-Tuned Model

In [27]:
# Task 8: Generate Summaries with Fine-Tuned Model
# =================================================

def generate_summary_finetuned(text, model, tokenizer, max_new_tokens=150):
    """Generate summary using fine-tuned model."""
    model.eval()
    
    # Use same format as training
    prompt = f"Summarize:\n{text[:1000]}\n\nSummary:"
    
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=800
    ).to(DEVICE)
    
    with torch.no_grad():
        output = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=max_new_tokens,
            num_beams=4,
            early_stopping=True,
            no_repeat_ngram_size=3,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=False
        )
    
    full_output = tokenizer.decode(output[0], skip_special_tokens=True)
    # Extract only the summary part
    if "Summary:" in full_output:
        summary = full_output.split("Summary:")[-1].strip()
    else:
        summary = full_output[len(prompt):].strip()
    
    return summary

# Generate summaries on test set
print("="*60)
print("GENERATING FINE-TUNED SUMMARIES")
print("="*60)

finetuned_results = []
sample_indices = [0, 100, 500, 1000, 1500, 2000, 2500, 3000]  # Same as baseline

for i, idx in enumerate(sample_indices):
    example = dataset["test"][idx]
    generated = generate_summary_finetuned(example["text"], lora_model, tokenizer)
    
    finetuned_results.append({
        "index": idx,
        "ground_truth": example["abstract"],
        "generated": generated
    })
    
    print(f"\nSample {i+1} (index {idx}):")
    print(f"Generated: {generated[:150]}...")

print(f"\n{'='*60}")
print(f"Generated {len(finetuned_results)} summaries")
print(f"{'='*60}")

GENERATING FINE-TUNED SUMMARIES

Sample 1 (index 0):
Generated: high-gc content in vertebrates has been proposed as a possible explanation for the evolutionary divergence between vertebrates and other vertebrates. ...

Sample 2 (index 100):
Generated: we have developed a method that allows for the comparative analysis of evolutionary changes among species. this method is based on the concept of a ph...

Sample 3 (index 500):
Generated: the phytophthora infestans is one of the most common pathogenic fungi in the world. the pathogen has been implicated in the pathogenesis of many disea...

Sample 4 (index 1000):
Generated: genome-wide association studies have been conducted to investigate the relationship between gene expression and gene function. in this study, we exami...

Sample 5 (index 1500):
Generated: genome-wide association studies (GWAS) are an important tool for the identification of novel regulatory elements. the aim of the present study was to ...

Sample 6 (index 2000):
Gene

In [28]:
# Task 8: Evaluate Fine-Tuned Model
# =================================
from rouge_score import rouge_scorer
from bert_score import score as bert_score
import numpy as np

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Calculate ROUGE
print("="*60)
print("FINE-TUNED MODEL EVALUATION")
print("="*60)

rouge1_scores = []
rouge2_scores = []
rougeL_scores = []

for result in finetuned_results:
    scores = scorer.score(result["ground_truth"], result["generated"])
    rouge1_scores.append(scores["rouge1"].fmeasure)
    rouge2_scores.append(scores["rouge2"].fmeasure)
    rougeL_scores.append(scores["rougeL"].fmeasure)

# Calculate BERTScore
generated_texts = [r["generated"] for r in finetuned_results]
reference_texts = [r["ground_truth"] for r in finetuned_results]
P, R, F1 = bert_score(generated_texts, reference_texts, lang="en", verbose=False)

# Store metrics
finetuned_metrics = {
    "rouge1": np.mean(rouge1_scores),
    "rouge2": np.mean(rouge2_scores),
    "rougeL": np.mean(rougeL_scores),
    "bertscore_p": P.mean().item(),
    "bertscore_r": R.mean().item(),
    "bertscore_f1": F1.mean().item()
}

print(f"\nROUGE-1: {finetuned_metrics['rouge1']:.4f}")
print(f"ROUGE-2: {finetuned_metrics['rouge2']:.4f}")
print(f"ROUGE-L: {finetuned_metrics['rougeL']:.4f}")
print(f"BERTScore F1: {finetuned_metrics['bertscore_f1']:.4f}")

# Compare to baseline
print(f"\n{'='*60}")
print("IMPROVEMENT OVER BASELINE")
print(f"{'='*60}")
print(f"ROUGE-L: {baseline_metrics['rougeL']:.4f} -> {finetuned_metrics['rougeL']:.4f} ({(finetuned_metrics['rougeL']/baseline_metrics['rougeL']-1)*100:+.1f}%)")
print(f"BERTScore: {baseline_metrics['bertscore_f1']:.4f} -> {finetuned_metrics['bertscore_f1']:.4f} ({(finetuned_metrics['bertscore_f1']/baseline_metrics['bertscore_f1']-1)*100:+.1f}%)")

FINE-TUNED MODEL EVALUATION


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



ROUGE-1: 0.1938
ROUGE-2: 0.0370
ROUGE-L: 0.1188
BERTScore F1: 0.8289

IMPROVEMENT OVER BASELINE
ROUGE-L: 0.1529 -> 0.1188 (-22.3%)
BERTScore: 0.8156 -> 0.8289 (+1.6%)


In [30]:
def generate_summary_finetuned_tldr(text, model, tokenizer, max_new_tokens=150):
    """Generate summary using fine-tuned model with TL;DR prompt."""
    model.eval()
    
    # Use TL;DR format instead of Summarize
    prompt = f"TL;DR:\n{text[:1000]}\n\nTL;DR:"
    
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=800
    ).to(DEVICE)
    
    with torch.no_grad():
        output = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=max_new_tokens,
            num_beams=4,
            early_stopping=True,
            no_repeat_ngram_size=3,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=False
        )
    
    full_output = tokenizer.decode(output[0], skip_special_tokens=True)
    if "TL;DR:" in full_output:
        summary = full_output.split("TL;DR:")[-1].strip()
    else:
        summary = full_output[len(prompt):].strip()
    
    return summary
print("="*60)
print("TESTING: TL;DR PROMPT ON FINE-TUNED MODEL")
print("="*60)
tldr_finetuned_results = []
for idx in sample_indices:
    example = dataset["test"][idx]
    generated = generate_summary_finetuned_tldr(example["text"], lora_model, tokenizer)
    
    tldr_finetuned_results.append({
        "index": idx,
        "ground_truth": example["abstract"],
        "generated": generated
    })
# Evaluate
rouge_scores = []
for result in tldr_finetuned_results:
    scores = scorer.score(result["ground_truth"], result["generated"])
    rouge_scores.append(scores["rougeL"].fmeasure)
avg_rougeL = np.mean(rouge_scores)
print(f"\nFine-tuned + TL;DR ROUGE-L:     {avg_rougeL:.4f}")
print(f"Fine-tuned + Summarize ROUGE-L: {finetuned_metrics['rougeL']:.4f}")
print(f"Baseline (mixed prompts) ROUGE-L: {baseline_metrics['rougeL']:.4f}")

TESTING: TL;DR PROMPT ON FINE-TUNED MODEL

Fine-tuned + TL;DR ROUGE-L:     0.0842
Fine-tuned + Summarize ROUGE-L: 0.1188
Baseline (mixed prompts) ROUGE-L: 0.1529


# TASK 9 - Compare & Contrast All Approaches

In [29]:
# Task 9: Comprehensive Comparison
# =================================

print("="*70)
print("COMPREHENSIVE COMPARISON: ALL APPROACHES")
print("="*70)

# Get best prompt engineering result
best_prompt_version = "v4_structure"
best_prompt_metrics = prompt_engineering_metrics[best_prompt_version]

print(f"\n{'Approach':<25} {'ROUGE-1':<10} {'ROUGE-2':<10} {'ROUGE-L':<10} {'BERT-F1':<10}")
print("-"*70)

# Baseline
print(f"{'Baseline (TL;DR)':<25} {baseline_metrics['rouge1']:.4f}     {baseline_metrics['rouge2']:.4f}     {baseline_metrics['rougeL']:.4f}     {baseline_metrics['bertscore_f1']:.4f}")

# Best Prompt Engineering
print(f"{'Prompt Eng (v4_structure)':<25} {best_prompt_metrics['rouge1']:.4f}     {best_prompt_metrics['rouge2']:.4f}     {best_prompt_metrics['rougeL']:.4f}     {best_prompt_metrics['bertscore_f1']:.4f}")

# Fine-tuned
print(f"{'LoRA Fine-tuned':<25} {finetuned_metrics['rouge1']:.4f}     {finetuned_metrics['rouge2']:.4f}     {finetuned_metrics['rougeL']:.4f}     {finetuned_metrics['bertscore_f1']:.4f}")

print("-"*70)

# Calculate improvements
prompt_improvement = (best_prompt_metrics['rougeL'] / baseline_metrics['rougeL'] - 1) * 100
finetune_improvement = (finetuned_metrics['rougeL'] / baseline_metrics['rougeL'] - 1) * 100

print(f"\n{'='*70}")
print("SUMMARY")
print(f"{'='*70}")
print(f"Prompt engineering improvement: {prompt_improvement:+.1f}% ROUGE-L")
print(f"Fine-tuning improvement: {finetune_improvement:+.1f}% ROUGE-L")
print(f"\nBest approach: {'Fine-tuned' if finetuned_metrics['rougeL'] > best_prompt_metrics['rougeL'] else 'Prompt Engineering'}")

COMPREHENSIVE COMPARISON: ALL APPROACHES

Approach                  ROUGE-1    ROUGE-2    ROUGE-L    BERT-F1   
----------------------------------------------------------------------
Baseline (TL;DR)          0.2907     0.0437     0.1529     0.8156
Prompt Eng (v4_structure) 0.2917     0.0416     0.1539     0.8167
LoRA Fine-tuned           0.1938     0.0370     0.1188     0.8289
----------------------------------------------------------------------

SUMMARY
Prompt engineering improvement: +0.6% ROUGE-L
Fine-tuning improvement: -22.3% ROUGE-L

Best approach: Prompt Engineering
