In [1]:
# Cell 1: Install (run once)
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install datasets



In [2]:
# Load the SUMPUBMED dataset
from datasets import load_dataset
# Load the dataset; the name `dataset` is what later cells index by split.
dataset = load_dataset("Blaise-g/SumPubmed")
# One call, two output lines: the status message, then the printed dataset object.
print("Dataset loaded!", dataset, sep="\n")

  from .autonotebook import tqdm as notebook_tqdm


Dataset loaded!
DatasetDict({
    train: Dataset({
        features: ['line_text', 'filename_text', 'text', 'shorter_abstract', 'abstract'],
        num_rows: 26147
    })
    test: Dataset({
        features: ['line_text', 'filename_text', 'text', 'shorter_abstract', 'abstract'],
        num_rows: 3269
    })
    dev: Dataset({
        features: ['line_text', 'filename_text', 'text', 'shorter_abstract', 'abstract'],
        num_rows: 3268
    })
})


In [3]:
# Inspect the first training record: the opening 500 characters of the paper
# body, a divider, then the complete reference abstract.
example = dataset["train"][0]
print(
    "PAPER TEXT (first 500 chars):",
    example["text"][:500],
    "\n" + "=" * 50 + "\n",
    "ABSTRACT:",
    example["abstract"],
    sep="\n",
)

PAPER TEXT (first 500 chars):
 thioredoxins are widely distributed in nature from prokaryotes to eukaryotes. these proteins, which belong to the oxidoreductase thiol:disulfide superfamily, are characterized by the active site signature sequence wcxxc. this sequence motif constitutes the redox center mediating the isomerization of specific disulfide bridges on trx target proteins. in yeasts and mammals, the cytoplasmic trx redox system is complemented by a second trx system within mitochondria. in plants, the system is more i


ABSTRACT:
 natrxh, a thioredoxin type h, shows differential expression between self-incompatible and self-compatible nicotiana species. natrxh interacts in vitro with s-rnase and co-localizes with it in the extracellular matrix of the stylar transmitting tissue. natrxh contains n- and c-terminal extensions, a feature shared by thioredoxin h proteins of subgroup to ascertain the function of these extensions in natrxh secretion and protein-protein interaction, we p

In [4]:
# Install PyTorch (required for running models)
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install torch



In [5]:
# Pinned transformers release; %pip installs into the running kernel's environment.
%pip install transformers==4.40.0



In [6]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
# Checkpoint name shared by the tokenizer and the language model.
model_name = "gpt2"

print("Loading tokenizer...")
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# GPT-2 ships without a pad token, so the end-of-sequence token doubles as padding.
tokenizer.pad_token = tokenizer.eos_token

print("Loading model...")
model = GPT2LMHeadModel.from_pretrained(model_name)
# Keep the model config in step with the tokenizer's padding choice.
model.config.pad_token_id = model.config.eos_token_id

# Report which checkpoint was loaded and how many parameters it has.
print(
    "Model loaded!",
    f"Model: {model_name}",
    f"Parameters: {model.num_parameters():,}",
    sep="\n",
)

Loading tokenizer...




Loading model...
Model loaded!
Model: gpt2
Parameters: 124,439,808


In [7]:
# Task 3: Generate Baseline Summaries
# Test on a single example first
def generate_summary(
    text,
    prompt_template="Summarize the following medical research:\n\n{text}\n\nSummary:",
    max_new_tokens=150,
    max_input_chars=1500,
    max_input_tokens=800,
):
    """Generate a summary of ``text`` with the module-level ``model`` and ``tokenizer``.

    Args:
        text: Source document. Only its first ``max_input_chars`` characters
            are placed into the prompt.
        prompt_template: ``str.format`` template with a ``{text}`` placeholder.
        max_new_tokens: Passed to ``model.generate`` to cap the length of the
            generated continuation.
        max_input_chars: Character budget applied to ``text`` before prompting.
            Kept conservative because medical text has complex tokens.
        max_input_tokens: Token budget for the encoded prompt; the tokenizer
            truncates anything beyond it, leaving room for generation inside
            GPT-2's 1024-token context window.

    Returns:
        The generated continuation only (prompt excluded), with surrounding
        whitespace stripped.
    """
    # Cheap character-level cut first, then build the prompt around it.
    truncated_text = text[:max_input_chars]
    prompt = prompt_template.format(text=truncated_text)

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens,
    )
    # NOTE(review): if truncation triggers, the end of the prompt (presumably
    # including the trailing "Summary:" cue) is what gets cut -- confirm the
    # tokenizer's truncation side if long inputs matter.
    prompt_token_count = inputs["input_ids"].shape[1]

    output = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=max_new_tokens,
        num_beams=4,
        early_stopping=True,
        no_repeat_ngram_size=3,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=False,
    )

    # Separate prompt from continuation at the token level. Slicing the decoded
    # string by len(prompt) is only correct when decoding reproduces the prompt
    # character for character, which fails as soon as the tokenizer truncated
    # the prompt (and can fail when decoding normalizes whitespace).
    generated_ids = output[0][prompt_token_count:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
# Smoke test on the first record of the test split: print the headers, the
# input excerpt and the reference abstract, then the model's summary.
test_example = dataset["test"][0]
print(
    "=" * 60,
    "INPUT (first 500 chars):",
    test_example["text"][:500],
    "\n" + "=" * 60,
    "\nGROUND TRUTH ABSTRACT:",
    test_example["abstract"][:500] + "...",
    "\n" + "=" * 60,
    "\nBASELINE GENERATED SUMMARY:",
    sep="\n",
)
# Generation runs only after everything above has been written out.
baseline_summary = generate_summary(test_example["text"])
print(baseline_summary)

INPUT (first 500 chars):
 evolutionary molecular biology is mostly concerned with the forces affecting individual genes. however, observations of variable proportions of guanine and cytosine in different species and in different genomic regions of vertebrates have prompted the analysis of forces that may affect the evolution of complete genomes. one particular hypothesis concerns adaptation to high temperatures, proposing that high gc content results from selection favouring g:c pairs over less stable a:t pairs. against


GROUND TRUTH ABSTRACT:
 among bacteria and archaea, amino acid usage is correlated with habitat temperatures. in particular, protein surfaces in species thriving at higher temperatures appear to be enriched in amino acids that stabilize protein structure and depleted in amino acids that decrease thermostability. does this observation reflect a causal relationship, or could the apparent trend be caused by phylogenetic relatedness among sampled organisms living at diffe

In [10]:
# Task 3: Generate Baseline Summaries with Multiple Prompts
# ==========================================================
# Baseline prompt templates. Each carries a {text} placeholder that
# generate_summary fills with the (character-truncated) paper text.
BASELINE_PROMPTS = {
    "prompt_1": "Summarize this text:\n\n{text}\n\nSummary:",
    "prompt_2": "Summarize the following medical research:\n\n{text}\n\nSummary:",
    "prompt_3": "Provide a brief summary:\n\n{text}\n\nSummary:",
    "prompt_4": "TL;DR:\n\n{text}\n\nTL;DR:"
}
# Fixed sample indices for reproducibility (row numbers in dataset["test"]).
sample_indices = [0, 100, 500, 1000, 1500, 2000, 2500, 3000]
# Assign 2 samples per prompt by slicing sample_indices in order, so that list
# is the single source of truth for which rows get evaluated.
SAMPLES_PER_PROMPT = 2
assert len(sample_indices) == SAMPLES_PER_PROMPT * len(BASELINE_PROMPTS), (
    "sample_indices must hold exactly SAMPLES_PER_PROMPT entries per prompt"
)
prompt_sample_mapping = {
    name: sample_indices[slot * SAMPLES_PER_PROMPT:(slot + 1) * SAMPLES_PER_PROMPT]
    for slot, name in enumerate(BASELINE_PROMPTS)
}
# Document configuration. These lines are written by hand and mirror the
# settings inside generate_summary and the pinned transformers install;
# update them together.
print("=" * 60)
print("BASELINE CONFIGURATION")
print("=" * 60)
print("Model: gpt2")
print("Parameters: 124M")
print("Transformers version: 4.40.0")
print("Max input tokens: 800")
print("Max new tokens: 150")
print("Num beams: 4")
print("No repeat ngram size: 3")
print("Do sample: False (deterministic)")
print("=" * 60)
# Generate summaries. One record per (prompt, sample) pair is collected in
# baseline_results for the evaluation cells.
baseline_results = []
output_count = 0
print("\nGENERATING BASELINE SUMMARIES...\n")
for prompt_name, indices in prompt_sample_mapping.items():
    prompt_template = BASELINE_PROMPTS[prompt_name]

    for idx in indices:
        output_count += 1
        example = dataset["test"][idx]
        generated = generate_summary(example["text"], prompt_template=prompt_template)

        baseline_results.append({
            "output_num": output_count,
            "index": idx,
            "prompt_name": prompt_name,
            "prompt_template": prompt_template,
            "input_text": example["text"],
            "ground_truth": example["abstract"],
            "generated": generated
        })

        # Show the template on a single line: escape its newlines instead of
        # letting them break up the report layout.
        prompt_preview = prompt_template[:60].replace("\n", "\\n")

        print("=" * 60)
        print(f"OUTPUT {output_count} | {prompt_name.upper()} | Sample index {idx}")
        print("=" * 60)
        print(f"\nPROMPT: {prompt_preview}...")
        print(f"\nINPUT (first 300 chars):\n{example['text'][:300]}...")
        print(f"\nGROUND TRUTH (first 200 chars):\n{example['abstract'][:200]}...")
        print(f"\nGENERATED SUMMARY:\n{generated}")
        print()
print("\n" + "=" * 60)
print(f"Generated {len(baseline_results)} baseline outputs using {len(BASELINE_PROMPTS)} prompts")
print("=" * 60)

BASELINE CONFIGURATION
Model: gpt2
Parameters: 124M
Transformers version: 4.40.0
Max input tokens: 800
Max new tokens: 150
Num beams: 4
No repeat ngram size: 3
Do sample: False (deterministic)

GENERATING BASELINE SUMMARIES...

OUTPUT 1 | PROMPT_1 | Sample index 0

PROMPT: Summarize this text:

{text}

Summary:...

INPUT (first 300 chars):
 evolutionary molecular biology is mostly concerned with the forces affecting individual genes. however, observations of variable proportions of guanine and cytosine in different species and in different genomic regions of vertebrates have prompted the analysis of forces that may affect the evolutio...

GROUND TRUTH (first 200 chars):
 among bacteria and archaea, amino acid usage is correlated with habitat temperatures. in particular, protein surfaces in species thriving at higher temperatures appear to be enriched in amino acids t...

GENERATED SUMMARY:
In this paper, we show that gc is an important determinant of the evolutionary history of vertebr

# Task 4 - Baseline Evaluation


In [13]:
# Install evaluation libraries
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install rouge-score bert-score



In [14]:
# Task 4: Evaluate Baseline - ROUGE Scores
# =========================================
from rouge_score import rouge_scorer
import numpy as np
# Scorer configured for ROUGE-1, ROUGE-2 and ROUGE-L with stemming enabled.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
# Per-sample F-measures, kept in the same order as baseline_results.
rouge1_scores, rouge2_scores, rougeL_scores = [], [], []
print("=" * 60, "ROUGE EVALUATION", "=" * 60 + "\n", sep="\n")
for result in baseline_results:
    # The ground-truth abstract is passed first, the generated summary second.
    scores = scorer.score(result["ground_truth"], result["generated"])
    r1 = scores["rouge1"].fmeasure
    r2 = scores["rouge2"].fmeasure
    rl = scores["rougeL"].fmeasure

    rouge1_scores.append(r1)
    rouge2_scores.append(r2)
    rougeL_scores.append(rl)

    print(
        f"Sample {result['output_num']} ({result['prompt_name']}): "
        f"R1={r1:.3f}, "
        f"R2={r2:.3f}, "
        f"RL={rl:.3f}"
    )
# Mean F-measure per metric across all scored samples.
avg_rouge1, avg_rouge2, avg_rougeL = (
    np.mean(values) for values in (rouge1_scores, rouge2_scores, rougeL_scores)
)
print(
    "\n" + "-" * 60,
    "ROUGE AVERAGES:",
    f"  ROUGE-1: {avg_rouge1:.4f}",
    f"  ROUGE-2: {avg_rouge2:.4f}",
    f"  ROUGE-L: {avg_rougeL:.4f}",
    sep="\n",
)

ROUGE EVALUATION

Sample 1 (prompt_1): R1=0.301, R2=0.025, RL=0.150
Sample 2 (prompt_1): R1=0.290, R2=0.061, RL=0.148
Sample 3 (prompt_2): R1=0.186, R2=0.005, RL=0.115
Sample 4 (prompt_2): R1=0.306, R2=0.015, RL=0.138
Sample 5 (prompt_3): R1=0.350, R2=0.059, RL=0.162
Sample 6 (prompt_3): R1=0.262, R2=0.025, RL=0.131
Sample 7 (prompt_4): R1=0.297, R2=0.068, RL=0.171
Sample 8 (prompt_4): R1=0.333, R2=0.091, RL=0.209

------------------------------------------------------------
ROUGE AVERAGES:
  ROUGE-1: 0.2907
  ROUGE-2: 0.0437
  ROUGE-L: 0.1529


In [15]:
# Task 4: Evaluate Baseline - BERTScore
# =====================================
from bert_score import score as bert_score
print(
    "=" * 60,
    "BERTSCORE EVALUATION",
    "=" * 60,
    "\nCalculating BERTScore (this may take a minute)...\n",
    sep="\n",
)
# Parallel lists: entry k of each list comes from baseline_results[k].
generated_texts = [entry["generated"] for entry in baseline_results]
reference_texts = [entry["ground_truth"] for entry in baseline_results]
# Generated texts are passed first, reference abstracts second.
P, R, F1 = bert_score(generated_texts, reference_texts, lang="en", verbose=True)
# Per-sample precision / recall / F1, indexed in baseline_results order.
for i, result in enumerate(baseline_results):
    precision, recall, f1 = P[i], R[i], F1[i]
    print(
        f"Sample {result['output_num']} ({result['prompt_name']}): "
        f"P={precision:.3f}, R={recall:.3f}, F1={f1:.3f}"
    )
# Reduce each score vector to a plain Python float.
avg_bert_p, avg_bert_r, avg_bert_f1 = (
    metric.mean().item() for metric in (P, R, F1)
)
print(
    "\n" + "-" * 60,
    "BERTSCORE AVERAGES:",
    f"  Precision: {avg_bert_p:.4f}",
    f"  Recall:    {avg_bert_r:.4f}",
    f"  F1:        {avg_bert_f1:.4f}",
    sep="\n",
)

BERTSCORE EVALUATION

Calculating BERTScore (this may take a minute)...



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:04<00:00,  4.23s/it]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 56.14it/s]

done in 4.25 seconds, 1.88 sentences/sec
Sample 1 (prompt_1): P=0.833, R=0.809, F1=0.821
Sample 2 (prompt_1): P=0.816, R=0.816, F1=0.816
Sample 3 (prompt_2): P=0.814, R=0.770, F1=0.792
Sample 4 (prompt_2): P=0.833, R=0.812, F1=0.822
Sample 5 (prompt_3): P=0.845, R=0.802, F1=0.823
Sample 6 (prompt_3): P=0.813, R=0.775, F1=0.793
Sample 7 (prompt_4): P=0.864, R=0.802, F1=0.832
Sample 8 (prompt_4): P=0.834, R=0.817, F1=0.825

------------------------------------------------------------
BERTSCORE AVERAGES:
  Precision: 0.8315
  Recall:    0.8004
  F1:        0.8156





In [16]:
# Task 4: Baseline Evaluation Summary
# ====================================
print("=" * 60)
print("BASELINE EVALUATION SUMMARY")
print("=" * 60)

# Store for later comparison (Task 9)
baseline_metrics = {
    "rouge1": avg_rouge1,
    "rouge2": avg_rouge2,
    "rougeL": avg_rougeL,
    "bertscore_p": avg_bert_p,
    "bertscore_r": avg_bert_r,
    "bertscore_f1": avg_bert_f1
}

# Build the results box from computed widths instead of hand-counted spaces,
# so every row ends at the same column as the frame whatever the formatted
# value's length turns out to be.
BOX_INNER_WIDTH = 41  # characters between the left and right border
LABEL_WIDTH = 16      # column at which the numeric value starts (after a 2-space indent)
box_rows = [
    ("ROUGE-1:", avg_rouge1),
    ("ROUGE-2:", avg_rouge2),
    ("ROUGE-L:", avg_rougeL),
    ("BERTScore F1:", avg_bert_f1),
]
box_lines = [
    "┌" + "─" * BOX_INNER_WIDTH + "┐",
    "│" + "  BASELINE RESULTS".ljust(BOX_INNER_WIDTH) + "│",
    "├" + "─" * BOX_INNER_WIDTH + "┤",
]
box_lines.extend(
    "│" + f"  {label:<{LABEL_WIDTH}}{value:.4f}".ljust(BOX_INNER_WIDTH) + "│"
    for label, value in box_rows
)
box_lines.append("└" + "─" * BOX_INNER_WIDTH + "┘")
# Blank line before and after the box.
print("\n" + "\n".join(box_lines) + "\n")

print("Baseline metrics stored in 'baseline_metrics' for later comparison")

BASELINE EVALUATION SUMMARY

┌─────────────────────────────────────────┐
│  BASELINE RESULTS                       │
├─────────────────────────────────────────┤
│  ROUGE-1:        0.2907                   │
│  ROUGE-2:        0.0437                   │
│  ROUGE-L:        0.1529                   │
│  BERTScore F1:   0.8156                   │
└─────────────────────────────────────────┘

Baseline metrics stored in 'baseline_metrics' for later comparison


# Task 5 - Prompt Engineering

In [21]:
# Prompt versions for the prompt-engineering experiment. Each entry holds the
# template (with a {text} placeholder) and a short label for the change it
# introduces relative to the earlier versions.
PROMPT_VERSIONS = {
    "v1_baseline": {
        "template": "TL;DR:\n\n{text}\n\nTL;DR:",
        "change": "Baseline (best from Task 3)"
    },

    "v2_role": {
        "template": "You are a medical researcher. TL;DR:\n\n{text}\n\nTL;DR:",
        "change": "+ Role framing"
    },

    "v3_audience": {
        "template": "You are a medical researcher. Summarize this for a patient with no medical background:\n\n{text}\n\nSummary:",
        "change": "+ Audience targeting"
    },

    "v4_constraints": {
        "template": "You are a medical researcher. Summarize this for a patient with no medical background. Only include factual claims from the text. Do not include acknowledgments or references.\n\n{text}\n\nSummary:",
        "change": "+ Constraints"
    },

    "v5_structure": {
        "template": "You are a medical researcher. Summarize this for a patient with no medical background. Only include factual claims from the text. Do not include acknowledgments or references.\n\nSummarize in 3 points:\n1) What was studied\n2) Key findings\n3) Implications\n\n{text}\n\nSummary:",
        "change": "+ Structured output"
    },

    "v6_fewshot": {
        "template": """You are a medical researcher. Summarize this for a patient with no medical background. Only include factual claims from the text. Do not include acknowledgments or references.
Example:
Text: "This study examined the effects of vitamin D supplementation on bone density in elderly patients. 200 participants received either vitamin D or placebo for 12 months. Results showed a 15% improvement in bone density in the treatment group."
Summary: 1) Studied vitamin D supplements and bone health in elderly patients. 2) Found 15% improvement in bone density with vitamin D. 3) Suggests vitamin D may help prevent bone loss in older adults.
Now summarize:
{text}
Summary:""",
        "change": "+ Few-shot example"
    }
}


def preview_template(template, limit=50):
    """Return a single-line preview of ``template``.

    The first ``limit`` characters are kept and their newlines are shown as a
    literal ``\\n`` so the preview never spans several output lines. ``...`` is
    appended only when the template is longer than ``limit``.
    """
    head = template[:limit].replace("\n", "\\n")
    return head + ("..." if len(template) > limit else "")


# Same indices as baseline for fair comparison
sample_indices = [0, 100, 500, 1000, 1500, 2000, 2500, 3000]
# Print summary
print("=" * 60)
print("PROMPT ENGINEERING: VERSION OVERVIEW")
print("=" * 60)
for version, info in PROMPT_VERSIONS.items():
    print(f"\n{version.upper()}")
    print(f"  Change: {info['change']}")
    print(f"  Template: {preview_template(info['template'])}")
print("\n" + "=" * 60)
print(f"Testing on {len(sample_indices)} samples: {sample_indices}")
print("=" * 60)

PROMPT ENGINEERING: VERSION OVERVIEW

V1_BASELINE
  Change: Baseline (best from Task 3)
  Template: TL;DR:

{text}

TL;DR:...

V2_ROLE
  Change: + Role framing
  Template: You are a medical researcher. TL;DR:

{text}

TL;D...

V3_AUDIENCE
  Change: + Audience targeting
  Template: You are a medical researcher. Summarize this for a...

V4_CONSTRAINTS
  Change: + Constraints
  Template: You are a medical researcher. Summarize this for a...

V5_STRUCTURE
  Change: + Structured output
  Template: You are a medical researcher. Summarize this for a...

V6_FEWSHOT
  Change: + Few-shot example
  Template: You are a medical researcher. Summarize this for a...

Testing on 8 samples: [0, 100, 500, 1000, 1500, 2000, 2500, 3000]
