In [1]:
# Cell 1: Install (run once)
!pip install datasets



In [2]:
# Load the SUMPUBMED dataset (all splits) and show its structure
from datasets import load_dataset
dataset = load_dataset("Blaise-g/SumPubmed")
print("Dataset loaded!", dataset, sep="\n")

  from .autonotebook import tqdm as notebook_tqdm


Dataset loaded!
DatasetDict({
    train: Dataset({
        features: ['line_text', 'filename_text', 'text', 'shorter_abstract', 'abstract'],
        num_rows: 26147
    })
    test: Dataset({
        features: ['line_text', 'filename_text', 'text', 'shorter_abstract', 'abstract'],
        num_rows: 3269
    })
    dev: Dataset({
        features: ['line_text', 'filename_text', 'text', 'shorter_abstract', 'abstract'],
        num_rows: 3268
    })
})


In [3]:
# Preview the first training record: the start of the paper body, then its abstract
example = dataset["train"][0]
print("PAPER TEXT (first 500 chars):", example["text"][:500], sep="\n")
print("\n" + "="*50 + "\n")
print("ABSTRACT:", example["abstract"], sep="\n")

PAPER TEXT (first 500 chars):
 thioredoxins are widely distributed in nature from prokaryotes to eukaryotes. these proteins, which belong to the oxidoreductase thiol:disulfide superfamily, are characterized by the active site signature sequence wcxxc. this sequence motif constitutes the redox center mediating the isomerization of specific disulfide bridges on trx target proteins. in yeasts and mammals, the cytoplasmic trx redox system is complemented by a second trx system within mitochondria. in plants, the system is more i


ABSTRACT:
 natrxh, a thioredoxin type h, shows differential expression between self-incompatible and self-compatible nicotiana species. natrxh interacts in vitro with s-rnase and co-localizes with it in the extracellular matrix of the stylar transmitting tissue. natrxh contains n- and c-terminal extensions, a feature shared by thioredoxin h proteins of subgroup to ascertain the function of these extensions in natrxh secretion and protein-protein interaction, we p

In [4]:
# Install PyTorch (required for running models)
!pip install torch



In [5]:
!pip install transformers==4.40.0



In [10]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
# Fetch the pre-trained GPT-2 tokenizer and language-model head
model_name = "gpt2"

print("Loading tokenizer...")
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# GPT-2 has no dedicated pad token, so the end-of-sequence token doubles as padding
tokenizer.pad_token = tokenizer.eos_token

print("Loading model...")
model = GPT2LMHeadModel.from_pretrained(model_name)
# Mirror the tokenizer's padding choice in the model config
model.config.pad_token_id = model.config.eos_token_id

print("Model loaded!")
print(f"Model: {model_name}")
print(f"Parameters: {model.num_parameters():,}")

Loading tokenizer...




Loading model...
Model loaded!
Model: gpt2
Parameters: 124,439,808


In [11]:
# Task 3: Generate Baseline Summaries
# Test on a single example first
def generate_summary(text, prompt_template="Summarize the following medical research:\n\n{text}\n\nSummary:", max_new_tokens=150):
    """Generate a summary of ``text`` using pre-trained GPT-2.

    Uses the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        text: Source document. Only its first 1500 characters are placed in
            the prompt.
        prompt_template: Format string with a ``{text}`` placeholder that is
            filled with the (truncated) document.
        max_new_tokens: Upper bound on the number of tokens generated after
            the prompt.

    Returns:
        The generated continuation only (prompt excluded), with surrounding
        whitespace stripped.
    """
    # Truncate text to fit within GPT-2's context window (1024 tokens)
    # Use conservative char limit since medical text has complex tokens
    max_input_chars = 1500
    truncated_text = text[:max_input_chars]

    # Format prompt
    prompt = prompt_template.format(text=truncated_text)

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=800  # Leave room for generation
    )
    # Number of prompt tokens actually fed to the model (after any truncation)
    prompt_token_count = inputs["input_ids"].shape[1]

    output = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=max_new_tokens,
        num_beams=4,
        early_stopping=True,
        no_repeat_ngram_size=3,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=False
    )

    # Keep only the tokens produced after the prompt. Slicing token ids is
    # used instead of slicing the decoded string by len(prompt): the decoded
    # text is not guaranteed to match the prompt character-for-character
    # (the prompt may have been truncated to max_length above, and decode()
    # may normalise spacing), which would cut into the summary or leave
    # prompt text in it.
    generated_ids = output[0][prompt_token_count:]
    summary = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    return summary
# Smoke test: run the baseline on the first example of the test split
test_example = dataset["test"][0]
print("=" * 60)
print("INPUT (first 500 chars):", test_example["text"][:500], sep="\n")
print("\n" + "=" * 60)
print("\nGROUND TRUTH ABSTRACT:", test_example["abstract"][:500] + "...", sep="\n")
print("\n" + "=" * 60)
# Header is printed before generation so it appears ahead of anything
# emitted while the model runs
print("\nBASELINE GENERATED SUMMARY:")
baseline_summary = generate_summary(test_example["text"])
print(baseline_summary)

INPUT (first 500 chars):
 evolutionary molecular biology is mostly concerned with the forces affecting individual genes. however, observations of variable proportions of guanine and cytosine in different species and in different genomic regions of vertebrates have prompted the analysis of forces that may affect the evolution of complete genomes. one particular hypothesis concerns adaptation to high temperatures, proposing that high gc content results from selection favouring g:c pairs over less stable a:t pairs. against


GROUND TRUTH ABSTRACT:
 among bacteria and archaea, amino acid usage is correlated with habitat temperatures. in particular, protein surfaces in species thriving at higher temperatures appear to be enriched in amino acids that stabilize protein structure and depleted in amino acids that decrease thermostability. does this observation reflect a causal relationship, or could the apparent trend be caused by phylogenetic relatedness among sampled organisms living at diffe

In [12]:
# Task 3: Generate Baseline Summaries (6-8 examples)
# Fixed positions in the test split so every run summarises the same papers
sample_indices = [0, 100, 500, 1000, 1500, 2000, 2500, 3000]
baseline_results = []
print("Generating baseline summaries...\n")
for i, idx in enumerate(sample_indices):
    example = dataset["test"][idx]
    generated = generate_summary(example["text"])

    # Store the untruncated paper and abstract alongside the model output
    record = {
        "index": idx,
        "input_text": example["text"],
        "ground_truth": example["abstract"],
        "generated": generated,
    }
    baseline_results.append(record)

    # Banner for this sample
    print(f"{'='*60}", f"SAMPLE {i+1} (index {idx})", f"{'='*60}", sep="\n")
    # Shortened previews of the inputs, followed by the full generated text
    print(f"\nINPUT (first 300 chars):\n{example['text'][:300]}...")
    print(f"\nGROUND TRUTH (first 200 chars):\n{example['abstract'][:200]}...")
    print(f"\nGENERATED SUMMARY:\n{generated}", end="\n\n")
print(f"\n✅ Generated {len(baseline_results)} baseline summaries")

Generating baseline summaries...

SAMPLE 1 (index 0)

INPUT (first 300 chars):
 evolutionary molecular biology is mostly concerned with the forces affecting individual genes. however, observations of variable proportions of guanine and cytosine in different species and in different genomic regions of vertebrates have prompted the analysis of forces that may affect the evolutio...

GROUND TRUTH (first 200 chars):
 among bacteria and archaea, amino acid usage is correlated with habitat temperatures. in particular, protein surfaces in species thriving at higher temperatures appear to be enriched in amino acids t...

GENERATED SUMMARY:
In the present study, we show that gc is a major determinant of the evolution and survival of vertebrate genomes. We also show that a high-gc gc density is associated with a high rate of evolution in the vertebrate genome. We show that high-gc gc densities are associated with an increase in the rate of evolutionary change in the human genome, and that this i