In [1]:
# STEP 1: Install required libraries
!pip install transformers datasets rouge-score gradio -q
!pip install evaluate -q
!pip install --upgrade datasets fsspec
!pip install accelerate -q

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec
  Downloading fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Succes

In [2]:
import os
from datasets import load_dataset
from transformers import (
    BartTokenizer,
    BartForConditionalGeneration,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments
)
import evaluate
import torch

In [3]:
# Opt in to parallel tokenization through the environment flag
os.environ.update({"TOKENIZERS_PARALLELISM": "true"})

In [4]:
# Load every split of the PubMed summarization dataset.
# NOTE: no `streaming=True` is passed here, so this is a regular (non-streaming)
# load; the later `.select(...)` calls rely on the splits being indexable.
print("Loading dataset...")
dataset = load_dataset("ccdv/pubmed-summarization")

Loading dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/3.80k [00:00<?, ?B/s]

train-00000-of-00005.parquet:   0%|          | 0.00/210M [00:00<?, ?B/s]

train-00001-of-00005.parquet:   0%|          | 0.00/208M [00:00<?, ?B/s]

train-00002-of-00005.parquet:   0%|          | 0.00/207M [00:00<?, ?B/s]

train-00003-of-00005.parquet:   0%|          | 0.00/211M [00:00<?, ?B/s]

train-00004-of-00005.parquet:   0%|          | 0.00/210M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/59.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/58.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/119924 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/6633 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6658 [00:00<?, ? examples/s]

In [5]:
# Use a subset for faster experimentation (set a limit to None for full training)
MAX_TRAIN_SAMPLES = 10000  # Use first 10k samples
MAX_VAL_SAMPLES = 1000  # Use first 1k samples


def _take_first(split, limit):
    """Return the first `limit` rows of `split`.

    The split is returned unchanged when `limit` is None or not smaller than
    the split itself, so a short split never triggers an out-of-range select.
    """
    if limit is None or limit >= len(split):
        return split
    return split.select(range(limit))


train_dataset = _take_first(dataset["train"], MAX_TRAIN_SAMPLES)
val_dataset = _take_first(dataset["validation"], MAX_VAL_SAMPLES)

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")

Training samples: 10000
Validation samples: 1000


In [6]:
# Tokenizer and seq2seq model are both restored from the same pretrained checkpoint
BASE_CHECKPOINT = "facebook/bart-base"
tokenizer = BartTokenizer.from_pretrained(BASE_CHECKPOINT)
model = BartForConditionalGeneration.from_pretrained(BASE_CHECKPOINT)

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

In [7]:
# Optimized tokenization function
def tokenize_function(examples):
    """Tokenize a batch of examples for seq2seq fine-tuning.

    Args:
        examples: A batch mapping with an "article" entry (source texts) and
            an "abstract" entry (target summaries).

    Returns:
        The tokenizer output for the articles (truncated to 512 tokens) with an
        extra "labels" entry holding the token ids of the abstracts (truncated
        to 128 tokens). Nothing is padded here; padding is left to the data
        collator so each batch is padded dynamically.
    """
    # Source side: articles
    model_inputs = tokenizer(
        examples["article"],
        max_length=512,
        truncation=True,
        padding=False  # We'll pad dynamically during training
    )

    # Target side: abstracts. `text_target=` replaces the deprecated
    # `as_target_tokenizer()` context manager. It is a separate call so the
    # targets get their own (shorter) max_length.
    labels = tokenizer(
        text_target=examples["abstract"],
        max_length=128,
        truncation=True,
        padding=False
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [8]:
# Apply tokenize_function to the training subset and drop the raw text columns
print("Tokenizing training dataset...")
tokenized_train = train_dataset.map(
    tokenize_function,
    remove_columns=['article', 'abstract'],
    batched=True,
    batch_size=1000,  # examples passed to tokenize_function per call
    num_proc=4,  # worker processes
)

Tokenizing training dataset...


Setting TOKENIZERS_PARALLELISM=false for forked processes.


Map (num_proc=4):   0%|          | 0/10000 [00:00<?, ? examples/s]



In [9]:
# Same tokenization pass as for training, applied to the validation subset
print("Tokenizing validation dataset...")
tokenized_val = val_dataset.map(
    tokenize_function,
    remove_columns=['article', 'abstract'],
    batched=True,
    batch_size=1000,
    num_proc=4,
)

Tokenizing validation dataset...


Setting TOKENIZERS_PARALLELISM=false for forked processes.


Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]



In [10]:
# Load ROUGE metric for evaluation
rouge_metric = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """Compute ROUGE scores for a batch of generated summaries.

    Args:
        eval_pred: A `(predictions, labels)` pair of token-id sequences as
            handed over by the trainer during evaluation.

    Returns:
        A dict with the "rouge1", "rouge2" and "rougeL" scores.
    """
    predictions, labels = eval_pred

    # Predictions can arrive wrapped in a tuple; the token ids come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id

    def _replace_ignore_index(sequences):
        # -100 is a masking sentinel, not a vocabulary id, so it cannot be
        # decoded. Swap it for the pad token, which skip_special_tokens drops.
        return [[tok if tok != -100 else pad_id for tok in seq] for seq in sequences]

    # Decode predictions and labels (both may carry -100 padding)
    decoded_preds = tokenizer.batch_decode(
        _replace_ignore_index(predictions), skip_special_tokens=True
    )
    decoded_labels = tokenizer.batch_decode(
        _replace_ignore_index(labels), skip_special_tokens=True
    )

    # Compute ROUGE scores
    result = rouge_metric.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True
    )

    return {
        "rouge1": result["rouge1"],
        "rouge2": result["rouge2"],
        "rougeL": result["rougeL"]
    }

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [11]:
# Optimized training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./bart-pubmed-finetuned",
    eval_strategy="steps",
    eval_steps=500,
    learning_rate=3e-5,  # Slightly higher learning rate
    per_device_train_batch_size=8,  # Increased batch size
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,  # Effective batch size = 8 * 2 = 16
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    save_steps=1000,  # multiple of eval_steps, so each checkpoint has an evaluation
    # Mixed precision for faster training; only enabled when a CUDA GPU is
    # present so the notebook still starts on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=4,  # Parallel data loading
    logging_dir='./logs-pubmed',
    logging_steps=100,
    warmup_steps=500,
    predict_with_generate=True,  # evaluate on generated text so ROUGE can be computed
    generation_max_length=128,  # same cap as the label truncation length
    generation_num_beams=4,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    greater_is_better=True,
    report_to="none",
    push_to_hub=False  # Set to True if you want to push to hub
)

In [12]:
# Batches are padded on the fly by the seq2seq collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    pad_to_multiple_of=8,  # Optimize for tensor cores
    padding=True,
)

In [13]:
# Initialize trainer with evaluation
# `processing_class` replaces the deprecated `tokenizer` argument of the trainer.
# NOTE(review): requires a transformers release that accepts `processing_class`;
# the deprecation warning printed for the old call suggests the installed one does.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

  trainer = Seq2SeqTrainer(


In [14]:
# Kick off fine-tuning; the training result is shown as the cell output
print("Starting training...")
train_output = trainer.train()
train_output

Starting training...


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel
500,2.8664,2.548472,0.37831,0.14405,0.23983
1000,2.5968,2.416587,0.378154,0.145876,0.243528
1500,2.5204,2.388508,0.378504,0.145998,0.24346




TrainOutput(global_step=1875, training_loss=2.784956290690104, metrics={'train_runtime': 1133.0626, 'train_samples_per_second': 26.477, 'train_steps_per_second': 1.655, 'total_flos': 9146046873600000.0, 'train_loss': 2.784956290690104, 'epoch': 3.0})

In [24]:
# Persist the fine-tuned weights and the tokenizer side by side in one directory
# NOTE(review): assumes Google Drive is already mounted at /content/drive — confirm
final_model_dir = "/content/drive/MyDrive/PubMed_Summarizer/bart-pubmed-final"
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

print("Training completed!")

Training completed!


In [27]:
import gradio as gr
import torch
from transformers import BartTokenizer, BartForConditionalGeneration
import re

# Load your fine-tuned model and tokenizer.
# This must be the directory the training section saved to; the previous
# "./bart-pubmed-final" is not written by any cell in this notebook.
model_path = "/content/drive/MyDrive/PubMed_Summarizer/bart-pubmed-final"  # Path to your saved model
tokenizer = BartTokenizer.from_pretrained(model_path)
model = BartForConditionalGeneration.from_pretrained(model_path)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # inference mode

def preprocess_text(text):
    """Clean and preprocess input text.

    Characters outside the allowed set (word characters, whitespace and the
    punctuation ``. , ; : ! ? ( ) - ' / % < > = +``) are replaced by a space,
    then every run of whitespace is collapsed to a single space and the result
    is trimmed.

    Args:
        text: Raw user input; ``None`` is treated as an empty string.

    Returns:
        The cleaned text, or a warning message starting with "⚠️" when the
        cleaned text has fewer than 10 whitespace-separated words.
    """
    if text is None:
        text = ""

    # Replace unsupported characters with a space rather than deleting them, so
    # neighbouring tokens are not glued together (e.g. "5–10" must not become "510").
    text = re.sub(r"[^\w\s.,;:!?()\-'/%<>=+]", ' ', text)

    # Normalize whitespace last, so gaps left by replaced characters collapse too
    text = re.sub(r'\s+', ' ', text).strip()

    # Ensure minimum length
    if len(text.split()) < 10:
        return "⚠️ Warning: Text too short. Please provide at least 10 words for meaningful summarization."

    return text

def postprocess_summary(summary):
    """Clean and format the generated summary.

    Args:
        summary: Decoded model output; may be ``None`` or empty.

    Returns:
        "No summary generated." when the input is ``None``, empty or only
        whitespace. Otherwise the trimmed summary with its first character
        upper-cased and a trailing "." appended unless it already ends with
        ".", "!" or "?".
    """
    # Trim before the emptiness check so whitespace-only output gets the
    # fallback message instead of an empty string.
    summary = (summary or "").strip()
    if not summary:
        return "No summary generated."

    # Capitalize first letter and ensure proper ending
    summary = summary[0].upper() + summary[1:]
    if not summary.endswith(('.', '!', '?')):
        summary += '.'

    return summary

def summarize_biomedical_text(text, max_length=150, min_length=50, num_beams=4):
    """
    Generate abstractive summary of biomedical text using fine-tuned BART model

    Args:
        text: Raw input text to summarize.
        max_length: Upper bound on the generated length, passed to `generate`.
        min_length: Lower bound on the generated length; clamped so that it
            never exceeds `max_length`.
        num_beams: Beam width for beam search (at least 1).

    Returns:
        The post-processed summary. If the input is rejected by
        `preprocess_text`, its warning message is returned instead; if
        tokenization or generation raises, an error message starting with
        "❌" is returned.
    """
    # Preprocess input
    processed_text = preprocess_text(text)

    # Check if preprocessing returned a warning
    if processed_text.startswith("⚠️"):
        return processed_text

    try:
        # The UI may deliver slider values as floats; coerce them to ints
        # before they reach generate().
        max_length = int(max_length)
        num_beams = max(1, int(num_beams))
        # The two length sliders overlap (min up to 100, max down to 50), so
        # keep the pair feasible by capping min_length at max_length.
        min_length = max(0, min(int(min_length), max_length))

        # Tokenize input
        inputs = tokenizer(
            processed_text,
            max_length=512,
            truncation=True,
            padding=True,
            return_tensors="pt"
        ).to(device)

        # Generate summary (no gradients needed for inference)
        with torch.no_grad():
            summary_ids = model.generate(
                inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=max_length,
                min_length=min_length,
                num_beams=num_beams,
                length_penalty=2.0,
                early_stopping=True,
                do_sample=False
            )

        # Decode summary
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

        # Postprocess and return
        return postprocess_summary(summary)

    except Exception as e:
        # Deliberate catch-all: the message is shown in the UI output box
        return f"❌ Error generating summary: {str(e)}"

def get_text_stats(text):
    """Get basic statistics about the input text.

    Args:
        text: The current input text; ``None`` is treated as an empty string.

    Returns:
        A Markdown-formatted line with the number of whitespace-separated
        words, the number of characters and an approximate sentence count.
    """
    text = text or ""
    words = len(text.split())
    chars = len(text)
    # A run of . ! ? counts as a sentence end only when it is followed
    # (optionally after closing brackets/quotes) by whitespace or the end of
    # the text, so decimals such as "0.5" are not counted as sentences.
    sentences = len(re.findall(r"""[.!?]+(?=[)\]"']*(?:\s|$))""", text))
    return f"📊 **Text Statistics:** {words} words | {chars} characters | {sentences} sentences"

# Create the Gradio interface
# Layout, top to bottom: gradient header, a two-column row (inputs and sliders
# on the left, summary and statistics on the right), a row of action buttons,
# and a list of clickable example texts. The Blocks object is bound to `demo`.
with gr.Blocks(
    theme=gr.themes.Soft(),
    title="Biomedical Literature Summarizer",
    css="""
    .gradio-container {
        max-width: 1200px !important;
    }
    .main-header {
        text-align: center;
        background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
        -webkit-background-clip: text;
        -webkit-text-fill-color: transparent;
        font-size: 2.5em;
        font-weight: bold;
        margin-bottom: 0.5em;
    }
    """
) as demo:

    # Page header (the subtitle paragraph is commented out inside the HTML)
    gr.HTML("""
    <div class="main-header">
        🧬 Biomedical Literature Summarizer
    </div>
    <!-- <p style="text-align: center; font-size: 1.1em; color: #666; margin-bottom: 2em;">
        AI-powered abstractive summarization for biomedical research articles using fine-tuned BART
    </p> -->
    """)

    with gr.Row():
        # Left column: text input plus generation controls
        with gr.Column(scale=2):
            input_text = gr.Textbox(
                label="📝 Biomedical Text Input",
                placeholder="Paste your biomedical research article, abstract, or clinical text here...",
                lines=12,
                max_lines=20
            )

            # Slider defaults mirror the defaults of summarize_biomedical_text
            with gr.Row():
                max_length = gr.Slider(
                    minimum=50,
                    maximum=300,
                    value=150,
                    step=10,
                    label="📏 Max Summary Length"
                )
                min_length = gr.Slider(
                    minimum=20,
                    maximum=100,
                    value=50,
                    step=5,
                    label="📐 Min Summary Length"
                )
                num_beams = gr.Slider(
                    minimum=2,
                    maximum=8,
                    value=4,
                    step=1,
                    label="🔍 Beam Search Size"
                )

        # Right column: generated summary and live input statistics
        with gr.Column(scale=2):
            output_summary = gr.Textbox(
                label="📋 Generated Summary",
                lines=8,
                max_lines=15
            )

            text_stats = gr.Markdown(
                label="📊 Text Statistics",
                value="Enter text to see statistics..."
            )

    with gr.Row():
        summarize_btn = gr.Button(
            "🚀 Generate Summary",
            variant="primary",
            size="lg"
        )
        clear_btn = gr.Button(
            "🗑️ Clear All",
            variant="secondary"
        )

    # Event handlers
    # Summarize: the input order matches the parameter order of summarize_biomedical_text
    summarize_btn.click(
        fn=summarize_biomedical_text,
        inputs=[input_text, max_length, min_length, num_beams],
        outputs=output_summary
    )

    # Refresh the statistics line whenever the input text changes
    input_text.change(
        fn=get_text_stats,
        inputs=input_text,
        outputs=text_stats
    )

    # Clear: resets input, summary and statistics (slider values are left as they are)
    clear_btn.click(
        lambda: ("", "", "Enter text to see statistics..."),
        outputs=[input_text, output_summary, text_stats]
    )

    # Example section
    gr.HTML("<hr><h2 style='text-align: center; margin: 2em 0 1em 0;'>🧪 Test Examples</h2>")

    # Each example is a single-element list because it fills only `input_text`
    examples = gr.Examples(
        examples=[
            ["""Cardiovascular disease remains the leading cause of death globally, with coronary artery disease being the most prevalent form. Recent advances in interventional cardiology have introduced drug-eluting stents as a revolutionary treatment option for patients with coronary artery stenosis. These stents are coated with antiproliferative drugs that prevent restenosis by inhibiting smooth muscle cell proliferation and neointimal hyperplasia. Clinical trials have demonstrated that drug-eluting stents significantly reduce the rate of target vessel revascularization compared to bare metal stents. However, concerns have emerged regarding late stent thrombosis, particularly beyond one year after implantation. This phenomenon has been attributed to delayed endothelialization and hypersensitivity reactions to the polymer coating. Current research focuses on developing biodegradable polymer coatings and fully bioresorbable stents to address these limitations while maintaining the efficacy of drug elution."""],

            ["""Alzheimer's disease is a progressive neurodegenerative disorder characterized by the accumulation of amyloid-beta plaques and neurofibrillary tangles in the brain. The pathogenesis involves complex interactions between genetic, environmental, and lifestyle factors. Recent research has identified the role of neuroinflammation in disease progression, with microglial activation contributing to neuronal damage. Biomarker development has advanced significantly, with cerebrospinal fluid tau and amyloid-beta levels, along with neuroimaging techniques such as amyloid PET scans, enabling earlier diagnosis. Therapeutic approaches have evolved from symptomatic treatments like cholinesterase inhibitors to disease-modifying therapies targeting amyloid pathology. Monoclonal antibodies such as aducanumab and lecanemab have shown promise in clinical trials by reducing amyloid burden, though their clinical efficacy remains debated. Current research emphasizes combination therapies, lifestyle interventions, and personalized medicine approaches based on genetic risk factors and biomarker profiles."""],

            ["""Type 2 diabetes mellitus is a chronic metabolic disorder characterized by insulin resistance and progressive beta-cell dysfunction. The global prevalence has increased dramatically, largely attributed to sedentary lifestyles, obesity, and dietary changes. Pathophysiology involves multiple organ systems, including skeletal muscle, liver, adipose tissue, and pancreatic islets. Early intervention with lifestyle modifications remains the cornerstone of management, emphasizing dietary changes, regular physical activity, and weight management. Pharmacological treatment has expanded beyond traditional metformin therapy to include newer drug classes such as GLP-1 receptor agonists, SGLT-2 inhibitors, and DPP-4 inhibitors. These medications offer additional benefits including cardiovascular protection and weight reduction. Continuous glucose monitoring and insulin pump therapy have revolutionized diabetes management, enabling better glycemic control and reducing hypoglycemic episodes. Emerging therapies focus on beta-cell preservation, immunomodulation, and artificial pancreas systems for optimal glucose regulation."""],

            ["""Cancer immunotherapy has emerged as a paradigm shift in oncology, harnessing the body's immune system to fight malignancies. Checkpoint inhibitors, particularly PD-1/PD-L1 and CTLA-4 blockers, have demonstrated remarkable efficacy across various cancer types by removing inhibitory signals that prevent T-cell activation. CAR-T cell therapy represents another breakthrough, involving genetic modification of patient T-cells to recognize and eliminate cancer cells expressing specific antigens. Clinical success has been particularly notable in hematologic malignancies, with FDA approvals for multiple CAR-T products. However, challenges include cytokine release syndrome, neurotoxicity, and limited efficacy in solid tumors due to the immunosuppressive tumor microenvironment. Current research focuses on combination therapies, novel targets, and strategies to overcome resistance mechanisms. Biomarker development is crucial for patient selection, with tumor mutational burden, microsatellite instability, and immune gene signatures showing predictive value for treatment response."""]
        ],
        inputs=input_text,
        label="Click any example to test:"
    )

In [28]:
# Launch the interface
if __name__ == "__main__":
    # Launch settings gathered in one place; the port is left to Gradio's default
    launch_options = {
        "share": True,
        "server_name": "0.0.0.0",
        # "server_port": 7860,
        "show_error": True,
    }
    demo.launch(**launch_options)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://1b78f5945f7f7b7c46.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


In [25]:
import os

from huggingface_hub import login

# Get your token from https://huggingface.co/settings/tokens
# Never paste the token into the notebook: it would be saved and shared with
# the file. Read it from the HF_TOKEN environment variable instead; when that
# is unset or empty, token=None makes login() ask for it interactively.
login(token=os.environ.get("HF_TOKEN") or None)

In [26]:
from transformers import BartTokenizer, BartForConditionalGeneration

# Path to your model in Google Drive
model_path = "/content/drive/MyDrive/PubMed_Summarizer/bart-pubmed-final"  # Adjust path as needed

# Restore the saved weights and tokenizer from Drive
model = BartForConditionalGeneration.from_pretrained(model_path)
tokenizer = BartTokenizer.from_pretrained(model_path)

# Upload both artifacts to the same Hub repository (replace with your username)
model_name = "Janinduu/bart-pubmed-biomedical"
for artifact in (model, tokenizer):
    artifact.push_to_hub(model_name)

print(f"✅ Model uploaded successfully to: https://huggingface.co/{model_name}")

Uploading...:   0%|          | 0.00/558M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

✅ Model uploaded successfully to: https://huggingface.co/Janinduu/bart-pubmed-biomedical
