# Extract BioProject datasets pipeline

This notebook restores the original interactive pipeline helpers for querying NCBI BioProject and SRA metadata.


In [8]:
"""Utilities for extracting BioProject datasets from NCBI."""
from __future__ import annotations

from pathlib import Path
from typing import Dict, Iterable, List, Sequence
import xml.etree.ElementTree as ET

import pandas as pd
from Bio import Entrez


# Public names exported by ``from <module> import *``; kept in sorted order.
# The private helper ``_parse_library_layout`` is deliberately not listed.
__all__ = [
    "BIOPROJECT_QUERY_DEFAULT",
    "fetch_bioproject_details",
    "fetch_srr_details",
    "load_bioproject_table",
    "map_bioproject_to_biosample",
    "map_bioproject_to_srr",
    "run_pipeline",
    "search_bioprojects",
    "setup_entrez",
    "summarize_samples",
    "write_bioproject_srr_mapping",
    "write_bioproject_table",
]


# Search term used by ``run_pipeline`` when the caller does not pass ``query``:
# records whose organism is "Exaiptasia diaphana" and that match "microbiome".
BIOPROJECT_QUERY_DEFAULT = '("Exaiptasia diaphana"[Organism] AND microbiome)'


def setup_entrez(email: str) -> None:
    """Configure the Entrez client with a contact email address.

    The value is stored on the imported ``Entrez`` module itself (a global
    setting), not on a per-request object, so it only needs to be set once
    before the other helpers in this module are called.

    Args:
        email: Contact address assigned to ``Entrez.email``.
    """

    Entrez.email = email


def search_bioprojects(query: str, retmax: int = 100) -> Dict:
    """Search the BioProject database with an Entrez ESearch request.

    Args:
        query: Entrez search term.
        retmax: Value forwarded as the ``retmax`` argument of the search.

    Returns:
        The parsed search response as produced by ``Entrez.read``.
    """
    response = Entrez.esearch(db="bioproject", term=query, retmax=retmax)
    try:
        parsed = Entrez.read(response)
    finally:
        # Close the handle whether or not parsing succeeded.
        response.close()
    return parsed


def fetch_bioproject_details(id_list: Sequence[str]) -> List[Dict]:
    """Fetch document summaries for a collection of BioProject identifiers.

    The identifiers are first copied into a list of strings. This makes the
    emptiness check reliable for one-shot iterables (a generator is always
    truthy) and allows integer identifiers, which ``",".join`` would reject.

    Args:
        id_list: BioProject identifiers. ``None`` and empty inputs are
            treated as "nothing to fetch".

    Returns:
        The ``DocumentSummary`` entries of the parsed response, or an empty
        list when no identifiers were supplied (no request is made then).
    """
    ids = [] if id_list is None else [str(project_id) for project_id in id_list]
    if not ids:
        return []

    handle = Entrez.efetch(
        db="bioproject",
        id=",".join(ids),
        rettype="docsum",
        retmode="xml",
    )
    try:
        parsed = Entrez.read(handle)
    finally:
        # Close the handle whether or not parsing succeeded.
        handle.close()
    return parsed["DocumentSummarySet"]["DocumentSummary"]


def write_bioproject_table(projects: Iterable[Dict], path: Path) -> pd.DataFrame:
    """Flatten BioProject summaries into a table and save it as TSV.

    Args:
        projects: Mappings holding the summary fields of one project each.
            Fields that are absent are written as empty strings.
        path: Destination file; missing parent directories are created.

    Returns:
        The DataFrame that was written, with one row per project.
    """
    # Output column name -> key looked up in each project summary.
    field_map = {
        "ProjectId": "Project_Id",
        "ProjectAcc": "Project_Acc",
        "ProjectDate": "Registration_Date",
        "ProjectTitle": "Project_Title",
        "ProjectDescription": "Project_Description",
        "OrganismName": "Organism_Name",
        "OrganismStrain": "Organism_Strain",
    }
    rows = [
        {column: project.get(key, "") for column, key in field_map.items()}
        for project in projects
    ]
    # Passing the columns explicitly keeps the header even when there are no rows.
    table = pd.DataFrame(rows, columns=list(field_map))

    destination = Path(path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    table.to_csv(destination, sep="\t", index=False)
    return table


def load_bioproject_table(path: Path) -> pd.DataFrame:
    """Read a tab-separated BioProject table back from disk.

    Args:
        path: Location of the TSV file.

    Returns:
        The file contents parsed by ``pandas.read_csv`` with its default
        type inference.
    """
    table = pd.read_csv(path, sep="\t")
    return table


def map_bioproject_to_biosample(project_ids: Iterable[str]) -> Dict[str, List[str]]:
    """Map BioProject identifiers to the identifiers of linked BioSample records.

    One ELink request (``dbfrom="bioproject"``, ``db="biosample"``) is issued
    per project. The collected values are the ``Id`` fields of the returned
    links, converted to ``str``. NOTE(review): ELink reports database UIDs,
    not accession strings such as ``SAMN...`` -- confirm before treating the
    values as accessions.

    Args:
        project_ids: BioProject identifiers; each is converted with ``str``.

    Returns:
        A dict keyed by the stringified project identifier. Each value is the
        list of linked identifiers, de-duplicated in first-seen order; it is
        empty when the project has no BioSample links.
    """
    mapping: Dict[str, List[str]] = {}
    for proj in project_ids:
        handle = Entrez.elink(dbfrom="bioproject", db="biosample", id=str(proj))
        try:
            link_sets = Entrez.read(handle)
        finally:
            handle.close()

        # A dict preserves insertion order and gives O(1) duplicate checks,
        # avoiding a quadratic ``in list`` scan for projects with many links.
        unique_ids: Dict[str, None] = {}
        for link_set in link_sets:
            if "LinkSetDb" not in link_set:
                continue
            for link_db in link_set["LinkSetDb"]:
                if link_db.get("DbTo") != "biosample":
                    continue
                for link_entry in link_db.get("Link", []):
                    unique_ids.setdefault(str(link_entry["Id"]), None)
        mapping[str(proj)] = list(unique_ids)
    return mapping


def map_bioproject_to_srr(project_ids: Iterable[str]) -> Dict[str, List[str]]:
    """Map BioProject identifiers to the identifiers of linked SRA records.

    One ELink request (``dbfrom="bioproject"``, ``db="sra"``) is issued per
    project. The collected values are the ``Id`` fields of the returned links,
    converted to ``str``. NOTE(review): despite the function name, ELink
    reports SRA database UIDs rather than ``SRR...`` run accessions -- confirm
    before treating the values as run accessions.

    Args:
        project_ids: BioProject identifiers; each is converted with ``str``.

    Returns:
        A dict keyed by the stringified project identifier. Each value is the
        list of linked identifiers, de-duplicated in first-seen order; it is
        empty when the project has no SRA links.
    """
    mapping: Dict[str, List[str]] = {}
    for project_id in project_ids:
        handle = Entrez.elink(dbfrom="bioproject", db="sra", id=str(project_id))
        try:
            link_sets = Entrez.read(handle)
        finally:
            handle.close()

        # A dict preserves insertion order and gives O(1) duplicate checks,
        # avoiding a quadratic ``in list`` scan for projects with many links.
        unique_ids: Dict[str, None] = {}
        for link_set in link_sets:
            if "LinkSetDb" not in link_set:
                continue
            for subset in link_set["LinkSetDb"]:
                if subset.get("DbTo") != "sra":
                    continue
                for entry in subset.get("Link", []):
                    unique_ids.setdefault(str(entry["Id"]), None)
        mapping[str(project_id)] = list(unique_ids)
    return mapping


def write_bioproject_srr_mapping(
    mapping: Dict[str, List[str]], path: Path
) -> pd.DataFrame:
    """Save the BioProject-to-SRA mapping as a two-column TSV file.

    Args:
        mapping: Project identifier -> list of linked identifiers. Projects
            with an empty list contribute no rows.
        path: Destination file; missing parent directories are created.

    Returns:
        The written DataFrame with columns ``BioProject`` and ``SRR``, one
        row per (project, linked identifier) pair.
    """
    pairs = [
        {"BioProject": project_id, "SRR": srr_id}
        for project_id, srr_ids in mapping.items()
        for srr_id in srr_ids
    ]
    # Explicit columns keep the header even when there are no pairs.
    table = pd.DataFrame(pairs, columns=["BioProject", "SRR"])

    destination = Path(path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    table.to_csv(destination, sep="\t", index=False)
    return table


def _parse_library_layout(lib_descriptor: ET.Element | None) -> str:
    """Return the tag of the first child of ``LIBRARY_LAYOUT``.

    An empty string is returned when no descriptor is given, when it has no
    direct ``LIBRARY_LAYOUT`` child, or when that child has no children.
    """
    layout_elem = (
        None if lib_descriptor is None else lib_descriptor.find("LIBRARY_LAYOUT")
    )
    if layout_elem is None:
        return ""
    # Iterate instead of truth-testing the element: only the first child
    # matters, and an element without children simply falls through.
    for first_child in layout_elem:
        return first_child.tag
    return ""


def _child_attr(parent: ET.Element, tag: str, attr: str) -> str:
    """Return attribute ``attr`` of the first direct child ``tag``, or ``""``."""
    child = parent.find(tag)
    if child is None:
        return ""
    return child.attrib.get(attr, "")


def _exp_xml_to_row(bioproject: str, exp_xml: str) -> Dict[str, str]:
    """Flatten one ``ExpXml`` fragment into a row keyed by output column."""
    # The fragment is wrapped in a synthetic <root> so that Summary,
    # Submitter, Experiment, ... can be looked up as its direct children.
    root = ET.fromstring("<root>" + exp_xml + "</root>")

    summary_elem = root.find("Summary")
    if summary_elem is None:
        title = platform = instrument = ""
        runs = spots = bases = ""
    else:
        title = summary_elem.findtext("Title", default="")
        platform = summary_elem.findtext("Platform", default="")
        instrument = _child_attr(summary_elem, "Platform", "instrument_model")
        runs = _child_attr(summary_elem, "Statistics", "total_runs")
        spots = _child_attr(summary_elem, "Statistics", "total_spots")
        bases = _child_attr(summary_elem, "Statistics", "total_bases")

    library = root.find("Library_descriptor")
    if library is None:
        lib_name = lib_strategy = lib_source = lib_layout = ""
    else:
        lib_name = library.findtext("LIBRARY_NAME", default="")
        lib_strategy = library.findtext("LIBRARY_STRATEGY", default="")
        lib_source = library.findtext("LIBRARY_SOURCE", default="")
        lib_layout = _parse_library_layout(library)

    return {
        "BioProjectId": bioproject,
        "Title": title,
        "Platform": platform,
        "Instrument": instrument,
        "Runs": runs,
        "Spots": spots,
        "Bases": bases,
        "Submitter": _child_attr(root, "Submitter", "acc"),
        "Experiment": _child_attr(root, "Experiment", "acc"),
        "Study": _child_attr(root, "Study", "acc"),
        "Organism": _child_attr(root, "Organism", "ScientificName"),
        "Sample": _child_attr(root, "Sample", "acc"),
        "Library": lib_name,
        "Strategy": lib_strategy,
        "Source": lib_source,
        "Layout": lib_layout,
        "Bioproject": root.findtext("Bioproject", default=""),
        "Biosample": root.findtext("Biosample", default=""),
    }


def fetch_srr_details(mapping: Dict[str, List[str]], path: Path) -> pd.DataFrame:
    """Fetch SRA summaries for every BioProject and save them as TSV.

    For each project with at least one linked identifier, a single ESummary
    request against the ``sra`` database is made and the ``ExpXml`` field of
    every returned summary is flattened into one output row.

    Args:
        mapping: Project identifier -> list of linked SRA identifiers.
            Projects with an empty list are skipped without a request.
        path: Destination file; missing parent directories are created.

    Returns:
        The written DataFrame, one row per returned summary. Missing
        elements or attributes are represented by empty strings.
    """
    columns = [
        "BioProjectId",
        "Title",
        "Platform",
        "Instrument",
        "Runs",
        "Spots",
        "Bases",
        "Submitter",
        "Experiment",
        "Study",
        "Organism",
        "Sample",
        "Library",
        "Strategy",
        "Source",
        "Layout",
        "Bioproject",
        "Biosample",
    ]
    records = []
    for bioproject, srr_ids in mapping.items():
        if not srr_ids:
            continue
        handle = Entrez.esummary(db="sra", id=",".join(srr_ids), rettype="text")
        try:
            summaries = Entrez.read(handle)
        finally:
            handle.close()
        records.extend(
            _exp_xml_to_row(bioproject, summary["ExpXml"]) for summary in summaries
        )

    # Explicit columns fix the column order and keep the header when empty.
    table = pd.DataFrame(records, columns=columns)
    destination = Path(path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    table.to_csv(destination, sep="\t", index=False)
    return table


def summarize_samples(srr_table: pd.DataFrame) -> pd.Series:
    """Count the rows of ``srr_table`` for each ``BioProjectId`` value.

    Args:
        srr_table: Table with a ``BioProjectId`` column.

    Returns:
        A Series indexed by ``BioProjectId`` holding the number of rows in
        each group.

    Raises:
        ValueError: If ``srr_table`` has no ``BioProjectId`` column.
    """
    if "BioProjectId" in srr_table.columns:
        per_project = srr_table.groupby("BioProjectId")
        return per_project.size()
    raise ValueError("SRR table must contain a 'BioProjectId' column")


def run_pipeline(
    email: str,
    output_dir: Path,
    query: str = BIOPROJECT_QUERY_DEFAULT,
    retmax: int = 100,
) -> Dict[str, pd.DataFrame | pd.Series | Dict[str, List[str]]]:
    """Run every pipeline stage in order and collect the intermediate results.

    Stages: configure Entrez, search BioProject, fetch project summaries,
    write the project table, link projects to BioSample and SRA, write the
    project-to-SRA mapping, fetch SRA details, and summarise them.

    Args:
        email: Contact address passed to ``setup_entrez``.
        output_dir: Directory that receives ``bioproject_table.tsv``,
            ``bioproject_srr_table.tsv`` and ``bioproject_srr_details.tsv``;
            it is created if missing.
        query: Search term for the BioProject search.
        retmax: ``retmax`` value for the BioProject search.

    Returns:
        A dict with the keys ``search_results``, ``bioproject_df``,
        ``bioproject_to_biosample``, ``bioproject_to_srr``,
        ``bioproject_srr_df``, ``srr_table`` and ``sample_summary``.
    """
    setup_entrez(email)

    # Search, then fetch a summary for every identifier that was found.
    search_results = search_bioprojects(query=query, retmax=retmax)
    projects = fetch_bioproject_details(search_results.get("IdList", []))

    out_root = Path(output_dir)
    out_root.mkdir(parents=True, exist_ok=True)

    bioproject_df = write_bioproject_table(
        projects, out_root / "bioproject_table.tsv"
    )

    # The link lookups are keyed by the project identifiers from the table.
    project_ids = bioproject_df["ProjectId"].astype(str).tolist()
    biosample_links = map_bioproject_to_biosample(project_ids)
    sra_links = map_bioproject_to_srr(project_ids)

    bioproject_srr_df = write_bioproject_srr_mapping(
        sra_links, out_root / "bioproject_srr_table.tsv"
    )
    srr_table = fetch_srr_details(
        sra_links, out_root / "bioproject_srr_details.tsv"
    )

    return {
        "search_results": search_results,
        "bioproject_df": bioproject_df,
        "bioproject_to_biosample": biosample_links,
        "bioproject_to_srr": sra_links,
        "bioproject_srr_df": bioproject_srr_df,
        "srr_table": srr_table,
        "sample_summary": summarize_samples(srr_table),
    }




## Example execution

Edit the call below to use your own email address and output directory, then run the cell to execute the pipeline.


In [2]:
# Run the whole pipeline with the default query; the TSV outputs are written
# to ../data (relative to the notebook's working directory).
results = run_pipeline(
     email="lukas.becker@hhu.de",
     output_dir=Path("../data"),
 )
# Preview the first per-BioProject row counts.
results["sample_summary"].head()


BioProjectId
1089063       2
1336731      40
360672        6
524291       58
576020     1510
dtype: int64

In [5]:
# List the intermediate results returned by run_pipeline.
results.keys()

dict_keys(['search_results', 'bioproject_df', 'bioproject_to_biosample', 'bioproject_to_srr', 'bioproject_srr_df', 'srr_table', 'sample_summary'])

In [7]:
# Inspect the raw parsed search response (hit count, IdList, query translation).
results["search_results"]

{'Count': '12', 'RetMax': '12', 'RetStart': '0', 'IdList': ['1336731', '1089063', '988282', '907389', '650220', '630329', '592182', '588472', '576556', '576020', '524291', '360672'], 'TranslationSet': [], 'TranslationStack': [{'Term': '"Exaiptasia diaphana"[Organism]', 'Field': 'Organism', 'Count': '47', 'Explode': 'Y'}, {'Term': 'microbiome[All Fields]', 'Field': 'All Fields', 'Count': '27521', 'Explode': 'N'}, 'AND', 'GROUP'], 'QueryTranslation': '"Exaiptasia diaphana"[Organism] AND microbiome[All Fields]'}