# Extract BioProject datasets pipeline

This notebook provides reusable helpers for fetching BioProject metadata, mapping BioSamples and SRA runs, and persisting the resulting tables. It is intended to be imported as a module or run interactively inside Jupyter.


## Usage

1. Configure the Entrez contact email with `setup_entrez` (`run_pipeline` calls it for you).
2. Query BioProject using `search_bioprojects` and fetch project records with `fetch_bioproject_details`.
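3. Persist BioProject metadata with `write_bioproject_table` and the BioProject-to-SRA mapping with `write_bioproject_srr_mapping`; BioSample links are returned as a DataFrame by `map_bioproject_to_biosample`.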
4. Call `run_pipeline` for an end-to-end execution that writes TSV outputs to a directory.


In [None]:
"Utilities for extracting BioProject datasets from NCBI."
from __future__ import annotations

from collections.abc import Iterable, Sequence
from pathlib import Path
from typing import Dict, List

import pandas as pd
from Bio import Entrez


# Default Entrez search term; used as the `query` default of `run_pipeline`.
BIOPROJECT_QUERY_DEFAULT = '("Exaiptasia diaphana"[Organism] AND microbiome)'
# Default number of identifiers per batch; used as the `size` default of `_chunked`.
DEFAULT_CHUNK_SIZE = 50


In [None]:
def setup_entrez(email: str) -> None:
    """Register ``email`` as the contact address on the ``Entrez`` module.

    The value is stored in ``Entrez.email`` so that it applies to the Entrez
    calls made afterwards by the other helpers in this file.
    """
    Entrez.email = email


def search_bioprojects(query: str, retmax: int = 100) -> Dict:
    """Search the ``bioproject`` Entrez database and return the parsed reply.

    Args:
        query: Entrez search term.
        retmax: Value forwarded to ``Entrez.esearch`` as ``retmax``.

    Returns:
        Whatever ``Entrez.read`` produces for the search response.
    """
    response = Entrez.esearch(db="bioproject", term=query, retmax=retmax)
    try:
        parsed = Entrez.read(response)
    finally:
        # Close the handle whether or not parsing succeeded.
        response.close()
    return parsed


def fetch_bioproject_details(id_list: Sequence[str]) -> List[Dict]:
    """Return the document summaries for the given BioProject identifiers.

    Args:
        id_list: Identifiers to fetch; they are comma-joined into a single
            ``Entrez.efetch`` request.

    Returns:
        The ``DocumentSummary`` entries of the parsed response, or an empty
        list when ``id_list`` is empty (no request is made in that case).
    """
    if not id_list:
        return []

    joined_ids = ",".join(id_list)
    response = Entrez.efetch(db="bioproject", id=joined_ids, rettype="docsum", retmode="xml")
    try:
        parsed = Entrez.read(response)
        return parsed["DocumentSummarySet"]["DocumentSummary"]
    finally:
        # Runs on both the normal return and on a parse/lookup error.
        response.close()


In [None]:
def extract_project_data_types(project: Dict) -> List[str]:
    """Collect the data-type labels recorded on a BioProject summary.

    The labels are read from ``project["ProjectDataTypeSet"]["ProjectDataType"]``
    (a single mapping or a list of mappings).  When that yields nothing, the
    first non-blank value found under ``project["ProjectType"]`` is used.

    Args:
        project: Mapping describing one BioProject.

    Returns:
        Stripped, de-duplicated labels in first-seen order; possibly empty.
    """

    def _as_text(raw) -> str:
        # Values may be wrapped in a mapping carrying "#text" / "@value".
        if isinstance(raw, dict):
            raw = raw.get("#text") or raw.get("@value")
        return raw.strip() if isinstance(raw, str) else ""

    found: List[str] = []

    container = project.get("ProjectDataTypeSet")
    entries = container.get("ProjectDataType") if isinstance(container, dict) else None
    if isinstance(entries, dict):
        entries = [entries]
    if not isinstance(entries, list):
        entries = []

    for entry in entries:
        if not isinstance(entry, dict):
            continue
        looked_up = (entry.get(name) for name in ("DataType", "data_type", "Data_Type"))
        raw_values = [raw for raw in looked_up if raw]
        if not raw_values:
            # No named field present: fall back to the entry's own text/value.
            own_value = entry.get("#text") or entry.get("@value")
            raw_values = [own_value] if own_value else []
        for raw in raw_values:
            label = _as_text(raw)
            if label and label not in found:
                found.append(label)

    if found:
        return found

    project_type = project.get("ProjectType")
    if not isinstance(project_type, dict):
        return found
    for name in ("ProjectTypeSubmission", "SubmissionType", "#text", "@value"):
        label = _as_text(project_type.get(name))
        if label:
            # Only the first usable fallback value is reported.
            return [label]
    return found


def write_bioproject_table(projects: Iterable[Dict], path: Path) -> pd.DataFrame:
    """Flatten BioProject summaries into a table, save it as TSV, and return it.

    Args:
        projects: BioProject mappings; missing fields become empty strings.
        path: Destination file. Parent directories are created when absent.

    Returns:
        The DataFrame that was written, one row per project.
    """
    # Output column -> key looked up on each project mapping.
    source_fields = {
        "ProjectRecordId": "Project_Id",
        "ProjectAccession": "Project_Acc",
        "ProjectDate": "Registration_Date",
        "ProjectTitle": "Project_Title",
        "ProjectDescription": "Project_Description",
        "OrganismName": "Organism_Name",
        "OrganismStrain": "Organism_Strain",
    }

    rows = []
    for project in projects:
        row = {column: project.get(key, "") for column, key in source_fields.items()}
        # Multiple data types share one cell, separated by "; ".
        row["ProjectDataTypes"] = "; ".join(extract_project_data_types(project))
        rows.append(row)

    table = pd.DataFrame(rows, columns=[*source_fields, "ProjectDataTypes"])
    destination = Path(path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    table.to_csv(destination, sep="\t", index=False)
    return table


def load_bioproject_table(path: Path) -> pd.DataFrame:
    """Load a BioProject table that was previously written to disk.

    Every column is read back as text and no value is interpreted as missing.
    With pandas' default inference the record identifiers would come back as
    integers (dropping leading zeros and no longer matching the string
    identifiers used in the mapping tables), empty cells would turn into NaN,
    and literal values such as "NA" or "null" would be treated as missing.

    Args:
        path: Location of the tab-separated file.

    Returns:
        The table with string-typed columns; empty cells are empty strings.
    """
    return pd.read_csv(path, sep="\t", dtype=str, keep_default_na=False)


In [None]:
def _chunked(sequence: Sequence[str], size: int = DEFAULT_CHUNK_SIZE) -> Iterable[Sequence[str]]:
    """Yield consecutive slices of ``sequence`` holding at most ``size`` items each."""
    offsets = range(0, len(sequence), size)
    yield from (sequence[offset : offset + size] for offset in offsets)


def _map_bioproject_links(
    project_ids: Sequence[str], target_db: str, target_column: str
) -> pd.DataFrame:
    """Collect the ELink targets of each BioProject in ``target_db``.

    Args:
        project_ids: BioProject identifiers, queried in batches via ``_chunked``.
        target_db: Entrez database to link to (also used to filter ``DbTo``).
        target_column: Name of the output column holding the linked identifier.

    Returns:
        DataFrame with columns ``BioProjectRecordId`` and ``target_column``,
        both as strings, without duplicate rows.
    """
    records: list[dict[str, str]] = []
    for chunk in _chunked(list(project_ids)):
        handle = Entrez.elink(dbfrom="bioproject", db=target_db, id=chunk)
        try:
            link_sets = Entrez.read(handle)
        finally:
            handle.close()
        for entry in link_sets:
            # A missing *or empty* IdList falls back to "" instead of raising
            # IndexError on the [0] lookup.
            source_ids = entry.get("IdList") or [""]
            source_id = str(source_ids[0])
            for linkset in entry.get("LinkSetDb") or []:
                if linkset.get("DbTo") != target_db:
                    continue
                for link in linkset.get("Link") or []:
                    target = link.get("Id")
                    if target:
                        records.append(
                            {
                                "BioProjectRecordId": source_id,
                                target_column: str(target),
                            }
                        )
    df = pd.DataFrame(records, columns=["BioProjectRecordId", target_column])
    if not df.empty:
        df = df.drop_duplicates().reset_index(drop=True)
    return df


def map_bioproject_to_biosample(project_ids: Sequence[str]) -> pd.DataFrame:
    """Link BioProjects to BioSamples via Entrez elink.

    Returns:
        DataFrame with string columns ``BioProjectRecordId`` and
        ``BioSampleRecordId``; empty (but with both columns) when nothing is
        linked.
    """
    return _map_bioproject_links(project_ids, "biosample", "BioSampleRecordId")


def map_bioproject_to_srr(project_ids: Sequence[str]) -> pd.DataFrame:
    """Link BioProjects to SRA records via Entrez elink.

    The ``SraRunId`` column holds the identifiers exactly as ELink reports
    them for the ``sra`` database. ELink returns Entrez UIDs, so these are
    database record identifiers rather than ``SRR...`` run accessions.

    Returns:
        DataFrame with string columns ``BioProjectRecordId`` and ``SraRunId``;
        empty (but with both columns) when nothing is linked.
    """
    return _map_bioproject_links(project_ids, "sra", "SraRunId")


In [None]:
def fetch_srr_details(id_list: Sequence[str]) -> pd.DataFrame:
    """Fetch run metadata ("runinfo" CSV) for a collection of SRA identifiers.

    Identifiers are requested in batches via ``_chunked`` and the per-batch
    tables are concatenated.

    Args:
        id_list: Identifiers forwarded to ``Entrez.efetch`` for the ``sra``
            database. Any iterable is accepted; items are converted to ``str``.

    Returns:
        One DataFrame with the rows of all batches, or an empty DataFrame when
        there is nothing to fetch or no batch produced any rows.
    """
    # Materialise first: a pandas Series has no unambiguous truth value and a
    # generator is always truthy, so test the list instead of the argument.
    identifiers = [str(identifier) for identifier in id_list]
    if not identifiers:
        return pd.DataFrame()

    frames: list[pd.DataFrame] = []
    for chunk in _chunked(identifiers):
        handle = Entrez.efetch(db="sra", id=",".join(chunk), rettype="runinfo", retmode="text")
        try:
            frame = pd.read_csv(handle)
        except pd.errors.EmptyDataError:
            # An empty response body has no header to parse; treat it like a
            # batch without rows instead of aborting the remaining batches.
            continue
        finally:
            handle.close()
        if not frame.empty:
            frames.append(frame)
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)


def write_bioproject_srr_mapping(mapping: pd.DataFrame, path: Path) -> pd.DataFrame:
    """Save ``mapping`` as a tab-separated file and hand the same frame back.

    Args:
        mapping: Table to write (the index is not written).
        path: Destination file. Parent directories are created when absent.

    Returns:
        ``mapping`` itself, unchanged.
    """
    destination = Path(path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    mapping.to_csv(destination, sep="\t", index=False)
    return mapping


def summarize_samples(srr_details: pd.DataFrame) -> pd.DataFrame:
    """Aggregate run-level rows into one summary row per ``BioProject`` value.

    Args:
        srr_details: Run table providing the ``BioProject``, ``Run``,
            ``BioSample``, ``LibraryStrategy`` and ``LibraryLayout`` columns.

    Returns:
        DataFrame with the distinct run and BioSample counts plus the sorted,
        "; "-joined distinct library strategies and layouts of each group.
        Rows lacking a BioProject form their own group. An empty input gives
        an empty frame that still has the summary columns.
    """
    if srr_details.empty:
        return pd.DataFrame(
            columns=["BioProject", "RunCount", "BioSampleCount", "LibraryStrategies", "LibraryLayouts"]
        )

    def _join_distinct(values: pd.Series) -> str:
        # Missing and whitespace-only entries do not count as values.
        distinct = set()
        for item in values.dropna():
            if str(item).strip():
                distinct.add(item)
        return "; ".join(sorted(distinct))

    aggregations = {
        "RunCount": pd.NamedAgg(column="Run", aggfunc="nunique"),
        "BioSampleCount": pd.NamedAgg(column="BioSample", aggfunc="nunique"),
        "LibraryStrategies": pd.NamedAgg(column="LibraryStrategy", aggfunc=_join_distinct),
        "LibraryLayouts": pd.NamedAgg(column="LibraryLayout", aggfunc=_join_distinct),
    }
    per_project = srr_details.groupby("BioProject", dropna=False)
    return per_project.agg(**aggregations).reset_index()


In [None]:
def _normalize_sra_accession(value: str) -> str:
    """Return ``value`` as text, prefixing digit-only input with "SRR".

    Falsy input gives an empty string; any other value (including one that
    already starts with "SRR") is returned as its ``str()`` form unchanged.
    """
    text = str(value) if value else ""
    if text.isdigit():
        return "SRR" + text
    return text


def run_pipeline(
    email: str,
    output_dir: Path,
    query: str = BIOPROJECT_QUERY_DEFAULT,
    retmax: int = 100,
) -> Dict[str, pd.DataFrame]:
    """Execute the full BioProject to SRA extraction pipeline.

    Steps: configure Entrez, search BioProject, fetch the project summaries,
    link the projects to BioSample and SRA, fetch the run metadata of the
    linked SRA records, and summarise the runs per BioProject.

    Files written below ``output_dir`` (created when absent):
        * ``bioproject_table.tsv`` - always.
        * ``bioproject_srr_mapping.tsv`` - only when SRA links were found.
        * ``bioproject_srr_details.tsv`` - only when run metadata was returned.
        * ``bioproject_sample_summary.tsv`` - only when the summary has rows.

    Args:
        email: Contact address passed to ``setup_entrez``.
        output_dir: Directory receiving the TSV files.
        query: Entrez search term for the BioProject search.
        retmax: Forwarded to ``search_bioprojects``.

    Returns:
        Mapping with the keys ``projects``, ``bioproject_table``,
        ``biosample_map``, ``srr_map``, ``srr_details`` and ``sample_summary``.
    """

    setup_entrez(email)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    search_results = search_bioprojects(query=query, retmax=retmax)
    project_ids = search_results.get("IdList", [])

    projects = fetch_bioproject_details(project_ids)
    bioproject_table = write_bioproject_table(projects, output_dir / "bioproject_table.tsv")

    # Lets the link tables carry the accession next to the numeric record id.
    record_to_accession = {
        str(project.get("Project_Id", "")): project.get("Project_Acc", "") for project in projects
    }

    biosample_map = map_bioproject_to_biosample(project_ids)
    if not biosample_map.empty:
        biosample_map = biosample_map.assign(
            BioProjectAccession=biosample_map["BioProjectRecordId"]
            .map(record_to_accession)
            .fillna(""),
        )

    srr_map = map_bioproject_to_srr(project_ids)
    if not srr_map.empty:
        srr_map = srr_map.assign(
            BioProjectAccession=srr_map["BioProjectRecordId"]
            .map(record_to_accession)
            .fillna(""),
            # NOTE(review): kept so the TSV schema stays unchanged. The value is
            # the linked identifier with an "SRR" prefix, not an accession
            # looked up at NCBI - confirm before relying on it downstream.
            SraRunAccession=srr_map["SraRunId"].map(_normalize_sra_accession),
        )
        write_bioproject_srr_mapping(srr_map, output_dir / "bioproject_srr_mapping.tsv")

    # Fetch with the identifiers ELink returned. They are Entrez UIDs of the
    # sra database, which EFetch accepts directly; prefixing them with "SRR"
    # would request unrelated (or nonexistent) runs. dict.fromkeys removes
    # identifiers shared by several projects while keeping their order.
    sra_record_ids = (
        [] if srr_map.empty else list(dict.fromkeys(srr_map["SraRunId"].astype(str)))
    )
    srr_details = fetch_srr_details(sra_record_ids)
    srr_details_path = output_dir / "bioproject_srr_details.tsv"
    if not srr_details.empty:
        srr_details.to_csv(srr_details_path, sep="\t", index=False)

    sample_summary = summarize_samples(srr_details)
    summary_path = output_dir / "bioproject_sample_summary.tsv"
    if not sample_summary.empty:
        sample_summary.to_csv(summary_path, sep="\t", index=False)

    return {
        "projects": pd.DataFrame(projects),
        "bioproject_table": bioproject_table,
        "biosample_map": biosample_map,
        "srr_map": srr_map,
        "srr_details": srr_details,
        "sample_summary": sample_summary,
    }


# Names exported by a star-import of this module; the underscore-prefixed
# helpers (`_chunked`, `_normalize_sra_accession`) are not listed.
__all__ = [
    "BIOPROJECT_QUERY_DEFAULT",
    "DEFAULT_CHUNK_SIZE",
    "extract_project_data_types",
    "fetch_bioproject_details",
    "fetch_srr_details",
    "load_bioproject_table",
    "map_bioproject_to_biosample",
    "map_bioproject_to_srr",
    "run_pipeline",
    "search_bioprojects",
    "setup_entrez",
    "summarize_samples",
    "write_bioproject_srr_mapping",
    "write_bioproject_table",
]


## Example execution

Uncomment the lines in the cell below to run the full pipeline, replacing the email address with your own contact email and `output_dir` with the desired output directory.


In [None]:
# Example usage:
# results = run_pipeline(
#     email="lukas.becker@hhu.de",
#     output_dir=Path("../data"),
# )
# results["sample_summary"].head()
