In [1]:
import csv
import os
from pathlib import Path
from typing import Dict, List

import pandas as pd
from Bio import Entrez

In [3]:
# setup
def setup_result_directory(result_dir:str)->Path:
    '''
    Creates the result directory described by the specified string.

    If result_dir carries a file suffix (e.g. "../results/table.csv") it is treated
    as a result file path and only its parent directory is created. Otherwise the
    directory itself is created together with all missing parents. Directories that
    already exist are left untouched.

    :param
        result_dir: directory path (or result file path) as string
    :return:
        result_dir_path: Path based on the specified result_dir string argument.
    :raises
        Exception: if the directory cannot be created.
    '''
    try:
        result_dir_path = Path(result_dir)
        # A suffix indicates a file path: creating a directory under that name
        # would prevent the result file from being written afterwards.
        if result_dir_path.suffix:
            directory_to_create = result_dir_path.parent
        else:
            directory_to_create = result_dir_path
        directory_to_create.mkdir(parents=True, exist_ok=True)
        return result_dir_path
    except Exception as e:
        raise Exception("[-] ERROR creating result directory with exception: {}".format(e)) from e

def setup_entrez(email:str) -> None:
    '''
    Registers the user email on the biopython Entrez module.

    The value is stored in the module-level Entrez.email attribute, so it applies
    to every Entrez call made afterwards in this session.

    :param
        email: user email as string
    :return:
        None
    '''
    Entrez.email = email

In [28]:
# pipeline step 1 - query NCBI based on a search string
def fetch_bioprojects(query:str, max_returns=100)->List[str]:
    '''
    Function to perform an Entrez search query on the bioproject database.

    :param
        query: search string
    :param
        max_returns: maximal number of bioprojects to return (passed to Entrez.esearch as retmax)
    :return:
        bioproject_search_results["IdList"]: List[str] of bioproject identifier that can be used for
        fetching detailed information
    :raises
        Exception: if the search request or the parsing of its response fails.
    '''
    try:
        bioproject_search_handle = Entrez.esearch(db="bioproject", term=query, retmax=max_returns)
        try:
            bioproject_search_results = Entrez.read(bioproject_search_handle)
        finally:
            # release the handle even if parsing the response fails
            bioproject_search_handle.close()
        # report the actual query instead of a hard-coded project description
        print("[*] Info: Found {} BioProject's matching the query: {}".format(bioproject_search_results["Count"],
                                                                             query))
        return bioproject_search_results["IdList"]
    except Exception as e:
        raise Exception("[-] ERROR during fetching bioproject data with exception: {}".format(e)) from e

# pipeline step 2 - get detailed information based on Entrez bioproject identifier
def fetch_detailed_bioproject_infos(bioproject_identifier:List[int])->List[Dict]:
    '''
    Function for extracting detailed information of bioproject entries.

    :param
        bioproject_identifier: list of bioproject identifier; integer and string identifier are
        both accepted. An empty list returns an empty result without contacting NCBI.
    :return:
        bioproject_docsum: List[Dict]: detailed information of the requested bioprojects.
    :raises
        Exception: if the fetch request or the parsing of its response fails.
    '''
    try:
        # str() keeps the join working for int identifier as well as for string identifier
        to_fetch_ids = ",".join(str(identifier) for identifier in bioproject_identifier)
        if not to_fetch_ids:
            # nothing to request: avoid an efetch call with an empty id argument
            print("[*] Info: Found detailed information for: {} associated BioProject's".format(0))
            return []
        bioproject_docsum_handle = Entrez.efetch(db="bioproject", id=to_fetch_ids, rettype="docsum", retmode="xml")
        try:
            bioproject_docsum = Entrez.read(bioproject_docsum_handle)["DocumentSummarySet"]["DocumentSummary"]
        finally:
            # release the handle even if parsing the response fails
            bioproject_docsum_handle.close()
        print("[*] Info: Found detailed information for: {} associated BioProject's".format(len(bioproject_docsum)))
        return bioproject_docsum
    except Exception as e:
        raise Exception("[-] ERROR during fetching bioproject detailed informarion: {}".format(e)) from e

# pipeline step 3 - writing results into a tab-separated result file
def write_bioproject_result_file(bioprojects_resultfile:str,project_document_summary:List[Dict])->None:
    '''
    Function for extracting document summary information fields. The information are written into a
    tab-separated result file with one header line and one line per bioproject.

    Rows are written with the csv module, so fields that contain tabs, newlines or quotes are quoted
    instead of corrupting the table layout. The file is always written as UTF-8.

    :param
        bioprojects_resultfile: str: path to result file
    :param
        project_document_summary: List[Dict]: Entrez document summary
    :return:
    :raises
        Exception: if the file cannot be written or a summary entry lacks one of the expected fields.
    '''
    header = ["ProjectId", "ProjectAcc", "ProjectDate", "ProjectTitle",
              "ProjectDescription", "OrganismName", "OrganismStrain"]
    # document summary keys, in the same order as the header columns
    summary_keys = ["Project_Id", "Project_Acc", "Registration_Date", "Project_Title",
                    "Project_Description", "Organism_Name", "Organism_Strain"]
    try:
        # newline="" lets the csv writer control line endings; "\n" matches the previous layout
        with open(bioprojects_resultfile, "w", encoding="utf-8", newline="") as biofile:
            writer = csv.writer(biofile, delimiter="\t", lineterminator="\n")
            writer.writerow(header)
            for project in project_document_summary:
                writer.writerow([project[key] for key in summary_keys])
        print("[*] INFO: done writing result file with {} entries.".format(len(project_document_summary)))
    except Exception as e:
        raise Exception("[-] ERROR during BioProject result file writing with exception: {}".format(e)) from e

In [25]:
def read_bioproject_information(bioproject_document_summary_csv:str, separator=",")->pd.DataFrame:
    '''
    Loads the bioproject result table into a pandas DataFrame.

    :param
        bioproject_document_summary_csv: Path to bioproject result file.
    :param
        separator: column delimiter of the result file, handed to pandas.read_csv as sep.
    :return:
        pd.DataFrame: table parsed from the result file.
    '''
    try:
        bioproject_table = pd.read_csv(bioproject_document_summary_csv, sep=separator)
    except Exception as read_error:
        raise Exception("[-] ERROR opening pandas table with exception: {}".format(read_error))
    return bioproject_table

In [26]:
# search term for the NCBI bioproject database and location of the tab-separated result table
query = '("Exaiptasia diaphana"[Organism] AND microbiome)'
bioproject_detailed_info_path = "../results/exaiptasia_microbiome_studies.csv"

setup_entrez("lukas.becker@hhu.de")
# make sure the result directory exists: open() in step 3 fails if it is missing
setup_result_directory(bioproject_detailed_info_path)
# step 1 get bioproject identifier
bioproject_target_ids = fetch_bioprojects(query=query)
# step 2 get detailed information
bioproject_document_summary = fetch_detailed_bioproject_infos(bioproject_target_ids)
# step 3 write result file
write_bioproject_result_file(bioproject_detailed_info_path,bioproject_document_summary)
# step 4 load the written table; the writer uses tabs, so the default comma separator is overridden
bioproject_dataframe = read_bioproject_information(bioproject_detailed_info_path, separator="\t")

[*] Info: Found 13 BioProject's associated to Exaiptasia metagenomic projects.
[*] Info: Found detailed information for: 13 associated BioProject's
[*] INFO: done writing result file with 13 entries.


In [27]:
# display the table that was loaded from the bioproject result file
bioproject_dataframe

Unnamed: 0,ProjectId,ProjectAcc,ProjectDate,ProjectTitle,ProjectDescription,OrganismName,OrganismStrain
0,1364369,PRJEB101302,2025/11/16 00:00,Exaiptasia longitudinal microbiomes across tem...,"Under global warming scenarios, the deteriorat...",,
1,1336731,PRJNA1336731,2025/09/30 00:00,Symbiotic state modulates microbiome recovery ...,This study explored microbiome depletion and r...,,
2,1089063,PRJNA1089063,2024/03/18 00:00,Exaiptasia diaphana strain:F003 | isolate:F003...,Coral reefs depend upon a functional symbiosis...,Exaiptasia diaphana,F003
3,988282,PRJNA988282,2023/06/28 00:00,Microbiome of Exaiptasia diaphana and Berghia ...,The cladobranch Berghia stephanieae is able to...,,
4,907389,PRJNA907389,2022/12/01 00:00,Bacterial microbiome variation across body par...,16s rRNA sequencing of bacterial communities i...,,
5,650220,PRJNA650220,2020/08/02 00:00,Microbiome characterization of Exaiptasia diap...,Culture-dependent (isolation and 16S rRNA gene...,Exaiptasia diaphana,
6,630329,PRJNA630329,2020/05/04 00:00,Assessment of a ROS-targeted bacterial probiot...,Probiotic inoculation is one of several interv...,Exaiptasia diaphana,AIMS2-4
7,592182,PRJNA592182,2019/11/27 00:00,Comparison of gene expression in Exaiptasia pa...,Emergence of the symbiotic lifestyle fostered ...,Exaiptasia diaphana,
8,588472,PRJNA588472,2019/11/08 00:00,Comparison of gene expression in symbiotic and...,Emergence of the symbiotic lifestyle fostered ...,Exaiptasia diaphana,
9,576556,PRJNA576556,2019/10/09 00:00,Reduced microbiome of Exaiptasia diaphana,16S rRNA metabarcoding of the V5-V6 region to ...,,
