In [3]:
import os
from xml.etree import ElementTree

import pandas as pd
from Bio import Entrez

In [4]:
# Entrez Setup
# Contact address attached to the Entrez requests made by this notebook.
# The ENTREZ_EMAIL environment variable takes precedence, so other users can
# run the notebook under their own address without editing this cell; the
# literal below remains the default when the variable is unset.
Entrez.email = os.environ.get("ENTREZ_EMAIL", "lukas.becker@hhu.de")

In [5]:
# Search term sent to the BioProject search below, assembled from its clauses.
query = (
    '("Exaiptasia diaphana"[Organism]'  # organism clause
    ' AND microbiome)'                  # free-text keyword clause
)

In [6]:
# 1 Query BioProject Database
# Search the BioProject database for `query`, asking for at most 100 ids.
search_handle = Entrez.esearch(db="bioproject", term=query, retmax=100)
try:
    # Parsed search response; later cells read "Count" and "IdList" from it.
    search_results = Entrez.read(search_handle)
finally:
    # Release the handle even when parsing the response fails.
    search_handle.close()

In [7]:
# Report the hit count taken from the "Count" field of the search response.
print(f"[*] Info: Found {search_results['Count']} BioProject's associated to Exaiptasia metagenomic projects.")

[*] Info: Found 12 BioProject's associated to Exaiptasia metagenomic projects.


In [8]:
# 2 Detailed searches based on fetched identifiers
ids = search_results["IdList"]
if ids:
    # One batched request for the document summaries of all matched ids.
    handle = Entrez.efetch(db="bioproject", id=",".join(ids), rettype="docsum", retmode="xml")
    try:
        projects = Entrez.read(handle)["DocumentSummarySet"]["DocumentSummary"]
    finally:
        # Release the handle even when parsing the response fails.
        handle.close()
else:
    # The search matched nothing: skip the request (it would be sent with an
    # empty id string) and let the following cells iterate over zero projects.
    projects = []

In [10]:
# 3 Writing Results Into CSV Table
# The file is tab-separated despite its .csv name; the path is kept because
# the next cell reads it back with sep="\t".
bioprojects_resultfile = "../data/bioproject_table.csv"

# Output column name -> key of the BioProject document summary.
bioproject_columns = {
    "ProjectId": "Project_Id",
    "ProjectAcc": "Project_Acc",
    "ProjectDate": "Registration_Date",
    "ProjectTitle": "Project_Title",
    "ProjectDescription": "Project_Description",
    "OrganismName": "Organism_Name",
    "OrganismStrain": "Organism_Strain",
}

# One flat record per project. A key missing from a summary becomes an empty
# field instead of aborting the export.
bioproject_records = [
    {column: str(project.get(key, "")) for column, key in bioproject_columns.items()}
    for project in projects
]

# Writing through pandas quotes any field that contains a tab, a quote or a
# line break (titles and descriptions are free text), so the table stays
# parseable. Passing `columns` keeps the header row even with zero projects.
pd.DataFrame(bioproject_records, columns=list(bioproject_columns)).to_csv(
    bioprojects_resultfile, sep="\t", index=False, encoding="utf-8"
)
print("[*] INFO: done writing result file with {} entries.".format(len(projects)))


[*] INFO: done writing result file with 12 entries.


In [11]:
# 4 Check Dataframe
# Load the table written above (read_table splits on tabs by default) and
# preview its first rows.
df = pd.read_table(bioprojects_resultfile)
df.head()

Unnamed: 0,ProjectId,ProjectAcc,ProjectDate,ProjectTitle,ProjectDescription,OrganismName,OrganismStrain
0,1336731,PRJNA1336731,2025/09/30 00:00,Symbiotic state modulates microbiome recovery ...,This study explored microbiome depletion and r...,,
1,1089063,PRJNA1089063,2024/03/18 00:00,Exaiptasia diaphana strain:F003 | isolate:F003...,Coral reefs depend upon a functional symbiosis...,Exaiptasia diaphana,F003
2,988282,PRJNA988282,2023/06/28 00:00,Microbiome of Exaiptasia diaphana and Berghia ...,The cladobranch Berghia stephanieae is able to...,,
3,907389,PRJNA907389,2022/12/01 00:00,Bacterial microbiome variation across body par...,16s rRNA sequencing of bacterial communities i...,,
4,650220,PRJNA650220,2020/08/02 00:00,Microbiome characterization of Exaiptasia diap...,Culture-dependent (isolation and 16S rRNA gene...,Exaiptasia diaphana,


In [39]:
# 5 Get BioSample Data
# NOTE: `projects` is rebound here from the document summaries to a list of
# project-id strings; the SRA cell further down iterates over this list.
projects = df["ProjectId"].astype(str).to_list()

# Maps each BioProject id (str) to the list of linked BioSample ids (str).
bioproject_to_biosample = {}
print("[*] INFO mapping bioproject accessions to biosample entries.")
for proj in projects:
    link_handle = Entrez.elink(dbfrom="bioproject", db="biosample", id=proj)
    try:
        link_results = Entrez.read(link_handle)
    finally:
        # Release the handle even when parsing the response fails.
        link_handle.close()

    # Dict keys de-duplicate the ids while keeping first-seen order.
    unique_biosample_ids = {}
    for link_set in link_results:
        link_set_dbs = link_set.get("LinkSetDb", [])
        if not link_set_dbs:
            # Project without linked BioSamples: there is no first link group
            # to read, so indexing [0] here would raise IndexError.
            continue
        # As before, only the first link group of each link set is used.
        for link_entry in link_set_dbs[0]["Link"]:
            unique_biosample_ids[str(link_entry["Id"])] = None

    biosample_ids = list(unique_biosample_ids)
    bioproject_to_biosample[proj] = biosample_ids
    print("\t[*] Found {} entries.".format(len(biosample_ids)))
print("[*] DONE")

[*] INFO mapping bioproject accessions to biosample entries.
	[*] Found 40 entries.
	[*] Found 2 entries.
	[*] Found 5 entries.
	[*] Found 96 entries.
	[*] Found 14 entries.
	[*] Found 3 entries.
	[*] Found 6 entries.
	[*] Found 16 entries.
	[*] Found 4 entries.
	[*] Found 4 entries.
	[*] Found 58 entries.
	[*] Found 6 entries.
[*] DONE


In [77]:
# 6 Get SRR data
# Maps each BioProject id (str) to the list of linked SRA ids (str). Projects
# without any SRA link are reported and left out of the mapping.
# NOTE(review): the stored values are the "Id" fields of the link records
# (numeric, e.g. 41020117 in the preview further down), not SRR run
# accessions - confirm this is what downstream steps expect.
bioproject_to_srr_dict = {}
for project_id in projects:
    print("[*] INFO: fetching SRA information for: {}.".format(project_id))
    link_handle = Entrez.elink(dbfrom="bioproject", db="sra", id=project_id)
    try:
        link_results = Entrez.read(link_handle)
    finally:
        # Release the handle even when parsing the response fails.
        link_handle.close()

    # Dict keys de-duplicate the ids while keeping first-seen order.
    unique_sra_ids = {}
    for link_set in link_results:
        for link_group in link_set.get("LinkSetDb", []):
            # Only link groups that point into the SRA database are relevant.
            if link_group.get("DbTo") != "sra":
                continue
            for sra_link in link_group["Link"]:
                unique_sra_ids[str(sra_link["Id"])] = None
    link_list = list(unique_sra_ids)

    if link_list:
        bioproject_to_srr_dict[project_id] = link_list
    else:
        print("[*] WARNING: no hits for: {}".format(project_id))
    # Report from link_list, not from the dict: a project without hits has no
    # dict entry, so looking it up there raised KeyError right after the warning.
    print("\t[*] INFO: found {} entries.".format(len(link_list)))
print("[*] DONE parsing bioproject targets.")

[*] INFO: fetching SRA information for: 1336731.
	[*] INFO: found 40 entries.
[*] INFO: fetching SRA information for: 1089063.
	[*] INFO: found 2 entries.
[*] INFO: fetching SRA information for: 988282.
	[*] INFO: found 26 entries.
[*] INFO: fetching SRA information for: 907389.
	[*] INFO: found 48 entries.
[*] INFO: fetching SRA information for: 650220.
	[*] INFO: found 14 entries.
[*] INFO: fetching SRA information for: 630329.
	[*] INFO: found 1071 entries.
[*] INFO: fetching SRA information for: 592182.
	[*] INFO: found 6 entries.
[*] INFO: fetching SRA information for: 588472.
	[*] INFO: found 16 entries.
[*] INFO: fetching SRA information for: 576556.
	[*] INFO: found 378 entries.
[*] INFO: fetching SRA information for: 576020.
	[*] INFO: found 1510 entries.
[*] INFO: fetching SRA information for: 524291.
	[*] INFO: found 58 entries.
[*] INFO: fetching SRA information for: 360672.
	[*] INFO: found 6 entries.
[*] DONE parsing bioproject targets.


In [78]:
# 7 Output Results Into Temporary File
# Flatten the project -> SRA id mapping into one tab-separated row per pair,
# then load the file back for a quick look.
bioproject_srr_mapping_file = "../data/bioproject_srr_table.csv"
with open(bioproject_srr_mapping_file, "w") as mapfile:
    mapfile.write("BioProject\tSRR\n")
    mapfile.writelines(
        "\t".join((project_key, sra_id)) + "\n"
        for project_key, sra_ids in bioproject_to_srr_dict.items()
        for sra_id in sra_ids
    )
bioproject_srr_df = pd.read_csv(bioproject_srr_mapping_file, sep="\t")
bioproject_srr_df.head()

Unnamed: 0,BioProject,SRR
0,1336731,41020117
1,1336731,41020116
2,1336731,41020115
3,1336731,41020114
4,1336731,41020113
