In [None]:
from Bio import Entrez
import pandas as pd
import xml.etree.ElementTree as ET

In [None]:
# Entrez setup
# Contact address registered on the Bio.Entrez module before any of the
# Entrez calls in the cells below are issued.
CONTACT_EMAIL = "lukas.becker@hhu.de"
Entrez.email = CONTACT_EMAIL

In [None]:
# Search term handed to the BioProject esearch call below: it combines the
# organism-tagged name "Exaiptasia diaphana" with the free-text word
# "microbiome" using AND.
query = '("Exaiptasia diaphana"[Organism] AND microbiome)'

In [None]:
# 1 Query BioProject Database
# Run the search (at most 100 ids are requested), parse the reply into
# `search_results`, then release the handle.
esearch_params = {"db": "bioproject", "term": query, "retmax": 100}
search_handle = Entrez.esearch(**esearch_params)
search_results = Entrez.read(search_handle)
search_handle.close()

In [None]:
# Report the "Count" field of the parsed search reply.
found_message = "[*] Info: Found {} BioProject's associated to Exaiptasia metagenomic projects."
print(found_message.format(search_results["Count"]))

In [None]:
# 2 Detailed searches based on fetched identifiers
# Fetch the document summaries for every id returned by the search.
# `projects` ends up as the list of per-project summary records that the
# table-writing cell below iterates over.
ids = search_results["IdList"]
if ids:
    handle = Entrez.efetch(db="bioproject", id=",".join(ids), rettype="docsum", retmode="xml")
    try:
        projects = Entrez.read(handle)["DocumentSummarySet"]["DocumentSummary"]
    finally:
        # Release the connection even when parsing the reply fails.
        handle.close()
else:
    # Nothing matched: skip the request (it would be sent with an empty id
    # parameter) and let the downstream cells work on an empty list.
    projects = []
    print("[*] WARNING: search returned no BioProject ids, nothing to fetch.")

In [None]:
# 3 Writing Results Into CSV Table
# The table is tab-separated despite the .csv suffix. Rows are written through
# pandas so that tabs, newlines or quote characters inside free-text fields
# (titles, descriptions) are quoted instead of silently breaking the row
# layout, and so that the file is written as UTF-8, which is also what the
# `pd.read_csv` call in the next cell assumes by default.
bioprojects_resultfile = "../data/bioproject_table.csv"

# Output column name -> key in the fetched project summary record.
project_columns = {
    "ProjectId": "Project_Id",
    "ProjectAcc": "Project_Acc",
    "ProjectDate": "Registration_Date",
    "ProjectTitle": "Project_Title",
    "ProjectDescription": "Project_Description",
    "OrganismName": "Organism_Name",
    "OrganismStrain": "Organism_Strain",
}

# str() mirrors what str.format() did to each value in the hand-written rows.
project_rows = [
    {column: str(project[key]) for column, key in project_columns.items()}
    for project in projects
]

# Passing `columns` explicitly keeps the header row even when there are no projects.
pd.DataFrame(project_rows, columns=list(project_columns)).to_csv(
    bioprojects_resultfile, sep="\t", index=False
)
print("[*] INFO: done writing result file with {} entries.".format(len(projects)))


In [None]:
# 4 Check Dataframe
# Load the tab-separated project table back from disk and preview the first rows.
df = pd.read_csv(bioprojects_resultfile, delimiter="\t")
df.head()

In [None]:
# 5 Get BioSample Data
# Build `bioproject_to_biosample`: project id (as str) -> list of linked
# BioSample ids, in first-seen order and without duplicates.
# NOTE: from here on `projects` is the list of project-id strings; it no
# longer holds the summary records fetched earlier.
projects = df["ProjectId"].astype(str).to_list()
bioproject_to_biosample = {}
print("[*] INFO mapping bioproject accessions to biosample entries.")
for proj in projects:
    link_handle = Entrez.elink(dbfrom="bioproject", db="biosample", id=proj)
    try:
        link_results = Entrez.read(link_handle)
    finally:
        # Release the connection even when parsing the reply fails.
        link_handle.close()

    # A dict is used as an insertion-ordered set: same order as the previous
    # "append if not already in list" logic, without the repeated list scans.
    seen_ids = {}
    for link_set in link_results:
        link_set_dbs = link_set.get("LinkSetDb", [])
        if not link_set_dbs:
            # No link group in this reply (presumably the project has no
            # linked BioSamples); indexing [0] here would raise IndexError.
            continue
        # As before, only the first link group is used.
        for link_entry in link_set_dbs[0]["Link"]:
            seen_ids[str(link_entry["Id"])] = None
    biosample_ids = list(seen_ids)
    bioproject_to_biosample[proj] = biosample_ids
    print("\t[*] Found {} entries.".format(len(biosample_ids)))
print("[*] DONE")

In [None]:
# 6 Get SRR data
# Build `bioproject_to_srr_dict`: project id -> list of linked SRA ids, in
# first-seen order and without duplicates. Projects without any SRA link are
# reported with a warning and deliberately left out of the dictionary.
bioproject_to_srr_dict = {}
for project_id in projects:
    print("[*] INFO: fetching SRA information for: {}.".format(project_id))
    link_handle = Entrez.elink(dbfrom="bioproject", db="sra", id=project_id)
    try:
        link_results = Entrez.read(link_handle)
    finally:
        # Release the connection even when parsing the reply fails.
        link_handle.close()

    # A dict is used as an insertion-ordered set of ids.
    seen_ids = {}
    for link_set in link_results:
        for link_group in link_set.get("LinkSetDb", []):
            # Only link groups that point into the sra database are collected.
            if link_group.get("DbTo") != "sra":
                continue
            for link_entry in link_group["Link"]:
                seen_ids[str(link_entry["Id"])] = None
    link_list = list(seen_ids)

    if len(link_list) == 0:
        print("[*] WARNING: no hits for: {}".format(project_id))
    else:
        bioproject_to_srr_dict[project_id] = link_list
    # Count taken from the local list: the dictionary has no entry for
    # projects without hits, so looking it up there raised KeyError.
    print("\t[*] INFO: found {} entries.".format(len(link_list)))
print("[*] DONE parsing bioproject targets.")

In [None]:
# 7 Output Results Into Temporary File
# Flatten the project -> SRA id mapping into a two-column, tab-separated file
# (one row per project/id pair), then load it back for a quick preview.
bioproject_srr_mapping_file = "../data/bioproject_srr_table.csv"
with open(bioproject_srr_mapping_file, "w") as mapfile:
    mapfile.write("BioProject\tSRR\n")
    for project_key, sra_ids in bioproject_to_srr_dict.items():
        mapfile.writelines(project_key + "\t" + sra_id + "\n" for sra_id in sra_ids)
bioproject_srr_df = pd.read_csv(bioproject_srr_mapping_file, sep="\t")
bioproject_srr_df.head()

In [None]:
# 8 Fetch Detailed Information
# For every project with linked SRA ids, fetch the SRA summaries, parse the
# XML fragment stored under "ExpXml" in each summary and write one
# tab-separated row per summary to `bioproject_srr_file`.


def _attr_or_blank(parent, tag, attribute="acc"):
    """Return `attribute` of the first `tag` child of `parent`, or "" if the child or attribute is missing."""
    node = parent.find(tag)
    if node is None:
        return ""
    return node.attrib.get(attribute, "")


def _tsv_field(value):
    """Render `value` as text with tabs and line breaks replaced by spaces, so it cannot split a row."""
    return str(value).replace("\t", " ").replace("\r", " ").replace("\n", " ")


print("[*] Building Result DataFrame File")
bioproject_srr_file = "../data/bioproject_srr_details.csv"
# Written as UTF-8 explicitly, matching the default used by pd.read_csv when
# the table is loaded again in the next cell.
with open(bioproject_srr_file, "w", encoding="utf-8") as outfile:
    outfile.write("BioProjectId\tTitle\tPlatform\tInstrument\tRuns\tSpots\tBases\tSubmitter\tExperiment\tStudy\tOrganism\tSample\tLibrary\tStrategy\tSource\tLayout\tBioproject\tBiosample\n")
    for bioproject in bioproject_to_srr_dict.keys():
        sra_ids = bioproject_to_srr_dict[bioproject]
        print("[*] Fetching SRR information for bioproject: {}".format(bioproject))
        fetch = Entrez.esummary(db="sra", id=",".join(sra_ids), rettype="text")
        try:
            summaries = Entrez.read(fetch)
        finally:
            # Release the connection even when parsing the reply fails.
            fetch.close()
        print("\t[*] DONE fetching information")
        print("\t[*] Processing information ...")
        if len(summaries) != len(sra_ids):
            print("\t[!] Warning: length of summaries does not correspond to length of srr data: {} vs. {}".format(len(summaries), len(sra_ids)))
        for summary in summaries:
            # "ExpXml" is a sequence of sibling elements without a single
            # root, so it is wrapped before parsing.
            root = ET.fromstring("<root>" + summary["ExpXml"] + "</root>")

            # Summary section: title, platform and run statistics.
            summ = root.find("Summary")
            if summ is not None:
                title = summ.findtext("Title", default="")
                platform = summ.findtext("Platform", default="")
                # Falls back to "" (not the text "None") when the attribute is absent.
                instrument_model = _attr_or_blank(summ, "Platform", "instrument_model")
                total_runs = _attr_or_blank(summ, "Statistics", "total_runs")
                total_spots = _attr_or_blank(summ, "Statistics", "total_spots")
                total_bases = _attr_or_blank(summ, "Statistics", "total_bases")
            else:
                title = platform = instrument_model = total_runs = total_spots = total_bases = ""

            # Submitter, Experiment, Study, Organism, Sample
            submitter_acc = _attr_or_blank(root, "Submitter")
            experiment_acc = _attr_or_blank(root, "Experiment")
            study_acc = _attr_or_blank(root, "Study")
            organism_name = _attr_or_blank(root, "Organism", "ScientificName")
            sample_acc = _attr_or_blank(root, "Sample")

            # Library descriptor
            lib_descriptor = root.find("Library_descriptor")
            if lib_descriptor is not None:
                library_name = lib_descriptor.findtext("LIBRARY_NAME", default="")
                library_strategy = lib_descriptor.findtext("LIBRARY_STRATEGY", default="")
                library_source = lib_descriptor.findtext("LIBRARY_SOURCE", default="")
                library_layout_elem = lib_descriptor.find("LIBRARY_LAYOUT")
                if library_layout_elem is not None and len(library_layout_elem):
                    library_layout = library_layout_elem[0].tag  # PAIRED or SINGLE
                else:
                    library_layout = ""
            else:
                library_name = library_strategy = library_source = library_layout = ""

            # BioProject and BioSample as reported inside the record. Kept in
            # its own variable so the loop's project id is not overwritten:
            # the first column ("BioProjectId") holds the id that was queried,
            # the "Bioproject" column holds the value found in the XML.
            bioproject_acc = root.findtext("Bioproject", default="")
            biosample = root.findtext("Biosample", default="")

            fields = [
                bioproject, title, platform, instrument_model,
                total_runs, total_spots, total_bases,
                submitter_acc, experiment_acc, study_acc,
                organism_name, sample_acc,
                library_name, library_strategy, library_source, library_layout,
                bioproject_acc, biosample,
            ]
            outfile.write("\t".join(_tsv_field(field) for field in fields) + "\n")
        print("\t[*] DONE")



In [None]:
# 9 Check Dataframe
# Load the tab-separated SRA detail table back from disk and preview the first rows.
srr_table = pd.read_csv(bioproject_srr_file, delimiter="\t")
srr_table.head()

In [None]:
# Print, for each distinct BioProjectId value, how many rows of the detail
# table carry that value.
project_column = srr_table["BioProjectId"]
for bioproject in project_column.unique():
    sample_count = len(srr_table[project_column == bioproject])
    print("[*] INFO number os samples for: {} is {}".format(bioproject, sample_count))