In [1]:
import pandas as pd
from Bio import Entrez

# Identify this notebook to NCBI: Bio.Entrez uses these module-level settings
# (contact address and tool name) for the Entrez requests made further down.
Entrez.email = 'kaedeito@student.ubc.ca'
Entrez.tool = 'Chlamydomonas-metal-toxicity'

In [2]:
# Load the curated literature table (the 'list' sheet of the references
# workbook) and preview its first five rows.
references_df = pd.read_excel(io='../references/references.xlsx', sheet_name='list')
references_df.head(n=5)

Unnamed: 0,Checked?,included?,paper downloaded?,BioProject,Gene Expression Omnibus,DOI,description,"Notes (searched using ""Chlamydomonas"" ""metal"" ""toxicity"" ""SRA"")"
0,True,False,,,,10.1128/AEM.00998-13,Transcriptome Sequencing (RNA-seq) Analysis of...,"Dana F. Simon, Rute F. Domingos. No SRAs avail..."
1,True,False,,PRJNA799651,,,Insights from comparative transcriptome analys...,don’t include
2,True,False,0.0,SRP040767,,10.1371/journal.pone.0107092,,"control, AgNO3 and AgNP, metatranscriptome, th..."
3,True,True,1.0,PRJNA608616,,10.1093/plphys/kiab375,Long-term acclimation to cadmium exposure reve...,"combo of control (Ctrl), short-term (ST), and ..."
4,True,True,1.0,PRJNA735693,GSE176268,10.1016/j.envpol.2021.117594,Rare earth elements show antagonistic interact...,"(5 replicates for each treatment: Ce, Tm, Y, M..."


In [3]:
# Keep only the references that were marked for inclusion in the analysis.
bioprojects_df = references_df[references_df['included?'].eq(True)]
# Accessions to search on Entrez. Rows without a BioProject would otherwise be
# searched as the literal term "nan", and a BioProject cited by two papers
# would be fetched twice, so drop missing values and repeats here.
bioprojects_list = bioprojects_df['BioProject'].dropna().drop_duplicates().tolist()
bioprojects_df.head()

Unnamed: 0,Checked?,included?,paper downloaded?,BioProject,Gene Expression Omnibus,DOI,description,"Notes (searched using ""Chlamydomonas"" ""metal"" ""toxicity"" ""SRA"")"
3,True,True,1.0,PRJNA608616,,10.1093/plphys/kiab375,Long-term acclimation to cadmium exposure reve...,"combo of control (Ctrl), short-term (ST), and ..."
4,True,True,1.0,PRJNA735693,GSE176268,10.1016/j.envpol.2021.117594,Rare earth elements show antagonistic interact...,"(5 replicates for each treatment: Ce, Tm, Y, M..."
5,True,True,1.0,PRJNA335844,GSE84995,10.1016/j.watres.2017.05.020,Transcriptomic approach for assessment of the ...,"Valcea is control, Babeni is experiment"
6,True,True,1.0,PRJNA394256,,10.1016/j.aquatox.2018.04.020,Comparison of genetic differences and transcri...,"strain is the same, methodology is the same, s..."
7,True,True,1.0,PRJNA576974,,10.3389/fmicb.2020.01443,,Control 1: SRR10269729 \nControl 2: SRR102697...


In [4]:
# search for SRAs associated with each BioProject
from Bio.Entrez.Parser import DictionaryElement
import os

def get_sra_list():
    sra_details_csv = '../references/sra_list.csv'
    # if the folder is not empty
    directory = '../sra_details'
    if not os.path.exists(directory):
        os.makedirs(directory)

    # if the folder is not empty
    list_dir = os.listdir(directory)
    if len(list_dir) > 0:
        # if the folder is not empty
        print('SRA details already downloaded')
        sra_df = pd.read_csv(sra_details_csv)
        sra_list: list[dict] = sra_df.to_dict('records')
        return sra_list, sra_df
    else:
        print('SRA details not downloaded')

        sra_list = []
        for bioproject in bioprojects_list:
            handle = Entrez.esearch(db='sra', term=bioproject, retmode='xml', rettype='uilist')
            record = Entrez.read(handle, validate=False)
            if isinstance(record, DictionaryElement):
                id_list: list[str] | None = record['IdList']
                if (id_list is None) or len(id_list) == 0:
                    handle.close()
                    raise Exception(f'No SRA found for {bioproject}')
                for id in id_list:
                    entry = { 'BioProject': bioproject, 'SRA_ID': id }
                    sra_list.append(entry)
            handle.close()

        sra_df = pd.DataFrame(sra_list)
        sra_df.to_csv(sra_details_csv, index=False)
        sra_df['SRA_ID'].to_csv('../references/sra_id_list.txt', header=None, index=None) # type: ignore
        return sra_list, sra_df

# Build (or reload from the CSV cache) the BioProject -> SRA ID table, both as
# a list of record dicts and as a DataFrame.
sra_list, sra_df = get_sra_list()

In [29]:
import os

def get_sra_details(sra_list: list[dict[str, str]], override: bool = False):
  """Download one Entrez XML record per SRA ID into ``../sra_details``.

  Args:
    sra_list: Records carrying an ``'SRA_ID'`` key (as produced by
      ``get_sra_list``).
    override: When True, download every record again even if XML files are
      already present.

  Returns:
    Absolute paths of the XML files: the freshly written ones, or the ones
    already in the folder when the cache is used.
  """
  directory = '../sra_details'
  os.makedirs(directory, exist_ok=True)

  # Only real XML records count as "downloaded". Placeholder files such as
  # .gitkeep used to make an otherwise empty folder look populated, so nothing
  # was ever fetched and an empty list was returned.
  cached = [name for name in os.listdir(directory) if name.endswith('.xml')]
  if not override and cached:
    print('SRA details already downloaded')
    return [os.path.realpath(f'{directory}/{name}') for name in cached]

  xml_paths: list[str] = []
  for sra_id_dict in sra_list:
    sra_id = sra_id_dict.get('SRA_ID')
    print(f'Downloading {sra_id}...')
    handle = Entrez.efetch(db='sra', id=sra_id, retmode='xml', rettype='xml')
    try:
      # Read before opening the output file so a failed fetch leaves no empty file.
      payload = handle.read()
    finally:
      handle.close()
    if isinstance(payload, str):
      # Some Biopython versions hand back text instead of bytes.
      payload = payload.encode('utf-8')
    xml_path = os.path.realpath(f'{directory}/{sra_id}.xml')
    with open(xml_path, 'wb') as f:
      f.write(payload)
    xml_paths.append(xml_path)
  return xml_paths


In [28]:
# Fetch the per-record XML files; override is switched on, so every record is
# downloaded again even when files are already present.
xml_paths = get_sra_details(sra_list, override=True)
print(xml_paths)

Downloading 10287618...
Downloading 10287617...
Downloading 10287616...
Downloading 10287615...
Downloading 10287614...
Downloading 10287613...
Downloading 10287612...
Downloading 10287611...
Downloading 10287610...
Downloading 10287609...
Downloading 10287608...
Downloading 10287607...
Downloading 14749631...
Downloading 14749630...
Downloading 14749629...
Downloading 14749628...
Downloading 14749627...
Downloading 14749626...
Downloading 14749625...
Downloading 14749624...
Downloading 14749623...
Downloading 14749622...
Downloading 14749621...
Downloading 14749620...
Downloading 14749619...
Downloading 14749618...
Downloading 14749617...
Downloading 14749616...
Downloading 14749615...
Downloading 14749614...
Downloading 14749613...
Downloading 14749612...
Downloading 2861989...
Downloading 2861988...
Downloading 2861987...
Downloading 2861986...
Downloading 2861985...
Downloading 2861984...
Downloading 2861983...
Downloading 2861982...
Downloading 2861981...
Downloading 2861980...
Do

In [30]:
from xml.etree.ElementTree import Element
import xml.etree.ElementTree as ET

def get_design_details(design_ele: Element | None):
  design = {}

  if design_ele is None:
    return design
  library = design_ele.find('LIBRARY_DESCRIPTOR')
  if library is not None:
    library_source = library.find('LIBRARY_SOURCE')
    if library_source is not None:
      design['library_source'] = library_source.text
    library_strategy = library.find('LIBRARY_STRATEGY')
    if library_strategy is not None:
      design['library_strategy'] = library_strategy.text
  return design


def parse_xml_files(xml_paths: list[str]):
  """Flatten downloaded SRA XML records into one dict per sequencing run.

  Args:
    xml_paths: Paths of XML files named ``<SRA_ID>.xml``.

  Returns:
    A list of dicts with the keys ``'SRX_ID'`` (experiment accession),
    ``'SRR_ID'`` (run accession), ``'SRA_ID'`` (taken from the file name) and
    ``'title'``, plus whatever ``get_design_details`` extracts from the
    experiment's DESIGN element. Packages without an EXPERIMENT, a RUN_SET or
    any RUN contribute no rows.
  """
  entries: list[dict[str, str]] = []
  for xml_path in xml_paths:
    # Files are written as '<SRA_ID>.xml', so the stem is the SRA ID.
    sra_id = os.path.basename(xml_path).split('.')[0]
    # Parse straight from the path: the parser then reads bytes and honours the
    # encoding declared in the XML, instead of the platform's default text
    # encoding that open(..., 'r') would apply.
    root = ET.parse(xml_path).getroot()

    for exp_pckg in root.findall('./EXPERIMENT_PACKAGE'):
      exp = exp_pckg.find('EXPERIMENT')
      if exp is None:
        continue

      run_set = exp_pckg.find('RUN_SET')
      if run_set is None:
        continue

      exp_id = exp.get('accession')
      title_ele = exp.find('TITLE')
      title = title_ele.text if title_ele is not None else None
      design_dict = get_design_details(exp.find('DESIGN'))

      for run in run_set.findall('RUN'):
        run_dict = {
          'SRX_ID': exp_id,
          'SRR_ID': run.get('accession'),
          'SRA_ID': sra_id,
          'title': title,
        }
        entries.append(run_dict | design_dict)
  return entries


In [31]:
# One row per sequencing run, parsed from the downloaded XML records.
entries = parse_xml_files(xml_paths)
entries_df = pd.DataFrame(entries)

# Align the join key dtype first: SRA IDs parsed from file names are strings,
# while sra_df reloaded from its CSV cache can hold them as integers, and
# pandas refuses to merge an int64 key with an object key.
sra_keyed_df = sra_df.astype({'SRA_ID': str})
entries_keyed_df = entries_df.astype({'SRA_ID': str})

# Attach the BioProject to every run, then the reference metadata of that project.
runs_df = pd.merge(sra_keyed_df, entries_keyed_df, on='SRA_ID')
all_df = pd.merge(runs_df, bioprojects_df, on='BioProject')
all_df.to_csv('../references/sra_details.csv', index=False)

# Plain list of run accessions, one per line, without header or index.
all_df['SRR_ID'].to_csv('../references/srr_id_list.txt', header=False, index=False)