In [1]:
import pandas as pd
from Bio import Entrez

# Contact e-mail and tool name that Biopython attaches to every NCBI
# E-utilities request made through `Entrez` in this notebook.
Entrez.email = 'kaedeito@student.ubc.ca'
Entrez.tool = 'Chlamydomonas-metal-toxicity'

In [2]:
# Load the 'list' sheet of the curated references workbook; the path is
# relative to the notebook's working directory.
references_df = pd.read_excel('../references/references.xlsx', sheet_name='list')
references_df.head()

Unnamed: 0,Checked?,included?,paper downloaded?,BioProject,Gene Expression Omnibus,DOI,description,"Notes (searched using ""Chlamydomonas"" ""metal"" ""toxicity"" ""SRA"")"
0,True,False,,,,10.1128/AEM.00998-13,Transcriptome Sequencing (RNA-seq) Analysis of...,"Dana F. Simon, Rute F. Domingos. No SRAs avail..."
1,True,False,,PRJNA799651,,,Insights from comparative transcriptome analys...,don’t include
2,True,False,0.0,SRP040767,,10.1371/journal.pone.0107092,,"control, AgNO3 and AgNP, metatranscriptome, th..."
3,True,True,1.0,PRJNA608616,,10.1093/plphys/kiab375,Long-term acclimation to cadmium exposure reve...,"combo of control (Ctrl), short-term (ST), and ..."
4,True,True,1.0,PRJNA735693,GSE176268,10.1016/j.envpol.2021.117594,Rare earth elements show antagonistic interact...,"(5 replicates for each treatment: Ce, Tm, Y, M..."


In [3]:
# Keep only the references flagged as included, then collect their
# BioProject accessions as a plain list for the SRA search below.
bioprojects_df = references_df.loc[references_df['included?'].eq(True)]
bioprojects_list = bioprojects_df['BioProject'].tolist()
bioprojects_df.head()

Unnamed: 0,Checked?,included?,paper downloaded?,BioProject,Gene Expression Omnibus,DOI,description,"Notes (searched using ""Chlamydomonas"" ""metal"" ""toxicity"" ""SRA"")"
3,True,True,1.0,PRJNA608616,,10.1093/plphys/kiab375,Long-term acclimation to cadmium exposure reve...,"combo of control (Ctrl), short-term (ST), and ..."
4,True,True,1.0,PRJNA735693,GSE176268,10.1016/j.envpol.2021.117594,Rare earth elements show antagonistic interact...,"(5 replicates for each treatment: Ce, Tm, Y, M..."
5,True,True,1.0,PRJNA335844,GSE84995,10.1016/j.watres.2017.05.020,Transcriptomic approach for assessment of the ...,"Valcea is control, Babeni is experiment"
6,True,True,1.0,PRJNA394256,,10.1016/j.aquatox.2018.04.020,Comparison of genetic differences and transcri...,"strain is the same, methodology is the same, s..."
7,True,True,1.0,PRJNA576974,,10.3389/fmicb.2020.01443,,Control 1: SRR10269729 \nControl 2: SRR102697...


In [52]:
# search for SRAs associated with each BioProject
def get_sra_list(bioprojects=None, page_size=500):
    """Find every SRA record UID linked to each BioProject and save the mapping.

    ESearch returns only the first 20 UIDs unless ``retmax`` is given, so a
    single un-paged call silently truncates larger projects. This function
    pages through the results with ``retstart``/``retmax`` until all UIDs for
    a project have been collected.

    Args:
        bioprojects: Iterable of BioProject accessions to search for. When
            ``None`` (the default) the notebook-level ``bioprojects_list`` is
            used, which matches the original zero-argument behaviour.
        page_size: Number of UIDs requested per ESearch call (must be >= 1).

    Returns:
        A tuple ``(sra_list, sra_df)`` where ``sra_list`` is a list of
        ``{'BioProject': ..., 'SRA_ID': ...}`` dicts and ``sra_df`` is the
        same data as a DataFrame with columns ``BioProject`` and ``SRA_ID``.

    Side effects:
        Writes ``../references/sra_list.csv`` (both columns) and
        ``../references/sra_id_list.txt`` (one SRA_ID per line, no header).

    Raises:
        ValueError: If ``page_size`` is smaller than 1.
    """
    if page_size < 1:
        raise ValueError('page_size must be at least 1')
    if bioprojects is None:
        bioprojects = bioprojects_list

    sra_list = []
    for bioproject in bioprojects:
        retstart = 0
        while True:
            handle = Entrez.esearch(
                db='sra',
                term=bioproject,
                retmode='xml',
                rettype='uilist',
                retstart=retstart,
                retmax=page_size,
            )
            # Close the handle even if parsing the response fails.
            try:
                record = Entrez.read(handle, validate=False)
            finally:
                handle.close()

            id_list: list[str] = record['IdList']
            sra_list.extend(
                {'BioProject': bioproject, 'SRA_ID': uid} for uid in id_list
            )
            retstart += len(id_list)

            # Stop on a short (or empty) page, or once the reported total
            # number of hits has been collected.
            total = record.get('Count')
            if len(id_list) < page_size or (total is not None and retstart >= int(total)):
                break

    # Explicit columns keep the column selection below valid when no IDs
    # were found at all.
    sra_df = pd.DataFrame(sra_list, columns=['BioProject', 'SRA_ID'])
    sra_df.to_csv('../references/sra_list.csv', index=False)
    sra_df['SRA_ID'].to_csv('../references/sra_id_list.txt', header=None, index=None)
    return sra_list, sra_df

# Runs the ESearch queries and keeps both the list-of-dicts and DataFrame
# forms; calling this also (re)writes sra_list.csv and sra_id_list.txt
# under ../references/.
sra_list, sra_df = get_sra_list()

In [60]:
import os
def get_sra_details(sra_list: list[dict[str, str]]):
  """Download the SRA XML record for every entry and save it to disk.

  Each entry's ``'SRA_ID'`` is fetched with EFetch (one request per ID) and
  written to ``../sra_details/<SRA_ID>.xml``. The output directory is created
  if it does not exist yet.

  Args:
    sra_list: Dicts that each carry an ``'SRA_ID'`` key, as produced by
      ``get_sra_list``.

  Returns:
    The resolved paths of the written XML files, in input order.

  Raises:
    ValueError: If an entry has no (or an empty) ``'SRA_ID'``.
  """
  os.makedirs('../sra_details', exist_ok=True)

  xml_paths: list[str] = []
  for sra_id_dict in sra_list:
    sra_id = sra_id_dict.get('SRA_ID')
    if not sra_id:
      # Without this guard the record would be requested with id=None and
      # saved as 'None.xml'.
      raise ValueError(f'entry has no SRA_ID: {sra_id_dict!r}')

    handle = Entrez.efetch(db='sra', id=sra_id, retmode='xml', rettype='xml')
    # Read the whole payload before touching the file system so a failed
    # download does not leave an empty file behind, and always release the
    # handle.
    try:
      payload = handle.read()
    finally:
      handle.close()
    # Defensive: if the handle yields text instead of bytes, encode it so the
    # binary write below still succeeds.
    if isinstance(payload, str):
      payload = payload.encode('utf-8')

    xml_path = os.path.realpath(f'../sra_details/{sra_id}.xml')
    with open(xml_path, 'wb') as f:
      f.write(payload)
    xml_paths.append(xml_path)
  return xml_paths

In [74]:
# Issues one EFetch request per entry in sra_list and writes each response
# to ../sra_details/; the returned paths feed the XML parsing step.
xml_paths = get_sra_details(sra_list)

10287618
10287617
10287616
10287615
10287614
10287613
10287612
10287611
10287610
10287609
10287608
10287607
14749631
14749630
14749629
14749628
14749627
14749626
14749625
14749624
14749623
14749622
14749621
14749620
14749619
14749618
14749617
14749616
14749615
14749614
14749613
14749612
2861989
2861988
2861987
2861986
2861985
2861984
2861983
2861982
2861981
2861980
2861979
2861978
2861977
2861976
2861975
2861974
2861973
2861972
4440013
4440012
4440011
4440010
4440009
4440008
9185008
9185007
9185006
9185005
9185004
9185003


In [81]:
import xml.etree.ElementTree as ET

def _child_text(parent, path: str):
  """Return the text of the first element matching ``path`` under ``parent``, or None if absent."""
  node = parent.find(path)
  return node.text if node is not None else None


def parse_xml_files(xml_paths: list[str]):
  """Extract one summary row per experiment from saved SRA XML files.

  For every ``EXPERIMENT_PACKAGE`` directly under the document root, the
  ``EXPERIMENT`` element's accession, title, design description, library
  source and library strategy are collected. Child elements that are missing
  yield ``None`` for that field instead of aborting the whole run; packages
  without an ``EXPERIMENT`` element are skipped.

  Args:
    xml_paths: Paths to XML files named ``<SRA_ID>.xml``. Both ``/`` and
      ``\\`` are accepted as directory separators.

  Returns:
    A list of dicts with keys ``experiment_id``, ``SRA_ID``, ``title``,
    ``design_description``, ``library_source`` and ``library_strategy``.
  """
  from pathlib import PureWindowsPath

  entries: list[dict] = []
  for xml_path in xml_paths:
    # PureWindowsPath splits on both '\\' and '/', so the ID is recovered
    # from the file name on any platform; splitting on '\\' alone returns the
    # whole path on POSIX and breaks the later merge on SRA_ID.
    sra_id = PureWindowsPath(xml_path).name.split('.')[0]

    # Let ElementTree open the file itself: it reads bytes and honours the
    # encoding declared in the XML rather than the locale default used by a
    # text-mode open().
    root = ET.parse(xml_path).getroot()

    for exp_pckg in root.findall('./EXPERIMENT_PACKAGE'):
      exp = exp_pckg.find('EXPERIMENT')
      if exp is None:
        continue
      entries.append({
        'experiment_id': exp.get('accession'),
        'SRA_ID': sra_id,
        'title': _child_text(exp, 'TITLE'),
        'design_description': _child_text(exp, 'DESIGN/DESIGN_DESCRIPTION'),
        'library_source': _child_text(exp, 'DESIGN/LIBRARY_DESCRIPTOR/LIBRARY_SOURCE'),
        'library_strategy': _child_text(exp, 'DESIGN/LIBRARY_DESCRIPTOR/LIBRARY_STRATEGY'),
      })
  return entries


In [82]:
# Parse the downloaded XML, then attach each experiment row to its
# BioProject (via SRA_ID) and to the reference metadata (via BioProject).
entries = parse_xml_files(xml_paths)
entries_df = pd.DataFrame(entries)
all_df = (
    sra_df
    .merge(entries_df, on='SRA_ID')
    .merge(bioprojects_df, on='BioProject')
)
all_df.to_csv('../references/sra_details.csv', index=False)