In [1]:
import json
import pandas as pd
import requests
import xml.etree.ElementTree as ET

If starting with a GEO (GSE*) accession, start here by defining `acc` in the next cell.<br>
If starting with a BioProject (PRJ*) accession, skip the next cell.

In [9]:
acc = 'GSE147528'

# Resolve a GEO series accession (GSE*) to its linked BioProject accession
# (PRJ*) using the NCBI E-utilities esearch + efetch endpoints.

# Reset first so a failed lookup cannot silently reuse a `prj` value left in
# the kernel by an earlier run of this (or the next) cell.
prj = None

# Step 1: find the BioProject record id for the GEO accession.
# Passing `params` lets requests URL-encode the space and brackets in the term.
resp1 = requests.get(
    'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi',
    params={'db': 'bioproject', 'term': f'{acc}[Project Accession]', 'retmode': 'json'},
    timeout=60,
)
resp1.raise_for_status()
r1 = resp1.json()
id_list = r1.get('esearchresult', {}).get('idlist', [])

if id_list:
    # Take the first hit; ideally the search would be restricted to entry type "Series".
    i = id_list[0]
    # Step 2: fetch the BioProject record (XML) and read its archive accession.
    r2 = requests.get(
        'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi',
        params={'db': 'bioproject', 'id': i},
        timeout=60,
    )
    r2.raise_for_status()
    # Parse the raw bytes so the XML parser applies the document's own encoding
    # instead of whatever charset requests guessed for `.text`.
    responseXml = ET.fromstring(r2.content)
    for a in responseXml.iter('ArchiveID'):
        # If several ArchiveID elements exist, the last one with an accession wins.
        prj = a.get('accession', prj)

if prj is None:
    raise ValueError(f'No BioProject accession found for {acc!r}')
prj

'PRJNA615180'

If starting with a BioProject (PRJ*) accession, start here by uncommenting & defining `prj`

In [19]:
#prj = 'PRJEB50820'

# Collect sample attributes, runs and fastq file locations for every SRA
# experiment linked to the BioProject accession in `prj`.
#   attributes: {SRX accession: {sample field: value}}
#   runs:       {SRR accession: {'SRX': ..., 'fastq_1': filename, ...}}
#   files:      {fastq filename: [url]}

# A single efetch URL gets too long with many ids (384 ids worked, 992 did
# not), so the records are fetched in batches instead of giving up.
BATCH_SIZE = 200

# Step 1: list the ids of all SRA records that match the BioProject.
resp3 = requests.get(
    'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi',
    params={'db': 'sra', 'term': prj, 'retmode': 'json', 'retmax': 10000},
    timeout=60,
)
resp3.raise_for_status()
r3 = resp3.json()
sra_ids = r3.get('esearchresult', {}).get('idlist', [])

# esearch returns at most `retmax` ids; say so rather than silently truncating.
total_hits = int(r3.get('esearchresult', {}).get('count', len(sra_ids)))
if total_hits > len(sra_ids):
    print(f'warning: {total_hits} SRA records match {prj}, only {len(sra_ids)} were retrieved')
if not sra_ids:
    print(f'no SRA records found for {prj}')

attributes = {}
runs = {}
files = {}

# Step 2: fetch the full records batch by batch and parse them.
# NOTE: E-utilities rate-limits unauthenticated clients; raise_for_status()
# surfaces a throttled request as an error instead of a confusing parse failure.
for start in range(0, len(sra_ids), BATCH_SIZE):
    ids = ','.join(sra_ids[start:start + BATCH_SIZE])
    r4 = requests.get(
        'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi',
        params={'db': 'sra', 'id': ids},
        timeout=120,
    )
    r4.raise_for_status()
    # Parse the raw bytes so the XML parser applies the document's own encoding.
    responseXml = ET.fromstring(r4.content)

    for ep in responseXml.iter('EXPERIMENT_PACKAGE'):
        e = ep.find('EXPERIMENT')
        srx = e.attrib['accession']
        attributes[srx] = {}

        # Sample-level identifiers, title and free-form attributes.
        for s in ep.iter('SAMPLE'):
            for pi in s.iter('PRIMARY_ID'):
                attributes[srx]['primary_id'] = pi.text
            for ei in s.iter('EXTERNAL_ID'):
                attributes[srx][ei.attrib['namespace'] + '_id'] = ei.text
            for t in s.iter('TITLE'):
                attributes[srx]['title'] = t.text
            for sa in s.iter('SAMPLE_ATTRIBUTE'):
                tag_el = sa.find('TAG')
                if tag_el is None:
                    continue
                # An attribute may carry a TAG without a VALUE; record it as missing
                # rather than failing on `None.text`.
                value_el = sa.find('VALUE')
                attributes[srx][tag_el.text] = value_el.text if value_el is not None else None

        # Runs of this experiment and the fastq files submitted for each run.
        for run in ep.iter('RUN'):
            srr = run.attrib['accession']
            runs[srr] = {'SRX': srx}
            file_count = 0
            for f in run.iter('SRAFile'):
                if f.get('semantic_name') == 'fastq':
                    file_count += 1
                    runs[srr]['fastq_' + str(file_count)] = f.attrib['filename']
                    # Each Alternatives element overwrites the previous one, so the
                    # last listed location is the one kept for the file.
                    for alt in f.iter('Alternatives'):
                        files[f.attrib['filename']] = [alt.attrib['url']]

sample_df = pd.DataFrame(attributes).transpose()
sample_df

Unnamed: 0,primary_id,BioSample_id,GEO_id,title,source_name,brain region,batch,donor id,Sex
SRX8001155,SRS6376493,SAMN14448154,GSM4432654,EC10,Frozen post-mortem human brain tissue,Entorhinal cortex,D,10,male
SRX8001154,SRS6376492,SAMN14448156,GSM4432653,EC8,Frozen post-mortem human brain tissue,Entorhinal cortex,B,8,male
SRX8001153,SRS6376491,SAMN14448158,GSM4432652,EC9,Frozen post-mortem human brain tissue,Entorhinal cortex,B,9,male
SRX8001152,SRS6376490,SAMN14448159,GSM4432651,EC5,Frozen post-mortem human brain tissue,Entorhinal cortex,B,5,male
SRX8001151,SRS6376489,SAMN14448160,GSM4432650,EC7,Frozen post-mortem human brain tissue,Entorhinal cortex,D,7,male
SRX8001150,SRS6376488,SAMN14448161,GSM4432649,EC6,Frozen post-mortem human brain tissue,Entorhinal cortex,D,6,male
SRX8001149,SRS6376487,SAMN14448162,GSM4432648,EC4,Frozen post-mortem human brain tissue,Entorhinal cortex,C,4,male
SRX8001148,SRS6376486,SAMN14448163,GSM4432647,EC3,Frozen post-mortem human brain tissue,Entorhinal cortex,C,3,male
SRX8001147,SRS6376485,SAMN14448164,GSM4432646,EC1,Frozen post-mortem human brain tissue,Entorhinal cortex,C,1,male
SRX8001146,SRS6376484,SAMN14448136,GSM4432645,EC2,Frozen post-mortem human brain tissue,Entorhinal cortex,C,2,male


In [20]:
# `runs` is keyed by run accession; flip the frame so each run becomes a row
# with its experiment accession and fastq filenames as columns.
run_df = pd.DataFrame(runs).T
run_df

Unnamed: 0,SRX,fastq_1,fastq_2,fastq_3
SRR11422719,SRX8001155,EC2612_S20_L001_R2_001.fastq.gz,EC2612_S20_L001_R1_001.fastq.gz,EC2612_S20_L001_I1_001.fastq.gz
SRR11422718,SRX8001154,EC2354_S19_L001_I1_001.fastq.gz,EC2354_S19_L001_R1_001.fastq.gz,EC2354_S19_L001_R2_001.fastq.gz
SRR11422717,SRX8001153,EC2508_S18_L001_I1_001.fastq.gz,EC2508_S18_L001_R1_001.fastq.gz,EC2508_S18_L001_R2_001.fastq.gz
SRR11422716,SRX8001152,EC2321_S17_L001_I1_001.fastq.gz,EC2321_S17_L001_R1_001.fastq.gz,EC2321_S17_L001_R2_001.fastq.gz
SRR11422715,SRX8001151,EC2821_S16_L001_I1_001.fastq.gz,EC2821_S16_L001_R1_001.fastq.gz,EC2821_S16_L001_R2_001.fastq.gz
SRR11422714,SRX8001150,EC2813_S15_L001_I1_001.fastq.gz,EC2813_S15_L001_R1_001.fastq.gz,EC2813_S15_L001_R2_001.fastq.gz
SRR11422713,SRX8001149,EC5094_S14_L001_I1_001.fastq.gz,EC5094_S14_L001_R1_001.fastq.gz,EC5094_S14_L001_R2_001.fastq.gz
SRR11422712,SRX8001148,EC899_S13_L001_I1_001.fastq.gz,EC899_S13_L001_R1_001.fastq.gz,EC899_S13_L001_R2_001.fastq.gz
SRR11422711,SRX8001147,EC4454_S12_L001_I1_001.fastq.gz,EC4454_S12_L001_R1_001.fastq.gz,EC4454_S12_L001_R2_001.fastq.gz
SRR11422710,SRX8001146,EC11917_S11_L001_I1_001.fastq.gz,EC11917_S11_L001_R1_001.fastq.gz,EC11917_S11_L001_R2_001.fastq.gz


In [27]:
# `files` maps each fastq filename to a one-element list holding its location;
# label that single entry 'uri', then flip so each file becomes a row.
files_df = pd.DataFrame(files, index=['uri']).T
files_df

Unnamed: 0,uri
EC2612_S20_L001_R2_001.fastq.gz,s3://sra-pub-src-7/SRR11422719/EC2612_S20_L001...
EC2612_S20_L001_R1_001.fastq.gz,s3://sra-pub-src-7/SRR11422719/EC2612_S20_L001...
EC2612_S20_L001_I1_001.fastq.gz,s3://sra-pub-src-7/SRR11422719/EC2612_S20_L001...
EC2354_S19_L001_I1_001.fastq.gz,s3://sra-pub-src-7/SRR11422718/EC2354_S19_L001...
EC2354_S19_L001_R1_001.fastq.gz,s3://sra-pub-src-7/SRR11422718/EC2354_S19_L001...
EC2354_S19_L001_R2_001.fastq.gz,s3://sra-pub-src-7/SRR11422718/EC2354_S19_L001...
EC2508_S18_L001_I1_001.fastq.gz,s3://sra-pub-src-4/SRR11422717/EC2508_S18_L001...
EC2508_S18_L001_R1_001.fastq.gz,s3://sra-pub-src-4/SRR11422717/EC2508_S18_L001...
EC2508_S18_L001_R2_001.fastq.gz,s3://sra-pub-src-4/SRR11422717/EC2508_S18_L001...
EC2321_S17_L001_I1_001.fastq.gz,s3://sra-pub-src-7/SRR11422716/EC2321_S17_L001...
