## Data Import Script

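This notebook looks up the run accessions belonging to a given SRA project and downloads the corresponding FASTQ file for each run with `fastq-dump`, skipping any run whose file has already been downloaded.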

In [1]:
from Bio import Entrez

# SRA project accession whose records should be looked up; change to the project of interest.
sra_project="ERP019762"
# Contact address sent along with every Entrez request; change to your own email.
Entrez.email = "A.N.Other@example.com"

# Maximum number of record IDs requested from the search below.
retmax = 10000

# Request all IDs associated with the SRA Project ID of interest (up to `retmax`).
handle = Entrez.esearch(db="sra", term=sra_project, retmax=retmax)
try:
    record = Entrez.read(handle)
finally:
    # Release the network handle even if parsing the response fails.
    handle.close()
id_list = record['IdList']

# 'Count' is the total number of matches; if it exceeds the number of IDs
# returned, the list was truncated and some records would be skipped silently.
if int(record['Count']) > len(id_list):
    print("Warning: {} records match {}, but only {} IDs were returned".format(
        record['Count'], sra_project, len(id_list)))

In [13]:
from subprocess import call
import os
import re

# Directory that fastq-dump writes to and that is checked for existing downloads.
fastq_dir = "../data/fastq/"

# The summary's 'Runs' field is presumably an XML fragment made of
# <Run acc="..." .../> elements (the inspection cell further down slices an
# accession out of it at a fixed offset) -- TODO confirm against a live record.
# Capturing the attribute value avoids depending on the accession being
# exactly 10 characters long and picks up every run, not just the first.
run_accession_pattern = re.compile(r'<Run\s+acc="([^"]+)"')

# For each SRA Record identifier, request a summary file
for sra_id in id_list:

    net_handle = Entrez.esummary(db="sra", id=sra_id)
    try:
        parsed = Entrez.read(net_handle)
    finally:
        # Release the network handle even if parsing the response fails.
        net_handle.close()

    # Parse the summary file, to extract the run accession(s)
    for parsed_record in parsed:
        for accession in run_accession_pattern.findall(parsed_record['Runs']):
            filename = fastq_dir + accession + ".fastq"

            # If a fastq file for the run accession hasn't already been downloaded,
            # run fastq-dump to download the relevant file.
            if not os.path.isfile(filename):
                # An argument list (no shell) keeps remotely supplied text from
                # being interpreted as shell syntax.
                exit_code = call(["fastq-dump", "--outdir", fastq_dir, accession])
                if exit_code != 0:
                    # Best effort: report the failure and carry on with the next run.
                    print("Warning: fastq-dump exited with status {} for {}".format(
                        exit_code, accession))

In [5]:
# Scratch check: display the record ID left in `sra_id` by the download loop.
sra_id

'3784391'

In [11]:
# Scratch check: display characters 10-19 of the first summary record's 'Runs'
# field, i.e. the fixed-offset slice used to pull out a run accession.
parsed[0]['Runs'][10:20]

'ERR1865077'