# Download expression data of *Staphylococcus aureus*

Reference:  
Davis A. R., Gohara D. W., Yap M. N. Sequence selectivity of macrolide-induced translational attenuation.
Proc Natl Acad Sci U S A ***111*** (2014). doi:[10.1073/pnas.1410356111](https://doi.org/10.1073/pnas.1410356111)

- The reference genome (GCF_000013425.1) was downloaded from the NCBI FTP server (RefSeq or GenBank) to `data/pubdata/expression/DGY14PNAS/reference`
- Raw FASTQ files were downloaded from the EBI FTP server to `data/pubdata/expression/DGY14PNAS/fastq`


In [1]:
import gzip
import hashlib
import pandas as pd
import shutil
import time
import urllib.request 
from Bio import SeqIO
from pyscripts.config import path2
from pyscripts.datasets import DatasetDownloader
# Project downloader helper; its fetch_NCBI_genome() method is called below
# to retrieve each reference genome file.
ddownloader = DatasetDownloader()

In [2]:
# Tab-separated table of reference assemblies; the loop below reads the
# `reference_assembly` and `ftp_path` columns from it.
refs_table_path = path2.metadata/'expression'/'DGY14PNAS_refs.tsv'
refs = pd.read_csv(refs_table_path, sep='\t')

In [3]:
# Download every reference assembly listed in `refs` and write a FASTA copy of it.
genbank_dir = path2.pubdata/'expression'/'DGY14PNAS'/'reference'

for ref in refs.itertuples():
    # fetch the genome; the target is named <assembly>.gbff.gz
    file_target = genbank_dir/f'{ref.reference_assembly}.gbff.gz'
    ddownloader.fetch_NCBI_genome(ref.ftp_path, file_target)

    # convert the GenBank file to FASTA
    # NOTE(review): the FASTA goes under path2.data while the GenBank file goes
    # under path2.pubdata -- confirm that this difference is intended.
    fasta_target = path2.data/'expression'/'DGY14PNAS'/'reference'/f'{ref.reference_assembly}.fna'
    # the downloaded file is opened through gzip in text mode for the parser
    with gzip.open(file_target, 'rt') as gbff:
        SeqIO.convert(gbff, 'gb', fasta_target, 'fasta')

    # one-second pause between assemblies (presumably to go easy on the server)
    time.sleep(1)

In [4]:
# Tab-separated table of sequencing runs; the download loop below reads the
# `fastq_ftp`, `tag` and `fastq_md5` columns from it.
runs_table_path = path2.metadata/'expression'/'DGY14PNAS_runs.tsv'
runs = pd.read_csv(runs_table_path, sep='\t')

In [5]:
# Fetch raw reads in fastq format and verify every download against the MD5
# checksum recorded in the run table.
fastq_dir = path2.pubdata/'expression'/'DGY14PNAS'/'fastq'
md5_chunk_size = 1 << 20  # hash 1 MiB at a time so a large file is never held in memory whole

for run in runs.itertuples():
    ftp_source  = f'ftp://{run.fastq_ftp}'
    # NOTE(review): with_suffix() replaces whatever follows the last '.' in
    # run.tag, so a tag containing a dot would be truncated -- confirm tags
    # never contain dots.
    file_target = (fastq_dir/run.tag).with_suffix('.fastq.gz')

    # stream the remote file to disk
    with urllib.request.urlopen(ftp_source) as response, open(file_target, 'wb') as outfile:
        shutil.copyfileobj(response, outfile)

    # checksum the file as written to disk, reading it back in fixed-size chunks
    digest = hashlib.md5()
    with open(file_target, 'rb') as fqfile:
        for chunk in iter(lambda: fqfile.read(md5_chunk_size), b''):
            digest.update(chunk)
    md5sum = digest.hexdigest()

    # Raise explicitly rather than `assert`: asserts are removed under
    # `python -O`, which would silently skip the integrity check.
    if md5sum != run.fastq_md5:
        raise ValueError(
            f'MD5 mismatch for {run.tag} ({file_target}): '
            f'expected {run.fastq_md5}, got {md5sum}'
        )

    # one-second pause between downloads (presumably to go easy on the server)
    time.sleep(1)
    