# Download expression data of five *Streptomyces* species

Reference:  
Kim, W., Hwang, S., Lee, N. et al. Transcriptome and translatome profiles of Streptomyces species in different growth phases. Sci Data ***7***, 138 (2020). doi:[10.1038/s41597-020-0476-9](https://doi.org/10.1038/s41597-020-0476-9)

- Reference genomes are downloaded from the NCBI FTP server (RefSeq or GenBank) to `data/pubdata/expression/Kim+20SciData/reference`
- Trimmed fastq files are downloaded from the EBI FTP server to `data/pubdata/expression/Kim+20SciData/trimmed_fastq`


In [1]:
import gzip
import hashlib
import pandas as pd
import shutil
import time
import urllib.request 
from Bio import SeqIO
from pyscripts.config import path2
from pyscripts.datasets import DatasetDownloader
# shared downloader instance; its fetch_NCBI_genome() method is called below to retrieve the reference genomes
ddownloader = DatasetDownloader()

In [2]:
# Table of reference assemblies (tab-separated); the columns used below are
# `reference_assembly` and `ftp_path`.
refs = pd.read_csv(
    path2.metadata / 'expression' / 'Kim+20SciData_refs.tsv',
    sep='\t',
)

In [3]:
# For every reference assembly: fetch its gzipped GenBank flat file, then write
# the sequences it contains as a FASTA file.
for assembly in refs.itertuples():
    assembly_id = assembly.reference_assembly

    # download the GenBank flat file
    gbff_path = path2.pubdata/'expression'/'Kim+20SciData'/'reference'/f'{assembly_id}.gbff.gz'
    ddownloader.fetch_NCBI_genome(assembly.ftp_path, gbff_path)

    # GenBank -> FASTA
    # NOTE(review): the FASTA is written under path2.data while the GenBank file
    # above lives under path2.pubdata -- confirm that this is intended.
    fasta_path = path2.data/'expression'/'Kim+20SciData'/'reference'/f'{assembly_id}.fna'
    with gzip.open(gbff_path, mode='rt') as gbff_handle:
        SeqIO.convert(gbff_handle, 'gb', fasta_path, 'fasta')

    # wait one second before the next request (presumably to go easy on the server)
    time.sleep(1)

In [4]:
# Table of sequencing runs (tab-separated); the columns used below are
# `fastq_ftp`, `tag` and `fastq_md5`.
runs = pd.read_csv(
    path2.metadata / 'expression' / 'Kim+20SciData_runs.tsv',
    sep='\t',
)

In [None]:
# Fetch the trimmed reads (gzipped fastq) of every run and verify each
# downloaded file against the MD5 checksum listed in the run table.
fastq_dir = path2.pubdata/'expression'/'Kim+20SciData'/'trimmed_fastq'
md5_chunk_size = 1024 * 1024  # hash in 1 MiB pieces so a large file is never held in memory at once

for run in runs.itertuples():
    ftp_source  = f'ftp://{run.fastq_ftp}'
    # NOTE(review): with_suffix() replaces everything after the last '.' in
    # run.tag -- confirm that tags never contain a dot.
    file_target = (fastq_dir/run.tag).with_suffix('.fastq.gz')

    # stream the remote file to disk
    with urllib.request.urlopen(ftp_source) as response, open(file_target, 'wb') as outfile:
        shutil.copyfileobj(response, outfile)

    # re-read the file from disk so the checksum covers what was actually written
    md5 = hashlib.md5()
    with open(file_target, 'rb') as fqfile:
        for chunk in iter(lambda: fqfile.read(md5_chunk_size), b''):
            md5.update(chunk)
    md5sum = md5.hexdigest()

    # explicit raise instead of `assert`: assertions are skipped when Python
    # runs with -O, and a bare assert would not say which file failed
    if md5sum != run.fastq_md5:
        raise ValueError(
            f'MD5 mismatch for {file_target}: expected {run.fastq_md5}, got {md5sum}'
        )

    # wait one second before requesting the next file
    time.sleep(1)
    