In [1]:
import pandas as pd
import subprocess
import glob

For this tutorial, we will be using samples published by [Wibowo et al. 2021](https://www.nature.com/articles/s41586-021-03532-0)

In [2]:
metadata = pd.read_csv("../data/metadata/PRJNA561510.tsv", sep="\t")

We select only the Zape samples

In [3]:
zape = metadata.query("scientific_name == 'human gut metagenome' and experiment_alias.str.contains('Zape')", engine='python')
zape

Unnamed: 0,study_accession,sample_accession,experiment_accession,run_accession,tax_id,scientific_name,library_name,experiment_title,experiment_alias,run_alias,fastq_ftp,fastq_aspera,submitted_ftp,sra_ftp,sample_alias,sample_title
2,PRJNA561510,SAMN12619382,SRX9046728,SRR12557704,408170,human gut metagenome,Zape3,Illumina HiSeq 4000 sequencing; WGS of human g...,Zape3,Zape3_1.fq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR125/004/SRR125...,fasp.sra.ebi.ac.uk:/vol1/fastq/SRR125/004/SRR1...,,,Zape3,This sample has been submitted by pda|marshacw...
3,PRJNA561510,SAMN12619381,SRX9046727,SRR12557705,408170,human gut metagenome,Zape2,Illumina HiSeq 4000 sequencing; WGS of human g...,Zape2,Zape2_1.fq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR125/005/SRR125...,fasp.sra.ebi.ac.uk:/vol1/fastq/SRR125/005/SRR1...,,,Zape2,This sample has been submitted by pda|marshacw...
4,PRJNA561510,SAMN12619380,SRX9046726,SRR12557706,408170,human gut metagenome,Zape1,Illumina HiSeq 4000 sequencing; WGS of human g...,Zape1,Zape1_1.fq.gz,ftp.sra.ebi.ac.uk/vol1/fastq/SRR125/006/SRR125...,fasp.sra.ebi.ac.uk:/vol1/fastq/SRR125/006/SRR1...,,,Zape1,This sample has been submitted by pda|marshacw...


In [4]:
list(zape.columns)

['study_accession',
 'sample_accession',
 'experiment_accession',
 'run_accession',
 'tax_id',
 'scientific_name',
 'library_name',
 'experiment_title',
 'experiment_alias',
 'run_alias',
 'fastq_ftp',
 'fastq_aspera',
 'submitted_ftp',
 'sra_ftp',
 'sample_alias',
 'sample_title']

In [11]:
def aspera_download(sample_name, url, outdir):
    aspera_cmd = "/usr/local64/opt/aspera/connect/bin/ascp -i /usr/local64/opt/aspera/connect/etc/asperaweb_id_dsa.openssh -Tr -Q -l 100m -P33001 -L- era-fasp@"
    fwd = url.split(";")[0]
    rev = url.split(";")[1]
    dl_fwd = f"{aspera_cmd}{fwd} {outdir}/{sample_name}_1.fastq.gz "
    print(dl_fwd)
    subprocess.check_output(dl_fwd, shell=True)
    dl_rev = f"{aspera_cmd}{rev} {outdir}/{sample_name}_2.fastq.gz "
    print(dl_rev)
    subprocess.check_output(dl_rev, shell=True)

In [12]:
for i in zape.index[1:]:
    aspera_download(sample_name = zape.loc[i, 'library_name'],
             url = zape.loc[i, 'fastq_aspera'],
             outdir = "../data/raw")

/usr/local64/opt/aspera/connect/bin/ascp -i /usr/local64/opt/aspera/connect/etc/asperaweb_id_dsa.openssh -Tr -Q -l 100m -P33001 -L- era-fasp@fasp.sra.ebi.ac.uk:/vol1/fastq/SRR125/005/SRR12557705/SRR12557705_1.fastq.gz ../data/raw/Zape2_1.fastq.gz 
/usr/local64/opt/aspera/connect/bin/ascp -i /usr/local64/opt/aspera/connect/etc/asperaweb_id_dsa.openssh -Tr -Q -l 100m -P33001 -L- era-fasp@fasp.sra.ebi.ac.uk:/vol1/fastq/SRR125/005/SRR12557705/SRR12557705_2.fastq.gz ../data/raw/Zape2_2.fastq.gz 
/usr/local64/opt/aspera/connect/bin/ascp -i /usr/local64/opt/aspera/connect/etc/asperaweb_id_dsa.openssh -Tr -Q -l 100m -P33001 -L- era-fasp@fasp.sra.ebi.ac.uk:/vol1/fastq/SRR125/006/SRR12557706/SRR12557706_1.fastq.gz ../data/raw/Zape1_1.fastq.gz 
/usr/local64/opt/aspera/connect/bin/ascp -i /usr/local64/opt/aspera/connect/etc/asperaweb_id_dsa.openssh -Tr -Q -l 100m -P33001 -L- era-fasp@fasp.sra.ebi.ac.uk:/vol1/fastq/SRR125/006/SRR12557706/SRR12557706_2.fastq.gz ../data/raw/Zape1_2.fastq.gz 


## Subsampling files to make the analysis quicker for this tutorial

In [50]:
def subsample(filename, outdir, depth=200000):
    basename = ".".join(filename.split(".")[:-2]).split("/")[-1]
    print(basename)
    cmd = f"seqtk sample -s100 {filename} {depth} > {outdir}/{basename}_subsampled_{depth}.fastq"
    print(cmd)
    subprocess.check_output(cmd, shell=True)

In [51]:
for f in glob.glob("../data/raw/*"):
    outdir = "../data/subsampled"
    subsample(f, outdir)

Zape3_1
seqtk sample -s100 ../data/raw/Zape3_1.fastq.gz 200000 > ../data/subsampled/Zape3_1_subsampled_200000.fastq
Zape3_2
seqtk sample -s100 ../data/raw/Zape3_2.fastq.gz 200000 > ../data/subsampled/Zape3_2_subsampled_200000.fastq
Zape2_1
seqtk sample -s100 ../data/raw/Zape2_1.fastq.gz 200000 > ../data/subsampled/Zape2_1_subsampled_200000.fastq
Zape2_2
seqtk sample -s100 ../data/raw/Zape2_2.fastq.gz 200000 > ../data/subsampled/Zape2_2_subsampled_200000.fastq
Zape1_1
seqtk sample -s100 ../data/raw/Zape1_1.fastq.gz 200000 > ../data/subsampled/Zape1_1_subsampled_200000.fastq
Zape1_2
seqtk sample -s100 ../data/raw/Zape1_2.fastq.gz 200000 > ../data/subsampled/Zape1_2_subsampled_200000.fastq
