# Set-up

In [11]:
import os
from tqdm.autonotebook import tqdm
from pysradb.sraweb import SRAweb

In [12]:
# Choose the current dataset we are working with
dataset_name = "Zhu2023_sc-islet_scRNA-seq"
srp_id = "SRP374217"

In [19]:
# Set-up directories
base_dir = "/cellar/users/aklie/data/datasets"
cwd = os.path.join(base_dir, dataset_name, "bin", "data_acquisition")
fastq_dir = os.path.join(base_dir,  dataset_name, "fastq", "24Oct23")
metadata_dir = os.path.join(base_dir, dataset_name, "metadata", "24Oct23")
base_dir, cwd, fastq_dir, metadata_dir

('/cellar/users/aklie/data/datasets',
 '/cellar/users/aklie/data/datasets/Zhu2023_sc-islet_scRNA-seq/bin/data_acquisition',
 '/cellar/users/aklie/data/datasets/Zhu2023_sc-islet_scRNA-seq/fastq/24Oct23',
 '/cellar/users/aklie/data/datasets/Zhu2023_sc-islet_scRNA-seq/metadata/24Oct23')

In [23]:
# If fastq_dir does not exist, create it
if not os.path.exists(fastq_dir):
    os.makedirs(fastq_dir)

In [20]:
# Connect to SRA
db = SRAweb()

# Get metadata

In [21]:
# Grab the metadata for the SRP
metadata = db.sra_metadata(srp_id, detailed=True)

In [22]:
# Save the metadata and the list of srr ids
metadata.to_csv(os.path.join(metadata_dir, f"{srp_id}_metadata.tsv"), index=False, sep="\t")
metadata["run_accession"].to_csv(os.path.join(metadata_dir, f"{srp_id}_srr_ids.txt"), index=False, header=False)

In [24]:
metadata

Unnamed: 0,run_accession,study_accession,study_title,experiment_accession,experiment_title,experiment_desc,organism_taxid,organism_name,library_name,library_strategy,...,gcp_access_type,experiment_alias,source_name,day,ena_fastq_http,ena_fastq_http_1,ena_fastq_http_2,ena_fastq_ftp,ena_fastq_ftp_1,ena_fastq_ftp_2
0,SRR19140222,SRP374217,Improving stem cell-derived pancreatic islets ...,SRX15207313,GSM6123268: H1-D39_S8; Homo sapiens; RNA-Seq,GSM6123268: H1-D39_S8; Homo sapiens; RNA-Seq,9606,Homo sapiens,GSM6123268,RNA-Seq,...,gcp identity,,Islets,D32,,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR191/022...,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR191/022...,,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR191/...,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR191/...
1,SRR19140223,SRP374217,Improving stem cell-derived pancreatic islets ...,SRX15207312,GSM6123267: H1-D32_S6; Homo sapiens; RNA-Seq,GSM6123267: H1-D32_S6; Homo sapiens; RNA-Seq,9606,Homo sapiens,GSM6123267,RNA-Seq,...,gcp identity,,Islets,D32,,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR191/023...,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR191/023...,,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR191/...,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR191/...
2,SRR19140224,SRP374217,Improving stem cell-derived pancreatic islets ...,SRX15207311,GSM6123266: H1-D21_S4; Homo sapiens; RNA-Seq,GSM6123266: H1-D21_S4; Homo sapiens; RNA-Seq,9606,Homo sapiens,GSM6123266,RNA-Seq,...,gcp identity,,Islets,D21,,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR191/024...,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR191/024...,,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR191/...,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR191/...
3,SRR19140225,SRP374217,Improving stem cell-derived pancreatic islets ...,SRX15207310,GSM6123265: Hs_hESC_beta_cells_differentiation...,GSM6123265: Hs_hESC_beta_cells_differentiation...,9606,Homo sapiens,GSM6123265,RNA-Seq,...,gcp identity,,Islets,D14,,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR191/025...,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR191/025...,,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR191/...,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR191/...
4,SRR19140226,SRP374217,Improving stem cell-derived pancreatic islets ...,SRX15207309,GSM6123264: HZ144_D11_S11; Homo sapiens; RNA-Seq,GSM6123264: HZ144_D11_S11; Homo sapiens; RNA-Seq,9606,Homo sapiens,GSM6123264,RNA-Seq,...,gcp identity,,Islets,D11,,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR191/026...,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR191/026...,,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR191/...,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR191/...


# Download non-diabetic samples (`sra` files)

In [11]:
db.download(df=metadata, out_dir=fastq_dir)

Checking download URLs


The following files will be downloaded: 

run_accession study_accession experiment_accession public_url                                                                                                  download_url                                                                                            out_dir                                                                            filesize
SRR19140210   SRP374215       SRX15207301          https://sra-downloadb.be-md.ncbi.nlm.nih.gov/sos1/sra-pub-zq-38/SRR019/19140/SRR19140210/SRR19140210.lite.1 ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/SRR191/SRR19140210/SRR19140210.sra /cellar/users/aklie/data/igvf/beta_cell_networks/fastq/Zhu2023_sc-islet_snATAC-seq  5.4 GB 
SRR19140211   SRP374215       SRX15207300          https://sra-downloadb.be-md.ncbi.nlm.nih.gov/sos1/sra-pub-zq-38/SRR019/19140/SRR19140211/SRR19140211.lite.1 ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/SRR191/SRR19140211/S

 80%|████████  | 4/5 [16:31<04:07, 247.76s/it]

# Convert to `fastq` files

In [None]:
import glob
import subprocess

In [64]:
tmp_dir = "/cellar/users/aklie/tmp/fastq-dump"
gzip = True
split_files = True
threads = 4

In [72]:
# Loop through and print out each SRA download file within the subdirectories of the fastq_dir
# Run the following command parallel-fastq-dump --threads 4 --outdir . --split-files --tmpdir $tmp_dir --gzip -s SRR14048750.sra
for sra_file in glob.glob(os.path.join(fastq_dir, srp_id, "*", "*.sra")):
    sra_dir = os.path.dirname(sra_file)
    if gzip:
        cmd = f"parallel-fastq-dump --threads {threads} --outdir {sra_dir} --split-files --tmpdir {tmp_dir} --gzip -s {sra_file}"
    else:
        cmd = f"parallel-fastq-dump --threads {threads} --outdir {sra_dir} --split-files --tmpdir {tmp_dir} -s {sra_file}"
    print(cmd)
    
    # Check to see if the files have already been downloaded
    if len(glob.glob(os.path.join(sra_dir, "*.fastq*"))) > 0:
        print(f"Files already downloaded for {sra_dir}")
    else:
        subprocess.run(cmd, shell=True)

parallel-fastq-dump --threads 4 --outdir /cellar/users/aklie/data/igvf/beta_cell_networks/fastq/Wang2023_islet_snATAC-seq/SRP311849/SRX10424698 --split-files --tmpdir /cellar/users/aklie/tmp/fastq-dump --gzip -s /cellar/users/aklie/data/igvf/beta_cell_networks/fastq/Wang2023_islet_snATAC-seq/SRP311849/SRX10424698/SRR14048750.sra


In [65]:
# Loop through and print out each SRA download file within the subdirectories of the fastq_dir
for file in tqdm(os.listdir(fastq_dir)):
    if file.endswith(".sra"):
        file_path = os.path.join(fastq_dir, file)
        print(file_path)

100%|██████████| 1/1 [00:00<00:00, 13662.23it/s]


# DONE!

---