# Set-up

In [1]:
import os
from tqdm.autonotebook import tqdm
from pysradb.sraweb import SRAweb

  


In [2]:
# Choose the current dataset we are working with
dataset_name = "Zhu2023_sc-islet_snATAC-seq"
srp_id = "SRP374215"

In [7]:
# Set-up directories
base_dir = "/cellar/users/aklie/data/igvf/beta_cell_networks"
cwd = os.path.join(base_dir, "download", dataset_name)
fastq_dir = os.path.join(base_dir, "fastq", dataset_name)
base_dir, cwd, fastq_dir

('/cellar/users/aklie/data/igvf/beta_cell_networks',
 '/cellar/users/aklie/data/igvf/beta_cell_networks/download/Zhu2023_sc-islet_snATAC-seq',
 '/cellar/users/aklie/data/igvf/beta_cell_networks/fastq/Zhu2023_sc-islet_snATAC-seq')

In [8]:
# Connect to SRA
db = SRAweb()

# Get metadata

In [9]:
# Grab the metadata for the SRP
metadata = db.sra_metadata(srp_id, detailed=True)

In [10]:
# Save the metadata and the list of srr ids
metadata.to_csv(os.path.join(cwd, f"{srp_id}_metadata.tsv"), index=False, sep="\t")
metadata["run_accession"].to_csv(os.path.join(cwd, f"{srp_id}_srr_ids.txt"), index=False, header=False)

# Download non-diabetic samples (`sra` files)

In [11]:
db.download(df=metadata, out_dir=fastq_dir)

Checking download URLs


The following files will be downloaded: 

run_accession study_accession experiment_accession public_url                                                                                                  download_url                                                                                            out_dir                                                                            filesize
SRR19140210   SRP374215       SRX15207301          https://sra-downloadb.be-md.ncbi.nlm.nih.gov/sos1/sra-pub-zq-38/SRR019/19140/SRR19140210/SRR19140210.lite.1 ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/SRR191/SRR19140210/SRR19140210.sra /cellar/users/aklie/data/igvf/beta_cell_networks/fastq/Zhu2023_sc-islet_snATAC-seq  5.4 GB 
SRR19140211   SRP374215       SRX15207300          https://sra-downloadb.be-md.ncbi.nlm.nih.gov/sos1/sra-pub-zq-38/SRR019/19140/SRR19140211/SRR19140211.lite.1 ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/SRR191/SRR19140211/S

 80%|████████  | 4/5 [16:31<04:07, 247.76s/it]

# Convert to `fastq` files

In [None]:
import glob
import subprocess

In [64]:
tmp_dir = "/cellar/users/aklie/tmp/fastq-dump"
gzip = True
split_files = True
threads = 4

In [72]:
# Loop through and print out each SRA download file within the subdirectories of the fastq_dir
# Run the following command parallel-fastq-dump --threads 4 --outdir . --split-files --tmpdir $tmp_dir --gzip -s SRR14048750.sra
for sra_file in glob.glob(os.path.join(fastq_dir, srp_id, "*", "*.sra")):
    sra_dir = os.path.dirname(sra_file)
    if gzip:
        cmd = f"parallel-fastq-dump --threads {threads} --outdir {sra_dir} --split-files --tmpdir {tmp_dir} --gzip -s {sra_file}"
    else:
        cmd = f"parallel-fastq-dump --threads {threads} --outdir {sra_dir} --split-files --tmpdir {tmp_dir} -s {sra_file}"
    print(cmd)
    
    # Check to see if the files have already been downloaded
    if len(glob.glob(os.path.join(sra_dir, "*.fastq*"))) > 0:
        print(f"Files already downloaded for {sra_dir}")
    else:
        subprocess.run(cmd, shell=True)

parallel-fastq-dump --threads 4 --outdir /cellar/users/aklie/data/igvf/beta_cell_networks/fastq/Wang2023_islet_snATAC-seq/SRP311849/SRX10424698 --split-files --tmpdir /cellar/users/aklie/tmp/fastq-dump --gzip -s /cellar/users/aklie/data/igvf/beta_cell_networks/fastq/Wang2023_islet_snATAC-seq/SRP311849/SRX10424698/SRR14048750.sra


In [65]:
# Loop through and print out each SRA download file within the subdirectories of the fastq_dir
for file in tqdm(os.listdir(fastq_dir)):
    if file.endswith(".sra"):
        file_path = os.path.join(fastq_dir, file)
        print(file_path)

100%|██████████| 1/1 [00:00<00:00, 13662.23it/s]


# DONE!

---