# Set-up

In [1]:
import os
from tqdm.autonotebook import tqdm
from pysradb.sraweb import SRAweb

  


In [3]:
# Dataset being processed: the SRA study accession and the folder name used for it on disk
srp_id = "SRP366403"
dataset_name = "Augsornworawat2023_sc-islet_10X-Multiome"

In [4]:
# Set up the directory layout for this dataset.
# The stamp below names the fastq/metadata sub-folders (presumably the acquisition
# date); keeping it in one place stops the two paths from drifting apart.
date_stamp = "07Nov23"
base_dir = "/cellar/users/aklie/data/datasets"
dataset_dir = os.path.join(base_dir, dataset_name)

cwd = os.path.join(dataset_dir, "bin", "data_acquisition")
fastq_dir = os.path.join(dataset_dir, "fastq", date_stamp)
metadata_dir = os.path.join(dataset_dir, "metadata", date_stamp)

# Echo the resolved paths as the cell output
base_dir, cwd, fastq_dir, metadata_dir

('/cellar/users/aklie/data/datasets',
 '/cellar/users/aklie/data/datasets/Augsornworawat2023_sc-islet_10X-Multiome/bin/data_acquisition',
 '/cellar/users/aklie/data/datasets/Augsornworawat2023_sc-islet_10X-Multiome/fastq/07Nov23',
 '/cellar/users/aklie/data/datasets/Augsornworawat2023_sc-islet_10X-Multiome/metadata/07Nov23')

In [6]:
# Make sure the fastq and metadata output directories exist.
# exist_ok=True makes the call idempotent and avoids the check-then-create race
# of a separate os.path.exists() test.
for out_dir in (fastq_dir, metadata_dir):
    os.makedirs(out_dir, exist_ok=True)


In [7]:
# Connect to SRA
# `db` is the pysradb SRAweb client; it is reused below for the metadata query
# and for the download.
db = SRAweb()

# Get metadata

In [9]:
# Grab the metadata for the SRP
# NOTE(review): detailed=True presumably adds the sample-attribute and ENA fastq
# link columns seen in the output — confirm against the pysradb documentation.
metadata = db.sra_metadata(srp_id, detailed=True)
# Preview only the first rows of the table
metadata.head()

Unnamed: 0,run_accession,study_accession,study_title,experiment_accession,experiment_title,experiment_desc,organism_taxid,organism_name,library_name,library_strategy,...,source_name,cell type,stage,kit used,ena_fastq_http,ena_fastq_http_1,ena_fastq_http_2,ena_fastq_ftp,ena_fastq_ftp_1,ena_fastq_ftp_2
0,SRR18511696,SRP366403,Single nuclei multiomics of human stem cell-de...,SRX14642877,GSM5979664: SC-Islet 1 [ATAC-Seq]; Homo sapien...,GSM5979664: SC-Islet 1 [ATAC-Seq]; Homo sapien...,9606,Homo sapiens,,ATAC-seq,...,Stem Cell Derived Islets,ES-Derived Islet,In vitro,10x Single Cell Multiome ATAC + Gene Expressio...,,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR185/096...,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR185/096...,,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR185/...,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR185/...
1,SRR18511697,SRP366403,Single nuclei multiomics of human stem cell-de...,SRX14642878,GSM5979665: SC-Islet 1 [RNA-Seq]; Homo sapiens...,GSM5979665: SC-Islet 1 [RNA-Seq]; Homo sapiens...,9606,Homo sapiens,,RNA-Seq,...,Stem Cell Derived Islets,ES-Derived Islet,In vitro,10x Single Cell Multiome ATAC + Gene Expressio...,,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR185/097...,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR185/097...,,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR185/...,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR185/...
2,SRR18511698,SRP366403,Single nuclei multiomics of human stem cell-de...,SRX14642879,GSM5979666: SC-Islet 2 [ATAC-Seq]; Homo sapien...,GSM5979666: SC-Islet 2 [ATAC-Seq]; Homo sapien...,9606,Homo sapiens,,ATAC-seq,...,Stem Cell Derived Islets,ES-Derived Islet,In vitro,10x Single Cell Multiome ATAC + Gene Expressio...,,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR185/098...,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR185/098...,,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR185/...,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR185/...
3,SRR18511699,SRP366403,Single nuclei multiomics of human stem cell-de...,SRX14642880,GSM5979667: SC-Islet 2 [RNA-Seq]; Homo sapiens...,GSM5979667: SC-Islet 2 [RNA-Seq]; Homo sapiens...,9606,Homo sapiens,,RNA-Seq,...,Stem Cell Derived Islets,ES-Derived Islet,In vitro,10x Single Cell Multiome ATAC + Gene Expressio...,,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR185/099...,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR185/099...,,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR185/...,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR185/...
4,SRR18511704,SRP366403,Single nuclei multiomics of human stem cell-de...,SRX14642885,GSM5979672: SC-Islet Stage6 week 2 [ATAC-Seq];...,GSM5979672: SC-Islet Stage6 week 2 [ATAC-Seq];...,9606,Homo sapiens,,ATAC-seq,...,Stem Cell Derived Islets,ES-Derived Islet,In vitro,10x Single Cell Multiome ATAC + Gene Expressio...,,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR185/004...,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR185/004...,,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR185/...,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR185/...


In [10]:
# List every column available in the metadata table
metadata.columns

Index(['run_accession', 'study_accession', 'study_title',
       'experiment_accession', 'experiment_title', 'experiment_desc',
       'organism_taxid', 'organism_name', 'library_name', 'library_strategy',
       'library_source', 'library_selection', 'library_layout',
       'sample_accession', 'sample_title', 'instrument', 'instrument_model',
       'instrument_model_desc', 'total_spots', 'total_size', 'run_total_spots',
       'run_total_bases', 'run_alias', 'public_filename', 'public_size',
       'public_date', 'public_md5', 'public_version', 'public_semantic_name',
       'public_supertype', 'public_sratoolkit', 'aws_url', 'aws_free_egress',
       'aws_access_type', 'public_url', 'ncbi_url', 'ncbi_free_egress',
       'ncbi_access_type', 'gcp_url', 'gcp_free_egress', 'gcp_access_type',
       'experiment_alias', 'source_name', 'cell type', 'stage', 'kit used',
       'ena_fastq_http', 'ena_fastq_http_1', 'ena_fastq_http_2',
       'ena_fastq_ftp', 'ena_fastq_ftp_1', 'ena_fastq_ftp_2'],
      dtype='object')

In [16]:
# Show only the columns that describe each run: titles, file name and sample annotations
summary_columns = [
    "experiment_title",
    "experiment_desc",
    "public_filename",
    "source_name",
    "cell type",
    "stage",
    "kit used",
]
metadata[summary_columns]

Unnamed: 0,experiment_title,experiment_desc,public_filename,source_name,cell type,stage,kit used
0,GSM5979664: SC-Islet 1 [ATAC-Seq]; Homo sapien...,GSM5979664: SC-Islet 1 [ATAC-Seq]; Homo sapien...,SRR18511696.lite,Stem Cell Derived Islets,ES-Derived Islet,In vitro,10x Single Cell Multiome ATAC + Gene Expressio...
1,GSM5979665: SC-Islet 1 [RNA-Seq]; Homo sapiens...,GSM5979665: SC-Islet 1 [RNA-Seq]; Homo sapiens...,SRR18511697.lite,Stem Cell Derived Islets,ES-Derived Islet,In vitro,10x Single Cell Multiome ATAC + Gene Expressio...
2,GSM5979666: SC-Islet 2 [ATAC-Seq]; Homo sapien...,GSM5979666: SC-Islet 2 [ATAC-Seq]; Homo sapien...,SRR18511698.lite,Stem Cell Derived Islets,ES-Derived Islet,In vitro,10x Single Cell Multiome ATAC + Gene Expressio...
3,GSM5979667: SC-Islet 2 [RNA-Seq]; Homo sapiens...,GSM5979667: SC-Islet 2 [RNA-Seq]; Homo sapiens...,SRR18511699.lite,Stem Cell Derived Islets,ES-Derived Islet,In vitro,10x Single Cell Multiome ATAC + Gene Expressio...
4,GSM5979672: SC-Islet Stage6 week 2 [ATAC-Seq];...,GSM5979672: SC-Islet Stage6 week 2 [ATAC-Seq];...,TWHH-week2_atac_S3_L002_R1_001.fastq.gz,Stem Cell Derived Islets,ES-Derived Islet,In vitro,10x Single Cell Multiome ATAC + Gene Expressio...
5,GSM5979673: SC-Islet Stage6 week 2 [RNA-Seq]; ...,GSM5979673: SC-Islet Stage6 week 2 [RNA-Seq]; ...,SRR18511705.lite,Stem Cell Derived Islets,ES-Derived Islet,In vitro,10x Single Cell Multiome ATAC + Gene Expressio...
6,GSM5979673: SC-Islet Stage6 week 2 [RNA-Seq]; ...,GSM5979673: SC-Islet Stage6 week 2 [RNA-Seq]; ...,SRR18511706.lite,Stem Cell Derived Islets,ES-Derived Islet,In vitro,10x Single Cell Multiome ATAC + Gene Expressio...
7,GSM5979674: SC-Islet Stage6 week 3 [ATAC-Seq];...,GSM5979674: SC-Islet Stage6 week 3 [ATAC-Seq];...,SRR18511707.lite,Stem Cell Derived Islets,ES-Derived Islet,In vitro,10x Single Cell Multiome ATAC + Gene Expressio...
8,GSM5979675: SC-Islet Stage6 week 3 [RNA-Seq]; ...,GSM5979675: SC-Islet Stage6 week 3 [RNA-Seq]; ...,SRR18511708.lite,Stem Cell Derived Islets,ES-Derived Islet,In vitro,10x Single Cell Multiome ATAC + Gene Expressio...
9,GSM5979676: SC-Islet Stage6 week 4 [ATAC-Seq];...,GSM5979676: SC-Islet Stage6 week 4 [ATAC-Seq];...,SRR18511709.lite,Stem Cell Derived Islets,ES-Derived Islet,In vitro,10x Single Cell Multiome ATAC + Gene Expressio...


In [22]:
# Write the full metadata table (tab-separated) and a header-less list of run accessions
metadata_path = os.path.join(metadata_dir, f"{srp_id}_metadata.tsv")
srr_ids_path = os.path.join(metadata_dir, f"{srp_id}_srr_ids.txt")

metadata.to_csv(metadata_path, sep="\t", index=False)
metadata["run_accession"].to_csv(srr_ids_path, header=False, index=False)

# Download all runs in the study (`sra` files)

In [17]:
# Download every run listed in `metadata` into fastq_dir.
# NOTE(review): the conversion cell further down globs fastq_dir/<srp_id>/*/*.sra, so
# files are presumably written one sub-directory per experiment — confirm against pysradb.
db.download(df=metadata, out_dir=fastq_dir)

Checking download URLs


KeyboardInterrupt: 

# Convert to `fastq` files

In [None]:
import glob
import subprocess

In [64]:
# Options for the parallel-fastq-dump conversion below
threads = 4          # value passed to --threads
split_files = True   # desired --split-files behaviour
gzip = True          # append --gzip to the command when True
tmp_dir = "/cellar/users/aklie/tmp/fastq-dump"  # value passed to --tmpdir

In [72]:
# Convert every downloaded .sra file under fastq_dir/<srp_id>/*/ to fastq with
# parallel-fastq-dump. With the default options the command is equivalent to:
#   parallel-fastq-dump --threads 4 --outdir <sra_dir> --split-files --tmpdir <tmp_dir> --gzip -s <file>.sra

# NOTE(review): parallel-fastq-dump presumably expects --tmpdir to exist already;
# creating it up front is harmless if it does.
os.makedirs(tmp_dir, exist_ok=True)

for sra_file in sorted(glob.glob(os.path.join(fastq_dir, srp_id, "*", "*.sra"))):
    sra_dir = os.path.dirname(sra_file)

    # Build the command once as an argument list so paths containing spaces or
    # shell metacharacters are passed through verbatim (no shell involved).
    # Both optional flags honour the options cell instead of being hard-coded.
    cmd = ["parallel-fastq-dump", "--threads", str(threads), "--outdir", sra_dir]
    if split_files:
        cmd.append("--split-files")
    cmd += ["--tmpdir", tmp_dir]
    if gzip:
        cmd.append("--gzip")
    cmd += ["-s", sra_file]
    print(" ".join(cmd))

    # Skip runs whose directory already contains fastq output
    if glob.glob(os.path.join(sra_dir, "*.fastq*")):
        print(f"Files already downloaded for {sra_dir}")
        continue

    # Report a failed conversion instead of silently ignoring the exit status,
    # but keep going so one bad run does not block the rest.
    result = subprocess.run(cmd)
    if result.returncode != 0:
        print(f"parallel-fastq-dump failed for {sra_file} (exit code {result.returncode})")

parallel-fastq-dump --threads 4 --outdir /cellar/users/aklie/data/igvf/beta_cell_networks/fastq/Wang2023_islet_snATAC-seq/SRP311849/SRX10424698 --split-files --tmpdir /cellar/users/aklie/tmp/fastq-dump --gzip -s /cellar/users/aklie/data/igvf/beta_cell_networks/fastq/Wang2023_islet_snATAC-seq/SRP311849/SRX10424698/SRR14048750.sra


In [65]:
# List every .sra file found anywhere below fastq_dir.
# The conversion cell globs fastq_dir/<srp_id>/*/*.sra, i.e. downloads sit in nested
# sub-directories, so a flat os.listdir(fastq_dir) would never see them; walk the tree.
sra_paths = sorted(
    os.path.join(dir_path, file_name)
    for dir_path, dir_names, file_names in os.walk(fastq_dir)
    for file_name in file_names
    if file_name.endswith(".sra")
)
for file_path in tqdm(sra_paths):
    print(file_path)

100%|██████████| 1/1 [00:00<00:00, 13662.23it/s]


# DONE!

---