# Set-up

In [1]:
import os
from tqdm.autonotebook import tqdm
from pysradb.sraweb import SRAweb

  


In [2]:
# Choose the current dataset we are working with
dataset_name = "AI-ATAC"
srp_id = "SRP110978"

In [3]:
# Set-up directories
base_dir = "/cellar/users/aklie/data/datasets"
cwd = os.path.join(base_dir, dataset_name, "bin", "data_acquisition")
fastq_dir = os.path.join(base_dir,  dataset_name, "fastq", "10Nov23")
metadata_dir = os.path.join(base_dir, dataset_name, "metadata", "10Nov23")
base_dir, cwd, fastq_dir, metadata_dir

('/cellar/users/aklie/data/datasets',
 '/cellar/users/aklie/data/datasets/AI-ATAC/bin/data_acquisition',
 '/cellar/users/aklie/data/datasets/AI-ATAC/fastq/10Nov23',
 '/cellar/users/aklie/data/datasets/AI-ATAC/metadata/10Nov23')

In [4]:
# If fastq_dir does not exist, create it
if not os.path.exists(fastq_dir):
    os.makedirs(fastq_dir)

# If metadata_dir does not exist, create it
if not os.path.exists(metadata_dir):
    os.makedirs(metadata_dir)

In [5]:
# Connect to SRA
db = SRAweb()

# Get metadata

In [6]:
# Grab the metadata for the SRP
metadata = db.sra_metadata(srp_id, detailed=True)
metadata.head()

Unnamed: 0,run_accession,study_accession,study_title,experiment_accession,experiment_title,experiment_desc,organism_taxid,organism_name,library_name,library_strategy,...,source_name,strain,genotype,cell type,ena_fastq_http,ena_fastq_http_1,ena_fastq_http_2,ena_fastq_ftp,ena_fastq_ftp_1,ena_fastq_ftp_2
0,SRR5799381,SRP110978,ImmGen ATAC-seq data,SRX2978848,GSM2692169: Ep.MEChi.Th#1; Mus musculus; ATAC-seq,GSM2692169: Ep.MEChi.Th#1; Mus musculus; ATAC-seq,10090,Mus musculus,,ATAC-seq,...,Thymus,C57BL/6,,Thymic epithelial Cell,,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR579/001...,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR579/001...,,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR579/...,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR579/...
1,SRR5799382,SRP110978,ImmGen ATAC-seq data,SRX2978849,GSM2692170: Ep.MEChi.Th#2; Mus musculus; ATAC-seq,GSM2692170: Ep.MEChi.Th#2; Mus musculus; ATAC-seq,10090,Mus musculus,,ATAC-seq,...,Thymus,C57BL/6,,Thymic epithelial Cell,,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR579/002...,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR579/002...,,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR579/...,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR579/...
2,SRR5799383,SRP110978,ImmGen ATAC-seq data,SRX2978850,GSM2692171: preT.DN1.Th#1; Mus musculus; ATAC-seq,GSM2692171: preT.DN1.Th#1; Mus musculus; ATAC-seq,10090,Mus musculus,,ATAC-seq,...,Thymus,C57BL/6,,Double Negative Thymocytes,,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR579/003...,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR579/003...,,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR579/...,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR579/...
3,SRR5799384,SRP110978,ImmGen ATAC-seq data,SRX2978851,GSM2692172: preT.DN1.Th#2; Mus musculus; ATAC-seq,GSM2692172: preT.DN1.Th#2; Mus musculus; ATAC-seq,10090,Mus musculus,,ATAC-seq,...,Thymus,C57BL/6,,Double Negative Thymocytes,,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR579/004...,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR579/004...,,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR579/...,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR579/...
4,SRR5799385,SRP110978,ImmGen ATAC-seq data,SRX2978852,GSM2692173: preT.DN2a.Th#1; Mus musculus; ATAC...,GSM2692173: preT.DN2a.Th#1; Mus musculus; ATAC...,10090,Mus musculus,,ATAC-seq,...,Thymus,C57BL/6,,Double Negative Thymocytes,,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR579/005...,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR579/005...,,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR579/...,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR579/...


In [7]:
metadata.columns

Index(['run_accession', 'study_accession', 'study_title',
       'experiment_accession', 'experiment_title', 'experiment_desc',
       'organism_taxid', 'organism_name', 'library_name', 'library_strategy',
       'library_source', 'library_selection', 'library_layout',
       'sample_accession', 'sample_title', 'instrument', 'instrument_model',
       'instrument_model_desc', 'total_spots', 'total_size', 'run_total_spots',
       'run_total_bases', 'run_alias', 'public_filename', 'public_url',
       'public_size', 'public_date', 'public_md5', 'public_version',
       'public_semantic_name', 'public_supertype', 'public_sratoolkit',
       'ncbi_url', 'ncbi_free_egress', 'ncbi_access_type', 'aws_url',
       'aws_free_egress', 'aws_access_type', 'gcp_url', 'gcp_free_egress',
       'gcp_access_type', 'experiment_alias', 'source_name', 'strain',
       'genotype', 'cell type', 'ena_fastq_http', 'ena_fastq_http_1',
       'ena_fastq_http_2', 'ena_fastq_ftp', 'ena_fastq_ftp_1',
       'ena

In [9]:
metadata[['experiment_title', 'experiment_desc']].head()

Unnamed: 0,experiment_title,experiment_desc
0,GSM2692169: Ep.MEChi.Th#1; Mus musculus; ATAC-seq,GSM2692169: Ep.MEChi.Th#1; Mus musculus; ATAC-seq
1,GSM2692170: Ep.MEChi.Th#2; Mus musculus; ATAC-seq,GSM2692170: Ep.MEChi.Th#2; Mus musculus; ATAC-seq
2,GSM2692171: preT.DN1.Th#1; Mus musculus; ATAC-seq,GSM2692171: preT.DN1.Th#1; Mus musculus; ATAC-seq
3,GSM2692172: preT.DN1.Th#2; Mus musculus; ATAC-seq,GSM2692172: preT.DN1.Th#2; Mus musculus; ATAC-seq
4,GSM2692173: preT.DN2a.Th#1; Mus musculus; ATAC...,GSM2692173: preT.DN2a.Th#1; Mus musculus; ATAC...


In [10]:
# Save the metadata and the list of srr ids
metadata.to_csv(os.path.join(metadata_dir, f"{srp_id}_metadata.tsv"), index=False, sep="\t")
metadata["run_accession"].to_csv(os.path.join(metadata_dir, f"{srp_id}_srr_ids.txt"), index=False, header=False)

# Download non-diabetic samples (`sra` files)

In [11]:
#db.download(df=metadata, out_dir=fastq_dir)

# Convert to `fastq` files

In [None]:
import glob
import subprocess

In [64]:
tmp_dir = "/cellar/users/aklie/tmp/fastq-dump"
gzip = True
split_files = True
threads = 4

In [72]:
# Loop through and print out each SRA download file within the subdirectories of the fastq_dir
# Run the following command parallel-fastq-dump --threads 4 --outdir . --split-files --tmpdir $tmp_dir --gzip -s SRR14048750.sra
for sra_file in glob.glob(os.path.join(fastq_dir, srp_id, "*", "*.sra")):
    sra_dir = os.path.dirname(sra_file)
    if gzip:
        cmd = f"parallel-fastq-dump --threads {threads} --outdir {sra_dir} --split-files --tmpdir {tmp_dir} --gzip -s {sra_file}"
    else:
        cmd = f"parallel-fastq-dump --threads {threads} --outdir {sra_dir} --split-files --tmpdir {tmp_dir} -s {sra_file}"
    print(cmd)
    
    # Check to see if the files have already been downloaded
    if len(glob.glob(os.path.join(sra_dir, "*.fastq*"))) > 0:
        print(f"Files already downloaded for {sra_dir}")
    else:
        subprocess.run(cmd, shell=True)

parallel-fastq-dump --threads 4 --outdir /cellar/users/aklie/data/igvf/beta_cell_networks/fastq/Wang2023_islet_snATAC-seq/SRP311849/SRX10424698 --split-files --tmpdir /cellar/users/aklie/tmp/fastq-dump --gzip -s /cellar/users/aklie/data/igvf/beta_cell_networks/fastq/Wang2023_islet_snATAC-seq/SRP311849/SRX10424698/SRR14048750.sra


In [65]:
# Loop through and print out each SRA download file within the subdirectories of the fastq_dir
for file in tqdm(os.listdir(fastq_dir)):
    if file.endswith(".sra"):
        file_path = os.path.join(fastq_dir, file)
        print(file_path)

100%|██████████| 1/1 [00:00<00:00, 13662.23it/s]


# DONE!

---