# Prep for scripts

In [87]:
# Imports
import os
import glob
import pickle
import pandas as pd

In [3]:
# Set paths
sra_metadata = "/cellar/users/aklie/data/datasets/Chiou2021_islet_snATAC-seq/bin/data_acquisition/SRP290255_metadata.tsv"

In [22]:
# Grab datasets dirs
datasets = sorted(glob.glob("/cellar/users/aklie/data/datasets/Chiou2021_islet_snATAC-seq/fastq/SRP290255/*"))
datasets = [x for x in datasets if "vdb_validate_all.out" not in x][:-1]
len(datasets), datasets

(4,
 ['/cellar/users/aklie/data/datasets/Chiou2021_islet_snATAC-seq/fastq/SRP290255/SRX9409841',
  '/cellar/users/aklie/data/datasets/Chiou2021_islet_snATAC-seq/fastq/SRP290255/SRX9409842',
  '/cellar/users/aklie/data/datasets/Chiou2021_islet_snATAC-seq/fastq/SRP290255/SRX9409843',
  '/cellar/users/aklie/data/datasets/Chiou2021_islet_snATAC-seq/fastq/SRP290255/SRX9409844'])

In [29]:
# Add SRA accessions
sra_df = pd.read_csv(sra_metadata, sep="\t")
#sra_df["sample_id"] = sra_df["experiment_title"].str.split(":", expand=True)[1].str.split("snATAC", expand=True)[0].str.strip()
sra_df["sample_id"] = sra_df["donor id"].str.replace(":", "_")
sra_df = sra_df.iloc[:-1]  # Drop the last row (will handle this separately)
sra_df.head()

Unnamed: 0,run_accession,study_accession,study_title,experiment_accession,experiment_title,experiment_desc,organism_taxid,organism_name,library_name,library_strategy,...,age,bmi,donor id,ena_fastq_http,ena_fastq_http_1,ena_fastq_http_2,ena_fastq_ftp,ena_fastq_ftp_1,ena_fastq_ftp_2,sample_id
0,SRR12957013,SRP290255,Single-cell chromatin accessibility identifies...,SRX9409841,GSM4873768: Islet 1 snATAC (CB); Homo sapiens;...,GSM4873768: Islet 1 snATAC (CB); Homo sapiens;...,9606,Homo sapiens,,ATAC-seq,...,32,32.3,UNOS:AFC2208,,,,,,,UNOS_AFC2208
1,SRR12957014,SRP290255,Single-cell chromatin accessibility identifies...,SRX9409842,GSM4873769: Islet 2 snATAC (CB); Homo sapiens;...,GSM4873769: Islet 2 snATAC (CB); Homo sapiens;...,9606,Homo sapiens,,ATAC-seq,...,45,29.3,UNOS:AFEA331,,,,,,,UNOS_AFEA331
2,SRR12957015,SRP290255,Single-cell chromatin accessibility identifies...,SRX9409843,GSM4873770: Islet 3 snATAC (CB); Homo sapiens;...,GSM4873770: Islet 3 snATAC (CB); Homo sapiens;...,9606,Homo sapiens,,ATAC-seq,...,62,36.1,UNOS:AFEP022,,,,,,,UNOS_AFEP022
3,SRR12957016,SRP290255,Single-cell chromatin accessibility identifies...,SRX9409844,GSM4873771: Pancreas 1 snATAC (CB); Homo sapie...,GSM4873771: Pancreas 1 snATAC (CB); Homo sapie...,9606,Homo sapiens,,ATAC-seq,...,33,30.9,nPOD:6004,,,,,,,nPOD_6004


In [31]:
# Get a mapping of experiment accessions to sample ids, useful for chromap and CellRanger
expacc_to_sample = sra_df.set_index("experiment_accession")["sample_id"].to_dict()
expacc_to_sample

{'SRX9409841': 'UNOS_AFC2208',
 'SRX9409842': 'UNOS_AFEA331',
 'SRX9409843': 'UNOS_AFEP022',
 'SRX9409844': 'nPOD_6004'}

In [35]:
expacc_to_sample.values()

dict_values(['UNOS_AFC2208', 'UNOS_AFEA331', 'UNOS_AFEP022', 'nPOD_6004'])

In [33]:
# Rename for convenience
for dataset in datasets:
    fastq_files = glob.glob(os.path.join(dataset, "*.fastq.gz"))
    file_mapping = {}
    for fastq_file in fastq_files:
        read_type = fastq_file.split("_")[-1].split(".")[0]
        file_path = os.path.dirname(fastq_file)
        exp_acc = os.path.basename(file_path).split("_")[0]
        sample_id = expacc_to_sample[exp_acc]
        new_file = f"{file_path}/{sample_id}_R{read_type}.fastq.gz"
        file_mapping[fastq_file] = new_file
        cmd = f"mv {fastq_file} {new_file}"
        print(cmd)
        os.system(cmd)
    with open(os.path.join(file_path, "file_mapping.pickle"), "wb") as f:
        pickle.dump(file_mapping, f)

mv /cellar/users/aklie/data/datasets/Chiou2021_islet_snATAC-seq/fastq/SRP290255/SRX9409841/SRR12957013_1.fastq.gz /cellar/users/aklie/data/datasets/Chiou2021_islet_snATAC-seq/fastq/SRP290255/SRX9409841/UNOS_AFC2208_R1.fastq.gz
mv /cellar/users/aklie/data/datasets/Chiou2021_islet_snATAC-seq/fastq/SRP290255/SRX9409841/SRR12957013_2.fastq.gz /cellar/users/aklie/data/datasets/Chiou2021_islet_snATAC-seq/fastq/SRP290255/SRX9409841/UNOS_AFC2208_R2.fastq.gz
mv /cellar/users/aklie/data/datasets/Chiou2021_islet_snATAC-seq/fastq/SRP290255/SRX9409842/SRR12957014_1.fastq.gz /cellar/users/aklie/data/datasets/Chiou2021_islet_snATAC-seq/fastq/SRP290255/SRX9409842/UNOS_AFEA331_R1.fastq.gz
mv /cellar/users/aklie/data/datasets/Chiou2021_islet_snATAC-seq/fastq/SRP290255/SRX9409842/SRR12957014_2.fastq.gz /cellar/users/aklie/data/datasets/Chiou2021_islet_snATAC-seq/fastq/SRP290255/SRX9409842/UNOS_AFEA331_R2.fastq.gz
mv /cellar/users/aklie/data/datasets/Chiou2021_islet_snATAC-seq/fastq/SRP290255/SRX9409843/S

In [45]:
import gzip

def extract_barcodes(input_fastq, output_fastq, n_reads=None, return_bcs=False):
    # Determine if the input/output files are gzipped based on the file extension
    if input_fastq.endswith('.gz'):
        open_func = gzip.open
        mode = 'rt'  # Text mode for reading
        write_mode = 'wt'  # Text mode for writing
    else:
        open_func = open
        mode = 'r'
        write_mode = 'w'
    bcs = []
    with open_func(input_fastq, mode) as infile, open_func(output_fastq, write_mode) as outfile:
        read_count = 0
        while True:
            header = infile.readline().strip()
            if not header:
                break  # End of file
            sequence = infile.readline().strip()
            plus = infile.readline().strip()
            quality = infile.readline().strip()

            # Extract barcode (assuming it's the first element in the description part of the header)
            barcode = header.split()[1].split(':')[0]

            # Append to list
            bcs.append(barcode)
            
            # Write to output file
            outfile.write(header + '\n')
            outfile.write(barcode + '\n')
            outfile.write(plus + '\n')
            outfile.write('?' * len(barcode) + '\n')  # Assuming all quality scores are '?'

            read_count += 1
            if n_reads is not None and read_count >= n_reads:
                break
    if return_bcs:
        return bcs

In [86]:
for dataset in datasets:
    fastq_files = glob.glob(os.path.join(dataset, "*R1.fastq.gz"))
    for fastq_file in fastq_files:
        output_fastq = fastq_file.replace("R1.fastq.gz", "barcodes.fastq.gz")
        extract_barcodes(fastq_file, output_fastq)
        print(f"Extracted barcodes from {fastq_file} to {output_fastq}")

Extracted barcodes from /cellar/users/aklie/data/datasets/Chiou2021_islet_snATAC-seq/fastq/SRP290255/SRX9409841/UNOS_AFC2208_R1.fastq.gz to /cellar/users/aklie/data/datasets/Chiou2021_islet_snATAC-seq/fastq/SRP290255/SRX9409841/UNOS_AFC2208_barcodes.fastq.gz
Extracted barcodes from /cellar/users/aklie/data/datasets/Chiou2021_islet_snATAC-seq/fastq/SRP290255/SRX9409842/UNOS_AFEA331_R1.fastq.gz to /cellar/users/aklie/data/datasets/Chiou2021_islet_snATAC-seq/fastq/SRP290255/SRX9409842/UNOS_AFEA331_barcodes.fastq.gz
Extracted barcodes from /cellar/users/aklie/data/datasets/Chiou2021_islet_snATAC-seq/fastq/SRP290255/SRX9409843/UNOS_AFEP022_R1.fastq.gz to /cellar/users/aklie/data/datasets/Chiou2021_islet_snATAC-seq/fastq/SRP290255/SRX9409843/UNOS_AFEP022_barcodes.fastq.gz
Extracted barcodes from /cellar/users/aklie/data/datasets/Chiou2021_islet_snATAC-seq/fastq/SRP290255/SRX9409844/nPOD_6004_R1.fastq.gz to /cellar/users/aklie/data/datasets/Chiou2021_islet_snATAC-seq/fastq/SRP290255/SRX940984

In [73]:
test_r1 = "/cellar/users/aklie/data/datasets/Chiou2021_islet_snATAC-seq/fastq/SRP290255/SRX9409842/UNOS_AFEA331_R1.fastq.gz"
test_r1_bc = "/cellar/users/aklie/data/datasets/Chiou2021_islet_snATAC-seq/fastq/SRP290255/SRX9409842/UNOS_AFEA331_R3.fastq.gz"

In [74]:
r1_bcs = extract_barcodes(test_r1, test_r1_bc, n_reads=100000, return_bcs=True)

In [79]:
test_r2 = "/cellar/users/aklie/data/datasets/Chiou2021_islet_snATAC-seq/fastq/SRP290255/SRX9409842/UNOS_AFEA331_R2.fastq.gz"
test_r2_bc = "/cellar/users/aklie/data/datasets/Chiou2021_islet_snATAC-seq/fastq/SRP290255/SRX9409842/UNOS_AFEA331_R3.fastq.gz"

In [80]:
r2_bcs = extract_barcodes(test_r2, test_r2_bc, n_reads=100000, return_bcs=True)

In [81]:
# Get matching and mismatching barcodes
for i, (bc1, bc2) in enumerate(zip(r1_bcs, r2_bcs)):
    if bc1 != bc2:
        print(i, bc1, bc2)

# 

# 

In [89]:
sra_df.columns

Index(['run_accession', 'study_accession', 'study_title',
       'experiment_accession', 'experiment_title', 'experiment_desc',
       'organism_taxid', 'organism_name', 'library_name', 'library_strategy',
       'library_source', 'library_selection', 'library_layout',
       'sample_accession', 'sample_title', 'instrument', 'instrument_model',
       'instrument_model_desc', 'total_spots', 'total_size', 'run_total_spots',
       'run_total_bases', 'run_alias', 'public_filename', 'public_size',
       'public_date', 'public_md5', 'public_version', 'public_semantic_name',
       'public_supertype', 'public_sratoolkit', 'aws_url', 'aws_free_egress',
       'aws_access_type', 'public_url', 'ncbi_url', 'ncbi_free_egress',
       'ncbi_access_type', 'gcp_url', 'gcp_free_egress', 'gcp_access_type',
       'experiment_alias', 'source_name', 'tissue', 'sex', 'age', 'bmi',
       'donor id', 'ena_fastq_http', 'ena_fastq_http_1', 'ena_fastq_http_2',
       'ena_fastq_ftp', 'ena_fastq_ftp_1', 'en

In [90]:
# Paths
outdir_path = "/cellar/users/aklie/data/datasets/Chiou2021_islet_snATAC-seq/metadata/2024_04_22"

In [91]:
# make output directory
os.makedirs(outdir_path, exist_ok=True)

# For processing with SnapATAC2

In [98]:
# Add SRA accessions
sra_df = pd.read_csv(sra_metadata, sep="\t")
#sra_df["sample_id"] = sra_df["experiment_title"].str.split(":", expand=True)[1].str.split("snATAC", expand=True)[0].str.strip()
sra_df["sample_id"] = sra_df["donor id"].str.replace(":", "_")
sra_df.head()

Unnamed: 0,run_accession,study_accession,study_title,experiment_accession,experiment_title,experiment_desc,organism_taxid,organism_name,library_name,library_strategy,...,age,bmi,donor id,ena_fastq_http,ena_fastq_http_1,ena_fastq_http_2,ena_fastq_ftp,ena_fastq_ftp_1,ena_fastq_ftp_2,sample_id
0,SRR12957013,SRP290255,Single-cell chromatin accessibility identifies...,SRX9409841,GSM4873768: Islet 1 snATAC (CB); Homo sapiens;...,GSM4873768: Islet 1 snATAC (CB); Homo sapiens;...,9606,Homo sapiens,,ATAC-seq,...,32,32.3,UNOS:AFC2208,,,,,,,UNOS_AFC2208
1,SRR12957014,SRP290255,Single-cell chromatin accessibility identifies...,SRX9409842,GSM4873769: Islet 2 snATAC (CB); Homo sapiens;...,GSM4873769: Islet 2 snATAC (CB); Homo sapiens;...,9606,Homo sapiens,,ATAC-seq,...,45,29.3,UNOS:AFEA331,,,,,,,UNOS_AFEA331
2,SRR12957015,SRP290255,Single-cell chromatin accessibility identifies...,SRX9409843,GSM4873770: Islet 3 snATAC (CB); Homo sapiens;...,GSM4873770: Islet 3 snATAC (CB); Homo sapiens;...,9606,Homo sapiens,,ATAC-seq,...,62,36.1,UNOS:AFEP022,,,,,,,UNOS_AFEP022
3,SRR12957016,SRP290255,Single-cell chromatin accessibility identifies...,SRX9409844,GSM4873771: Pancreas 1 snATAC (CB); Homo sapie...,GSM4873771: Pancreas 1 snATAC (CB); Homo sapie...,9606,Homo sapiens,,ATAC-seq,...,33,30.9,nPOD:6004,,,,,,,nPOD_6004
4,SRR14135828,SRP290255,Single-cell chromatin accessibility identifies...,SRX9409845,GSM4873772: Pancreas 1 snATAC (10X); Homo sapi...,GSM4873772: Pancreas 1 snATAC (10X); Homo sapi...,9606,Homo sapiens,,ATAC-seq,...,33,30.9,nPOD:6004,,,,,,,nPOD_6004


In [100]:
df = sra_df[['tissue', 'sex', 'age', 'bmi', 'donor id']]

In [101]:
# Rename donor id to donor_id
df = df.rename(columns={'donor id': 'donor_id'})

In [102]:
# Get all the directories with fragment files in them
processed_dir = "/cellar/users/aklie/data/datasets/Chiou2021_islet_snATAC-seq/processed/chromap"
input_frag_paths = glob.glob(os.path.join(processed_dir, "*", "aln.bed"))
len(input_frag_paths)

5

In [107]:
# Replace ":" with "_" in donor_id
df['donor_id'] = df['donor_id'].str.replace(":", "_")
df['donor_id'].iloc[-2] = df['donor_id'].iloc[-1] + "_CB"
df['donor_id'].iloc[-1] = df['donor_id'].iloc[-1] + "_10x"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [111]:
df.columns

Index(['tissue', 'sex', 'age', 'bmi', 'donor_id', 'input_frag_path'], dtype='object')

In [109]:
# Get dict of sample_id to path
sample_id_to_path = {}
for input_frag_path in input_frag_paths:
    sample_id = input_frag_path.split("/")[-2]
    sample_id_to_path[sample_id] = input_frag_path
sample_id_to_path

{'UNOS_AFEP022': '/cellar/users/aklie/data/datasets/Chiou2021_islet_snATAC-seq/processed/chromap/UNOS_AFEP022/aln.bed',
 'UNOS_AFEA331': '/cellar/users/aklie/data/datasets/Chiou2021_islet_snATAC-seq/processed/chromap/UNOS_AFEA331/aln.bed',
 'nPOD_6004_CB': '/cellar/users/aklie/data/datasets/Chiou2021_islet_snATAC-seq/processed/chromap/nPOD_6004_CB/aln.bed',
 'UNOS_AFC2208': '/cellar/users/aklie/data/datasets/Chiou2021_islet_snATAC-seq/processed/chromap/UNOS_AFC2208/aln.bed',
 'nPOD_6004_10x': '/cellar/users/aklie/data/datasets/Chiou2021_islet_snATAC-seq/processed/chromap/nPOD_6004_10x/aln.bed'}

In [110]:
# Add column
df["input_frag_path"] = df["donor_id"].map(sample_id_to_path)
df["input_frag_path"].head()

0    /cellar/users/aklie/data/datasets/Chiou2021_is...
1    /cellar/users/aklie/data/datasets/Chiou2021_is...
2    /cellar/users/aklie/data/datasets/Chiou2021_is...
3    /cellar/users/aklie/data/datasets/Chiou2021_is...
4    /cellar/users/aklie/data/datasets/Chiou2021_is...
Name: input_frag_path, dtype: object

In [112]:
# Save the output
df[["input_frag_path", "donor_id", 'tissue', 'sex', 'age', 'bmi']].to_csv(os.path.join(outdir_path, "snapatac2_process.tsv"), sep="\t", index=False, header=False)

In [114]:
# Save the output
df[["donor_id", 'tissue', 'sex', 'age', 'bmi']].to_csv(os.path.join(outdir_path, "sample_metadata.tsv"), sep="\t", index=False)

# DONE!

---