# Prep for chromap and cellranger-atac scripts

In [111]:
# Imports
import os
import glob
import pickle
import pandas as pd

In [91]:
# Set paths
# All inputs live under one dataset directory; build each path from that shared root.
dataset_root = "/cellar/users/aklie/data/datasets/Zhu2023_sc-islet_snATAC-seq"
metadata_dir = os.path.join(dataset_root, "metadata", "23Oct23")

# Supplementary Excel table holding the per-sample metadata
supp_excel_table = os.path.join(metadata_dir, "1-s2.0-S1534580723001077-mmc2.xlsx")
# Tab-separated SRA run/experiment metadata for SRP374215
sra_metadata = os.path.join(dataset_root, "bin", "data_acquisition", "SRP374215_metadata.tsv")
# Tab-separated per-FASTQ statistics table
seqkit_stats = os.path.join(metadata_dir, "fastq_stats.tsv")

In [92]:
# Grab datasets dirs
# List everything in the FASTQ download directory, then keep every entry except
# the vdb-validate log that sits alongside the per-experiment directories.
fastq_dir_entries = glob.glob("/cellar/users/aklie/data/datasets/Zhu2023_sc-islet_snATAC-seq/fastq/04Jul23/*")
datasets = []
for entry in fastq_dir_entries:
    if "vdb_validate_all.out" in entry:
        continue
    datasets.append(entry)
len(datasets), datasets

(5,
 ['/cellar/users/aklie/data/datasets/Zhu2023_sc-islet_snATAC-seq/fastq/04Jul23/SRX15207300',
  '/cellar/users/aklie/data/datasets/Zhu2023_sc-islet_snATAC-seq/fastq/04Jul23/SRX15207297',
  '/cellar/users/aklie/data/datasets/Zhu2023_sc-islet_snATAC-seq/fastq/04Jul23/SRX15207301',
  '/cellar/users/aklie/data/datasets/Zhu2023_sc-islet_snATAC-seq/fastq/04Jul23/SRX15207298',
  '/cellar/users/aklie/data/datasets/Zhu2023_sc-islet_snATAC-seq/fastq/04Jul23/SRX15207299'])

In [93]:
# Load and clean metadata
# Read the first sheet of the supplementary table, skipping the 4 rows above the
# header, and keep only the first 5 data rows (presumably the five snATAC-seq
# samples -- confirm against the spreadsheet if it is ever updated).
# Built as one method chain instead of `inplace=True` calls on an `.iloc` slice,
# so the cell always produces a fresh frame and cannot trip pandas'
# chained-assignment warnings.
ilset_snatac_metadata = (
    pd.read_excel(supp_excel_table, sheet_name=0, skiprows=4)
    .iloc[:5, :]
    .drop(columns="Type of data")   # column is not needed downstream
    .dropna(axis=1, how="all")      # discard columns that are empty for these rows
)

In [94]:
# Add SRA accessions
sra_df = pd.read_csv(sra_metadata, sep="\t")

# Derive the sample id from the experiment title: take the field after the first
# ":", cut it at the first ";", strip surrounding whitespace, and keep the part
# before the first "_".
title_body = sra_df["experiment_title"].str.split(":", expand=True)[1]
sample_label = title_body.str.split(";", expand=True)[0].str.strip()
sra_df["sample_id"] = sample_label.str.split("_", expand=True)[0]

# Attach run/experiment accessions to each metadata row by sample id.
# Any copies of these columns left over from a previous execution of this cell are
# dropped first; otherwise a re-run would merge them again and produce `_x`/`_y`
# suffixed duplicates, breaking later lookups on "experiment_accession".
sra_columns = ["run_accession", "experiment_accession", "sample_id"]
ilset_snatac_metadata = (
    ilset_snatac_metadata
    .drop(columns=sra_columns, errors="ignore")
    .merge(sra_df[sra_columns], left_on="ID", right_on="sample_id", how="left")
)

In [106]:
# Build the experiment-accession -> sample-id lookup, useful for chromap and CellRanger
expacc_to_sample = dict(
    zip(
        ilset_snatac_metadata["experiment_accession"],
        ilset_snatac_metadata["sample_id"],
    )
)
expacc_to_sample

{'SRX15207297': 'MM129',
 'SRX15207298': 'MM157',
 'SRX15207299': 'MM166',
 'SRX15207300': 'MM168',
 'SRX15207301': 'MM290'}

In [130]:
# Rename for CellRanger
# Each dataset directory is named after its experiment accession and holds FASTQs
# named `<run>_<read>.fastq.gz`. They are renamed in place to
# `<sample_id>_S1_L001_R<read>_001.fastq.gz`, and the old -> new mapping is pickled
# next to the files so the rename can be traced back.
# The cell is safe to re-run: files that already carry the new name are left alone
# and an existing mapping pickle is not overwritten when nothing was renamed.
fastq_suffix = ".fastq.gz"
for dataset in datasets:
    fastq_files = sorted(glob.glob(os.path.join(dataset, "*" + fastq_suffix)))
    if not fastq_files:
        print(f"No FASTQ files found in {dataset}, skipping")
        continue

    # Look the sample up once per directory; a KeyError here means the directory
    # has no entry in the metadata-derived mapping.
    exp_acc = os.path.basename(os.path.normpath(dataset)).split("_")[0]
    sample_id = expacc_to_sample[exp_acc]
    renamed_prefix = f"{sample_id}_S1_L001_R"

    file_mapping = {}
    for fastq_file in fastq_files:
        fastq_name = os.path.basename(fastq_file)
        if fastq_name.startswith(renamed_prefix):
            # Already renamed by an earlier run of this cell.
            continue

        # Parse the read number from the file name only, so underscores in the
        # directory path cannot leak into it.
        sep, read_type = fastq_name[: -len(fastq_suffix)].rpartition("_")[1:]
        if not sep or not read_type.isdigit():
            print(f"Skipping {fastq_file}: name does not look like <run>_<read>.fastq.gz")
            continue

        new_file = os.path.join(dataset, f"{sample_id}_S1_L001_R{read_type}_001.fastq.gz")
        if os.path.exists(new_file):
            # Two sources mapping to one target (e.g. several runs in one
            # experiment) would silently destroy data, so stop instead.
            raise FileExistsError(f"Refusing to overwrite {new_file} with {fastq_file}")

        print(f"mv {fastq_file} {new_file}")
        # Same-directory rename; unlike a shell `mv` via os.system this raises on
        # failure and needs no quoting.
        os.rename(fastq_file, new_file)
        file_mapping[fastq_file] = new_file

    if file_mapping:
        with open(os.path.join(dataset, "file_mapping.pickle"), "wb") as f:
            pickle.dump(file_mapping, f)

mv /cellar/users/aklie/data/datasets/Zhu2023_sc-islet_snATAC-seq/fastq/04Jul23/SRX15207300/SRR19140211_2.fastq.gz /cellar/users/aklie/data/datasets/Zhu2023_sc-islet_snATAC-seq/fastq/04Jul23/SRX15207300/MM168_S1_L001_R2_001.fastq.gz
mv /cellar/users/aklie/data/datasets/Zhu2023_sc-islet_snATAC-seq/fastq/04Jul23/SRX15207300/SRR19140211_1.fastq.gz /cellar/users/aklie/data/datasets/Zhu2023_sc-islet_snATAC-seq/fastq/04Jul23/SRX15207300/MM168_S1_L001_R1_001.fastq.gz
mv /cellar/users/aklie/data/datasets/Zhu2023_sc-islet_snATAC-seq/fastq/04Jul23/SRX15207300/SRR19140211_3.fastq.gz /cellar/users/aklie/data/datasets/Zhu2023_sc-islet_snATAC-seq/fastq/04Jul23/SRX15207300/MM168_S1_L001_R3_001.fastq.gz
mv /cellar/users/aklie/data/datasets/Zhu2023_sc-islet_snATAC-seq/fastq/04Jul23/SRX15207297/SRR19140214_1.fastq.gz /cellar/users/aklie/data/datasets/Zhu2023_sc-islet_snATAC-seq/fastq/04Jul23/SRX15207297/MM129_S1_L001_R1_001.fastq.gz
mv /cellar/users/aklie/data/datasets/Zhu2023_sc-islet_snATAC-seq/fastq/0

In [129]:
# Check sizes
# Show the runs ordered from most to fewest spots.
size_columns = ["run_accession", "experiment_accession", "sample_id", "total_spots"]
sra_df.loc[:, size_columns].sort_values(by="total_spots", ascending=False)

Unnamed: 0,run_accession,experiment_accession,sample_id,total_spots
4,SRR19140214,SRX15207297,MM129,624062139
3,SRR19140213,SRX15207298,MM157,434496051
2,SRR19140212,SRX15207299,MM166,231878793
0,SRR19140210,SRX15207301,MM290,173358378
1,SRR19140211,SRX15207300,MM168,126271591


# Seqkit statistics

In [125]:
# Display the per-FASTQ statistics table (tab-separated; read_table defaults to sep="\t")
pd.read_table(seqkit_stats)

Unnamed: 0,file,format,type,num_seqs,sum_len,min_len,avg_len,max_len
0,/cellar/users/aklie/data/datasets/Zhu2023_sc-i...,FASTQ,DNA,624062139,63030276039,101,101.0,101
1,/cellar/users/aklie/data/datasets/Zhu2023_sc-i...,FASTQ,DNA,624062139,9984994224,16,16.0,16
2,/cellar/users/aklie/data/datasets/Zhu2023_sc-i...,FASTQ,DNA,624062139,63030276039,101,101.0,101
3,/cellar/users/aklie/data/datasets/Zhu2023_sc-i...,FASTQ,DNA,434496051,43884101151,101,101.0,101
4,/cellar/users/aklie/data/datasets/Zhu2023_sc-i...,FASTQ,DNA,434496051,6951936816,16,16.0,16
5,/cellar/users/aklie/data/datasets/Zhu2023_sc-i...,FASTQ,DNA,434496051,43884101151,101,101.0,101
6,/cellar/users/aklie/data/datasets/Zhu2023_sc-i...,FASTQ,DNA,231878793,23419758093,101,101.0,101
7,/cellar/users/aklie/data/datasets/Zhu2023_sc-i...,FASTQ,DNA,231878793,3710060688,16,16.0,16
8,/cellar/users/aklie/data/datasets/Zhu2023_sc-i...,FASTQ,DNA,231878793,23419758093,101,101.0,101
9,/cellar/users/aklie/data/datasets/Zhu2023_sc-i...,FASTQ,DNA,126271591,12753430691,101,101.0,101


# DONE!

---