# Prep scRNA-seq FASTQ files for kallisto bus and Cell Ranger scripts

In [1]:
# Imports
import os
import glob
import pickle
import pandas as pd

In [6]:
# Input file locations: supplementary Excel table, SRA run metadata, seqkit FASTQ stats
_datasets_root = "/cellar/users/aklie/data/datasets"
_scrna_metadata_dir = f"{_datasets_root}/Zhu2023_sc-islet_scRNA-seq/metadata/24Oct23"

# NOTE(review): the supplementary table lives under the snATAC-seq dataset tree,
# unlike the two scRNA-seq paths below -- confirm this is intended.
supp_excel_table = (
    f"{_datasets_root}/Zhu2023_sc-islet_snATAC-seq/metadata/23Oct23/"
    "1-s2.0-S1534580723001077-mmc2.xlsx"
)
sra_metadata = f"{_scrna_metadata_dir}/SRP374217_metadata.tsv"
seqkit_stats = f"{_scrna_metadata_dir}/fastq_stats.tsv"

In [7]:
# List every entry under the SRP374217 FASTQ directory, leaving out any path
# that contains "vdb_validate_all.out"; show the count alongside the paths.
_fastq_root = "/cellar/users/aklie/data/datasets/Zhu2023_sc-islet_scRNA-seq/fastq/24Oct23/SRP374217"
datasets = [
    entry
    for entry in glob.glob(_fastq_root + "/*")
    if "vdb_validate_all.out" not in entry
]
len(datasets), datasets

(5,
 ['/cellar/users/aklie/data/datasets/Zhu2023_sc-islet_scRNA-seq/fastq/24Oct23/SRP374217/SRX15207312',
  '/cellar/users/aklie/data/datasets/Zhu2023_sc-islet_scRNA-seq/fastq/24Oct23/SRP374217/SRX15207311',
  '/cellar/users/aklie/data/datasets/Zhu2023_sc-islet_scRNA-seq/fastq/24Oct23/SRP374217/SRX15207313',
  '/cellar/users/aklie/data/datasets/Zhu2023_sc-islet_scRNA-seq/fastq/24Oct23/SRP374217/SRX15207309',
  '/cellar/users/aklie/data/datasets/Zhu2023_sc-islet_scRNA-seq/fastq/24Oct23/SRP374217/SRX15207310'])

In [35]:
# Read the first sheet of the supplementary table (skipping its first 4 rows),
# keep the rows at positions 5-9, then drop the "Type of data" column and any
# column that is empty for every remaining row.
supp_table = pd.read_excel(supp_excel_table, sheet_name=0, skiprows=4)
islet_scrna_metadata = (
    supp_table.iloc[5:10, :]
    .drop(columns="Type of data")
    .dropna(axis=1, how="all")
)

In [36]:
# Add SRA accessions
sra_df = pd.read_csv(sra_metadata, sep="\t")

# Label each run by its accession rather than by row position, so the labels
# stay correct even if the rows of the metadata TSV come back in another order.
run_to_sample = {
    "SRR19140222": "H1-D39",
    "SRR19140223": "H1-D32",
    "SRR19140224": "H1-D21",
    "SRR19140225": "H1-D14",
    "SRR19140226": "H1-D11",
}
sra_df["sample_id"] = sra_df["run_accession"].map(run_to_sample)
unlabelled_runs = sra_df.loc[sra_df["sample_id"].isna(), "run_accession"].tolist()
if unlabelled_runs:
    raise ValueError(f"No sample id known for run accessions: {unlabelled_runs}")

# Drop accession columns left behind by a previous execution of this cell;
# otherwise a re-run merges them a second time and yields suffixed duplicates
# (run_accession_x, run_accession_y, ...) that break the cells below.
sra_cols = ["run_accession", "experiment_accession", "sample_id"]
islet_scrna_metadata = islet_scrna_metadata.drop(columns=sra_cols, errors="ignore").merge(
    sra_df[sra_cols], left_on="ID", right_on="sample_id", how="left"
)

In [37]:
# Lookup table from experiment accession to sample id, built from the merged
# metadata; the last line displays it.
expacc_to_sample = dict(
    zip(
        islet_scrna_metadata["experiment_accession"],
        islet_scrna_metadata["sample_id"],
    )
)
expacc_to_sample

{'SRX15207309': 'H1-D11',
 'SRX15207310': 'H1-D14',
 'SRX15207311': 'H1-D21',
 'SRX15207312': 'H1-D32',
 'SRX15207313': 'H1-D39'}

In [45]:
# Display the sample ids held in the accession -> sample mapping
expacc_to_sample.values()

dict_values(['H1-D11', 'H1-D14', 'H1-D21', 'H1-D32', 'H1-D39'])

In [39]:
# Rename for CellRanger
# Each "<run>_<read>.fastq.gz" becomes "<sample_id>_S1_L001_R<read>_001.fastq.gz"
# in the same directory, and the old -> new mapping is pickled next to the files.
for dataset in datasets:
    fastq_files = sorted(glob.glob(os.path.join(dataset, "*.fastq.gz")))
    if not fastq_files:
        # Nothing to rename here; skipping also keeps us from writing a mapping
        # file into whichever directory was processed previously.
        continue

    # The directory name is the experiment accession used as the lookup key.
    exp_acc = os.path.basename(dataset).split("_")[0]
    sample_id = expacc_to_sample[exp_acc]

    file_mapping = {}
    for fastq_file in fastq_files:
        file_name = os.path.basename(fastq_file)
        if file_name.startswith(f"{sample_id}_S1_L001_R"):
            # Already renamed by an earlier run of this cell; renaming it again
            # would produce a mangled "..._R001_001.fastq.gz" name.
            continue
        read_type = file_name.split("_")[-1].split(".")[0]
        new_file = f"{dataset}/{sample_id}_S1_L001_R{read_type}_001.fastq.gz"
        if os.path.exists(new_file):
            # os.rename silently replaces an existing target on POSIX; stop
            # instead of destroying a FASTQ (e.g. two runs in one experiment).
            raise FileExistsError(f"Refusing to overwrite existing file: {new_file}")
        print(f"mv {fastq_file} {new_file}")
        # Rename directly rather than through a shell: no quoting problems, and
        # a failure raises instead of being ignored.
        os.rename(fastq_file, new_file)
        file_mapping[fastq_file] = new_file

    if not file_mapping:
        # Nothing was renamed this time; leave any existing mapping file intact.
        continue

    mapping_path = os.path.join(dataset, "file_mapping.pickle")
    if os.path.exists(mapping_path):
        # Keep entries recorded by an earlier run. This pickle is written only
        # by this cell; do not point this at files from an untrusted source.
        with open(mapping_path, "rb") as f:
            file_mapping = {**pickle.load(f), **file_mapping}
    with open(mapping_path, "wb") as f:
        pickle.dump(file_mapping, f)

mv /cellar/users/aklie/data/datasets/Zhu2023_sc-islet_scRNA-seq/fastq/24Oct23/SRP374217/SRX15207312/SRR19140223_1.fastq.gz /cellar/users/aklie/data/datasets/Zhu2023_sc-islet_scRNA-seq/fastq/24Oct23/SRP374217/SRX15207312/H1-D32_S1_L001_R1_001.fastq.gz
mv /cellar/users/aklie/data/datasets/Zhu2023_sc-islet_scRNA-seq/fastq/24Oct23/SRP374217/SRX15207312/SRR19140223_2.fastq.gz /cellar/users/aklie/data/datasets/Zhu2023_sc-islet_scRNA-seq/fastq/24Oct23/SRP374217/SRX15207312/H1-D32_S1_L001_R2_001.fastq.gz
mv /cellar/users/aklie/data/datasets/Zhu2023_sc-islet_scRNA-seq/fastq/24Oct23/SRP374217/SRX15207311/SRR19140224_2.fastq.gz /cellar/users/aklie/data/datasets/Zhu2023_sc-islet_scRNA-seq/fastq/24Oct23/SRP374217/SRX15207311/H1-D21_S1_L001_R2_001.fastq.gz
mv /cellar/users/aklie/data/datasets/Zhu2023_sc-islet_scRNA-seq/fastq/24Oct23/SRP374217/SRX15207311/SRR19140224_1.fastq.gz /cellar/users/aklie/data/datasets/Zhu2023_sc-islet_scRNA-seq/fastq/24Oct23/SRP374217/SRX15207311/H1-D21_S1_L001_R1_001.fastq

In [40]:
# Show the accession/sample columns with total_spots, largest first
size_columns = ["run_accession", "experiment_accession", "sample_id", "total_spots"]
sra_df.loc[:, size_columns].sort_values(by="total_spots", ascending=False)

Unnamed: 0,run_accession,experiment_accession,sample_id,total_spots
4,SRR19140226,SRX15207309,H1-D11,700803322
1,SRR19140223,SRX15207312,H1-D32,573654434
0,SRR19140222,SRX15207313,H1-D39,515273491
2,SRR19140224,SRX15207311,H1-D21,429276850
3,SRR19140225,SRX15207310,H1-D14,325250810


# Seqkit statistics

In [46]:
# Load the tab-separated seqkit stats table and display it
seqkit_stats_df = pd.read_csv(seqkit_stats, sep="\t")
seqkit_stats_df

Unnamed: 0,file,format,type,num_seqs,sum_len,min_len,avg_len,max_len
0,/cellar/users/aklie/data/datasets/Zhu2023_sc-i...,FASTQ,DNA,700803322,70781135522,101,101.0,101
1,/cellar/users/aklie/data/datasets/Zhu2023_sc-i...,FASTQ,DNA,700803322,70781135522,101,101.0,101
2,/cellar/users/aklie/data/datasets/Zhu2023_sc-i...,FASTQ,DNA,325250810,32850331810,101,101.0,101
3,/cellar/users/aklie/data/datasets/Zhu2023_sc-i...,FASTQ,DNA,325250810,32850331810,101,101.0,101
4,/cellar/users/aklie/data/datasets/Zhu2023_sc-i...,FASTQ,DNA,429276850,43356961850,101,101.0,101
5,/cellar/users/aklie/data/datasets/Zhu2023_sc-i...,FASTQ,DNA,429276850,43356961850,101,101.0,101
6,/cellar/users/aklie/data/datasets/Zhu2023_sc-i...,FASTQ,DNA,573654434,57939097834,101,101.0,101
7,/cellar/users/aklie/data/datasets/Zhu2023_sc-i...,FASTQ,DNA,573654434,57939097834,101,101.0,101
8,/cellar/users/aklie/data/datasets/Zhu2023_sc-i...,FASTQ,DNA,515273491,52042622591,101,101.0,101
9,/cellar/users/aklie/data/datasets/Zhu2023_sc-i...,FASTQ,DNA,515273491,52042622591,101,101.0,101


# DONE!

---