First, symlink the FASTQ files

In [8]:
%%writefile /work/rr151/TF_perturbSeq50genes_BL/generate_seqspec10x5pv2_gex.py
#!/usr/bin/env python3
import csv
import yaml
import gzip
import os
import argparse

# Custom representer so PyYAML emits seqspec tags such as !Assay
def tag_representer(dumper, data):
    """Represent a ``{"!Tag": {...}}`` wrapper dict as a tagged YAML mapping.

    Args:
        dumper: Object providing ``represent_mapping(tag, mapping)``.
        data: The dict being serialised.

    Returns:
        Whatever ``dumper.represent_mapping`` returns.

    A dict with exactly one string key that starts with "!" is treated as a
    tag wrapper: the key becomes the tag and its value the mapping body.
    Every other dict (empty, multi-key, or without a "!" key) is emitted as
    an ordinary mapping, so it neither raises IndexError nor loses keys.
    """
    if len(data) == 1:
        (tag, value), = data.items()
        if isinstance(tag, str) and tag.startswith("!"):
            return dumper.represent_mapping(tag, value)
    # Plain dict: use the standard YAML mapping tag.
    return dumper.represent_mapping("tag:yaml.org,2002:map", data)
# Route every dict through tag_representer; no Dumper argument is given, so
# PyYAML's default for add_representer applies.
yaml.add_representer(dict, tag_representer)

def _region(region_id, region_type, name, sequence_type, sequence,
            min_len, max_len, parent_id, onlist=None):
    """Return one leaf ``!Region`` wrapper; keys are inserted in dump order."""
    return {"!Region": {
        "region_id": region_id,
        "region_type": region_type,
        "name": name,
        "sequence_type": sequence_type,
        "sequence": sequence,
        "min_len": min_len,
        "max_len": max_len,
        "onlist": onlist,
        "regions": None,
        "parent_id": parent_id,
    }}


def build_seqspec(sample_name, reads):
    """
    Build the seqspec structure (10x 5' v2, RNA modality) for one sample.

    Args:
        sample_name: Sample identifier. Not used while building the spec;
            kept so existing callers keep working.
        reads: Iterable of CSV row dicts, one per FASTQ file. Each row must
            provide "illumina_read_type", "accession", "read_names",
            "file_size", "file_format", "md5sum", "index5", "index7" and
            "sequencing_kit".

    Returns:
        Nested dict of single-key ``{"!Tag": {...}}`` wrappers rooted at
        "!Assay".

    Raises:
        IndexError: If ``reads`` is empty.
        KeyError: If a row lacks a required column.
        ValueError: If "file_size" is not an integer string.
    """
    # sorted() works on a copy, so the caller's list keeps its order.
    ordered = sorted(reads, key=lambda r: r['illumina_read_type'])  # R1 then R2

    read_entries = []
    for r in ordered:
        # Every read type other than "R1" receives the R2 settings.
        is_r1 = r["illumina_read_type"] == "R1"
        read_len = 26 if is_r1 else 90
        accession = r["accession"]
        download_url = f"https://api.data.igvf.org/sequence-files/{accession}/@@download/{accession}.fastq.gz"
        read_entries.append({
            "!Read": {
                "read_id": accession,
                "name": r["read_names"].strip("[]\""),
                "modality": "rna",
                "primer_id": "truseq_read1" if is_r1 else "truseq_read2",
                "min_len": read_len,
                "max_len": read_len,
                "strand": "pos" if is_r1 else "neg",
                "files": [
                    {"!File": {
                        "file_id": accession,
                        "filename": download_url,
                        "filesize": int(r["file_size"]),
                        "filetype": r["file_format"],
                        "url": download_url,
                        "urltype": "https",
                        "md5": r["md5sum"],
                    }}
                ],
            }
        })

    first = ordered[0]  # sample-level fields are taken from the first sorted row
    parent = "rna"

    # Building blocks shared by the sub-regions and the joined sequence.
    p5 = "AATGATACGGCGACCACCGAGATCTACAC"
    p7 = "ATCTCGTATGCCGTCTTCTGCTTG"
    read1 = "ACACTCTTTCCCTACACGACGCTCTTCCGATCT"
    read2 = "AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC"
    tso = "TTTCTTATATGGG"
    cell_bc = "N" * 16
    umi = "X" * 10
    cdna = "X" * 90
    index_placeholder = "N" * 10

    # Assembled from the constants above so it cannot drift from the
    # sub-regions; the previous hand-typed literal had lost a "C" in the
    # TruSeq Read 1 segment ("ACATCTTTCCC" instead of "ACACTCTTTCCC").
    joined = (p5 + index_placeholder + read1 + cell_bc + umi + tso + cdna
              + read2 + index_placeholder + p7)

    barcode_onlist = {"!Onlist": {
        "file_id": "IGVFFI9487JPEN",
        "filename": "https://api.data.igvf.org/tabular-files/IGVFFI9487JPEN/@@download/IGVFFI9487JPEN.tsv.gz",
        "filetype": "tsv",
        "filesize": 2200000,
        "url": "https://api.data.igvf.org/tabular-files/IGVFFI9487JPEN/@@download/IGVFFI9487JPEN.tsv.gz",
        "urltype": "https",
        "md5": "f62a276e262fdd85262a889d0f48556b",
    }}

    regions = [
        _region("truseq_read1", "truseq_read1", "truseq_read1", "fixed", read1, 33, 33, parent),
        _region("cell_bc", "barcode", "cell_bc", "onlist", cell_bc, 16, 16, parent, onlist=barcode_onlist),
        _region("umi", "umi", "umi", "random", umi, 10, 10, parent),
        _region("tso", "linker", "Template switch oligo", "fixed", tso, 13, 13, parent),
        _region("cdna", "cdna", "cdna", "random", cdna, 1, 90, parent),
        _region("truseq_read2", "truseq_read2", "truseq_read2", "fixed", read2, 34, 34, parent),
        _region("illumina_p5", "illumina_p5", "Illumina P5", "fixed", p5, 29, 29, parent),
        _region("illumina_p7", "illumina_p7", "Illumina P7", "fixed", p7, 24, 24, parent),
        _region("index5", "index5", "index5", "fixed", first['index5'], 10, 10, parent),
        _region("index7", "index7", "index7", "fixed", first['index7'], 10, 10, parent),
    ]

    return {
        "!Assay": {
            "seqspec_version": "0.3.0",
            "assay_id": "10XCRISPR5prime",
            "name": "10x CRISPR and RNA assay 5'",
            "doi": "https://doi.org/",
            "date": "25 July 2025",
            "description": "10x single-cell RNA-seq and CRISPR 5'",
            "modalities": ["rna"],
            "lib_struct": "https://teichlab.github.io/scg_lib_structs/methods_html/10xChromium5vdjfb.html",
            "sequence_protocol": "Illumina NextSeq 2000 (EFO:0010963), Illumina NovaSeq X Plus (EFO:0022841)",
            "sequence_kit": first["sequencing_kit"],
            "library_protocol": [
                {"!LibProtocol": {
                    "protocol_id": "Chromium Next GEM Single Cell 5' HT Reagent Kits v2 (Dual Index) (10x Genomics)",
                    "name": "5' Gene Expression (GEX) Library Construction",
                    "modality": "rna",
                }}
            ],
            "library_kit": "10XCRISPR5prime",
            "sequence_spec": read_entries,
            "library_spec": [
                {"!Region": {
                    "parent_id": parent,
                    "region_id": "rna",
                    "region_type": "rna",
                    "name": "rna",
                    "sequence_type": "joined",
                    "sequence": joined,
                    "min_len": 135,
                    "max_len": 269,
                    "onlist": None,
                    "regions": regions,
                }}
            ],
        }
    }
def generate_seqspecs(input_csv, output_dir):
    """
    Write one gzipped seqspec YAML per sample found in ``input_csv``.

    Rows are grouped by the basename of their "submitted_file_name" column,
    cut at the read tag ("_R" + one digit, followed by "_", "." or the end
    of the name). Cutting at the first bare "_R" would merge unrelated
    samples whose names contain e.g. "_Rep1"; that older rule is kept only
    as a fallback for names without a recognisable read tag.

    Args:
        input_csv: Path to the CSV manifest (one row per FASTQ file).
        output_dir: Directory receiving ``seqspec_<sample>.yaml.gz`` files;
            created if missing.
    """
    import re  # local import: the module header does not import re

    read_tag = re.compile(r'_R\d(?=[_.]|$)')
    os.makedirs(output_dir, exist_ok=True)
    samples = {}
    with open(input_csv, newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            filename = os.path.basename(row["submitted_file_name"])
            hit = read_tag.search(filename)
            prefix = filename[:hit.start()] if hit else filename.split('_R')[0]
            samples.setdefault(prefix, []).append(row)
    for sample, reads in samples.items():
        seqspec = build_seqspec(sample, reads)
        output_file = os.path.join(output_dir, f"seqspec_{sample}.yaml.gz")
        with gzip.open(output_file, 'wt', encoding='utf-8') as gzfile:
            yaml.dump(seqspec, gzfile, sort_keys=False)

        print(f"Generated: {output_file}")
    print(f"\nDone! Seqspecs saved in: {output_dir}")
def main():
    """Read -i/--input and -o/--output from the command line and run the generator."""
    cli = argparse.ArgumentParser(
        description="Generate seqspec YAMLs (gzipped) from CSV",
    )
    cli.add_argument(
        "-i", "--input",
        required=True,
        help="Input CSV file",
    )
    cli.add_argument(
        "-o", "--output",
        default="seqspec_files",
        help="Output directory",
    )
    opts = cli.parse_args()
    generate_seqspecs(opts.input, opts.output)


# Only run the CLI when executed as a script.
if __name__ == "__main__":
    main()

Overwriting /work/rr151/TF_perturbSeq50genes_BL/generate_seqspec10x5pv2_gex.py


In [1]:
%%writefile /work/rr151/TF_perturbSeq50genes_BL/generate_seqspec10x5pv2-sgRNA.py
#!/usr/bin/env python3
import csv
import yaml
import gzip
import os
import argparse

# Custom representer so PyYAML emits seqspec tags such as !Assay
def tag_representer(dumper, data):
    """Represent a ``{"!Tag": {...}}`` wrapper dict as a tagged YAML mapping.

    Args:
        dumper: Object providing ``represent_mapping(tag, mapping)``.
        data: The dict being serialised.

    Returns:
        Whatever ``dumper.represent_mapping`` returns.

    A dict with exactly one string key that starts with "!" is treated as a
    tag wrapper: the key becomes the tag and its value the mapping body.
    Every other dict (empty, multi-key, or without a "!" key) is emitted as
    an ordinary mapping, so it neither raises IndexError nor loses keys.
    """
    if len(data) == 1:
        (tag, value), = data.items()
        if isinstance(tag, str) and tag.startswith("!"):
            return dumper.represent_mapping(tag, value)
    # Plain dict: use the standard YAML mapping tag.
    return dumper.represent_mapping("tag:yaml.org,2002:map", data)
# Route every dict through tag_representer; no Dumper argument is given, so
# PyYAML's default for add_representer applies.
yaml.add_representer(dict, tag_representer)

def _region(region_id, region_type, name, sequence_type, sequence,
            min_len, max_len, parent_id, onlist=None):
    """Return one leaf ``!Region`` wrapper; keys are inserted in dump order."""
    return {"!Region": {
        "region_id": region_id,
        "region_type": region_type,
        "name": name,
        "sequence_type": sequence_type,
        "sequence": sequence,
        "min_len": min_len,
        "max_len": max_len,
        "onlist": onlist,
        "regions": None,
        "parent_id": parent_id,
    }}


def build_seqspec(sample_name, reads):
    """
    Build the seqspec structure (10x 5' v2, CRISPR modality) for one sample.

    Args:
        sample_name: Sample identifier. Not used while building the spec;
            kept so existing callers keep working.
        reads: Iterable of CSV row dicts, one per FASTQ file. Each row must
            provide "illumina_read_type", "accession", "read_names",
            "file_size", "file_format", "md5sum", "index5", "index7" and
            "sequencing_kit".

    Returns:
        Nested dict of single-key ``{"!Tag": {...}}`` wrappers rooted at
        "!Assay".

    Raises:
        IndexError: If ``reads`` is empty.
        KeyError: If a row lacks a required column.
        ValueError: If "file_size" is not an integer string.
    """
    # sorted() works on a copy, so the caller's list keeps its order.
    ordered = sorted(reads, key=lambda r: r['illumina_read_type'])  # R1 then R2

    read_entries = []
    for r in ordered:
        # Every read type other than "R1" receives the R2 settings.
        is_r1 = r["illumina_read_type"] == "R1"
        read_len = 26 if is_r1 else 90
        accession = r["accession"]
        download_url = f"https://api.data.igvf.org/sequence-files/{accession}/@@download/{accession}.fastq.gz"
        read_entries.append({
            "!Read": {
                "read_id": accession,
                "name": r["read_names"].strip("[]\""),
                "modality": "crispr",
                "primer_id": "truseq_read1" if is_r1 else "truseq_read2",
                "min_len": read_len,
                "max_len": read_len,
                "strand": "pos" if is_r1 else "neg",
                "files": [
                    {"!File": {
                        "file_id": accession,
                        "filename": download_url,
                        "filesize": int(r["file_size"]),
                        "filetype": r["file_format"],
                        "url": download_url,
                        "urltype": "https",
                        "md5": r["md5sum"],
                    }}
                ],
            }
        })

    first = ordered[0]  # sample-level fields are taken from the first sorted row
    parent = "crispr"

    barcode_onlist = {"!Onlist": {
        "file_id": "IGVFFI9487JPEN",
        "filename": "https://api.data.igvf.org/tabular-files/IGVFFI9487JPEN/@@download/IGVFFI9487JPEN.tsv.gz",
        "filetype": "tsv",
        "filesize": 2200000,
        "url": "https://api.data.igvf.org/tabular-files/IGVFFI9487JPEN/@@download/IGVFFI9487JPEN.tsv.gz",
        "urltype": "https",
        "md5": "f62a276e262fdd85262a889d0f48556b",
    }}
    # NOTE(review): this filesize equals the cell-barcode onlist's value, and
    # the v3 sgRNA generator lists 2800 for the same file_id/md5 — confirm
    # which number is right.
    guide_onlist = {"!Onlist": {
        "file_id": "IGVFFI6638COVL",
        "filename": "https://api.data.igvf.org/reference-files/IGVFFI6638COVL/@@download/IGVFFI6638COVL.tsv.gz",
        "filetype": "tsv",
        "filesize": 2200000,
        "url": "https://api.data.igvf.org/reference-files/IGVFFI6638COVL/@@download/IGVFFI6638COVL.tsv.gz",
        "urltype": "https",
        "md5": "7e6d956b7bf03c4cc6d16ae5fed9661b",
    }}

    regions = [
        _region("truseq_read1", "truseq_read1", "truseq_read1", "fixed", "ACACTCTTTCCCTACACGACGCTCTTCCGATCT", 33, 33, parent),
        _region("cell_bc", "barcode", "cell_bc", "onlist", "NNNNNNNNNNNNNNNN", 16, 16, parent, onlist=barcode_onlist),
        _region("umi", "umi", "umi", "random", "XXXXXXXXXX", 10, 10, parent),
        _region("tso", "linker", "Template switch oligo", "fixed", "TTTCTTATATGGG", 13, 13, parent),
        _region("sgrna_target", "sgrna_target", "sgrna_target", "onlist", "NNNNNNNNNNNNNNNNNNNN", 20, 20, parent, onlist=guide_onlist),
        _region("sgrna_scaffold", "linker", "sgrna_scaffold", "fixed", "GTTTAAGAGCTAAGCTGGAAACAGCATAGCAAGTTTA", 37, 37, parent),
        _region("feature_primer", "custom_primer", "feature_primer", "fixed", "AATAAGGCTAGTCCGTTATCAACTTG", 26, 26, parent),
        _region("truseq_read2", "truseq_read2", "truseq_read2", "fixed", "AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC", 34, 34, parent),
        _region("illumina_p5", "illumina_p5", "Illumina P5", "fixed", "AATGATACGGCGACCACCGAGATCTACAC", 29, 29, parent),
        _region("illumina_p7", "illumina_p7", "Illumina P7", "fixed", "ATCTCGTATGCCGTCTTCTGCTTG", 24, 24, parent),
        _region("index5", "index5", "index5", "fixed", first['index5'], 10, 10, parent),
        _region("index7", "index7", "index7", "fixed", first['index7'], 10, 10, parent),
    ]

    return {
        "!Assay": {
            "seqspec_version": "0.3.0",
            "assay_id": "10XCRISPR5prime",
            "name": "10x CRISPR and RNA assay 5'",
            "doi": "https://doi.org/",
            "date": "25 July 2025",
            "description": "10x single-cell RNA-seq and CRISPR 5'",
            "modalities": ["crispr"],
            "lib_struct": "https://teichlab.github.io/scg_lib_structs/methods_html/10xChromium5vdjfb.html",
            "sequence_protocol": "Illumina NextSeq 2000 (EFO:0010963), Illumina NovaSeq X Plus (EFO:0022841)",
            "sequence_kit": first["sequencing_kit"],
            "library_protocol": [
                {"!LibProtocol": {
                    "protocol_id": "Chromium Next GEM Single Cell 5' HT Reagent Kits v2 (Dual Index) (10x Genomics)",
                    "name": "5' Gene Expression (GEX) Library Construction",
                    "modality": "crispr",
                }}
            ],
            "library_kit": "10XCRISPR5prime",
            "sequence_spec": read_entries,
            "library_spec": [
                {"!Region": {
                    "parent_id": parent,
                    "region_id": "crispr",
                    "region_type": "crispr",
                    "name": "crispr",
                    "sequence_type": "joined",
                    "sequence": "AATGATACGGCGACCACCGAGATCTACACNNNNNNNNNNACACTCTTTCCCTACACGACGCTCTTCCGATCTNNNNNNNNNNNNNNNNXXXXXXXXXXTTTCTTATATGGGGNNNNNNNNNNNNNNNNNNNNGTTTAAGAGCTAAGCTGGAAACAGCATAGCAAGTTTAAATAAGGCTAGTCCGTTATCAACTTGAGATCGGAAGAGCACACGTCTGAACTCCAGTCACNNNNNNNNNNATCTCGTATGCCGTCTTCTGCTTG",
                    "min_len": 135,
                    "max_len": 269,
                    "onlist": None,
                    "regions": regions,
                }}
            ],
        }
    }
def generate_seqspecs(input_csv, output_dir):
    """
    Write one gzipped seqspec YAML per sample found in ``input_csv``.

    Rows are grouped by the basename of their "submitted_file_name" column,
    cut at the read tag ("_R" + one digit, followed by "_", "." or the end
    of the name). Cutting at the first bare "_R" would merge unrelated
    samples whose names contain e.g. "_Rep1"; that older rule is kept only
    as a fallback for names without a recognisable read tag.

    Args:
        input_csv: Path to the CSV manifest (one row per FASTQ file).
        output_dir: Directory receiving ``seqspec_<sample>.yaml.gz`` files;
            created if missing.
    """
    import re  # local import: the module header does not import re

    read_tag = re.compile(r'_R\d(?=[_.]|$)')
    os.makedirs(output_dir, exist_ok=True)
    samples = {}
    with open(input_csv, newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            filename = os.path.basename(row["submitted_file_name"])
            hit = read_tag.search(filename)
            prefix = filename[:hit.start()] if hit else filename.split('_R')[0]
            samples.setdefault(prefix, []).append(row)
    for sample, reads in samples.items():
        seqspec = build_seqspec(sample, reads)
        output_file = os.path.join(output_dir, f"seqspec_{sample}.yaml.gz")
        with gzip.open(output_file, 'wt', encoding='utf-8') as gzfile:
            yaml.dump(seqspec, gzfile, sort_keys=False)

        print(f"Generated: {output_file}")
    print(f"\nDone! Seqspecs saved in: {output_dir}")
def main():
    """Read -i/--input and -o/--output from the command line and run the generator."""
    cli = argparse.ArgumentParser(
        description="Generate seqspec YAMLs (gzipped) from CSV",
    )
    cli.add_argument(
        "-i", "--input",
        required=True,
        help="Input CSV file",
    )
    cli.add_argument(
        "-o", "--output",
        default="seqspec_files",
        help="Output directory",
    )
    opts = cli.parse_args()
    generate_seqspecs(opts.input, opts.output)


# Only run the CLI when executed as a script.
if __name__ == "__main__":
    main()

Overwriting /work/rr151/TF_perturbSeq50genes_BL/generate_seqspec10x5pv2-sgRNA.py


In [1]:
%%writefile /work/rr151/TF_perturbSeq50genes_BL/generate_seqspec10x5pv3_gex.py
#!/usr/bin/env python3
import csv
import yaml
import gzip
import os
import argparse

# Custom representer so PyYAML emits seqspec tags such as !Assay
def tag_representer(dumper, data):
    """Represent a ``{"!Tag": {...}}`` wrapper dict as a tagged YAML mapping.

    Args:
        dumper: Object providing ``represent_mapping(tag, mapping)``.
        data: The dict being serialised.

    Returns:
        Whatever ``dumper.represent_mapping`` returns.

    A dict with exactly one string key that starts with "!" is treated as a
    tag wrapper: the key becomes the tag and its value the mapping body.
    Every other dict (empty, multi-key, or without a "!" key) is emitted as
    an ordinary mapping, so it neither raises IndexError nor loses keys.
    """
    if len(data) == 1:
        (tag, value), = data.items()
        if isinstance(tag, str) and tag.startswith("!"):
            return dumper.represent_mapping(tag, value)
    # Plain dict: use the standard YAML mapping tag.
    return dumper.represent_mapping("tag:yaml.org,2002:map", data)
# Route every dict through tag_representer; no Dumper argument is given, so
# PyYAML's default for add_representer applies.
yaml.add_representer(dict, tag_representer)

def _region(region_id, region_type, name, sequence_type, sequence,
            min_len, max_len, parent_id, onlist=None):
    """Return one leaf ``!Region`` wrapper; keys are inserted in dump order."""
    return {"!Region": {
        "region_id": region_id,
        "region_type": region_type,
        "name": name,
        "sequence_type": sequence_type,
        "sequence": sequence,
        "min_len": min_len,
        "max_len": max_len,
        "onlist": onlist,
        "regions": None,
        "parent_id": parent_id,
    }}


def build_seqspec(sample_name, reads):
    """
    Build the seqspec structure (10x 5' v3, RNA modality) for one sample.

    Args:
        sample_name: Sample identifier. Not used while building the spec;
            kept so existing callers keep working.
        reads: Iterable of CSV row dicts, one per FASTQ file. Each row must
            provide "illumina_read_type", "accession", "read_names",
            "file_size", "file_format", "md5sum", "index5", "index7" and
            "sequencing_kit".

    Returns:
        Nested dict of single-key ``{"!Tag": {...}}`` wrappers rooted at
        "!Assay".

    Raises:
        IndexError: If ``reads`` is empty.
        KeyError: If a row lacks a required column.
        ValueError: If "file_size" is not an integer string.
    """
    # sorted() works on a copy, so the caller's list keeps its order.
    ordered = sorted(reads, key=lambda r: r['illumina_read_type'])  # R1 then R2

    read_entries = []
    for r in ordered:
        # Every read type other than "R1" receives the R2 settings.
        is_r1 = r["illumina_read_type"] == "R1"
        read_len = 26 if is_r1 else 90
        accession = r["accession"]
        download_url = f"https://api.data.igvf.org/sequence-files/{accession}/@@download/{accession}.fastq.gz"
        read_entries.append({
            "!Read": {
                "read_id": accession,
                "name": r["read_names"].strip("[]\""),
                "modality": "rna",
                "primer_id": "truseq_read1" if is_r1 else "truseq_read2",
                "min_len": read_len,
                "max_len": read_len,
                "strand": "pos" if is_r1 else "neg",
                "files": [
                    {"!File": {
                        "file_id": accession,
                        "filename": download_url,
                        "filesize": int(r["file_size"]),
                        "filetype": r["file_format"],
                        "url": download_url,
                        "urltype": "https",
                        "md5": r["md5sum"],
                    }}
                ],
            }
        })

    first = ordered[0]  # sample-level fields are taken from the first sorted row
    parent = "rna"

    # Building blocks shared by the sub-regions and the joined sequence.
    p5 = "AATGATACGGCGACCACCGAGATCTACAC"
    p7 = "ATCTCGTATGCCGTCTTCTGCTTG"
    read1 = "ACACTCTTTCCCTACACGACGCTCTTCCGATCT"
    read2 = "AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC"
    tso = "TTTCTTATATGGG"
    cell_bc = "N" * 16
    umi = "X" * 12
    cdna = "X" * 90
    index_placeholder = "N" * 10

    # Assembled from the constants above so it cannot drift from the
    # sub-regions; the previous hand-typed literal had lost a "C" in the
    # TruSeq Read 1 segment ("ACATCTTTCCC" instead of "ACACTCTTTCCC").
    joined = (p5 + index_placeholder + read1 + cell_bc + umi + tso + cdna
              + read2 + index_placeholder + p7)

    barcode_onlist = {"!Onlist": {
        "file_id": "IGVFFI1697XAXI",
        "filename": "https://api.data.igvf.org/tabular-files/IGVFFI1697XAXI/@@download/IGVFFI1697XAXI.tsv.gz",
        "filetype": "tsv",
        "filesize": 6900000,
        "url": "https://api.data.igvf.org/tabular-files/IGVFFI1697XAXI/@@download/IGVFFI1697XAXI.tsv.gz",
        "urltype": "https",
        "md5": "5033d08c9b8d5080d2e5f65a3896b53b",
    }}

    regions = [
        _region("truseq_read1", "truseq_read1", "truseq_read1", "fixed", read1, 33, 33, parent),
        _region("cell_bc", "barcode", "cell_bc", "onlist", cell_bc, 16, 16, parent, onlist=barcode_onlist),
        _region("umi", "umi", "umi", "random", umi, 12, 12, parent),
        _region("tso", "linker", "Template switch oligo", "fixed", tso, 13, 13, parent),
        _region("cdna", "cdna", "cdna", "random", cdna, 1, 90, parent),
        _region("truseq_read2", "truseq_read2", "truseq_read2", "fixed", read2, 34, 34, parent),
        _region("illumina_p5", "illumina_p5", "Illumina P5", "fixed", p5, 29, 29, parent),
        _region("illumina_p7", "illumina_p7", "Illumina P7", "fixed", p7, 24, 24, parent),
        _region("index5", "index5", "index5", "fixed", first['index5'], 10, 10, parent),
        _region("index7", "index7", "index7", "fixed", first['index7'], 10, 10, parent),
    ]

    return {
        "!Assay": {
            "seqspec_version": "0.3.0",
            "assay_id": "10XCRISPR5prime",
            "name": "10x CRISPR and RNA assay 5'",
            "doi": "https://doi.org/",
            "date": "25 July 2025",
            "description": "10x single-cell RNA-seq and CRISPR 5'",
            "modalities": ["rna"],
            "lib_struct": "https://teichlab.github.io/scg_lib_structs/methods_html/10xChromium5vdjfb.html",
            "sequence_protocol": "Illumina NextSeq 2000 (EFO:0010963), Illumina NovaSeq X Plus (EFO:0022841)",
            "sequence_kit": first["sequencing_kit"],
            "library_protocol": [
                {"!LibProtocol": {
                    "protocol_id": "Chromium Next GEM Single Cell 5' HT Reagent Kits v2 (Dual Index) (10x Genomics)",
                    "name": "5' Gene Expression (GEX) Library Construction",
                    "modality": "rna",
                }}
            ],
            "library_kit": "10XCRISPR5prime",
            "sequence_spec": read_entries,
            "library_spec": [
                {"!Region": {
                    "parent_id": parent,
                    "region_id": "rna",
                    "region_type": "rna",
                    "name": "rna",
                    "sequence_type": "joined",
                    "sequence": joined,
                    "min_len": 135,
                    # NOTE(review): the joined sequence above is 271 bp
                    # long; confirm whether 275 is intended.
                    "max_len": 275,
                    "onlist": None,
                    "regions": regions,
                }}
            ],
        }
    }
def generate_seqspecs(input_csv, output_dir):
    """
    Write one gzipped seqspec YAML per sample found in ``input_csv``.

    Rows are grouped by the basename of their "submitted_file_name" column,
    cut at the read tag ("_R" + one digit, followed by "_", "." or the end
    of the name). Cutting at the first bare "_R" would merge unrelated
    samples whose names contain e.g. "_Rep1"; that older rule is kept only
    as a fallback for names without a recognisable read tag.

    Args:
        input_csv: Path to the CSV manifest (one row per FASTQ file).
        output_dir: Directory receiving ``seqspec_<sample>.yaml.gz`` files;
            created if missing.
    """
    import re  # local import: the module header does not import re

    read_tag = re.compile(r'_R\d(?=[_.]|$)')
    os.makedirs(output_dir, exist_ok=True)
    samples = {}
    with open(input_csv, newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            filename = os.path.basename(row["submitted_file_name"])
            hit = read_tag.search(filename)
            prefix = filename[:hit.start()] if hit else filename.split('_R')[0]
            samples.setdefault(prefix, []).append(row)
    for sample, reads in samples.items():
        seqspec = build_seqspec(sample, reads)
        output_file = os.path.join(output_dir, f"seqspec_{sample}.yaml.gz")
        with gzip.open(output_file, 'wt', encoding='utf-8') as gzfile:
            yaml.dump(seqspec, gzfile, sort_keys=False)

        print(f"Generated: {output_file}")
    print(f"\nDone! Seqspecs saved in: {output_dir}")
def main():
    """Read -i/--input and -o/--output from the command line and run the generator."""
    cli = argparse.ArgumentParser(
        description="Generate seqspec YAMLs (gzipped) from CSV",
    )
    cli.add_argument(
        "-i", "--input",
        required=True,
        help="Input CSV file",
    )
    cli.add_argument(
        "-o", "--output",
        default="seqspec_files",
        help="Output directory",
    )
    opts = cli.parse_args()
    generate_seqspecs(opts.input, opts.output)


# Only run the CLI when executed as a script.
if __name__ == "__main__":
    main()

Overwriting /work/rr151/TF_perturbSeq50genes_BL/generate_seqspec10x5pv3_gex.py


In [2]:
%%writefile /work/rr151/TF_perturbSeq50genes_BL/generate_seqspec10x5pv3-sgRNA.py
#!/usr/bin/env python3
import csv
import yaml
import gzip
import os
import argparse

# Custom representer so PyYAML emits seqspec tags such as !Assay
def tag_representer(dumper, data):
    """Represent a ``{"!Tag": {...}}`` wrapper dict as a tagged YAML mapping.

    Args:
        dumper: Object providing ``represent_mapping(tag, mapping)``.
        data: The dict being serialised.

    Returns:
        Whatever ``dumper.represent_mapping`` returns.

    A dict with exactly one string key that starts with "!" is treated as a
    tag wrapper: the key becomes the tag and its value the mapping body.
    Every other dict (empty, multi-key, or without a "!" key) is emitted as
    an ordinary mapping, so it neither raises IndexError nor loses keys.
    """
    if len(data) == 1:
        (tag, value), = data.items()
        if isinstance(tag, str) and tag.startswith("!"):
            return dumper.represent_mapping(tag, value)
    # Plain dict: use the standard YAML mapping tag.
    return dumper.represent_mapping("tag:yaml.org,2002:map", data)
# Route every dict through tag_representer; no Dumper argument is given, so
# PyYAML's default for add_representer applies.
yaml.add_representer(dict, tag_representer)

def _region(region_id, region_type, name, sequence_type, sequence,
            min_len, max_len, parent_id, onlist=None):
    """Return one leaf ``!Region`` wrapper; keys are inserted in dump order."""
    return {"!Region": {
        "region_id": region_id,
        "region_type": region_type,
        "name": name,
        "sequence_type": sequence_type,
        "sequence": sequence,
        "min_len": min_len,
        "max_len": max_len,
        "onlist": onlist,
        "regions": None,
        "parent_id": parent_id,
    }}


def build_seqspec(sample_name, reads):
    """
    Build the seqspec structure (10x 5' v3, CRISPR modality) for one sample.

    Args:
        sample_name: Sample identifier. Not used while building the spec;
            kept so existing callers keep working.
        reads: Iterable of CSV row dicts, one per FASTQ file. Each row must
            provide "illumina_read_type", "accession", "read_names",
            "file_size", "file_format", "md5sum", "index5", "index7" and
            "sequencing_kit".

    Returns:
        Nested dict of single-key ``{"!Tag": {...}}`` wrappers rooted at
        "!Assay".

    Raises:
        IndexError: If ``reads`` is empty.
        KeyError: If a row lacks a required column.
        ValueError: If "file_size" is not an integer string.
    """
    # sorted() works on a copy, so the caller's list keeps its order.
    ordered = sorted(reads, key=lambda r: r['illumina_read_type'])  # R1 then R2

    read_entries = []
    for r in ordered:
        # Every read type other than "R1" receives the R2 settings.
        is_r1 = r["illumina_read_type"] == "R1"
        read_len = 26 if is_r1 else 90
        accession = r["accession"]
        download_url = f"https://api.data.igvf.org/sequence-files/{accession}/@@download/{accession}.fastq.gz"
        read_entries.append({
            "!Read": {
                "read_id": accession,
                "name": r["read_names"].strip("[]\""),
                "modality": "crispr",
                "primer_id": "truseq_read1" if is_r1 else "truseq_read2",
                "min_len": read_len,
                "max_len": read_len,
                "strand": "pos" if is_r1 else "neg",
                "files": [
                    {"!File": {
                        "file_id": accession,
                        "filename": download_url,
                        "filesize": int(r["file_size"]),
                        "filetype": r["file_format"],
                        "url": download_url,
                        "urltype": "https",
                        "md5": r["md5sum"],
                    }}
                ],
            }
        })

    first = ordered[0]  # sample-level fields are taken from the first sorted row
    parent = "crispr"

    barcode_onlist = {"!Onlist": {
        "file_id": "IGVFFI1697XAXI",
        "filename": "https://api.data.igvf.org/tabular-files/IGVFFI1697XAXI/@@download/IGVFFI1697XAXI.tsv.gz",
        "filetype": "tsv",
        "filesize": 6900000,
        "url": "https://api.data.igvf.org/tabular-files/IGVFFI1697XAXI/@@download/IGVFFI1697XAXI.tsv.gz",
        "urltype": "https",
        "md5": "5033d08c9b8d5080d2e5f65a3896b53b",
    }}
    guide_onlist = {"!Onlist": {
        "file_id": "IGVFFI6638COVL",
        "filename": "https://api.data.igvf.org/reference-files/IGVFFI6638COVL/@@download/IGVFFI6638COVL.tsv.gz",
        "filetype": "tsv",
        "filesize": 2800,
        "url": "https://api.data.igvf.org/reference-files/IGVFFI6638COVL/@@download/IGVFFI6638COVL.tsv.gz",
        "urltype": "https",
        "md5": "7e6d956b7bf03c4cc6d16ae5fed9661b",
    }}

    regions = [
        _region("truseq_read1", "truseq_read1", "truseq_read1", "fixed", "ACACTCTTTCCCTACACGACGCTCTTCCGATCT", 33, 33, parent),
        _region("cell_bc", "barcode", "cell_bc", "onlist", "NNNNNNNNNNNNNNNN", 16, 16, parent, onlist=barcode_onlist),
        _region("umi", "umi", "umi", "random", "XXXXXXXXXXXX", 12, 12, parent),
        _region("tso", "linker", "Template switch oligo", "fixed", "TTTCTTATATGGG", 13, 13, parent),
        _region("sgrna_target", "sgrna_target", "sgrna_target", "onlist", "NNNNNNNNNNNNNNNNNNNN", 20, 20, parent, onlist=guide_onlist),
        _region("sgrna_scaffold", "linker", "sgrna_scaffold", "fixed", "GTTTAAGAGCTAAGCTGGAAACAGCATAGCAAGTTTA", 37, 37, parent),
        _region("feature_primer", "custom_primer", "feature_primer", "fixed", "AATAAGGCTAGTCCGTTATCAACTTG", 26, 26, parent),
        _region("truseq_read2", "truseq_read2", "truseq_read2", "fixed", "AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC", 34, 34, parent),
        _region("illumina_p5", "illumina_p5", "Illumina P5", "fixed", "AATGATACGGCGACCACCGAGATCTACAC", 29, 29, parent),
        _region("illumina_p7", "illumina_p7", "Illumina P7", "fixed", "ATCTCGTATGCCGTCTTCTGCTTG", 24, 24, parent),
        _region("index5", "index5", "index5", "fixed", first['index5'], 10, 10, parent),
        _region("index7", "index7", "index7", "fixed", first['index7'], 10, 10, parent),
    ]

    return {
        "!Assay": {
            "seqspec_version": "0.3.0",
            "assay_id": "10XCRISPR5prime",
            "name": "10x CRISPR and RNA assay 5'",
            "doi": "https://doi.org/",
            "date": "25 July 2025",
            "description": "10x single-cell RNA-seq and CRISPR 5'",
            "modalities": ["crispr"],
            "lib_struct": "https://teichlab.github.io/scg_lib_structs/methods_html/10xChromium5vdjfb.html",
            "sequence_protocol": "Illumina NextSeq 2000 (EFO:0010963), Illumina NovaSeq X Plus (EFO:0022841)",
            "sequence_kit": first["sequencing_kit"],
            "library_protocol": [
                {"!LibProtocol": {
                    "protocol_id": "Chromium Next GEM Single Cell 5' HT Reagent Kits v2 (Dual Index) (10x Genomics)",
                    "name": "5' Gene Expression (GEX) Library Construction",
                    "modality": "crispr",
                }}
            ],
            "library_kit": "10XCRISPR5prime",
            "sequence_spec": read_entries,
            "library_spec": [
                {"!Region": {
                    "parent_id": parent,
                    "region_id": "crispr",
                    "region_type": "crispr",
                    "name": "crispr",
                    "sequence_type": "joined",
                    "sequence": "AATGATACGGCGACCACCGAGATCTACACNNNNNNNNNNACACTCTTTCCCTACACGACGCTCTTCCGATCTNNNNNNNNNNNNNNNNXXXXXXXXXXXXTTTCTTATATGGGGNNNNNNNNNNNNNNNNNNNNGTTTAAGAGCTAAGCTGGAAACAGCATAGCAAGTTTAAATAAGGCTAGTCCGTTATCAACTTGAGATCGGAAGAGCACACGTCTGAACTCCAGTCACNNNNNNNNNNATCTCGTATGCCGTCTTCTGCTTG",
                    "min_len": 135,
                    "max_len": 269,
                    "onlist": None,
                    "regions": regions,
                }}
            ],
        }
    }
def generate_seqspecs(input_csv, output_dir):
    """
    Write one gzipped seqspec YAML per sample found in ``input_csv``.

    Rows are grouped by the basename of their "submitted_file_name" column,
    cut at the read tag ("_R" + one digit, followed by "_", "." or the end
    of the name). Cutting at the first bare "_R" would merge unrelated
    samples whose names contain e.g. "_Rep1"; that older rule is kept only
    as a fallback for names without a recognisable read tag.

    Args:
        input_csv: Path to the CSV manifest (one row per FASTQ file).
        output_dir: Directory receiving ``seqspec_<sample>.yaml.gz`` files;
            created if missing.
    """
    import re  # local import: the module header does not import re

    read_tag = re.compile(r'_R\d(?=[_.]|$)')
    os.makedirs(output_dir, exist_ok=True)
    samples = {}
    with open(input_csv, newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            filename = os.path.basename(row["submitted_file_name"])
            hit = read_tag.search(filename)
            prefix = filename[:hit.start()] if hit else filename.split('_R')[0]
            samples.setdefault(prefix, []).append(row)
    for sample, reads in samples.items():
        seqspec = build_seqspec(sample, reads)
        output_file = os.path.join(output_dir, f"seqspec_{sample}.yaml.gz")
        with gzip.open(output_file, 'wt', encoding='utf-8') as gzfile:
            yaml.dump(seqspec, gzfile, sort_keys=False)

        print(f"Generated: {output_file}")
    print(f"\nDone! Seqspecs saved in: {output_dir}")
def main():
    """Read -i/--input and -o/--output from the command line and run the generator."""
    cli = argparse.ArgumentParser(
        description="Generate seqspec YAMLs (gzipped) from CSV",
    )
    cli.add_argument(
        "-i", "--input",
        required=True,
        help="Input CSV file",
    )
    cli.add_argument(
        "-o", "--output",
        default="seqspec_files",
        help="Output directory",
    )
    opts = cli.parse_args()
    generate_seqspecs(opts.input, opts.output)


# Only run the CLI when executed as a script.
if __name__ == "__main__":
    main()

Overwriting /work/rr151/TF_perturbSeq50genes_BL/generate_seqspec10x5pv3-sgRNA.py


In [None]:
# Run each generator script on its CSV manifest (-i) and write the gzipped seqspec files to the given directory (-o).
# NOTE(review): the v2 GEX run writes to seqspec_yaml and the v3 GEX run to seqspecCORRECTED9102025, while both sgRNA runs write to seqspecCORRECTED9112025 — confirm these output directories are intended.
python3 /work/rr151/TF_perturbSeq50genes_BL/generate_seqspec10x5pv2-sgRNA.py -i /work/rr151/TF_perturbSeq50genes_BL/10x5pv2-sgRNA.csv -o /work/rr151/TF_perturbSeq50genes_BL/seqspecCORRECTED9112025
python3 /work/rr151/TF_perturbSeq50genes_BL/generate_seqspec10x5pv2_gex.py -i /work/rr151/TF_perturbSeq50genes_BL/10x5pv2-GEX.csv -o /work/rr151/TF_perturbSeq50genes_BL/seqspec_yaml
python3 /work/rr151/TF_perturbSeq50genes_BL/generate_seqspec10x5pv3_gex.py -i /work/rr151/TF_perturbSeq50genes_BL/10x5pv3-GEX.csv -o /work/rr151/TF_perturbSeq50genes_BL/seqspecCORRECTED9102025
python3 /work/rr151/TF_perturbSeq50genes_BL/generate_seqspec10x5pv3-sgRNA.py -i /work/rr151/TF_perturbSeq50genes_BL/10x5pv3-sgRNA.csv -o /work/rr151/TF_perturbSeq50genes_BL/seqspecCORRECTED9112025