In [None]:
import os
import shutil

import subprocess

from aavomics import database

In [None]:
BASE_REFERENCE_NAME = "refdata-gex-mm10-2020-A"
NEW_REFERENCE_NAME = "refdata-gex-mm10-2020-A-AAVomics"

In [None]:
source_reference_FASTA_path = os.path.join(database.DATA_PATH, "references", BASE_REFERENCE_NAME, "fasta", "genome.fa")
source_reference_GTF_path = os.path.join(database.DATA_PATH, "references", BASE_REFERENCE_NAME, "genes", "genes.gtf")

if os.path.exists("tmp"):
    shutil.rmtree("tmp")
    
os.makedirs("tmp")

reference_FASTA_path = os.path.join("tmp", "genome.fa")
reference_GTF_path = os.path.join("tmp", "genes.gtf")

shutil.copy(source_reference_FASTA_path, reference_FASTA_path)
shutil.copy(source_reference_GTF_path, reference_GTF_path)

In [None]:
template_segments = {
    "CAG_mNeonGreen": ["mNeonGreen", "stop", "WPRE_1", "WPRE_2", "WPRE_polyA"],
    "CAG_mRuby2_NLS": ["mRuby2", "2xNLS", "WPRE_1", "WPRE_2", "WPRE_polyA"],
    "CAG_mNeonGreen_NLS": ["mNeonGreen", "2xNLS", "WPRE_1", "WPRE_2", "WPRE_polyA"],
    "mNeonGreen": ["mNeonGreen", "polyA"],
    "CAG_tdTomato_NLS": ["tdTomato_1", "tdTomato_2", "2xNLS", "WPRE_1", "WPRE_2", "WPRE_polyA"],
    "CAG_tdTomato": ["tdTomato_1", "polyA_2", "WPRE_2", "WPRE_polyA"],
    "AAV9_cap": ["AAV9_cap"],
    "UBC_mCherry": ["mCherry", "polyA"],
    "CAG_hFXN": ["hFXN", "WPRE_polyA"]
}

segments = [
    "mNeonGreen",
    "stop",
    "mRuby2",
    "tdTomato_1",
    "tdTomato_2",
    "hFXN",
    "mCherry",
    "polyA",
    "polyA_2",
    "2xNLS",
    "WPRE_1",
    "WPRE_2",
    "WPRE_polyA",
    "AAV9_cap"
]

segment_sequences = {
    "mNeonGreen": "CGCCACCATGGTGAGCAAGGGCGAGGAGGATAACATGGCCTCTCTCCCAGCGACACATGAGTTACACATCTTTGGCTCCATCAACGGTGTGGACTTTGACATGGTGGGTCAGGGCACCGGCAATCCAAATGATGGTTATGAGGAGTTAAACCTGAAGTCCACCAAGGGTGACCTCCAGTTCTCCCCCTGGATTCTGGTCCCTCATATCGGGTATGGCTTCCATCAGTACCTGCCCTACCCTGACGGGATGTCGCCTTTCCAGGCCGCCATGGTAGATGGCTCCGGATACCAAGTCCATCGCACAATGCAGTTTGAAGATGGTGCCTCCCTTACTGTTAACTACCGCTACACCTACGAGGGAAGCCACATCAAAGGAGAGGCCCAGGTGAAGGGGACTGGTTTCCCTGCTGACGGTCCTGTGATGACCAACTCGCTGACCGCTGCGGACTGGTGCAGGTCGAAGAAGACTTACCCCAACGACAAAACCATCATCAGTACCTTTAAGTGGAGTTACACCACTGGAAATGGCAAGCGCTACCGGAGCACTGCGCGGACCACCTACACCTTTGCCAAGCCAATGGCGGCTAACTATCTGAAGAACCAGCCGATGTACGTGTTCCGTAAGACGGAGCTCAAGCACTCCAAGACCGAGCTCAACTTCAAGGAGTGGCAAAAGGCCTTTACCGATGTGATGGGCATGGACGAGCTGTACAAG",
    "stop": "TGA",
    "mRuby2": "CGCCACCATGGTGAGCAAGGGAGAGGAACTCATCAAGGAGAACATGCGCATGAAGGTAGTCATGGAAGGGTCTGTGAATGGGCATCAGTTCAAGTGCACAGGGGAAGGTGAAGGGAATCCCTACATGGGTACTCAGACCATGAGGATCAAGGTTATTGAAGGTGGCCCTCTCCCGTTTGCCTTCGACATTCTTGCCACTAGCTTCATGTATGGCAGTCGGACCTTCATCAAGTACCCCAAAGGCATTCCAGACTTCTTCAAACAGTCCTTTCCAGAGGGCTTCACATGGGAAAGGGTGACCCGTTACGAGGATGGTGGTGTGGTCACAGTCATGCAGGACACTAGTCTGGAGGACGGATGTCTGGTCTATCACGTGCAAGTGAGAGGGGTGAACTTTCCCTCCAATGGGCCTGTGATGCAGAAGAAAACCAAAGGGTGGGAGCCAAACACAGAGATGATGTATCCTGCAGATGGAGGACTGAGAGGCTATACGCACATGGCTCTGAAGGTTGACGGAGGAGGCCACTTGTCATGCTCATTTGTGACCACGTACAGGAGCAAGAAAACTGTCGGCAACATCAAAATGCCTGGCATACATGCGGTTGATCACCGCTTGGAACGACTGGAGGAGTCTGACAACGAGATGTTTGTGGTACAACGGGAACATGCAGTTGCCAAGTTTGCTGGCCTTGGCGGAGGAATGGATGAGCTGTACAAG",
    "tdTomato_1": "CGCCACCATGGTGTCAAAGGGAGAGGAAGTCATCAAGGAATTCATGCGGTTTAAAGTTCGCATGGAAGGGAGCATGAACGGCCATGAATTCGAGATCGAAGGGGAGGGAGAGGGCCGACCATACGAGGGGACACAGACGGCGAAATTGAAAGTGACCAAAGGAGGACCCTTGCCATTCGCTTGGGACATTCTGTCCCCTCAGTTCATGTACGGAAGTAAGGCCTATGTGAAGCATCCTGCCGATATTCCCGACTATAAGAAACTGTCCTTCCCGGAGGGGTTCAAGTGGGAAAGAGTGATGAATTTCGAGGATGGCGGGCTCGTGACCGTCACCCAAGATAGCAGCTTGCAGGACGGAACACTGATCTACAAGGTCAAGATGAGGGGCACTAATTTCCCTCCTGATGGTCCTGTCATGCAAAAGAAAACCATGGGGTGGGAAGCAAGTACGGAACGCCTGTACCCACGGGACGGTGTTCTGAAGGGCGAGATACATCAGGCACTTAAACTCAAGGACGGCGGCCACTATCTGGTCGAATTTAAGACGATCTATATGGCAAAGAAACCAGTACAGCTCCCGGGTTACTACTATGTGGACACTAAACTTGACATTACAAGTCACAACGAGGACTATACCATCGTGGAACAGTATGAACGCTCTGAGGGGAGGCACCATCTGTTTCTTGGCCATGGAACAGGCAGCACAGGATCTGGATCATCTGGCACTGCATCCAGCGAGGATAACAACATGGCCGTGATTAAGGAGTTCATGAGGTTCAAAGTTCGTATGGAGGGATCAATGAATGGCCACGAATTCGAAATCGAGGGTGAAGGGGAGGGTAGACCCTACGAGGGCACTCAGACAGCCAAGCTGAAGGTAACCAAGGGCGGACCTCTGCCTTTTGCTTGGGACATCCTGTCTCCCCAGTTTATGTACGGTAGCAAGGCCTATGTCAAACACCCAGCTGACATCCCGGACTACAAGAAGCTCTCCTTTCCAGAAGGCTTTAAGTGGGAGCGGGTTATGAACTTCGAGGATGGTGGGCTTGTAACTGTGACCCAGGATTCCAGTCTGCAGGATGGGACTCTGATTTACAAAGTGAAAATGCGTGGAACTAACTTTCCACCCGATGGCCCCGTTATGCAGAAGAAAACAATGGGCTGGGAAGCTAGCACCGAGAGACTGTATCCTAGGGATGGAGTGCTGAAGGGTGAGATCCACCAAGCTCTGAAACTCAAAGATGGCGGACATTACCTGGTGGAGTTTAAGACCATATACATGGCCAAGAAGCCCGTGCAATTGCCCGGGTACTACTACGTAGACACAAAGCTGGACATAACCTCACACAATGAGGACTACACAATTGTCGAACAGTATGAGCGATCTGAGGGTAGACACCACCTCTTTCTCTATGGGATGGATGAGCTGTATAAA",
    "mCherry": "GGCCGCCATGGTCAGCAAGGGCGAGGAGGATAACATGGCCATCATCAAGGAGTTCATGCGCTTCAAGGTGCACATGGAGGGCTCCGTGAACGGCCACGAGTTCGAGATCGAGGGCGAGGGCGAGGGCCGCCCCTACGAGGGCACCCAGACCGCCAAGCTGAAGGTGACCAAGGGTGGCCCCCTGCCCTTCGCCTGGGACATCCTGTCCCCTCAATTCATGTATGGCTCCAAGGCCTACGTGAAGCACCCCGCCGACATCCCCGACTACTTGAAGCTGTCCTTCCCCGAGGGCTTCAAGTGGGAGCGCGTGATGAACTTCGAGGACGGCGGCGTGGTGACCGTGACCCAGGACTCCTCCTTACAAGACGGCGAGTTCATCTACAAAGTGAAGCTGCGCGGCACCAACTTCCCCTCCGACGGCCCCGTAATGCAGAAGAAGACCATGGGCTGGGAGGCCTCCTCCGAGCGGATGTACCCCGAGGACGGCGCCCTGAAGGGCGAGATCAAGCAGAGGCTGAAGCTGAAGGACGGCGGCCACTACGACGCTGAGGTCAAGACCACCTACAAGGCCAAGAAGCCCGTGCAGCTGCCCGGCGCCTACAACGTCAACATCAAGTTGGACATCACCTCCCACAACGAGGACTACACCATCGTGGAACAGTACGAACGCGCCGAGGGCCGCCACTCCACCGGCGGCATGGACGAGCTGTACAAG",
    "hFXN": "CGCCACCATGTGGACTCTCGGGAGACGCGCAGTAGCTGGATTGCTGGCATCACCTAGCCCAGCCCAGGCCCAGACTCTCACCCGGGTCCCTCGGCCGGCAGAGTTGGCCCCACTCTGCGGTCGCCGTGGCCTGCGCACCGACATCGATGCGACCTGCACGCCCCGCCGCGCAAGTTCGAACCAACGTGGCCTCAACCAGATTTGGAATGTCAAAAAGCAGAGTGTCTATTTGATGAATTTGAGGAAATCTGGAACTTTGGGCCACCCAGGCTCACTAGATGAGACCACCTATGAAAGACTAGCAGAGGAAACGCTGGACTCTTTAGCAGAGTTTTTTGAAGACCTTGCAGACAAGCCATACACCTTTGAGGACTATGATGTCTCCTTTGGGAGTGGTGTCTTAACTGTCAAACTGGGTGGCGATCTAGGAACCTATGTGATCAACAAGCAGACGCCAAACAAGCAAATCTGGCTATCTTCTCCATCCAGTGGACCTAAGCGTTATGACTGGACTGGGAAAAACTGGGTGTACTCCCACGACGGCGTGTCCCTCCATGAGCTGCTGGCCGCAGAGCTGACTAAAGCCTTAAAAACCAAACTGGACTTGTCTTCCTTGGCCTATTCCGGAAAAGATGCTAGCGACTATAAGGACCATGATGGAGATTACAAAGATCATGACGACTACAAGGACGACGACGATAAATGAGAATTCAAGCTT",
    "tdTomato_2": "CGATTGTACAAG",
    "2xNLS": "GGAAGCCCCAAGAAAAAGCGGAAGGTGTGA",
    "WPRE_1": "GAATTCGATATC",
    "polyA_2": "AAGCTGAACCCTCCTGATGAGAGTGGCCCCGGCTGCATGAGCTGCAAGTGTGTGCTCTCCTGA",
    "WPRE_2": "AAGCTTATCGATAATCAACCTCTGGATTACAAAATTTGTGAAAGATTGACTGGTATTCTTAACTATGTTGCTCCTTTTACGCTATGTGGATACGCTGCTTTAATGCCTTTGTATCATGCTATTGCTTCCCGTATGGCTTTCATTTTCTCCTCCTTGTATAAATCCTGGTTGCTGTCTCTTTATGAGGAGTTGTGGCCCGTTGTCAGGCAACGTGGCGTGGTGTGCACTGTGTTTGCTGACGCAACCCCCACTGGTTGGGGCATTGCCACCACCTGTCAGCTCCTTTCCGGGACTTTCGCTTTCCCCCTCCCTATTGCCACGGCGGAACTCATCGCCGCCTGCCTTGCCCGCTGCTGGACAGGGGCTCGGCTGTTGGGCACTGACAATTCCGTGGTGTTGTCGGGGAAATCATCGTCCTTTCCTTGGCTGCTCGCCTATGTTGCCACCTGGATTCTGCGCGGGACGTCCTTCTGCTACGTCCCTTCGGCCCTCAATCCAGCGGACCTTCCTTCCCGCGGCCTGCTGCCGGCTCTGCGGCCTCTTCCGCGTCTTCGCCTTCGCCCTCAGACGAGTCGGATCTCCCTTTGGGCCGCCTCCCCGCATCGATACCGAGCGCTG",
    "WPRE_polyA": "CTCGAGAGATCTACGGGTGGCATCCCTGTGACCCCTCCCCAGTGCCTCTCCTGGCCCTGGAAGTTGCCACTCCAGTGCCCACCAGCCTTGTCCTAATAAAATTAAGTTGCATCATTTTGTCTGACTAGGTGTCCTTCTATAATATTATGGGGTGGAGGGGGGTGGTATGGAGCAA",
    "polyA": "TAAAGGATCCCCTCCCCCGTGCCTTCCTTGACCCTGGAAGGTGCCACTCCCACTGTCCTTTCCTCCTCCTAGGAATAAAATATCTTTATTTTCATTACATCTGTGTGTTGGTTTTTTGTGT",
    "AAV9_cap": "ATGGCTGCCGATGGTTATCTTCCAGATTGGCTCGAGGACAACCTTAGTGAAGGAATTCGCGAGTGGTGGGCTTTGAAACCTGGAGCCCCTCAACCCAAGGCAAATCAACAACATCAAGACAACGCTCGAGGTCTTGTGCTTCCGGGTTACAAATACCTTGGACCCGGCAACGGACTCGACAAGGGGGAGCCGGTCAACGCAGCAGACGCGGCGGCCCTCGAGCACGACAAGGCCTACGACCAGCAGCTCAAGGCCGGAGACAACCCGTACCTCAAGTACAACCACGCCGACGCCGAGTTCCAGGAGCGGCTCAAAGAAGATACGTCTTTTGGGGGCAACCTCGGGCGAGCAGTCTTCCAGGCCAAAAAGAGGCTTCTTGAACCTCTTGGTCTGGTTGAGGAAGCGGCTAAGACGGCTCCTGGAAAGAAGAGGCCTGTAGAGCAGTCTCCTCAGGAACCGGACTCCTCCGCGGGTATTGGCAAATCGGGTGCACAGCCCGCTAAAAAGAGACTCAATTTCGGTCAGACTGGCGACACAGAGTCAGTCCCAGACCCTCAACCAATCGGAGAACCTCCCGCAGCCCCCTCAGGTGTGGGATCTCTTACAATGGCTTCAGGTGGTGGCGCACCAGTGGCAGACAATAACGAAGGTGCCGATGGAGTGGGTAGTTCCTCGGGAAATTGGCATTGCGATTCCCAATGGCTGGGGGACAGAGTCATCACCACCAGCACCCGAACCTGGGCCCTGCCCACCTACAACAATCACCTCTACAAGCAAATCTCCAACAGCACATCTGGAGGATCTTCAAATGACAACGCCTACTTCGGCTACAGCACCCCCTGGGGGTATTTTGACTTCAACAGATTCCACTGCCACTTCTCACCACGTGACTGGCAGCGACTCATCAACAACAACTGGGGATTCCGGCCTAAGCGACTCAACTTCAAGCTCTTCAACATTCAGGTCAAAGAGGTTACGGACAACAATGGAGTCAAGACCATCGCCAATAACCTTACCAGCACGGTCCAGGTCTTCACGGACTCAGACTATCAGCTCCCGTACGTGCTCGGGTCGGCTCACGAGGGCTGCCTCCCGCCGTTCCCAGCGGACGTTTTCATGATTCCTCAGTACGGGTATCTGACGCTTAATGATGGAAGCCAGGCCGTGGGTCGTTCGTCCTTTTACTGCCTGGAATATTTCCCGTCGCAAATGCTAAGAACGGGTAACAACTTCCAGTTCAGCTACGAGTTTGAGAACGTACCTTTCCATAGCAGCTACGCTCACAGCCAAAGCCTGGACCGACTAATGAATCCACTCATCGACCAATACTTGTACTATCTCTCTAGAACTATTAACGGTTCTGGACAGAATCAACAAACGCTAAAATTCAGTGTGGCCGGACCCAGCAACATGGCTGTCCAGGGAAGAAACTACATACCTGGACCCAGCTACCGACAACAACGTGTCTCAACCACTGTGACTCAAAACAACAACAGCGAATTTGCTTGGCCTGGAGCTTCTTCTTGGGCTCTCAATGGACGTAATAGCTTGATGAATCCTGGACCTGCTATGGCCTCTCACAAAGAAGGAGAGGACCGTTTCTTTCCTTTGTCTGGATCTTTAATTTTTGGCAAACAAGGTACTGGCAGAGACAACGTGGATGCGGACAAAGTCATGATAACCAACGAAGAAGAAATTAAAACTACTAACCCGGTAGCAACGGAGTCCTATGGACAAGTGGCCACAAACCACCAGAGTGCCCAAGCACAGGCGCAGACCGGTTGGGTTCAAAACCAAGGAATACTTCCGGGTATGGTTTGGCAGGACAGAGATGTGTACCTGCAAGGACCCATTTGGGCCAAAATTCCTCACACGGACGGCAACTTTCACCCTTCTCCGCTGATGGGAGGGTTTGGAATGAAGCACCCGCCTCCTCAGATCCTCATCAAAAACACACCTGTACCTGCGGATCCTCCAACGGCCTTCAACAAGGACAAGCTGAACTCTTTCATCACCCAGTATTCTACTGGCCAAGTCAGCGTGGAGATCGAGTGGGAGCTGCAGAAGGAAAACAGCAAGCGCTGGAACCCGGAGATCCAGTACACTTCCAACTATTACAAGTCTAATAATGTTGAATTTGCTGTTAATACTGAAGGTGTATATAGTGAACCCCGCCCCATTGGCACCAGATACCTGACTCGTAATCTGTAAGTCGACTACCGTTCGTATAGCATACATTATACGAAGTTATCATATGTTCGAGCAGACATGATAAGATACATTGATGAGTTTGGACAAACCACAACTAGAATGCAGTGAAAAAAATGCTTTATTTGTGAAATTTGTGATGCTATTGCTTTATTTGTAACCATTATAAGCTGCAATAAA"
}

In [None]:
merged_sequence = ""

for segment in segments:
    print("%s of length %i starts at %i" % (segment, len(segment_sequences[segment]), len(merged_sequence) + 1))
    merged_sequence += segment_sequences[segment]

In [None]:
merged_sequence

In [None]:
with open(reference_FASTA_path, "a") as reference_FASTA_file:
        
    reference_FASTA_file.write(">AAV\n")

    sequence_index = 0

    while sequence_index < len(merged_sequence):

        reference_FASTA_file.write("%s\n" % merged_sequence[sequence_index:sequence_index + 60])

        sequence_index += 60

In [None]:
for template in database.TEMPLATES:

    for template_segment in template_segments[template.name]:
        segment_start_index = 0

        for segment in segments:
            if segment == template_segment:
                break
            segment_start_index += len(segment_sequences[segment])

        segment_end_index = segment_start_index + len(segment_sequences[template_segment])

        segment_start_index += 1

        line = "AAV\tBROWN\texon\t%i\t%i\t.\t+\t.\tgene_id \"%s\"; transcript_id \"%s_%s\"; gene_name \"%s\"; gene_biotype \"protein_coding\";\n" % \
            (segment_start_index, segment_end_index, template.name, template.name, template_segment, template.name)

        print(line)

In [None]:
with open(reference_GTF_path, "a") as reference_GTF_file:

    for template in database.TEMPLATES:
        
        cumulative_template_sequence = ""

        current_transcript_id = []
        cumulative_segment_start_index = None
        previous_end_index = None

        for template_segment in template_segments[template.name]:

            segment_start_index = 0

            for segment in segments:

                if segment == template_segment:
                    break

                segment_start_index += len(segment_sequences[segment])

            segment_end_index = segment_start_index + len(segment_sequences[template_segment])

    #         print(template_segment, segment_start_index, previous_end_index)
            # If this segment is continous with the previous one, we keep looping
            if previous_end_index is None or segment_start_index == previous_end_index:
                current_transcript_id.append(template_segment)
                previous_end_index = segment_end_index

                if cumulative_segment_start_index is None:
                    cumulative_segment_start_index = segment_start_index + 1
                continue
            # Otherwise we're done and can print this one
            else:
                line = "AAV\tBROWN\texon\t%i\t%i\t.\t+\t.\tgene_id \"AAV\"; transcript_id \"%s\"; gene_name \"AAV\"; gene_biotype \"protein_coding\"; exon_id \"%s_%s\";\n" % \
                    (cumulative_segment_start_index, previous_end_index, template.name, template.name, "_".join(current_transcript_id))
                cumulative_template_sequence += merged_sequence[cumulative_segment_start_index-1:previous_end_index]
                current_transcript_id = [template_segment]
                cumulative_segment_start_index = segment_start_index + 1
                previous_end_index = segment_end_index
                print(line)
                reference_GTF_file.write(line)

        if cumulative_segment_start_index is not None:
            line = "AAV\tBROWN\texon\t%i\t%i\t.\t+\t.\tgene_id \"AAV\"; transcript_id \"%s\"; gene_name \"AAV\"; gene_biotype \"protein_coding\"; exon_id \"%s_%s\";\n" % \
                (cumulative_segment_start_index, previous_end_index, template.name, template.name, "_".join(current_transcript_id))
            cumulative_template_sequence += merged_sequence[cumulative_segment_start_index-1:previous_end_index]
            print(line)
            reference_GTF_file.write(line)
    
        if cumulative_template_sequence != template.sequence:
            print("Something went wrong during sequence reconstruction, expected sequence of length %i, got sequence of length %i" % (len(template.sequence), len(cumulative_template_sequence)))

In [None]:
result = subprocess.run(
    [
        "cellranger",
        "mkref",
        "--genome=%s" % NEW_REFERENCE_NAME,
        "--fasta=genome.fa",
        "--genes=genes.gtf",
        "--nthreads=15"
    ],
    cwd="tmp",
)

In [None]:
shutil.move(os.path.join("tmp", NEW_REFERENCE_NAME), os.path.join(database.DATA_PATH, "references", NEW_REFERENCE_NAME))

In [None]:
shutil.rmtree("tmp")