In [None]:
from aavomics import database
import os
import shutil

from pepars.utils import Illumina_FASTQ_File_Set
from skbio.alignment import StripedSmithWaterman as SSW
import re

In [None]:
def align_cell_barcode_templates(FASTQ_file_set, template, cell_barcode_UMI_length):
    """Align each read 2 against ``template`` and count (barcode/UMI, alignment) pairs.

    For every record yielded by ``FASTQ_file_set.get_sequence_iterator()`` the
    second-to-last element is taken as read 1 and the last element as read 2.
    The first ``cell_barcode_UMI_length`` characters of read 1 are used as the
    cell barcode + UMI, and read 2 is passed to an aligner built from
    ``template``.

    Args:
        FASTQ_file_set: Object exposing ``get_sequence_iterator()``; each item
            must support negative indexing (``[-2]`` and ``[-1]``).
        template: Sequence handed to ``StripedSmithWaterman`` when building
            the aligner.
        cell_barcode_UMI_length: Number of leading read 1 characters kept as
            the cell barcode + UMI.

    Returns:
        dict mapping ``(cell_barcode_UMI, alignment_tuple)`` to the number of
        reads that produced that pair, where ``alignment_tuple`` is
        ``(cigar, alignment_score, alignment_max_score, target_begin,
        query_begin, insertion_sequence)``.
    """

    alignment_counts = {}

    # Alignment summary per distinct read 2 sequence. The result depends only
    # on read 2 and the template, so identical reads are aligned just once.
    sequence_alignments = {}

    cigar_regex = re.compile(r"(\d+)(\w)")
    aligner = SSW(template, suppress_sequences=False)

    for sequences in FASTQ_file_set.get_sequence_iterator():

        read_1 = sequences[-2]
        read_2 = sequences[-1]

        cell_barcode_UMI = read_1[0:cell_barcode_UMI_length]

        alignment_tuple = sequence_alignments.get(read_2)

        if alignment_tuple is None:
            alignment = aligner(read_2)
            cigar_segments = cigar_regex.findall(alignment["cigar"])

            # Walk the CIGAR while tracking a position in read 2, collecting
            # the read 2 bases covered by "D" segments.
            target_index = alignment["target_begin"]
            insertion_sequence = ""

            for match_number, match_type in cigar_segments:

                match_number = int(match_number)

                if match_type == "M":
                    target_index += match_number
                elif match_type == "I":
                    # NOTE(review): this moves the read 2 position backwards.
                    # Under the usual CIGAR convention an "I" segment would
                    # leave the target position unchanged - confirm this is
                    # intended before relying on insertion_sequence for
                    # alignments that contain "I" segments.
                    target_index -= match_number
                elif match_type == "D":
                    insertion_sequence += read_2[target_index:target_index+match_number]
                    target_index += match_number
                else:
                    print("Match type is %s wut" % match_type)

            # Upper bound used to normalise the score downstream; presumably 2
            # is the aligner's per-base match score - TODO confirm.
            alignment_max_score = min(len(template), len(read_2)) * 2

            alignment_score = int(alignment["optimal_alignment_score"])

            alignment_tuple = (
                alignment["cigar"],
                alignment_score,
                alignment_max_score,
                int(alignment["target_begin"]),
                int(alignment["query_begin"]),
                insertion_sequence,
            )

            # Remember the result so repeated read 2 sequences skip alignment.
            sequence_alignments[read_2] = alignment_tuple

        cell_barcode_UMI_alignment_tuple = (cell_barcode_UMI, alignment_tuple)

        alignment_counts[cell_barcode_UMI_alignment_tuple] = (
            alignment_counts.get(cell_barcode_UMI_alignment_tuple, 0) + 1
        )

    return alignment_counts

In [None]:
# For every cell set, align its "Virus Transcripts" read sets against each
# template injected into the source animal, writing one CSV of alignment
# counts per (read set, template) pair. Outputs that already exist are
# skipped, so the cell can be re-run to resume unfinished work.
for cell_set in database.CELL_SETS:

    tissue_sample = cell_set.source_tissue
    dissociation_run = tissue_sample.dissociation_run
    animal = tissue_sample.animal

    if animal is None:
        continue

    injections = animal.injections

    # Distinct cargo of every vector in every pool injected into this animal.
    templates = set()

    for injection in injections:
        for vector in injection.vector_pool.vectors:
            templates.add(vector.cargo)

    # Number of leading read 1 bases treated as cell barcode + UMI.
    # NOTE(review): presumably 16 bp barcode plus a 10 bp (protocol 2) or
    # 12 bp (other protocols) UMI - confirm against the library prep.
    if dissociation_run.protocol_version == 2:
        cell_barcode_UMI_length = 26
    else:
        cell_barcode_UMI_length = 28

    if len(templates) == 0:
        print("Skipping %s, no templates injected" % cell_set.name)
        continue

    # Input reads and output alignments both live under the cell set's
    # "virus" directory; neither path depends on the template or read set.
    virus_directory = os.path.join(database.DATA_PATH, "cell_sets", cell_set.name, "virus")
    amplified_directory = os.path.join(virus_directory, "reads")
    alignment_file_directory = os.path.join(virus_directory, "alignments")

    read_sets = set()

    for sequencing_library in cell_set.sequencing_libraries:
        if sequencing_library.type == "Virus Transcripts":
            read_sets.update(sequencing_library.read_sets)

    if len(read_sets) == 0:
        print("Skipping %s, no amplified read sets detected" % cell_set.name)
        continue

    for template in templates:

        for read_set in read_sets:

            alignment_file_name = "%s_%s.csv" % (read_set.name, template.name)
            alignment_file_path = os.path.join(alignment_file_directory, alignment_file_name)

            if os.path.exists(alignment_file_path):
                print("Skipping %s against %s, already exists" % (read_set.name, template.name))
                continue

            template_sequence = template.sequence

            print("Aligning %s against %s" % (read_set.name, template.name))

            file_set = Illumina_FASTQ_File_Set(amplified_directory, read_set.name)

            if len(file_set.file_paths) == 0:
                print("Cant find %s in %s" % (read_set.name, cell_set.name))
                continue

            alignments = align_cell_barcode_templates(file_set, template_sequence, cell_barcode_UMI_length)

            # exist_ok avoids the separate existence check (and its race).
            os.makedirs(alignment_file_directory, exist_ok=True)

            # Write to a temporary name first and move it into place only once
            # complete, so the "already exists" check above never sees a
            # partially written file.
            temp_alignment_file_path = alignment_file_path + ".tmp"

            with open(temp_alignment_file_path, "w") as temp_alignment_file:
                # Most frequent (barcode/UMI, alignment) pairs first.
                for alignment in sorted(alignments.items(), key=lambda x: x[1], reverse=True):
                    (cell_barcode_UMI, alignment_tuple), count = alignment
                    alignment_fields = ",".join([str(x) for x in alignment_tuple])
                    # Row layout: barcode/UMI, alignment fields, read count.
                    temp_alignment_file.write(
                        cell_barcode_UMI + "," + alignment_fields + "," + str(count) + "\n"
                    )

            shutil.move(src=temp_alignment_file_path, dst=alignment_file_path)