In [None]:
import os
import shlex
import shutil
import subprocess

from aavomics import database

In [None]:
# Path to the bash script template that is copied once per cell set and then
# extended with the cellranger commands for that job.
BASH_SCRIPT_TEMPLATE = os.path.join("..", "bin", "cellranger_count.sh")

# Output files of a cellranger run that are kept in the alignment directory.
# An alignment counts as complete only when every one of them is present.
FILE_NAMES_TO_TRANSFER = [
    "filtered_feature_bc_matrix.h5",
    "raw_feature_bc_matrix.h5",
    "possorted_genome_bam.bam",
    "possorted_genome_bam.bam.bai",
    "web_summary.html",
]

# Set to True to re-queue alignments whose output files already exist.
FORCE_RERUN = False

# Set to True to delete a leftover "tmp" working directory instead of
# treating it as a sign that an alignment is still in progress.
REMOVE_TMP_DIR = False

In [None]:
# Queue one `cellranger count` job (via sbatch) for every (alignment, cell set)
# pair that has transcriptome reads and is neither finished nor in progress.
for alignment in database.ALIGNMENTS:

    # Get the name of the reference we will use for this alignment
    reference_name = database.ALIGNMENTS_DICT[alignment.name].reference.name

    for cell_set in database.CELL_SETS:

        print(alignment.name, cell_set.name)

        # NOTE(review): the generated script `pushd`s into working_dir and then
        # refers to these paths again, so this presumably relies on
        # database.DATA_PATH being an absolute path - confirm.
        transcriptome_dir = os.path.join(database.DATA_PATH, "cell_sets", cell_set.name, "transcriptome")
        alignment_dir = os.path.join(transcriptome_dir, "transcripts", alignment.name)
        working_dir = os.path.join(alignment_dir, "tmp")

        # Check if the working directory already exists - this means an alignment may be in progress
        if os.path.exists(working_dir):
            if not REMOVE_TMP_DIR:
                print("Skipping %s, alignment in progress detected" % cell_set.name)
                continue
            shutil.rmtree(working_dir)

        # The alignment is complete only if every expected output file is present
        already_exists = all(
            os.path.exists(os.path.join(alignment_dir, file_name))
            for file_name in FILE_NAMES_TO_TRANSFER
        )

        if already_exists and not FORCE_RERUN:
            print("Skipping %s, alignment already detected" % cell_set.name)
            continue

        # When rerunning, the previous outputs are NOT deleted yet: they are
        # only removed further down, once we know a new job can be queued.
        # Deleting here would destroy finished results for cell sets that are
        # skipped by the checks below.

        if not cell_set.has_reads(read_type="Transcriptome"):
            print("Skipping %s, doesn't have transcriptome reads" % cell_set.name)
            continue

        # Collect the unique read set names of all transcriptome libraries,
        # sorted so the generated command line is the same on every run
        read_set_names = sorted({
            read_set.name
            for sequencing_library in cell_set.sequencing_libraries
            if sequencing_library.type == "Transcriptome"
            for read_set in sequencing_library.read_sets
        })

        if len(read_set_names) == 0:
            print("Skipping %s, can't find read sets" % cell_set.name)
            continue

        # Clear out partial or outdated results before queueing a fresh run
        if os.path.exists(alignment_dir):
            shutil.rmtree(alignment_dir)

        bash_script_file_name = "%s.sh" % cell_set.name
        shutil.copy(BASH_SCRIPT_TEMPLATE, bash_script_file_name)

        cellranger_args = {
            "id": cell_set.name,
            "fastqs": os.path.join(transcriptome_dir, "reads"),
            "sample": ",".join(read_set_names),
            "transcriptome": os.path.join(database.DATA_PATH, "references", reference_name),
            "localcores": "32"
        }

        # Arguments with a value of None are emitted as bare flags
        cellranger_command = ["cellranger", "count"]

        for key, value in cellranger_args.items():
            if value is not None:
                cellranger_command.append("--%s=%s" % (key, value))
            else:
                cellranger_command.append("--%s" % key)

        # shlex.quote leaves plain names untouched and protects anything
        # containing spaces or shell metacharacters
        quoted_working_dir = shlex.quote(working_dir)
        quoted_alignment_dir = shlex.quote(alignment_dir)

        script_lines = [
            "mkdir -p %s" % quoted_working_dir,
            "pushd %s" % quoted_working_dir,
            " ".join(shlex.quote(argument) for argument in cellranger_command),
        ]

        for file_name in FILE_NAMES_TO_TRANSFER:
            output_path = os.path.join(working_dir, cell_set.name, "outs", file_name)
            script_lines.append("mv %s %s" % (shlex.quote(output_path), quoted_alignment_dir))

        script_lines.extend([
            "popd",
            "rm -r %s" % quoted_working_dir,
            "echo done",
        ])

        try:
            # Append the job-specific commands to the copied template
            with open(bash_script_file_name, "a") as bash_script_file:
                bash_script_file.write("\n".join(script_lines) + "\n")

            print("Queueing %s" % cell_set.name)

            try:
                # Argument list instead of a shell string: no quoting pitfalls
                submission = subprocess.run(["sbatch", bash_script_file_name])
            except OSError as error:
                # e.g. sbatch is not installed / not on PATH
                print("Failed to queue %s: %s" % (cell_set.name, error))
            else:
                if submission.returncode != 0:
                    print("Failed to queue %s, sbatch exited with code %i" % (cell_set.name, submission.returncode))
        finally:
            # The script is only needed for submission; never leave it behind
            os.remove(bash_script_file_name)