# Imports

In [1]:
import os
import time
from pathlib import Path

import yaml

# # loading initial data
# from oligo_designer_toolsuite.database import (
#     CustomGenomicRegionGenerator,
#     NcbiGenomicRegionGenerator,
#     EnsemblGenomicRegionGenerator,
#     OligoDatabase,
#     ReferenceDatabase,
# )

# # Property filters
# from oligo_designer_toolsuite.oligo_property_filter import (
#     MaskedSequences,
#     ConsecutiveRepeats,
#     GCContent,
#     PropertyFilter
# )   

# # Specificity filters
# from oligo_designer_toolsuite.oligo_specificity_filter import (
#     ExactMatches,
#     Blastn,
#     SpecificityFilter
# )

# # Readout probes
# from oligo_designer_toolsuite.sequence_design import (
#     SeqFishReadoutProbes
# )

In [2]:
from oligo_designer_toolsuite.pipelines import __seqFish_plus_probe_designer

# Paths

In [10]:
seqfish_config_path = "../../tutorials/configs/probe_design_SPOT_ncbi.yaml"
file_genes = "../../tutorials/data/genes/ncbi_10.txt"

In [11]:
with open(seqfish_config_path, "r") as handle:
        config = yaml.safe_load(handle)
        
dir_output = "output_seqfish"
Path(dir_output).mkdir(parents=True, exist_ok=True)

# Loading initial data

In [None]:
region_generator = NcbiGenomicRegionGenerator(
            taxon=config["taxon"],
            species=config["species"],
            annotation_release=config["annotation_release"],
            dir_output=dir_output,
        )

  csv_df = pd.read_csv(csv_file, sep="\t", names=self.GFF_HEADER, header=None)


In [5]:
file_transcriptome = region_generator.generate_transcript_reduced_representation(
        include_exon_junctions=True, exon_junction_size=2 * config["oligo_length_max"]
    )

index file output_seqfish/annotation/GCF_000001405.40_GRCh38.p14_genomic.fna.fai not found, generating...


In [8]:
oligo_database = OligoDatabase(
        file_fasta=file_transcriptome,
        min_oligos_per_region=config["min_oligos_per_gene"],
        files_source=region_generator.files_source,
        species=region_generator.species,
        annotation_release=region_generator.annotation_release,
        genome_assembly=region_generator.genome_assembly,
        n_jobs=config["n_jobs"],
        dir_output=dir_output,
    )

In [10]:
with open(file_genes) as handle:
    lines = handle.readlines()
    genes = [line.rstrip() for line in lines]

# generate the oligo sequences from gene transcripts
oligo_database.create_database(region_ids=genes,
                                oligo_length_min=config["oligo_length_min"],
                                oligo_length_max=config["oligo_length_max"])

In [13]:
oligo_database.write_database(filename='database_0.tsv')

'/Users/isra.mekki/Projects/odt/oligo-designer-toolsuite/tests/output_seqfish/oligo_database/database_0.tsv'

In [17]:
!wc -l /Users/isra.mekki/Projects/odt/oligo-designer-toolsuite/tests/output_seqfish/oligo_database/database_0.tsv

  202574 /Users/isra.mekki/Projects/odt/oligo-designer-toolsuite/tests/output_seqfish/oligo_database/database_0.tsv


# Pipeline:

## Property filters

In [31]:
masked_sequences = MaskedSequences()
gc_content = GCContent(GC_content_min=config["GC_content_min"], GC_content_max=config["GC_content_max"])
consecutive_repeats = ConsecutiveRepeats(num_consecutive = config["number_consecutive"])

filters = [masked_sequences, consecutive_repeats ,gc_content]
property_filter = PropertyFilter(filters=filters, write_regions_with_insufficient_oligos=config["write_removed_genes"])

oligo_database = property_filter.apply(oligo_database=oligo_database, n_jobs=config["n_jobs"])

file_database = oligo_database.write_database(filename="database_property_filter.tsv")


In [32]:
file_database

'/Users/isra.mekki/Projects/odt/oligo-designer-toolsuite/tests/output_seqfish/oligo_database/database_property_filter.tsv'

In [33]:
!wc -l /Users/isra.mekki/Projects/odt/oligo-designer-toolsuite/tests/output_seqfish/oligo_database/database_property_filter.tsv

   76114 /Users/isra.mekki/Projects/odt/oligo-designer-toolsuite/tests/output_seqfish/oligo_database/database_property_filter.tsv


## Specificity filters

In [44]:
dir_specificity = os.path.join(dir_output, "specificity_temporary") # folder where the temporary files will be written
reference = ReferenceDatabase(
    file_fasta = file_transcriptome,
    files_source = region_generator.files_source,
    species=region_generator.species,
    annotation_release=region_generator.annotation_release,
    genome_assembly=region_generator.genome_assembly,
    dir_output=dir_output
    )
reference.load_fasta_into_database()
exact_mathces = ExactMatches(dir_specificity=dir_specificity)
blastn = Blastn(
    dir_specificity=dir_specificity, 
    word_size=config["word_size"],
    percent_identity=config["percent_identity"],
    coverage=config["coverage"],
    strand=config["strand"],
)
filters = [exact_mathces,blastn]
specificity_filter = SpecificityFilter(filters=filters, write_regions_with_insufficient_oligos=config["write_removed_genes"])
oligo_database = specificity_filter.apply(oligo_database=oligo_database, reference_database=reference, n_jobs=config["n_jobs"])

file_database = oligo_database.write_database(filename="database_specificity_filter.tsv")



In [45]:
file_database

'/Users/isra.mekki/Projects/odt/oligo-designer-toolsuite/tests/output_seqfish/oligo_database/database_specificity_filter.tsv'

In [46]:
!wc -l /Users/isra.mekki/Projects/odt/oligo-designer-toolsuite/tests/output_seqfish/oligo_database/database_specificity_filter.tsv

    3665 /Users/isra.mekki/Projects/odt/oligo-designer-toolsuite/tests/output_seqfish/oligo_database/database_specificity_filter.tsv


## Readout probes

In [None]:
readouts_generator = SeqFISHReadoutProbeDesigner(
    self.config, blastn, self.reference, self.dir_output
)

self.readout_probes = readouts_generator.create_readout_probes()

In [54]:
import itertools
import random

def _get_barcode(region_idx, length=4, seed=0):
            """Get barcode sub sequence of padlock oligo for in situ sequencing

            For SCRINSHOT padlock oligos this could be constant, however it makes sense to have
            different barcodes so that the oligo set could also be used for ISS experiments.

            Arguments
            ---------
            region_idx: int
                Identifier for a given region. The identifier makes sure to return the same bar code
                for the different padlock oligos of a given region.
            length: int
                Length of barcode sequence
            seed: int
                Defines the random assignment of barcodes to each region_idx.

            Returns
            -------
            str: barcode sequence (5' to 3')

            """
            bases = ["A", "C", "T", "G"]

            barcodes = ["".join(nts) for nts in itertools.product(bases, repeat=length)]
            print(barcodes)
            random.seed(seed)
            random.shuffle(barcodes)

            if region_idx >= len(barcodes):
                raise ValueError(
                    "Barcode index exceeds number of possible combinations of barcodes. Increase barcode length?"
                )

            return barcodes[region_idx]

In [55]:
_get_barcode(0)

['AAAA', 'AAAC', 'AAAT', 'AAAG', 'AACA', 'AACC', 'AACT', 'AACG', 'AATA', 'AATC', 'AATT', 'AATG', 'AAGA', 'AAGC', 'AAGT', 'AAGG', 'ACAA', 'ACAC', 'ACAT', 'ACAG', 'ACCA', 'ACCC', 'ACCT', 'ACCG', 'ACTA', 'ACTC', 'ACTT', 'ACTG', 'ACGA', 'ACGC', 'ACGT', 'ACGG', 'ATAA', 'ATAC', 'ATAT', 'ATAG', 'ATCA', 'ATCC', 'ATCT', 'ATCG', 'ATTA', 'ATTC', 'ATTT', 'ATTG', 'ATGA', 'ATGC', 'ATGT', 'ATGG', 'AGAA', 'AGAC', 'AGAT', 'AGAG', 'AGCA', 'AGCC', 'AGCT', 'AGCG', 'AGTA', 'AGTC', 'AGTT', 'AGTG', 'AGGA', 'AGGC', 'AGGT', 'AGGG', 'CAAA', 'CAAC', 'CAAT', 'CAAG', 'CACA', 'CACC', 'CACT', 'CACG', 'CATA', 'CATC', 'CATT', 'CATG', 'CAGA', 'CAGC', 'CAGT', 'CAGG', 'CCAA', 'CCAC', 'CCAT', 'CCAG', 'CCCA', 'CCCC', 'CCCT', 'CCCG', 'CCTA', 'CCTC', 'CCTT', 'CCTG', 'CCGA', 'CCGC', 'CCGT', 'CCGG', 'CTAA', 'CTAC', 'CTAT', 'CTAG', 'CTCA', 'CTCC', 'CTCT', 'CTCG', 'CTTA', 'CTTC', 'CTTT', 'CTTG', 'CTGA', 'CTGC', 'CTGT', 'CTGG', 'CGAA', 'CGAC', 'CGAT', 'CGAG', 'CGCA', 'CGCC', 'CGCT', 'CGCG', 'CGTA', 'CGTC', 'CGTT', 'CGTG', 'CGGA',

'AAAC'