# Imports

In [7]:
import os
import sys

sys.path.append(os.path.dirname(os.getcwd()))
import yaml
from Bio.SeqUtils import MeltingTemp as mt

## Define the parameters


Once the configuration file has been set up we have to read its content:

In [8]:
# Location of the YAML configuration, relative to the current working directory.
config_file = "./configs/probe_design_SPOT_custom.yaml"
# config_file = "./SPOT/configs/probe_design_SPOT_custom.yaml"
with open(config_file, mode="r") as yaml_file:
    config = yaml.safe_load(yaml_file)

# Complete path of the output directory: config["dir_output"] joined onto the
# parent of the current working directory.
parent_dir = os.path.dirname(os.getcwd())
dir_output = os.path.join(parent_dir, config["dir_output"])

## Oligo sequences generation

Now we can start building the pipeline. We begin by generating all possible oligos, with lengths between the given minimum and maximum values, for the genes defined in the config file. The oligos are saved in a nested dictionary with the following structure:

[gene][oligo_id][oligo_feature].


In [9]:
from oligo_designer_toolsuite.database import CustomGenomicRegionGenerator, NcbiGenomicRegionGenerator, EnsemblGenomicRegionGenerator

# Build the genomic region generator selected by config["source"].
#
# Every branch binds the SAME variable, `region_generator_custom`, because the
# transcriptome generation below and the later cells (OligoDatabase,
# ReferenceDatabase) read from that name. Previously the "ncbi" and "ensembl"
# branches bound `region_generator` instead, so any non-custom source failed
# with a NameError on the first use of `region_generator_custom`.
# NOTE(review): later cells read `files_source`, `species`, `annotation_release`
# and `genome_assembly` from this object — confirm the NCBI and Ensembl
# generators expose all four attributes.
if config["source"] == "custom":
    # Annotation and sequence files are supplied by the user.
    region_generator_custom = CustomGenomicRegionGenerator(
        annotation_file=config["file_annotation"],
        sequence_file=config["file_sequence"],
        files_source=config["files_source"],
        species=config["species"],
        annotation_release=config["annotation_release"],
        genome_assembly=config["genome_assembly"],
        dir_output=dir_output,
    )
elif config["source"] == "ncbi":
    region_generator_custom = NcbiGenomicRegionGenerator(
        taxon=config["taxon"],
        species=config["species"],
        annotation_release=config["annotation_release"],
        dir_output=dir_output,
    )
elif config["source"] == "ensembl":
    region_generator_custom = EnsemblGenomicRegionGenerator(
        species=config["species"],
        annotation_release=config["annotation_release"],
        dir_output=dir_output,
    )
else:
    # Fail early with a clear message instead of a NameError further down.
    raise ValueError(
        f"Unknown source {config['source']!r}: expected 'custom', 'ncbi' or 'ensembl'."
    )

# Alias kept for code that refers to the generator by its shorter name.
region_generator = region_generator_custom

# Reduced transcript representation including exon junctions; the junction size
# is twice the maximum oligo length taken from the config.
file_transcriptome = region_generator_custom.generate_transcript_reduced_representation(
    include_exon_junctions=True,
    exon_junction_size=2 * config["oligo_length_max"],
)

index file ./data/annotations/custom_GCF_000001405.40_GRCh38.p14_genomic_chr16.fna.fai not found, generating...


In [10]:
# `warnings` is used below when no gene file is configured; it is imported in
# this cell so that branch no longer fails with a NameError.
import warnings

from oligo_designer_toolsuite.database import OligoDatabase

# define the database class
# NOTE(review): n_jobs is hard-coded to 2 here, while the filter cells use
# config["n_jobs"] — confirm whether this is intentional.
oligo_database = OligoDatabase(
    file_fasta = file_transcriptome,
    oligo_length_min = config["oligo_length_min"],
    oligo_length_max = config["oligo_length_max"],
    min_oligos_per_region = config["min_oligos_per_gene"],
    files_source = region_generator_custom.files_source,
    species = region_generator_custom.species,
    annotation_release = region_generator_custom.annotation_release,
    genome_assembly = region_generator_custom.genome_assembly,
    n_jobs = 2,
    dir_output=dir_output
)

# read the genes file
if config["file_genes"] is None:
    warnings.warn(
        "No file containing the genes was provided, all the genes are ussed to generate the probes. This chioce can use a lot of resources."
    )
    # `None` is passed on as `region_ids` when no gene file is configured.
    genes = None
else:
    with open(config["file_genes"]) as handle:
        # One gene identifier per line. Blank lines (for example a trailing
        # newline at the end of the file) are skipped so that no empty
        # identifier is requested.
        genes = [line.rstrip() for line in handle if line.strip()]

# generate the oligo sequences from gene transcripts
oligo_database.create_database(region_ids=genes)

# alternative: load database from file
# oligo_database.load_oligo_database(file_database)

### Dictionary structure


In [11]:
# Pick the first gene of the database and the first oligo stored for that gene.
gene_ids = list(oligo_database.database.keys())
gene = gene_ids[0]
oligo_ids = list(oligo_database.database[gene].keys())
oligo_id = oligo_ids[0]

# Rebuild the [gene][oligo_id] nesting around that single entry and print it
# to illustrate the structure of the database.
sample_oligos_DB = {gene: {oligo_id: oligo_database.database[gene][oligo_id]}}
print(sample_oligos_DB)

{'AARS1': {'AARS1_16:70265636-70265662(-)': {'sequence': Seq('GAGACACTGCTTGGCTCCTCTATGAC'), 'chromosome': '16', 'start': [70265636], 'end': [70265662], 'strand': '-', 'length': 26, 'additional_information_fasta': ['transcript_id=NM_001605.3,exon_number=10;transcript_id=XM_047433666.1,exon_number=10']}}}


In [12]:
# Display the top-level keys (gene identifiers) currently stored in the database.
oligo_database.database.keys()

dict_keys(['AARS1', 'DECR2', 'FAM234A', 'RHBDF1', 'WASIR2'])

In [13]:
def number_of_probes(oligo_database):
    """Print how many entries each key of ``oligo_database.database`` holds.

    :param oligo_database: Object exposing a ``database`` mapping whose values support ``len()``.
    :return: None. The per-key counts are printed as a dictionary.
    """
    counts = {
        region: len(oligo_database.database[region])
        for region in oligo_database.database.keys()
    }
    print(counts)

### Read and write



In [14]:
# Write the freshly created database to file only when intermediate steps are
# requested; `file_database` is bound only in that case.
if config["write_intermediate_steps"]:
    file_database = oligo_database.write_database(filename="oligo_database_initial.txt")
# Report the number of oligos per gene before any filtering.
number_of_probes(oligo_database)

{'AARS1': 17175, 'DECR2': 7620, 'FAM234A': 26900, 'RHBDF1': 20650, 'WASIR2': 5060}


## Property filters

Once all the possible sequences have been created, we apply a first filtering step based on sequence properties (e.g. melting temperature or GC content). This reduces the number of sequences we have to deal with in the next stages and discards all sequences that are not suited for the experiment.

In [15]:
from oligo_designer_toolsuite.oligo_property_filter import GCContent, MaskedSequences, ProhibitedSequences, PropertyFilter

# Instantiate the individual property filters from the config values.
masked_sequences = MaskedSequences()
gc_content = GCContent(
    GC_content_min=config["GC_content_min"],
    GC_content_max=config["GC_content_max"],
)
proh_seq = ProhibitedSequences(num_consecutive=config["number_consecutive"])

# Collect the filters in the list handed to the property filter.
filters = [masked_sequences, proh_seq, gc_content]

# Wrap the filters in the property filter class.
property_filter = PropertyFilter(
    filters=filters,
    write_regions_with_insufficient_oligos=config["write_removed_genes"],
)

# Filter the database. `oligo_database` is rebound to the result, so re-running
# this cell filters the already filtered database again.
oligo_database = property_filter.apply(
    oligo_database=oligo_database,
    n_jobs=config["n_jobs"],
)

# check for insufficient number of probes
#oligo_database.remove_genes_with_insufficient_oligos(pipeline_step = "after applying property filters")

# Write the intermediate result to a file when requested.
if config["write_intermediate_steps"]:
    file_database = oligo_database.write_database(filename="oligo_database_property_filter.txt")

In [16]:
# Report the number of oligos left per gene after the property filters.
number_of_probes(oligo_database)

{'AARS1': 11606, 'DECR2': 2777, 'FAM234A': 13878, 'RHBDF1': 10462, 'WASIR2': 2032}


## Specificity filters


In [17]:
from oligo_designer_toolsuite.database import ReferenceDatabase
from oligo_designer_toolsuite.oligo_specificity_filter import Blastn, ExactMatches, SpecificityFilter

# Folder where the temporary files will be written.
dir_specificity = os.path.join(dir_output, "specificity_temporary")

# Reference database built from the transcriptome file generated earlier,
# carrying the same metadata as the region generator.
reference = ReferenceDatabase(
    file_fasta=file_transcriptome,
    files_source=region_generator_custom.files_source,
    species=region_generator_custom.species,
    annotation_release=region_generator_custom.annotation_release,
    genome_assembly=region_generator_custom.genome_assembly,
    dir_output=dir_output,
)

# Initialize the filter classes.
exact_mathces = ExactMatches(dir_specificity=dir_specificity)
blastn = Blastn(
    dir_specificity=dir_specificity,
    word_size=config["word_size"],
    percent_identity=config["percent_identity"],
    coverage=config["coverage"],
    strand=config["strand"],  # alternative: strand='plus'
)
filters = [exact_mathces, blastn]

# Initialize the specificity filter class.
specificity_filter = SpecificityFilter(
    filters=filters,
    write_regions_with_insufficient_oligos=config["write_removed_genes"],
)

# Filter the database against the reference; `oligo_database` is rebound to the result.
oligo_database = specificity_filter.apply(
    oligo_database=oligo_database,
    reference_database=reference,
    n_jobs=config["n_jobs"],
)

# Write the intermediate result when requested.
if config["write_intermediate_steps"]:
    file_database = oligo_database.write_database(filename="oligo_database_specificity_filter.txt")

In [18]:
# Report the number of oligos left per gene after the specificity filters.
number_of_probes(oligo_database)

{'AARS1': 1976, 'DECR2': 540, 'FAM234A': 1802, 'RHBDF1': 1339, 'WASIR2': 104}


## Total probe generator

In [19]:
# create readout probes
from oligo_designer_toolsuite.sequence_design.readout_probes_generator import ReadoutProbes

# Create the readout probes. The generator reuses the `blastn` filter and the
# `reference` database defined in the specificity filter cell.
# NOTE(review): the numeric settings are hard-coded here rather than read from
# the config file — confirm this is intended.
readout_generator = ReadoutProbes(
    length=15,
    number_probes=60,
    GC_min=45,
    GC_max=55,
    number_consecutive=5,
    random_seed=0,
    blast_filter=blastn,
    reference_DB=reference,
)
readout_probes = readout_generator.create_probes()

In [20]:
from oligo_designer_toolsuite.sequence_design.barcoding_creation import BarcodingCreator

# Create one barcode per gene currently stored in the database.
# NOTE(review): the meaning of the first positional argument (20) is not
# visible here — presumably the number of distinct barcode values; confirm
# against BarcodingCreator.
gene_ids = list(oligo_database.database.keys())
barcodes_creator = BarcodingCreator(20, gene_ids)
barcodes_for_genes = barcodes_creator.create_barcodes()

In [21]:
# Display the barcode created for each gene.
barcodes_for_genes

{'AARS1': [7, 9, 13, 9],
 'DECR2': [4, 2, 3, 15],
 'FAM234A': [15, 19, 19, 5],
 'RHBDF1': [1, 11, 7, 3],
 'WASIR2': [8, 1, 0, 13]}

In [22]:
# generate barcodes
# Build the probes: create_probes receives the current database dictionary, the
# readout probes and the per-gene barcodes, and its result replaces
# oligo_database.database in place.
from oligo_designer_toolsuite.sequence_design.seqFISH_probes_designer import SeqfishProbesCreator
probes_creator = SeqfishProbesCreator()
oligo_database.database = probes_creator.create_probes(oligo_database.database, readout_probes, barcodes_for_genes)

In [23]:
# Report the number of entries per gene after the probes have been created.
number_of_probes(oligo_database)

{'AARS1': 1976, 'DECR2': 540, 'FAM234A': 1802, 'RHBDF1': 1339, 'WASIR2': 104}


## Cross-Hybridization check

In [25]:
# Write the sequences of the current database to a FASTA file named 'fasta_from_our_db'.
oligo_database.write_fasta_from_database(filename = 'fasta_from_our_db')
# Use that FASTA file as the reference for the cross-hybridization check.
# NOTE(review): the path below is hard-coded relative to the working directory
# and does not use `dir_output`; if the output directory in the config changes,
# this may point to a missing or stale file. If write_fasta_from_database
# returns the written path, prefer passing that value — TODO confirm.
ref_db = ReferenceDatabase(file_fasta ='./output_SPOT_custom/oligo_database/fasta_from_our_db.fna')



In [26]:
# Re-create the specificity filters for the cross-hybridization check. Compared
# with the earlier specificity step, Blastn uses the fixed strand "minus" and
# the database is filtered against `ref_db`.
# Note that `exact_mathces`, `blastn`, `filters` and `specificity_filter` are
# rebound here and shadow the objects created in the specificity filter cell.
exact_mathces = ExactMatches(dir_specificity=dir_specificity)
blastn = Blastn(
    dir_specificity=dir_specificity,
    word_size=config["word_size"],
    percent_identity=config["percent_identity"],
    coverage=config["coverage"],
    strand="minus",
)
filters = [exact_mathces, blastn]

# Initialize the specificity filter class.
specificity_filter = SpecificityFilter(
    filters=filters,
    write_regions_with_insufficient_oligos=config["write_removed_genes"],
)

# Filter the database; `oligo_database` is rebound to the result.
oligo_database = specificity_filter.apply(
    oligo_database=oligo_database,
    reference_database=ref_db,
    n_jobs=config["n_jobs"],
)


In [27]:
# Report the number of entries per gene after the cross-hybridization check.
number_of_probes(oligo_database)

{'AARS1': 1976, 'DECR2': 540, 'FAM234A': 1802, 'RHBDF1': 1339, 'WASIR2': 104}


## Oligoset generation

In [28]:
# Score the remaining oligos, assemble them into oligosets and optionally write the sets.
from oligo_designer_toolsuite.oligo_efficiency import(
    SeqFISHOligoScoring,
    AverageSetScoring
)
from oligo_designer_toolsuite.oligo_selection import OligosetGenerator, padlock_heuristic_selection

# initialize the scoring classes
# The triple-quoted string below is a bare string expression, i.e. disabled
# code: it is evaluated and discarded, so PadlockOligoScoring is never used.
"""oligos_scoring = PadlockOligoScoring(
    Tm_min=config["Tm_min"],
    Tm_opt=config["Tm_opt"],
    Tm_max=config["Tm_max"],
    GC_content_min=config["GC_content_min"],
    GC_content_opt=config["GC_content_opt"],
    GC_content_max=config["GC_content_max"],
    Tm_weight=config["Tm_weight"],
    GC_weight=config["GC_weight"],
)
"""


# Oligo scoring configured only with the GC-content values from the config.
oligos_scoring = SeqFISHOligoScoring(
    GC_content_min=config["GC_content_min"],
    GC_content_opt=config["GC_content_opt"],
    GC_content_max=config["GC_content_max"],
    GC_weight=config["GC_weight"],)
set_scoring = AverageSetScoring()

# initialize the oligoset generator class
# NOTE(review): the keyword `heurustic_selection` is spelled as in the original
# call — presumably it matches the OligosetGenerator signature; verify before renaming.
oligoset_generator = OligosetGenerator(
    oligoset_size=config["oligoset_size"], 
    min_oligoset_size=config["min_oligoset_size"],
    oligos_scoring=oligos_scoring,
    set_scoring=set_scoring,
    heurustic_selection=padlock_heuristic_selection,
    write_regions_with_insufficient_oligos=config["write_removed_genes"]
)

# generate the oligoset
# `oligo_database` is rebound to the result of `apply`.
oligo_database = oligoset_generator.apply(oligo_database=oligo_database, n_sets=config["n_sets"], n_jobs=config["n_jobs"])
# write the intermediate result
if config["write_intermediate_steps"]:
    oligo_database.write_oligosets(dir_oligosets="oligosets")



KeyboardInterrupt: 