# Imports

In [1]:
import os
import sys
sys.path.append(os.path.dirname(os.getcwd()))
import yaml
from Bio.SeqUtils import MeltingTemp as mt

## Define the parameters


Once the configuration file has been set up we have to read its content:

In [2]:
# Configuration used for this run; swap in the commented-out path below to use
# the "custom" configuration instead of the NCBI one.
config_file = "./SPOT/configs/probe_design_SPOT_ncbi.yaml"
# config_file = "./SPOT/configs/probe_design_SPOT_custom.yaml"

# Parse the YAML configuration into a plain Python object.
with open(config_file, 'r') as config_handle:
    config = yaml.safe_load(config_handle)

# The output directory from the config is resolved against the parent of the
# current working directory to obtain the complete output path.
parent_dir = os.path.dirname(os.getcwd())
dir_output = os.path.join(parent_dir, config["dir_output"])

## Oligo sequences generation

Now we can start building the pipeline. We begin by generating, for the genes defined in the config file, all possible oligos whose length lies between the given minimum and maximum values. The oligos are saved in a nested dictionary with the following structure: 

[gene][oligo_id][oligo_feature].


In [3]:
from oligo_designer_toolsuite.database import CustomOligoDB, NcbiOligoDB, EnsemblOligoDB

# Validate the annotation source first so an unsupported value is reported as
# such before any other config key is read.
source = config["source"]
if source not in ("ncbi", "ensembl", "custom"):
    raise ValueError("Annotation source not supported!")

# Arguments shared by all three database classes; defining them once keeps the
# branches below from drifting apart.
common_db_kwargs = dict(
    oligo_length_min=config["oligo_length_min"],
    oligo_length_max=config["oligo_length_max"],
    species=config["species"],
    annotation_release=config["annotation_release"],
    n_jobs=config["n_jobs"],
    dir_output=dir_output,
    min_oligos_per_gene=config["min_oligos_per_gene"],
)

# define the database class
if source == "ncbi":
    # download the fasta files from the NCBI server
    oligo_database = NcbiOligoDB(**common_db_kwargs)
elif source == "ensembl":
    # Ensembl-backed database (takes the same arguments as the NCBI one)
    oligo_database = EnsemblOligoDB(**common_db_kwargs)
else:
    # "custom": use already downloaded files described in the config
    oligo_database = CustomOligoDB(
        genome_assembly=config["genome_assembly"],
        files_source=config["files_source"],
        annotation_file=config["annotation_file"],
        sequence_file=config["sequence_file"],
        **common_db_kwargs,
    )

# Read the genes file: one gene name per line. Trailing whitespace is removed
# and blank lines are skipped so that no empty gene name reaches the database.
with open(config["file_genes"]) as handle:
    genes = [name for name in (line.rstrip() for line in handle) if name]

# generate the oligo sequences from gene transcripts
oligo_database.create_oligos_DB(genes=genes, region='transcripts')

### Dictionary structure


In [5]:
# Illustrate the nested layout on a single entry: take the first gene and its
# first oligo and print them with the same [gene][oligo_id][feature] nesting.
gene = list(oligo_database.oligos_DB)[0]
oligo_id = list(oligo_database.oligos_DB[gene])[0]

sample_oligos_DB = {gene: {oligo_id: oligo_database.oligos_DB[gene][oligo_id]}}
print(sample_oligos_DB)

{'LMBRD1': {'LMBRD1_1': {'sequence': Seq('GTGGGGCGGCAGCGCTGGATCTTTGC'), 'transcript_id': ['NM_001367272.1', 'NM_001363722.2', 'NM_001367271.1', 'NM_001367272.1'], 'exon_id': ['NM_001367272.1_exon2_NM_001367272.1_exon1', 'NM_001363722.2_exon1', 'NM_001367271.1_exon1', 'NM_001367272.1_exon1'], 'chromosome': '6', 'start': [69790443, 69796313, 69796576, 69796580], 'end': [69790469, 69796339, 69796602, 69796606], 'strand': '-', 'length': 26}}}


In [6]:
# Display the genes currently present as top-level keys of the oligo database.
oligo_database.oligos_DB.keys()

dict_keys(['LMBRD1', 'IKZF1'])

In [7]:
def number_of_probes(oligo_database):
    """Print a dictionary mapping each gene to its number of oligos.

    Args:
        oligo_database: object exposing an ``oligos_DB`` mapping whose keys
            are genes and whose values are the per-gene oligo collections.

    Returns:
        None. The counts are only printed, not returned.
    """
    counts = {
        gene: len(oligos)
        for gene, oligos in oligo_database.oligos_DB.items()
    }
    print(counts)

### Read and write



In [8]:
# Optionally write the freshly created oligos to the "oligos_creation"
# directory, then report how many oligos each gene has.
if config["write_intermediate_steps"]:
    oligo_database.write_oligos_DB(
        format=config["file_format"],
        dir_oligos_DB="oligos_creation",
    )
number_of_probes(oligo_database)

{'LMBRD1': 21205, 'IKZF1': 116350}


## Property filters

Once all the possible sequences are created, we apply a first filtering step based on sequence properties (e.g. melting temperature or GC content). This reduces the number of sequences we have to deal with in the next stages and discards all sequences that are not suited for the experiment.

In [9]:
from oligo_designer_toolsuite.oligo_property_filter import (
    PropertyFilter,
    MaskedSequences,
    GCContent, 
    ProhibitedSequences
)

# Build the individual property filters from the config values.
masked_sequences = MaskedSequences()
gc_content = GCContent(
    GC_content_min=config["GC_content_min"],
    GC_content_max=config["GC_content_max"],
)
proh_seq = ProhibitedSequences(num_consecutive=config["number_consecutive"])

# Wrap the filters in the property filter class. They are handed over in this
# order: masked sequences, prohibited sequences, GC content.
property_filter = PropertyFilter(
    filters=[masked_sequences, proh_seq, gc_content],
    write_genes_with_insufficient_oligos=config["write_removed_genes"],
)

# Filter the database; the name `oligo_database` is rebound to the result.
oligo_database = property_filter.apply(
    oligo_database=oligo_database,
    n_jobs=config["n_jobs"],
)

# Check for genes left with an insufficient number of probes.
oligo_database.remove_genes_with_insufficient_oligos(
    pipeline_step="after applying property filters"
)

# Optionally write the intermediate result to a file.
if config["write_intermediate_steps"]:
    oligo_database.write_oligos_DB(
        format=config["file_format"],
        dir_oligos_DB="property_filter",
    )

In [10]:
# Per-gene oligo counts after the property filters have been applied.
number_of_probes(oligo_database)

{'LMBRD1': 4626, 'IKZF1': 48840}


## Specificity filters


In [11]:
from oligo_designer_toolsuite.database import CustomReferenceDB, NcbiReferenceDB, EnsemblReferenceDB
from oligo_designer_toolsuite.oligo_specificity_filter import (
    SpecificityFilter,
    ExactMatches,
    Blastn,
)

# Folder where the temporary files of the specificity filters will be written.
dir_specificity = os.path.join(dir_output, "specificity_temporary")

# Build the reference database from the same species / assembly / annotation
# and the same annotation and sequence files that the oligo database carries.
reference_database = CustomReferenceDB(
    species=oligo_database.species,
    genome_assembly=oligo_database.genome_assembly,
    annotation_release=oligo_database.annotation_release,
    files_source=oligo_database.files_source,
    annotation_file=oligo_database.annotation_file,
    sequence_file=oligo_database.sequence_file,
    dir_output=dir_output,
)
# The block size is taken from the config.
reference_database.create_reference_DB(block_size=config["block_size"])

# Initialize the filter classes. `blastn` is kept as its own name because a
# later cell reuses it for the readout probes.
exact_matches = ExactMatches(dir_specificity=dir_specificity)
blastn = Blastn(
    dir_specificity=dir_specificity,
    word_size=config["word_size"],
    percent_identity=config["percent_identity"],
    coverage=config["coverage"],
)

# Initialize the specificity filter class: exact matches first, then Blastn.
specificity_filter = SpecificityFilter(
    filters=[exact_matches, blastn],
    write_genes_with_insufficient_oligos=config["write_removed_genes"],
)

# Filter the database; the name `oligo_database` is rebound to the result.
oligo_database = specificity_filter.apply(
    oligo_database=oligo_database,
    reference_database=reference_database,
    n_jobs=config["n_jobs"],
)

# Check for genes left with an insufficient number of probes.
oligo_database.remove_genes_with_insufficient_oligos(
    pipeline_step="after applying specificity filters"
)

# Optionally write the intermediate result.
if config["write_intermediate_steps"]:
    oligo_database.write_oligos_DB(
        format=config["file_format"],
        dir_oligos_DB="specificity_filter",
    )

KeyboardInterrupt: 

In [None]:
# Per-gene oligo counts at this point of the pipeline (after the specificity filter cell).
number_of_probes(oligo_database)

{'CYBA': 748, 'MVD': 663, 'GALNS': 2558, 'ZNF778': 4615, 'CPNE7': 1301}


## Oligoset generation


In [12]:
from oligo_designer_toolsuite.oligo_efficiency import(
    PadlockOligoScoring,
    PadlockSetScoring,
    SeqFISHOligoScoring,
    AverageSetScoring
)
from oligo_designer_toolsuite.oligo_selection import OligosetGenerator, padlock_heuristic_selection

# Initialize the scoring classes.
# Single oligos are scored with SeqFISHOligoScoring, configured with the
# GC-content settings only. PadlockOligoScoring is the alternative; it takes
# the same GC arguments plus Tm_min / Tm_opt / Tm_max / Tm_weight from the
# config.
oligos_scoring = SeqFISHOligoScoring(
    GC_content_min=config["GC_content_min"],
    GC_content_opt=config["GC_content_opt"],
    GC_content_max=config["GC_content_max"],
    GC_weight=config["GC_weight"],
)
set_scoring = AverageSetScoring()

# Initialize the oligoset generator class.
# NOTE(review): the keyword `heurustic_selection` is spelled exactly as in the
# original call; presumably it is the library's own parameter name -- verify
# before renaming.
oligoset_generator = OligosetGenerator(
    oligoset_size=config["oligoset_size"],
    min_oligoset_size=config["min_oligoset_size"],
    oligos_scoring=oligos_scoring,
    set_scoring=set_scoring,
    heurustic_selection=padlock_heuristic_selection,
    write_genes_with_insufficient_oligos=config["write_removed_genes"],
)

# Generate the oligosets; the name `oligo_database` is rebound to the result.
oligo_database = oligoset_generator.apply(
    oligo_database=oligo_database,
    n_sets=config["n_sets"],
    n_jobs=config["n_jobs"],
)

# Optionally write the intermediate result.
if config["write_intermediate_steps"]:
    oligo_database.write_oligosets(dir_oligosets="oligosets")



In [17]:
# Inspect the oligosets stored on the database object.
oligo_database.oligosets

{}

## Total probe generator

In [12]:
# create readout probes
from oligo_designer_toolsuite.readout_probes.readout_probes_generator import ReadoutProbes
readout_generator = ReadoutProbes(length=15,  number_probes = 60, GC_min=45, GC_max=55, blast_filter = blastn, reference_DB=reference_database)
readout_probes = readout_generator.create_probes()

In [13]:
# Show the first generated readout probe.
readout_probes[0]

Seq('GCAACGTAATCGTGG')

In [14]:
from oligo_designer_toolsuite.readout_probes.barcoding_creation import BarcodingCreator

# Create one barcode per gene still present in the oligo database. The first
# positional argument (20) is hard-coded.
# NOTE(review): its meaning is not visible here -- confirm against BarcodingCreator.
remaining_genes = list(oligo_database.oligos_DB)
barcodes_creator = BarcodingCreator(20, remaining_genes)
barcodes_for_genes = barcodes_creator.create_barcodes()

In [15]:
# Display the barcode assigned to each gene.
barcodes_for_genes

{'CYBA': [7, 9, 13, 9],
 'MVD': [4, 2, 3, 15],
 'GALNS': [15, 19, 19, 5],
 'ZNF778': [1, 11, 7, 3],
 'CPNE7': [8, 1, 0, 13]}

In [16]:
# generate barcodes
from oligo_designer_toolsuite.experiment_specific.seqFISH_probes_designer import SeqfishProbesCreator
probes_creator = SeqfishProbesCreator()
oligo_database.oligos_DB = probes_creator.create_probes(oligo_database.oligos_DB, readout_probes, barcodes_for_genes)

In [17]:
# Inspect one assembled probe entry; the gene and oligo id are hard-coded, so
# this lookup only works when that oligo survived the filters of this run.
oligo_database.oligos_DB['CYBA']['CYBA_10024']

{'sequence': Seq('CCATGCACTAGGCTTTCTCCCACGCTTGCC'),
 'transcript_id': ['XM_011522905.4'],
 'exon_id': ['XM_011522905.4_exon6'],
 'chromosome': '16',
 'start': [88643703],
 'end': [88643733],
 'strand': '-',
 'length': 30,
 'melting_temperature': 66.3320282696755,
 'GC_content': 60.0,
 'sequence_with_barcodes': Seq('CTATAGAGGGTCTACAGGTAATACGCACCGCCATGCACTAGGCTTTCTCCCACG...CCG')}

In [18]:
# Scratch expression: a string of six 'T' characters. The value is not
# assigned to any name.
'T'*6

'TTTTTT'

## TODO: add cross-hybridization check here