# Test Specificity Filters

In [1]:
import os
from pathlib import Path
from Bio.SeqUtils import MeltingTemp as mt

from oligo_designer_toolsuite.sequence_generator import CustomGenomicRegionGenerator, OligoSequenceGenerator
from oligo_designer_toolsuite.database import OligoDatabase, ReferenceDatabase
from oligo_designer_toolsuite.oligo_specificity_filter import BlastNFilter, BlastNSeedregionFilter, BlastNSeedregionLigationsiteFilter, BowtieFilter, Bowtie2Filter, CrossHybridizationFilter, RemoveByLargerRegionPolicy, RemoveByDegreePolicy
from oligo_designer_toolsuite.oligo_property_filter import PropertyFilter, PadlockArmsFilter


In [2]:
# Parameter definition
# Melting-temperature settings; handed to PadlockArmsFilter further down.
Tm_parameters = {
    "nn_table": mt.DNA_NN3,
    "tmm_table": mt.DNA_TMM1,
    "imm_table": mt.DNA_IMM1,
    "de_table": mt.DNA_DE1,
    "dnac1": 50,  # [nM]
    "dnac2": 0,
    "selfcomp": False,
    "saltcorr": 7,
    "Na": 50,  # [mM]
    "K": 75,  # [mM]
    "Tris": 20,  # [mM]
    "Mg": 10,  # [mM]
    "dNTPs": 0,
}

# Input files (custom chr16 subset) and the regions the tests are restricted to.
annotation_file = "../../data/annotations/custom_GCF_000001405.40_GRCh38.p14_genomic_chr16.gtf"
sequence_file = "../../data/annotations/custom_GCF_000001405.40_GRCh38.p14_genomic_chr16.fna"
region_ids = [
    "AARS1",
    "DECR2",
    "FAM234A",
    "RHBDF1",
    "WASIR2",
]

# Output directory: <parent of the current working directory>/notebooks/output,
# created together with any missing parents.
dir_output = os.path.join(os.path.dirname(os.getcwd()), 'notebooks/output')
os.makedirs(dir_output, exist_ok=True)

In [3]:
# Region generator built from the custom annotation (GTF) and sequence (FASTA) files.
region_generator_ncbi = CustomGenomicRegionGenerator(
    annotation_file,
    sequence_file,
    files_source="NCBI",
    species="Homo_sapiens",
    annotation_release="110",
    genome_assembly="GRCh38",
)
# The returned value is used below as the FASTA input for oligo generation
# and as the exon reference.
file_reference = region_generator_ncbi.get_sequence_exon()

# Metadata copied from the generator so the databases carry the same provenance.
metadata_ncbi = dict(
    annotation_source=region_generator_ncbi.files_source,
    species=region_generator_ncbi.species,
    annotation_release=region_generator_ncbi.annotation_release,
    genome_assembly=region_generator_ncbi.genome_assembly,
)

In [4]:
# Slide a window over the exon reference to produce the candidate oligo sequences.
oligo_sequence_generator = OligoSequenceGenerator()
file_fasta_oligos = oligo_sequence_generator.create_sequences_sliding_window(
    file_fasta_in=file_reference,
    length_interval_sequences=(30, 31),
    filename_out="sliding_window_sequences",
)

In [5]:
def get_oligo_database(metadata_ncbi, file_fasta_oligos, region_ids):
    """Build a freshly loaded OligoDatabase for one filter test.

    Each test cell calls this helper so that it starts from a newly loaded
    database instead of one already modified by a previous filter.

    Args:
        metadata_ncbi: Metadata passed to ``OligoDatabase.load_metadata``.
        file_fasta_oligos: FASTA file holding the oligo sequences to load.
        region_ids: Regions to load from the FASTA file.

    Returns:
        The loaded ``OligoDatabase`` instance.
    """
    database = OligoDatabase(
        min_oligos_per_region=2,
        write_regions_with_insufficient_oligos=True,
    )
    database.load_metadata(metadata_ncbi)

    load_options = {
        "file_fasta_in": file_fasta_oligos,
        "sequence_type": "target",
        "region_ids": region_ids,
        "database_overwrite": True,
    }
    database.load_sequences_from_fasta(**load_options)
    return database

In [6]:
# Reference database holding the exon sequences; used by the BlastN and Bowtie tests.
reference_exon = ReferenceDatabase()
reference_exon.load_metadata(metadata=metadata_ncbi)
# NOTE(review): the method name is spelled "fom" here; confirm it matches the
# installed ReferenceDatabase API before renaming.
reference_exon.load_sequences_fom_fasta(
    file_fasta=file_reference,
    database_overwrite=True,
)



In [7]:
# Build a reference database out of the oligos themselves: dump the oligo
# database to a temporary FASTA file, load that file as a reference, and
# restrict it to the regions under test.
oligos = get_oligo_database(metadata_ncbi, file_fasta_oligos, region_ids)
file_fasta = oligos.write_database_to_fasta("reference_db_tmp", None, "oligo")

reference_oligos = ReferenceDatabase()
try:
    reference_oligos.load_metadata(metadata=oligos.metadata)
    # NOTE(review): the method name is spelled "fom" here; confirm it matches
    # the installed ReferenceDatabase API before renaming.
    reference_oligos.load_sequences_fom_fasta(file_fasta=file_fasta, database_overwrite=True)
    reference_oligos.filter_database(region_ids)
finally:
    # Remove the temporary FASTA file even if loading or filtering fails, so
    # an aborted run does not leave it behind.
    os.remove(file_fasta)



## Test Blast Filters

In [8]:
# Start from a freshly loaded oligo database.
oligos = get_oligo_database(metadata_ncbi, file_fasta_oligos, region_ids)

sequence_type = "oligo"
blast_search_parameters = {
    "perc_identity": 80,
    "strand": "minus",
    "word_size": 10,
}
blast_hit_parameters = {"coverage": 80}
blast_filter = BlastNFilter(
    blast_search_parameters,
    blast_hit_parameters,
    dir_output=dir_output,
)

# Search the oligos against the oligo-derived reference and preview the first
# five reported pairs. The third positional argument (2) is presumably the
# number of parallel jobs; confirm against the filter API.
hits = blast_filter.get_oligo_pair_hits(sequence_type, oligos, 2, reference_oligos)
hits[:5]

[('AARS1::3995', 'RHBDF1::2076'),
 ('AARS1::3995', 'RHBDF1::2075'),
 ('AARS1::3995', 'RHBDF1::2074'),
 ('AARS1::3995', 'RHBDF1::2073'),
 ('AARS1::3995', 'RHBDF1::2072')]

In [9]:
# Start from a freshly loaded oligo database.
oligos = get_oligo_database(metadata_ncbi, file_fasta_oligos, region_ids)

sequence_type = "target"
blast_search_parameters = {
    "perc_identity": 80,
    "strand": "plus",
    "word_size": 10,
}
blast_hit_parameters = {
    "coverage": 50,
    "min_alignment_length": 25,
}
blast_filter = BlastNFilter(
    blast_search_parameters,
    blast_hit_parameters,
    dir_output=dir_output,
)
# Apply the BlastN filter to the target sequences against the exon reference.
oligos = blast_filter.apply(sequence_type, oligos, 2, reference_exon)



In [10]:
# Start from a freshly loaded oligo database.
oligos = get_oligo_database(metadata_ncbi, file_fasta_oligos, region_ids)

sequence_type = "target"

# Seed region boundaries handed to the filter.
seedregion_start = 12
seedregion_end = 18

blast_search_parameters = {
    "perc_identity": 80,
    "strand": "plus",
    "word_size": 10,
}
blast_hit_parameters = {"coverage": 50}
blast_filter = BlastNSeedregionFilter(
    seedregion_start,
    seedregion_end,
    blast_search_parameters,
    blast_hit_parameters,
    dir_output,
)
# Apply the seed-region BlastN filter against the exon reference.
oligos = blast_filter.apply(sequence_type, oligos, 2, reference_exon)

In [11]:
# Start from a freshly loaded oligo database.
oligos = get_oligo_database(metadata_ncbi, file_fasta_oligos, region_ids)

# Set the sequence type before its first use in this cell, so the property
# filter does not depend on a value left over from a previously run cell.
sequence_type="target"

# Run the padlock-arm property filter first, using the Tm settings defined
# at the top of the notebook.
padlock_arms_filter = PadlockArmsFilter(arm_length_min = 5, arm_Tm_dif_max = 5, arm_Tm_min = 40, arm_Tm_max = 60, Tm_parameters = Tm_parameters)
property_filters = PropertyFilter([padlock_arms_filter])
# NOTE(review): the return value is not captured; presumably the database is
# updated in place. Confirm against PropertyFilter.apply.
property_filters.apply(sequence_type, oligo_database=oligos)

seedregion_size = 5
blast_search_parameters = {"perc_identity":80, "strand": "plus", "word_size": 10}
blast_hit_parameters = {"coverage": 50}
blast_filter = BlastNSeedregionLigationsiteFilter(seedregion_size, blast_search_parameters, blast_hit_parameters, dir_output)
# Apply the ligation-site seed-region BlastN filter against the exon reference.
oligos = blast_filter.apply(sequence_type, oligos, 2, reference_exon)

## Test Bowtie Filters

In [12]:
# Start from a freshly loaded oligo database.
oligos = get_oligo_database(metadata_ncbi, file_fasta_oligos, region_ids)

sequence_type = "target"
# Command-line style options forwarded to the Bowtie filter.
bowtie_search_parameters = {
    "-v": 1,
    "--nofw": "",
}
bowtie_filter = BowtieFilter(bowtie_search_parameters, dir_output)
# Apply the Bowtie filter against the exon reference.
oligos = bowtie_filter.apply(sequence_type, oligos, 2, reference_exon)

# reads processed: 5635
# reads with at least one alignment: 0 (0.00%)
# reads that failed to align: 5635 (100.00%)
No alignments
# reads processed: 2571
# reads with at least one alignment: 0 (0.00%)
# reads that failed to align: 2571 (100.00%)
No alignments
# reads processed: 9351
# reads with at least one alignment: 833 (8.91%)
# reads that failed to align: 8518 (91.09%)
Reported 7938 alignments
# reads processed: 7050
# reads with at least one alignment: 0 (0.00%)
# reads that failed to align: 7050 (100.00%)
No alignments
# reads processed: 1960
# reads with at least one alignment: 285 (14.54%)
# reads that failed to align: 1675 (85.46%)
Reported 5967 alignments


In [13]:
# Start from a freshly loaded oligo database.
oligos = get_oligo_database(metadata_ncbi, file_fasta_oligos, region_ids)

sequence_type = "target"
# Command-line style options forwarded to the Bowtie2 filter.
bowtie_search_parameters = {
    "-N": 1,
    "--norc": "",
}
bowtie_filter = Bowtie2Filter(bowtie_search_parameters, dir_output)
# Apply the Bowtie2 filter against the exon reference.
oligos = bowtie_filter.apply(sequence_type, oligos, 2, reference_exon)

## Test Cross-Hybridization Filters

In [14]:
# BlastN filter shared by both cross-hybridization tests below.
blast_search_parameters = {
    "perc_identity": 80,
    "strand": "minus",
    "word_size": 10,
}
blast_hit_parameters = {"coverage": 80}
specificity_filter = BlastNFilter(
    blast_search_parameters,
    blast_hit_parameters,
    dir_output,
)

# The two removal policies, each exercised by one of the following cells.
policy1 = RemoveByLargerRegionPolicy()
policy2 = RemoveByDegreePolicy()

In [15]:
# Cross-hybridization test using the RemoveByLargerRegionPolicy instance.
sequence_type = "oligo"
oligos = get_oligo_database(metadata_ncbi, file_fasta_oligos, region_ids)

cross_hyb_filter = CrossHybridizationFilter(
    policy1,
    specificity_filter,
    dir_output,
)
oligos = cross_hyb_filter.apply(sequence_type, oligos, 2)



In [16]:
# Cross-hybridization test using the RemoveByDegreePolicy instance.
sequence_type = "oligo"
oligos = get_oligo_database(metadata_ncbi, file_fasta_oligos, region_ids)

cross_hyb_filter = CrossHybridizationFilter(
    policy2,
    specificity_filter,
    dir_output,
)
oligos = cross_hyb_filter.apply(sequence_type, oligos, 2)

