# Test Specificity Filters

In [1]:
import os
from pathlib import Path

from oligo_designer_toolsuite.sequence_generator import CustomGenomicRegionGenerator, OligoSequenceGenerator
from oligo_designer_toolsuite.database import OligoDatabase, ReferenceDatabase
from oligo_designer_toolsuite.oligo_specificity_filter import ExactMatchFilter, BlastNFilter, BlastNSeedregionFilter, BlastNSeedregionLigationsiteFilter, BowtieFilter, Bowtie2Filter, CrossHybridizationFilter, RemoveByLargerRegionPolicy, RemoveByDegreePolicy
from oligo_designer_toolsuite.oligo_property_filter import PropertyFilter, PadlockArmsFilter


## Setup

In [2]:
# Output directory: "<parent of the current working directory>/notebooks/output".
# Created on demand (including missing parents); an existing directory is fine.
dir_output = os.path.join(os.path.dirname(os.getcwd()), 'notebooks/output')
os.makedirs(dir_output, exist_ok=True)

In [3]:
# Files
# Every fixture lives in the same test-data directory (path is relative to the working directory).
dir_databases = "../../data/tests/databases"

# Oligo databases used by the exact-match, BlastN and Bowtie cells
file_database_oligos_exact_match = f"{dir_databases}/database_oligos_exactmatch.tsv"
file_database_oligos_match = f"{dir_databases}/database_oligos_match.tsv"
file_database_oligos_nomatch = f"{dir_databases}/database_oligos_nomatch.tsv"

# Oligo databases used by the seed-region ligation-site cells
file_database_oligos_ligation_match = f"{dir_databases}/database_oligos_ligation_match.tsv"
file_database_oligos_ligation_nomatch = f"{dir_databases}/database_oligos_ligation_nomatch.tsv"

# Oligo database used by the cross-hybridization cells
file_database_oligos_crosshyb = f"{dir_databases}/database_oligos_crosshybridization.tsv"

# Reference sequences (FASTA)
file_database_reference = f"{dir_databases}/database_reference.fna"
file_database_reference_ligation = f"{dir_databases}/database_reference_ligation.fna"

In [4]:
# Metadata
# NOTE(review): neither `metadata_ncbi` nor `region_ids` is referenced by the
# cells visible in this notebook — confirm they are still needed.
metadata_ncbi = dict(
    files_source="NCBI",
    species="Homo_sapiens",
    annotation_release="110",
    genome_assembly="GRCh38",
)

region_ids = [
    "AARS1",
    "DECR2",
    "FAM234A",
    "RHBDF1",
    "WASIR2",
]

In [5]:
# Blast parameters
blast_search_parameters = dict(perc_identity=80, strand="plus", word_size=10)
# The cross-hybridization variant differs from the base search only in the strand.
blast_search_parameters_crosshyb = {**blast_search_parameters, "strand": "minus"}

blast_hit_parameters = dict(coverage=50)
# Separate dict object with the same content as the base hit parameters.
blast_hit_parameters_crosshyb = dict(blast_hit_parameters)

# Bowtie parameters
bowtie_search_parameters = {"-n": 3, "-l": 5}
# Same options as above plus the "--nofw" flag (empty value).
bowtie_search_parameters_crosshyb = {**bowtie_search_parameters, "--nofw": ""}

bowtie2_search_parameters = {"-N": 0}

# Parameters Cross-hybridization
# Oligo ids expected to remain per region in the cells that use RemoveByLargerRegionPolicy,
# written as the oligo indices kept for each region.
expected_oligos_bigger_region = {
    region: {f"{region}::oligo_{index}" for index in indices}
    for region, indices in {
        "region_1": (4, 5, 6, 7, 8),
        "region_2": (2, 3, 4, 5, 6),
        "region_3": (1, 2, 3, 4, 5),
    }.items()
}

# Oligo ids expected to remain per region in the cells that use RemoveByDegreePolicy.
expected_oligos_degree = {
    region: {f"{region}::oligo_{index}" for index in indices}
    for region, indices in {
        "region_1": (1, 4, 5, 6, 7),
        "region_2": (1, 2, 3, 4, 5, 6, 7),
        "region_3": (2, 3, 4, 5),
    }.items()
}

## Test Exact Match Filter

In [6]:
# Exact-match filter applied to the "oligo" sequence type of the exact-match fixture.
sequence_type = "oligo"

oligo_database_exact_match = OligoDatabase(
    min_oligos_per_region=2,
    write_regions_with_insufficient_oligos=True,
    dir_output=dir_output,
)
oligo_database_exact_match.load_database(file_database_oligos_exact_match)

exactmatch_filter = ExactMatchFilter()
# NOTE(review): the positional 2 is presumably the number of parallel jobs — confirm against apply().
res = exactmatch_filter.apply(sequence_type, oligo_database_exact_match, 2)

# "WASH7P::2" must have been removed, "AGRN::1" must still be present.
assert "WASH7P::2" not in res.database["WASH7P"].keys(), "A matching oligo has not been filtered from exact matches!"
assert "AGRN::1" in res.database["AGRN"].keys(), "A non-matching oligo has been filtered from exact matches!"

## Test BlastN Filter (incl. Seed-Region Ligation-Site Filter)

In [7]:
# Reference database loaded from the test FASTA; the BlastN cells below search against it.
reference_database = ReferenceDatabase(dir_output=dir_output)
reference_database.load_sequences_from_fasta(
    file_fasta=file_database_reference,
    database_overwrite=True,
)



In [8]:
# BlastN, positive case: the "match" fixture must lose WASH7P::1.
sequence_type = "target"

oligo_database_match = OligoDatabase(
    min_oligos_per_region=2,
    write_regions_with_insufficient_oligos=True,
    dir_output=dir_output,
)
oligo_database_match.load_database(file_database_oligos_match)

blast_filter = BlastNFilter(
    blast_search_parameters,
    blast_hit_parameters,
    dir_output=dir_output,
)
res = blast_filter.apply(sequence_type, oligo_database_match, 2, reference_database)

oligo_ids_wash7p = res.database["WASH7P"].keys()
assert "WASH7P::1" not in oligo_ids_wash7p, "A matching oligo has not been filtered by Blast!"

In [9]:
# BlastN, negative case: the "nomatch" fixture must keep AGRN::1.
sequence_type = "target"

oligo_database_nomatch = OligoDatabase(
    min_oligos_per_region=2,
    write_regions_with_insufficient_oligos=True,
    dir_output=dir_output,
)
oligo_database_nomatch.load_database(file_database_oligos_nomatch)

blast_filter = BlastNFilter(
    blast_search_parameters,
    blast_hit_parameters,
    dir_output=dir_output,
)
res = blast_filter.apply(sequence_type, oligo_database_nomatch, 2, reference_database)

oligo_ids_agrn = res.database["AGRN"].keys()
assert "AGRN::1" in oligo_ids_agrn, "A non matching oligo has been filtered by Blast!"

In [10]:
# Reference database for the ligation-site cells, loaded from its own FASTA fixture.
reference_database_ligation = ReferenceDatabase(dir_output=dir_output)
reference_database_ligation.load_sequences_from_fasta(
    file_fasta=file_database_reference_ligation,
    database_overwrite=True,
)



In [11]:
# Seed-region ligation-site BlastN filter, positive case (seed region size 10):
# the ligation "match" fixture must lose WASH7P::1.
sequence_type = "target"
seedregion_size = 10

oligo_database_ligation_match = OligoDatabase(
    min_oligos_per_region=2,
    write_regions_with_insufficient_oligos=True,
    dir_output=dir_output,
)
oligo_database_ligation_match.load_database(file_database_oligos_ligation_match)

blast_ligation_filter = BlastNSeedregionLigationsiteFilter(
    seedregion_size,
    blast_search_parameters,
    blast_hit_parameters,
    dir_output=dir_output,
)
res = blast_ligation_filter.apply(
    sequence_type,
    oligo_database_ligation_match,
    2,
    reference_database_ligation,
)

oligo_ids_wash7p = res.database["WASH7P"].keys()
assert "WASH7P::1" not in oligo_ids_wash7p, "A matching oligo has not been filtered by Blast!"

In [12]:
# Seed-region ligation-site BlastN filter, negative case (seed region size 5):
# the ligation "nomatch" fixture must keep AGRN::1.
sequence_type = "target"
seedregion_size = 5

oligo_database_ligation_nomatch = OligoDatabase(
    min_oligos_per_region=2,
    write_regions_with_insufficient_oligos=True,
    dir_output=dir_output,
)
oligo_database_ligation_nomatch.load_database(file_database_oligos_ligation_nomatch)

blast_ligation_filter = BlastNSeedregionLigationsiteFilter(
    seedregion_size,
    blast_search_parameters,
    blast_hit_parameters,
    dir_output=dir_output,
)
res = blast_ligation_filter.apply(
    sequence_type,
    oligo_database_ligation_nomatch,
    2,
    reference_database_ligation,
)

oligo_ids_agrn = res.database["AGRN"].keys()
assert "AGRN::1" in oligo_ids_agrn, "A non matching oligo has been filtered by Blast!"

## Test Bowtie and Bowtie2 Filters

In [14]:
# Reference database for the Bowtie/Bowtie2 cells, rebuilt from the same FASTA fixture
# that the BlastN section loads.
reference_database = ReferenceDatabase(dir_output=dir_output)
reference_database.load_sequences_from_fasta(
    file_fasta=file_database_reference,
    database_overwrite=True,
)



In [15]:
# Bowtie, positive case: the "match" fixture must lose WASH7P::1.
sequence_type = "target"

oligo_database_match = OligoDatabase(
    min_oligos_per_region=2,
    write_regions_with_insufficient_oligos=True,
    dir_output=dir_output,
)
oligo_database_match.load_database(file_database_oligos_match)

bowtie_filter = BowtieFilter(bowtie_search_parameters, dir_output=dir_output)
res = bowtie_filter.apply(
    sequence_type,
    oligo_database_match,
    2,
    reference_database,
)

oligo_ids_wash7p = res.database["WASH7P"].keys()
assert "WASH7P::1" not in oligo_ids_wash7p, "A matching oligo has not been filtered by Bowtie!"

# reads processed: 1
# reads with at least one alignment: 1 (100.00%)
# reads that failed to align: 0 (0.00%)
Reported 1 alignments


In [16]:
# Bowtie, negative case: the "nomatch" fixture must keep AGRN::1.
sequence_type = "target"

oligo_database_nomatch = OligoDatabase(
    min_oligos_per_region=2,
    write_regions_with_insufficient_oligos=True,
    dir_output=dir_output,
)
oligo_database_nomatch.load_database(file_database_oligos_nomatch)

bowtie_filter = BowtieFilter(bowtie_search_parameters, dir_output=dir_output)
res = bowtie_filter.apply(
    sequence_type,
    oligo_database_nomatch,
    2,
    reference_database,
)

oligo_ids_agrn = res.database["AGRN"].keys()
assert "AGRN::1" in oligo_ids_agrn, "A non matching oligo has been filtered by Bowtie!"

# reads processed: 1
# reads with at least one alignment: 0 (0.00%)
# reads that failed to align: 1 (100.00%)
No alignments


In [17]:
# Bowtie2, positive case: the "match" fixture must lose WASH7P::1.
sequence_type = "target"

oligo_database_match = OligoDatabase(
    min_oligos_per_region=2,
    write_regions_with_insufficient_oligos=True,
    dir_output=dir_output,
)
oligo_database_match.load_database(file_database_oligos_match)

# Note: `bowtie_filter` holds a Bowtie2Filter in this cell.
bowtie_filter = Bowtie2Filter(bowtie2_search_parameters, dir_output=dir_output)
res = bowtie_filter.apply(
    sequence_type,
    oligo_database_match,
    2,
    reference_database,
)

oligo_ids_wash7p = res.database["WASH7P"].keys()
assert "WASH7P::1" not in oligo_ids_wash7p, "A matching oligo has not been filtered by Bowtie2!"

In [18]:
# Bowtie2, negative case: the "nomatch" fixture must keep AGRN::1.
sequence_type = "target"

oligo_database_nomatch = OligoDatabase(
    min_oligos_per_region=2,
    write_regions_with_insufficient_oligos=True,
    dir_output=dir_output,
)
oligo_database_nomatch.load_database(file_database_oligos_nomatch)

# Note: `bowtie_filter` holds a Bowtie2Filter in this cell.
bowtie_filter = Bowtie2Filter(bowtie2_search_parameters, dir_output=dir_output)
res = bowtie_filter.apply(
    sequence_type,
    oligo_database_nomatch,
    2,
    reference_database,
)

oligo_ids_agrn = res.database["AGRN"].keys()
assert "AGRN::1" in oligo_ids_agrn, "A non matching oligo has been filtered by Bowtie2!"

## Test Cross-Hybridization Filter

In [19]:
# Cross-hybridization filter driven by exact matching, with the
# RemoveByLargerRegionPolicy, applied to the exact-match fixture.
exactmatch_filter = ExactMatchFilter()
policy = RemoveByLargerRegionPolicy()

sequence_type = "oligo"

oligo_database_exact_match = OligoDatabase(
    min_oligos_per_region=2,
    write_regions_with_insufficient_oligos=True,
    dir_output=dir_output,
)
oligo_database_exact_match.load_database(file_database_oligos_exact_match)

cross_hyb_filter = CrossHybridizationFilter(policy, exactmatch_filter, dir_output)
res = cross_hyb_filter.apply(sequence_type, oligo_database_exact_match, 2)

# The first two assertions fail when an oligo that should have been removed is
# still present, so their messages must report a *missing* removal.
assert "WASH7P::1" not in res.database["WASH7P"].keys(), "A matching oligo has not been filtered by exact matches!"
assert "WASH7P::3" not in res.database["WASH7P"].keys(), "A matching oligo has not been filtered by exact matches!"
# This assertion fails when an oligo that should have been kept was removed.
assert "AGRN::1" in res.database["AGRN"].keys(), "A non matching oligo has been filtered by exact matches!"




In [20]:
# Cross-hybridization filter driven by BlastN (cross-hyb search parameters),
# resolved with RemoveByLargerRegionPolicy.
blast_filter = BlastNFilter(blast_search_parameters_crosshyb, blast_hit_parameters_crosshyb, dir_output=dir_output)
policy = RemoveByLargerRegionPolicy()

sequence_type = "oligo"

oligo_database_crosshyb = OligoDatabase(
    min_oligos_per_region=2,
    write_regions_with_insufficient_oligos=True,
    dir_output=dir_output,
)
oligo_database_crosshyb.load_database(file_database_oligos_crosshyb)

cross_hyb_filter = CrossHybridizationFilter(policy, blast_filter, dir_output)
res = cross_hyb_filter.apply(sequence_type, oligo_database_crosshyb, 2)

# Map each remaining region id to the set of its remaining oligo ids so the
# comparison with the expected sets is order-independent.
filtered_oligos = {region_id: set(res.database[region_id].keys()) for region_id in res.database.keys()}
assert expected_oligos_bigger_region == filtered_oligos, f"The cross-hybridization filter didn't return the expected oligos. Expected: {expected_oligos_bigger_region} Got: {filtered_oligos}"



In [21]:
# Cross-hybridization filter driven by BlastN (cross-hyb search parameters),
# resolved with RemoveByDegreePolicy.
blast_filter = BlastNFilter(blast_search_parameters_crosshyb, blast_hit_parameters_crosshyb, dir_output=dir_output)
policy = RemoveByDegreePolicy()

sequence_type = "oligo"

oligo_database_crosshyb = OligoDatabase(
    min_oligos_per_region=2,
    write_regions_with_insufficient_oligos=True,
    dir_output=dir_output,
)
oligo_database_crosshyb.load_database(file_database_oligos_crosshyb)

cross_hyb_filter = CrossHybridizationFilter(policy, blast_filter, dir_output)
res = cross_hyb_filter.apply(sequence_type, oligo_database_crosshyb, 2)

# Map each remaining region id to the set of its remaining oligo ids so the
# comparison with the expected sets is order-independent.
filtered_oligos = {region_id: set(res.database[region_id].keys()) for region_id in res.database.keys()}
assert expected_oligos_degree == filtered_oligos, f"The cross-hybridization filter didn't return the expected oligos. Expected: {expected_oligos_degree} Got: {filtered_oligos}"



In [22]:
# Cross-hybridization filter driven by Bowtie (cross-hyb search parameters),
# resolved with RemoveByLargerRegionPolicy.
bowtie_filter = BowtieFilter(bowtie_search_parameters_crosshyb, dir_output=dir_output)
policy = RemoveByLargerRegionPolicy()

sequence_type = "oligo"

oligo_database_crosshyb = OligoDatabase(
    min_oligos_per_region=2,
    write_regions_with_insufficient_oligos=True,
    dir_output=dir_output,
)
oligo_database_crosshyb.load_database(file_database_oligos_crosshyb)

cross_hyb_filter = CrossHybridizationFilter(policy, bowtie_filter, dir_output)
res = cross_hyb_filter.apply(sequence_type, oligo_database_crosshyb, 2)

# Map each remaining region id to the set of its remaining oligo ids so the
# comparison with the expected sets is order-independent.
filtered_oligos = {region_id: set(res.database[region_id].keys()) for region_id in res.database.keys()}
assert expected_oligos_bigger_region == filtered_oligos, f"The cross-hybridization filter didn't return the expected oligos. Expected: {expected_oligos_bigger_region} Got: {filtered_oligos}"

# reads processed: 8
# reads with at least one alignment: 4 (50.00%)
# reads that failed to align: 4 (50.00%)
Reported 4 alignments
# reads processed: 7
# reads with at least one alignment: 2 (28.57%)
# reads that failed to align: 5 (71.43%)
Reported 2 alignments
# reads processed: 5
# reads with at least one alignment: 3 (60.00%)
# reads that failed to align: 2 (40.00%)
Reported 4 alignments


In [23]:
# Cross-hybridization filter driven by Bowtie (cross-hyb search parameters),
# resolved with RemoveByDegreePolicy.
bowtie_filter = BowtieFilter(bowtie_search_parameters_crosshyb, dir_output=dir_output)
policy = RemoveByDegreePolicy()

sequence_type = "oligo"

oligo_database_crosshyb = OligoDatabase(
    min_oligos_per_region=2,
    write_regions_with_insufficient_oligos=True,
    dir_output=dir_output,
)
oligo_database_crosshyb.load_database(file_database_oligos_crosshyb)

cross_hyb_filter = CrossHybridizationFilter(policy, bowtie_filter, dir_output)
res = cross_hyb_filter.apply(sequence_type, oligo_database_crosshyb, 2)

# Map each remaining region id to the set of its remaining oligo ids so the
# comparison with the expected sets is order-independent.
filtered_oligos = {region_id: set(res.database[region_id].keys()) for region_id in res.database.keys()}
assert expected_oligos_degree == filtered_oligos, f"The cross-hybridization filter didn't return the expected oligos. Expected: {expected_oligos_degree} Got: {filtered_oligos}"

# reads processed: 8
# reads with at least one alignment: 4 (50.00%)
# reads that failed to align: 4 (50.00%)
Reported 4 alignments
# reads processed: 7
# reads with at least one alignment: 2 (28.57%)
# reads that failed to align: 5 (71.43%)
Reported 2 alignments
# reads processed: 5
# reads with at least one alignment: 3 (60.00%)
# reads that failed to align: 2 (40.00%)
Reported 4 alignments
