# Test Specificity Filters

In [2]:
import os
from pathlib import Path

from oligo_designer_toolsuite.database import OligoDatabase, ReferenceDatabase
from oligo_designer_toolsuite.oligo_specificity_filter import ExactMatchFilter, BlastNFilter, BlastNSeedregionFilter, BlastNSeedregionLigationsiteFilter, BowtieFilter, Bowtie2Filter, CrossHybridizationFilter, RemoveByLargerRegionPolicy, RemoveByDegreePolicy

## Setup

In [2]:
# Output directory: "notebooks/output" below the parent of the current working directory
parent_of_cwd = os.path.dirname(os.getcwd())
dir_output = os.path.join(parent_of_cwd, 'notebooks/output')
# Create the directory together with missing parents; an existing directory is not an error
os.makedirs(dir_output, exist_ok=True)

In [3]:
# Files
# Every fixture used below lives in the same folder, addressed relative to this notebook.
dir_test_databases = "../../data/tests/databases"

# Oligo databases for the exact-match and alignment-based filter tests
file_database_oligos_exact_match = f"{dir_test_databases}/database_oligos_exactmatch.tsv"
file_database_oligos_match = f"{dir_test_databases}/database_oligos_match.tsv"
file_database_oligos_nomatch = f"{dir_test_databases}/database_oligos_nomatch.tsv"

# Oligo databases for the ligation-site filter tests
file_database_oligos_ligation_match = f"{dir_test_databases}/database_oligos_ligation_match.tsv"
file_database_oligos_ligation_nomatch = f"{dir_test_databases}/database_oligos_ligation_nomatch.tsv"

# Oligo database for the cross-hybridization filter tests
file_database_oligos_crosshyb = f"{dir_test_databases}/database_oligos_crosshybridization.tsv"

# Reference sequences (FASTA)
file_database_reference = f"{dir_test_databases}/database_reference.fna"
file_database_reference_ligation = f"{dir_test_databases}/database_reference_ligation.fna"

In [4]:
# Metadata
# Description of the annotation source, filled in key by key (insertion order is kept).
metadata_ncbi = {}
metadata_ncbi["files_source"] = "NCBI"
metadata_ncbi["species"] = "Homo_sapiens"
metadata_ncbi["annotation_release"] = "110"
metadata_ncbi["genome_assembly"] = "GRCh38"

# Identifiers of the regions of interest
region_ids = [
    "AARS1",
    "DECR2",
    "FAM234A",
    "RHBDF1",
    "WASIR2",
]

In [5]:
# Blast parameters
blast_search_parameters = {"perc_identity": 80, "strand": "plus", "word_size": 10}
# Search parameters for the BlastN-based cross-hybridization tests. This name is used by the
# cross-hybridization cells further down but was never defined, so they failed with a
# NameError on a fresh kernel.
# NOTE(review): "minus" mirrors the Bowtie cross-hybridization settings below, which pass
# "--nofw" (skip the forward reference strand) -- confirm this is the intended strand.
blast_search_parameters_crosshyb = {"perc_identity": 80, "strand": "minus", "word_size": 10}

blast_hit_parameters = {"coverage": 50}
blast_hit_parameters_crosshyb = {"coverage": 50}

# Bowtie parameters
bowtie_search_parameters = {"-n": 3, "-l": 5}
bowtie_search_parameters_crosshyb = {"-n": 3, "-l": 5, "--nofw": ""}

bowtie2_search_parameters = {"-N": 0}

# Parameters Cross-hybridization
def _oligo_ids(region, indices):
    """Return the set of "<region>::oligo_<index>" identifiers for the given indices."""
    return {f"{region}::oligo_{index}" for index in indices}


# Oligos expected to remain per region when the larger-region removal policy is used
expected_oligos_bigger_region = {
    "region_1": _oligo_ids("region_1", (4, 5, 6, 7, 8)),
    "region_2": _oligo_ids("region_2", (2, 3, 4, 5, 6)),
    "region_3": _oligo_ids("region_3", (1, 2, 3, 4, 5)),
}

# Oligos expected to remain per region when the degree-based removal policy is used
expected_oligos_degree = {
    "region_1": _oligo_ids("region_1", (1, 4, 5, 6, 7)),
    "region_2": _oligo_ids("region_2", range(1, 8)),
    "region_3": _oligo_ids("region_3", (2, 3, 4, 5)),
}

## Test Exact Match Filter

In [6]:
# The exact-match filter is applied to the "oligo" sequences of the database.
sequence_type = "oligo"

oligo_database_exact_match = OligoDatabase(
    min_oligos_per_region=2,
    write_regions_with_insufficient_oligos=True,
    dir_output=dir_output,
)
oligo_database_exact_match.load_database(file_database_oligos_exact_match)

policy = RemoveByLargerRegionPolicy()
exactmatch_filter = ExactMatchFilter(policy)
res = exactmatch_filter.apply(sequence_type, oligo_database_exact_match, 2)

# WASH7P::2 must be gone after filtering, AGRN::1 must still be present.
remaining_wash7p = res.database["WASH7P"].keys()
remaining_agrn = res.database["AGRN"].keys()
assert "WASH7P::2" not in remaining_wash7p, "A matching oligo has not been filtered from exact matches!"
assert "AGRN::1" in remaining_agrn, "A non-matching oligo has been filtered from exact mathces!"

## Test BlastN Filter

In [7]:
# Reference database for the BlastN tests, loaded from the test FASTA file
reference_database = ReferenceDatabase(dir_output=dir_output)
reference_database.load_sequences_from_fasta(
    file_fasta=file_database_reference,
    database_overwrite=True,
)



In [8]:
# The BlastN filter is applied to the "target" sequences of the database.
sequence_type = "target"

oligo_database_match = OligoDatabase(
    min_oligos_per_region=2,
    write_regions_with_insufficient_oligos=True,
    dir_output=dir_output,
)
oligo_database_match.load_database(file_database_oligos_match)

# The leftover debugging print of the filter's type (and its captured output) was removed.
blast_filter = BlastNFilter(blast_search_parameters, blast_hit_parameters, dir_output=dir_output)
res = blast_filter.apply(sequence_type, oligo_database_match, 2, reference_database)

# WASH7P::1 must be gone after filtering against the reference.
assert "WASH7P::1" not in res.database["WASH7P"].keys(), "A matching oligo has not been filtered by Blast!"


In [10]:
# Negative control for the BlastN filter, run on the "target" sequences.
sequence_type = "target"

oligo_database_nomatch = OligoDatabase(
    min_oligos_per_region=2,
    write_regions_with_insufficient_oligos=True,
    dir_output=dir_output,
)
oligo_database_nomatch.load_database(file_database_oligos_nomatch)

blast_filter = BlastNFilter(
    blast_search_parameters,
    blast_hit_parameters,
    dir_output=dir_output,
)
res = blast_filter.apply(sequence_type, oligo_database_nomatch, 2, reference_database)

# AGRN::1 must still be present after filtering.
remaining_agrn = res.database["AGRN"].keys()
assert "AGRN::1" in remaining_agrn, "A non matching oligo has been filtered by Blast!"

In [11]:
# Reference database for the ligation-site tests, loaded from the ligation FASTA file
reference_database_ligation = ReferenceDatabase(dir_output=dir_output)
reference_database_ligation.load_sequences_from_fasta(
    file_fasta=file_database_reference_ligation,
    database_overwrite=True,
)



In [18]:
# The unused in-cell imports (`_filter_blastn`, `inspect`) left over from a debugging session
# were removed, together with the stale output that debugging had produced.
sequence_type = "target"
seedregion_size = 10

oligo_database_ligation_match = OligoDatabase(
    min_oligos_per_region=2,
    write_regions_with_insufficient_oligos=True,
    dir_output=dir_output,
)
oligo_database_ligation_match.load_database(file_database_oligos_ligation_match)

blast_ligation_filter = BlastNSeedregionLigationsiteFilter(
    seedregion_size,
    blast_search_parameters,
    blast_hit_parameters,
    dir_output=dir_output,
)
res = blast_ligation_filter.apply(sequence_type, oligo_database_ligation_match, 2, reference_database_ligation)

# WASH7P::1 must be gone after filtering against the ligation reference.
assert "WASH7P::1" not in res.database["WASH7P"].keys(), "A matching oligo has not been filtered by Blast!"

In [None]:
# Negative control for the seed-region ligation-site filter (seed region of size 5).
sequence_type = "target"
seedregion_size = 5

oligo_database_ligation_nomatch = OligoDatabase(
    min_oligos_per_region=2,
    write_regions_with_insufficient_oligos=True,
    dir_output=dir_output,
)
oligo_database_ligation_nomatch.load_database(file_database_oligos_ligation_nomatch)

blast_ligation_filter = BlastNSeedregionLigationsiteFilter(
    seedregion_size,
    blast_search_parameters,
    blast_hit_parameters,
    dir_output=dir_output,
)
res = blast_ligation_filter.apply(
    sequence_type,
    oligo_database_ligation_nomatch,
    2,
    reference_database_ligation,
)

# AGRN::1 must still be present after filtering.
remaining_agrn = res.database["AGRN"].keys()
assert "AGRN::1" in remaining_agrn, "A non matching oligo has been filtered by Blast!"

## Test Bowtie Filter

In [None]:
# Reference database for the Bowtie tests, loaded from the test FASTA file
reference_database = ReferenceDatabase(dir_output=dir_output)
reference_database.load_sequences_from_fasta(
    file_fasta=file_database_reference,
    database_overwrite=True,
)



In [None]:
# The Bowtie filter is applied to the "target" sequences of the database.
sequence_type = "target"

oligo_database_match = OligoDatabase(
    min_oligos_per_region=2,
    write_regions_with_insufficient_oligos=True,
    dir_output=dir_output,
)
oligo_database_match.load_database(file_database_oligos_match)

bowtie_filter = BowtieFilter(bowtie_search_parameters, dir_output=dir_output)
res = bowtie_filter.apply(sequence_type, oligo_database_match, 2, reference_database)

# WASH7P::1 must be gone after filtering against the reference.
remaining_wash7p = res.database["WASH7P"].keys()
assert "WASH7P::1" not in remaining_wash7p, "A matching oligo has not been filtered by Bowtie!"

['GCTGGTCCGGGGACAGCGCGCCGGTCGAGT']
[Seq('GCTGGTCCGGGGACAGCGCGCCGGTCGAGC')]
['GCTGGTCCGGGGACAGCGCGCCGGTCGAGT']
[Seq('GCTGGTCCGGGGACAGCGCGCCGGTCGAGC')]


# reads processed: 1
# reads with at least one alignment: 1 (100.00%)
# reads that failed to align: 0 (0.00%)
Reported 1 alignments


In [None]:
# Negative control for the Bowtie filter, run on the "target" sequences.
sequence_type = "target"

oligo_database_nomatch = OligoDatabase(
    min_oligos_per_region=2,
    write_regions_with_insufficient_oligos=True,
    dir_output=dir_output,
)
oligo_database_nomatch.load_database(file_database_oligos_nomatch)

bowtie_filter = BowtieFilter(bowtie_search_parameters, dir_output=dir_output)
res = bowtie_filter.apply(sequence_type, oligo_database_nomatch, 2, reference_database)

# AGRN::1 must still be present after filtering.
remaining_agrn = res.database["AGRN"].keys()
assert "AGRN::1" in remaining_agrn, "A non matching oligo has been filtered by Bowtie!"

# reads processed: 1
# reads with at least one alignment: 0 (0.00%)
# reads that failed to align: 1 (100.00%)
No alignments


In [None]:
# The Bowtie2 filter is applied to the "target" sequences of the database.
sequence_type = "target"

oligo_database_match = OligoDatabase(
    min_oligos_per_region=2,
    write_regions_with_insufficient_oligos=True,
    dir_output=dir_output,
)
oligo_database_match.load_database(file_database_oligos_match)

# The name `bowtie_filter` is reused here for the Bowtie2 variant, as in the original cell.
bowtie_filter = Bowtie2Filter(bowtie2_search_parameters, dir_output=dir_output)
res = bowtie_filter.apply(sequence_type, oligo_database_match, 2, reference_database)

# WASH7P::1 must be gone after filtering against the reference.
remaining_wash7p = res.database["WASH7P"].keys()
assert "WASH7P::1" not in remaining_wash7p, "A matching oligo has not been filtered by Bowtie2!"

In [None]:
# Negative control for the Bowtie2 filter, run on the "target" sequences.
sequence_type = "target"

oligo_database_nomatch = OligoDatabase(
    min_oligos_per_region=2,
    write_regions_with_insufficient_oligos=True,
    dir_output=dir_output,
)
oligo_database_nomatch.load_database(file_database_oligos_nomatch)

# The name `bowtie_filter` is reused here for the Bowtie2 variant, as in the original cell.
bowtie_filter = Bowtie2Filter(bowtie2_search_parameters, dir_output=dir_output)
res = bowtie_filter.apply(sequence_type, oligo_database_nomatch, 2, reference_database)

# AGRN::1 must still be present after filtering.
remaining_agrn = res.database["AGRN"].keys()
assert "AGRN::1" in remaining_agrn, "A non matching oligo has been filtered by Bowtie2!"

## Test Crosshybridization Filter

In [None]:
# Cross-hybridization filter that uses exact matching to detect hits between oligos and
# the larger-region policy to decide which oligos are removed.
exactmatch_filter = ExactMatchFilter()
policy = RemoveByLargerRegionPolicy()

sequence_type = "oligo"

oligo_database_exact_match = OligoDatabase(min_oligos_per_region=2, write_regions_with_insufficient_oligos=True, dir_output=dir_output)
oligo_database_exact_match.load_database(file_database_oligos_exact_match)

cross_hyb_filter = CrossHybridizationFilter(policy, exactmatch_filter, dir_output)
res = cross_hyb_filter.apply(sequence_type, oligo_database_exact_match, 2)

# Both checks on WASH7P require the oligo to be absent, so a failure means a matching oligo
# was NOT filtered. The first message used to describe the opposite situation.
assert "WASH7P::1" not in res.database["WASH7P"].keys(), "A matching oligo has not been filtered by exact matches!"
assert "WASH7P::3" not in res.database["WASH7P"].keys(), "A matching oligo has not been filtered by exact mathces!"
assert "AGRN::1" in res.database["AGRN"].keys(), "A non matching oligo has been filtered by exact matches!"




In [None]:
# Cross-hybridization filter: BlastN detects hits between oligos,
# the larger-region policy decides which oligos are removed.
blast_filter = BlastNFilter(
    blast_search_parameters_crosshyb,
    blast_hit_parameters_crosshyb,
    dir_output=dir_output,
)
policy = RemoveByLargerRegionPolicy()

sequence_type = "oligo"

oligo_database_crosshyb = OligoDatabase(
    min_oligos_per_region=2,
    write_regions_with_insufficient_oligos=True,
    dir_output=dir_output,
)
oligo_database_crosshyb.load_database(file_database_oligos_crosshyb)

cross_hyb_filter = CrossHybridizationFilter(policy, blast_filter, dir_output)
res = cross_hyb_filter.apply(sequence_type, oligo_database_crosshyb, 2)

# Collect the oligo ids that remain in every region and compare them with the expectation.
filtered_oligos = {region: set(res.database[region].keys()) for region in res.database.keys()}
assert expected_oligos_bigger_region == filtered_oligos, f"The cross-hybridization filter didn't return the expected oligos. Expected: {expected_oligos_bigger_region} Got: {filtered_oligos}"



In [None]:
# Cross-hybridization filter: BlastN detects hits between oligos,
# the degree policy decides which oligos are removed.
blast_filter = BlastNFilter(
    blast_search_parameters_crosshyb,
    blast_hit_parameters_crosshyb,
    dir_output=dir_output,
)
policy = RemoveByDegreePolicy()

sequence_type = "oligo"

oligo_database_crosshyb = OligoDatabase(
    min_oligos_per_region=2,
    write_regions_with_insufficient_oligos=True,
    dir_output=dir_output,
)
oligo_database_crosshyb.load_database(file_database_oligos_crosshyb)

cross_hyb_filter = CrossHybridizationFilter(policy, blast_filter, dir_output)
res = cross_hyb_filter.apply(sequence_type, oligo_database_crosshyb, 2)

# Collect the oligo ids that remain in every region and compare them with the expectation.
filtered_oligos = {region: set(res.database[region].keys()) for region in res.database.keys()}
assert expected_oligos_degree == filtered_oligos, f"The cross-hybridization filter didn't return the expected oligos. Expected: {expected_oligos_degree} Got: {filtered_oligos}"



In [None]:
# Cross-hybridization filter: Bowtie detects hits between oligos,
# the larger-region policy decides which oligos are removed.
bowtie_filter = BowtieFilter(bowtie_search_parameters_crosshyb, dir_output=dir_output)
policy = RemoveByLargerRegionPolicy()

sequence_type = "oligo"

oligo_database_crosshyb = OligoDatabase(
    min_oligos_per_region=2,
    write_regions_with_insufficient_oligos=True,
    dir_output=dir_output,
)
oligo_database_crosshyb.load_database(file_database_oligos_crosshyb)

cross_hyb_filter = CrossHybridizationFilter(policy, bowtie_filter, dir_output)
res = cross_hyb_filter.apply(sequence_type, oligo_database_crosshyb, 2)

# Collect the oligo ids that remain in every region and compare them with the expectation.
filtered_oligos = {region: set(res.database[region].keys()) for region in res.database.keys()}
assert expected_oligos_bigger_region == filtered_oligos, f"The cross-hybridization filter didn't return the expected oligos. Expected: {expected_oligos_bigger_region} Got: {filtered_oligos}"



# reads processed: 8
# reads with at least one alignment: 4 (50.00%)
# reads that failed to align: 4 (50.00%)
Reported 4 alignments
# reads processed: 7
# reads with at least one alignment: 2 (28.57%)
# reads that failed to align: 5 (71.43%)
Reported 2 alignments
# reads processed: 5
# reads with at least one alignment: 3 (60.00%)
# reads that failed to align: 2 (40.00%)
Reported 4 alignments


In [None]:
# Cross-hybridization filter: Bowtie detects hits between oligos,
# the degree policy decides which oligos are removed.
bowtie_filter = BowtieFilter(bowtie_search_parameters_crosshyb, dir_output=dir_output)
policy = RemoveByDegreePolicy()

sequence_type = "oligo"

oligo_database_crosshyb = OligoDatabase(
    min_oligos_per_region=2,
    write_regions_with_insufficient_oligos=True,
    dir_output=dir_output,
)
oligo_database_crosshyb.load_database(file_database_oligos_crosshyb)

cross_hyb_filter = CrossHybridizationFilter(policy, bowtie_filter, dir_output)
res = cross_hyb_filter.apply(sequence_type, oligo_database_crosshyb, 2)

# Collect the oligo ids that remain in every region and compare them with the expectation.
filtered_oligos = {region: set(res.database[region].keys()) for region in res.database.keys()}
assert expected_oligos_degree == filtered_oligos, f"The cross-hybridization filter didn't return the expected oligos. Expected: {expected_oligos_degree} Got: {filtered_oligos}"



# reads processed: 8
# reads with at least one alignment: 4 (50.00%)
# reads that failed to align: 4 (50.00%)
Reported 4 alignments
# reads processed: 7
# reads with at least one alignment: 2 (28.57%)
# reads that failed to align: 5 (71.43%)
Reported 2 alignments
# reads processed: 5
# reads with at least one alignment: 3 (60.00%)
# reads that failed to align: 2 (40.00%)
Reported 4 alignments


## AI Filters



In [None]:
def reverse_complement(sequence):
    """Return the reverse complement of a nucleotide sequence.

    Args:
        sequence: String made of the upper-case characters "A", "T", "C", "G"
            and the gap character "-" (a gap is mapped to itself).

    Returns:
        The complemented sequence, read in reverse order, as a string.

    Raises:
        KeyError: If the sequence contains any other character.
    """
    nt = {"A": "T", "T": "A", "C": "G", "G": "C", "-": "-"}
    # Walk the sequence backwards and join once; prepending to a list at every step
    # copies the whole list each time and is quadratic in the sequence length.
    return "".join(nt[n] for n in reversed(sequence))

Test BlastN in specific scenarios

In [None]:
# Search both strands for the scenario tests (this rebinds the notebook-wide Blast settings).
blast_search_parameters = {"perc_identity": 80, "strand": "both", "word_size": 10}

# define the sequences
# The four lists below are index-aligned: entry i of each list belongs to the same scenario.
# Query sequences used to build the oligo database.
sequences = [
    "GCTCGGGCTTGTCCACAGGATGGACCCAGCTGAGCAAGCT", #100% match
    "TACAGGCATGACCCACCATGCCTGGCCAACTTACATTTTT", #2 mismatches
    "AAGGCCAAGGTCTCTGGGGGGCTGGACAAGCCGCCCTCAT",  #90% identity (4 mismatches)
    "TTTTGCACCAGCCCAGATCGCATCTTCTTTCACCTGTTTT",  #80% coverage
    "CCGCTCGGCTGCATGAAACCAAAACGGCTGTCCGGGGACA", #gaps on the target
    "AACCCGGCATCACCAAGAGGAGGTTCAAGGGAACGCTGCA", #gaps on reference
    "TGCCCGCGCCGGAGTTCTCCCCAGCCGGAGTCCGGCAGGG", #gaps on the target, 80% cov
    "AACCTGGTTGCACCTCGGCCTGGTCCCAGCAGGTATGGTT", #gaps on reference, 80% cov
    "ACTGATTGCTGCAGACGCTCACCCCAGACACTCACTGCAC", #overflow start
    "TATATATTTTGCACACTTTAAAATATTGGGTTGTTTACCG" # overflow end
]
# Reference sequence of each scenario ("-" marks positions past the reference end).
# NOTE(review): the variable name is misspelled ("refrences"); it is left unchanged here.
refrences = [
    "GCTCGGGCTTGTCCACAGGATGGACCCAGCTGAGCAAGCT",  #100% match
    "TACAGGCATGAGCCACCATGCCTGGCCAACTCACATTTTT", #2 mismatches
    "AAGGCCGGGGTCTCTGGGGGGCTGGAGAAGCCTCCCTCAT",  #90% identity (4 mismatches)
    "AGCAGCACCAGCCCAGATCGCATCTTCTTTCACCTGAACG",  #80% coverage
    "CCGCTACCGGCTGCATGACAACCAAAACGGCTGGTCCGGGGACA",  #gaps on the target
    "AACCCCATCACCAAGAGGAGGTTCAGGGAAGCTGCA", #gaps on reference
    "TGCCCGCGCCGGAGTTCTCCCCGGAGCCGGAGTCCGGCAGGG", #gaps on the target, 80% cov
    "TCCCTGGGCACCTCGGCCTGGTCCCAGCAGGTATGGGC", #gaps on reference, 80% cov
    "----ATTGCTGCAGACGCTCACCCCAGACACTCACTGCAC", #overflow start
    "TATATATTTTGCACACTTTAAAATATTGGGTTGTTT----", # overflow end
]
# Query sequences with "-" inserted where the alignment has a gap on the query side.
gapped_queries = [
    "GCTCGGGCTTGTCCACAGGATGGACCCAGCTGAGCAAGCT", #100% match
    "TACAGGCATGACCCACCATGCCTGGCCAACTTACATTTTT", #2 mismatches
    "AAGGCCAAGGTCTCTGGGGGGCTGGACAAGCCGCCCTCAT",  #90% identity (4 mismatches)
    "TTTTGCACCAGCCCAGATCGCATCTTCTTTCACCTGTTTT",  #80% coverage
    "CCGCT--CGGCTGCATGA-AACCAAAACGGCTG-TCCGGGGACA", #gaps on the target
    "AACCCGGCATCACCAAGAGGAGGTTCAAGGGAACGCTGCA", #gaps on reference
    "TGCCCGCGCCGGAGTTCTCCCC--AGCCGGAGTCCGGCAGGG", #gaps on the target, 80% cov
    "AACCTGGTTGCACCTCGGCCTGGTCCCAGCAGGTATGGTT", #gaps on reference, 80% cov
    "ACTGATTGCTGCAGACGCTCACCCCAGACACTCACTGCAC", #overflow start
    "TATATATTTTGCACACTTTAAAATATTGGGTTGTTTACCG" # overflow end
]
# Reference sequences with "-" inserted where the alignment has a gap on the reference side.
gapped_references = [
    "GCTCGGGCTTGTCCACAGGATGGACCCAGCTGAGCAAGCT",  #100% match
    "TACAGGCATGAGCCACCATGCCTGGCCAACTCACATTTTT", #2 mismatches
    "AAGGCCGGGGTCTCTGGGGGGCTGGAGAAGCCTCCCTCAT",  #90% identity (4 mismatches)
    "AGCAGCACCAGCCCAGATCGCATCTTCTTTCACCTGAACG",  #80% coverage
    "CCGCTACCGGCTGCATGACAACCAAAACGGCTGGTCCGGGGACA",  #gaps on the target
    "AACCC--CATCACCAAGAGGAGGTTCA-GGGAA-GCTGCA", #gaps on reference
    "TGCCCGCGCCGGAGTTCTCCCCGGAGCCGGAGTCCGGCAGGG", #gaps on the target, 80% cov
    "TCCCTGG--GCACCTCGGCCTGGTCCCAGCAGGTATGGGC", #gaps on reference, 80% cov
    "----ATTGCTGCAGACGCTCACCCCAGACACTCACTGCAC", #overflow start
    "TATATATTTTGCACACTTTAAAATATTGGGTTGTTT----" # overflow end
]
# define dictionary
# The mapping is deliberately not named `dict`: that would shadow the builtin for every
# cell executed afterwards.
oligos_by_region = {"region": {}}
for i, seq in enumerate(sequences):
    # add both the sequence and its reverse complement to search also on the minus strand
    oligos_by_region["region"][f"region::{2*i}"] = {
        "oligo": reverse_complement(seq),
        "target": seq,
        'chromosome': [None],
        'start': [None],
        'end': [None],
        'strand': [None],
        'regiontype': ['random_sequence']
    }
    oligos_by_region["region"][f"region::{2*i + 1}"] = {
        "oligo": seq,
        "target": reverse_complement(seq),
        'chromosome': [None],
        'start': [None],
        'end': [None],
        'strand': [None],
        'regiontype': ['random_sequence']
    }
oligo_database = OligoDatabase()
oligo_database.database = oligos_by_region
oligo_database.save_database()
blast_filter = BlastNFilter(blast_search_parameters, blast_hit_parameters, dir_output=dir_output)
res = blast_filter.apply("target", oligo_database, 2, reference_database)
print(res.database)


['GCTCGGGCTTGTCCACAGGATGGACCCAGCTGAGCAAGCT', 'AGCTTGCTCAGCTGGGTCCATCCTGTGGACAAGCCCGAGC', 'TACAGGCATGACCCACCATGCCTGGCCAACTTACATTTTT', 'AAAAATGTAAGTTGGCCAGGCATGGTGGGTCATGCCTGTA', 'AAGGCCAAGGTCTCTGGGGGGCTGGACAAGCCGCCCTCAT', 'ATGAGGGCGGCTTGTCCAGCCCCCCAGAGACCTTGGCCTT', 'TTTTGCACCAGCCCAGATCGCATCTTCTTTCACCTGTTTT', 'AAAACAGGTGAAAGAAGATGCGATCTGGGCTGGTGCAAAA', 'CCGCTCGGCTGCATGAAACCAAAACGGCTGTCCGGGGACA', 'TGTCCCCGGACAGCCGTTTTGGTTTCATGCAGCCGAGCGG', 'AACCCGGCATCACCAAGAGGAGGTTCAAGGGAACGCTGCA', 'TGCAGCGTTCCCTTGAACCTCCTCTTGGTGATGCCGGGTT', 'TGCCCGCGCCGGAGTTCTCCCCAGCCGGAGTCCGGCAGGG', 'CCCTGCCGGACTCCGGCTGGGGAGAACTCCGGCGCGGGCA', 'AACCTGGTTGCACCTCGGCCTGGTCCCAGCAGGTATGGTT', 'AACCATACCTGCTGGGACCAGGCCGAGGTGCAACCAGGTT', 'ACTGATTGCTGCAGACGCTCACCCCAGACACTCACTGCAC', 'GTGCAGTGAGTGTCTGGGGTGAGCGTCTGCAGCAATCAGT', 'TATATATTTTGCACACTTTAAAATATTGGGTTGTTTACCG', 'CGGTAAACAACCCAATATTTTAAAGTGTGCAAAATATATA']
[Seq('GCTCGGGCTTGTCCACAGGATGGACCCAGCTGAGCAAGCT'), Seq('AGCTTGCTCAGCTGGGTCCATCCTGTGGACAAGCCCGAGC'), Seq('TACAGGCATGAGCCA

Test Bowtie in specific scenarios

In [None]:
# Build the oligo database for the Bowtie scenario test.
# The mapping is deliberately not named `dict`: that would shadow the builtin for every
# cell executed afterwards.
oligos_by_region = {"region": {}}
for i, seq in enumerate(sequences):
    # add both the sequence and its reverse complement to search also on the minus strand
    oligos_by_region["region"][f"region::{2*i}"] = {
        "oligo": reverse_complement(seq),
        "target": seq,
        'chromosome': [None],
        'start': [None],
        'end': [None],
        'strand': [None],
        'regiontype': ['random_sequence']
    }
    oligos_by_region["region"][f"region::{2*i + 1}"] = {
        "oligo": seq,
        "target": reverse_complement(seq),
        'chromosome': [None],
        'start': [None],
        'end': [None],
        'strand': [None],
        'regiontype': ['random_sequence']
    }
oligo_database = OligoDatabase()
oligo_database.database = oligos_by_region
bowtie_filter = BowtieFilter(bowtie_search_parameters, dir_output=dir_output)
res = bowtie_filter.apply("target", oligo_database, 2, reference_database)
print(res.database)

['GCTCGGGCTTGTCCACAGGATGGACCCAGCTGAGCAAGCT', 'AGCTTGCTCAGCTGGGTCCATCCTGTGGACAAGCCCGAGC', 'TACAGGCATGACCCACCATGCCTGGCCAACTTACATTTTT', 'AAAAATGTAAGTTGGCCAGGCATGGTGGGTCATGCCTGTA']
[Seq('GCTCGGGCTTGTCCACAGGATGGACCCAGCTGAGCAAGCT'), Seq('AGCTTGCTCAGCTGGGTCCATCCTGTGGACAAGCCCGAGC'), Seq('TACAGGCATGAGCCACCATGCCTGGCCAACTCACATTTTT'), Seq('AAAAATGTGAGTTGGCCAGGCATGGTGGCTCATGCCTGTA')]
['GCTCGGGCTTGTCCACAGGATGGACCCAGCTGAGCAAGCT', 'AGCTTGCTCAGCTGGGTCCATCCTGTGGACAAGCCCGAGC', 'TACAGGCATGACCCACCATGCCTGGCCAACTTACATTTTT', 'AAAAATGTAAGTTGGCCAGGCATGGTGGGTCATGCCTGTA']
[Seq('GCTCGGGCTTGTCCACAGGATGGACCCAGCTGAGCAAGCT'), Seq('AGCTTGCTCAGCTGGGTCCATCCTGTGGACAAGCCCGAGC'), Seq('TACAGGCATGAGCCACCATGCCTGGCCAACTCACATTTTT'), Seq('AAAAATGTGAGTTGGCCAGGCATGGTGGCTCATGCCTGTA')]
{'region': {'region::4': {'oligo': 'ATGAGGGCGGCTTGTCCAGCCCCCCAGAGACCTTGGCCTT', 'target': 'AAGGCCAAGGTCTCTGGGGGGCTGGACAAGCCGCCCTCAT', 'chromosome': [None], 'start': [None], 'end': [None], 'strand': [None], 'regiontype': ['random_sequence']}, 'region::5'

# reads processed: 20
# reads with at least one alignment: 4 (20.00%)
# reads that failed to align: 16 (80.00%)
Reported 4 alignments


In [None]:
from Bio.Seq import Seq

# For every sequence emit it as a Seq, immediately followed by its reverse complement.
list_to_change = sequences
new_list = [
    Seq(variant)
    for elem in list_to_change
    for variant in (elem, reverse_complement(elem))
]
print(new_list)

[Seq('GCTCGGGCTTGTCCACAGGATGGACCCAGCTGAGCAAGCT'), Seq('AGCTTGCTCAGCTGGGTCCATCCTGTGGACAAGCCCGAGC'), Seq('AAGGCCAAGGTCTCTGGGGGGCTGGACAAGCCGCCCTCAT'), Seq('ATGAGGGCGGCTTGTCCAGCCCCCCAGAGACCTTGGCCTT'), Seq('TTTTGCACCAGCCCAGATCGCATCTTCTTTCACCTGTTTT'), Seq('AAAACAGGTGAAAGAAGATGCGATCTGGGCTGGTGCAAAA'), Seq('CCGCTCGGCTGCATGAAACCAAAACGGCTGTCCGGGGACA'), Seq('TGTCCCCGGACAGCCGTTTTGGTTTCATGCAGCCGAGCGG'), Seq('AACCCGGCATCACCAAGAGGAGGTTCAAGGGAACGCTGCA'), Seq('TGCAGCGTTCCCTTGAACCTCCTCTTGGTGATGCCGGGTT'), Seq('TGCCCGCGCCGGAGTTCTCCCCAGCCGGAGTCCGGCAGGG'), Seq('CCCTGCCGGACTCCGGCTGGGGAGAACTCCGGCGCGGGCA'), Seq('AACCTGGTTGCACCTCGGCCTGGTCCCAGCAGGTATGGTT'), Seq('AACCATACCTGCTGGGACCAGGCCGAGGTGCAACCAGGTT'), Seq('ACTGATTGCTGCAGACGCTCACCCCAGACACTCACTGCAC'), Seq('GTGCAGTGAGTGTCTGGGGTGAGCGTCTGCAGCAATCAGT'), Seq('TATATATTTTGCACACTTTAAAATATTGGGTTGTTTACCG'), Seq('CGGTAAACAACCCAATATTTTAAAGTGTGCAAAATATATA')]


In [7]:
from oligo_designer_toolsuite.oligo_specificity_filter import HybridizationProbabilityFilter
from oligo_designer_toolsuite_ai_filters.api import APIBase
import numpy as np
import pandas as pd

# Fixtures of the AI filter test, all located below the shared test-data folder
DIR_TEST_DATA = "../../data/tests"

FILE_DATABASE_REFERENCE = f"{DIR_TEST_DATA}/databases/database_reference.fna"
FILE_DATABASE_OLIGOS_AI = f"{DIR_TEST_DATA}/databases/database_oligos_ai.tsv"
FILE_TABLE_HITS_BLAST_AI = f"{DIR_TEST_DATA}/table_hits/table_hits_blast_ai.tsv"
FILE_TABLE_HITS_BOWTIE_AI = f"{DIR_TEST_DATA}/table_hits/table_hits_bowtie_ai.tsv"

class DummyAPI(APIBase):
    """Stand-in model that treats a hit as real only when the alignment is a 100% match."""

    def predict(self, queries, gapped_queries, references, gapped_references):
        """Score every hit with 1 if its gapped query equals its gapped reference, else 0.

        Args:
            queries: Query sequences; only their number is used (length of the result).
            gapped_queries: Query sequences of the alignments, compared element-wise.
            references: Reference sequences; not used by this dummy model.
            gapped_references: Reference sequences of the alignments, compared element-wise.

        Returns:
            A float32 numpy array of shape (len(queries),) holding 1 or 0 per hit.
        """
        # Start from zeros rather than an uninitialised np.ndarray, so that any entry the
        # loop does not reach is a well-defined 0 instead of arbitrary memory.
        predictions = np.zeros(len(queries), dtype=np.float32)
        for i, (q, r) in enumerate(zip(gapped_queries, gapped_references)):
            if q == r:
                predictions[i] = 1
        return predictions
    
# Outputs of this test go to a dedicated folder below the current working directory.
tmp_path = os.path.join(os.getcwd(), "tmp_hybridization_probability_outputs")
# Both strands are searched (the duplicated chained assignment was collapsed into one).
blast_search_parameters = {"perc_identity": 80, "strand": "both", "word_size": 10}
blast_hit_parameters = {"coverage": 50}
alignment_filter = BlastNFilter(
    blast_search_parameters=blast_search_parameters,
    blast_hit_parameters=blast_hit_parameters,
    dir_output=tmp_path
)
# Not named `filter`, which would shadow the builtin for every cell executed afterwards.
hybridization_filter = HybridizationProbabilityFilter(alignment_method=alignment_filter, threshold=0.1)
# Replace the model with the deterministic dummy so the outcome does not depend on a trained model.
hybridization_filter.model = DummyAPI()
database = OligoDatabase(dir_output=tmp_path)
database.load_database(FILE_DATABASE_OLIGOS_AI)
reference_database = ReferenceDatabase(dir_output=tmp_path)
reference_database.load_sequences_from_fasta(file_fasta=FILE_DATABASE_REFERENCE, database_overwrite=True)
# NOTE(review): table_hits is loaded but not used in this cell -- kept in case other cells rely on it.
table_hits = pd.read_csv(FILE_TABLE_HITS_BLAST_AI, sep="\t")
sequence_type = "target"
region_id = "region"

filtered_database = hybridization_filter.apply(
    sequence_type=sequence_type,
    oligo_database=database,
    n_jobs=2,
    reference_database=reference_database,
)
returned_oligos = set(filtered_database.database[region_id].keys())
# Every oligo except region::0 and region::1 is expected to remain.
expected_oligos = set(f"region::{i}" for i in range(2, 20))

assert returned_oligos == expected_oligos, f"The Blast ai filter didn't return the expected oligos. \n\nExpected:\n{expected_oligos}\n\nGot:\n{returned_oligos}"




[         query    reference  ...  reference_region_id  min_alignment_length
0    region::0  NM_130786.4  ...          NM_130786.4                  20.0
1    region::1  NM_130786.4  ...          NM_130786.4                  20.0
2    region::2  NM_130786.4  ...          NM_130786.4                  20.0
4    region::3  NM_130786.4  ...          NM_130786.4                  20.0
6    region::4  NM_130786.4  ...          NM_130786.4                  20.0
8    region::5  NM_130786.4  ...          NM_130786.4                  20.0
10   region::6  NM_130786.4  ...          NM_130786.4                  20.0
12   region::7  NM_130786.4  ...          NM_130786.4                  20.0
14   region::8  NM_130786.4  ...          NM_130786.4                  20.0
15   region::9  NM_130786.4  ...          NM_130786.4                  20.0
16  region::10  NM_130786.4  ...          NM_130786.4                  20.0
17  region::11  NM_130786.4  ...          NM_130786.4                  20.0
18  region: