In [None]:
import time
import numpy

from pepars.utils import Sequence_Trie
from pepars.utils import DNA
from pepars.analysis import DNA as DNA_analysis

In [None]:
# Benchmark parameters: how many random sequences to index and the length of each.
num_sequences = 100000
sequence_length = 21
alphabet = set(DNA.get_nucleotides())

# Build the candidate list once; it does not change between iterations.
nucleotides = list(DNA.get_nucleotides())

# NOTE: no RNG seed is set in this cell, so the generated sequences (and any
# counts derived from them) can differ from run to run.
sequences = [
    "".join(numpy.random.choice(nucleotides, sequence_length))
    for _ in range(num_sequences)
]

In [None]:
# Index every generated sequence in both a trie and a plain dict so the
# following cells can compare lookup strategies on the same data.
sequence_trie = Sequence_Trie(by_nucleotide=True)
sequence_dict = {}

# perf_counter is a monotonic, high-resolution clock intended for timing.
start_time = time.perf_counter()
for sequence in sequences:
    sequence_trie.add(sequence)
    sequence_dict[sequence] = True
end_time = time.perf_counter()
duration = end_time - start_time

# Report fractional seconds; integer formatting would show short runs as 0.
print("Duration: %.3f" % duration)

In [None]:
num_new_sequences = 1000

# Build the candidate list once; it does not change between iterations.
nucleotides = list(DNA.get_nucleotides())

# These extra sequences are appended to `sequences` only; this cell does not
# add them to sequence_trie or sequence_dict.
# NOTE: `sequences` is extended in place, so re-running this cell appends
# another batch of num_new_sequences entries.
sequences.extend(
    "".join(numpy.random.choice(nucleotides, sequence_length))
    for _ in range(num_new_sequences)
)

In [None]:
# Dictionary search
#
# For every sequence, build each single-substitution variant (one position
# replaced by a different alphabet character) and count how many of those
# variants are keys of sequence_dict.

num_one_offs = 0

start_time = time.perf_counter()
for sequence in sequences:
    for index, character in enumerate(sequence):
        # Every alphabet character except the one already at this position.
        for other_character in alphabet - {character}:
            one_off_sequence = sequence[:index] + other_character + sequence[index + 1:]
            if one_off_sequence in sequence_dict:
                num_one_offs += 1
# Stop the clock before printing so console I/O is not part of the measurement.
end_time = time.perf_counter()
duration = end_time - start_time

print("Num one-offs: %i" % num_one_offs)
# Report fractional seconds; integer formatting would show short runs as 0.
print("Duration: %.3f" % duration)

In [None]:
# Naive sequence trie search
#
# Same enumeration of single-substitution variants as a brute-force lookup,
# but each variant is checked with sequence_trie.find() instead of a dict.

num_one_offs = 0

start_time = time.perf_counter()
for sequence in sequences:
    for index, character in enumerate(sequence):
        # Every alphabet character except the one already at this position.
        for other_character in alphabet - {character}:
            one_off_sequence = sequence[:index] + other_character + sequence[index + 1:]
            if sequence_trie.find(one_off_sequence):
                num_one_offs += 1
# Stop the clock before printing so console I/O is not part of the measurement.
end_time = time.perf_counter()
duration = end_time - start_time

print("Num one-offs: %i" % num_one_offs)
# Report fractional seconds; integer formatting would show short runs as 0.
print("Duration: %.3f" % duration)

In [None]:
# Trie-optimized search
#
# Delegates the neighbour lookup to the library helper, which receives the
# trie directly, and sums the number of results returned per sequence.
# NOTE(review): no distance argument is passed, so this relies on the helper's
# default — presumably 1, to match the "one-offs" label; confirm.

num_one_offs = 0

start_time = time.perf_counter()
for sequence in sequences:
    matches = DNA_analysis.find_all_sequences_of_distance_n(
        sequence, sequence_trie, allow_invalid=False
    )
    num_one_offs += len(matches)
# Stop the clock before printing so console I/O is not part of the measurement.
end_time = time.perf_counter()
duration = end_time - start_time

print("Num one-offs: %i" % num_one_offs)
# Report fractional seconds; integer formatting would show short runs as 0.
print("Duration: %.3f" % duration)