In [2]:
'''
DESCRIPTION 
    Utility function that chops a sequence into several reads with bounded random lengths that 
    have a bounded random overlap
INPUT
    sequence       | a sequence of characters that will be divided into overlapping subsequences
    min_subseq_len | the shortest length a subsequence can have
    max_subseq_len | the longest length a subsequence can have
    min_overlap    | the shortest overlap two subsequences can share
    max_overlap    | the longest overlap two subsequences can share
    seed           | random seed for the random function for reproducibility
OUTPUT
    A list of overlapping reads of random bounded size which share a bounded random amount of
    overlap
'''
def generate_reads(sequence,min_subseq_len,max_subseq_len,min_overlap,max_overlap,seed=None):
    import random

    random.seed(seed)
    reads = []
    start = 0
    end = random.randint(min_subseq_len,max_subseq_len)
    reads += [sequence[start:end]]
    while end < len(sequence):
        start = random.randint(end-max_overlap,end-min_overlap)
        if (len(sequence) - start)/max_subseq_len < 2:
            if (len(sequence) - start)/max_subseq_len < 1:
                end = len(sequence)
            else:
                a = 0
                while (len(sequence) - start)/(min_subseq_len+a) > 2: a+=1
                end = random.randint(start+min_subseq_len+a,start+max_subseq_len) 
        else: end = random.randint(start+min_subseq_len,start+max_subseq_len) 
        reads += [sequence[start:end]]
    return reads

'''
DESCRIPTION 
    Utility function that creates a random sequence containing only the letters A, T, G, and C.
INPUT
    n    | the length of the sequence
    seed | random seed for the random function for reproducibility
OUTPUT
    A random sequence of length n
'''
def generate_genome_sequence(n,seed=None):
    import random
    
    random.seed(seed)
    nucleotides = {1:'A',2:'C',3:'G',4:'T'}
    seq = ''
    for _ in range(n):
        seq += nucleotides[random.randint(1,4)]
    return seq

# Sequitur

In [3]:
'''
DESCRIPTION
INPUT
OUTPUT
'''
class Stalk:
    def __init__(self,stalk):
        self.stalk = stalk
        if len(self.stalk) == 0: self.stalk = '$'

    def __repr__(self): return self.stalk

    def __eq__(self,other): return self[0] == other[0]

    def __hash__(self): return hash(self[0])

    def __getitem__(self,index): return self.stalk[index]

    def __len__(self):
        if self.stalk == '$' or self.stalk == '^': return 0
        return len(self.stalk)
    
    def __str__(self):
        if self.stalk =='$': return ''
        else: return self.stalk

    '''
    DESCRIPTION
    INPUT
    OUTPUT
    '''
    def common_substring(self,other):
        i = 0
        substr = ''
        if type(other) == str: other = Stalk(other)
        while i < min(len(self),len(other)) and self[i] == other[i]:
            substr += self[i]
            i += 1
        return Stalk(substr), Stalk(self[i:]), Stalk(other[i:])

In [4]:
'''
DESCRIPTION
INPUT
OUTPUT
'''
class Leaf:
    def __init__(self,left,right=''):
        if len(left) == 0:
            self.left = left
            self.right = 1
        else:
            self.left = left
            self.right = Leaf(right)
    
    def __repr__(self): return str(self.right)

    def __eq__(self,other): return self[0] == other[0]

    def __hash__(self): return hash(self.left)

    def __getitem__(self,index): return self.left[index]

    def __len__(self):
        if self.left == '$': return 0
        return len(self.left)
    
    def __is_shallow__(self): return True

    def reads(self): return set()

In [5]:
'''
DESCRIPTION
INPUT
OUTPUT
    '''
class Branch:
    def __init__(self):
        self.b = {}
        self.s = {}

    def __repr__(self): return repr(self.b)

    def __str__(self):
        s = '' 
        for i in range(len(list(self.b.values()))-1):s+=str(list(self.b.values())[i])+'\n'
        return s+str(list(self.b.values())[-1])
    
    def __getitem__(self,index):
        if type(index) == str: return self.b[Stalk(index)]
        return self.b[index]

    '''
    DESCRIPTION
    INPUT
    OUTPUT
    '''
    def __is_shallow__(self):
        for a in self.b.values():
            if type(a) == Branch: return False
        return True

    '''
    DESCRIPTION
    INPUT
    OUTPUT
    '''
    def __traverse__(self,context):
        b = self[context[0]]
        s = self.s[context[0]]
        context = context[len(s[0]):]
        while len(context) > 0 and len(b) > 1:
            s = b.s[context[0]]
            b = b[context[0]]
            context = context[len(s[0]):]
        return b
    
    def __setitem__(self,index,value):
        if type(index) == str: self.b[Stalk(index)] = value
        else: self.b[index] = value

    def __contains__(self,other): 
        if type(other) == str: return Stalk(other) in self.b
        return other in self.b

    def __len__(self): return len(self.b)

    def pop(self,index): return self.b.pop(index)

    '''
    DESCRIPTION
        adds a suffix to the trie
    INPUT
        stalk | a Stalk() which is a common substring of every read up to this point and beyond
        reads | a set of reads which have with the same common substring up to this point
    '''
    def add(self,stalk,reads):
        if stalk in self:
            if not len(stalk):
                self[stalk].right+=1
                self.s[stalk][1].update(reads)
                return
            if type(self[stalk]) == Leaf:
                branch = Branch()
                l1 = self.pop(stalk)
                stalk_ = list(self.s.pop(stalk))
                stalk_[0],l1.left,l2 = stalk_[0].common_substring(stalk)
                branch.add(l1.left,stalk_[1].copy())
                stalk_[1].update(reads)
                branch.add(l2,reads)
                stalk_ = tuple(stalk_)
                self[stalk_[0]] = branch
                self.s[stalk_[0]] = stalk_
            else:
                stalk_ = list(self.s.pop(stalk))
                branch = self.pop(stalk)
                stalk_[0],bstalk,stalk = stalk_[0].common_substring(stalk)
                if len(bstalk):
                    br = Branch()
                    br[bstalk] = branch 
                    br.s[bstalk] = (bstalk,stalk_[1].copy())
                    br.add(stalk,reads)
                    self[stalk_[0]] = br
                else: 
                    branch.add(stalk,reads)
                stalk_[1].update(reads)
                stalk_ = tuple(stalk_)
                if not len(bstalk): self[stalk_[0]] = branch
                self.s[stalk_[0]] = stalk_
        else:
            if type(stalk) == str: stalk = Stalk(stalk)
            self.s[stalk] = (stalk,reads)
            self[stalk] = Leaf(stalk)

In [6]:
'''
DESCRIPTION
    an object which constructs a suffix trie out of fragments of a sequence and can traverse 
    the trie to resconstruct some target sequence
INPUT
    reads | a list of strings which overlap and are fragments of a longer sequence
'''
class Sequitur:
    def __init__(self,reads,k_min=3,**kwargs):
        if "correct_sequence" in kwargs: self.correct_sequence = kwargs["correct_sequence"]
        self.branch = Branch()
        self.reads = self.remove_containments(reads)
        self.k_min = k_min
        for read in reads:
            for i in range(len(read)):
                if len(read[i:]) < self.k_min: continue 
                self.branch.add(Stalk(read[i:]),{read})
        if "len" in kwargs or ("assemble" in kwargs and kwargs["assemble"]): self.phase1(**kwargs)

    def phase1(self,**kwargs): 
        if len(self.reads) == 1: 
            self.sequence = self.reads[0]
            return True
        else:
            if "k_min_add" not in kwargs: kwargs["k_min_add"] = 0   
            extensions = {}
            stalks = self.branch.b.keys()
            for stalk in stalks: self.longest_common_substring(self.branch,stalk,[stalk.stalk],extensions)
            k_max = max(extensions.keys())
            i = 0
            overlaps = {}
            if "len" in kwargs and kwargs["len"] == len(self.reads):
                if self.k_min + kwargs["k_min_add"] < k_max: kwargs["k_min_add"] += 1
                else:
                    if kwargs["assemble"]: kwargs["assemble"] = False
                    for v in extensions.values():
                        for k_, v_ in v.items():
                            for read in v_['endswith']:
                                if read not in overlaps: overlaps[read] = set()
                                for extension in v_['is_in']:
                                    for _ in range(extension.count(k_)):
                                        pre = extension[:extension.find(k_)+len(k_)]
                                        suf = extension[extension.find(k_)+len(k_):]
                                        if read.endswith(pre): overlaps[read].add((extension,suf))
                    if "biphasic" in kwargs and kwargs["biphasic"]:
                        if self.phase2(overlaps,**kwargs): return True
                        else: 
                            self.sequence = self.reads
                            return False
                    else:
                        self.sequence = self.reads
                        return True
            else:
                kwargs["k_min_add"] = 0 
                kwargs["len"] = len(self.reads)
            for read in self.reads:
                while min(k_max,len(read)-1) - i > self.k_min + kwargs["k_min_add"]:
                    if read[:min(k_max,len(read)-1)-i] in extensions[min(k_max,len(read)-1)-i]:
                        if len(extensions[min(k_max,len(read)-1)-i][read[:min(k_max,len(read)-1)-i]]['endswith']) > 1: i += 1
                        else: break
                    else: i+=1
                if (read[:min(k_max,len(read)-1)-i] in extensions[min(k_max,len(read)-1)-i] and len(extensions[min(k_max,len(read)-1)-i][read[:min(k_max,len(read)-1)-i]]['endswith']) > 1)\
                or read[:min(k_max,len(read)-1)-i] not in extensions[min(k_max,len(read)-1)-i] or read not in extensions[min(k_max,len(read)-1)-i][read[:min(k_max,len(read)-1)-i]]['is_in']:
                    i = 0
                    continue
                if list(extensions[min(k_max,len(read)-1)-i][read[:min(k_max,len(read)-1)-i]]['endswith'])[0] not in overlaps: overlaps[list(extensions[min(k_max,len(read)-1)-i][read[:min(k_max,len(read)-1)-i]]['endswith'])[0]] = (read[:min(k_max,len(read)-1)-i],read,read[min(k_max,len(read)-1)-i:])
                else: overlaps[list(extensions[min(k_max,len(read)-1)-i][read[:min(k_max,len(read)-1)-i]]['endswith'])[0]] = ('','','')
                i = 0
            if len(overlaps):
                overlaps = list(overlaps.items())
                overlaps.sort(key=lambda e: len(e[1][0]),reverse=True)
                overlaps = dict(overlaps)
                key = list(overlaps.keys())[0]
                seq = key
                self.reads.remove(key)
                while key in overlaps:
                    if len(overlaps[key][0]) < sum(len(o[0]) for o in overlaps.values())/len(overlaps): break
                    seq += overlaps[key][2]
                    key = overlaps[key][1]
                    if key not in self.reads or not len(key): break
                    self.reads.remove(key)
                self.reads += [seq]
            self.__init__(self.reads,self.k_min,**kwargs)
        
    def phase2(self,overlaps,**kwargs):
        for read,extensions in overlaps.items():
            if not len(extensions): continue
            self.reads.remove(read)
            for extension in extensions:
                self.reads.remove(extension[0])
                kwargs["len"] = len(self.reads)+1
                self.__init__(self.reads+[read+extension[1]],self.k_min,**kwargs)
                if self.phase1(**kwargs): return True
                else: 
                    self.reads.remove(read+extension[1])
                    self.reads += [extension[0]]
            self.reads += [read]
        return False
        
    def longest_common_substring(self,branch,stalk,substring,extensions):
        if branch.__is_shallow__():
            if len(branch.s[stalk][1]) > 1:
                if len(''.join(substring)) not in extensions: extensions[len(''.join(substring))] = {}
                if ''.join(substring) not in extensions[len(''.join(substring))]: extensions[len(''.join(substring))][''.join(substring)] = {'endswith':set(),'is_in':set()}
                for read in branch.s[stalk][1]:
                    if read.endswith(''.join(substring)): extensions[len(''.join(substring))][''.join(substring)]['endswith'].add(read)
                    else: extensions[len(''.join(substring))][''.join(substring)]['is_in'].add(read)
            return extensions
        if type(branch.__traverse__(stalk.stalk)) is Leaf:
            if len(branch.s[stalk][1]) > 1:
                if len(''.join(substring)) not in extensions: extensions[len(''.join(substring))] = {}
                if ''.join(substring) not in extensions[len(''.join(substring))]: extensions[len(''.join(substring))][''.join(substring)] = {'endswith':set(),'is_in':set()}
                for read in branch.s[stalk][1]:
                    if read.endswith(''.join(substring)): extensions[len(''.join(substring))][''.join(substring)]['endswith'].add(read)
                    else: extensions[len(''.join(substring))][''.join(substring)]['is_in'].add(read)
            return extensions
        for c in branch.__traverse__(stalk.stalk).b:
            if c.stalk != '$': extensions = self.longest_common_substring(branch.__traverse__(stalk.stalk),c,substring+[c.stalk],extensions)
            else: 
                if len(branch.s[stalk][1]) > 1:
                    if len(''.join(substring)) not in extensions: extensions[len(''.join(substring))] = {}
                    if ''.join(substring) not in extensions[len(''.join(substring))]: extensions[len(''.join(substring))][''.join(substring)] = {'endswith':set(),'is_in':set()}
                    for read in branch.s[stalk][1]:
                        if read.endswith(''.join(substring)): extensions[len(''.join(substring))][''.join(substring)]['endswith'].add(read)
                        else: extensions[len(''.join(substring))][''.join(substring)]['is_in'].add(read)
        return extensions
    
    def remove_containments(self,reads):
        i = 0
        r = set()
        m = max([len(r) for r in reads])
        b = False
        while i < len(reads):
            if len(reads[i]) == m: 
                r.add(reads[i])
                i+=1
                continue
            for r_ in r:
                if reads[i] in r_: 
                    b = True
                    break
            if b: 
                b = False
                i+=1
                continue
            r.add(reads[i])
            i+=1
        return list(r)

In [7]:
sequence = 'betty_bought_butter_the_butter_was_bitter_betty_bought_better_butter_to_make_the_bitter_butter_better'
reads = ['betty_bought_butter_th',
                        'tter_the_butter_was_',
                              'he_butter_was_bitter_',
                                         'as_bitter_betty_bought',
                                                     'tty_bought_better_butter_t',
                                                                     'r_butter_to_make_the_',
                                                                                   'ke_the_bitter_butter_better']
sequitur = Sequitur(reads,assemble=True)
sequitur.sequence == sequence if type(sequitur.sequence) is str else all(s in sequence for s in sequitur.sequence)

True

In [8]:
sequence = 'you say hello world, i bellow go to hell'
reads = ['you say hel',
            ' say hello wo',
                    'lo world, i be',
                          'ld, i bellow go t',
                                    'ow go to hell']
sequitur = Sequitur(reads,assemble=True)
sequitur.sequence == sequence if type(sequitur.sequence) is str else all(s in sequence for s in sequitur.sequence)

True

In [9]:
sequence = 'she_sells_sea_shells_on_the_sea_shore'
reads = ['she_sells_s',
               'lls_sea_shel',
                    'ea_shells_o',
                       'shells_on_the_s',
                                  'he_sea_s',
                                      'ea_shore']
sequitur = Sequitur(reads,assemble=True)
sequitur.sequence == sequence if type(sequitur.sequence) is str else all(s in sequence for s in sequitur.sequence)

True

In [10]:
successes = 0
n = 1
for seed in range(n):   
    sequence = generate_genome_sequence(10000,seed=seed)
    reads = generate_reads(sequence,250,500,50,100,seed=seed)
    sequitur = Sequitur(reads,assemble=True)
    s = '| Seed: ' + str(seed) + ' | '
    if type(sequitur.sequence) is str and sequitur.sequence == sequence:
        s+='SUC | ' + str(sequitur.sequence) + ' == ' + sequence
        successes+=1
    elif type(sequitur.sequence) is list and all(s in sequence for s in sequitur.sequence):
        s+='PAR | ' + str(sequitur.sequence) + ' ~~ ' + sequence
        successes+=0.5
    else: s+='FAI | ' + sequitur.sequence + ' != ' + sequence
    print(s)
    print('-----------------------------------------')
print('ACCURACY: '+str((successes/n)*100)+'%')

| Seed: 0 | SUC | TTAGTTGTGCCGCAGCGAAGTAGTGCTTGAAATATGCGACCCCTAAGTAGGAGCGTATGCGCCCAGTAACCAATGCCTGTTGAGATGCCAGACGCGTAACCAAAACATAGAAACCATCAATAGACAGGTCATAATCGGTCCACCGGATCATTGGTGCATAGAGCCTGGGCGTTAACGCCCTTTATTACTAGCTTAATGGTATCACATTGACAAACACGGCATTAAGTAGCGACGAAACGGGATTTGCCTGACCGGGGAGAAGCCGGTCGATCAGCAGTGGTAATTGGATATTAGGCCTAAACCATAATGTTCTAGCGCTCGAAATCATTGCACCACTTGCATCTTTGTTCCAGGGACGCTGTAAAACCAGATGCCTGTAAATCGTTTCAACGGGATGGTTTACCCGGAATTCTACGTATTTAATCAACGAGCTTAATGAGCTGACATTGCTGAAATGACCATGACTTAATAATCATTTATGGAGAAGAGGCACGACCACAAGGACCCTATGGCACGGTGGGCAAGCTCCCGCCCGGTACATAACTGTCTGGACTGATTATGTCGGTACAGACTTCTTCCTGCGTATCGATTACGAGCTTATCTGAAGAAGTTTAGGGCAAAGGGACCATGGCCATTGGTGCCAATTTCGGTTCTTGTATGCTACAGTTAAATAGAAAGGCCGCATTGTCGTTCTCGCCCTGTTTTCCTCATACACGACCGAGGTTATTTGTCGGAAACGAGACATCTCTCGAAGGTGGAACGACGCCGGGTGTGCAGAATTTATTTTAAACACTCTATTACCTCCGGGTAGCGTTGGCAAACTCCGATAATGAGCGCCAGGCGTGCCAGGACTCCACCTCCCCTGCTAAGTTGACCTTGAGCTCGGTACAGCGTCGGCGAGACGATAACAACGAAGTCCTTCGGCGTTATGTAATTCACCAGCCCACCATATCAGGTAATAGGCTCGCTGGTTAGGTAGATT

KeyboardInterrupt: 

    SUC: returns the target sequence fully reconstructed
    PAR: returns contigs all of which exist in the target sequence (consider coverage?)
    FAI: returns a full sequence that is incorrectly reconstructed or a set of contigs where at least one is not found in the target sequence

In [12]:
sequence = generate_genome_sequence(10000,seed=seed)
reads = generate_reads(sequence,250,500,50,100,seed=seed)
reads

['CAGATTTTCATATTATGCAGAAAATCTACTTCGCCTGATACGAGTCGGTTATCTTCGGATACTGTATAGTCCCACCTGGTGATCCTATGCTTGTGAGTACCCAGAAAATAGCGACGGACCGCGGTGTTAAGTGTCGAGCTACATCACTTCTCATGTAGCCAGAAGGCTGCAACTCATCGACTCTATGTAGTGACCGCGTCGATGTCAAACCCCGGGGGGAGCTCAGATATCCGATACAGGGATGAAGAAATAACCTCATCCCATTGGTGACGAAAGGTTGTAAG',
 'GCTCAGATATCCGATACAGGGATGAAGAAATAACCTCATCCCATTGGTGACGAAAGGTTGTAAGTAGCTGGCCGCCGAGATAGCTGAGCGGCGAACCACTAGAAAAGGTTCAGACCCCGGAGCCCAGCCGTCACGATTGTTATGCGTATAAGCCCGGTTCACTACGTCCGTTCTGGCAAGCCGGGGCTAATCCGTCATTGTCAAGAGACATCTTTCGTCTCATTAGGCTACTAACGCCGCCGGGTCGTTACTCGAAAAGCAGGTGGAATTGGTGTATTCAGCTTGCTCGATTTGATCGATCTGCAAGGTGCTGTCTAGATAGATACCATGGCCCGGAAGTACGGGCTTCTGGCGCATGTCGCACTCGTCCCTGGTCACGAACTGTACAAACATTGGACACTCTTTCCCGTTCTGGTACAAAATGTGCTCCAATCATGCATGAAACAGATACATCGCTTGGGCCACG',
 'GTACAAAATGTGCTCCAATCATGCATGAAACAGATACATCGCTTGGGCCACGTAGTCTAGAGCACACTAAATGAGACATCTTAGAGGAGATAGGCGTAGATCCGGTTACTAGCCGTGATGCAAGGTGGGGGAACGGGATGTTGTAACATGCGGGTGTGCACGCCACTAAGACGAAACCTAGTGCCTCTTGCTAGTCATTATTAGTACGAAGGGTTGTGCTCCGATAGTTGAAAATGTG

In [20]:
sequence = 'ATGCCGTATGGACAACGACT'
reads = ['ATGCCGTATG','GCCGTATGGA','GTATGGACAA','GACAACGACT']
sequitur = Sequitur(reads,sequence,assemble=True)
sequitur.sequence,sequence

('ATGCCGTATGGACAACGACT', 'ATGCCGTATGGACAACGACT')

# DeBruijn Graph

In [14]:
# ! pip install networkx
import networkx as nx

In [15]:
def construct_debruijn_graph(reads,k=3):
    import networkx as nx
    
    G = nx.MultiDiGraph()
    for read in reads:
        for i in range(len(read)-k+1):
            G.add_edge(read[i:i+k-1],read[i+1:i+k])
    return G

def all_eulerian_paths_of(G):
    paths = []
    g = nx.DiGraph()
    while len(list(nx.selfloop_edges(G))):
        g.add_edge(list(nx.selfloop_edges(G))[0][0],list(nx.selfloop_edges(G))[0][1])
        G.remove_edges_from(g.edges)
        paths += [g.copy()]
        g.clear()
    n = min(i[1] for i in G.in_degree())
    while n <= max(i[1] for i in G.in_degree()):
        if not len(g): 
            if len(nx.subgraph_view(G,filter_node=lambda node: G.in_degree(node)==n)): 
                if len(G.in_edges(nx.subgraph_view(G,filter_node=lambda node: G.in_degree(node)==n).nodes())): 
                    edge = list(G.in_edges(nx.subgraph_view(G,filter_node=lambda node: G.in_degree(node)==n).nodes()))[0]
                    if len(set(G.in_edges(nx.subgraph_view(G,filter_node=lambda node: G.in_degree(node)==n).nodes())).intersection(G.in_edges(edge[0]))): 
                        edge = list(set(G.in_edges(nx.subgraph_view(G,filter_node=lambda node: G.in_degree(node)==n).nodes())).intersection(G.in_edges(edge[0])))[0]
                else: edge = list(G.out_edges(nx.subgraph_view(G,filter_node=lambda node: G.in_degree(node)==n).nodes()))[0]
            else: 
                n+=1
                continue
        else: edge = list(G.out_edges([edge[1]]))[0]
        g.add_edge(edge[0],edge[1])
        if G.out_degree(edge[1]) != 1:
            G.remove_edges_from(g.edges)
            paths += [g.copy()]
            g.clear()
            if not len(G.edges): break
            n = min(i[1] for i in nx.subgraph_view(G,filter_node=lambda node: G.out_degree(node) > 0).in_degree())
    return paths

def assemble(G):
    contigs = []
    for g in G:
        seq = ''
        init = True
        for n in nx.eulerian_path(g):
            if init: 
                seq = n[0] + n[1][-1]
                init = False
                continue
            seq += n[1][-1]
        contigs += [seq]
    return contigs

In [16]:
sequence = 'betty_bought_butter_the_butter_was_bitter_betty_bought_better_butter_to_make_the_bitter_butter_better'
reads = ['betty_bought_butter_th',
                        'tter_the_butter_was_',
                              'he_butter_was_bitter_',
                                         'as_bitter_betty_bought',
                                                     'tty_bought_better_butter_t',
                                                                     'r_butter_to_make_the_',
                                                                                   'ke_the_bitter_butter_better']
for k in range(3,min(len(read) for read in reads)):
      contigs = assemble(all_eulerian_paths_of(construct_debruijn_graph(reads,k)))
      if len(contigs) == 1 and contigs[0] == sequence: print("| k = " + str(k) + " | SUC | "+contigs[0] )
      elif all(seq in sequence for seq in contigs): print("| k = " + str(k) + " | PAR | " + str(contigs))
      else: print("| k = " + str(k) + " | FAI | " + str(contigs) )

| k = 3 | PAR | ['_to_make', 'ke_', 'ke_', 'ht_', 'ht_', 't_b', 't_b', 'r_w', 'r_w', '_wa', '_wa', 'was', 'was', 'as_', 'as_', 'as_', 's_b', 's_b', '_be', '_be', '_be', 'bet', 'bet', 'bet', 'bet', 'ett', 'ett', 'ett', 'ett', 'tty', 'tty', 'tty', 'ty_', 'ty_', 'ty_', 'y_b', 'y_b', 'y_b', '_bo', '_bo', '_bo', 'bou', 'bou', 'bou', 'oug', 'oug', 'oug', 'ugh', 'ugh', 'ugh', 'ght', 'ght', 'ght', 'the', 'the', 'the', 'he_', 'he_', 'he_', 'he_', 'e_b', 'e_b', 'e_b', 'e_t', 'e_t', '_bi', '_bi', '_bi', 'bit', 'bit', 'bit', 'itt', 'itt', 'itt', 'r_t', 'r_t', 'r_t', 'r_t', '_th', '_th', '_th', '_th', 'r_b', 'r_b', 'r_b', 'r_b', 'r_b', '_bu', '_bu', '_bu', '_bu', '_bu', '_bu', 'but', 'but', 'but', 'but', 'but', 'but', 'utt', 'utt', 'utt', 'utt', 'utt', 'utt', 'tte', 'tte', 'tte', 'tte', 'tte', 'tte', 'tte', 'tte', 'tte', 'tte', 'tte', 'tte', 'ter', 'ter', 'ter', 'ter', 'ter', 'ter', 'ter', 'ter', 'ter', 'ter', 'ter', 'ter', 'er_', 'er_', 'er_', 'er_', 'er_', 'er_', 'er_', 'er_', 'er_', 'er_', 'er_'

In [17]:
sequence = 'you say hello world, i bellow go to hell'
reads = ['you say hel',
            ' say hello wo',
                    'lo world, i be',
                          'ld, i bellow go t',
                                    'ow go to hell']
for k in range(3,min(len(read) for read in reads)):
      contigs = assemble(all_eulerian_paths_of(construct_debruijn_graph(reads,k)))
      if len(contigs) == 1 and contigs[0] == sequence: print("| k = " + str(k) + " | SUC | "+contigs[0] )
      elif all(seq in sequence for seq in contigs): print("| k = " + str(k) + " | PAR | " + str(contigs))
      else: print("| k = " + str(k) + " | FAI | " + str(contigs) )

| k = 3 | PAR | ['you s', ' sa', ' sa', 'say', 'say', 'ay ', 'ay ', 'y h', 'y h', 'o h', ' he', ' he', ' he', 'hel', 'hel', 'hel', 'bel', 'ell', 'ell', 'ell', 'llo', 'llo', 'lo ', 'lo ', 'low', 'ow ', 'ow ', 'w g', 'w g', ' go', ' go', 'go ', 'go ', ' to ', 'o w', 'o w', 'o t', 'o t', ' world', ' wo', 'ld,', 'ld,', 'd, ', 'd, ', ', i', ', i', ' i ', ' i ', 'i b', 'i b', ' be', ' be']
| k = 4 | PAR | ['you sa', ' say', ' say', 'say ', 'say ', 'ay h', 'ay h', 'y he', 'y he', 'to he', ' hel', ' hel', ' hel', 'hell', 'hell', ' bell', 'ello', 'ello', 'llo ', 'llow ', 'lo w', 'lo w', 'o world,', 'o wo', 'ld, ', 'ld, ', 'd, i', 'd, i', ', i ', ', i ', ' i b', ' i b', 'i be', 'i be', 'ow g', 'ow g', 'w go', 'w go', ' go ', ' go ', 'go to ', 'go t']
| k = 5 | PAR | ['you say', ' say ', ' say ', 'say h', 'say h', 'ay he', 'ay he', 'y hel', 'y hel', 'to hel', ' hello', ' hell', ' bello', 'ello w', 'ellow g', 'lo world, ', 'lo wo', 'ld, i', 'ld, i', 'd, i ', 'd, i ', ', i b', ', i b', ' i bel', ' 

In [18]:
sequence = 'she_sells_sea_shells_on_the_sea_shore'
reads = ['she_sells_s',
               'lls_sea_shel',
                    'ea_shells_o',
                       'shells_on_the_s',
                                  'he_sea_s',
                                      'ea_shore']
for k in range(3,min(len(read) for read in reads)):
      contigs = assemble(all_eulerian_paths_of(construct_debruijn_graph(reads,k)))
      if len(contigs) == 1 and contigs[0] == sequence: print("| k = " + str(k) + " | SUC | "+contigs[0])
      elif all(seq in sequence for seq in contigs): print("| k = " + str(k) + " | PAR | " + str(contigs))
      else: print("| k = " + str(k) + " | FAI | " + str(contigs))

| k = 3 | PAR | ['_on_the', 'shore', 'sea', 'sea', 'ea_', 'ea_', 'ea_', 'ea_', 'a_s', 'a_s', 'a_s', 'a_s', '_sh', '_sh', '_sh', 'she', 'she', 'she', 'she', 'he_', 'he_', 'he_', 'hel', 'hel', 'hel', 'e_s', 'e_s', 'e_s', 'sel', 'ell', 'ell', 'ell', 'lls', 'lls', 'lls', 'lls', 'ls_', 'ls_', 'ls_', 'ls_', 's_s', 's_s', '_se', '_se', '_se', 's_o', 's_o']
| k = 4 | PAR | ['_sell', 's_on_the_', 'she_', 'he_s', 'he_s', 'he_s', 'e_se', 'e_se', 's_se', '_sea', '_sea', 'sea_', 'sea_', 'ea_s', 'ea_s', 'ea_s', 'ea_s', 'a_sh', 'a_sh', 'a_sh', '_she', '_she', 'shel', 'shel', 'shel', '_shore', 'hell', 'hell', 'ells', 'ells', 'ells', 'lls_', 'lls_', 'lls_', 'lls_', 'ls_s', 'ls_s', 'ls_o', 'ls_o']
| k = 5 | PAR | ['she_s', '_the_s', 'he_se', 'he_se', 'e_sells', 'e_sea', 'ls_sea', '_sea_', '_sea_', 'sea_s', 'sea_s', 'ea_sh', 'ea_sh', 'ea_sh', 'a_she', 'a_she', 'a_shore', '_shel', '_shel', 'shell', 'shell', 'hells', 'hells', 'ells_', 'ells_', 'ells_', 'lls_s', 'lls_s', 'lls_on_the', 'lls_o']
| k = 6 | PAR

In [19]:
sequence = 'ATGCCGTATGGACAACGACT'
reads = ['ATGCCGTATG','GCCGTATGGA','GTATGGACAA','GACAACGACT']
for k in range(3,min(len(read) for read in reads)):
      contigs = assemble(all_eulerian_paths_of(construct_debruijn_graph(reads,k)))
      if len(contigs) == 1 and contigs[0] == sequence: print("| k = " + str(k) + " | SUC | "+contigs[0])
      elif all(seq in sequence for seq in contigs): print("| k = " + str(k) + " | PAR | " + str(contigs))
      else: print("| k = " + str(k) + " | FAI | " + str(contigs))

| k = 3 | PAR | ['TGC', 'GCC', 'GCC', 'CCG', 'CCG', 'ACG', 'CGT', 'CGT', 'CGA', 'GTA', 'GTA', 'GTA', 'TAT', 'TAT', 'TAT', 'ATG', 'ATG', 'ATG', 'ATG', 'TGG', 'TGG', 'GGA', 'GGA', 'GAC', 'GAC', 'GAC', 'AAC', 'ACA', 'ACA', 'ACT', 'CAA', 'CAA']
| k = 4 | PAR | ['ATGCC', 'GCCG', 'GCCG', 'CCGT', 'CCGT', 'CGTA', 'CGTA', 'GTAT', 'GTAT', 'GTAT', 'TATG', 'TATG', 'TATG', 'ATGG', 'ATGG', 'TGGAC', 'TGGA', 'ACGAC', 'GACA', 'GACA', 'GACT', 'ACAACG', 'ACAA']
| k = 5 | PAR | ['ATGCCG', 'GCCGT', 'GCCGT', 'CCGTA', 'CCGTA', 'CGTAT', 'CGTAT', 'GTATG', 'GTATG', 'GTATG', 'TATGG', 'TATGG', 'ATGGACA', 'ATGGA', 'GACAACGACT', 'GACAA']
| k = 6 | PAR | ['ATGCCGT', 'GCCGTA', 'GCCGTA', 'CCGTAT', 'CCGTAT', 'CGTATG', 'CGTATG', 'GTATGG', 'GTATGG', 'TATGGACAACGACT', 'TATGGA']
| k = 7 | PAR | ['ATGCCGTA', 'GCCGTAT', 'GCCGTAT', 'CCGTATGG', 'CCGTATG', 'GTATGGACAA', 'GTATGGA', 'GACAACGACT']
| k = 8 | PAR | ['ATGCCGTAT', 'GCCGTATGGACAA', 'GCCGTATG', 'GACAACGACT']
| k = 9 | PAR | ['ATGCCGTATGGA', 'GTATGGACAA', 'GACAACGACT']


In [106]:
sequence = generate_genome_sequence(10000,seed=seed)
reads = generate_reads(sequence,250,500,100,180,seed=seed)
# for k in range(3,min(len(read) for read in reads)):
k = 183
contigs = assemble(all_eulerian_paths_of(construct_debruijn_graph(reads,k)))
# if len(contigs) == 1 and contigs[0] == sequence: print("| k = " + str(k) + " | SUC | "+contigs[0])
# elif all(seq in sequence for seq in contigs): print("| k = " + str(k) + " | PAR | " + str(contigs))
# else: print("| k = " + str(k) + " | FAI | " + str(contigs))

In [107]:
len(reads),len(contigs)

(39, 39)

In [108]:
all(r==c for r,c in zip(reads,contigs))

True

In [None]:
from Bio import SeqIO

In [None]:
reads = []
for record in SeqIO.parse("data\Batrachochytrium_dendrobatidis\GCA_000203795.1\GCA_000203795.1_v1.0_genomic.fna",'fasta'):
    reads += [record.seq]
reads

[Seq('CATGCAATAAAATAGTCTTTCTGTacttttttcagttgcatcgtttctgattcc...TCC'),
 Seq('ACAATAATCCGACGCAATTACCAATTAATAATTACCCGACGGATTGCTCCGACA...TTC'),
 Seq('TCAGAGAAGACCTGACCCAAAAGAAGTAATCGTGTctatcagtcaatcaacaat...CTG'),
 Seq('GGCGAGAGGCGCGACCCGAGAATATTTACCCCCGACTGTCCCTATTCATCATTA...TAT'),
 Seq('tttttttgcaatttcatgacaatacatgcatgcatcgtacataccaatgcaata...GGA'),
 Seq('ctaatcaaacaactggtcagaatgcaggttaaatactagttaaaatactcgtca...cac'),
 Seq('CGGTTGGATGCCGTGGAAGAAGACACTTGACTGAGGGAGCGTCCTACATAAATC...TAA'),
 Seq('AtaccatgtttggcttgccgCGTCTGAGGAAACGatacccaattgtatcattcc...ATA'),
 Seq('accctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaacccta...ATG'),
 Seq('ATAAAAAAAgagaaaagaaaaaaaggaagaaaagaaaagaaagGATACAAGGAT...TAT'),
 Seq('CACAAATAAACCGGTTACACCATTATCCATCAAAGCTGGGAACCTGTCCATAAT...TTT'),
 Seq('attttattgcaatcatttgtagtCCCATCAGTAACACTCAGGATAGTATCCATA...CGA'),
 Seq('TAACACAACgccaatagcagttatttcccagccatatttaaactgattgtgtat...TCA'),
 Seq('CGATGGTCGAAATAGCGTGTATAATTCAAGTCGCGATGGAAATATTTTTGGCTG...GGA'),
 Seq('Agcctaaccctaac