In [7]:
import random
def generate_reads(seq, k, min_overlap, max_overlap,seed=None):
    """Cut `seq` into one read per start position, each with a random length.

    A read starts at every index from 0 to len(seq)-k and nominally spans
    k + overlap characters, where overlap is drawn uniformly from
    [min_overlap, max_overlap] (both inclusive). Slicing clips at the end of
    `seq`, so reads starting near the end are shorter than k + min_overlap.

    INPUT
        seq         | the string to fragment
        k           | base read length
        min_overlap | smallest number of extra characters appended to a read
        max_overlap | largest number of extra characters appended to a read
        seed        | seed for the random generator (None = nondeterministic)
    OUTPUT
        a list of reads in start-position order
    """
    # A private generator yields the same draws as random.seed(seed) followed
    # by random.randint(...), without reseeding the module-global state that
    # other code may be relying on.
    rng = random.Random(seed)
    return [seq[start:start + k + rng.randint(min_overlap, max_overlap)]
            for start in range(len(seq) - k + 1)]

def generate_genome_sequence(n,seed=None):
    """Return a random nucleotide string of length `n` over the alphabet ACGT.

    INPUT
        n    | number of nucleotides to generate
        seed | seed for the random generator (None = nondeterministic)
    OUTPUT
        a string of `n` characters drawn uniformly from 'A', 'C', 'G', 'T'
    """
    nucleotides = {1:'A',2:'C',3:'G',4:'T'}
    # A private generator keeps the per-seed output identical to seeding the
    # global module, but leaves the global random state untouched.
    rng = random.Random(seed)
    # join() avoids rebuilding the string on every iteration.
    return ''.join(nucleotides[rng.randint(1,4)] for _ in range(n))

def remove_containments(reads):
    """Drop duplicate reads and reads that are substrings of another read.

    Reads are examined longest first, so a read is discarded whenever any
    other read contains it, regardless of the order of the input.

    INPUT
        reads | an iterable of strings
    OUTPUT
        a sorted list of the reads that are not contained in any other read
        (empty when `reads` is empty)
    """
    kept = []
    # A read can only be contained in a read that is at least as long, so by
    # visiting candidates in descending length every possible container has
    # already been kept (or is itself inside something that was kept).
    for read in sorted(set(reads), key=len, reverse=True):
        if not any(read in longer for longer in kept):
            kept.append(read)
    # Sorting makes the result reproducible across interpreter sessions
    # without revealing where each read sat in the source sequence.
    return sorted(kept)

# Sequitur

In [8]:
class Stalk:
    """
    DESCRIPTION
        a thin wrapper around a string that Branch uses as a dictionary key;
        an empty string is stored as the sentinel '$'
    INPUT
        stalk | the string to wrap
    NOTE
        equality and hashing only look at the FIRST character, so two stalks
        that start with the same character are the same dictionary key
    """
    def __init__(self,stalk):
        # the empty stalk is represented by the '$' sentinel
        self.stalk = stalk if len(stalk) != 0 else '$'

    def __repr__(self):
        return self.stalk

    def __eq__(self,other):
        # first-character comparison only (works against plain strings too)
        return self[0] == other[0]

    def __hash__(self):
        return hash(self[0])

    def __getitem__(self,index):
        return self.stalk[index]

    def __len__(self):
        # the sentinels '$' and '^' count as zero-length
        is_sentinel = self.stalk == '$' or self.stalk == '^'
        return 0 if is_sentinel else len(self.stalk)

    def __str__(self):
        return '' if self.stalk == '$' else self.stalk

    def common_substring(self,other):
        """
        DESCRIPTION
            splits this stalk and `other` at the end of their common prefix
        INPUT
            other | a Stalk() or a plain string (which is wrapped first)
        OUTPUT
            a 3-tuple of Stalk(): the common prefix, the rest of this stalk
            and the rest of `other`
        """
        if type(other) == str: other = Stalk(other)
        limit = min(len(self),len(other))
        split = 0
        while split < limit and self[split] == other[split]:
            split += 1
        return Stalk(self[:split]), Stalk(self[split:]), Stalk(other[split:])

In [9]:
class Leaf:
    """
    DESCRIPTION
        a terminal node stored in a Branch
    INPUT
        left  | the label of the leaf (a Stalk() or a string)
        right | only used when `left` is non-empty: it is wrapped in a
              | nested Leaf(); for a zero-length `left` the attribute
              | `right` is the integer 1, which Branch.add increments
    """
    def __init__(self,left,right=''):
        self.left = left
        # zero-length label -> counter starting at 1, otherwise a nested leaf
        self.right = Leaf(right) if len(left) != 0 else 1

    def __repr__(self):
        return str(self.right)

    def __eq__(self,other):
        # like Stalk, leaves compare on their first character only
        return self[0] == other[0]

    def __hash__(self):
        return hash(self.left)

    def __getitem__(self,index):
        return self.left[index]

    def __len__(self):
        return 0 if self.left == '$' else len(self.left)

    def __is_shallow__(self):
        # a leaf has no child branches
        return True

    def reads(self):
        # a leaf carries no reads of its own
        return set()

In [10]:
class Sequence:
    """
    DESCRIPTION
        a string under construction together with its bookkeeping
    INPUT
        seq  | the initial string (None gives an empty sequence)
        seen | a tuple of reads already used; defaults to (seq,) and is
             | ignored when seq is None
    ATTRIBUTES
        seq        | the string built so far
        seen       | a tuple of the reads that have been used
        extensions | a list of candidate extensions
        contains   | a set of reads found inside `seq`
    """
    def __init__(self,seq=None,seen=None):
        self.extensions = []
        self.contains = set()
        if seq is None:
            self.seq = ''
            self.seen = tuple()
        else:
            self.seq = seq
            self.seen = (seq,) if seen is None else seen

    def __repr__(self): return self.seq

    def copy(self):
        """
        DESCRIPTION
            returns an independent snapshot of this sequence
        OUTPUT
            a new Sequence() with its own `extensions` list and `contains` set
        """
        c = Sequence()
        # Copy the mutable containers: sharing them would let later in-place
        # changes (e.g. __add__) silently alter the snapshot.
        c.extensions = list(self.extensions)
        c.contains = set(self.contains)
        c.seq = self.seq
        c.seen = self.seen
        return c

    def reset(self, other):
        """
        DESCRIPTION
            overwrites this sequence with the state of another one
        INPUT
            other | the Sequence() to restore from; it is left untouched and
                  | stays independent of this sequence afterwards
        """
        self.extensions = list(other.extensions)
        self.contains = set(other.contains)
        self.seq = other.seq
        self.seen = other.seen

    def add(self,item):
        """
        DESCRIPTION
            records a read as seen
        INPUT
            item | the read to record; ignored if already in `seen`
        """
        if item not in self.seen: self.seen = self.seen + (item,)

    def __add__(self,other):
        """
        DESCRIPTION
            appends another sequence to this one IN PLACE
        INPUT
            other | the Sequence() to append; seen reads are also removed
                  | from its `extensions` list
        OUTPUT
            self, after concatenating `seq`, merging `seen` and joining both
            `extensions` lists without entries that have already been seen
        """
        self.seq += other.seq
        self.seen = tuple(list(self.seen)+list(other.seen))
        # list.remove drops one occurrence per seen entry
        for a in set(self.extensions).intersection(self.seen):
            self.extensions.remove(a)
        for a in set(other.extensions).intersection(self.seen):
            other.extensions.remove(a)
        self.extensions+=other.extensions
        return self

In [11]:
class Branch:
    """
    DESCRIPTION
        an internal node of the suffix trie
    ATTRIBUTES
        b | a dictionary from Stalk() to the child below it (a Leaf() or
          | another Branch())
        s | a dictionary with the same keys, each pointing to a 2-tuple of
          | the Stalk() labelling that edge and the set of reads passing
          | through it
    NOTE
        Stalk() hashes and compares on its first character, so both
        dictionaries are effectively indexed by first character; plain
        strings used as an index are wrapped in Stalk() first
    """
    def __init__(self):
        self.b = {}
        self.s = {}

    def __repr__(self): return repr(self.b)

    def __str__(self):
        # one child per line; an empty branch renders as '' instead of raising
        return '\n'.join(str(child) for child in self.b.values())

    def __getitem__(self,index):
        if type(index) == str: return self.b[Stalk(index)]
        return self.b[index]

    def reads(self):
        """
        DESCRIPTION
            collects the reads recorded on every edge leaving this branch
        OUTPUT
            a new set holding the union of the read sets in `s`
        """
        r = set()
        for s in self.s.values():
            r.update(s[1])
        return r

    def __is_shallow__(self):
        """
        DESCRIPTION
            tells whether this branch has no Branch() among its children
        OUTPUT
            False if any direct child is a Branch(), True otherwise
        """
        for a in self.b.values():
            if type(a) == Branch: return False
        return True

    def __traverse__(self,context):
        """
        DESCRIPTION
            walks down the trie along a string
        INPUT
            context | the string to follow; at every level the child keyed by
                    | its first character is taken and as many characters as
                    | that edge's stalk is long are consumed
        OUTPUT
            the node reached once the context is used up or the current node
            has a len() of 1 or less
        """
        b = self[context[0]]
        s = self.s[context[0]]
        context = context[len(s[0]):]
        while len(context) > 0 and len(b) > 1:
            s = b.s[context[0]]
            b = b[context[0]]
            context = context[len(s[0]):]
        return b

    def __setitem__(self,index,value):
        if type(index) == str: self.b[Stalk(index)] = value
        else: self.b[index] = value

    def __contains__(self,other):
        if type(other) == str: return Stalk(other) in self.b
        return other in self.b

    def __len__(self): return len(self.b)

    def pop(self,index): return self.b.pop(index)

    def add(self,stalk,reads):
        """
        DESCRIPTION
            adds a suffix to the trie
        INPUT
            stalk | a Stalk() which is a common substring of every read up to this point and beyond
            reads | a set of reads which have with the same common substring up to this point
        """
        if stalk in self:
            # an edge starting with the same character already exists
            if not len(stalk):
                # zero-length stalk: bump the counter on the existing leaf
                # and merge the reads
                self[stalk].right+=1
                self.s[stalk][1].update(reads)
                return
            if type(self[stalk]) == Leaf:
                # split the leaf: the common prefix becomes the edge to a new
                # branch that holds the two differing remainders
                branch = Branch()
                l1 = self.pop(stalk)
                stalk_ = list(self.s.pop(stalk))
                stalk_[0],l1.left,l2 = stalk_[0].common_substring(stalk)
                branch.add(l1.left,stalk_[1].copy())
                stalk_[1].update(reads)
                branch.add(l2,reads)
                stalk_ = tuple(stalk_)
                self[stalk_[0]] = branch
                self.s[stalk_[0]] = stalk_
            else:
                # the existing child is a branch
                stalk_ = list(self.s.pop(stalk))
                branch = self.pop(stalk)
                stalk_[0],bstalk,stalk = stalk_[0].common_substring(stalk)
                if len(bstalk):
                    # the new stalk leaves the existing edge part-way: insert
                    # an intermediate branch at the end of the common prefix
                    br = Branch()
                    br[bstalk] = branch 
                    br.s[bstalk] = (bstalk,stalk_[1].copy())
                    br.add(stalk,reads)
                    self[stalk_[0]] = br
                else: 
                    # the whole edge matched: continue in the child branch
                    branch.add(stalk,reads)
                stalk_[1].update(reads)
                stalk_ = tuple(stalk_)
                if not len(bstalk): self[stalk_[0]] = branch
                self.s[stalk_[0]] = stalk_
        else:
            # no edge starts with this character yet: hang a new leaf here
            if type(stalk) == str: stalk = Stalk(stalk)
            self.s[stalk] = (stalk,reads)
            self[stalk] = Leaf(stalk)

    def _collect_extensions(self,reads,exclude,context,sequence,extensions):
        """
        DESCRIPTION
            examines a set of reads and appends every usable extension of
            `sequence` to the list `extensions` (helper of extensions())
        """
        for read in reads:
            if read in exclude: continue
            if read in sequence.seq:
                # already part of the sequence: remember it, nothing to extend
                sequence.contains.add(read)
                continue
            pre = ''
            suf = read
            # try every occurrence of the context in the read as split point
            for _ in range(read.count(context)):
                cut = suf.find(context)+len(context)
                pre += suf[:cut]
                suf = suf[cut:]
                if sequence.seq.endswith(pre) and len(pre) > len(context):
                    extensions += [(pre,Sequence(read),Sequence(suf,(read,)))]
            exclude.add(read)

    def extensions(self,exclude,context,sequence):
        """
        DESCRIPTION
            a method which returns the reads below this branch that can extend a sequence
        INPUT
            exclude    | a set of reads which should not be added as an extension; every
                       | read examined here is added to it, except reads already
                       | contained in the sequence
            context    | the substring shared by the sequence and the candidate reads
            sequence   | the Sequence() being extended; reads found inside sequence.seq
                       | are added to sequence.contains
        OUTPUT
            extensions | a list of 3-tuples holding the part of the read up to and
                       | including an occurrence of the context (kept only when the
                       | sequence ends with it and it is longer than the context),
                       | a Sequence() of the read and a Sequence() of the rest of the read
        """
        extensions = []
        for s in self.s:
            if type(self[s]) is Branch:
                for b in self[s].b:
                    if type(self[s].b[b]) is Branch:
                        # deeper branches report their own extensions
                        extensions += self[s].b[b].extensions(exclude,context,sequence)
                        continue
                    self._collect_extensions(self[s].s[self[s].b[b].left][1],exclude,context,sequence,extensions)
            else:
                self._collect_extensions(self.s[s][1],exclude,context,sequence,extensions)
        return extensions

TODO: [investigate more robust ways to select *k*](https://visualgo.net/en/suffixtree?slide=1)
- Candidates: longest common substring (LCS), longest repeated substring (LRS), or something similar?

In [26]:
class Sequitur:
    """
    DESCRIPTION
        an object which constructs a suffix trie out of fragments of a sequence and can traverse
        the trie to reconstruct some target sequence
    INPUT
        reads            | a list of strings which overlap and are fragments of a longer sequence;
                         | the list is copied, the caller's object is never modified
        correct_sequence | optional reference sequence; it is only stored on the instance
        k_min            | suffixes shorter than this are not inserted into the trie
        assemble         | keyword flag; when truthy assemble() is run right after construction
    ATTRIBUTES
        branch           | the root Branch() of the trie
        reads            | the current working list of reads (shrinks while assembling)
        sequence         | the assembled string; only set once assemble() is down to one read
    """
    def __init__(self,reads,correct_sequence=None,k_min=3,**kwargs):
        self.correct_sequence = correct_sequence
        self.branch = Branch()
        # assemble() removes and appends entries in place, so work on a copy
        # instead of the list handed in by the caller
        self.reads = list(reads)
        self.k_min = k_min
        for read in self.reads:
            for i in range(len(read)):
                if len(read[i:]) < self.k_min: continue 
                self.branch.add(Stalk(read[i:]),{read})
        if kwargs.get("assemble"): self.assemble()

    def assemble(self):
        """
        DESCRIPTION
            merges reads along their longest shared substrings until a single read is left,
            which is then stored in self.sequence
        STEPS
            1. record the substrings shared by several reads (longest_common_substring)
            2. for every read take its longest recorded prefix; one read ending with that
               prefix becomes its predecessor, and a predecessor claimed by two reads is
               marked ambiguous with ('','','')
            3. starting from the predecessor with the longest overlap, follow the chain and
               append the non-overlapping part of each successor
            4. replace the chained reads with the merged string, rebuild the trie and repeat
        """
        extensions = {}
        for stalk in self.branch.b.keys():
            self.longest_common_substring(self.branch,stalk,[stalk.stalk],extensions)
        k_max = max(extensions.keys())
        overlaps = {}
        for read in self.reads:
            # longest prefix of the read (at most k_max, shorter than the read)
            # that was recorded as a shared substring
            length = min(k_max,len(read)-1)
            while length > self.k_min and read[:length] not in extensions[length]: length -= 1
            prefix = read[:length]
            if prefix not in extensions[length] or read not in extensions[length][prefix]['is_in']:
                continue
            predecessor = list(extensions[length][prefix]['endswith'])[0]
            if predecessor not in overlaps: overlaps[predecessor] = (prefix,read,read[length:])
            else: overlaps[predecessor] = ('','','')
        # longest overlap first; the ambiguous ('','','') entries sort last
        overlaps = dict(sorted(overlaps.items(),key=lambda e: len(e[1][0]),reverse=True))
        key = list(overlaps.keys())[0]
        seq = key
        self.reads.remove(key)
        while key in overlaps:
            seq += overlaps[key][2]
            key = overlaps[key][1]
            if key not in self.reads or not len(key): break
            self.reads.remove(key)
        self.reads += [seq]
        if len(self.reads) == 1: 
            self.sequence = self.reads[0]
            return
        else:
            # rebuild the trie from the merged reads, keeping the settings
            # this instance was created with
            self.__init__(self.reads,self.correct_sequence,self.k_min,assemble=True)

    def _record_shared_substring(self,branch,stalk,substring,extensions):
        """
        DESCRIPTION
            files the joined `substring` under its length in `extensions` when more than one
            read passes through the edge `stalk` of `branch`; each of those reads is put in
            'endswith' if it ends with the substring and in 'is_in' otherwise
        """
        shared = branch.s[stalk][1]
        if len(shared) > 1:
            joined = ''.join(substring)
            by_substring = extensions.setdefault(len(joined),{})
            groups = by_substring.setdefault(joined,{'endswith':set(),'is_in':set()})
            for read in shared:
                if read.endswith(joined): groups['endswith'].add(read)
                else: groups['is_in'].add(read)

    def longest_common_substring(self,branch,stalk,substring,extensions):
        """
        DESCRIPTION
            walks the trie below an edge and records the substrings shared by several reads
        INPUT
            branch     | the Branch() owning the edge
            stalk      | the Stalk() key of the edge inside `branch`
            substring  | a list with the stalks on the path from the root to this edge
            extensions | the dictionary to fill: length -> substring ->
                       | {'endswith': set of reads, 'is_in': set of reads}
        OUTPUT
            the same `extensions` dictionary
        """
        if branch.__is_shallow__():
            self._record_shared_substring(branch,stalk,substring,extensions)
            return extensions
        node = branch.__traverse__(stalk.stalk)
        if type(node) is Leaf:
            self._record_shared_substring(branch,stalk,substring,extensions)
            return extensions
        for c in node.b:
            if c.stalk != '$': extensions = self.longest_common_substring(node,c,substring+[c.stalk],extensions)
            else: self._record_shared_substring(branch,stalk,substring,extensions)
        return extensions

In [27]:
# Demo 1: reassemble a tongue twister from seven hand-picked overlapping reads.
# Each fragment is indented to the column where it starts inside `sequence`,
# so the overlap between neighbouring reads can be read off visually.
sequence = 'betty_bought_butter_the_butter_was_bitter_betty_bought_better_butter_to_make_the_bitter_butter_better'
reads = ['betty_bought_butter_th',
                        'tter_the_butter_was_',
                              'he_butter_was_bitter_',
                                         'as_bitter_betty_bought',
                                                     'tty_bought_better_butter_t',
                                                                     'r_butter_to_make_the_',
                                                                                   'ke_the_bitter_butter_better']
sequitur = Sequitur(reads,sequence,assemble=True)
# Displayed as the cell output: True when the assembly equals the original.
sequitur.sequence == sequence

True

In [28]:
# Demo 2: a sentence in which the fragment 'hel' occurs in several places
# ('hello', 'bellow', 'hell'), reassembled from five overlapping reads.
sequence = 'you say hello world, i bellow go to hell'
reads = ['you say hel',
            ' say hello wo',
                    'lo world, i be',
                          'ld, i bellow go t',
                                    'ow go to hell']
sequitur = Sequitur(reads,sequence,assemble=True)
# Displayed as the cell output: True when the assembly equals the original.
sequitur.sequence == sequence

True

In [29]:
# Demo 3: a phrase with repeated fragments ('she', 'sea_sh', 'ells'),
# reassembled from six overlapping reads.
sequence = 'she_sells_sea_shells_on_the_sea_shore'
reads = ['she_sells_s',
               'lls_sea_shel',
                    'ea_shells_o',
                       'shells_on_the_s',
                                  'he_sea_s',
                                      'ea_shore']
sequitur = Sequitur(reads,sequence,assemble=True)
# Displayed as the cell output: True when the assembly equals the original.
sequitur.sequence == sequence

True

In [30]:
# Random-genome test, step 1: generate a 200-nucleotide sequence, cut it into
# reads (k=3 plus 4 to 10 extra characters each), drop contained reads and
# build the trie. No `assemble` flag is passed, so assembly is run separately.
seed = 1
sequence = generate_genome_sequence(200,seed=seed)
reads = remove_containments(generate_reads(sequence,3,4,10,seed=seed))
sequitur = Sequitur(reads,sequence)

In [31]:
# Random-genome test, step 2: assemble the `sequitur` built in the previous
# cell and display whether the result equals the generated sequence.
sequitur.assemble()
sequitur.sequence == sequence

True

In [32]:
# Batch evaluation: assemble one random 200-nucleotide genome per seed and
# report, for every seed, whether the assembly equals the original sequence,
# followed by the overall success rate in percent.
n = 200
successes = 0
for seed in range(n):
    sequence = generate_genome_sequence(200,seed=seed)
    reads = remove_containments(generate_reads(sequence,3,4,10,seed=seed))
    sequitur = Sequitur(reads,sequence,assemble=True)
    s = '| Seed: ' + str(seed) + ' | '
    if sequitur.sequence != sequence:
        s += 'FAILURE | ' + sequitur.sequence + ' != ' + sequence
    else:
        s += 'SUCCESS | ' + sequitur.sequence + ' == ' + sequence
        successes += 1
    print(s)
    print('-----------------------------------------')
print('ACCURACY: '+str((successes/n)*100)+'%')

| Seed: 0 | SUCCESS | TTAGTTGTGCCGCAGCGAAGTAGTGCTTGAAATATGCGACCCCTAAGTAGGAGCGTATGCGCCCAGTAACCAATGCCTGTTGAGATGCCAGACGCGTAACCAAAACATAGAAACCATCAATAGACAGGTCATAATCGGTCCACCGGATCATTGGTGCATAGAGCCTGGGCGTTAACGCCCTTTATTACTAGCTTAATGGT == TTAGTTGTGCCGCAGCGAAGTAGTGCTTGAAATATGCGACCCCTAAGTAGGAGCGTATGCGCCCAGTAACCAATGCCTGTTGAGATGCCAGACGCGTAACCAAAACATAGAAACCATCAATAGACAGGTCATAATCGGTCCACCGGATCATTGGTGCATAGAGCCTGGGCGTTAACGCCCTTTATTACTAGCTTAATGGT
-----------------------------------------
| Seed: 1 | SUCCESS | CAGATTTTCATATTATGCAGAAAATCTACTTCGCCTGATACGAGTCGGTTATCTTCGGATACTGTATAGTCCCACCTGGTGATCCTATGCTTGTGAGTACCCAGAAAATAGCGACGGACCGCGGTGTTAAGTGTCGAGCTACATCACTTCTCATGTAGCCAGAAGGCTGCAACTCATCGACTCTATGTAGTGACCGCGTC == CAGATTTTCATATTATGCAGAAAATCTACTTCGCCTGATACGAGTCGGTTATCTTCGGATACTGTATAGTCCCACCTGGTGATCCTATGCTTGTGAGTACCCAGAAAATAGCGACGGACCGCGGTGTTAAGTGTCGAGCTACATCACTTCTCATGTAGCCAGAAGGCTGCAACTCATCGACTCTATGTAGTGACCGCGTC
-----------------------------------------
| Seed: 2 | SUCCESS | AAAGCGGCACTTGTGAAGTGTTCCCCACGCCGCTTGGGTC

KeyboardInterrupt: 

# DeBruijn Graph

In [15]:
# ! pip install toyplot networkx
import networkx as nx
import toyplot, math, json

In [13]:
def construct_debruijn_graph(reads,k=3,allow_parallel_edges=True):
    """Build a De Bruijn graph from the k-mers of the given reads.

    Every window of `k` characters in a read adds one edge from the window's
    first k-1 characters to its last k-1 characters.

    INPUT
        reads                | an iterable of strings
        k                    | window (k-mer) length; nodes are k-1 long
        allow_parallel_edges | when truthy a networkx MultiDiGraph is built,
                             | so repeated k-mers give parallel edges;
                             | otherwise a plain DiGraph is used
    OUTPUT
        the populated networkx graph
    """
    import networkx as nx

    graph = nx.MultiDiGraph() if allow_parallel_edges else nx.DiGraph()
    for read in reads:
        window_count = len(read)-k+1
        for start in range(window_count):
            prefix = read[start:start+k-1]
            suffix = read[start+1:start+k]
            graph.add_edge(prefix,suffix)
    return graph

def plot_debruijn_graph(edges, width=500, height=500):
    """Draw the given edges with toyplot and return the result of toyplot.graph.

    INPUT
        edges  | a re-iterable collection of pairs whose first two items are
               | the source and the target node (it is traversed twice)
        width  | canvas width passed on to toyplot
        height | canvas height passed on to toyplot
    OUTPUT
        whatever toyplot.graph returns for these edges
    """
    sources = [edge[0] for edge in edges]
    targets = [edge[1] for edge in edges]
    # hollow black-outlined nodes, arrow heads at the target end, curved edges
    vertex_style = {"stroke": "black", "stroke-width": 2, "fill": "none"}
    vertex_label_style = {"font-size": "11px"}
    edge_style = {"stroke": "black", "stroke-width": 2}
    layout = toyplot.layout.FruchtermanReingold(edges=toyplot.layout.CurvedEdges())
    return toyplot.graph(
        sources,
        targets,
        width=width,
        height=height,
        tmarker=">",
        vsize=25,
        vstyle=vertex_style,
        vlstyle=vertex_label_style,
        estyle=edge_style,
        layout=layout)

In [14]:
def longest_common_substring(branch,stalk,substring,extensions):
    """Record the substrings shared by several reads below one trie edge.

    INPUT
        branch     | the Branch() owning the edge
        stalk      | the Stalk() key of the edge inside `branch`
        substring  | a list with the stalks on the path from the root to this edge
        extensions | the dictionary to fill: length -> substring ->
                   | {'endswith': set of reads, 'is_in': set of reads}
    OUTPUT
        the same `extensions` dictionary
    """
    def record():
        # Only substrings seen in more than one read are of interest; each
        # read is filed by whether it ends with the substring or not.
        shared = branch.s[stalk][1]
        if len(shared) > 1:
            joined = ''.join(substring)
            by_substring = extensions.setdefault(len(joined),{})
            groups = by_substring.setdefault(joined,{'endswith':set(),'is_in':set()})
            for read in shared:
                groups['endswith' if read.endswith(joined) else 'is_in'].add(read)

    if branch.__is_shallow__():
        record()
        return extensions
    node = branch.__traverse__(stalk.stalk)
    if type(node) is Leaf:
        record()
        return extensions
    for c in node.b:
        if c.stalk == '$':
            # a read ends exactly here
            record()
        else:
            extensions = longest_common_substring(node,c,substring+[c.stalk],extensions)
    return extensions

In [24]:
# Input for the scripted assembly loop further down: a 200-nucleotide random
# genome for seed 5, cut into reads with contained reads removed.
seed = 5
sequence = generate_genome_sequence(200,seed=seed)
reads = remove_containments(generate_reads(sequence,3,4,10,seed=seed))

In [89]:

# while len(reads) > 1:    
#     extensions = {}
#     read_counts = {}
#     sequitur = Sequitur(reads,sequence)
#     stalks = sequitur.branch.b.keys()
#     for stalk in stalks:
#         longest_common_substring(sequitur.branch,stalk,[stalk.stalk],extensions,read_counts)
#     read_counts = list((c,len(r)) for c,r in read_counts.items())
#     read_counts.sort()
#     i = 0
#     m = 0
#     for l,r in read_counts:
#         if r >= m:
#             if m == 0: m = r
#             i += 1
#             continue
#         else: break
#     read_counts = list(a[0] for a in read_counts[:i])
#     read_counts.sort(reverse=True)
#     read_ext = {}
#     read_ext_ = {}
#     i = 0
#     while i < len(read_counts):
#         if reads[-1] == 'TCAGTGGGTAAAGGTG':
#             print()
#         for key in read_counts[i:]:
#             for key_,value_ in extensions[key].items():
#                 extensions[key][key_]['endswith'] = list(extensions[key][key_]['endswith'])
#                 for read in extensions[key][key_]['endswith']:
#                     if read == 'ATTCATGGCAGACAAC':
#                         print()
#                     if read not in read_ext: read_ext[read] = ('','','')
#                     # if read not in read_ext: read_ext[read] = set()
#                     flag = False
#                     for extension in extensions[key][key_]['is_in']:
#                         pre = ''
#                         suf = extension
#                         for _ in range(read.count(key_)):
#                             pre += suf[:suf.find(key_)+len(key_)]
#                             suf = suf[suf.find(key_)+len(key_):]
#                             if read.endswith(pre): 
#                                 if len(read_ext[read][0]) < len(pre):
#                                     if not len(read_ext[read][2]) or read_ext[read][2].startswith(suf): read_ext[read] = (pre,extension,suf)
#                                     else:
#                                         c = 0
#                                         while c < len(read_ext[read][2]):
#                                             if len(read_ext[read][0])+c not in extensions: 
#                                                 c -= 1
#                                                 break
#                                             if read_ext[read][0]+read_ext[read][2][:c] in extensions[len(read_ext[read][0])+c]: 
#                                                 c_ = c
#                                                 if len(extensions[len(read_ext[read][0])+c][read_ext[read][0]+read_ext[read][2][:c]]['endswith']) + len(extensions[len(read_ext[read][0])+c][read_ext[read][0]+read_ext[read][2][:c]]['is_in']) == 2: break
#                                             c += 1
#                                         if read_ext[read][0]+read_ext[read][2][:c] not in extensions[len(read_ext[read][0])+c]: c = c_
#                                         # if len(extensions[len(read_ext[read][0])+c][read_ext[read][0]+read_ext[read][2][:c]]['endswith']) + len(extensions[len(read_ext[read][0])+c][read_ext[read][0]+read_ext[read][2][:c]]['is_in']) > 2:
#                                         #     read_ext[read] = ('','','')
#                                         #     flag = True
#                                         #     break
#                                         read_ext[list(extensions[len(read_ext[read][0])+c][read_ext[read][0]+read_ext[read][2][:c]]['endswith'])[0]] = (read_ext[read][0]+read_ext[read][2][:c],extension,read_ext[read][2][c:])
#                                 elif len(read_ext[read][0]) >= len(pre):
#                                     if suf.startswith(read_ext[read][2]): continue
#                                     c = 0
#                                     while c < len(suf):
#                                         if len(pre)+c not in extensions: 
#                                             c -= 1
#                                             break
#                                         if pre+suf[:c] in extensions[len(pre)+c]: 
#                                             c_ = c
#                                             if len(extensions[len(pre)+c][pre+suf[:c]]['endswith']) + len(extensions[len(pre)+c][pre+suf[:c]]['is_in']) == 2: break
#                                         c += 1
#                                     if pre+suf[:c] not in extensions[len(pre)+c]: c = c_
#                                     # if len(extensions[len(pre)+c][pre+suf[:c]]['endswith']) + len(extensions[len(pre)+c][pre+suf[:c]]['is_in']) > 2:
#                                     #     read_ext[read] = ('','','')
#                                     #     flag = True
#                                     #     break
#                                     read_ext[list(extensions[len(pre)+c][pre+suf[:c]]['endswith'])[0]] = (pre+suf[:c],extension,suf[c:])
#                                 else:
#                                     print()
#                         if flag: break
#                     if read_ext[read] == ('','',''): read_ext.pop(read)
#         if len(read_ext):
#             if sum(len(e[0]) for e in read_ext.values())/len(read_ext) == 3 and len(read_ext)>1:
#                 print() 
#             if len(read_ext) > len(read_ext_): read_ext_ = read_ext.copy()
#         i += 1
#     read_ext = list(read_ext_.items())
#     del read_ext_
#     read_ext.sort(key=lambda e: len(e[1][0]),reverse=True)
#     read_ext = dict(read_ext)
#     key = list(read_ext.keys())[0]
#     seq = key
#     reads.remove(key)
#     while key in read_ext:
#         if len(read_ext[key][0]) < sum(len(e[0]) for e in read_ext.values())/len(read_ext): break
#         if seq + read_ext[key][2] not in sequence:
#             print()
#         seq += read_ext[key][2]
#         key = read_ext[key][1]
#         if key not in reads or not len(key): break
#         reads.remove(key)
#     reads += [seq]
# reads

['betty_bought_butter_the_butter_was_bitter_betty_bought_better_butter_to_make_the_bitter_butter_better']

In [25]:
# Scripted version of the assembly loop (the same steps as Sequitur.assemble),
# operating on the global `reads` list, which it shrinks in place until a
# single read is left. It uses the module-level longest_common_substring().
# NOTE(review): the saved output below this cell shows the 'betty' sentence
# rather than a 200-nucleotide genome, so the cell was apparently last run
# with a different `reads` than the seed-5 setup cell provides; confirm by
# re-running from a fresh kernel.
k_min = 3
while len(reads) > 1:    
    extensions = {}
    # rebuild the trie from the current reads and record shared substrings
    sequitur = Sequitur(reads,sequence,k_min)
    stalks = sequitur.branch.b.keys()
    for stalk in stalks: longest_common_substring(sequitur.branch,stalk,[stalk.stalk],extensions)
    k_max = max(extensions.keys())
    i = 0
    # read_ext maps a predecessor read to (overlap, successor read, rest of successor)
    read_ext = {}
    for read in reads:
        # shrink the candidate prefix (i counts the characters dropped) until it is a recorded shared substring
        while min(k_max,len(read)-1)-i > k_min and read[:min(k_max,len(read)-1)-i] not in extensions[min(k_max,len(read)-1)-i]: i+=1
        # skip reads whose prefix is not shared or that are not filed under 'is_in' for it
        if read[:min(k_max,len(read)-1)-i] not in extensions[min(k_max,len(read)-1)-i] or read not in extensions[min(k_max,len(read)-1)-i][read[:min(k_max,len(read)-1)-i]]['is_in']: 
            i = 0
            continue
        # one read ending with the prefix becomes the predecessor; a second claim on it marks it ambiguous
        if list(extensions[min(k_max,len(read)-1)-i][read[:min(k_max,len(read)-1)-i]]['endswith'])[0] not in read_ext: read_ext[list(extensions[min(k_max,len(read)-1)-i][read[:min(k_max,len(read)-1)-i]]['endswith'])[0]] = (read[:min(k_max,len(read)-1)-i],read,read[min(k_max,len(read)-1)-i:])
        else: read_ext[list(extensions[min(k_max,len(read)-1)-i][read[:min(k_max,len(read)-1)-i]]['endswith'])[0]] = ('','','')
        i = 0
    # longest overlap first; the ambiguous ('','','') entries sort last
    read_ext = list(read_ext.items())
    read_ext.sort(key=lambda e: len(e[1][0]),reverse=True)
    read_ext = dict(read_ext)
    key = list(read_ext.keys())[0]
    seq = key
    reads.remove(key)
    # follow the chain of successors, appending the non-overlapping part of each
    while key in read_ext:
        # if len(read_ext[key][0]) < sum(len(e[0]) for e in read_ext.values())/len(read_ext): break
        # debugging aid: emits a blank line when the merge leaves the reference sequence
        if seq + read_ext[key][2] not in sequence:
            print()
        seq += read_ext[key][2]
        key = read_ext[key][1]
        if key not in reads or not len(key): break
        reads.remove(key)
    reads += [seq]
# displayed as the cell output: the remaining (assembled) read
reads

['betty_bought_butter_the_butter_was_bitter_betty_bought_better_butter_to_make_the_bitter_butter_better']