In [66]:
"""
Goal:
- Find Eulerian path in de Bruijn graph (a graph of kmers' prefixes/suffixes)
- (In an Eulerian path every node is balanced (in_degree == out_degree), except possibly the start and end nodes.)

Idea: 
- Use a union-find approach, reducing the number of sets from len(kmers) to 1.
    - "Construct" de Bruijn graph on the fly, just by retrieving the current kmer's prefix/suffix.
    - Find: while "building" the graph, check the place in the current target string the current prefix/suffix has.
    - Union: when the prefix/suffix of current node matches the suffix/prefix of the current target string.
"""


def str_from_kmer_composition(k, kmers):
    """
    Greedily assemble a circular string from a collection of k-mers.

    The lexicographically smallest k-mer seeds the target string. Every
    other k-mer is then attached exactly once: prepended when its suffix
    equals the target's (k-1)-prefix, or appended when its prefix equals
    the target's (k-1)-suffix. The last k-1 characters of the assembled
    string are dropped before returning.

    Args:
        k: length of each k-mer.
        kmers: iterable of strings of length k; it is not modified.

    Returns:
        The assembled string without its last k-1 characters, or '' when
        `kmers` is empty.

    Raises:
        ValueError: if none of the remaining k-mers overlaps either end of
            the target string, so the assembly cannot make progress.

    NOTE(review): the greedy order is not guaranteed to find an assembly
    even when one exists, and nothing checks that the dropped tail really
    equals the head of the string -- confirm before relying on the result
    as a circular string.
    """
    # lexicographical sorting (sorted() copies, so the caller's list is untouched)
    pool = sorted(kmers)
    if not pool:
        return ''

    # initialization: the seed leaves the pool, otherwise it could be matched
    # against itself while another k-mer would never be consumed
    target_string = pool.pop(0)
    overlap = k - 1

    # while there are k-mers left to unite with the target
    while pool:
        for i, kmer in enumerate(pool):
            # explicit start index: a negative slice of width 0 (k == 1)
            # would select the whole string instead of nothing
            tail_start = len(target_string) - overlap

            # suffix(curr_kmer) == prefix(target)
            if kmer[1:] == target_string[:overlap]:
                target_string = kmer[0] + target_string
            # prefix(curr_kmer) == suffix(target)
            elif kmer[:-1] == target_string[tail_start:]:
                target_string += kmer[-1]
            else:
                continue

            # the current kmer was attached: unite it and rescan the pool
            pool.pop(i)
            break
        else:
            # a full pass attached nothing; without this the loop never ends
            raise ValueError(
                "cannot extend the string: no remaining k-mer overlaps it"
            )

    return target_string[:len(target_string) - overlap]

In [67]:
def genbin(n, arr, bs = ''):
    """
    Append to `arr` every string made of `bs` followed by `n` binary digits.

    Strings are appended in lexicographic order ('0' is tried before '1'
    at every position). Nothing is returned; `arr` is extended in place.

    Args:
        n: number of binary digits still to be generated.
        arr: list that receives the finished strings.
        bs: prefix built so far (empty for the initial call).
    """
    # Base case: no digits left, the prefix is a finished string.
    if not n:
        arr.append(bs)
        return

    # Recursive case: extend the prefix with each bit in turn.
    for bit in '01':
        genbin(n - 1, arr, bs + bit)

In [79]:
def k_universal_circular_str(k):
    """
    Build a binary k-universal circular string with the '0 is better
    than 1' rule.

    Starting from k ones, the string is extended one bit at a time: '0'
    is appended if the resulting k-mer has not been seen yet, otherwise
    '1' is; the loop stops when both extensions have already been seen.
    The linear result is then made circular by dropping its last k-1
    characters.

    Args:
        k: k-mer length, a positive integer.

    Returns:
        A string of length 2**k.

    Raises:
        ValueError: if k is smaller than 1 (the construction would never
            terminate).
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    overlap = k - 1
    output = '1' * k
    seen_kmers = {output}
    while True:
        # explicit start index: output[-overlap:] would be the whole
        # string when overlap == 0 (k == 1)
        tail = output[len(output) - overlap:]
        for bit in '01':  # '0' is preferred over '1'
            if tail + bit not in seen_kmers:
                seen_kmers.add(tail + bit)
                output += bit
                break
        else:
            # both extensions were already used: the construction is complete
            break
    assert len(seen_kmers) == 2**k
    # same reason as above: output[:-overlap] would be '' for overlap == 0
    return output[:len(output) - overlap]

# Read k from the first line of the input file; the `with` block makes sure
# the file handle is closed instead of being left to the garbage collector.
with open("rosalind_ba3i.txt", "r") as input_file:
    k = int(input_file.readline())
answer = k_universal_circular_str(k)
print(answer)

1111111100000000100000011000001010000011100001001000010110000110100001111000100010011000101010001011100011001000110110001110100011111001001010010011100101011001011010010111100110011010100110111001110110011110100111111010101011101011011010111110110111101110


In [71]:
one = '11111111000000000010000000110000001010000001110000010010000010110000011010000011110000100010000100110000101010000101110000110010000110110000111010000111110001000110001001010001001110001010010001010110001011010001011110001100110001101010001101110001110010001110110001111010001111110010010010110010011010010011110010100110010101010010101110010110110010111010010111110011001110011010110011011010011011110011101010011101110011110110011111010011111110101010110101011110101101110101110110101111110110110111110111011110'

In [72]:
two = '11111111100000000010000000110000001010000001110000010010000010110000011010000011110000100010000100110000101010000101110000110010000110110000111010000111110001000110001001010001001110001010010001010110001011010001011110001100110001101010001101110001110010001110110001111010001111110010010010110010011010010011110010100110010101010010101110010110110010111010010111110011001110011010110011011010011011110011101010011101110011110110011111010011111110101010110101011110101101110101110110101111110110110111110111011110'

In [73]:
# Check whether the two pasted strings are identical (recorded output: False).
one == two

False

In [74]:
# Compare the lengths of the two pasted strings (recorded output: both 512).
len(one), len(two)

(512, 512)

In [83]:
def k_universal_circular(k):
    """
    Build a binary string intended to be k-universal when read circularly.

    The string starts with k ones and is extended (1 << k) - k times. At
    each step '0' is appended unless the k-mer it would complete has
    already been recorded, in which case '1' is appended instead.

    Args:
        k: k-mer length.

    Returns:
        The constructed string (2**k characters for k >= 1).
    """
    sequence = '1' * k
    seen = {sequence}

    for _ in range((1 << k) - k):
        # k-mer that would be completed by appending a zero
        zero_window = sequence[-k + 1:] + '0'
        sequence += '1' if zero_window in seen else '0'
        # record the k-mer that now ends the sequence
        seen.add(sequence[-k:])

    return sequence

In [84]:
def main():
    """
    Read k from the first line of 'rosalind_ba3i.txt' and print the
    string returned by k_universal_circular(k).
    """
    # `with` closes the file even if the first line is missing or is not
    # an integer; the original close() call was skipped in those cases.
    with open('rosalind_ba3i.txt', 'r') as file:
        k = int(next(file))

    print(k_universal_circular(k))

In [85]:
# Entry point: call main() only when this code is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

1111111100000000100000011000001010000011100001001000010110000110100001111000100010011000101010001011100011001000110110001110100011111001001010010011100101011001011010010111100110011010100110111001110110011110100111111010101011101011011010111110110111101110


In [86]:
one = '1111111100000000100000011000001010000011100001001000010110000110100001111000100010011000101010001011100011001000110110001110100011111001001010010011100101011001011010010111100110011010100110111001110110011110100111111010101011101011011010111110110111101110'

In [87]:
two = '1111111100000000100000011000001010000011100001001000010110000110100001111000100010011000101010001011100011001000110110001110100011111001001010010011100101011001011010010111100110011010100110111001110110011110100111111010101011101011011010111110110111101110'

In [88]:
# Check whether the two re-pasted strings are identical (recorded output: True).
# NOTE(review): presumably these are the outputs of the two implementations
# for the same k -- confirm where the literals came from.
one == two

True