In [1]:
from collections import defaultdict
import sys
import itertools


def build_graph(first_genome, second_genome):
    """Build the breakpoint graph of two genomes as adjacency lists.

    Every synteny block that occurs in ``first_genome`` is given two nodes, a
    start and an end, numbered ``2k`` and ``2k + 1`` where ``k`` is the order
    in which the block is first seen.  For each pair of consecutive blocks in
    a cycle (wrapping around from the last block to the first) an undirected
    edge joins the node where the earlier block is left with the node where
    the later block is entered; a positive block is entered at its start node
    and left at its end node, a non-positive one the other way round.

    Args:
        first_genome: sequence of cycles, each cycle a sequence of signed
            integers.  Only the blocks of this genome are numbered.
        second_genome: sequence of cycles over the same blocks.

    Returns:
        A pair ``(adjacency_lists, num_synteny_blocks)`` where
        ``adjacency_lists`` is a ``defaultdict(list)`` mapping a node to its
        neighbours (one entry per edge end) and ``num_synteny_blocks`` is the
        number of distinct blocks in ``first_genome``.

    Raises:
        KeyError: if a cycle mentions a block that is absent from
            ``first_genome``.
    """
    synteny_block_to_nodes = dict()
    for cycle in first_genome:
        for signed_synteny_block in cycle:
            synteny_block = abs(signed_synteny_block)
            if synteny_block not in synteny_block_to_nodes:
                # The k-th new block owns nodes 2k (start) and 2k + 1 (end).
                start_node = 2 * len(synteny_block_to_nodes)
                synteny_block_to_nodes[synteny_block] = (start_node, start_node + 1)

    adjacency_lists = defaultdict(list)
    for cycle in itertools.chain(first_genome, second_genome):
        if len(cycle) == 0:
            # An empty cycle has no blocks and therefore contributes no edges.
            continue
        start, end = synteny_block_to_nodes[abs(cycle[0])]
        prev_node = end if cycle[0] > 0 else start
        # Visit the blocks in the order second, third, ..., last, first so
        # that the closing edge back to the first block is added as well.
        for position in range(1, len(cycle) + 1):
            current_block = cycle[position % len(cycle)]
            start, end = synteny_block_to_nodes[abs(current_block)]
            if current_block > 0:
                current_node, exit_node = start, end
            else:
                current_node, exit_node = end, start
            adjacency_lists[current_node].append(prev_node)
            adjacency_lists[prev_node].append(current_node)
            # The next edge starts from the opposite end of the current block.
            prev_node = exit_node
    return adjacency_lists, len(synteny_block_to_nodes)


def dfs(node, adjacency_lists, visited):
    """Add ``node`` and everything reachable from it to ``visited``.

    The traversal never walks through nodes that are already in ``visited``
    when it reaches them.  It uses an explicit stack rather than recursion,
    so the size of a component is not bounded by the interpreter's recursion
    limit.

    Args:
        node: node to start from; it is marked visited unconditionally.
        adjacency_lists: mapping from a node to an iterable of neighbours.
        visited: set of visited nodes, updated in place.
    """
    visited.add(node)
    stack = [node]
    while stack:
        current = stack.pop()
        for child in adjacency_lists[current]:
            if child not in visited:
                # Mark on push so a node is never stacked twice.
                visited.add(child)
                stack.append(child)


def parse_genome(str):
    """Turn a genome string such as ``"(+1 -3)(+2 -4)"`` into lists of ints.

    The text is cut at every ``")("``; the first character of the first piece
    and the last character of the last piece (the outer parentheses) are
    dropped, and each piece is split on whitespace and converted with
    ``int``.

    Note: the parameter name shadows the builtin ``str`` inside this function.

    Returns:
        A list with one list of signed integers per cycle.
    """
    pieces = str.split(")(")
    final_index = len(pieces) - 1
    genome = []
    for index, piece in enumerate(pieces):
        if index == 0:
            piece = piece[1:]
        if index == final_index:
            piece = piece[:-1]
        genome.append([int(field) for field in piece.split()])
    return genome


def main():
    """Read two genomes from ``rosalind_ba6c.txt`` and print their distance.

    The first two lines of the file are parsed with ``parse_genome``.  The
    printed value is the number of synteny blocks minus the number of
    connected components of the graph returned by ``build_graph``.
    """
    # The context manager closes the file even if parsing raises.
    with open('rosalind_ba6c.txt', 'r') as file:
        first_genome = parse_genome(next(file).strip())
        second_genome = parse_genome(next(file).strip())
    adjacency_lists, num_blocks = build_graph(first_genome, second_genome)
    visited = set()
    num_cycles = 0
    # A depth-first search may descend once per node (two nodes per block) on
    # top of the frames that are already active, so leave headroom and never
    # lower the current limit: asking for a limit below the present stack
    # depth raises RecursionError, which is what happens for small inputs.
    sys.setrecursionlimit(max(sys.getrecursionlimit(), 2 * num_blocks + 1000))
    for node in adjacency_lists:
        if node not in visited:
            dfs(node, adjacency_lists, visited)
            num_cycles += 1
    print(num_blocks - num_cycles)

In [2]:
# Entry point: run main() when this code is executed as the top-level module.
if __name__ == "__main__":
    main()

14764


In [None]:
# Two small genomes in the "(...)(...)" signed-block notation, kept as
# strings so the parser can be tried out by hand.
s1 = '(+1 +2 +3 +4 +5 +6)'
s2 = '(+1 -3 -6 -5)(+2 -4)'

In [None]:
# Parse the first sample string and display the resulting list of cycles.
genome_1 = parse_genome(s1)
genome_1

In [None]:
# Parse the second sample string and display the resulting list of cycles.
genome_2 = parse_genome(s2)
genome_2

In [None]:
import itertools

In [None]:
# itertools.chain yields the cycles of genome_1 followed by those of genome_2;
# list() materialises them so the cell displays the combined sequence.
list(itertools.chain(genome_1, genome_2))

In [None]:
# Plain list concatenation of the two parsed genomes, shown for comparison
# with the itertools.chain variant.
genome_1 + genome_2

In [4]:
import sys
from collections import defaultdict


def get_genomes(gen):
    """Parse a genome written as ``"(+1 -3)(+2 -4)"`` into lists of ints.

    ``gen`` is split at every ``")("``.  The leading character of the first
    chunk and the trailing character of the last chunk are removed, then each
    chunk is split on whitespace and its fields are converted with ``int``.

    Returns:
        A list holding one list of signed integers per cycle.
    """
    chunks = gen.split(")(")
    chunks[0] = chunks[0][1:]      # drop the opening parenthesis
    chunks[-1] = chunks[-1][:-1]   # drop the closing parenthesis
    return [[int(number) for number in chunk.split()] for chunk in chunks]


def build_breakpoint_graph(genome_1, genome_2):
    """Build the breakpoint graph of two genomes.

    Each block of ``genome_1`` receives a pair of nodes ``(2k, 2k + 1)``
    (start, end), ``k`` being the order of first appearance.  For every two
    consecutive blocks of a cycle, including the wrap-around from the last
    block to the first, an undirected edge joins the node at which the
    earlier block is left to the node at which the later block is entered.
    A positive block is entered at its start and left at its end; a
    non-positive block is traversed in the opposite direction.

    Args:
        genome_1: sequence of cycles of signed integers; defines the blocks.
        genome_2: sequence of cycles over the same blocks.

    Returns:
        ``(n_blocks, Adj)``: the number of distinct blocks in ``genome_1`` and
        a ``defaultdict(list)`` mapping each node to its neighbours.

    Raises:
        KeyError: if a cycle uses a block that does not occur in ``genome_1``.
    """
    node_by_block = dict()
    for cycle in genome_1:
        for block in cycle:
            block = abs(block)
            if block not in node_by_block:
                # Nodes are handed out in consecutive pairs: 0/1, 2/3, ...
                start = 2 * len(node_by_block)
                node_by_block[block] = (start, start + 1)

    Adj = defaultdict(list)
    # Iterate over the genomes one after the other instead of concatenating
    # them, which avoids a copy and does not require both to be lists.
    for genome in (genome_1, genome_2):
        for cycle in genome:
            if len(cycle) == 0:
                # No blocks, hence no adjacencies to record.
                continue

            start, end = node_by_block[abs(cycle[0])]
            u = end if cycle[0] > 0 else start  # node where the first block is left

            # Blocks are visited as second, ..., last, first so that the edge
            # closing the cycle is added too.
            for i in range(1, len(cycle) + 1):
                curr_block = cycle[i % len(cycle)]
                start, end = node_by_block[abs(curr_block)]
                if curr_block > 0:
                    v, leave = start, end
                else:
                    v, leave = end, start
                Adj[v].append(u)
                Adj[u].append(v)
                u = leave  # the next edge starts at the other end of this block

    return len(node_by_block), Adj


def dfs(u_node, Adj, visited):
    """Mark ``u_node`` and every node reachable from it as visited.

    Nodes that are already in ``visited`` when they are encountered are not
    expanded.  The search keeps its own stack of pending nodes instead of
    recursing, so large components cannot exhaust the recursion limit.

    Args:
        u_node: starting node; always added to ``visited``.
        Adj: mapping from a node to an iterable of its neighbours.
        visited: set of visited nodes, modified in place.
    """
    visited.add(u_node)
    pending = [u_node]
    while pending:
        node = pending.pop()
        for v_node in Adj[node]:
            if v_node in visited:
                continue
            # Mark when scheduling so that no node is queued more than once.
            visited.add(v_node)
            pending.append(v_node)


def two_break_distance(genome_1, genome_2):
    """Return the 2-break distance between two genomes.

    By the 2-Break Distance Theorem (page 321) the distance between genomes
    P and Q equals BLOCKS(P, Q) - CYCLES(P, Q).  The cycles are counted as
    the connected components of the breakpoint graph, one depth-first search
    per component.
    """
    block_count, graph = build_breakpoint_graph(genome_1, genome_2)

    seen = set()
    component_count = 0
    for node in graph:
        if node in seen:
            continue
        # First unseen node of a new component: flood it and count it once.
        dfs(node, graph, seen)
        component_count += 1

    return block_count - component_count


def main():
    """Read two genomes from ``rosalind_ba6c.txt`` and print their distance.

    The first two lines of the file are parsed with ``get_genomes`` and the
    result of ``two_break_distance`` is printed.
    """
    # Kept for the recursive depth-first search, whose depth grows with the
    # size of a connected component.
    sys.setrecursionlimit(int(1e5))

    # The context manager closes the file even if a line is missing or
    # cannot be parsed.
    with open('rosalind_ba6c.txt', 'r') as file:
        genome_1 = get_genomes(next(file).strip())
        genome_2 = get_genomes(next(file).strip())

    print(two_break_distance(genome_1, genome_2))


# Entry point: run the main() defined above when executed as the top-level module.
if __name__ == '__main__':
    main()

14764


In [None]:
import sys

def str_to_in_vertex(s):
    """Return the vertex at which the signed block written as ``s`` is entered.

    A block with absolute value ``b`` owns vertices ``2b - 2`` and ``2b - 1``.
    The lower one is returned unless ``s`` starts with ``'-'``, in which case
    the higher one is.
    """
    lower_vertex = 2 * abs(int(s)) - 2
    return lower_vertex + 1 if s[0] == '-' else lower_vertex


def str_to_out_vertex(s):
    """Return the vertex at which the signed block written as ``s`` is left.

    A block with absolute value ``b`` owns vertices ``2b - 2`` and ``2b - 1``.
    The higher one is returned unless ``s`` starts with ``'-'``, in which case
    the lower one is.
    """
    upper_vertex = 2 * abs(int(s)) - 1
    return upper_vertex - 1 if s[0] == '-' else upper_vertex


def add_egdes(permutation, graph):
    """Add the adjacency edges of one cyclic permutation to ``graph``.

    For each block and its successor (the successor of the last block being
    the first one) an undirected edge is recorded between the out-vertex of
    the block and the in-vertex of the successor.

    Args:
        permutation: sequence of signed block strings forming one cycle.
        graph: list of adjacency lists indexed by vertex, extended in place.
    """
    size = len(permutation)
    for index, block in enumerate(permutation):
        successor = permutation[(index + 1) % size]
        leaving = str_to_out_vertex(block)
        entering = str_to_in_vertex(successor)
        graph[leaving].append(entering)
        graph[entering].append(leaving)


def split_into_cycles(str):
    """Split a genome string such as ``"(+1 -2)(+3)"`` into its cycles.

    The text is cut at every ``'('``; the last character of each piece (the
    closing parenthesis) is dropped and the rest is split on whitespace.

    Cutting at ``'('`` also produces the text that precedes the first
    parenthesis.  When that piece is blank it is not a cycle and is
    discarded, so a well-formed genome no longer yields a spurious empty
    leading cycle.

    Note: the parameter name shadows the builtin ``str`` inside this function.

    Returns:
        A list with one list of signed block strings per cycle.
    """
    pieces = str.split('(')
    if pieces[0].strip() == '':
        # Nothing but whitespace before the first '(' - not a cycle.
        pieces = pieces[1:]
    return [piece[:-1].split() for piece in pieces]


def dfs(graph, v, visited):
    """Flag ``v`` and every vertex reachable from it in ``visited``.

    Vertices whose flag is already set when they are encountered are not
    expanded.  An explicit stack replaces recursion so that the size of a
    component is not limited by the interpreter's recursion limit.

    Args:
        graph: adjacency lists indexed by vertex.
        v: starting vertex; its flag is always set.
        visited: per-vertex flags (falsy = unvisited), set to 1 in place.
    """
    visited[v] = 1
    stack = [v]
    while stack:
        current = stack.pop()
        for u in graph[current]:
            if not visited[u]:
                # Flag on push so a vertex is stacked at most once.
                visited[u] = 1
                stack.append(u)


def main():
    """Read two genomes from ``rosalind_ba6c.txt`` and print their distance.

    The first two lines of the file are split into cycles, the edges of both
    genomes are added to one graph with two vertices per block, and the
    printed value is the number of blocks minus the number of connected
    components of that graph.

    The graph is sized from the number of blocks ``n`` in the first genome,
    which presumes the blocks are labelled ``1..n`` (vertex numbers are
    derived from the block labels).
    """
    # Kept for the recursive depth-first search, whose depth grows with the
    # size of a connected component.
    sys.setrecursionlimit(100000)

    # The context manager closes the file even if a line is missing.
    with open('rosalind_ba6c.txt', 'r') as file:
        p = split_into_cycles(next(file).strip())
        q = split_into_cycles(next(file).strip())

    n = sum(len(cycle) for cycle in p)
    graph = [[] for _ in range(2 * n)]
    for genome in (p, q):
        for cycle in genome:
            add_egdes(cycle, graph)

    visited = [0] * (2 * n)
    # Start from the block count and subtract one per connected component.
    dist = n
    for v in range(2 * n):
        if visited[v]:
            continue
        dfs(graph, v, visited)
        dist -= 1
    print(dist)


In [None]:
# Entry point: run the most recently defined main() when executed as the top-level module.
if __name__ == '__main__':
    main()