In [None]:
def assemble_dna_segments(segments):
    """Merge ordered, overlapping DNA segments into a single sequence.

    Segments are processed strictly in the order given. Each one is appended
    to the running sequence after dropping the longest prefix of the segment
    that already appears as a suffix of the sequence.

    Args:
        segments: Sequence of strings, ordered so that each segment continues
            (and possibly overlaps) the one before it.

    Returns:
        The merged string, or "" when ``segments`` is empty.
    """
    if not segments:
        return ""

    sequence = segments[0]
    for segment in segments[1:]:
        # An overlap can never be longer than the shorter of the two strings,
        # so start the search there rather than at len(sequence).
        overlap = min(len(sequence), len(segment))
        while overlap > 0 and not sequence.endswith(segment[:overlap]):
            overlap -= 1

        # Append only the part of the segment that is not already present.
        sequence += segment[overlap:]

    return sequence


In [None]:
from collections import defaultdict


def debruijn_graph(segments, k):
    """Assemble a sequence by walking a de Bruijn graph built from k-mers.

    Every k-mer of every segment becomes a directed edge from its first
    ``k - 1`` characters to its last ``k - 1`` characters; a k-mer that occurs
    several times yields several parallel edges. The walk starts at the first
    node without an incoming edge and keeps following the oldest unused
    outgoing edge of the current node, appending the last character of each
    node it reaches, until the current node has no unused edge left.

    The walk is greedy and never backtracks, so edges on a branch it does not
    return to are left out of the result.

    Args:
        segments: Iterable of strings (reads). Reads shorter than ``k``
            contribute no k-mers.
        k: k-mer length; must be at least 2 so that graph nodes are non-empty.

    Returns:
        The assembled string, or "" when no k-mer could be extracted.

    Raises:
        ValueError: If ``k`` is smaller than 2.
    """
    if k < 2:
        raise ValueError("k must be at least 2, got {!r}".format(k))

    # node -> successor nodes, in the order the reads contributed them
    graph = defaultdict(list)
    for segment in segments:
        for i in range(len(segment) - k + 1):
            kmer = segment[i:i + k]
            graph[kmer[:-1]].append(kmer[1:])

    if not graph:
        return ""

    # Collect every node that has an incoming edge in one pass, instead of
    # rescanning all edges for each candidate start node.
    targets = {succ for successors in graph.values() for succ in successors}
    start = next((node for node in graph if node not in targets), None)
    if start is None:
        # Every node has a predecessor (e.g. a circular sequence): there is
        # no natural start, so begin at the first node that was recorded.
        start = next(iter(graph))

    # Consume parallel edges first-in-first-out so that repeated k-mers are
    # followed in the order the reads introduced them.
    unused_edges = {node: iter(successors) for node, successors in graph.items()}
    pieces = [start]
    current = start
    while current in unused_edges:
        following = next(unused_edges[current], None)
        if following is None:
            break
        pieces.append(following[-1])
        current = following

    return "".join(pieces)


# Example usage: assemble a small set of overlapping reads with debruijn_graph.
# The reads are listed so that the last four bases of each one equal the first
# four bases of the next.
segments = [
    'TTAATTA',
    'ATTACTC',
    'ACTCAC',
    'TCACTGGCTAA',
    'CTAATTACTCACTGG',
    'CTGGGT',
    'GGGTCACT',
    'CACTACGCACTG'
]

# Choose the k-mer length (typically between 20 and 30 for short-read
# sequencing data). A much smaller value is used here because the example
# reads are only 6 to 15 bases long, so a larger k would yield few or no k-mers.
k = 5

sequence = debruijn_graph(segments, k)
print(sequence)
# Expected output (the reads merged in the order listed above):
# TTAATTACTCACTGGCTAATTACTCACTGGGTCACTACGCACTG
# NOTE(review): several k-mers occur in more than one read, so the order in
# which debruijn_graph follows those repeated edges decides which assembly is
# printed -- confirm the printed value matches the line above.
