In [18]:
# Import necessary library
from google.colab import drive

# Mount Google Drive at /content/drive so the input files referenced later in
# this notebook (under "/content/drive/My Drive/") can be opened.
# NOTE(review): force_remount=True presumably remounts even when the drive is
# already mounted — confirm against the Colab documentation.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [20]:
# Read file and kmers length
def readsinglefile(filename):
    """Read the k-mer length and the reads from a single-read input file.

    The file is decoded as UTF-16 little-endian. The first non-blank line
    holds the integer ``k``; every following non-blank line is one read.
    Blank lines (for example a trailing empty line) are ignored so they do
    not turn into empty reads.

    Args:
        filename: Path of the input file.

    Returns:
        A ``(sequence, k)`` tuple: the list of reads (``str``) and ``k`` (``int``).

    Raises:
        ValueError: If the file contains no data or its first non-blank line
            is not an integer.
    """
    with open(filename, encoding='utf-16-le') as file:
        stripped = [line.strip() for line in file]

    # The 'utf-16-le' codec does not consume a byte-order mark, so a BOM shows
    # up as U+FEFF at the very start of the first line and must be removed.
    if stripped:
        stripped[0] = stripped[0].lstrip('\ufeff').strip()

    # Drop blank lines: an empty read would otherwise yield a bogus ['', '']
    # edge when the graph is built.
    lines = [line for line in stripped if line]
    if not lines:
        raise ValueError(f"{filename!r} contains no k-mer length or reads")

    k = int(lines[0])
    sequence = lines[1:]
    return sequence, k

# Create DE Bruijn Graph
def singleDBG(sequence, k):
    """Build the de Bruijn graph edge list for a collection of reads.

    Every read contributes exactly one edge ``[prefix, suffix]`` where the
    prefix is ``read[0:k-1]`` and the suffix is ``read[1:k]``.

    Args:
        sequence: Iterable of reads (strings).
        k: k-mer length used to slice each read.

    Returns:
        A list of two-element lists, one per read, in input order.
    """
    edges = []
    for read in sequence:
        prefix, suffix = read[:k - 1], read[1:k]
        edges.append([prefix, suffix])
    return edges

# Help us to find start point in graph
def single_findStartPoint(graph):
    """Collect the target node of every edge in the graph.

    The returned list is what the path builder consults to locate a start
    node: a source that never appears in this list has no incoming edge.

    Args:
        graph: List of edges, each indexable as ``edge[0]`` (source) and
            ``edge[1]`` (target).

    Returns:
        A list with the target of each edge, in edge order.
    """
    targets = []
    for edge in graph:
        targets.append(edge[1])
    return targets

# Create path from tracing graph and begin with start point
def singlePath(graph, tmp_tofind_startpoint):
    """Return an Eulerian path through the de Bruijn graph as a node list.

    The walk uses Hierholzer's algorithm, so every edge is traversed exactly
    once even when a node is visited several times (repeated (k-1)-mers).
    For a simple chain the result is the chain itself.

    Start node selection, in order of preference:
      1. the first edge source that does not occur in ``tmp_tofind_startpoint``
         (no incoming edge);
      2. the first edge source with more outgoing than incoming edges;
      3. the source of the first edge (the graph is a closed cycle).

    Args:
        graph: List of edges ``[source, target]``. The graph is expected to
            contain an Eulerian path; for other graphs the returned node
            list is not guaranteed to be a valid walk.
        tmp_tofind_startpoint: List with the target node of every edge
            (used to derive in-degrees).

    Returns:
        The list of nodes along the path; ``[]`` for an empty graph.
    """
    from collections import Counter, defaultdict

    if not graph:
        return []

    def key_of(node):
        # List nodes are unhashable; strings are used as their own key.
        return tuple(node) if isinstance(node, list) else node

    node_of = {}                      # key -> first node object seen with that value
    successors = defaultdict(list)    # key -> keys of not-yet-used outgoing edges
    source_keys = []
    for edge in graph:
        source, target = key_of(edge[0]), key_of(edge[1])
        node_of.setdefault(source, edge[0])
        node_of.setdefault(target, edge[1])
        successors[source].append(target)
        source_keys.append(source)
    # pop() takes from the end; reversing keeps edges in input order.
    for pending in successors.values():
        pending.reverse()

    in_degree = Counter(key_of(node) for node in tmp_tofind_startpoint)
    out_degree = Counter(source_keys)

    start = next((key for key in source_keys if key not in in_degree), None)
    if start is None:
        start = next(
            (key for key in source_keys if out_degree[key] > in_degree[key]),
            source_keys[0],
        )

    # Iterative Hierholzer: follow unused edges until stuck, then back up,
    # emitting nodes in reverse order of completion.
    stack = [start]
    reversed_path = []
    while stack:
        pending = successors.get(stack[-1])
        if pending:
            stack.append(pending.pop())
        else:
            reversed_path.append(node_of[stack.pop()])
    reversed_path.reverse()
    return reversed_path

# Start create genome from path
def createsinglegenome(path, k):
    """Spell the genome string described by a path of overlapping nodes.

    The first node is taken whole; every later node contributes only the
    character at index ``k - 2``.

    Args:
        path: Non-empty list of node strings.
        k: k-mer length the nodes were derived from.

    Returns:
        The reconstructed genome as a single string.
    """
    pieces = [path[0]]
    for node in path[1:]:
        pieces.append(node[k - 2])
    return ''.join(pieces)

# Main function
def SingleRead(filename="/content/drive/My Drive/SingleReadInput.txt"):
    """Assemble a genome from a single-read input file and print it.

    Runs the full pipeline: read the file, build the de Bruijn graph, trace
    a path through it and spell the genome from that path.

    Args:
        filename: Path of the input file. Defaults to the notebook's
            original Google Drive location, so ``SingleRead()`` behaves as
            before; pass another path to assemble a different file.

    Returns:
        The assembled genome string (also printed to stdout).
    """
    sequence, k = readsinglefile(filename)
    graph = singleDBG(sequence, k)
    startpoint = single_findStartPoint(graph)
    path = singlePath(graph, startpoint)
    wholegenome = createsinglegenome(path, k)
    print("Whole genome:", wholegenome)
    return wholegenome

# Run the single-read assembly on the input file configured in SingleRead.
# As the last expression of the cell, the returned genome string is also
# displayed as the cell output, in addition to the printed line.
SingleRead()

Whole genome: TGCCCCTTTGATCGCGGTTCTCGAATCCATGTAAATACAAAGATCTTATGTCCGCCGCGTATAGCGGTCGTAAAAATCTACGAGTTTCGATAACTCCAGGATCAATGCGGAACTATGCCCTTATAATAAGGCCACAATTAGTGCGCGTATTAGTGCGATTCCCATTTGCTCCTTTTCTCAACGACCAACGTAGGCGGGGGATGAGTATGCACACGCCCACCCGCTACACTCGACCCTCTCGGCTCTTTTTGTACCGGGGGCCTATATCTCCTGCACCGCCACCATCGCGTTCTCTCTTATTTTGCTATTATTATTCTTTCCAGAACATATGACATATCAGTGCAAGCTGAATCGCGAAGCGGCACTTAATACGATTTCTTGCGATGTGTCTTCTCGCGGCAATTGCTAGTGCCTGGTAAGTCACCGTGATCGTGTCTATG


'TGCCCCTTTGATCGCGGTTCTCGAATCCATGTAAATACAAAGATCTTATGTCCGCCGCGTATAGCGGTCGTAAAAATCTACGAGTTTCGATAACTCCAGGATCAATGCGGAACTATGCCCTTATAATAAGGCCACAATTAGTGCGCGTATTAGTGCGATTCCCATTTGCTCCTTTTCTCAACGACCAACGTAGGCGGGGGATGAGTATGCACACGCCCACCCGCTACACTCGACCCTCTCGGCTCTTTTTGTACCGGGGGCCTATATCTCCTGCACCGCCACCATCGCGTTCTCTCTTATTTTGCTATTATTATTCTTTCCAGAACATATGACATATCAGTGCAAGCTGAATCGCGAAGCGGCACTTAATACGATTTCTTGCGATGTGTCTTCTCGCGGCAATTGCTAGTGCCTGGTAAGTCACCGTGATCGTGTCTATG'

## Paired reads: genome assembly with a paired de Bruijn graph

In [11]:
# Read file, kmers, and gap length
def readpairfile(filename):
    """Read k, the gap length and the read pairs from a paired-read file.

    The first non-blank line holds two whitespace-separated integers:
    ``k`` and ``gap``. Every following non-blank line contributes its first
    whitespace-separated token as one read pair, with all non-alphanumeric
    characters removed. Blank lines (for example a trailing empty line) are
    skipped.

    Args:
        filename: Path of the input file.

    Returns:
        A ``(sequence, k, gap)`` tuple: the list of cleaned read-pair
        strings, ``k`` (``int``) and ``gap`` (``int``).

    Raises:
        ValueError: If the file has no content, the header line has fewer
            than two fields, or a header field is not an integer.
    """
    with open(filename) as file:
        # split() on a blank line gives [], which is filtered out here so it
        # cannot raise IndexError further down.
        rows = [fields for fields in (line.split() for line in file) if fields]

    if not rows:
        raise ValueError(f"{filename!r} contains no header or read pairs")
    header = rows[0]
    if len(header) < 2:
        raise ValueError(
            f"{filename!r}: first line must contain k and the gap length"
        )

    k = int(header[0])
    gap = int(header[1])
    sequence = [
        "".join(ch for ch in row[0] if ch.isalnum())
        for row in rows[1:]
    ]
    return sequence, k, gap

# Create DE Bruijn Graph
def pairDBG(sequence, k):
    """Build the paired de Bruijn graph edge list for cleaned read pairs.

    Each entry of ``sequence`` is sliced into a source node
    ``[pair[0:k-1], pair[k:2k-1]]`` and a target node
    ``[pair[1:k], pair[k+1:2k]]``; the edge is ``[source, target]``.

    Args:
        sequence: Iterable of read-pair strings.
        k: k-mer length used for slicing.

    Returns:
        A list of edges, one per read pair, in input order.
    """
    edges = []
    for pair in sequence:
        source = [pair[:k - 1], pair[k:2 * k - 1]]
        target = [pair[1:k], pair[k + 1:2 * k]]
        edges.append([source, target])
    return edges

# Help us to find start point in graph
def pair_findStartPoint(graph):
    """Collect the target node of every edge in the paired graph.

    A source node absent from the returned list has no incoming edge, which
    is how the path builder picks its start node.

    Args:
        graph: List of edges, each indexable as ``edge[0]`` (source) and
            ``edge[1]`` (target).

    Returns:
        A list with the target of each edge, in edge order.
    """
    return [edge[1] for edge in graph]

# Create path from tracing graph and begin with start point
def pairPath(graph, tmp_tofind_startpoint):
    """Return an Eulerian path through the paired de Bruijn graph.

    The walk uses Hierholzer's algorithm, so every edge is traversed exactly
    once even when a paired node is visited several times. For a simple
    chain the result is the chain itself.

    Start node selection, in order of preference:
      1. the first edge source that does not occur in ``tmp_tofind_startpoint``
         (no incoming edge);
      2. the first edge source with more outgoing than incoming edges;
      3. the source of the first edge (the graph is a closed cycle).

    Args:
        graph: List of edges ``[source, target]`` whose nodes are
            two-element lists of strings. The graph is expected to contain
            an Eulerian path; for other graphs the returned node list is not
            guaranteed to be a valid walk.
        tmp_tofind_startpoint: List with the target node of every edge
            (used to derive in-degrees).

    Returns:
        The list of nodes along the path; ``[]`` for an empty graph.
    """
    from collections import Counter, defaultdict

    if not graph:
        return []

    def key_of(node):
        # Paired nodes are lists and therefore unhashable; use a tuple key.
        return tuple(node) if isinstance(node, list) else node

    node_of = {}                      # key -> first node object seen with that value
    successors = defaultdict(list)    # key -> keys of not-yet-used outgoing edges
    source_keys = []
    for edge in graph:
        source, target = key_of(edge[0]), key_of(edge[1])
        node_of.setdefault(source, edge[0])
        node_of.setdefault(target, edge[1])
        successors[source].append(target)
        source_keys.append(source)
    # pop() takes from the end; reversing keeps edges in input order.
    for pending in successors.values():
        pending.reverse()

    in_degree = Counter(key_of(node) for node in tmp_tofind_startpoint)
    out_degree = Counter(source_keys)

    start = next((key for key in source_keys if key not in in_degree), None)
    if start is None:
        start = next(
            (key for key in source_keys if out_degree[key] > in_degree[key]),
            source_keys[0],
        )

    # Iterative Hierholzer: follow unused edges until stuck, then back up,
    # emitting nodes in reverse order of completion.
    stack = [start]
    reversed_path = []
    while stack:
        pending = successors.get(stack[-1])
        if pending:
            stack.append(pending.pop())
        else:
            reversed_path.append(node_of[stack.pop()])
    reversed_path.reverse()
    return reversed_path

# Start create genome from path
def createpairgenome(path, k, gap):
    """Spell the genome described by a path of paired nodes.

    Two strings are built from the path: one from the first component of
    each node and one from the second component. In both, every node except
    the last contributes only its first character, and the last node
    contributes its whole component. The genome is the first string followed
    by the final ``k + gap`` characters of the second string.

    Args:
        path: Non-empty list of nodes, each indexable as ``node[0]`` and
            ``node[1]`` (strings).
        k: k-mer length.
        gap: Gap length between the two reads of a pair.

    Returns:
        The reconstructed genome string.
    """
    body = path[:-1]
    final_node = path[-1]

    prefix = ''.join([node[0][0] for node in body] + [final_node[0]])
    suffix = ''.join([node[1][0] for node in body] + [final_node[1]])

    tail_length = k + gap
    return prefix + suffix[-tail_length:]

# Main function
def PairRead(filename="/content/drive/My Drive/ReadPairsInput.txt"):
    """Assemble a genome from a paired-read input file and print it.

    Runs the full pipeline: read the file, build the paired de Bruijn graph,
    trace a path through it and spell the genome from that path.

    Args:
        filename: Path of the input file. Defaults to the notebook's
            original Google Drive location, so ``PairRead()`` behaves as
            before; pass another path to assemble a different file.

    Returns:
        The assembled genome string (also printed to stdout).
    """
    sequence, k, gap = readpairfile(filename)
    graph = pairDBG(sequence, k)
    startpoint = pair_findStartPoint(graph)
    path = pairPath(graph, startpoint)
    wholegenome = createpairgenome(path, k, gap)
    print("Whole genome:", wholegenome)
    return wholegenome

# Run the paired-read assembly on the input file configured in PairRead.
# As the last expression of the cell, the returned genome string is also
# displayed as the cell output, in addition to the printed line.
PairRead()


Whole genome: GAAAGGTACAAATACTGGCGACCTCGCTGTTCGACACTTCATCACTGCTCCGGGGCGCTCAGGAGGGACGGTTCCCTGTACCATTGGAAGTCAATAGTCTAAGGTACAAAGAGAAGACCCGACCCGACAGAGGGGGTTCTGCGCCGGGTTTCGAGCTTGTAACCCCCCAGAGAATTAGATCCACCGTCTGTGTGGACAAAGTAGTAAAGCTAGCATACCAAATTGAAATTCGGAGTTTGACTACCAGATCCACGCATACGCTGCACAAGGGACCCTGCTCACTCGATTGGGAATCTAATGCGGTCTGCCATGGGTAGAAATTCAGAACAAGGGACCCTGCTCACTCGATTGGGAATCTAATGCGGTCTGCCATGGGGGGGGTAATTCGTAGTTAGGTACAGAAAACTCCCGGACAGAACCGCATATAACCGATGAAGCAAGGGTTCTTCATTTAATACGACCCTAACCGGTATTGCTGCTAGCTTGATTTTCCTAGCAATCTAAACTCTATGTATGAGGCCACTCGGACGCCCGCTAGTGCCGGCAGCTAGCTACTGCCCTTCACCAGGAGCACGCACTATGCCTATCGGGCAATGCTGATCATACAAGGGACCCTGCTCACTCGATTGGGAATCTAATGCGGTCTGCCATGGGATCCCTCTGCAGAAAGCGGTGGCGGCGGGTCTAAGCAAGTCCAACGCAATACCAGGAAATCACCGTATCGTTAGCGACCAGTAGGTGATGGTTTGTAAGTTCGGACTACAGGCGGATGTGTCCCCGCCAGTTAAAAGTCGACTTTCTGTTACAACTGCTCCCTACAAGGGACCCTGCTCACTCGATTGGGAATCTAATGCGGTCTGCCATGGGTCTAATGATCCCCACAGATCGTGTTTCAACGTTGAAGTCTGAATGGGTTCGTGAATATAGCCATCCAACGTGGACAATAAGATGAGCTTTATAGTTTCCGATCCTCATGGCGATCGAAT

'GAAAGGTACAAATACTGGCGACCTCGCTGTTCGACACTTCATCACTGCTCCGGGGCGCTCAGGAGGGACGGTTCCCTGTACCATTGGAAGTCAATAGTCTAAGGTACAAAGAGAAGACCCGACCCGACAGAGGGGGTTCTGCGCCGGGTTTCGAGCTTGTAACCCCCCAGAGAATTAGATCCACCGTCTGTGTGGACAAAGTAGTAAAGCTAGCATACCAAATTGAAATTCGGAGTTTGACTACCAGATCCACGCATACGCTGCACAAGGGACCCTGCTCACTCGATTGGGAATCTAATGCGGTCTGCCATGGGTAGAAATTCAGAACAAGGGACCCTGCTCACTCGATTGGGAATCTAATGCGGTCTGCCATGGGGGGGGTAATTCGTAGTTAGGTACAGAAAACTCCCGGACAGAACCGCATATAACCGATGAAGCAAGGGTTCTTCATTTAATACGACCCTAACCGGTATTGCTGCTAGCTTGATTTTCCTAGCAATCTAAACTCTATGTATGAGGCCACTCGGACGCCCGCTAGTGCCGGCAGCTAGCTACTGCCCTTCACCAGGAGCACGCACTATGCCTATCGGGCAATGCTGATCATACAAGGGACCCTGCTCACTCGATTGGGAATCTAATGCGGTCTGCCATGGGATCCCTCTGCAGAAAGCGGTGGCGGCGGGTCTAAGCAAGTCCAACGCAATACCAGGAAATCACCGTATCGTTAGCGACCAGTAGGTGATGGTTTGTAAGTTCGGACTACAGGCGGATGTGTCCCCGCCAGTTAAAAGTCGACTTTCTGTTACAACTGCTCCCTACAAGGGACCCTGCTCACTCGATTGGGAATCTAATGCGGTCTGCCATGGGTCTAATGATCCCCACAGATCGTGTTTCAACGTTGAAGTCTGAATGGGTTCGTGAATATAGCCATCCAACGTGGACAATAAGATGAGCTTTATAGTTTCCGATCCTCATGGCGATCGAATAAGATCTATCCGC