In [1]:
import random
import numpy as np
import math

In [2]:
def generate_genome(length):
    """Return a random DNA string that is ``length`` bases long.

    Every position is drawn independently from the four letters A, C, G, T
    using the module-level ``random`` generator, so the result depends on
    the current RNG state. The notebook calls this with ``length=10000``.
    """
    alphabet = "ACGT"
    sampled_bases = random.choices(alphabet, k=length)
    return "".join(sampled_bases)

In [3]:
def generate_short_reads(genome, coverage_percentage, read_length=500):
    """Tile ``genome`` into consecutive reads, repeating each tile to reach the coverage.

    The genome is cut into back-to-back, non-overlapping windows of
    ``read_length`` bases (the final window may be shorter). Each window is
    emitted once per whole multiple of 100% coverage, and always at least
    once: 100 -> one copy, 200 -> two copies, anything below 100 -> one copy.

    Args:
        genome: Sequence to cut into reads (a string in this notebook).
        coverage_percentage: Desired coverage in percent, e.g. 200 for 2x.
        read_length: Window size in bases; must be positive.

    Returns:
        List of reads in genome order; copies of the same window are
        adjacent. An empty genome yields an empty list.

    Raises:
        ValueError: If ``read_length`` is not positive, because the window
            could then never advance through the genome.
    """
    if read_length <= 0:
        raise ValueError(f"read_length must be positive, got {read_length!r}")

    total_bases = len(genome)
    if total_bases == 0:
        # Nothing to tile; this also avoids dividing by zero below.
        return []

    coverage = coverage_percentage / 100.0
    # Total bases to emit, then how many whole passes over the genome that is.
    target_bases = int(total_bases * coverage)
    copies_per_window = max(1, target_bases // total_bases)

    reads = []
    for start in range(0, total_bases, read_length):
        window = genome[start:start + read_length]  # slicing clamps at the genome end
        reads.extend([window] * copies_per_window)
    return reads

In [4]:
def introduce_mutations(genome, mutation_rate):
    """Return a copy of ``genome`` with a fraction of its positions substituted.

    ``int(len(genome) * mutation_rate)`` distinct positions are sampled
    without replacement, and each one is replaced by a base drawn from
    A/C/G/T excluding the base currently at that position. The result
    therefore differs from ``genome`` at exactly that many positions.

    Args:
        genome: Original sequence (string).
        mutation_rate: Fraction of positions to substitute, e.g. 0.05 for 5%.

    Returns:
        The mutated sequence as a new string of the same length.

    Raises:
        ValueError: Propagated from ``random.sample`` when the computed
            number of positions is negative or larger than the genome.
    """
    bases = ('A', 'C', 'G', 'T')
    mutation_count = int(len(genome) * mutation_rate)
    mutation_indices = random.sample(range(len(genome)), mutation_count)
    mutated = list(genome)
    for index in mutation_indices:
        # Exclude the current base: picking from all four would leave the
        # position unchanged about a quarter of the time.
        alternatives = [base for base in bases if base != mutated[index]]
        mutated[index] = random.choice(alternatives)
    return ''.join(mutated)

In [5]:
def find_overlap(read1, read2, min_overlap=5):
    """Return the length of the longest suffix of ``read1`` that is a prefix of ``read2``.

    Candidate lengths are tried from ``min(len(read1), len(read2))`` down to
    ``min_overlap``; the first (longest) match wins.

    Args:
        read1: Left-hand sequence; its end is compared.
        read2: Right-hand sequence; its start is compared.
        min_overlap: Shortest match that counts as an overlap. The default
            of 5 keeps the threshold this function has always used.

    Returns:
        The overlap length, or 0 when no match of at least ``min_overlap``
        bases exists.

    Raises:
        ValueError: If ``min_overlap`` is below 1. A length of 0 would make
            ``read1[-0:]`` the whole string rather than an empty suffix.
    """
    if min_overlap < 1:
        raise ValueError(f"min_overlap must be at least 1, got {min_overlap!r}")

    longest_possible = min(len(read1), len(read2))
    for overlap in range(longest_possible, min_overlap - 1, -1):
        # Compare the end of read1 with the beginning of read2.
        if read1[-overlap:] == read2[:overlap]:
            return overlap
    return 0
def reconstruct_genome(short_reads):
    """Greedily merge reads, in the order given, into one sequence.

    The first read seeds the assembly. Each later read is joined by asking
    ``find_overlap`` how many of its leading bases already appear at the end
    of the assembly and appending only the remainder. When no overlap is
    reported the whole read is appended.

    Reads are never reordered, so the input must already be in genome order.
    Any suffix/prefix match that ``find_overlap`` accepts is treated as a
    real overlap, so a coincidental match between adjacent reads drops
    those bases from the result.

    Args:
        short_reads: Iterable of read strings in assembly order.

    Returns:
        The assembled sequence, or an empty string when there are no reads.
    """
    remaining_reads = iter(short_reads)
    assembled_genome = next(remaining_reads, None)  # Start with the first read
    if assembled_genome is None:
        # No reads at all: nothing to assemble.
        return ""
    for read in remaining_reads:
        overlap = find_overlap(assembled_genome, read)
        assembled_genome += read[overlap:]  # Append the part of the read that doesn't overlap
    return assembled_genome

In [6]:
# Seed the RNG before the first random draw so the genome, the mutation rate
# and the mutated reference produced by later cells are the same on every
# Restart & Run All. Change the seed to explore a different random genome.
random.seed(42)
genome = generate_genome(10000)

In [7]:
# Show the generated genome; this prints the entire sequence on a single line.
print(genome)

AGGGTCTTGCGACGGTGTCTATAATCGATTATCGGCCCTATCCATGACAATGAGGAAATTGCCATGTACCCGAGTGACCACATATTGACGTTCGGGGGGTGTTATGATGCAACTTTGATCAAACCCGGCTTAACTTGTAATGCAATCAAAAGAGGCATGGGATCTGGAGGTTCACGCGCATTTGTAAGTGGGACGCCTACCACGGTGTCTACCCATGCATGTTTGAGGTGTGAGGTTTAAGTTCCTTAGGTTGGGCAGATGCTAGCCGAGAAAGACAACATTCTACGCTGAAAGTCGTGCGCACTTACTCGAACCCCTCTTAATGGAGCCTAGAGGTATTTCATCTTCTTGCGTTGTTAACCACAAATCTAGGCTCTCGACATTAGGCCTGTCTTCGTACCTTCAACGACGTCTGCAGAATCGCGCGCGTATAACCGCCGGCATACATGCGACGAGAGCGTATCTCTGCTAAAGTAGAACATGTAGAACATGACATCGAATACGGGTCCCTTCGTTGGTAAACCAGACGCGATCACCGGGACGGGATAGTGCGCCCGATTCACCCTAATAACGTAAGCTGAGGGCTCTTAGAACCCATGTGCGGTGACATACGATTGGTCAGTTCATTGGACGCCTCCCTGTCTGGGGGGGCATCTGATGGCACTTCATAGTCACGCACCAGGTTGTTGCCGGCACCGACAGCCGATATTAAATTCGGACCTCGTGTCGCCACCCGAAGCATCAACGCCACGTGGTTGGCTCGTATTGATAGAATACGATGGTGATTCTAGTCACCCAGCATTCATAACCGTTGAGGCGAAGCAGTTCAAACGAATTCATCGGCTGTTCGTATCCTCCTCCTTCAGTCTGTATATTCAACTGTGCCAACATTATAACGAGGCCTGGTCGACACAACCCGTACGGTGGCGGCTGATTTTGTCGCTTTCGTGTCCACTCGTAAAGTGGGGGTGCGATTAGGACACATGGGTTAGAGCTCGCA

In [8]:
# Coverage levels in percent; short_reads[i] is the read set generated for coverages[i].
coverages = [100, 200, 300]  # Adjust coverages as needed
short_reads = list(
    map(lambda pct: generate_short_reads(genome, pct, read_length=50), coverages)
)

In [9]:
# Sanity check: number of read sets, number of reads in the first set,
# length of its first read, and that first read itself.
print(len(short_reads),len(short_reads[0]),len(short_reads[0][0]), short_reads[0][0])

3 200 50 AGGGTCTTGCGACGGTGTCTATAATCGATTATCGGCCCTATCCATGACAA


In [10]:
# Draw a substitution rate between 5% and 10%, then build the mutated
# reference sequence from the generated genome at that rate.
mutation_rate = random.uniform(0.05, 0.10)
reference_sequence = introduce_mutations(genome, mutation_rate)

In [11]:
# Report whether the mutated reference actually differs from the source genome.
print(
    "Reference sequence successfully generated."
    if reference_sequence != genome
    else "Reference sequence generation failed."
)

Reference sequence successfully generated.


In [12]:
# Assemble the first read set (built from coverages[0], i.e. 100%) from the reads alone.
# NOTE(review): the variable name says "mapping", but reconstruct_genome never
# receives the reference sequence; the check below calls this "denovo".
reconstructed_genome_mapping = reconstruct_genome(short_reads[0]) #runtime = 0 seconds

In [13]:
# Print the source genome and the assembled sequence on separate lines for visual comparison.
print("genome: ",genome, "\nNew:    ", reconstructed_genome_mapping)

genome:  AGGGTCTTGCGACGGTGTCTATAATCGATTATCGGCCCTATCCATGACAATGAGGAAATTGCCATGTACCCGAGTGACCACATATTGACGTTCGGGGGGTGTTATGATGCAACTTTGATCAAACCCGGCTTAACTTGTAATGCAATCAAAAGAGGCATGGGATCTGGAGGTTCACGCGCATTTGTAAGTGGGACGCCTACCACGGTGTCTACCCATGCATGTTTGAGGTGTGAGGTTTAAGTTCCTTAGGTTGGGCAGATGCTAGCCGAGAAAGACAACATTCTACGCTGAAAGTCGTGCGCACTTACTCGAACCCCTCTTAATGGAGCCTAGAGGTATTTCATCTTCTTGCGTTGTTAACCACAAATCTAGGCTCTCGACATTAGGCCTGTCTTCGTACCTTCAACGACGTCTGCAGAATCGCGCGCGTATAACCGCCGGCATACATGCGACGAGAGCGTATCTCTGCTAAAGTAGAACATGTAGAACATGACATCGAATACGGGTCCCTTCGTTGGTAAACCAGACGCGATCACCGGGACGGGATAGTGCGCCCGATTCACCCTAATAACGTAAGCTGAGGGCTCTTAGAACCCATGTGCGGTGACATACGATTGGTCAGTTCATTGGACGCCTCCCTGTCTGGGGGGGCATCTGATGGCACTTCATAGTCACGCACCAGGTTGTTGCCGGCACCGACAGCCGATATTAAATTCGGACCTCGTGTCGCCACCCGAAGCATCAACGCCACGTGGTTGGCTCGTATTGATAGAATACGATGGTGATTCTAGTCACCCAGCATTCATAACCGTTGAGGCGAAGCAGTTCAAACGAATTCATCGGCTGTTCGTATCCTCCTCCTTCAGTCTGTATATTCAACTGTGCCAACATTATAACGAGGCCTGGTCGACACAACCCGTACGGTGGCGGCTGATTTTGTCGCTTTCGTGTCCACTCGTAAAGTGGGGGTGCGATTAGGACACATGGGTTA

In [14]:
# Exact-match check of the assembled sequence against the source genome.
print(
    "Genome successfully reconstructed using denovo."
    if reconstructed_genome_mapping == genome
    else "Genome reconstruction failed using denovo."
)


Genome successfully reconstructed using denovo.


In [15]:
# Assemble the second read set (built from coverages[1], i.e. 200%); this
# overwrites the result of the previous assembly.
reconstructed_genome_mapping = reconstruct_genome(short_reads[1]) #runtime = 0 seconds

In [16]:
# Print the source genome and the 200%-coverage assembly on separate lines for visual comparison.
print("genome: ",genome, "\nNew:    ", reconstructed_genome_mapping)

genome:  AGGGTCTTGCGACGGTGTCTATAATCGATTATCGGCCCTATCCATGACAATGAGGAAATTGCCATGTACCCGAGTGACCACATATTGACGTTCGGGGGGTGTTATGATGCAACTTTGATCAAACCCGGCTTAACTTGTAATGCAATCAAAAGAGGCATGGGATCTGGAGGTTCACGCGCATTTGTAAGTGGGACGCCTACCACGGTGTCTACCCATGCATGTTTGAGGTGTGAGGTTTAAGTTCCTTAGGTTGGGCAGATGCTAGCCGAGAAAGACAACATTCTACGCTGAAAGTCGTGCGCACTTACTCGAACCCCTCTTAATGGAGCCTAGAGGTATTTCATCTTCTTGCGTTGTTAACCACAAATCTAGGCTCTCGACATTAGGCCTGTCTTCGTACCTTCAACGACGTCTGCAGAATCGCGCGCGTATAACCGCCGGCATACATGCGACGAGAGCGTATCTCTGCTAAAGTAGAACATGTAGAACATGACATCGAATACGGGTCCCTTCGTTGGTAAACCAGACGCGATCACCGGGACGGGATAGTGCGCCCGATTCACCCTAATAACGTAAGCTGAGGGCTCTTAGAACCCATGTGCGGTGACATACGATTGGTCAGTTCATTGGACGCCTCCCTGTCTGGGGGGGCATCTGATGGCACTTCATAGTCACGCACCAGGTTGTTGCCGGCACCGACAGCCGATATTAAATTCGGACCTCGTGTCGCCACCCGAAGCATCAACGCCACGTGGTTGGCTCGTATTGATAGAATACGATGGTGATTCTAGTCACCCAGCATTCATAACCGTTGAGGCGAAGCAGTTCAAACGAATTCATCGGCTGTTCGTATCCTCCTCCTTCAGTCTGTATATTCAACTGTGCCAACATTATAACGAGGCCTGGTCGACACAACCCGTACGGTGGCGGCTGATTTTGTCGCTTTCGTGTCCACTCGTAAAGTGGGGGTGCGATTAGGACACATGGGTTA

In [17]:
# Exact-match check of the 200%-coverage assembly against the source genome.
print(
    "Genome successfully reconstructed using denovo."
    if reconstructed_genome_mapping == genome
    else "Genome reconstruction failed using denovo."
)


Genome successfully reconstructed using denovo.


In [18]:
# Assemble the third read set (built from coverages[2], i.e. 300%); this
# overwrites the result of the previous assembly.
reconstructed_genome_mapping = reconstruct_genome(short_reads[2]) #runtime = 0 seconds

In [19]:
# Print the source genome and the 300%-coverage assembly on separate lines for visual comparison.
print("genome: ",genome, "\nNew:    ", reconstructed_genome_mapping)

genome:  AGGGTCTTGCGACGGTGTCTATAATCGATTATCGGCCCTATCCATGACAATGAGGAAATTGCCATGTACCCGAGTGACCACATATTGACGTTCGGGGGGTGTTATGATGCAACTTTGATCAAACCCGGCTTAACTTGTAATGCAATCAAAAGAGGCATGGGATCTGGAGGTTCACGCGCATTTGTAAGTGGGACGCCTACCACGGTGTCTACCCATGCATGTTTGAGGTGTGAGGTTTAAGTTCCTTAGGTTGGGCAGATGCTAGCCGAGAAAGACAACATTCTACGCTGAAAGTCGTGCGCACTTACTCGAACCCCTCTTAATGGAGCCTAGAGGTATTTCATCTTCTTGCGTTGTTAACCACAAATCTAGGCTCTCGACATTAGGCCTGTCTTCGTACCTTCAACGACGTCTGCAGAATCGCGCGCGTATAACCGCCGGCATACATGCGACGAGAGCGTATCTCTGCTAAAGTAGAACATGTAGAACATGACATCGAATACGGGTCCCTTCGTTGGTAAACCAGACGCGATCACCGGGACGGGATAGTGCGCCCGATTCACCCTAATAACGTAAGCTGAGGGCTCTTAGAACCCATGTGCGGTGACATACGATTGGTCAGTTCATTGGACGCCTCCCTGTCTGGGGGGGCATCTGATGGCACTTCATAGTCACGCACCAGGTTGTTGCCGGCACCGACAGCCGATATTAAATTCGGACCTCGTGTCGCCACCCGAAGCATCAACGCCACGTGGTTGGCTCGTATTGATAGAATACGATGGTGATTCTAGTCACCCAGCATTCATAACCGTTGAGGCGAAGCAGTTCAAACGAATTCATCGGCTGTTCGTATCCTCCTCCTTCAGTCTGTATATTCAACTGTGCCAACATTATAACGAGGCCTGGTCGACACAACCCGTACGGTGGCGGCTGATTTTGTCGCTTTCGTGTCCACTCGTAAAGTGGGGGTGCGATTAGGACACATGGGTTA

In [20]:
# Exact-match check of the 300%-coverage assembly against the source genome.
print(
    "Genome successfully reconstructed using denovo."
    if reconstructed_genome_mapping == genome
    else "Genome reconstruction failed using denovo."
)


Genome successfully reconstructed using denovo.
