# Algorithms for Sequence Assembly

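This notebook implements the shortest common superstring (SCS) approach to assembling DNA sequences: first by exhaustive search over all orderings of the reads, then with a faster greedy approximation.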

In [1]:
import itertools

In [2]:
def overlap(a, b, min_length=3):
    """ Return length of longest suffix of 'a' matching
        a prefix of 'b' that is at least 'min_length'
        characters long.  If no such overlap exists
        (including when 'b' itself is shorter than
        'min_length'), return 0. """
    if len(b) < min_length:
        # b[:min_length] would silently shrink to all of b, letting an
        # overlap shorter than min_length be reported.
        return 0
    start = 0  # start all the way at the left
    while True:
        start = a.find(b[:min_length], start)  # look for b's prefix in a
        if start == -1:  # no more occurrences to right
            return 0
        # found occurrence; check for full suffix/prefix match
        if b.startswith(a[start:]):
            return len(a) - start
        start += 1  # move just past previous match


def scs(ss):
    """ Returns shortest common superstring of given
        strings, which must be the same length.

        Every ordering of the strings is tried; within an
        ordering each string is merged onto the previous one
        using their longest suffix/prefix overlap (at least 1
        character).  The first shortest result found is
        returned.  An empty input yields the empty string. """
    if not ss:
        return ''  # nothing to cover; avoids indexing an empty permutation
    shortest_sup = None
    for ssperm in itertools.permutations(ss):
        sup = ssperm[0]  # superstring starts as first string
        # walk adjacent pairs (A, B) of the permutation
        for prev, nxt in zip(ssperm, ssperm[1:]):
            olen = overlap(prev, nxt, min_length=1)
            # add non-overlapping portion of B to superstring
            sup += nxt[olen:]
        if shortest_sup is None or len(sup) < len(shortest_sup):
            shortest_sup = sup  # found shorter superstring
    return shortest_sup  # return shortest

In [3]:
# Length of the shortest common superstring of six 3-mers.
len(scs(['CCT', 'CTT', 'TGC', 'TGG', 'GAT', 'ATT']))

11

In [4]:
def scs_count(ss):
    """ Returns a tuple (shortest_sup, num_shortest_sup) for the
        given strings, which must be the same length.

        shortest_sup is the first shortest common superstring
        found while trying every ordering of the strings;
        num_shortest_sup is the number of orderings whose merged
        superstring has that same minimal length.  An empty
        input yields ('', 1). """
    if not ss:
        return '', 1  # the single (empty) ordering gives the empty string
    shortest_sup = None
    num_shortest_sup = 0
    for ssperm in itertools.permutations(ss):
        sup = ssperm[0]  # superstring starts as first string
        # walk adjacent pairs (A, B) of the permutation
        for prev, nxt in zip(ssperm, ssperm[1:]):
            olen = overlap(prev, nxt, min_length=1)
            # add non-overlapping portion of B to superstring
            sup += nxt[olen:]
        if shortest_sup is None or len(sup) < len(shortest_sup):
            shortest_sup = sup  # found shorter superstring
            num_shortest_sup = 1  # restart the tie count
        elif len(sup) == len(shortest_sup):
            num_shortest_sup += 1  # another ordering ties the current best
    return shortest_sup, num_shortest_sup

The cells below test shortest-common-superstring assembly on a few small examples.

In [5]:
# Toy example: the three rotations of 'ABC'.
strings = ['ABC', 'BCA', 'CAB']
scs_count(strings)

('ABCAB', 3)

In [6]:
# Six 3-mers: report one shortest superstring and how many orderings reach that length.
strings = ['GAT', 'TAG', 'TCG', 'TGC', 'AAT', 'ATA']
scs_count(strings)

('TCGATGCAATAG', 10)

In [7]:
# The same six 3-mers passed to scs() earlier, now with the tie count as well.
strings = ['CCT', 'CTT', 'TGC', 'TGG', 'GAT', 'ATT']
scs_count(strings)

('CCTTGGATTGC', 4)

In [8]:
def pick_maximal_overlap(reads, k):
    """ Return (a, b, olen) for the ordered pair of reads whose
        suffix/prefix overlap is longest, considering only
        overlaps of at least 'k' characters.  Pairs are formed
        by position, so a read is never paired with itself.
        Ties keep the first pair encountered.  If no pair
        overlaps, return (None, None, 0). """
    winner = (None, None, 0)
    for left, right in itertools.permutations(reads, 2):
        shared = overlap(left, right, min_length=k)
        if shared > winner[2]:
            # strictly longer overlap: this pair becomes the new best
            winner = (left, right, shared)
    return winner

In [9]:
def greedy_scs(reads, k):
    """ Greedy shortest-common-superstring assembly.

        Repeatedly merges the pair of reads with the longest
        suffix/prefix overlap of at least 'k' characters until
        no such pair remains, then returns the remaining strings
        concatenated.  The result is a superstring of the reads
        but is not guaranteed to be the shortest one.

        The caller's 'reads' is not modified; merging happens on
        a copy, so any iterable of strings is accepted. """
    reads = list(reads)  # work on a copy so the caller's list survives
    read_a, read_b, olen = pick_maximal_overlap(reads, k)
    while olen > 0:
        # replace the best-overlapping pair with their merge
        reads.remove(read_a)
        reads.remove(read_b)
        reads.append(read_a + read_b[olen:])
        read_a, read_b, olen = pick_maximal_overlap(reads, k)
    return ''.join(reads)

In [10]:
# Greedy assembly of the three-read toy example with a minimum overlap of 2.
greedy_scs(['ABC', 'BCA', 'CAB'], 2)

'CABCA'

In [11]:
def readReads(filename):
    """ Return the list of read sequences stored in a FASTQ file.

        Each record is expected to be a '@' header line, one or
        more sequence lines, a '+' separator line, and quality
        lines totalling as many characters as the sequence.
        Only the sequences are returned; headers and qualities
        are discarded.  An empty file yields an empty list. """
    reads = []
    with open(filename, 'r') as f:
        lines = iter(f)
        for header in lines:
            if not header.startswith('@'):
                continue  # blank or stray line between records
            seq_parts = []
            for seq_line in lines:
                if seq_line.startswith('+'):
                    break  # separator: the quality string follows
                seq_parts.append(seq_line.rstrip())
            seq = ''.join(seq_parts)
            # Consume the quality by length rather than by content: '@' is
            # a legal quality character, so a quality line can look exactly
            # like a header line.
            remaining = len(seq)
            while remaining > 0:
                qual_line = next(lines, None)
                if qual_line is None:
                    break  # truncated record at end of file
                remaining -= len(qual_line.rstrip())
            reads.append(seq)
    return reads

In [12]:
# Load the read sequences from the FASTQ file (path is relative to the working directory).
reads = readReads('ads1_week4_reads.fq')

In [13]:
# Number of reads loaded.
len(reads)

1881

In [14]:
def greedy_scs(reads, k):
    """ Greedy shortest-common-superstring assembly.

        Repeatedly merges the pair of reads with the longest
        suffix/prefix overlap of at least 'k' characters until
        no such pair remains, then returns the remaining strings
        concatenated.  The result is a superstring of the reads
        but is not guaranteed to be the shortest one.

        The caller's 'reads' is not modified; merging happens on
        a copy, so any iterable of strings is accepted. """
    reads = list(reads)  # work on a copy so the caller's list survives
    read_a, read_b, olen = pick_maximal_overlap(reads, k)
    while olen > 0:
        # replace the best-overlapping pair with their merge
        reads.remove(read_a)
        reads.remove(read_b)
        reads.append(read_a + read_b[olen:])
        read_a, read_b, olen = pick_maximal_overlap(reads, k)
    return ''.join(reads)

In [15]:
def pick_max_overlap_faster(reads, k):
    """ Return (a, b, olen) for the pair of reads with the longest
        suffix/prefix overlap of at least 'k' characters, or
        (None, None, 0) if there is none.

        Instead of comparing every pair, a k-mer index is built
        and each read is only compared against reads containing
        its length-k suffix.  Reads are compared by value, so two
        identical reads are never paired with each other.  Reads
        shorter than 'k' cannot take part in any overlap and are
        skipped.  Candidates live in sets, so which pair wins a
        tie depends on set iteration order. """
    # Index every k-mer to the set of reads that contain it.
    kmer_dict = {}
    for read in reads:
        for i in range(len(read) - k + 1):
            kmer_dict.setdefault(read[i:i + k], set()).add(read)

    reada, readb = None, None
    best_olen = 0
    # finding overlaps for each read
    for read in reads:
        # A read shorter than k has no entry in the index; .get avoids
        # the KeyError a plain lookup would raise for it.
        for possible_pair in kmer_dict.get(read[-k:], ()):
            if possible_pair == read:
                continue  # cheap self-pair check before the costly overlap
            olen = overlap(read, possible_pair, min_length=k)
            if olen > best_olen:
                reada, readb, best_olen = read, possible_pair, olen
    return reada, readb, best_olen

In [16]:
def greedy_scs_faster(reads, k):
    """ Greedy shortest-common-superstring assembly using the
        k-mer-indexed pair search.

        Repeatedly merges the pair returned by
        pick_max_overlap_faster (overlap of at least 'k'
        characters) until no such pair remains, then returns the
        remaining strings concatenated.

        The caller's 'reads' is not modified; merging happens on
        a copy, so any iterable of strings is accepted. """
    reads = list(reads)  # work on a copy so the caller's list survives
    read_a, read_b, olen = pick_max_overlap_faster(reads, k)
    while olen > 0:
        # replace the best-overlapping pair with their merge
        reads.remove(read_a)
        reads.remove(read_b)
        reads.append(read_a + read_b[olen:])
        read_a, read_b, olen = pick_max_overlap_faster(reads, k)
    return ''.join(reads)

In [17]:
# Greedy assembly of the loaded reads with a minimum overlap of 20.
greedy_scs_results = greedy_scs_faster(reads, 20)

In [18]:
# Preview the first 200 characters of the assembly.
print(greedy_scs_results[:200])

ACCAAACAAAGTTGGGTAAGGATAGATCAATCAATGATCATATTCTAGTACACTTAGGATTCAAGATCCTATTATCAGGGACAAGAGCAGGATTAGGGATATCCGAGATGGCCACACTTTTGAGGAGCTTAGCATTGTTCAAAAGAAACAAGGACAAACCACCCATTACATCAGGATCCGGTGGAGCCATCAGAGGAATC


In [19]:
# Total length of the assembled sequence.
print(len(greedy_scs_results))

15894


In [20]:
# Count the A bases (either case) in the assembled sequence.
a_count = sum(1 for base in greedy_scs_results if base in ('A', 'a'))
print(a_count)


4633


In [21]:
# Count the T bases (either case) in the assembled sequence.
t_count = sum(1 for base in greedy_scs_results if base in ('t', 'T'))
print(t_count)

3723
