*to handle errors and diploidy*:
    
    1. find every unsequenced read
    2. see if the unsequenced reads exist in the constructed sequence
    3. construct sequences with unseen reads until they combine with a sequenced read in both directions
    4. sort kmers by frequency of kmer occurence in all reads, more frequent kmers are more likely polyploidy and less frequent kmers are more likely error

In [23]:
import random

In [24]:
class Sequitur:
    def __init__(self,read):
        self.sequence = read.read
        self.reads = [read]

    def __repr__(self): return self.sequence

    def __getitem__(self,key): return self.sequence[key]

    def __eq__(self,other): return self.sequence == other

    def __add__(self,other): 
        self.sequence += other
        return self

    def __radd__(self,other): 
        self.sequence = other + self.sequence
        return self

    def partition(self,root): return self.sequence.partition(root)

    def rpartition(self,root): return self.sequence.rpartition(root)

In [25]:
''' 
The read is held by a leaf and sustains their uniqueness and ensures they're added to the sequence at most once.
'''
class Read:
    def __init__(self,read):
        self.read = read
        self.is_sequenced = False

    def __repr__(self): return self.read

    def endswith(self,prefix): return self.read.endswith(prefix)
    def startswith(self,suffix): return self.read.startswith(suffix)

    def partition(self,root,dir):
        if dir: return Node(self.read.rpartition(root)[0],dir), self.read.rpartition(root)[2]
        else: return Node(self.read.partition(root)[2],dir), self.read.partition(root)[0]


In [26]:
'''
A leaf is the end point of a branch and can only the gained information. 
Every Trie must have a leaf for every read containing the root
Leaves can become branches
'''
class Leaf:
    def __init__(self,context,information,read):
        self.context = context
        if len(information) == 0: self.information = '$'
        else: self.information = information
        self.read = read

    def __repr__(self): return str(self.information)
    
    def branch(self,context):
        stalk = ""
        i = 0
        while i < min(len(context.stalk),len(self.context.stalk)) and context[i] == self.context[i]: 
            stalk += context[i]
            i += 1
        return stalk,self.context.stalk[len(stalk):]

In [27]:
'''
The node is a convenience class for accessing the branch
'''
class Node:
    def __init__(self,stalk,dir=0):
        if len(stalk) > 0: self.stalk = stalk
        else: self.stalk = "^"
        self.reversed = False
        if dir: 
            self.stalk = ''.join(reversed(self.stalk))
            self.reversed = True

    def __eq__(self,other): #return self.stalk[0] == other.stalk[0]
        i = 0
        while i < min(len(self.stalk),len(other.stalk)):
            if self.stalk[i] != other.stalk[i]: return False
            i += 1
        return True

    def __hash__(self): return hash(self.stalk[0])

    def __getitem__(self,index): return self.stalk[index]

    def __repr__(self): return self.stalk

    def __len__(self):
        if self.stalk == '^': return 0
        else: return len(self.stalk)

In [28]:
'''
A branch has either a collection of branches or a collection of leaves
Every branch must have at least 1 leaf
'''
class Branch:
    def __init__(self,root,network,node=None,foothold=None):
        self.branches = {}
        self.leaves = {}
        self.root = root
        self.network = network
        self.node = node
        self.foothold = foothold

    def grow(self,leaf,dir):
        leaf.context.stalk = leaf.context[len(self.node):]
        if leaf.context.stalk == '': leaf.context.stalk = '^'
        if leaf.context in self.branches: self.branches[leaf.context].grow(leaf,dir)
        else:
            node = Node('',dir)
            if leaf.context.stalk == '^':
                if node in self.leaves: self.leaves[node] += [leaf]
                else: self.leaves[node] = [leaf]
            else:
                stalk = ''
                new_leaves = []
                for l in list(self.leaves):
                    s = leaf.branch(l)[0]
                    if len(s) == 0: continue
                    elif len(stalk) == 0 or len(s) < len(stalk): 
                        stalk = s
                        new_leaves += [(l,self.leaves.pop(l))]
                if len(stalk) > 0: node.stalk = stalk
                self.branches[node] = Branch(self.root,self.network,node,self)
                for l in new_leaves:
                    l[0].stalk = l[0][len(node):]
                    if l[0].stalk == '':
                        l[0].stalk = '^'
                        if l[0] in self.branches[node].leaves: self.branches[node].leaves[l[0]] += [l[1]]
                        else: self.branches[node].leaves[l[0]] = [l[1]]
                    else: self.branches[node].leaves[l[0]] = l[1]
                leaf.context.stalk = leaf.context.stalk[len(node):]
                if leaf.context.stalk == '':
                    leaf.context.stalk = '^'
                    if leaf.context in self.branches[node].leaves: self.branches[node].leaves[leaf.context] += [leaf]
                    else: self.branches[node].leaves[leaf.context] = [leaf]
                else: self.branches[node].leaves[leaf.context] = leaf

    def climb(self,sequence,dir,stalk='',join_at=None):
        if type(self) == Root:
            stalk = self.root
            if dir: context,_,_ = sequence.rpartition(stalk)
            else: _,_,context = sequence.partition(stalk)
        else:
            if dir: 
                stalk = self.node.stalk + stalk
                context,_,_ = sequence.rpartition(stalk)
            else: 
                stalk += self.node.stalk 
                _,_,context = sequence.partition(stalk)
        context = Node(context,dir)
        if context in self.branches: return self.branches[context].climb(sequence,dir,stalk,join_at)
        else: 
            if join_at and context in self.leaves and not context.stalk == '^' and join_at == self.leaves[context].read: return True
            if context in self.leaves and not context.stalk == '^' and not self.leaves[context].read.is_sequenced and self.joins_with(self.leaves[context].read,sequence,not dir): 
                if dir: sequence += self.leaves[context].information
                else: sequence = self.leaves[context].information + sequence
                self.leaves[context].read.is_sequenced = True
                sequence.reads += [self.leaves[context].read]
                return sequence
            else:
                carat = Node('^',dir) 
                if carat in self.leaves:
                    for leaf in self.leaves[carat]:
                        if join_at and join_at == leaf.read: return True
                        elif join_at: continue
                        if leaf.read == sequence.reads[-1] or leaf.read.is_sequenced: continue
                        if self.joins_with(leaf.read,sequence,not dir):
                            if dir: sequence += leaf.information
                            else: sequence = leaf.information + sequence
                            leaf.read.is_sequenced = True
                            sequence.reads += [leaf.read]
                            return sequence
                if self.foothold:
                    return self.descend(sequence,dir,stalk[len(self.node.stalk):],join_at)
                else: 
                    if join_at: return False
                    if dir: return sequence + '$'
                    else: return '^' + sequence
    
    def joins_with(self,read,sequence,dir):
        if dir: # starts with
            root = read.read[-self.network.k:]
            ind = sequence.sequence.index(root)
            suf = sequence.sequence[:ind+self.network.k]
            b = read.endswith(suf)
            return b
        else: # ends with
            root = read.read[:self.network.k]
            ind = sequence.sequence.rindex(root)
            pre = sequence.sequence[ind:]
            b = read.startswith(pre)
            return b
        # if dir: return read.endswith(sequence.sequence[:sequence.sequence.index(read.read[-self.network.k:])+self.network.k])
        # else: return read.startswith(sequence.sequence[sequence.sequence.rindex(read.read[:self.network.k]):])

    def descend(self,sequence,dir,stalk,join_at):
        carat = Node('',dir)
        if carat in self.foothold.leaves:
            for leaf in self.foothold.leaves[carat]:
                if join_at and join_at == leaf.read: return True
                elif join_at: continue
                if leaf.read == sequence.reads[-1] or leaf.read.is_sequenced: continue
                if self.joins_with(leaf.read,sequence,not dir):
                    if dir: sequence += leaf.information
                    else: sequence = leaf.information + sequence
                    leaf.read.is_sequenced = True
                    sequence.reads += [leaf.read]
                    return sequence
        if self.foothold:
            return self.descend(sequence,dir,stalk[len(self.node.stalk):],join_at)
        else: 
            if join_at: return False
            if dir: return sequence + '$'
            else: return '^' + sequence



In [29]:
'''
A root is a branch with a connection to the network and a list of the reads it comprises of.
It can have a collection of branches and leaves.
'''
class Root(Branch):
    def __init__(self,root,network):
        super().__init__(root,network)
        self.reads = []

    def __repr__(self):
        return self.root
        
    def add_read(self,read):
        self.reads += [read]
        self.network.reads += [read]

    def grow(self,sequence,dir,join_at=None):
        if len(self.reads) > 0: 
            if len(self.leaves) == 0: self.sprout(sequence,dir)
            else: 
                read = self.reads.pop()
                while read.is_sequenced and len(self.reads) > 0: read = self.reads.pop()
                if not read.is_sequenced:
                    context,information = read.partition(self.root,dir)
                    if read.read == sequence: 
                        read.is_sequenced = True
                        sequence.reads += [read]
                    if context in self.leaves: 
                        node = Node(self.leaves[context].branch(context)[0])
                        self.branches[node] = Branch(self.root,self.network,node)
                        self.leaves[context].context = Node(self.leaves[context].context.stalk.partition(node.stalk)[2])
                        if self.leaves[context].context.stalk == '^': 
                            if self.leaves[context].context in self.branches[node].leaves: self.branches[node].leaves[self.leaves[context].context] += [Leaf(self.leaves[context].context,self.leaves[context].information,self.leaves[context].read)]
                            else: self.branches[node].leaves[self.leaves[context].context] = [Leaf(self.leaves[context].context,self.leaves[context].information,self.leaves[context].read)]
                        else: self.branches[node].leaves[self.leaves[context].context] = Leaf(self.leaves[context].context,self.leaves[context].information,self.leaves[context].read)
                        self.leaves.pop(context)
                        context = Node(context.stalk.partition(node.stalk)[2])
                        if context.stalk == '^': 
                            if context in self.branches[node].leaves: self.branches[node].leaves[context] += [Leaf(context,information,read)]
                            else: self.branches[node].leaves[context] = [Leaf(context,information,read)]
                        else: self.branches[node].leaves[context] = Leaf(context,information,read)
                    else: 
                        if context.stalk == '^': self.leaves[context] = [Leaf(context,information,read)]
                        else: self.leaves[context] = Leaf(context,information,read)
            return self.grow(sequence,dir,join_at)
        else: 
            if join_at: return self.climb(sequence,dir,'',join_at)
            sequence = self.climb(sequence,dir)
            if sequence[0] == '^' and not dir: return self.network.get_root(sequence,1).grow(sequence,1)
            elif sequence[0] == '^' and sequence[-1] == '$': return sequence
            return self.network.get_root(sequence,dir).grow(sequence,dir)

    def sprout(self,sequence,dir):
        read = self.reads.pop()
        while read.is_sequenced and len(self.reads) > 0: read = self.reads.pop()
        if not read.is_sequenced:
            context,information = read.partition(self.root,dir)
            if read.read == sequence: 
                read.is_sequenced = True
                sequence.reads += [read]
            if len(self.branches) > 0 and context in self.branches: 
                leaf = Leaf(context,information,read)
                self.branches[context].grow(Leaf(context,information,read),dir)
            else: 
                if context.stalk == '^': self.leaves[context] = [Leaf(context,information,read)]
                else: self.leaves[context] = Leaf(context,information,read)

In [30]:
class RootNetwork:
    def __init__(self,k):
        self.roots = {}
        self.reads = []
        self.k = k

    def __getitem__(self,key):
        return self.roots[key]

    def __contains__(self, key):
        return key in self.roots

    # dir = 1, context gain towards prefix
    # dir = 0, context gain towards suffix
    def build(self,sequence,dir=0): return self.get_root(sequence,dir).grow(sequence,dir)

    def plant_trie(self,trie): self.roots[trie.root] = trie

    def get_root(self,sequence,dir):
        if dir: return self[sequence[-self.k:]]
        else: return self[sequence[:self.k]]

In [45]:
def initialise(reads):
    k = 3
    r = RootNetwork(k)
    R = {}
    kmers = {}
    for read in reads:
        R[read] = Read(read)
        for i in range(len(read)-k+1):
            if read[i:i+k] in kmers: kmers[read[i:i+k]] += 1
            else: kmers[read[i:i+k]] = 1
            if read[i:i+k] not in r: r.plant_trie(Root(read[i:i+k],r))
            r[read[i:i+k]].add_read(R[read])
    return R,r,kmers

In [46]:
# you say hello world, i bellow go to hell
successes = 0
reads = ['you say hel',' say hello wo','lo world, i be','ld, i bellow go t','ow go to hell']
print('Sequence: you say hello world, i bellow go to hell')
for read in reads:
    R,r,kmers = initialise(reads)
    sequence = r.build(Sequitur(R[read]))[1:-1]
    if sequence == 'you say hello world, i bellow go to hell':
        successes += 1
        print('Initus:',read,'[SUCCESS]')
    else: print('Initus:',read,'[FAILURE] | result:',sequence)
print('------------------------')
print('Accuracy:',successes/len(reads)*100,'%')
print('========================')

Sequence: you say hello world, i bellow go to hell
Initus: you say hel [SUCCESS]
Initus:  say hello wo [SUCCESS]
Initus: lo world, i be [SUCCESS]
Initus: ld, i bellow go t [SUCCESS]
Initus: ow go to hell [SUCCESS]
------------------------
Accuracy: 100.0 %


In [48]:
# she_sells_sea_shells_on_the_sea_shore
successes = 0
reads = ['she_sells_s',
               'lls_sea_shel',
                    'ea_shells_o',
                       'shells_on_the_s',
                                  'he_sea_s',
                                      'ea_shore']
print('Sequence: she_sells_sea_shells_on_the_sea_shore')
for read in reads:
    R,r,kmers = initialise(reads)
    sequence = r.build(Sequitur(R[read]))[1:-1]
    if sequence == 'she_sells_sea_shells_on_the_sea_shore':
        successes += 1
        print('Initus:',read,'[SUCCESS]')
    else: print('Initus:',read,'[FAILURE] | result:',sequence)
print('------------------------')
print('Accuracy:',successes/len(reads)*100,'%')
print('========================')

Sequence: she_sells_sea_shells_on_the_sea_shore
Initus: she_sells_s [SUCCESS]
Initus: lls_sea_shel [SUCCESS]
Initus: ea_shells_o [SUCCESS]
Initus: shells_on_the_s [SUCCESS]
Initus: he_sea_s [SUCCESS]
Initus: ea_shore [SUCCESS]
------------------------
Accuracy: 100.0 %


[ ] recursive approach: build in the selected direction until the other end or until you reach the same read (indicating a circular sequence or an incorrect build)

[ ] probabilistic approach: build from unique regions (areas of low kmer coverage) towards common of repeated regions 

build(*sequence*,*mode*=0,*dir*=None):
    if *mode*==0:
        if there is no *dir*: 
            find the longest overlap that has not already been added to *sequence* and store it as *candidate*
            add *candidate* to *sequence* and store the direction to *dir*
        else:
            find the longest overlap on the *dir* end that has not already been added to *sequence* and store it as *candidate*
            add *candidate* to *sequence*
        build(*sequence*,1,*dir*)
    else:
        find the longest overlap on the *dir* end that is has not already been added to *sequence* and store it as *candidate*
        if there is no *candidate*: build(*sequence*,0,*dir*)

In [49]:
successes = 0
# longest repeat: betty_bought_b (14)
# shortest overlap: tter_th (7)
reads = ['betty_bought_butter_th',
                        'tter_the_butter_was_',
                              'he_butter_was_bitter_',
                                         'as_bitter_betty_bought',
                                                     'tty_bought_better_butter_t',
                                                                     'r_butter_to_make_the_',
                                                                            'r_to_make_the_bitt',
                                                                                   'ke_the_bitter_butter_better']
print('Sequence: betty_bought_butter_the_butter_was_bitter_betty_bought_better_butter_to_make_the_bitter_butter_better')
for read in reads:
    R,r,kmers = initialise(reads)
    sequence = r.build(Sequitur(R[read]))[1:-1]
    if sequence == 'betty_bought_butter_the_butter_was_bitter_betty_bought_better_butter_to_make_the_bitter_butter_better':
        successes += 1
        print('Initus:',read,'[SUCCESS]')
    else: print('Initus:',read,'[FAILURE] | result:',sequence)
print('------------------------')
print('Accuracy:',successes/len(reads)*100,'%')
print('========================')

Sequence: betty_bought_butter_the_butter_was_bitter_betty_bought_better_butter_to_make_the_bitter_butter_better
Initus: betty_bought_butter_th [FAILURE] | result: tter_the_butter_was_bitter_betty_bought_butter_th
Initus: tter_the_butter_was_ [FAILURE] | result: tty_bought_better_butter_to_make_the_butter_was_bitter_betty_bought_butter_the_butter_was_
Initus: he_butter_was_bitter_ [FAILURE] | result: as_bitter_betty_bought_butter_the_butter_was_bitter_
Initus: as_bitter_betty_bought [FAILURE] | result: betty_bought_butter_the_butter_was_bitter_betty_bought_better_butter_to_make_the_
Initus: tty_bought_better_butter_t [FAILURE] | result: betty_bought_butter_the_butter_was_bitter_betty_bought_better_butter_to_make_the_
Initus: r_butter_to_make_the_ [FAILURE] | result: betty_bought_butter_the_butter_was_bitter_betty_bought_better_butter_to_make_the_
Initus: r_to_make_the_bitt [SUCCESS]
Initus: ke_the_bitter_butter_better [SUCCESS]
------------------------
Accuracy: 25.0 %


# DeBruijn Graph

In [42]:
from itertools import permutations
import random

# Generate all possible k-mers of length k
def kmers(k,seed=None):
    random.seed(seed)
    perms = list(permutations('ATCG', k))
    random.shuffle(perms)
    return [''.join(p) for p in perms]

# Generate a sequence of length n where each k-mer appears exactly once
def sequence(k, n,seed=None):
    seq = ""
    for kmer in kmers(k,seed):
        seq += kmer
    return seq[:n]

def generate_reads(seq, k, min_overlap, max_overlap,seed=None):
    random.seed(seed)
    reads = []
    for i in range(0, len(seq)-k+1):
        # Generate a random overlap within the specified range
        overlap = random.randint(min_overlap, max_overlap)
        start = i
        end = i + k + overlap
        reads.append(seq[start:end])
    return reads
    
class DeBruijnGraph:
    def __init__(self,reads,k):
        self.nodes = {}
        self.unexplored = []
        for read in reads:
            for i in range(len(read)-k+1):
                if read[i:i+k-1] in self.nodes: 
                    self.nodes[read[i:i+k-1]] += [(read[i:i+k-1],read[i+1:i+k])]
                else: 
                    self.nodes[read[i:i+k-1]] = [(read[i:i+k-1],read[i+1:i+k])]
                self.unexplored += [(read[i:i+k-1],read[i+1:i+k])]
    def __repr__(self): return self.nodes
    def traverse(self): pass

In [100]:
# reads = ['betty_bought_butter_th',
#                         'tter_the_butter_was_',
#                               'he_butter_was_bitter_',
#                                          'as_bitter_betty_bought',
#                                                      'tty_bought_better_butter_t',
#                                                                      'r_butter_to_make_the_',
#                                                                             'r_to_make_the_bitt',
#                                                                                    'ke_the_bitter_butter_better']
seed=1
# seq=sequence(3, 12, seed)
seq='she_sells_sea_shells_on_the_sea_shore'
# reads = generate_reads(seq, 3, 0, 0, seed)
reads = ['she_sells_s',
               'lls_sea_shel',
                    'ea_shells_o',
                       'shells_on_the_s',
                                  'he_sea_s',
                                      'ea_shore']
dbg = DeBruijnGraph(reads,8)

In [101]:
seq

'she_sells_sea_shells_on_the_sea_shore'

In [102]:
reads

['she_sells_s',
 'lls_sea_shel',
 'ea_shells_o',
 'shells_on_the_s',
 'he_sea_s',
 'ea_shore']

In [103]:
dbg.nodes

{'she_sel': [('she_sel', 'he_sell')],
 'he_sell': [('he_sell', 'e_sells')],
 'e_sells': [('e_sells', '_sells_')],
 '_sells_': [('_sells_', 'sells_s')],
 'lls_sea': [('lls_sea', 'ls_sea_')],
 'ls_sea_': [('ls_sea_', 's_sea_s')],
 's_sea_s': [('s_sea_s', '_sea_sh')],
 '_sea_sh': [('_sea_sh', 'sea_she')],
 'sea_she': [('sea_she', 'ea_shel')],
 'ea_shel': [('ea_shel', 'a_shell')],
 'a_shell': [('a_shell', '_shells')],
 '_shells': [('_shells', 'shells_')],
 'shells_': [('shells_', 'hells_o'), ('shells_', 'hells_o')],
 'hells_o': [('hells_o', 'ells_on')],
 'ells_on': [('ells_on', 'lls_on_')],
 'lls_on_': [('lls_on_', 'ls_on_t')],
 'ls_on_t': [('ls_on_t', 's_on_th')],
 's_on_th': [('s_on_th', '_on_the')],
 '_on_the': [('_on_the', 'on_the_')],
 'on_the_': [('on_the_', 'n_the_s')],
 'he_sea_': [('he_sea_', 'e_sea_s')],
 'ea_shor': [('ea_shor', 'a_shore')]}

In [104]:
dbg.unexplored

[('she_sel', 'he_sell'),
 ('he_sell', 'e_sells'),
 ('e_sells', '_sells_'),
 ('_sells_', 'sells_s'),
 ('lls_sea', 'ls_sea_'),
 ('ls_sea_', 's_sea_s'),
 ('s_sea_s', '_sea_sh'),
 ('_sea_sh', 'sea_she'),
 ('sea_she', 'ea_shel'),
 ('ea_shel', 'a_shell'),
 ('a_shell', '_shells'),
 ('_shells', 'shells_'),
 ('shells_', 'hells_o'),
 ('shells_', 'hells_o'),
 ('hells_o', 'ells_on'),
 ('ells_on', 'lls_on_'),
 ('lls_on_', 'ls_on_t'),
 ('ls_on_t', 's_on_th'),
 ('s_on_th', '_on_the'),
 ('_on_the', 'on_the_'),
 ('on_the_', 'n_the_s'),
 ('he_sea_', 'e_sea_s'),
 ('ea_shor', 'a_shore')]

In [105]:
import random
import copy

random.seed(seed)
nodes = copy.deepcopy(dbg.nodes)
unexplored = dbg.unexplored.copy()
node = random.choice(unexplored)
unexplored.remove(node)
nodes[node[0]].remove(node)
cycles = {}
cycle = [node]
while len(unexplored):
    print(cycle)
    print(unexplored)
    print()
    if node[1] in nodes and len(nodes[node[1]]):
        node = nodes[node[1]].pop(random.randint(0,len(nodes[node[1]])-1))
        unexplored.remove(node)
        cycle += [node]
    else:
        print(cycle)
        if cycle[0][0] in cycles: cycles[cycle[0][0]] += cycle
        else: cycles[cycle[0][0]] = cycle
        node = random.choice(unexplored)
        unexplored.remove(node)
        nodes[node[0]].remove(node)
        cycle = [node]
if cycle[0][0] in cycles: cycles[cycle[0][0]] += cycle
else: cycles[cycle[0][0]] = cycle
# if cycle[-1][1] in cycles:
#     cycles[cycle[0][0]] += cycles[cycle[-1][1]]
#     if len(cycles[cycle[-1][1]]) == 0:cycles.pop(cycle[-1][1])

[('lls_sea', 'ls_sea_')]
[('she_sel', 'he_sell'), ('he_sell', 'e_sells'), ('e_sells', '_sells_'), ('_sells_', 'sells_s'), ('ls_sea_', 's_sea_s'), ('s_sea_s', '_sea_sh'), ('_sea_sh', 'sea_she'), ('sea_she', 'ea_shel'), ('ea_shel', 'a_shell'), ('a_shell', '_shells'), ('_shells', 'shells_'), ('shells_', 'hells_o'), ('shells_', 'hells_o'), ('hells_o', 'ells_on'), ('ells_on', 'lls_on_'), ('lls_on_', 'ls_on_t'), ('ls_on_t', 's_on_th'), ('s_on_th', '_on_the'), ('_on_the', 'on_the_'), ('on_the_', 'n_the_s'), ('he_sea_', 'e_sea_s'), ('ea_shor', 'a_shore')]

[('lls_sea', 'ls_sea_'), ('ls_sea_', 's_sea_s')]
[('she_sel', 'he_sell'), ('he_sell', 'e_sells'), ('e_sells', '_sells_'), ('_sells_', 'sells_s'), ('s_sea_s', '_sea_sh'), ('_sea_sh', 'sea_she'), ('sea_she', 'ea_shel'), ('ea_shel', 'a_shell'), ('a_shell', '_shells'), ('_shells', 'shells_'), ('shells_', 'hells_o'), ('shells_', 'hells_o'), ('hells_o', 'ells_on'), ('ells_on', 'lls_on_'), ('lls_on_', 'ls_on_t'), ('ls_on_t', 's_on_th'), ('s_on_th',

In [106]:
cycles

{'lls_sea': [('lls_sea', 'ls_sea_'),
  ('ls_sea_', 's_sea_s'),
  ('s_sea_s', '_sea_sh'),
  ('_sea_sh', 'sea_she'),
  ('sea_she', 'ea_shel'),
  ('ea_shel', 'a_shell'),
  ('a_shell', '_shells'),
  ('_shells', 'shells_'),
  ('shells_', 'hells_o'),
  ('hells_o', 'ells_on'),
  ('ells_on', 'lls_on_'),
  ('lls_on_', 'ls_on_t'),
  ('ls_on_t', 's_on_th'),
  ('s_on_th', '_on_the'),
  ('_on_the', 'on_the_'),
  ('on_the_', 'n_the_s')],
 'e_sells': [('e_sells', '_sells_'), ('_sells_', 'sells_s')],
 'ea_shor': [('ea_shor', 'a_shore')],
 'she_sel': [('she_sel', 'he_sell'), ('he_sell', 'e_sells')],
 'shells_': [('shells_', 'hells_o')],
 'he_sea_': [('he_sea_', 'e_sea_s')]}

In [87]:
for a in cycles:
    s = cycles[a][0][0][0]
    for b in cycles[a]:
        s += b[0][-1]
    s += b[1][-1]
    print(s)

ls_sells_on_the_sea_shells_s
esea_shells_o
h_s
sore
se_el
_ho
e_s_s


In [121]:
import suffix_tree #import Tree

In [136]:
reads = ['betty_bought_butter_th',
                        'tter_the_butter_was_',
                              'he_butter_was_bitter_',
                                         'as_bitter_betty_bought',
                                                     'tty_bought_better_butter_t',
                                                                     'r_butter_to_make_the_',
                                                                            'r_to_make_the_bitt',
                                                                                   'ke_the_bitter_butter_better']
tree = Tree()
i = 0
for read in reads:
    tree.add(i, read)
    i+=1

In [163]:
for id_, path in tree.find_all("_was_"):
    print(id_, ":", str(path))

1 : _ w a s _ $
2 : _ w a s _ b i t t e r _ $


In [137]:
for k, lk, path in tree.common_substrings():
    print(k, lk, path)

2 14 h e _ b u t t e r _ w a s _
3 10 t t y _ b o u g h t
4 8 _ b u t t e r _
5 8 _ b u t t e r _
6 8 _ b u t t e r _
7 5 t t e r _
8 2 t t


In [138]:
for C, path in sorted(tree.maximal_repeats()):
    print(C, path)

2 _ b e t t e r
2 _ b i t t e r _ b
2 a s _ b i t t e r _
2 b e t t y _ b o u g h t
2 h e _ b u t t e r _ w a s _
2 k e _ t h e _ b i t t
2 r _ b u t t e r _ t
2 r _ t o _ m a k e _ t h e _
2 t t e r _ b e t t
2 t t e r _ b u t t e r _
2 t t e r _ t h
2 t t y _ b o u g h t _ b
3 _ b e t t
3 _ b i t t e r _
3 _ b u t t e r _ t
3 _ t h e _ b
3 a s _
3 k e _ t h e _
3 r _ b u t t e r _
3 t t e r _ b
3 t t y _ b o u g h t
4 _ b i t t
4 _ t h e _
4 b e t t
4 h e _ b
4 r _ b
4 t t e r _ t
5 _ t h
5 a
5 e _
5 h e _
5 o
5 r _ t
6 _ b u t t e r _
6 _ t
7 t t e r
7 t t e r _
7 u
8 _
8 _ b
8 b
8 e
8 h
8 r
8 r _
8 t
8 t t


In [154]:
from Bio import SeqIO

In [157]:
for seq_record in SeqIO.parse("data/mutant_R1.fastq", "fastq"):
   print(seq_record.id)
   print(repr(seq_record.seq))
   print(len(seq_record))
   break

mutant-no_snps.gff-24960/1
Seq('AATGTTGTCACTTGGATTCAAATGACATTTTAAATCTAATTATTCATGAATCGA...TTT')
150
