*to handle errors and diploidy*:
    
    1. find every unsequenced read
    2. see if the unsequenced reads exist in the constructed sequence
    3. construct sequences with unseen reads until they combine with a sequenced read in both directions
    4. sort kmers by frequency of kmer occurence in all reads, more frequent kmers are more likely polyploidy and less frequent kmers are more likely error

In [10]:
import random

In [38]:
class Sequitur:
    def __init__(self,initial,read=None):
        self.sequence = initial
        self.reads = []
        if read: self.reads = [read]

    def __repr__(self): return self.sequence

    def __getitem__(self,key): return self.sequence[key]

    def __eq__(self,other): return self.sequence == other

    def __add__(self,other): 
        self.sequence += other
        return self

    def __radd__(self,other): 
        self.sequence = other + self.sequence
        return self

    def partition(self,root): return self.sequence.rpartition(root)

    def rpartition(self,root): return self.sequence.rpartition(root)

In [39]:
''' 
The read is held by a leaf and sustains their uniqueness and ensures they're added to the sequence at most once.
'''
class Read:
    def __init__(self,read):
        self.read = read
        self.is_sequenced = False

    def partition(self,root,dir):
        if dir: return Node(self.read.rpartition(root)[0],dir), self.read.rpartition(root)[2]
        else: return Node(self.read.partition(root)[2],dir), self.read.partition(root)[0]

In [40]:
'''
A leaf is the end point of a branch and can only the gained information. 
Every Trie must have a leaf for every read containing the root
Leaves can become branches
'''
class Leaf:
    def __init__(self,context,information,read):
        self.context = context
        if len(information) == 0: self.information = '$'
        else: self.information = information
        self.read = read

    def __repr__(self): return str(self.information)
    
    def branch(self,context):
        stalk = ""
        i = 0
        while i < min(len(context.stalk),len(self.context.stalk)) and context[i] == self.context[i]: 
            stalk += context[i]
            i += 1
        return stalk,self.context.stalk[len(stalk):]

In [41]:
'''
The node is a convenience class for accessing the branch
'''
class Node:
    def __init__(self,stalk,dir=0):
        if len(stalk) > 0: self.stalk = stalk
        else: self.stalk = "^"
        self.reversed = False
        if dir: 
            self.stalk = ''.join(reversed(self.stalk))
            self.reversed = True

    def __eq__(self,other): return self.stalk[0] == other.stalk[0]

    def __hash__(self): return hash(self.stalk[0])

    def __getitem__(self,index): return self.stalk[index]

    def __repr__(self): return self.stalk

    def __len__(self):
        if self.stalk == '^': return 0
        else: return len(self.stalk)

In [49]:
'''
A branch has either a collection of branches or a collection of leaves
Every branch must have at least 1 leaf
'''
class Branch:
    def __init__(self,root,network,node=None,foothold=None):
        self.branches = {}
        self.leaves = {}
        self.root = root
        self.network = network
        self.node = node
        self.foothold = foothold

    def grow(self,leaf,dir):
        leaf.context.stalk = leaf.context[len(self.node):]
        node = Node('',dir)
        if leaf.context.stalk == '':
            leaf.context.stalk = '^'
            if node in self.leaves: self.leaves[node] += [leaf]
            else: self.leaves[node] = [leaf]
        else:
            stalk = ''
            new_leaves = []
            for l in list(self.leaves):
                s = leaf.branch(l)[0]
                if len(s) == 0: continue
                elif len(stalk) == 0 or len(s) < len(stalk): 
                    stalk = s
                    new_leaves += [(l,self.leaves.pop(l))]
            if len(stalk) > 0: node.stalk = stalk
            self.branches[node] = Branch(self.root,self.network,node,self)
            for l in new_leaves:
                l[0].stalk = l[0][len(node):]
                if l[0].stalk == '':
                    l[0].stalk = '^'
                    if l[0] in self.branches[node]: self.branches[node].leaves[l[0]] += l[1]
                    else: self.branches[node].leaves[l[0]] = [l[1]]
                else: self.branches[node].leaves[l[0]] = l[1]
            leaf.context.stalk = leaf.context.stalk[len(node):]
            if leaf.context.stalk == '':
                leaf.context.stalk = '^'
                if leaf.context in self.branches[node].leaves: self.branches[node].leaves[leaf.context] += [leaf]
                else: self.branches[node].leaves[leaf.context] = [leaf]
            else: self.branches[node].leaves[leaf.context] = leaf

    def climb(self,sequence,dir,stalk='',join_at=None):
        if type(self) == Root:
            stalk = self.root
            if dir: context,_,_ = sequence.rpartition(stalk)
            else: _,_,context = sequence.partition(stalk)
        else:
            stalk += self.node.stalk
            if dir: context,_,_ = sequence.rpartition(stalk)
            else: _,_,context = sequence.partition(stalk)
        context = Node(context,dir)
        if context in self.branches: return self.branches[context].climb(sequence,dir,stalk,join_at)
        else: 
            if context in self.leaves and not context.stalk == '^' and not self.leaves[context].read.is_sequenced: 
                if dir: sequence += self.leaves[context].information
                else: sequence = self.leaves[context].information + sequence
                self.leaves[context].read.is_sequenced = True
                sequence.reads += [self.leaves[context].read]
                return sequence
            else:
                carat = Node('^',dir) 
                if carat in self.leaves:
                    if len(self.leaves[carat]) == 1:
                        if dir: sequence += self.leaves[carat][0].information
                        else: sequence = self.leaves[carat][0].information + sequence
                        self.leaves[carat][0].read.is_sequenced = True
                        sequence.reads += [self.leaves[carat][0].read]
                        return sequence
                    else:
                        sequence.reads[-1].is_sequenced = False
                        for leaf in self.leaves[carat]:
                            if self.network.get_root(Sequitur(leaf.read.read),not dir).grow(Sequitur(leaf.read.read),not dir,sequence.reads[-1]):
                                if dir: sequence += leaf.information
                                else: sequence = leaf.information + sequence
                                leaf.read.is_sequenced = True
                                sequence.reads += [leaf.read]
                                return sequence
                            else:
                               # TODO: should i figure out a read combining thing so that no work is wasted?
                                leaf.read.is_sequenced = False
                        sequence.reads[-1].is_sequenced = True
                if self.foothold:
                    return self.descend(sequence,dir,stalk[len(self.node.stalk):])
                else: 
                    if dir: return sequence + '$'
                    else: return '^' + sequence

    def descend(self,sequence,dir,stalk):
        carat = Node('',dir)
        if carat in self.foothold.leaves:
            if len(self.foothold.leaves[carat]) == 1:
                if dir: sequence += self.foothold.leaves[carat][0].information
                else: sequence = self.foothold.leaves[carat][0].information + sequence
                self.foothold.leaves[carat][0].read.is_sequenced = True
                sequence.reads += [self.foothold.leaves[carat][0].read]
                return sequence
            else:
                sequence.reads[-1].is_sequenced = False
                for leaf in self.foothold.leaves[carat]:
                    if self.network.get_root(Sequitur(leaf.read.read),not dir).grow(Sequitur(leaf.read.read),not dir,sequence.reads[-1]):
                        if dir: sequence += leaf.information
                        else: sequence = leaf.information + sequence
                        leaf.read.is_sequenced = True
                        sequence.reads += [leaf.read]
                        return sequence
                    else:
                        # TODO: should i figure out a read combining thing so that no work is wasted?
                        leaf.read.is_sequenced = False
        if dir: return sequence + '$'
        else: return '^' + sequence



In [50]:
'''
A root is a branch with a connection to the network and a list of the reads it comprises of.
It can have a collection of branches and leaves.
'''
class Root(Branch):
    def __init__(self,root,network):
        super().__init__(root,network)
        self.reads = []

    def __repr__(self):
        return self.root
        
    def add_read(self,read):
        self.reads += [read]
        self.network.reads += [read]

    def grow(self,sequence,dir,join_at=None):
        if len(self.reads) > 0: 
            if len(self.leaves) == 0: self.sprout(sequence,dir)
            else: 
                read = self.reads.pop()
                while read.is_sequenced and len(self.reads) > 0: read = self.reads.pop()
                if not read.is_sequenced:
                    context,information = read.partition(self.root,dir)
                    if context in self.leaves: 
                        node = Node(self.leaves[context].branch(context)[0])
                        self.branches[node] = Branch(self.root,self.network,node)
                        self.leaves[context].context = Node(self.leaves[context].context.stalk.partition(node.stalk)[2])
                        if self.leaves[context].context.stalk == '^': 
                            if self.leaves[context].context in self.branches[node].leaves: self.branches[node].leaves[self.leaves[context].context] += [Leaf(self.leaves[context].context,self.leaves[context].information,self.leaves[context].read)]
                            else: self.branches[node].leaves[self.leaves[context].context] = [Leaf(self.leaves[context].context,self.leaves[context].information,self.leaves[context].read)]
                        else: self.branches[node].leaves[self.leaves[context].context] = Leaf(self.leaves[context].context,self.leaves[context].information,self.leaves[context].read)
                        self.leaves.pop(context)
                        context = Node(context.stalk.partition(node.stalk)[2])
                        if context.stalk == '^': 
                            if context in self.branches[node].leaves: self.branches[node].leaves[context] += [Leaf(context,information,read)]
                            else: self.branches[node].leaves[context] = [Leaf(context,information,read)]
                        else: self.branches[node].leaves[context] = Leaf(context,information,read)
                    else: self.leaves[context] = [Leaf(context,information,read)]
            return self.grow(sequence,dir,join_at)
        else: 
            print(sequence)
            sequence = self.climb(sequence,dir)
            if join_at and join_at in sequence.reads: return True
            elif join_at: return False
            if sequence[0] == '^' and not dir: return self.network.get_root(sequence,1).grow(sequence,1)
            elif sequence[0] == '^' and sequence[-1] == '$': return sequence
            return self.network.get_root(sequence,dir).grow(sequence,dir)

    # BUG: initial should start with and mark the initial read
    def sprout(self,sequence,dir):
        read = self.reads.pop()
        while read.is_sequenced and len(self.reads) > 0: read = self.reads.pop()
        if not read.is_sequenced:
            context,information = read.partition(self.root,dir)
            if read.read == sequence: 
                read.is_sequenced = True
                sequence.reads += [read]
            if len(self.branches) > 0 and context in self.branches: 
                leaf = Leaf(context,information,read)
                self.branches[context].grow(Leaf(context,information,read),dir)
            else: 
                if context.stalk == '^': self.leaves[context] = [Leaf(context,information,read)]
                else: self.leaves[context] = Leaf(context,information,read)

In [44]:
class RootNetwork:
    def __init__(self,k):
        self.roots = {}
        self.reads = []
        self.k = k

    def __getitem__(self,key):
        return self.roots[key]

    def __contains__(self, key):
        return key in self.roots

    # dir = 1, context gain towards prefix
    # dir = 0, context gain towards suffix
    def build(self,sequence,dir=0): return self.get_root(sequence,dir).grow(sequence,dir)

    def plant_trie(self,trie): self.roots[trie.root] = trie

    def get_root(self,sequence,dir):
        if dir: return self[sequence[-self.k:]]
        else: return self[sequence[:self.k]]

In [52]:
k = 3
r = RootNetwork(k)
reads = {}
for read in ['you say hel',
                ' say hello wo',
                        'lo world, i be',
                              'ld, i bellow go t',
                                        'ow go to hell']:
    reads[read] = Read(read)
    for i in range(len(read)-k+1):
        if read[i:i+k] not in r: r.plant_trie(Root(read[i:i+k],r))
        r[read[i:i+k]].add_read(reads[read])
r.build(Sequitur('ld, i bellow go t'))

ld, i bellow go t
lo world, i bellow go t
 say hello world, i bellow go t
you say hello world, i bellow go t
^you say hello world, i bellow go t
^you say hello world, i bellow go to hell


^you say hello world, i bellow go to hell$

In [53]:
k = 3
r = RootNetwork(k)
reads = {}
for read in ['you say hel',
                ' say hello wo',
                        'lo world, i be',#'ld, i bellow go t',
                              'ld, i bellow go t',
                                        'ow go to hell']:
    reads[read] = Read(read)
    for i in range(len(read)-k+1):
        if read[i:i+k] not in r: r.plant_trie(Root(read[i:i+k],r))
        r[read[i:i+k]].add_read(reads[read])
r.build(Sequitur(' say hello wo'))

 say hello wo
you say hello wo
^you say hello wo
^you say hello world, i be
^you say hello world, i bellow go t
^you say hello world, i bellow go to hell


^you say hello world, i bellow go to hell$

In [51]:
k = 3
r = RootNetwork(k)
reads = {}
for read in ['she_sells_s',
                   'lls_sea_shel',
                        'ea_shells_o',
                           'shells_on_the_s',
                                      'he_sea_s',
                                          'ea_shore']:
    reads[read] = Read(read)
    for i in range(len(read)-k+1):
        if read[i:i+k] not in r: r.plant_trie(Root(read[i:i+k],r))
        r[read[i:i+k]].add_read(reads[read])
r.build(Sequitur('shells_on_the_s'))

shells_on_the_s
ea_shells_o
ea_shells_on_the_s
lls_sea_shells_on_the_s
she_sells_sea_shells_on_the_s
ea_shells_o
lls_sea_shel
^she_sells_sea_shells_on_the_s
^she_sells_sea_shells_on_the_sea_s
^she_sells_sea_shells_on_the_sea_shore


^she_sells_sea_shells_on_the_sea_shore$

In [11]:
k = 3
r = RootNetwork(k)
reads = {}
for read in ['she_sells_s', # works
                   'lls_sea_shel', # works
                        'ea_shells_o', # works
                           'shells_on_the_s', # works
                                      'he_sea_s', # incorrect output BUG: backtracking when the extended context diverges and no longer matches
                                          'ea_shore']: # incorrect output
    reads[read] = Read(read)
    for i in range(len(read)-k+1):
        if read[i:i+k] not in r: r.plant_trie(Root(read[i:i+k],r))
        r[read[i:i+k]].add_read(reads[read])
r.build(Sequitur('he_sea_s'))

he_sea_s
^he_sea_s


'^he_sea_s$'

In [29]:
k = 3
r = RootNetwork(k)
reads = {}
initial_read = 'tty_bought_better_butter_t'
for read in ['betty_bought_butter_th',
                            'tter_the_butter_was_',
                                   'he_butter_was_bitter_',
                                              'as_bitter_so_betty_bought',
                                                             'tty_bought_better_butter_t',
                                                                             'r_butter_to_make_the_',
                                                                                    'r_to_make_the_bitt',
                                                                                           'ke_the_bitter_butter_better']:
    reads[read] = Read(read)
    for i in range(len(read)-k+1):
        if read[i:i+k] not in r: r.plant_trie(Root(read[i:i+k],r))
        r[read[i:i+k]].add_read(reads[read])
r.build(initial_read)

tty_bought_better_butter_t


AttributeError: 'list' object has no attribute 'read'