*to handle errors and diploidy*:
    
    1. find every unsequenced read
    2. see if the unsequenced reads exist in the constructed sequence
    3. construct sequences with unseen reads until they combine with a sequenced read in both directions
    4. sort k-mers by how often they occur across all reads: more frequent k-mers are more likely to reflect polyploidy, and less frequent k-mers are more likely to be errors

In [10]:
import random

In [1]:
class Sequitur:
    '''
    A growing assembled sequence together with the reads merged into it.

    The sequence starts as the text of one seed read. Concatenating a string
    on either side extends the sequence in place and returns this same
    object, so the list of contributing reads stays attached as it grows.
    '''

    def __init__(self, read):
        # `read` must expose its text as `read.read`.
        self.reads = [read]
        self.sequence = read.read

    def __repr__(self):
        return self.sequence

    def __getitem__(self, key):
        # Indexing and slicing are delegated to the underlying string.
        return self.sequence[key]

    def __eq__(self, other):
        # Compared by sequence text, so a plain string can be on the other side.
        return self.sequence == other

    def __add__(self, other):
        # `seq + text`: append in place and hand back this same object.
        self.sequence = self.sequence + other
        return self

    def __radd__(self, other):
        # `text + seq`: prepend in place and hand back this same object.
        self.sequence = other + self.sequence
        return self

    def partition(self, root):
        return self.sequence.partition(root)

    def rpartition(self, root):
        return self.sequence.rpartition(root)

In [2]:
class Read:
    '''
    One input read plus a flag recording whether it has already been merged
    into a sequence. The flag is what lets a read be used at most once.
    '''

    def __init__(self, read):
        self.is_sequenced = False
        self.read = read

    def partition(self, root, dir):
        '''
        Split the read around `root` into a context Node and an information string.

        With a truthy `dir` the split is at the last occurrence of `root`:
        the text before it becomes the context (Node reverses it) and the text
        after it is the information. Otherwise the split is at the first
        occurrence: the text after it is the context and the text before it
        is the information.

        Returns a (Node, str) pair.
        '''
        if dir:
            before, _, after = self.read.rpartition(root)
            return Node(before, dir), after
        before, _, after = self.read.partition(root)
        return Node(after, dir), before

In [3]:
class Leaf:
    '''
    Terminal trie entry for one read: the context (a Node) that leads to it,
    the information it contributes to the sequence, and the read itself.
    Empty information is stored as the placeholder '$'.
    '''

    def __init__(self, context, information, read):
        self.read = read
        self.context = context
        self.information = information if len(information) > 0 else '$'

    def __repr__(self):
        return str(self.information)

    def branch(self, context):
        '''
        Compare this leaf's context with `context`, character by character.

        Returns a pair: the common leading characters of the two stalks, and
        the rest of this leaf's stalk after that common part.
        '''
        limit = min(len(context.stalk), len(self.context.stalk))
        shared = []
        for pos in range(limit):
            symbol = context[pos]
            if symbol != self.context[pos]:
                break
            shared.append(symbol)
        prefix = "".join(shared)
        return prefix, self.context.stalk[len(prefix):]

In [4]:
class Node:
    '''
    Wrapper around a stalk string, used as the dictionary key for branches
    and leaves.

    An empty stalk is stored as the placeholder '^'. Two nodes compare equal
    when they agree on every position up to the length of the shorter stalk,
    and the hash depends only on the first character, so a dictionary lookup
    can match a stored key that is a prefix of the probe (or the reverse).
    '''

    def __init__(self, stalk, dir=0):
        text = stalk if len(stalk) > 0 else "^"
        # A truthy `dir` stores the stalk back to front and records that fact.
        self.reversed = bool(dir)
        self.stalk = text[::-1] if dir else text

    def __eq__(self, other):
        # zip stops at the shorter stalk, which gives the prefix comparison.
        return all(mine == theirs for mine, theirs in zip(self.stalk, other.stalk))

    def __hash__(self):
        return hash(self.stalk[0])

    def __getitem__(self, index):
        return self.stalk[index]

    def __repr__(self):
        return self.stalk

    def __len__(self):
        # The '^' placeholder stands for an empty stalk.
        return 0 if self.stalk == '^' else len(self.stalk)

In [5]:
'''
A branch has either a collection of branches or a collection of leaves
Every branch must have at least 1 leaf
'''
class Branch:
    '''
    Trie node holding child branches and leaves, both in dicts keyed by Node.

    Node keys compare by prefix and hash on their first character, so the
    `in` tests and lookups below are prefix matches, not exact matches.
    In `leaves`, the '^' key maps to a list of Leaf objects; other keys are
    stored as a single Leaf when the leaf is inserted directly.
    '''
    def __init__(self,root,network,node=None,foothold=None):
        self.branches = {}        # Node -> Branch
        self.leaves = {}          # Node -> Leaf, or list of Leaf under the '^' key
        self.root = root          # root string, passed on unchanged to child branches
        self.network = network    # object providing get_root(sequence, dir)
        self.node = node          # Node this branch is filed under in its creator
        self.foothold = foothold  # Branch that created this one in grow(); None otherwise

    def grow(self,leaf,dir):
        '''
        Insert `leaf` under this branch.

        `leaf.context` is modified in place: the part covered by this
        branch's node is removed first. The leaf is then either handed to a
        matching child branch, appended to the '^' list, or placed in a newly
        created child branch together with existing leaves that share a
        prefix with it. Returns None.
        '''
        # Drop the characters already accounted for by this branch's node.
        leaf.context.stalk = leaf.context[len(self.node):]
        if leaf.context.stalk == '': leaf.context.stalk = '^'
        if leaf.context in self.branches: self.branches[leaf.context].grow(leaf,dir)
        else:
            # Starts as the '^' placeholder; its stalk is replaced below if a shared prefix is found.
            node = Node('',dir)
            if leaf.context.stalk == '^':
                # Nothing left of the context: collect the leaf in the '^' list.
                if node in self.leaves: self.leaves[node] += [leaf]
                else: self.leaves[node] = [leaf]
            else:
                stalk = ''
                new_leaves = []
                # Iterate over a copy of the keys because entries are popped inside the loop.
                for l in list(self.leaves):
                    s = leaf.branch(l)[0]  # common prefix of the new leaf's context and key l
                    if len(s) == 0: continue
                    # Only a key whose shared prefix is shorter than the current one is moved;
                    # other keys stay in self.leaves.
                    elif len(stalk) == 0 or len(s) < len(stalk): 
                        stalk = s
                        new_leaves += [(l,self.leaves.pop(l))]
                if len(stalk) > 0: node.stalk = stalk
                # The new child remembers this branch as its foothold.
                self.branches[node] = Branch(self.root,self.network,node,self)
                for l in new_leaves:
                    # Re-key each moved entry by what remains after the child's node.
                    l[0].stalk = l[0][len(node):]
                    if l[0].stalk == '':
                        l[0].stalk = '^'
                        if l[0] in self.branches[node].leaves: self.branches[node].leaves[l[0]] += [l[1]]
                        else: self.branches[node].leaves[l[0]] = [l[1]]
                    else: self.branches[node].leaves[l[0]] = l[1]
                # Same trimming for the leaf being inserted.
                leaf.context.stalk = leaf.context.stalk[len(node):]
                if leaf.context.stalk == '':
                    leaf.context.stalk = '^'
                    if leaf.context in self.branches[node].leaves: self.branches[node].leaves[leaf.context] += [leaf]
                    else: self.branches[node].leaves[leaf.context] = [leaf]
                else: self.branches[node].leaves[leaf.context] = leaf

    def climb(self,sequence,dir,stalk='',join_at=None):
        '''
        Try to extend `sequence` by one read found under this branch.

        `stalk` accumulates the root plus the node stalks of the branches
        visited so far. `join_at` is only forwarded to the recursive call.
        With a truthy `dir` information is appended to the sequence,
        otherwise it is prepended.

        Returns the extended sequence. If no usable leaf is found here and
        there is no foothold, the sequence is returned with '$' appended
        (truthy `dir`) or '^' prepended; with a foothold the result of
        descend() is returned.
        '''
        # NOTE(review): exact type comparison; an instance of a Root subclass would take the else path.
        if type(self) == Root:
            stalk = self.root
            if dir: context,_,_ = sequence.rpartition(stalk)
            else: _,_,context = sequence.partition(stalk)
        else:
            stalk += self.node.stalk
            if dir: context,_,_ = sequence.rpartition(stalk)
            else: _,_,context = sequence.partition(stalk)
        # The text of the sequence beyond the matched stalk is the key to look up.
        context = Node(context,dir)
        if context in self.branches: return self.branches[context].climb(sequence,dir,stalk,join_at)
        else: 
            # A non-'^' leaf matching the context whose read is still unused: take it.
            if context in self.leaves and not context.stalk == '^' and not self.leaves[context].read.is_sequenced: 
                if dir: sequence += self.leaves[context].information
                else: sequence = self.leaves[context].information + sequence
                self.leaves[context].read.is_sequenced = True
                sequence.reads += [self.leaves[context].read]
                return sequence
            else:
                carat = Node('^',dir) 
                if carat in self.leaves:
                    # A single unused '^' leaf is taken directly.
                    if len(self.leaves[carat]) == 1 and not self.leaves[carat][0].read.is_sequenced:
                        if dir: sequence += self.leaves[carat][0].information
                        else: sequence = self.leaves[carat][0].information + sequence
                        self.leaves[carat][0].read.is_sequenced = True
                        sequence.reads += [self.leaves[carat][0].read]
                        return sequence
                    else:
                        # Otherwise test each candidate: grow from its read in the opposite
                        # direction and accept it if that call returns a truthy value.
                        # The last merged read is marked unused while the candidates are tried.
                        sequence.reads[-1].is_sequenced = False
                        for leaf in self.leaves[carat]:
                            if self.network.get_root(Sequitur(leaf.read),not dir).grow(Sequitur(leaf.read),not dir,sequence.reads[-1]):
                                if dir: sequence += leaf.information
                                else: sequence = leaf.information + sequence
                                leaf.read.is_sequenced = True
                                sequence.reads += [leaf.read]
                                return sequence
                            else:
                                leaf.read.is_sequenced = False
                        # No candidate accepted: restore the flag changed above.
                        sequence.reads[-1].is_sequenced = True
                if self.foothold:
                    return self.descend(sequence,dir,stalk[len(self.node.stalk):])
                else: 
                    # Nothing to add in this direction: mark the end of the sequence.
                    if dir: return sequence + '$'
                    else: return '^' + sequence

    def descend(self,sequence,dir,stalk):
        '''
        Fallback used by climb(): look for a usable leaf in the '^' list of
        the foothold branch, applying the same selection rules as climb().

        Returns the extended sequence, or the sequence with '$' appended
        (truthy `dir`) or '^' prepended when nothing could be added.
        NOTE(review): the `stalk` parameter is not used in this method.
        '''
        carat = Node('',dir)  # an empty stalk becomes the '^' placeholder
        if carat in self.foothold.leaves:
            if len(self.foothold.leaves[carat]) == 1 and not self.foothold.leaves[carat][0].read.is_sequenced:
                if dir: sequence += self.foothold.leaves[carat][0].information
                else: sequence = self.foothold.leaves[carat][0].information + sequence
                self.foothold.leaves[carat][0].read.is_sequenced = True
                sequence.reads += [self.foothold.leaves[carat][0].read]
                return sequence
            else:
                # Same candidate test as in climb(); here the flag on the last merged
                # read is not set back to True after the loop.
                sequence.reads[-1].is_sequenced = False
                for leaf in self.foothold.leaves[carat]:
                    if self.network.get_root(Sequitur(leaf.read),not dir).grow(Sequitur(leaf.read),not dir,sequence.reads[-1]):
                        if dir: sequence += leaf.information
                        else: sequence = leaf.information + sequence
                        leaf.read.is_sequenced = True
                        sequence.reads += [leaf.read]
                        return sequence
                    else:
                        leaf.read.is_sequenced = False
        if dir: return sequence + '$'
        else: return '^' + sequence



In [6]:
'''
A root is a branch with a connection to the network and a list of the reads it comprises.
It can have a collection of branches and leaves.
'''
class Root(Branch):
    '''
    Top-level trie for one root string, with a queue of reads still to be
    filed into it. Created without a node or foothold.
    '''
    def __init__(self,root,network):
        super().__init__(root,network)
        self.reads = []  # reads registered via add_read() and not yet popped by grow()/sprout()

    def __repr__(self):
        return self.root
        
    def add_read(self,read):
        '''Register `read` with this root and also append it to the network's read list.'''
        self.reads += [read]
        self.network.reads += [read]

    def grow(self,sequence,dir,join_at=None):
        '''
        File pending reads into the trie, then extend `sequence`.

        While reads are pending, each call handles one step (sprout() or one
        popped read) and then calls itself again. Once the queue is empty,
        climb() is asked to extend the sequence and the work continues at the
        root selected by the new end of the sequence.

        Returns True/False when `join_at` is given (whether that read is
        among the sequence's reads after one climb). Otherwise returns the
        sequence once it starts with '^' and ends with '$'.
        '''
        if len(self.reads) > 0: 
            if len(self.leaves) == 0: self.sprout(sequence,dir)
            else: 
                # Take the next pending read, skipping reads already merged.
                read = self.reads.pop()
                while read.is_sequenced and len(self.reads) > 0: read = self.reads.pop()
                if not read.is_sequenced:
                    context,information = read.partition(self.root,dir)
                    if context in self.leaves: 
                        # An existing leaf matches by prefix: create a branch on the common
                        # prefix and move both the old leaf and the new read under it.
                        # NOTE(review): if the matched entry is the '^' list, `.branch` is called on a list - confirm this cannot happen.
                        node = Node(self.leaves[context].branch(context)[0])
                        self.branches[node] = Branch(self.root,self.network,node)
                        # Re-key the existing leaf by what follows the common prefix.
                        self.leaves[context].context = Node(self.leaves[context].context.stalk.partition(node.stalk)[2])
                        if self.leaves[context].context.stalk == '^': 
                            if self.leaves[context].context in self.branches[node].leaves: self.branches[node].leaves[self.leaves[context].context] += [Leaf(self.leaves[context].context,self.leaves[context].information,self.leaves[context].read)]
                            else: self.branches[node].leaves[self.leaves[context].context] = [Leaf(self.leaves[context].context,self.leaves[context].information,self.leaves[context].read)]
                        else: self.branches[node].leaves[self.leaves[context].context] = Leaf(self.leaves[context].context,self.leaves[context].information,self.leaves[context].read)
                        self.leaves.pop(context)
                        # Same re-keying for the read being filed.
                        context = Node(context.stalk.partition(node.stalk)[2])
                        if context.stalk == '^': 
                            if context in self.branches[node].leaves: self.branches[node].leaves[context] += [Leaf(context,information,read)]
                            else: self.branches[node].leaves[context] = [Leaf(context,information,read)]
                        else: self.branches[node].leaves[context] = Leaf(context,information,read)
                    else: 
                        # No matching leaf: store directly ('^' contexts are kept in a list).
                        if context.stalk == '^': self.leaves[context] = [Leaf(context,information,read)]
                        else: self.leaves[context] = Leaf(context,information,read)
            # Re-enter until the pending reads are exhausted.
            return self.grow(sequence,dir,join_at)
        else: 
            sequence = self.climb(sequence,dir)
            # With join_at, report only whether that read was reached.
            if join_at and join_at in sequence.reads: return True
            elif join_at: return False
            # A leading '^' while dir is falsy: continue with dir=1 from the other end of the sequence.
            if sequence[0] == '^' and not dir: return self.network.get_root(sequence,1).grow(sequence,1)
            elif sequence[0] == '^' and sequence[-1] == '$': return sequence
            # Otherwise keep going in the same direction from the root at the sequence's end.
            return self.network.get_root(sequence,dir).grow(sequence,dir)

    def sprout(self,sequence,dir):
        '''
        File one pending read; called by grow() when this root has no leaves.

        If the read's text equals the sequence, it is marked as sequenced
        and added to the sequence's reads. Returns None.
        '''
        read = self.reads.pop()
        while read.is_sequenced and len(self.reads) > 0: read = self.reads.pop()
        if not read.is_sequenced:
            context,information = read.partition(self.root,dir)
            if read.read == sequence: 
                read.is_sequenced = True
                sequence.reads += [read]
            if len(self.branches) > 0 and context in self.branches: 
                # NOTE(review): `leaf` is not used; a second Leaf is built for the call below.
                leaf = Leaf(context,information,read)
                self.branches[context].grow(Leaf(context,information,read),dir)
            else: 
                if context.stalk == '^': self.leaves[context] = [Leaf(context,information,read)]
                else: self.leaves[context] = Leaf(context,information,read)

In [7]:
class RootNetwork:
    '''
    Registry of tries keyed by their root string, plus the root length `k`
    and a flat list of reads.
    '''

    def __init__(self, k):
        self.k = k
        self.reads = []
        self.roots = {}

    def __getitem__(self, key):
        return self.roots[key]

    def __contains__(self, key):
        return key in self.roots

    def build(self, sequence, dir=0):
        '''
        Start growing `sequence` from the root selected by get_root().

        dir = 1, context gain towards prefix
        dir = 0, context gain towards suffix
        '''
        start = self.get_root(sequence, dir)
        return start.grow(sequence, dir)

    def plant_trie(self, trie):
        # File the trie under its own root string.
        self.roots[trie.root] = trie

    def get_root(self, sequence, dir):
        '''
        Return the trie keyed by the last `k` characters of `sequence` when
        `dir` is truthy, otherwise by the first `k` characters.
        '''
        key = sequence[-self.k:] if dir else sequence[:self.k]
        return self[key]

In [20]:
def initialise(reads, k=3):
    '''
    Build a RootNetwork that indexes every read under each of its k-mers.

    reads: iterable of read strings.
    k:     length of the k-mers used as root keys. Defaults to 3, the value
           that used to be hard-coded here.

    Returns (R, r): R maps each read string to its Read object and r is the
    populated RootNetwork. A read shorter than k has no k-mers, so it is
    present in R but registered with no root.

    Raises ValueError if k is less than 1.
    '''
    if k < 1:
        raise ValueError('k must be at least 1')
    r = RootNetwork(k)
    R = {}
    for read in reads:
        R[read] = Read(read)
        for i in range(len(read) - k + 1):
            kmer = read[i:i + k]
            # Create the trie for this k-mer the first time it is seen.
            if kmer not in r:
                r.plant_trie(Root(kmer, r))
            r[kmer].add_read(R[read])
    return R, r

# Each case pairs the sequence to reconstruct with the overlapping reads cut from it.
cases = (
    ('she_sells_sea_shells_on_the_sea_shore',
     ['she_sells_s','lls_sea_shel','ea_shells_o','shells_on_the_s','he_sea_s','ea_shore']),
    ('you say hello world, i bellow go to hell',
     ['you say hel',' say hello wo','lo world, i be','ld, i bellow go t','ow go to hell']),
)
for expected, reads in cases:
    successes = 0
    print('Sequence: ' + expected)
    # Try every read as the starting point of the assembly.
    for read in reads:
        # A fresh index per attempt: building pops the pending reads and flags them as sequenced.
        R, r = initialise(reads)
        # Strip the '^' and '$' end markers from the assembled sequence.
        sequence = r.build(Sequitur(R[read]))[1:-1]
        if sequence != expected:
            print('Initus:', read, '[FAILURE] | result:', sequence)
            continue
        successes += 1
        print('Initus:', read, '[SUCCESS]')
    print('------------------------')
    print('Accuracy:', successes/len(reads)*100, '%')
    print('========================')

Sequence: she_sells_sea_shells_on_the_sea_shore
Initus: she_sells_s [SUCCESS]
Initus: lls_sea_shel [SUCCESS]
Initus: ea_shells_o [SUCCESS]
Initus: shells_on_the_s [SUCCESS]
Initus: he_sea_s [SUCCESS]
Initus: ea_shore [SUCCESS]
------------------------
Accuracy: 100.0 %
Sequence: you say hello world, i bellow go to hell
Initus: you say hel [SUCCESS]
Initus:  say hello wo [SUCCESS]
Initus: lo world, i be [SUCCESS]
Initus: ld, i bellow go t [SUCCESS]
Initus: ow go to hell [SUCCESS]
------------------------
Accuracy: 100.0 %


In [None]:
successes = 0
reads = ['betty_bought_butter_th','tter_the_butter_was_','he_butter_was_bitter_','as_bitter_betty_bought','tty_bought_better_butter_t','r_butter_to_make_the_','r_to_make_the_bitt','ke_the_bitter_butter_better']
# The target is spelled out once so the printed header and the comparison cannot drift apart.
# It previously began with 'betty_bough_butter', which the reads ('betty_bought_butter_th')
# can never produce, so every attempt would have been reported as a failure.
expected = 'betty_bought_butter_the_butter_was_bitter_betty_bought_better_butter_to_make_the_bitter_butter_better'
print('Sequence: ' + expected)
# Try every read as the starting point of the assembly.
for read in reads:
    # A fresh index per attempt: building pops the pending reads and flags them as sequenced.
    R,r = initialise(reads)
    # Strip the '^' and '$' end markers from the assembled sequence.
    sequence = r.build(Sequitur(R[read]))[1:-1]
    if sequence == expected:
        successes += 1
        print('Initus:',read,'[SUCCESS]')
    else: print('Initus:',read,'[FAILURE] | result:',sequence)
print('------------------------')
print('Accuracy:',successes/len(reads)*100,'%')
print('========================')