* run-time collision resolution

In [None]:
import unittest

In [1]:
def generate_sequence(n):
  """Return a random string of ``n`` letters drawn from 'a', 'g', 'c', 't'.

  One ``random.randint(0,3)`` draw is made per letter, so seeding the
  ``random`` module beforehand makes the result repeatable.
  A non-positive ``n`` yields the empty string.
  """
  import random

  alphabet = ['a','g','c','t']
  # Each draw picks an index into the four-letter alphabet.
  return ''.join(alphabet[random.randint(0,3)] for _ in range(n))

def chop_sequence(sequence,min_len=3,max_len=15,min_overlap=3):
  """Cut ``sequence`` into a list of randomly sized, overlapping reads.

  The first read starts at offset 0. Every later read starts at least one
  position after the previous read's start, and (when the previous read is
  long enough) no later than ``min_overlap`` positions before the previous
  read's end. Reads are emitted in order of increasing start offset, and
  the loop stops once a read reaches the end of ``sequence``.

  Args:
    sequence: the string (or other sliceable sequence) to chop.
    min_len: minimum read length; the final read may be shorter because
      slicing stops at the end of ``sequence``.
    max_len: upper bound for the random length draw (the effective length
      is ``max(draw, min_len)``).
    min_overlap: requested number of positions shared with the previous
      read.

  Returns:
    list of slices of ``sequence``. An empty ``sequence`` yields ``['']``.

  Note:
    If the previous read is not longer than ``min_overlap`` the requested
    overlap cannot be honoured; the next read then starts exactly one
    position after the previous start instead of raising ``ValueError``
    from an empty ``randint`` range.
  """
  import random

  start = 0
  end = max(random.randint(start,max_len),min_len)
  chop = [sequence[start:end]]
  while end < len(sequence):
    # Clamp the upper bound so randint never receives an empty range when
    # the previous read is too short to provide min_overlap characters.
    latest_start = max(start + 1,end - min_overlap)
    start = random.randint(start + 1,latest_start)
    end = max(random.randint(start,start + max_len),start + min_len)
    chop.append(sequence[start:end])
  return chop

In [4]:
# Reference text to reconstruct; the trailing comment is the alternative
# source, a random 200-letter string over a/g/c/t.
ref = "Fresh out of college, Barry the Bee (Jerry Seinfeld) finds the prospect of working with honey uninspiring. He flies outside the hive for the first time and talks to a human (Renée Zellweger), breaking a cardinal rule of his species. Barry learns that humans have been stealing and eating honey for centuries, and he realizes that his true calling is to obtain justice for his kind by suing humanity for theft." #generate_sequence(200)
# Chop into overlapping reads: min_len=9, max_len=18, min_overlap=8.
reads = chop_sequence(ref,9,18,8)
import random
print(ref)
print(reads)
# Shuffling is disabled, so the reads stay in order of increasing start offset.
# random.shuffle(reads)

Fresh out of college, Barry the Bee (Jerry Seinfeld) finds the prospect of working with honey uninspiring. He flies outside the hive for the first time and talks to a human (Renée Zellweger), breaking a cardinal rule of his species. Barry learns that humans have been stealing and eating honey for centuries, and he realizes that his true calling is to obtain justice for his kind by suing humanity for theft.
['Fresh out', 'resh out of coll', 'sh out of colle', ' out of co', 'out of colleg', ' of college,', 'college, Barry the', 'llege, Ba', 'lege, Barr', 'ege, Barry the Bee', 'arry the B', 'rry the Bee (', 'the Bee (Jerr', 'ee (Jerry', 'e (Jerry Sei', ' (Jerry S', '(Jerry Se', 'Jerry Sei', 'erry Sein', 'rry Seinfeld) f', 'einfeld) ', 'infeld) finds ', 'nfeld) finds t', 'd) finds t', ' finds the prospe', 'nds the prospec', 's the prospe', 'the prosp', 'he prospe', 'e prospect of work', 'ect of wo', 'ct of wor', 't of work', ' of worki', 'of workin', 'f working', ' working ', 'working with

In [3]:
# Shorter reference sentence for a quick experiment.
# NOTE(review): this rebinds the same `ref` / `reads` globals as the cell above;
# later cells see whichever of the two ran last.
ref = "The sly brown fox jumps swiftly over the lazy dog"
# min_len=6, max_len=11; min_overlap keeps its default of 3.
reads = chop_sequence(ref,6,11)
import random
# Shuffle in place so the reads no longer arrive in reference order.
# No seed is set, so the order differs from run to run.
random.shuffle(reads)
print(ref)
print(reads)

The sly brown fox jumps swiftly over the lazy dog
['rown fox ', 'er the', 'iftly over', 'tly ov', ' over ', 'the lazy d', 'fox ju', 'azy dog', 'The sl', ' fox j', 'mps swiftly', 'own fo', 'x jump', 'he sly brow', ' jumps', 'e lazy', 'brown ']


In [5]:
class Read:
  """A single read: a string plus helpers for comparing it with other reads.

  Most methods simply forward to the wrapped string (``self.read``). The
  remaining ones compare the text surrounding a shared substring (``root``)
  in two reads.
  """

  def __init__(self,read,k_min=3):
    self.read = read
    self.k_min = k_min  # stored only; no method of this class reads it

  # --- plain string delegation -------------------------------------------

  def __repr__(self):
    return self.read

  def __len__(self):
    return len(self.read)

  def __getitem__(self,key):
    return self.read[key]

  def __contains__(self,item):
    return item in self.read

  def find(self,key):
    return self.read.find(key)

  def partition(self,sep):
    return self.read.partition(sep)

  def startswith(self,key):
    return self.read.startswith(key)

  def endswith(self,key):
    return self.read.endswith(key)

  # --- overlap helpers ----------------------------------------------------

  def _split_at_root(self,other,root,index):
    """Locate ``root`` in ``other`` (searching from ``index``).

    Returns ``(position, [before, root, after])`` built exactly the way the
    comparison methods need it.
    """
    position = other.read.find(root,index)
    pieces = list(other.read[position:].partition(root))
    pieces[0] += other.read[:position]
    return position, pieces

  def get_all_partitions(self,root):
    """Yield ``([before, root, after], index)`` per occurrence of ``root``.

    Occurrences are the non-overlapping ones counted by ``str.count``,
    visited left to right; ``index`` is where the occurrence starts.
    """
    cursor = 0
    remaining = self.read.count(root)
    while remaining > 0:
      cursor = self.read.find(root,cursor)
      head, sep, tail = self.read[cursor:].partition(root)
      yield [head + self.read[:cursor], sep, tail], cursor
      cursor += len(root)
      remaining -= 1

  def connection_strength(self,other,root,index=0):
    """Measure how much context around ``root`` this read shares with ``other``.

    For every occurrence of ``root`` in this read, the text before and after
    it is trimmed to the length available on both sides and compared with
    the same window around ``root`` in ``other``. Occurrences with no
    context on either side are skipped.

    Returns a dict with:
      'total_strength'  -- prefix_strength + suffix_strength
      'prefix_strength' -- widest matching left context seen
      'suffix_strength' -- widest matching right context seen
      'p1'              -- this read's partition when the total last grew
                           (``None`` if nothing matched)
      'p2'              -- ``other``'s ``[before, root, after]`` partition
    """
    anchor, p2 = self._split_at_root(other,root,index)
    width = len(root)
    best_total = best_pre = best_suf = 0
    best_part = None
    for part, pos in self.get_all_partitions(root):
      pre = min(len(part[0]),len(p2[0]))
      suf = min(len(part[2]),len(p2[2]))
      if not (pre or suf):
        continue
      mine = self.read[pos-pre:pos+width+suf]
      theirs = other.read[anchor-pre:anchor+width+suf]
      if mine != theirs:
        continue
      best_pre = max(pre,best_pre)
      best_suf = max(suf,best_suf)
      if best_pre + best_suf > best_total:
        best_total = best_pre + best_suf
        best_part = part
    return {
      'total_strength': best_total,
      'prefix_strength': best_pre,
      'suffix_strength': best_suf,
      'p1': best_part,
      'p2': p2
    }

  def is_continuous_with(self,other,root,index=0):
    """Return True if some occurrence of ``root`` here lines up with ``other``.

    Uses the same windowed comparison as ``connection_strength``, except
    that an occurrence with no surrounding context on either side is still
    compared (root against root) rather than skipped.
    """
    anchor, theirs = self._split_at_root(other,root,index)
    width = len(root)
    for mine, pos in self.get_all_partitions(root):
      left = min(len(mine[0]),len(theirs[0]))
      right = min(len(mine[2]),len(theirs[2]))
      if self.read[pos-left:pos+width+right] == other.read[anchor-left:anchor+width+right]:
        return True
    return False

  def continues_to(self,other,root):
    """True if the text before ``root`` here ends with the text before ``root`` in ``other``."""
    own_head = self.read.partition(root)[0]
    other_head = other.read.partition(root)[0]
    return own_head.endswith(other_head)

  def continues_from(self,other,root):
    """True if the text after ``root`` here starts with the text after ``root`` in ``other``."""
    own_tail = self.read.partition(root)[2]
    other_tail = other.read.partition(root)[2]
    return own_tail.startswith(other_tail)

In [6]:
class Segment:
  """One node in a doubly linked chain of root substrings.

  Attributes:
    root: the substring this node stands for. Hashing, equality, ``len()``
      and indexing all delegate to it, so two segments with the same root
      compare equal regardless of their neighbours.
    read: the read object the caller associated with this root.
    prefix: the preceding Segment, or None.
    suffix: the following Segment, or None.

  Note that because ``__len__`` is defined, a Segment's truth value is that
  of ``len(root)``: a segment with an empty root is falsy.
  """

  def __init__(self,prefix,root,suffix,read):
    self.root = root
    self.read = read
    self.prefix = prefix
    self.suffix = suffix

  def __repr__(self):
    # Rendered as "prefix_root -> root -> suffix_root", omitting missing ends.
    before = self.prefix.root + ' -> ' if self.prefix else ''
    after = ' -> ' + self.suffix.root if self.suffix else ''
    return '{}{}{}'.format(before,self.root,after)

  def __hash__(self):
    return hash(self.root)

  def __eq__(self,other):
    # Objects without a `root` (None, plain strings, ...) are simply unequal.
    # Returning NotImplemented lets Python fall back to its default
    # comparison instead of raising AttributeError.
    try:
      return self.root == other.root
    except AttributeError:
      return NotImplemented

  def __len__(self):
    return len(self.root)

  def __getitem__(self,key):
    return self.root[key]

In [6]:
# v2
class Sequitur:
  """Links overlapping reads into a chain of ``Segment`` nodes.

  The constructor walks every read with a sliding window and fills
  ``self.transitions``, a dict mapping each root substring to its
  ``Segment``. Segments are connected through ``prefix`` / ``suffix`` so
  that ``construct`` can rebuild the text by following suffix links.

  Attributes:
    k_min: base window length (length of the shortest read).
    reads: the input strings wrapped as ``Read`` objects.
    transitions: root substring -> ``Segment``.
    ends: initialised to an empty dict; not used by the code in this class.
    seq: the text produced by the last ``construct`` call ('' before that).
  """

  def __init__(self,reads,k_min=3):
    """Build the transition table from ``reads``.

    Args:
      reads: strings to assemble, processed in the order given.
      k_min: window length used only when ``reads`` is empty.
    """
    # NOTE(review): the window length is derived from the shortest read, so
    # an explicit k_min only acts as the empty-input fallback -- confirm that
    # ignoring it otherwise is intended.
    self.k_min = min(map(len,reads),default=k_min)
    self.reads = list(map(Read,reads))
    self.transitions = {}
    self.ends = {}
    self.seq = ''
    for read in self.reads:
      cont = False  # True -> this read adds nothing new, skip it
      seg = None    # segment the next new window will be chained after
      i = 1         # start offset in `read` of the next window
      l = self.k_min  # offset just past the most recently placed root
      # find starting point for read
      if len(self.transitions) > 0:
        for r in set(map(len,self.transitions.keys())):
          # Slide an r-wide window from the start of the read while it keeps
          # hitting known roots whose owning read agrees with this one.
          a = 0
          while read[a:r+a] in self.transitions and self.transitions[read[a:r+a]].read.is_continuous_with(read,read[a:r+a]):
            a += 1
          if a > 0:
            a -= 1
            curr = self.transitions[read[a:r+a]]
            # Follow existing suffix links for as long as they agree with the read.
            while curr.suffix and curr.suffix.read.is_continuous_with(read,curr.suffix.root):
              curr = curr.suffix
            if curr.suffix and not read.endswith(curr.root):
              # The chain continues somewhere this read disagrees with:
              # compare how strongly each candidate connects to curr.
              # check for stronger suffix connection specifically
              # when would a strong prefix connection matter?
              new_conn = curr.read.connection_strength(read,curr.root)
              old_conn = curr.read.connection_strength(curr.suffix.read,curr.root)
              # connection_strength returns a dict keyed by name; integer
              # indexing would raise KeyError.
              if new_conn['total_strength'] >= old_conn['total_strength']:
                if new_conn['suffix_strength'] >= old_conn['suffix_strength']:
                  # This read wins: cut the old link and continue from curr.
                  seg = curr
                  curr.suffix.prefix = None
                  curr.suffix = None
                  l = read.find(curr.root) + len(curr.root)
                  i = read.find(curr.root) + 1
                break
              elif new_conn['total_strength'] < 2:
                continue
              else:
                cont = True
                break
            elif curr.suffix and read.endswith(curr.root):
              # The read ends inside the existing chain.
              cont = True
              break
            else:
              # Reached the open end of the chain: extend from here.
              seg = curr
              l = read.find(curr.root) + len(curr.root)
              i = read.find(curr.root) + 1
              break
      if cont:
        continue
      if not seg:
        # No anchor found: seed a new chain with the shortest unused prefix
        # of the read.
        # NOTE(review): if the whole read is already a key this loop cannot
        # make progress, because the slice stops growing -- confirm that
        # cannot happen.
        a = 0
        while read[:self.k_min+a] in self.transitions:
          a += 1
        seg = Segment(None,read[:self.k_min+a],None,read)
        self.transitions[read[:self.k_min+a]] = seg
      s = 0  # extra characters added to the window to make the key unused
      p = 0  # non-positive shift applied to the window start when backing off
      while l < len(read):
        key = read[i+p:i+p+self.k_min+s]
        if key in self.transitions:
          node = self.transitions[key]
          if node.read != read and node.read.is_continuous_with(read,key,i+p):
            # Known root from another read that agrees with this one: link
            # to it and fast-forward along its chain.
            node.prefix = seg
            seg.suffix = node
            curr = node
            while curr.suffix and curr.suffix.read.is_continuous_with(read,curr.suffix.root):
              curr = curr.suffix
            if curr.suffix and not read.endswith(curr.root):
              if curr.read.connection_strength(read,curr.root)['total_strength'] > curr.read.connection_strength(curr.suffix.read,curr.root)['total_strength']:
                seg = curr
                curr.suffix.prefix = None
                curr.suffix = None
                l = read.find(curr.root) + len(curr.root)
                i = read.find(curr.root) + 1
                continue
            elif curr.suffix and read.endswith(curr.root):
              break
            seg = curr
            l = read.find(curr.root) + len(curr.root)
            i = read.find(curr.root) + 1
            continue
          # Key is taken but unusable: widen the window if there is room...
          if i + abs(p) + s + self.k_min < len(read):
            s += 1
          else:
            # ...otherwise step the window start back by one, drop the
            # segment just placed and retry from its predecessor.
            # NOTE(review): seg.prefix is None for a chain head, which would
            # raise AttributeError here -- confirm that case is unreachable.
            p -= 1
            s += 1
            self.transitions.pop(seg.root)
            seg.prefix.suffix = None
            seg = seg.prefix
        else:
          # Unused key: create a segment for it and chain it after `seg`.
          node = Segment(seg,key,None,read)
          self.transitions[key] = node
          seg.suffix = node
          seg = node
          l = read.find(key) + len(key)
          i = read.find(key) + 1
          s = 0
          p = 0

  def __repr__(self):
    return str(self.transitions)

  def construct(self,start='Fre'):
    """Rebuild the text by following suffix links from ``start``.

    Args:
      start: key in ``self.transitions`` to begin from. Defaults to 'Fre',
        the opening of the notebook's example text.

    Returns:
      The assembled string (also stored in ``self.seq``).

    Raises:
      KeyError: if ``start`` is not a key of ``self.transitions``.
    """
    self.seq = start
    curr = self.transitions[self.seq].suffix
    while curr:
      # Append only the part of the root that follows the last k_min-1
      # characters already present in the assembled text.
      self.seq += curr.root.partition(self.seq[-(self.k_min-1):])[2]
      curr = curr.suffix
    return self.seq

In [7]:
# v2
class Sequitur:
  # NOTE(review): unfinished draft. It reuses the name of the Sequitur class
  # defined in the previous cell, and the loop in __init__ ends on an `if`
  # with no body, so this cell does not parse as written.
  def __init__(self,reads,k_min=3):
    # The window length is taken from the shortest read; the k_min argument
    # is not read.
    self.k_min = min(list(map(len,reads)))
    self.reads = list(map(Read,reads))
    self.transitions = {}
    self.ends = {}
    self.seq = ''
    for read in self.reads:
      # NOTE(review): `a` is never assigned in this method, and the branch
      # below still needs a body.
      if read[a:self.k_min] not in self.transitions:
  def __repr__(self):
    return str(self.transitions)

  def construct(self):
    # Start from the hard-coded 'Fre' entry and follow suffix links,
    # appending the part of each root that comes after the last k_min-1
    # characters already in self.seq.
    self.seq = 'Fre'
    curr = self.transitions[self.seq].suffix
    while curr:
      self.seq += curr.root.partition(self.seq[-(self.k_min-1):])[2]
      curr = curr.suffix
    return self.seq

In [7]:
# Build the assembler from whichever `reads` list was assigned most recently.
seq = Sequitur(reads)

In [8]:
# Display the root -> Segment table filled in by the constructor.
seq.transitions

{' do': y d ->  do -> dog,
 'dog':  do -> dog,
 'y b': y b ->  br,
 ' br': y b ->  br -> bro,
 'bro':  br -> bro -> row,
 'row': bro -> row -> own,
 'own': row -> own -> wn ,
 'ump': jum -> ump -> mps,
 'mps': ump -> mps -> ps ,
 'ps ': mps -> ps  -> s s,
 's s': ps  -> s s ->  sw,
 ' sw': s s ->  sw -> swi,
 'swi':  sw -> swi -> wif,
 'wif': swi -> wif -> ift,
 'ift': wif -> ift -> ftl,
 'ftl': ift -> ftl,
 'The': The -> he ,
 'he ': The -> he  -> e s,
 'e s': he  -> e s ->  sl,
 ' sl': e s ->  sl -> sly,
 'tly': tly -> ly ,
 'ly ': tly -> ly  -> y o,
 'y o': ly  -> y o ->  ov,
 ' ov': y o ->  ov -> ove,
 ' th': r t ->  th -> the,
 'the':  th -> the -> he l,
 'he l': the -> he l,
 'ove':  ov -> ove -> ver,
 'ver': ove -> ver -> er ,
 'er ': ver -> er  -> r t,
 'r t': er  -> r t ->  th,
 'e l': e l ->  la,
 ' la': e l ->  la -> laz,
 'laz':  la -> laz -> azy,
 'azy': laz -> azy -> zy ,
 'zy ': azy -> zy  -> y d,
 'y d': zy  -> y d ->  do,
 'wn ': own -> wn  -> n f,
 'jum':  ju -> jum -

In [8]:
# Rebuild the text by following suffix links, starting from the 'Fre' entry.
seq.construct()

'Fresh out of college, Barry the Bee (Jerry Seinfeld) finds the prospect of working with honey uninspiring. He flies outside the hive for the first time and talks to a human (Renée Zellweger), breaking a cardinal rule of his species. Barry learns that humans have been stealing and eating honey for centuries, and he realizes that his true calling is to obtain justice for his kind by suing humanity for theft.'

In [35]:
# Look up the read stored on the segment whose root is 'y th'.
seq.transitions['y th'].read

 Barry th