In [8]:
def skew(genome):
    """Return the running G-minus-C skew for every prefix of ``genome``.

    The walk starts at 0, goes up by one on each 'G', down by one on each
    'C', and stays level on any other character.

    Returns:
        A list of ``len(genome) + 1`` integers; entry ``i`` is the skew of
        the first ``i`` characters, so the list always begins with 0.
    """
    running = 0
    prefix_skews = [running]
    for base in genome:
        # Comparison results count as 1/0, so this is +1 for G, -1 for C and
        # 0 for everything else.
        running += (base == 'G') - (base == 'C')
        prefix_skews.append(running)
    return prefix_skews

In [11]:
# Sample check for skew(): the expected list has one more entry than the
# genome has characters because it starts with the initial skew of 0.
sample_in = 'CATGGGCATCGGCCATACGCC'
sample_out = [0, -1, -1, -1, 0, 1, 2, 1, 1, 1, 0, 1, 2, 1, 0, 0, 0, 0, -1, 0, -1, -2]
assert skew(sample_in) == sample_out

---

In [18]:
def skew_min(genome):
    """Return every index at which the prefix skew of ``genome`` is lowest.

    Indices refer to the list produced by ``skew(genome)``, i.e. index ``i``
    is the position reached after the first ``i`` characters. The result is
    in ascending order.
    """
    prefix_skews = skew(genome)
    lowest = min(prefix_skews)
    positions = []
    for index, value in enumerate(prefix_skews):
        if value == lowest:
            positions.append(index)
    return positions

In [20]:
# Sample check for skew_min(): this genome reaches its minimum skew at two
# separate positions.
sample_in = 'TAAAGACTGCCGAGAGGCCAACACGAGTGCTAGAACGAGGGGCGTAAACGCGGGTCCGAT'
sample_out = [11, 24]
assert skew_min(sample_in) == sample_out

In [21]:
# Load the input for the minimum-skew exercise; only the first line of the
# file is used as the genome.
with open('data/dataset_7_6.txt', 'r') as f:
    data = f.read().splitlines()
    genome = data[0]

In [22]:
# Display the minimum-skew positions for whatever `genome` currently holds.
skew_min(genome)

[3403, 3404, 3421]

---

In [27]:
def hamming_distance(seq, seq_prime):
    """Count the positions at which two equal-length sequences differ.

    Args:
        seq: First sequence (anything supporting ``len`` and iteration).
        seq_prime: Second sequence, which must be the same length as ``seq``.

    Returns:
        The number of indices ``i`` where ``seq[i] != seq_prime[i]``.

    Raises:
        ValueError: If the two sequences have different lengths. The Hamming
            distance is undefined in that case, and pairing the elements
            with ``zip`` alone would silently ignore the unmatched tail.
    """
    if len(seq) != len(seq_prime):
        raise ValueError(
            'hamming_distance requires sequences of equal length, got '
            '{} and {}'.format(len(seq), len(seq_prime))
        )
    return sum(1 for a, b in zip(seq, seq_prime) if a != b)

In [29]:
# Sample check for hamming_distance(): the two 11-character strings differ
# in three positions.
sample_in = ('GGGCCGTTGGT', 'GGACCGTTGAC')
sample_out = 3
assert hamming_distance(*sample_in) == sample_out

In [30]:
# Load the Hamming-distance input: the first two lines of the file are the
# two sequences to compare.
with open('data/dataset_9_3.txt', 'r') as f:
    data = f.read().splitlines()
    seq = data[0]
    seq_prime = data[1]

In [31]:
# Display the Hamming distance between the two loaded sequences.
hamming_distance(seq, seq_prime)

838

---

In [33]:
def approximate_pattern_match(pattern, genome, threshold):
    """List the start positions where ``pattern`` approximately occurs.

    A position ``i`` qualifies when the window of ``genome`` starting at
    ``i`` with the same length as ``pattern`` has a Hamming distance to
    ``pattern`` of at most ``threshold``.

    Returns:
        The qualifying 0-based start positions in ascending order.
    """
    width = len(pattern)
    last_start = len(genome) - width
    return [
        start
        for start in range(last_start + 1)
        if hamming_distance(genome[start:start + width], pattern) <= threshold
    ]

In [34]:
# Sample check for approximate_pattern_match(): arguments are
# (pattern, genome, threshold) and the expected value lists start positions.
sample_in = ('ATTCTGGA', 
             'CGCCCGAATCCAGAACGCATTCCCATATTTCGGGACCACTGGCCTCCACGGTACGGACGTCAATCAAAT', 3)
sample_out = [6, 7, 26, 27]
assert approximate_pattern_match(*sample_in) == sample_out

In [38]:
# Load the approximate-matching input: line 1 is the pattern, line 2 the
# genome, and line 3 the mismatch threshold (parsed as an integer).
with open('data/dataset_9_4.txt', 'r') as f:
    data = f.read().splitlines()
    pattern = data[0]
    genome = data[1]
    threshold = int(data[2])

In [41]:
# Compute the approximate match positions and print them as one
# space-separated line.
result = approximate_pattern_match(pattern, genome, threshold)
print(' '.join(map(str, result)))

2 4 5 7 9 13 15 18 20 21 23 25 26 28 31 35 36 40 45 46 48 49 50 54 55 57 60 63 68 69 71 73 76 78 81 85 90 94 103 104 107 112 114 115 117 121 124 126 128 133 135 137 139 143 150 153 155 157 159 163 166 168 169 172 178 180 181 182 186 187 190 191 193 199 204 208 209 211 213 214 217 220 222 224 226 228 229 230 231 233 235 239 241 248 251 253 255 257 261 266 269 270 271 272 274 278 280 283 285 287 289 291 293 294 298 300 304 310 311 313 314 320 321 325 326 328 329 334 335 338 339 341 343 344 346 348 350 354 356 357 359 361 364 365 368 370 372 374 376 382 387 389 391 395 398 401 403 405 410 412 414 416 418 421 426 429 430 431 435 437 441 442 446 449 450 453 455 456 458 462 465 467 468 470 471 473 476 479 481 486 490 491 492 495 498 500 501 508 511 512 518 525 527 530 532 535 540 545 547 549 551 558 561 563 564 566 568 569 571 573 576 579 580 582 584 586 588 594 598 600 606 607 611 613 614 616 620 623 625 628 634 635 638 640 641 644 646 647 650 653 658 659 666 668 670 672 674 676 679 680 687

---

In [51]:
def approximate_pattern_count(pattern, genome, threshold):
    """Count the windows of ``genome`` that approximately match ``pattern``.

    Every window of ``len(pattern)`` characters is compared with ``pattern``;
    a window counts when their Hamming distance is at most ``threshold``.
    Overlapping windows are counted separately.
    """
    width = len(pattern)
    windows = (
        genome[start:start + width]
        for start in range(len(genome) - width + 1)
    )
    return sum(
        1 for window in windows
        if hamming_distance(pattern, window) <= threshold
    )

In [52]:
# Sample check for approximate_pattern_count(): arguments are
# (pattern, genome, threshold).
sample_in = ('GAGG', 'TTTAGAGCCTTCAGAGG', 2)
sample_out = 4
assert approximate_pattern_count(*sample_in) == sample_out

In [54]:
# Load the approximate-count input: line 1 is the pattern, line 2 the
# genome, and line 3 the mismatch threshold (parsed as an integer).
with open('data/dataset_9_6.txt', 'r') as f:
    data = f.read().splitlines()
    pattern = data[0]
    genome = data[1]
    threshold = int(data[2])

In [55]:
# Display how many windows of the loaded genome match the loaded pattern
# within the loaded threshold.
approximate_pattern_count(pattern, genome, threshold)

147

---

In [57]:
from numpy import base_repr

def number_to_pattern(number, k):
    """Decode ``number`` into a nucleotide string of at least ``k`` letters.

    The number is written in base 4, left-padded with zeros up to ``k``
    digits, and each digit is mapped 0->A, 1->C, 2->G, 3->T. No truncation
    happens: a number needing more than ``k`` base-4 digits yields a longer
    string.
    """
    alphabet = 'ACGT'
    digits = base_repr(number, base=4).rjust(k, '0')
    return ''.join(alphabet[int(digit)] for digit in digits)

In [58]:
def pattern_to_number(pattern):
    """Encode a nucleotide string as its base-4 integer value.

    Letters map to digits as A=0, C=1, G=2, T=3, with the first letter being
    the most significant digit.

    Args:
        pattern: String (or any iterable of letters) over 'A', 'C', 'G', 'T'.

    Returns:
        The integer whose base-4 digits are the encoded letters. The empty
        pattern encodes to 0.

    Raises:
        KeyError: If ``pattern`` contains a letter other than A, C, G or T.
    """
    digit_of = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    number = 0
    # Horner's scheme: shift the accumulated value one base-4 digit to the
    # left, then add the next digit. Starting from 0 makes the empty pattern
    # well defined instead of failing on int('', 4).
    for letter in pattern:
        number = 4 * number + digit_of[letter]
    return number

In [65]:
def _mismatch_neighbors(kmer, threshold, alphabet='ACGT'):
    """Return the set of strings over ``alphabet`` within ``threshold`` mismatches of ``kmer``."""
    from itertools import combinations, product

    if threshold < 0:
        return set()
    # Rewriting exactly `budget` positions with every possible letter (the
    # original letter included) also produces all strings that differ in
    # fewer positions, so one pass over the largest budget is enough.
    budget = min(int(threshold), len(kmer))
    allowed = set(alphabet)
    neighbors = set()
    for positions in combinations(range(len(kmer)), budget):
        for letters in product(alphabet, repeat=budget):
            candidate = list(kmer)
            for position, letter in zip(positions, letters):
                candidate[position] = letter
            # A window character outside the alphabet can only be matched by
            # replacing it; candidates still containing one are not k-mers.
            if allowed.issuperset(candidate):
                neighbors.add(''.join(candidate))
    return neighbors


def approximate_frequent_words(genome, k, threshold):
    """Find the k-mers with the most approximate occurrences in ``genome``.

    A k-mer "occurs approximately" at a window of ``genome`` when the two
    differ in at most ``threshold`` positions. Every k-mer over A/C/G/T is a
    candidate, including ones that never appear exactly in ``genome``; the
    previous implementation only scored k-mers found verbatim in the genome
    and could therefore miss the true maximum.

    Instead of testing all 4**k candidates against every window, each
    distinct window contributes its multiplicity to all of its mismatch
    neighbours.

    Args:
        genome: Text to scan.
        k: Length of the k-mers.
        threshold: Maximum number of mismatches allowed per occurrence.

    Returns:
        The best-scoring k-mers in lexicographic order (the same order as
        their base-4 encoding with A<C<G<T). If no k-mer matches any window
        (for example when ``genome`` is shorter than ``k``), all 4**k k-mers
        tie at zero and are all returned.
    """
    from collections import Counter
    from itertools import product

    window_counts = Counter(
        genome[start:start + k] for start in range(len(genome) - k + 1)
    )
    totals = Counter()
    for window, occurrences in window_counts.items():
        for candidate in _mismatch_neighbors(window, threshold):
            totals[candidate] += occurrences
    if not totals:
        # Every candidate scores zero, so every k-mer is a maximiser.
        return [''.join(letters) for letters in product('ACGT', repeat=k)]
    best = max(totals.values())
    return sorted(
        candidate for candidate, total in totals.items() if total == best
    )

In [66]:
# Sample case for approximate_frequent_words(): arguments are
# (genome, k, threshold).
sample_in = ('ACGTTGCATGTCGCATGATGCATGAGAGCT', 4, 1)
sample_out = ['GATG', 'ATGC', 'ATGT']

In [68]:
# Both sides are sorted before comparing, so the check does not depend on
# the order in which the k-mers are listed.
result = approximate_frequent_words(*sample_in)
assert sorted(result) == sorted(sample_out)

In [71]:
# Load the frequent-words input: line 1 is the genome; line 2 holds k and
# the mismatch threshold as two integers separated by a single space.
with open('data/dataset_9_7.txt', 'r') as f:
    data = f.read().splitlines()
    genome = data[0]
    k, threshold = (int(i) for i in data[1].split(' '))

In [72]:
# Compute the most frequent approximate k-mers for the loaded input and
# print them as one space-separated line.
result = approximate_frequent_words(genome, k, threshold)
print(' '.join(result))

TTAAC


---

In [73]:
def reverse_nucleotide(nucleotide):
    """Return the complementary base of a single nucleotide.

    Maps A<->T and C<->G. Any other value falls through every comparison and
    yields ``None``.
    """
    pairings = (('A', 'T'), ('T', 'A'), ('C', 'G'), ('G', 'C'))
    for base, partner in pairings:
        if nucleotide == base:
            return partner
    return None

In [74]:
def reverse(sequence):
    """Return the reverse complement of ``sequence``.

    The sequence is read back to front and each base is replaced by its
    complement via ``reverse_nucleotide``.
    """
    backwards = sequence[::-1]
    return ''.join(map(reverse_nucleotide, backwards))

In [143]:
def approximate_reverse_complement_frequent_words(genome, k, threshold):
    """Find the k-mers maximising approximate hits on both strands.

    For each of the 4**k k-mers the score is the number of approximate
    occurrences (at most ``threshold`` mismatches) of the k-mer plus the
    number of approximate occurrences of its reverse complement. A k-mer and
    its reverse complement share the same score, so both table slots are
    filled in one step.

    Returns:
        All k-mers reaching the highest score, ordered by their base-4
        encoding.
    """
    table_size = 4 ** k
    combined = [0] * table_size
    for index in range(table_size):
        # A non-zero slot was already filled while handling its reverse
        # complement; slots whose score is genuinely 0 are simply recomputed.
        if combined[index] != 0:
            continue
        forward = number_to_pattern(index, k)
        backward = reverse(forward)
        mirror_index = pattern_to_number(backward)
        forward_hits = approximate_pattern_count(forward, genome, threshold)
        backward_hits = approximate_pattern_count(backward, genome, threshold)
        total = forward_hits + backward_hits
        combined[index] = total
        combined[mirror_index] = total
    best = max(combined)
    winners = []
    for index, total in enumerate(combined):
        if total == best:
            winners.append(number_to_pattern(index, k))
    return winners

In [144]:
# Sample case for approximate_reverse_complement_frequent_words():
# arguments are (genome, k, threshold).
sample_in = ('ACGTTGCATGTCGCATGATGCATGAGAGCT', 4, 1)
sample_out = ['ATGT', 'ACAT']

In [145]:
# Both sides are sorted before comparing, so the check does not depend on
# the order in which the k-mers are listed.
result = approximate_reverse_complement_frequent_words(*sample_in)
assert sorted(result) == sorted(sample_out)

In [152]:
# Larger case with k=9 and up to 3 mismatches; the function scores all
# 4**9 candidate k-mers, so this is noticeably slower than the k=4 sample.
sample_in = (
    'CTTGCCGGCGCCGATTATACGATCGCGGCCGCTTGCCTTCTTTATAATGCATCGGCGCCGCGATCTTGCTATATACGTACGCTTCGCTTGCATCTTGCGCGCATTACGTACTTATCGATTACTTATCTTCGATGCCGGCCGGCATATGCCGCTTTAGCATCGATCGATCGTACTTTACGCGTATAGCCGCTTCGCTTGCCGTACGCGATGCTAGCATATGCTAGCGCTAATTACTTAT',
    9, 3)
sample_out = ['AGCGCCGCT', 'AGCGGCGCT']

In [153]:
# Compared without sorting, so this also pins the order in which the
# function lists the winning k-mers.
result = approximate_reverse_complement_frequent_words(*sample_in)
assert result == sample_out

In [149]:
# Edge case: the genome is exactly k characters long, so there is a single
# window and many 3-mers tie for the best combined score.
sample_in = ('ATA', 3, 1)
sample_out = ['AAA', 'AAT', 'ACA', 'AGA', 'ATA', 'ATC', 'ATG', 'ATT', 
              'CAT', 'CTA', 'GAT', 'GTA', 'TAA', 'TAC', 'TAG', 'TAT', 
              'TCT', 'TGT', 'TTA', 'TTT']
assert approximate_reverse_complement_frequent_words(*sample_in) == sample_out

In [155]:
# Edge case: a genome made of a single repeated letter, k=2, one mismatch.
assert approximate_reverse_complement_frequent_words('AAAAAAAAAA', 2, 1) == ['AT', 'TA']

In [156]:
# Edge case: a short repeated genome with k=4 and two allowed mismatches.
assert approximate_reverse_complement_frequent_words('AGTCAGTC', 4, 2) == ['AATT', 'GGCC']

In [157]:
# Threshold 0: only exact occurrences of a k-mer or its reverse complement
# contribute to the score.
assert approximate_reverse_complement_frequent_words('AATTAATTGGTAGGTAGGTA', 4, 0) == ['AATT']

In [158]:
# Single exact window: the k-mer and its reverse complement are both
# reported.
assert approximate_reverse_complement_frequent_words('AAT', 3, 0) == ['AAT', 'ATT']

In [159]:
# Short genome with k=2 and one allowed mismatch; four 2-mers tie.
assert approximate_reverse_complement_frequent_words('TAGCG', 2, 1) == ['CA', 'CC', 'GG', 'TG']

In [160]:
# Load the reverse-complement frequent-words input: line 1 is the genome;
# line 2 holds k and the mismatch threshold as two integers separated by a
# single space.
with open('data/dataset_9_8.txt', 'r') as f:
    data = f.read().splitlines()
    genome = data[0]
    k, threshold = (int(i) for i in data[1].split(' '))

In [161]:
# Compute the winning k-mers for the loaded input and print them sorted, as
# one space-separated line.
result = approximate_reverse_complement_frequent_words(genome, k, threshold)
print(' '.join(sorted(result)))

GGCGCC


---