# Week 1 - Where in the Genome Does Replication Begin?
---

In [None]:
def pattern_count(text, pattern):
    count = 0
    for i in range(len(text)-len(pattern)+1):
        if text[i:i+len(pattern)] == pattern:
            count += 1
    return count

In [None]:
pattern_count('GCGCG', 'GCG')

In [None]:
with open('dataset_2_7.txt', 'r') as f:
    data = f.read().splitlines() 
    text = data[0]
    pattern = data[1]

In [None]:
pattern_count(text, pattern)

---

In [None]:
def frequent_words(text, k):
    frequent_patterns = list()
    counts = list()
    for i in range(len(text)-k+1):
        pattern = text[i:i+k]
        counts.append(pattern_count(text, pattern))
    max_count = max(counts)
    for i in range(len(text)-k+1):
        if counts[i] == max_count:
            frequent_patterns.append(text[i:i+k])
    return list(set(frequent_patterns))

In [None]:
frequent_words('ACGTTGCATGTCGCATGATGCATGAGAGCT', 4)

In [None]:
with open('dataset_2_10.txt', 'r') as f:
    data = f.read().splitlines() 
    text = data[0]
    k = int(data[1])

In [None]:
frequent_words(text, k)

---

In [None]:
def reverse_nucleotide(nucleotide):
    if nucleotide == 'A': 
        return 'T'
    if nucleotide == 'T': 
        return 'A'
    if nucleotide == 'C': 
        return 'G'
    if nucleotide == 'G': 
        return 'C'

In [None]:
def reverse(sequence):
    return ''.join([reverse_nucleotide(n) for n in sequence[::-1]])

In [None]:
assert reverse('AAAACCCGGT') == 'ACCGGGTTTT'

In [None]:
with open('dataset_3_2.txt', 'r') as f:
    sequence = f.read().splitlines()[0]

In [None]:
reversed_sequence = reverse(sequence)

---

In [None]:
def pattern_matching(pattern, genome):
    positions = list()
    for i in range(len(genome) - len(pattern) + 1):
        if genome[i:i + len(pattern)] == pattern:
            positions.append(i)
    return positions

In [None]:
pattern_matching('ATAT', 'GATATATGCATATACTT')

In [None]:
with open('dataset_3_5.txt', 'r') as f:
    data = f.read().splitlines() 
    pattern = data[0]
    genome = data[1]

In [None]:
print(' '.join([str(k) for k in pattern_matching(pattern, genome)]))

---

In [None]:
with open('Vibrio_cholerae.txt', 'r') as f:
    genome = f.read().splitlines()[0]

In [None]:
print(' '.join([str(k) for k in pattern_matching('CTTGATCAT', genome)]))

---

In [None]:
from collections import defaultdict

In [None]:
def frequent_words_(text, k):
    counts = defaultdict(int)
    for i in range(len(text)-k+1):
        pattern = text[i:i+k]
        counts[pattern] += 1
    max_count = max(counts.values())
    return [k for k,v in counts.items() if v == max_count]

In [None]:
with open('dataset_2_10.txt', 'r') as f:
    data = f.read().splitlines() 
    text = data[0]
    k = int(data[1])

In [None]:
%time
frequent_words(text, k)

In [None]:
%time
frequent_words_(text, k)

---

In [5]:
from numpy import base_repr

In [6]:
base_repr(5437, 4)

'1110331'

In [7]:
int('1110331', 4)

5437

In [8]:
def pattern_to_number(pattern):
    numbers = {'A': '0', 'C': '1', 'G': '2', 'T': '3'}
    sequence = ''.join([str(numbers[letter]) for letter in pattern])
    return int(sequence, 4)

In [9]:
['0']*3

['0', '0', '0']

In [10]:
def number_to_pattern(number, k):
    letters = ['A', 'C', 'G', 'T']
    sequence = base_repr(number, base=4)
    sequence = ''.join(['0'] * (k-len(sequence))) + sequence
    return ''.join([letters[int(s)] for s in sequence])

In [11]:
pattern_to_number('CCCATTC')

5437

In [12]:
number_to_pattern(5437, 7)

'CCCATTC'

In [27]:
def computing_frequencies(text, k):
    size = 4**k
    freqs = [0] * size
    for i in range(len(text)-k+1):
        pattern = text[i:i+k]
        j = pattern_to_number(pattern)
        freqs[j] += 1
    return freqs

In [14]:
assert computing_frequencies('ACGCGGCTCTGAAA', 2) == [2, 1, 0, 0, 0, 0, 2, 2, 1, 2, 1, 0, 0, 1, 1, 0]

In [15]:
with open('dataset_2994_5.txt', 'r') as f:
    data = f.read().splitlines() 
    text = data[0]
    k = int(data[1])

In [None]:
result = computing_frequencies(text, k)

In [None]:
print(' '.join([str(k) for k in result]))

---

In [26]:
def faster_frequent_words(text, k):
    freqs = computing_frequencies(text, k)
    max_freq = max(freqs)
    patterns = [number_to_pattern(i, k) for i,freq in enumerate(freqs) if freq == max_freq]
    return patterns

In [None]:
faster_frequent_words('ACGTTGCATGTCGCATGATGCATGAGAGCT', 4)

---

In [16]:
from tqdm import tqdm 

def find_clumps(genome, k, window_size, min_freq):
    clumps = list()
    for i in tqdm(range(len(genome) - window_size + 1)):
        window = genome[i: i + window_size]
        freqs = computing_frequencies(window, k)
        new_clumps = [n for n, f in enumerate(freqs) if f >= min_freq]
        clumps.extend([number_to_pattern(c, k) for c in new_clumps])
    return list(set(clumps))

In [17]:
sample = 'CGGACTCGACAGATGTGAAGAACGACAATGTGAAGACTCGACACGACAGAGTGAAGAGAAGAGGAAACATTGTAA'
solution = ['CGACA', 'GAAGA']
result = find_clumps(sample, 5, 50, 4)

100%|██████████| 26/26 [00:00<00:00, 5037.50it/s]


In [18]:
assert sorted(result) == sorted(solution)

In [None]:
with open('clump_finding.txt', 'r') as f:
    data = f.read().splitlines() 
    text = data[1]
    params = [int(p) for p in data[2].split(' ')]
    solution = data[4].split(' ')

In [None]:
result = find_clumps(text, params[0], params[1], params[2])

In [None]:
assert result == solution

In [None]:
with open('dataset_4_5.txt', 'r') as f:
    data = f.read().splitlines() 
    text = data[0]
    params = [int(p) for p in data[1].split(' ')]

In [None]:
result = find_clumps(text, params[0], params[1], params[2])

In [None]:
result

In [None]:
print(' '.join([str(r) for r in result]))

In [None]:
# E coli
with open('E_coli.txt', 'r') as f:
    data = f.read().splitlines() 
    ecoli = data[0]

In [None]:
result = find_clumps(ecoli, k=9, window_size=500, min_freq=3)

In [None]:
len(result)

In [28]:
from pathos import multiprocessing as mp
from functools import reduce

def find_clumps_parallel(genome, k, window_size, min_freq):
    windows = [genome[i: i + window_size] for i in range(len(genome) - window_size + 1)]
    
    def analyse_window(window):
        computing_frequencies(window, k)
        freqs = computing_frequencies(window, k)
        new_clumps = [n for n, f in enumerate(freqs) if f >= min_freq]
        return [number_to_pattern(c, k) for c in new_clumps]
    
    with mp.ProcessPool() as pool:
        clump_sets = pool.map(analyse_window, windows)
    
    clumps = reduce(clump_sets, sum)
    return list(set(clumps))

In [29]:
sample = 'CGGACTCGACAGATGTGAAGAACGACAATGTGAAGACTCGACACGACAGAGTGAAGAGAAGAGGAAACATTGTAA'
solution = ['CGACA', 'GAAGA']
result = find_clumps_parallel(sample, 5, 50, 4)
result == sample

NameError: name 'computing_frequencies' is not defined

In [None]:
result = find_clumps_parallel(ecoli, k=9, window_size=500, min_freq=3)