# Week 3 - Which DNA Patterns Serve as Molecular Clocks?

In [1]:
def change_one(sequence, position):
    """Return the variants of ``sequence`` that differ only at ``position``.

    The symbol at ``position`` is replaced in turn by each of A, C, G and T
    (in that order), skipping the symbol that is already there.  The
    variants are returned as a list of strings.
    """
    symbols = list(sequence)
    current = symbols[position]
    head = ''.join(symbols[:position])
    tail = ''.join(symbols[position + 1:])
    variants = []
    for base in 'ACGT':
        if base == current:
            continue
        variants.append(head + base + tail)
    return variants

def change_many(sequence, positions):
    """Return ``sequence`` plus every variant obtained by altering ``positions``.

    Each listed position is either left as it is or replaced by one of the
    other bases (via ``change_one``), in every combination.  The first
    element of the result is always the unmodified ``sequence``.

    Args:
        sequence: the string to mutate.
        positions: iterable of indices that may be changed.  It is only
            read; the caller's object is left untouched.

    Returns:
        A list of strings, starting with ``sequence`` itself.
    """
    seqs = [sequence]
    # Visit the positions last-to-first, the same order the former
    # ``positions.pop()`` loop used, but on a copy so the argument is not
    # emptied as a side effect.
    for p in reversed(list(positions)):
        # The comprehension is fully built before extend() runs, so it only
        # sees the sequences that existed before this position was handled.
        seqs.extend([v for s in seqs for v in change_one(s, p)])
    return seqs


from itertools import combinations

def neighbors(pattern, threshold):
    """Return ``pattern`` together with its variants within ``threshold`` changes.

    For every subset of 1..``threshold`` positions, the variants produced by
    ``change_many`` are collected into a set, so duplicates are dropped.
    The result is a list in no particular order.
    """
    found = {pattern}
    indices = range(len(pattern))
    for size in range(1, threshold + 1):
        for chosen in combinations(indices, size):
            # Hand over a fresh list for each subset of positions.
            for variant in change_many(pattern, list(chosen)):
                found.add(variant)
    return list(found)

In [2]:
def motif_enumeration(dna, k, d):
    """Return the k-mers that lie within ``d`` changes of some k-mer in every string.

    For each string in ``dna`` the ``neighbors`` of all its k-mers are
    gathered; the answer is the intersection of those sets across strings.

    Args:
        dna: iterable of DNA strings.
        k: length of the k-mers.
        d: value passed to ``neighbors`` as its threshold.

    Returns:
        A list of k-mers in no particular order.  The list is empty when
        ``dna`` is empty or when a string is shorter than ``k`` (such a
        string contributes no candidates).
    """
    shared = None
    for chunk in dna:
        candidates = set()
        for i in range(len(chunk) - k + 1):
            candidates.update(neighbors(chunk[i:i + k], d))
        shared = candidates if shared is None else shared & candidates
        if not shared:
            # Nothing can survive further intersections; stop early.
            break
    # ``shared`` is still None when ``dna`` had no strings at all.
    return [] if shared is None else list(shared)

In [3]:
# Sample case for motif_enumeration: four DNA strings with k = 3 and d = 1.
dna_ex = [
    'ATTTGGC',
    'TGCCTTA',
    'CGGTATC',
    'GAAAATT',
]
# Argument tuple in (dna, k, d) order, unpacked with * in the check below.
sample_in = (dna_ex, 3, 1)
# Expected motifs for the sample case.
sample_out = ['ATA', 'ATT', 'GTT', 'TTT']

In [4]:
# Both sides are sorted because motif_enumeration builds its result from a
# set, so the order of the returned list is not fixed.
assert sorted(motif_enumeration(*sample_in)) == sorted(sample_out)

In [5]:
# Second motif_enumeration run: six DNA strings with k = 5 and d = 2.
dna_ex = [
    'TACTGAGTTAGAGGTATCTGTGCAA',
    'ATGTAGGTAACACTTCCCCACGGCT',
    'CGGAAACTGCGTTTCTATCACCGTT',
    'TACGCGGTAGTGCTTAGCGCTGAAA',
    'AAGGCAGAGTCTGTTGGCAAATAAA',
    'CTTACGTTGGGCCAGCGTAATCGAT',
]
sample_in = (dna_ex, 5, 2)
# The result is kept in `motifs`; this cell does not print or check it.
motifs = motif_enumeration(*sample_in)

---

In [6]:
def hamming_distance(seq, seq_prime):
    """Count the aligned positions at which ``seq`` and ``seq_prime`` differ.

    The two inputs are paired with ``zip``, so the comparison stops at the
    end of the shorter one.
    """
    return sum(1 for left, right in zip(seq, seq_prime) if left != right)

In [7]:
def distance(kmer, dna):
    """Return the total distance between ``kmer`` and the strings in ``dna``.

    For each string, the smallest Hamming distance between ``kmer`` and any
    window of the same length is taken; those minima are summed.

    Args:
        kmer: the pattern to compare.
        dna: iterable of DNA strings.

    Returns:
        The sum of the per-string minimum distances (0 for an empty ``dna``).

    Raises:
        ValueError: if a string is shorter than ``kmer``, because it has no
            window to take the minimum over.
    """
    # The window length is the pattern's own length.  It used to be read
    # from a module-level ``k``, which only worked when a notebook global
    # happened to match ``len(kmer)``.
    k = len(kmer)
    total = 0
    for chunk in dna:
        total += min(
            hamming_distance(kmer, chunk[i:i + k])
            for i in range(len(chunk) - k + 1)
        )
    return total

In [8]:
from numpy import inf

def median_string(dna, k):
    """Return the k-mer occurring in ``dna`` with the smallest ``distance`` to it.

    Only k-mers that actually appear in one of the strings are considered
    as candidates.
    NOTE(review): an exhaustive median-string search would try all 4**k
    k-mers, so the true optimum may lie outside this candidate set.
    Confirm that the restriction is intended.

    Args:
        dna: iterable of DNA strings (iterated more than once).
        k: length of the k-mers.

    Returns:
        The best candidate, or None when no string has a k-mer of length
        ``k``.  Ties go to the lexicographically smallest candidate.
    """
    # Sort the distinct candidates so that ties are broken the same way on
    # every run; plain set order varies with string hash randomisation.
    candidates = sorted({
        chunk[i:i + k]
        for chunk in dna
        for i in range(len(chunk) - k + 1)
    })
    # min() keeps the first of several equally good candidates.
    return min(candidates, key=lambda kmer: distance(kmer, dna), default=None)

In [9]:
# Sample case for median_string: five DNA strings and k = 3.
# Note that `k` and `dna` are notebook-level globals from here on.
k = 3
dna = [
    'AAATTGACGCAT',
    'GACGACCACGTT',
    'CGTCAGCGCCTG',
    'GCTGAGCACCGG',
    'AGTTCGGGACAG',
]
# Argument tuple in (dna, k) order, unpacked with * in the check below.
sample_in = (dna, k)
# Expected median string for the sample case.
sample_out = 'GAC'

In [10]:
# Check median_string against the expected sample answer.
assert median_string(*sample_in) == sample_out

In [11]:
# Larger median_string input: ten DNA strings, searched with k = 6.
dna = [
    'AACTATGGGATAAAACGCTTGATTATTAATGCTAGAATTCCG',
    'GCCGACAGTGCTCGGTGCCCCTAGTATGTTCCCGGTAAACGC',
    'TTAACTGCAGTATAAATTAGCCCCAGCCGTTAACTTAAACGC',
    'TGTTCCGCACCGTAACGCGAATACTCAAATTTACCCTACGGA',
    'GTATGACGGTTATAACGCGATGATCAGTCTAGAACGGGGCAG',
    'GTTGGGAACTCCACTGTCAGCGGCTGACGCCAACGCCCGTGT',
    'ACACTTCTCCTACAACGCATCCCTGGCCATTTGCCCACACGT',
    'CAACGCGATTATGCGGGTTACTTGGTTGACCCCTGAGATTTG',
    'TCCGTTATTGATGTTTAGAGATGAACGCTGAAACGCGTCGCG',
    'ACTCGGACTATAAAACGCCACCCCCATCGTCTCGTTCAGGTT',
]
k = 6

In [12]:
# Display the median k-mer for the ten-string dataset defined above.
median_string(dna, k)

'AAACGC'

---

In [13]:
def profile_most_probable_kmer(text, k, profile):
    """Return the k-mer of ``text`` that is most probable under ``profile``.

    ``profile`` is indexed as ``profile[row][column]``, with rows in the
    order A, C, G, T and one column per k-mer position.  Entries are passed
    through ``float()``, so numeric strings are accepted.  A k-mer's
    probability is the product of its per-position entries.  When several
    k-mers share the top probability, the one occurring first in ``text``
    is returned.
    """
    row_of = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    scores = {}
    for start in range(len(text) - k + 1):
        window = text[start:start + k]
        if window in scores:
            # Already scored at an earlier offset.
            continue
        likelihood = 1
        for column, symbol in enumerate(window):
            likelihood *= float(profile[row_of[symbol]][column])
        scores[window] = likelihood
    # Dict order is insertion order, so max() favours the earliest k-mer.
    return max(scores, key=scores.get)

In [14]:
# Sample case for profile_most_probable_kmer.
text = 'ACCTGTTTATTGCCTAAGTTCCGAACAAACCCAATATAGCCCGAGGGCCT'
k = 5
# Profile matrix as text: one line per base, in the A, C, G, T row order
# that profile_most_probable_kmer uses, and one column per k-mer position.
profile_ = """
0.2 0.2 0.3 0.2 0.3
0.4 0.3 0.1 0.5 0.1
0.3 0.3 0.5 0.2 0.4
0.1 0.2 0.1 0.1 0.2
"""
# Split into rows of string tokens, skipping the empty first and last lines.
# The tokens stay strings; profile_most_probable_kmer applies float() itself.
profile = [line.split(' ') for line in profile_.split('\n') if line]

# Argument tuple in (text, k, profile) order and the expected answer.
sample_in = (text, k, profile)
sample_out = 'CCGAG'

In [15]:
# Check profile_most_probable_kmer against the expected sample answer.
assert profile_most_probable_kmer(*sample_in) == sample_out

In [16]:
text = 'GGGCGAACGTATAGGCCCTGTCGTATCCTACGAGCCGGGCCTTTATTCTGCCGAGTATCTTGGAACACTCTTCAGACGCCGCAAGTCTCAAGTGAATTTCCACCCCGTTAAGGTGTACCATTTATGGTAACGCCAAAGCCAGCATATAACATACACCTAATGCAAAGGCATGGGGGTCTACGTTTCATTTCTGTGGCATGGTTATTGTCGATCTCGATTCAGATACAAACATCGAGCACTGGTCCAGGCGTTCGTGCTTGGAGAAGAATACAGTCACGTCGATCATAGTCTGAGAGACCAGCGGTAGCGCAAACTCACTGCGGGGGCGATGATATAGGGGAGCCCTCAGAGAACTATATGTTATCTGGCATGAACGGAAAGATCTGAAATTAATTGTTATCTACAGCCCCTGGTAAAGGCAAAACTAGCGCCGTTGTCCGGTTAACCAGCGATCTCGGATCGATGACTCGGTCCAGGCCAGGACGAACCCCATGACGGGGCCACCTTGTGAGCTCTGTTGAAAGCTCGGATGTCACGCGGTCGACAACGCCCAACCACCAAATAATAGGGGATAACCTGGATAAGCATTTTCAGGTGTAACTAGTGACCTCCACGATGCTAGCCTGTGTGGCGGATTCATCTTGAGAGCGAGAGGTGCGCGTCGTTGTCACAAGCGTTTCGCGCGCGAAAAGCAGATACGGGCCGTTAACTGAATTTCGTGACAGTATTATTTAACCGGTGGAGAGTTATTGTATATGAATGCCGGCTTGACGGAAAATTATTCCCCTACGATAATTTTTGTCCAAAGCATACCGCTAGCTTAATTTCGGCTTAGTGAGTGAATTAGTACGAAAAAGCCTCGTAGACACCGCCTAGTACATTTCAACCAACCCCCGATGATCCAAGGCCGGAGCTTTTAGCAATGCGAGTGGGCTGCCTGCAGTGATGCAGTGTTTGTAGTCCTTTTGACAGACTCCAAACAGCTGCGGACAGGGGCGTT'
# Settings for the long `text` defined on the previous line: 12-mers scored
# against a 4 x 12 profile (rows in A, C, G, T order).
k = 12
profile_ = """
0.205 0.301 0.241 0.241 0.301 0.265 0.313 0.253 0.205 0.241 0.277 0.157
0.253 0.265 0.325 0.229 0.217 0.181 0.253 0.241 0.325 0.289 0.265 0.337
0.169 0.265 0.205 0.313 0.265 0.289 0.265 0.205 0.193 0.277 0.217 0.301
0.373 0.169 0.229 0.217 0.217 0.265 0.169 0.301 0.277 0.193 0.241 0.205
"""
# Split into rows of string tokens, skipping the empty first and last lines.
profile = [line.split(' ') for line in profile_.split('\n') if line]

In [17]:
# Display the most probable 12-mer of the long text under the profile above.
profile_most_probable_kmer(text, k, profile)

'TACGAAAAAGCC'

---

In [72]:
import numpy as np

def form_profile(motifs):
    """Return the column-wise base frequencies of ``motifs``.

    The result is a list of four rows (A, C, G, T), each holding one
    frequency per column.  The number of columns is the length of the first
    motif.  Each count is divided by the total count of its column.
    """
    row_of = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    width = len(motifs[0])
    tally = np.zeros((4, width))
    for motif in motifs:
        for col, symbol in enumerate(motif):
            tally[row_of[symbol], col] += 1
    column_totals = tally.sum(axis=0, keepdims=True)
    return (tally / column_totals).tolist()

In [73]:
def score_motifs(motifs):
    """Return the score of ``motifs``: the number of non-majority symbols.

    For every column (the column count comes from the first motif), the
    count of the most frequent base is subtracted from the number of
    motifs; the per-column results are added up.
    """
    row_of = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    depth = len(motifs)
    tally = np.zeros((4, len(motifs[0])))
    for motif in motifs:
        for col, symbol in enumerate(motif):
            tally[row_of[symbol], col] += 1
    # Per column: motifs that do not carry the most common base.
    mismatches = depth - tally.max(axis=0)
    return sum(mismatches)

In [74]:
# Ten 2-mers: the first column has 7 T's (3 mismatches) and the second
# column has 6 C's (4 mismatches), so the expected score is 3 + 4 = 7.
assert score_motifs([
    'TC',
    'CC',
    'AC',
    'TT',
    'AA',
    'TT',
    'TC',
    'TA',
    'TC',
    'TC',
]) == 7

In [75]:
def greedy_motif_search(dna, k, t):
    """Greedily pick one k-mer per string in ``dna`` with a low ``score_motifs``.

    Every k-mer of the first string is tried as a seed.  For each following
    string, a profile is formed (``form_profile``) from the motifs chosen so
    far and the profile-most-probable k-mer of that string is appended.  The
    collection with the lowest score wins.  The starting point is the first
    k-mer of every string.

    ``t`` is accepted for signature compatibility but is not used; the
    number of strings is taken from ``dna`` itself.
    """
    best_motifs = [chunk[:k] for chunk in dna]
    first, rest = dna[0], dna[1:]
    for start in range(len(first) - k + 1):
        candidate = [first[start:start + k]]
        for chunk in rest:
            profile = form_profile(candidate)
            candidate.append(profile_most_probable_kmer(chunk, k, profile))
        # A strict comparison keeps the earlier collection on ties.
        if score_motifs(candidate) < score_motifs(best_motifs):
            best_motifs = candidate
    return best_motifs

    GreedyMotifSearch(Dna, k, t)
        BestMotifs ← motif matrix formed by first k-mers in each string from Dna
        for each k-mer Motif in the first string from Dna
            Motif_1 ← Motif
            for i = 2 to t
                form Profile from motifs Motif_1, …, Motif_(i-1)
                Motif_i ← Profile-most probable k-mer in the i-th string in Dna
            Motifs ← (Motif_1, …, Motif_t)
            if Score(Motifs) < Score(BestMotifs)
                BestMotifs ← Motifs
        return BestMotifs

In [76]:
# Sample case for greedy_motif_search: five DNA strings, k = 3, t = 5.
k = 3
t = 5
dna = [
    'GGCGTTCAGGCA',
    'AAGAATCAGTCA',
    'CAAGGAGTTCGC',
    'CACGTCAATCAC',
    'CAATAATATTCG',
]
# Argument tuple in (dna, k, t) order, unpacked with * in the check below.
sample_in = (dna, k, t)
# Expected motifs, one per input string.
sample_out = ['CAG', 'CAG', 'CAA', 'CAA', 'CAA']

In [77]:
# Check greedy_motif_search against the expected sample answer.
assert greedy_motif_search(*sample_in) == sample_out

In [78]:
k = 12
t = 25
dna_ = """
CGTCAAGGCATGTCGGGCTCACGCTGACGATGAGACATAACGCCAGCGGTCGGTTACTGGGCCGGAATCTGGGCCATAAGACAAAAGGACGAAGACGTAATAGAATGGCAGATAACCGTACCATGTTCAAAGGGGCGTACGCTAAAATTCGCGACG
TCTTGTGCACGATCTACGCGAACGCTGGCCAGTAATGAAAACGCAGACGTGTGGCATTTAATAATATCGGTTGTTGTTCCACCGACTGCCCCGATGGGGCCCGTTTGGGCAGGCAATAGCAAGTTCCCCCCTGTCATTTCTTGTGTTCCGGCCAAC
TTAATGCGGACTACAGCTCGTCCAGGGCCACGCTGCGTCTACCGTAGCTAGGACGTAGACATGAGCATTGCTATGTTTGATTGACATTCCTCCGGATCCGTGCGAACTGCCTCGACGAACTAGAGCTTGTACCGCGCGTCGGTTGCTGCTCTTACT
TACAACGTAGACTCGTCTGCGATGTGGTAGCTCAGGGCGGGAGCTCGATCCCTTCCGACGCTTGGGCAACCTCCCTGTCGCCTAATATGCGTGCACTGGAAGCGGATCAAGTTGCAAACTTGAAGCGATGCGGTTGAATTGAGCTGATTACGTTGT
TTGACCCGCTGGAAGGGACGAAAAGCTACAACACGGCATTATGGATGGGAAATGTACAAATCGGATAGGTCCAGTGCCATAGCGACTGTTCATGAATGGGACGGCGACGACTACGAAGACGACCATGCAACTTCCGGTTGGAATAGTTCTGTGAGA
CAAGAGGCTGAAGAGTTCAATGTTTAATACGAAGACTAGCATAGTTTGCCTACTAGGAGTCTGCTGAATCCACAGCGGACGATATCCCCGTTCCAGATTAGTTGGCTCTGGATAACCGTCAACAAGGGTTGTCAGGATGTCTTACTACAGCCTCCC
ATAAGACCATGTCTTAACCCCAGCAGGTTCTTAATTTAAAACGCAGACAAGTCCGCTGACGGCACCACTTCGACGCGCCGGGCCGCAGAGTCAATTAGCTGGGTTGCGTCGCATGCTTTGGTGACCGCGCTACCACTTTATTTTGTCCTGTAGCAA
TAATACTTCAAGTAAACGCCAATACCGGAATACTCGAATCCCGTGCCGCCTGTCGGAGGTCACCCCATCCAAATTTTCTCGATTGTCACCGTGTCGAAGAACAATGATAAGTGCCGACTCCGTGCTGATACGAACGACGAAGACTCACGATTGACC
TGTATCACACAACAATGATAGGATCTGACCTAACCTGTCCAAGATGGCGTGCAACGAAATTACTACGTTGAGGTGATAGATACAGGTCTTCAGCTTGCTATACGGTCAGCCACCAATTCTTAAGACGCAGACGAAAACTACCACATTGCATCAGAA
GGAGAACGGCCGGGGAATAGACGCATACTGAGGACATACATCCTTTGCCTACGTAGTGAGCTTGCCTAAGATAGATATAACCGTCATAAGACTCGCGAGGCCGAGAATTCTGCTTCGTTCACGGGTTAGAGTAACCACGAAGACACGTCACCTGGA
TATTTGATCCGTTTAATGCTTGGGCGACACTCTGGTGAAAACCAAATAGAGCCTGTAGATAAGGTGGGGTTGAACTATTGAAGCCGCATTTGATCCCTAGCAATTACGAGAACTTTACCACCATTTGGAGCGATGCCGATTGGCAAGCACGTAGAC
TGGAAGTGTACGCCTAAGCTACTACCACCTCGAGTGCTTCCCGAGGGGCCGAATTATCCTGATCACGTAGACTGGATTAAGCACGGACCGTCAAAGGACTCAAACATGTAGCTTCGCCTGAAGCCCCCGTGCCCACGGGCTGGAGTGCCAGACACC
GACGACGGAGACCTTTCTGGAATTTTGAATGTCATATTGACCTTGCTCGAGTGACACTTTAGGATTAACGATGCAACTTTGACCCAGGGGCCATCATCATAACATGGTAATCAGGAGATACGTCAGCCGACCATCTGCCTATGAACTAGCCCGTAA
GATAACGGAGACGTCTCAAGTCAGATACTCCTTCCGCAGTGCTACCGTCGTCGTCAATAACTGCTGGAAGCCGCGCGCCTAGTTTAGAAACACTTTTGCCATTTTTGAGAATGGTTACCAGAGAAGGCTTCAACCAGAGCTTAATAACAGTGCATG
GGGTTAGAGCTCGGAAACAGCGTAACGGAGCTGGACCCCCGGCGGCTTCATACGAGGACACCGTAAACGGGCAACTGGGTTTAAATTAGCAGTATGAACCCCATATAGGTAAATGTCCACTTTATGACGGCTTGACCCGAATCTTATAACGAAGAC
GTTATTAATCGAATCGCTGACTCGCGTTGGTCTTCGTGTGTCATTACCGCAATCGGGCATTTATGGCAGTAGAGCAAACTCTTTAATGTCTCTAACGCCAATTCTTGCCAACGTGTGAGTGACGACGTAGACTATCGAAAGTATAGGGTGGGCCCC
ACCAAGAGCGAAAAGGCACCCTGCACCACCACTCAGCAAGGGGAATACCGCGAAGACAGTACACACGATGTGCACGACGAAGACAGCGGCATGTCAGAAAGCACGAGCGAAGTCCTTAGTAAAGAAACACGATAGGCCCATAATGCAGTTACGAAG
ATCGCACGCGCTAAACTATTGTCATTCGGCCCGGTTTGGTGCGTAATCAAGATGGGGCGAACCATCTCAGAGCAGAATAATAGTTGCCAGGCACGGGAACCACACTCGGAAAACGGAGACGTGCTAATCAGATGGGCCATCTGGCCGAAGCCGATT
ACCCTGACGTGTCTGGATGTTAGTCGGCGTATAACCTAGGTCGTCTGTATCGGCGAATGCGCGGCCGCCAGAAACAACGAAGACGTTACACGACTCCTGCGGTCCTTCCCGTTACGAAGAGTGGAATGCTACCGCCGATTCTTCACTTACATAGTC
AAGCAACGCGGTCTAAGATTTGCCCGGTCAAACTGATGTCTCTATCAGCTAACCAACTGTATCATGTAATCAGATAACCGCGTAATGGTGACATGTCCCAATTGTTAGGACTACGTAGACCTTTACTGATATTGCTCCTCAGCCAACTGCTCGATG
GTTGTAAGTCCTTCAATGGATGTTGTAGCGCTTTTGCTGTTCTAACCGCCAACGACCGTTCAGTACGTAGACGCACCCACGTAGGAAGACCATACAACATGATCCGCAGCATGGCTCAGCGAAGCTATTAAACAAGCCGCTGGGATCGGCCCGTCT
GCTGTCTCCGCCGAGGGACTACTGATGGACCCGACTCGACAGTTGTTTAAAAGTAGCTCGACCCTGCGTGCATCCTCTAAGTTAGGGTCGCCTCCGGCGGCCCAATCTGATCACGCAGACTGGGGGCTCCCAAGCGTTCGTCTGCGGGAGGGCGAC
TCGAGGTGTCAAAACTCACGCCAGGGCGTCAGCCCCCTATCACGTCGCGCCGCTCGGACAAAGTGAGTCTTACAATACGCAGACAACCATCCAGATACGTCCTCTTTACTGGCGAGTTGCTGTGCCTGCGTGTTGTGGATGGCAGTTTGCTAATCT
TTGACGAATTAGCATTGTATCCGGTCGGAAACGAAACGATGCTGGCCCGTGAAGAATACATTCACTCAAAGCCATGATAACTTTTGACAACCGAGTACACATCTTTCTAGAGGCCGGCTGGAGTACGAAGACTCTATAGAGATTCTAATTGCTCTA
TTGTGTGCTAAGCGCAGGTTAGGGCCTTTGCTTAAATATATTTACTAAAGCAAGCTCGACACACGTGACAACACAAACGATGTGCAAAACGTAGACATTCTCTTACTATTGATCCTGTATAGGGCAACTAGCATGCAGGCTAATGTCCCCACTTCT
"""
# One DNA string per line of the literal; the `if line` filter drops the
# empty entries produced by its leading and trailing newlines.
dna = [line for line in dna_.split('\n') if line]
# Run the greedy search (without pseudocounts) on the full dataset.
result = greedy_motif_search(dna, k, t)

In [79]:
# Print the chosen motifs, one per line.
print('\n'.join(result))

AGGACGAAGACG
TCTTGTGCACGA
TTAATGCGGACT
TACAACGTAGAC
TGGATGGGAAAT
AAGAGGCTGAAG
AAAACGCAGACA
ACGACGAAGACT
AAGACGCAGACG
ACCACGAAGACA
ATTACGAGAACT
AGGACTCAAACA
ACGACGGAGACC
ATAACGGAGACG
AGCTCGGAAACA
ATGTCTCTAACG
ACGACGAAGACA
AAAACGGAGACG
ACAACGAAGACG
AAGCAACGCGGT
TAGACGCACCCA
ATCACGCAGACT
AATACGCAGACA
AGTACGAAGACT
ACAACACAAACG


---

In [80]:
import numpy as np

def pseudo_form_profile(motifs):
    """Return column-wise base frequencies of ``motifs`` with pseudocounts.

    Works like a plain frequency profile, except that every (base, column)
    count starts at 1 instead of 0.  The result is a list of four rows
    (A, C, G, T) with one frequency per column; the number of columns is the
    length of the first motif.
    """
    row_of = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    n_columns = len(motifs[0])
    # Start every cell at one (the pseudocount).
    tally = np.ones((4, n_columns))
    for motif in motifs:
        for col, symbol in enumerate(motif):
            tally[row_of[symbol], col] += 1
    column_totals = tally.sum(axis=0, keepdims=True)
    normalised = tally / column_totals
    return normalised.tolist()

In [81]:
def pseudo_greedy_motif_search(dna, k, t):
    """Greedy motif search that builds its profiles with ``pseudo_form_profile``.

    Each k-mer of the first string seeds a motif collection.  For every
    later string, the profile-most-probable k-mer under the profile of the
    motifs picked so far is appended.  The collection with the lowest
    ``score_motifs`` is returned, starting from the first k-mer of every
    string.

    ``t`` is accepted for signature compatibility but is not used.
    """
    best_motifs = [chunk[:k] for chunk in dna]
    seed_string = dna[0]
    later_strings = dna[1:]
    last_start = len(seed_string) - k
    for offset in range(last_start + 1):
        picked = [seed_string[offset:offset + k]]
        for chunk in later_strings:
            profile = pseudo_form_profile(picked)
            picked.append(profile_most_probable_kmer(chunk, k, profile))
        # Replace the incumbent only on a strictly better (lower) score.
        if score_motifs(picked) < score_motifs(best_motifs):
            best_motifs = picked
    return best_motifs

In [82]:
# Sample case for pseudo_greedy_motif_search: five DNA strings, k = 3, t = 5.
k = 3
t = 5
dna = [
    'GGCGTTCAGGCA',
    'AAGAATCAGTCA',
    'CAAGGAGTTCGC',
    'CACGTCAATCAC',
    'CAATAATATTCG',
]
# Argument tuple in (dna, k, t) order, unpacked with * in the check below.
sample_in = (dna, k, t)
# Expected motifs, one per input string.
sample_out = ['TTC', 'ATC', 'TTC', 'ATC', 'TTC']

In [83]:
# Check pseudo_greedy_motif_search against the expected sample answer.
assert pseudo_greedy_motif_search(*sample_in) == sample_out

In [85]:
k = 12
t = 25
dna_ = """
TCGGCTATGATCTCCCACACCTACGCCTAGTCACTGGCATCGCGTTCCATGCACGCGCTGCTTAACCTCTGCATTTGAGAGCTAATAAAGCCAAGCGTGCTCCTATCCGTAGGGCTTCAGCGTTGTGCCGCGGAAAAGGCCGCTTACCCGCTTCTT
TGGTAGCGTGGAGGAATCCCTAAACGGGTTCAAAACTAGGGGAAGAGATCGTGCTGGCCGGTGTAACTCTCAGCTGGACCGTCAATCTGTGTGCTATTCGGAAAAACATGGTAGTTGGGTGTACTGTTACGTCGGGATTCGCCGGATGCCCCGGTG
TTGGAGCGGCTGGCATCTACGTATATTCGGATTCTCGGATTTGATCTTAAGTAATACCCTAGATTACAAATGGCCATGCGCCAGCGTCAAAGTCCGCAAATAGAAAGTGTGGAATGAAAGGTTTGACTTGGGTGTACAGTATTCATCTGAGCTCTA
AATGGACGAGGGATCTTGATAGAACAACTCGCCAGTGCTATGGGCCGACAGAGAAGGTACATTTGAGGTCTAATTAGGTGGAGAACGTCCAGACTCAGCACTTAAGACCAAGACAAGAACCATTCGCAAGCGACTCCTGCCATCATAGCGCCTAAG
TTCAATATAATTGATCCGGCGCGGATTCTTCTACCAGGACAAGGCTGAGGCAGCATAAAGTACACGCTAACCTACGACTTGGCCTCTTCCGGATGCATCTGTGCCCTAAATGGTCTATCGATAGAACTTTCTGTCACACATTGTTCGATGCTTATT
ATTTGGCGTGGCCTGAAACCACAACGTGGTTTGACTGTCTTATCAGTTCGATTAGTGTGTGTAACCGCACGAACTTCATTTCAAGTCGGCCGCACATAGGACCTAGGTTCACTGAGCTTGTATAATCTAGTAACCCTCACAGTTATTTGAGGTCTA
AGGTCTGTGAAGACCGTACCGATAAGTTGCCACGTTACTCTCACTTTAAGGCTCATAAGGCACAGGCGGAATCTGCGGACAACCGGAACTCCCCAGAGGCCCTTGGCGATGTGAGACCTAATAACGTGCGCATGCGGCTTCCGATTTAGGGCGGCG
AGATTTCGGATCCGCACCAAGTAAGCCGCAATTAGCTATATCACCGCTATATGTGGGCTAGATGGATGGAGCATGACCGGACCTCAGTTTACCAACGAAGAGTAGTTCTCTACTAAGCAACTCTTCGTTGACCGTCTCTACTTCTATACTAGGGGA
TGCGAAACACGTTGGTGATAACCAGTTTGCTTGAATATGCTGTAATTGCATCGGTTGGGACCGTTCAGAGAAATGCAAGAAAATTTTCATACAACCCCGGCCCGGAGCGAATACAACGATGTTCGTGTACAAATATGGGAACTAAAGGGGATACGT
TGGCTCTCGGGAATCTGCGAACTACTATTGCAACCCCTACGCGTAGTACCTTCAACGCTTAATAGTTCCACAATATTAGCAAAAGTTCCTAATTTGCATGGTTCTAAGCTATGCATCGCCTCTGATAACTCTTTTCGTGTGCGGGATTCCCTCCAC
ACTTCTGGTCCGCTACTTGAGTTGTTAGGGGCCGTTTGAAATCAAGAAATAAAGATGGTGGCAGTCGGATGCCAGTGATGACCCATTTGCGCACTAGGTTCGCGCGCATAATTTTAATACACGGATTCTCGGATTAGGGTGGTAGCAACTACTGCA
GACCTGCTATTTCTCAGCCACCGTCTTGATTATCCTTACGGGACATCGGGCGAAGATAGATGTCCGTGAACGGTACTTGTACGTGACGTTCCACCTAACTAGATCAGACCGGACTATCCATCACAAGCATATATGTGTGAACTATTTCATTATAGA
CGCAGGAGACTCTGTCCTCTCCATTCTGACTCAACTCCGTTTCTGATGATGTGGGATCTAAGTAGGAGTGATAAGGTTTCGGCTCAATACGCTACGTGCCCGGCCATTAATAAGCGTGTGCTGAACAGCCCGTCTAAGGTAGACTTGGTTGGTTAT
ATGTGAGCCCTACTCGCAGTTGTTGGGATGCAGAACGACCCTAAAAACAGAGGTGACCGAGAGCCGGACCTTGGGGTTGTTTCGGGCGTCCATGCGTGCACAAGTCCATGTCAATCTATTTTCGGCGTATGCTCCCCGCAACTGCATTCATTACCG
ACTAGCTAGGAGGTGTGCATCCCCGAGAACATGTAAAACTCATTCGACTCGAAGTGCTTATAGTTCTCGGAACTGGCTGTCATGTCCACGCAATGTATGTGCGCGCTAAGGATATACCCTGTACGGGCACAGGTGCAGGATCATTATTATAGAGTA
ACCAATTGAACCGAACCGAGGCGCGATCCTGTCTTGATGTGGGTTCTAGACCATCGCCACGATTTGATGTTTCTAGGTCAATCCCTGAACAACCCGAGGGCCAAGGGCAAGCTCGTACTAAGATGGCTCGATCCCAGTAACTAAACTAATCACCGT
CCGCATCCCCTTCAGGGTGTGGCCGATCTATACGTGCTAGCAACAGGGCTAGTATTAGAAACCCAAGCCCTTATGTGTGGCCTAAGCATTGGCGCGCGATCTCCACTTATGCCTTGTGAGGACGGGGTAAAATGGACGGGGTATAGCTGGATACAT
CTAGGCACTTGGTCTTGAGTGAGCACCTGATCGAGCTCAATTTATCCATCGCCGACGCCAGATGTTCAACCAATGTTGAGCCACCCCTATGCTCATATTGTGGTCCTAATTTGAGGACTACAGCTACCGCAGTCCGTTAACAGGTGAGTGCAAGAA
CAAGGCACCTCTATAATCACCCATAACATGTGAGTTATCTGTGGTCTAAAGAAATTTTGCCGATTTGACCCTCACTTTCGAATAGAAGATTGTGGCGATCCCAGTATCTCTAGAACGAGCCGTGCAAAACGTTTCCATACAGTGCAGGAGTTGTAC
GCGCCCACTTCATTATTTGAGCGTATTTTTACCCCGGACAGACATGAGTGGGGACATATTGTATGGAATATTCCACTCTTACAGCCCTGCAATTATCCGAAGATGATGATCTGTGTACTAGACATTATCAGGGATTTAGACTGACTTCCCTACCAA
ATATGAGCTCTAGAATTCACAAAGAACCTCCGAACTGGTTGGGTCGCAGTAGGATGATGACTTTCGAAAAGCACAATGGCTCGCTGTCGAATACCTACATGAGAATCTGCATTAGCGCTCCCAGACCGCCCGACACCGGCCCGCTGGACGCGTTAC
GGAATGTCCTACATGTTAGGGTCGCTCTTTACGCTGGTCGGGGAGGTAATGTGTGGACTAACGACGGTAGGAAACTCTTTCCCCTGCCACGGCTCCATGGTACAAATTAATTTCAGTAAGAGTAGTATGCGGCTTAATCAAGTAGTTGTTCAGAGA
CATACGGCGCTAATAACGACCGGTTATAGTCAACGGAGAATCATCACTAAGGAGATACGTCCAGATTTCCACATAGGAGTCTCATCTCGCGATGACATGGCATATCTAAAGCCTCGAGTCGCAGTCCTTTTCATGTGTGTACTAGTCGAACCCCTC
CCGGGCATTTCCTGTCTTCCCCCACCACCGGCAGCATTAGCACTAGCGTAACAGTGACTGTACGAAGCCGGTATTAGAAATAGAAAAGTACTAGCCCTGCAGTGCGGAATCTGTGCCCTATTTCCGCACCCATAGCCATAGAAGGGCTGCAATGAA
CAGGTAGAACTAGTATGTATTACCCGTTTAATGTCCTAAACAGTGACGATCCCGCTGTGAGGTCTCAACATCATCTGGGCACTACCTGTAGAAGTAGGAATGGTCTCCTGGGAGGGGACGGAGCTAAACCTTCCCTTACCGCCCACCACACGATAT
"""
# One DNA string per line of the literal; the `if line` filter drops the
# empty entries produced by its leading and trailing newlines.
dna = [line for line in dna_.split('\n') if line]
# Run the pseudocount variant of the greedy search on the full dataset.
result = pseudo_greedy_motif_search(dna, k, t)

In [86]:
# Print the chosen motifs, one per line.
print('\n'.join(result))

ATTTGAGAGCTA
ATCTGTGTGCTA
ATCTGAGCTCTA
ATTTGAGGTCTA
ATCTGTGCCCTA
ATTTGAGGTCTA
ATGTGAGACCTA
ATATGTGGGCTA
ATATGGGAACTA
ATCTGCGAACTA
ATTTGCGCACTA
ATGTGTGAACTA
ATGTGGGATCTA
ATGTGAGCCCTA
ATGTGCGCGCTA
ATGTGGGTTCTA
ATGTGTGGCCTA
ATTTGAGGACTA
ATCTGTGGTCTA
ATCTGTGTACTA
ATATGAGCTCTA
ATGTGTGGACTA
ATGTGTGTACTA
ATCTGTGCCCTA
ATCTGGGCACTA


---