In [23]:
def profile_dict(motifs):
    if type(motifs) != list:
        motifs = motifs.split()
    nt_dict = {}
    for i in range(0, len(motifs[0])):
        for motif in motifs:
            if i not in nt_dict:
                nt_dict[i] = {"A":1, "C":1, "G":1, "T":1}
            nt_dict[i][motif[i]] += 1
    # nt_dict is a dict with a form of {i: {"A":count_A ..., "T":count_T}}, in which i is the position of nucleotide in a motif
    return nt_dict

def profile_string(motifs):
    if type(motifs) != list:
        motifs = motifs.split()
    nt_dict = {}
    for i in range(0, len(motifs[0])):
        for motif in motifs:
            if i not in nt_dict:
                nt_dict[i] = {"A":1, "C":1, "G":1, "T":1}
            nt_dict[i][motif[i]] += 1
    # nt_dict is a dict with a form of {i: {"A":count_A ..., "T":count_T}}, in which i is the position of nucleotide in a motif
    ntlist = ["A", "C", "G", "T"]
    probability = ""
    for nucleotide in ntlist:
        for i in range(0, len(nt_dict)):
            probability = probability + " " + str(nt_dict[i][nucleotide]/sum(nt_dict[i].values()))
    prob = probability[1:]
    return prob

def profile_most_probable(Text, k, prob):
    # The input matrix must be a string composed of numbers split by spaces only, e.g., "0.2 0.2 0.3 0.2 0.3 0.4 0.3 0.1 0.5 0.1 0.3 0.3 0.5 0.2 0.4 ... 0.1 0.2"
    list_prob = [x for x in prob.split()]
    list_prob = [list_prob[i:i+k] for i in range(0, len(list_prob), k)]
    matrix = {"A":list_prob[0], "C":list_prob[1], "G":list_prob[2], "T":list_prob[3]} # Stored in the form of a dict
        
    list_kmer = [Text[i:i+k] for i in range(0, len(Text)-k+1)]
    most_probable = ""
    max_prob = -99
    for kmer in list_kmer:
        prob = 1
        for i in range(0, k):
            nt = kmer[i]
            prob_nt = eval(matrix[nt][i])
            prob *= prob_nt
        if prob > max_prob:
            max_prob = prob
            most_probable = kmer
        
    return most_probable

def score(motifs):
    nt_dict = profile_dict(motifs)
    final_score = 0
    for i in range(0, len(nt_dict)):
        max_count = max(nt_dict[i].values())
        final_score += sum(nt_dict[i].values()) - max_count
    return final_score

In [24]:
import numpy as np

def RandomizedMotifSearch(Dna, k, t):
    if type(Dna) != list:
        Dna = Dna.split()
    initial_motifs = [i[j:j+k] for i in Dna for j in np.random.randint(0, len(i)-k+1, 1)]
    best_motifs = initial_motifs
    
    while True:
        motifs = [profile_most_probable(i, k, profile_string(best_motifs)) for i in Dna]
        if score(motifs) < score(best_motifs):
            best_motifs = motifs
        else:
            return best_motifs

def RandomizedMotifSearch_iter(Dna, k, t):
    if type(Dna) != list:
        Dna = Dna.split()
    best_result = [i[j:j+k] for i in Dna for j in np.random.randint(0, len(i)-k+1, 1)]
    for i in range(0, 1000):
        motifs = RandomizedMotifSearch(Dna, k)
        if score(motifs) < score(best_result):
            best_result = motifs
    for i in best_result:
        print(i, end=" ")
    return best_result

In [95]:
import numpy as np
import random as rdm

def Random(Motifs, Text):
    if type(Motifs) != list:
        Motifs = Motifs.split()
    Profile = profile_dict(Motifs)
    motif_len = len(Motifs[0])
    motif_list = [Text[j:j+motif_len] for j in range(0, len(Text)-motif_len+1)]
    
    scorelist = {}
    for motif in motif_list:
        score = 1
        for i in range(0, motif_len):
            score *= Profile[i][motif[i]]
        scorelist[motif] = score
    
    choicelist = [i for i in scorelist.keys()]
    probablist = [j for j in scorelist.values()]
    random_motif = rdm.choices(choicelist, probablist)

    return random_motif

In [123]:
def GibbsSampler(Dna, k, t, N):
    if type(Dna) != list:
        Dna = Dna.split()
    random_motif = [i[j:j+k] for i in Dna for j in np.random.randint(0, len(Dna[0])-k+1, 1)]
    BestMotifs = random_motif

    for j in range(0, N):
        i = np.random.randint(0, t)
        Motifs_without_i = BestMotifs[:i] + BestMotifs[i+1:]
        Motif_i = Random(Motifs_without_i, Dna[i])
        Motifs = Motifs_without_i[:i] + Motif_i + Motifs_without_i[i:]
        if score(Motifs) < score(BestMotifs):
            BestMotifs = Motifs

    return BestMotifs

bestscore = 999
bestmotif = []
for i in range(0, 20):
    tempmotif = GibbsSampler("ATGATAACAGATTAGTAATTGGATTCAGTCAAGGCATTGGAGCGCTCACTGGAACGCGGGAGTGTCCCCGTCTCGTGCCAGTGGTCGGATAGCACCATCGTATCGTATCATTGAGGCACAACCCAAGGGAACTCAGGGTCAGTGTCTCAAAGAACAGTGTGCTTTCCGTTTATTCTTCAGCCCGCAGGCTGGTTGTAGTTTCGGTCGATTATGGCACCGGAGTGCACCTGAGACTCACATCAAAGGGCGCTACATGCACGGATGGTTGTTCCCTCCATCCAGAAGAACCGCAACTGAAATCTTGGATGATAACAGATTAG TAATTGGATTCAGTCAAGGCATTGGAGCGCTCACTGGAACGCGGGAGTGTCCCCGTCTCGTGCCAGTGGTCGGATAGCACCATCGTATCGTATCATTGAGGCACAACCCAAGGGAACTCAGGGTCAGTGTCTCAAAGAACAGTGTGCTTTCCGTTTATTCTTCAGCCCGCAGGCTGGTTGTAGTTTCGGTCGATTATGGCACCGGAGTTTGAAGAACAACCGTGCACCTGAGACTCACATCAAAGGGCGCTACATGCACGGATGGTTGTTCCCTCCATCCAGAAGAACCGCAACTGAAATCTTGGATGATAACAGATTAG TGACCCCCCACAGTCCACGGGGAGGAACTTAATAGAACAATTCCCATATTACCCAGAACAATCATGCTACTGCGAAGAAACGGTAGTTCCGCGTTTCCCGGGGAGTGGACGGATGATTTCGAGCCTGTTAATCCGAACGTCGAGCATTTCATCTCAGGACTTAGAGAGCTGGGCTTGGGGAGAACATGTGGAGTGCCTCTCAGGTGATATGCAATGTCGCCTTCGGAGCTCACAGCTCATTTGAGATCATCGTTTGAATCCTAACGAAGGTCACGATCGTCGCCAGGGTCGGTGAGTTTGTGCTGAACGTTCTCCGCGCG GCTCTGGCGATCTGCTACCTAAGTACCCAGAGGACTTACAACTCAGTACAACGGCGCCTCTCCCGTCAATCGCAACGATTTTGTGGACGCTCGGCCATCACTCATTCTGATTTCCACACACCTGCGCTTTCTGTATTACTTCCTTATAAGGTCGATTACCCCGCAACCTAAGTTTTCGGGGATTCGGATCTATCTGGTTAATAGAGGGACCTCACGATATTGCCCTCACACGGTTGGTGTAGGACAATAGACTGACACATACAAATGCGATAACGAAATTTCTGCGCACTGGTTCAGTCGAGCTCAGAAAGTACCCTTAC GCCCAGCACTCAATTTGCTGTCAATCATAACACTATTCCGCCTTCCGCTAAGCAGTCTAGTACTTACGACCCCTGGGTAGCCGACCACAAGCCGTATTTTGGACAGACCGCTATGTCATGGCGCTCTAGTGGTTCCCCGGGTTTTAAAGTTATCATCTGTGGTCAGGATCATAAATATATGCTCAGCGAGGCCGTGTCCGCATCATCTGCGCACTGCCAGAATATAGAACAACCCGACAATACAACTGCCAGATTAACAACAGTCATAATACAGAATATAATGACGTCAGGAATCTTGCCACGCAACGATGAAATAGTCG ACCTGATAAATGGGTTGAAGCATTTTAAGCCTCCCCTTCCCGAAACCCGACATAGGGTCTCGCAATACGACATGGGGTTAAATTTCCAGCGTTCAGGAGTCGGAGAAGCATCCGATGACTACACAGGTCTGCCCCCATGACAGTCTGTAGAACAACCACGTTACTCCGTACCCTAGGGGCTATTGACGCCGTGAATAAAATGCTAACATCTCTAGTCCAGGGGGTGGCGCACGACGATTTTGACATATTAGTTTTTTAAACGTCAGCGGTCGGCCTCGAGAAGTGCCCGAGAAGCCTAGATCGTCACAGAACCTTGAATG CGATGGACGGCCTGAAACCCGATTGCTACTGTATAATAGCGGGATGTTATGCACATTCAGATCGGTTTACACGGAAAAATAGAAGCGTGGTCAATTTAGGCGTTAATAGAACAGTTACATCGCGTTTATTTCGCCCAAGATCCCCCGAAAGGCGCTGACTTAAGTAAAGCATTGAAGCAACGCGTCTATATAGAATTCACAGTAATTGCCACGGGAATAGCGATGCCACAAGACCTTTCGAGTGTGGCGCGAAGAGTATCTGCTCAGCATGACCGGGGCATACAGCCGTTGGTGGAATCGTAGCTTCATTAGACTCGAGC TCCTGTCTCTCTGACTGCGAGTCACCTACCGGGCTTCAGGGGGGCCAATTGCAGTGGCATTACATCTCAAATGACCTGTGCCATCAACTCAAATTTACTCACTCTGCTAGATAGGTTCATCGGCAGGCGCCGCTGTGGAGCTCAAGCATCAATATTCTATTCCACTAAGATCGATGTGTTAATATGCCAACCAGTACCATTATAGTAGACATTACCATGAGCGTTTGTTGAGTTATCAGACATGTCCGTATATACAAGACCTGAACGTTTCGCAGGCGAACCTGTTACCTGAAACATCCAGTATGTAGTCAGCTACTCAT CGTCGACCACCCGGAGTCCCTTACATGTAGGGGACAGCTAGCACGGGAATAGCAGGGTATTGTAAGAGGCGTTCACTAACCCGACTAGCTACTACGCGTATTTGGCCTAGGTCACAGGAGGACGATTAGGCTTAAAAGTATGCCTGCATGTATCAGCCAGGGTGTTCGGTGTTAATGCCACAACCTATCGCTCTTAGGTTCATATTAGGCATTTAGCGCGTTATCTCGCACACCGCTCTACACGTTCAGATTGGTATCCTCATTGTCCGGAACTCTGATGGCGAACCGCCTTTCAATTTTGTCGACCTCGTACGCATAGA CAGGTATCTGGTAGTCTGATTCTCGAGGTACGTAAACTATACCCCCTTCGTTTTCAAAAGCGTGGTGGGCCAGTGCCGAGCCCAATAGAACAACCTCGCCTGTTCGTGGACAGATTGGTAAAAAATTATTAGGAATAGATTAATGGTGTGAATCACGACGTAATACTCAACGAATGAAGGTACAGGGTCGATGCTAGGGCCAGCTTCGGTGTGGAGGATACTTTATGCAATCAAGCGAGTTATACCGTCTCCATGGCGAATTCCATGTTGTACCGTGAGTGCAGGATCGGCAGGTTGAAGTCGCACCACATTCATATTGA ACCGCCCTCTCTAACTCAGGGGTCACAAACGTGTCTTTCACAATTTCGGCCAACATGGATATGCGATCAGGGGGTGATAAGTCAAAATTTAAGCCCTAGACGACATGACTGTTTATACAGCTGCCAATAGGTTCTCAGAACAACCGTCAATGGAATAACCTACGGGGTATGTGTTAATTCCATCCTTATGCATAAAACAAGGCCGGCCATAATTCCCGGGGAGATATCAGTAGATTCTGGACGAGTCTTCGTACTGGAGTTGCATATAAGGTGCAAGGACCGATTCGTATTCATCCGTCACATCATCAATCGAAACTGCC ATTTGACTATTACAAGCATAATTTCTTTTTCTCAGTACGTGATATGTACGGACTATTCACTTCGAGGTCATTAAGCCGTTTGGGCTTATAATCATTGGACACGTCCCACTCAGTTAATAGCCGAACCGCCAGCTGTAGCGCATATCGTGTAAAAAAGGGGTAAGAACAACACTCCACGTAACTTCCGTTAGTCTTTAAGCTACAAGTCAGCACGTCACTCAAGGGACGCACGCTCGCGCATATCCCGTGTCGCTGTTGCCAGTGAATATTACTTTATTTGGAGTGGCCCGGCTGGCGGTACGAATTGTCCTCTCCACCTT TTTAGAACGCAGACGAGGCAGGTCCGCCAACATTTTGTAAGCGTAGCGGCCTTTAGCGAGTTAAACCAACAACCACAAGAACGCTCATATGCGTCTGCAATTGATCATGGAGGTCCCGGCTGGTTTGCGGTAAGTGAAGTTTCGAAGTCTTACATCCAGCCGTCGAGGGTTGAGAGCGACTAATTACGGTTTACCGAGTCGCACGCCCGTGTAATATAGATCCCATCAAGCACCTCTTCTATGGATGGTCTTATTGCTGGAGTAGGGCCCTGGACCGCACGAACATCGTGGGACGCCCCGGAATACAGAAACTCGTCTGC ACATATTCGAGGTGCGTAACGATATCGAATGCCAGGTCCCGGGGTTACCCCAAAGCGCTAATGCTAGCTTCGGTGCAGCAAGAGATCCTATTGTTGCAAGAGTTTGCAGTGGTGGCGCCTCTGGGCAAAAACACGACATTTGGTCTCGGCGATGCCTCATCTGGTTTTATCGATCGGATACACCCCCCGACAATCACTACGAATCACTATTTGTGTCGGCGGGGGCAGGGGTGTTCTATGAAAGGGCCCTTTTATTATAATAGAACAACTTGTTTACAGGTTCGTAACTGTCGACCTATTCCGGGCTCGAAAACGTGTGT GTGCCTCATCCAGATTCGCAGTCAGTACGGCGCCACTGGGTTCTAGTGATATGAAGCACTCTCTCCTCTGGTTAAGGTAACAACCATATTCCCGTGCGTTTCTAGAGAAAGCTCTCAGGCGTATAATGCGAGGCCGGATGTTGTATGGGTTGTCCAAGATGTTCGCTCCCGAGGTCCAGTACCCACCGTATCTTGAAAGTCAGCAACTTATTCTACAAGTGTATGGGGGCTCGGGGAAGTTTACGTAGCCAGTTCACCTTCTTTTCATCAGTAAGTCGGCACCGTCTGTAGTGGAGAGGAGAACACTGACCCACCCGCGA TGTCTAGGTGGATGGAATTCTTCATTCCGAGGCAACCAACTTCTGTGGAAGGATTCCAGCTCAAAAGTTACGGGAACAACCCAAGACCTGTTACCTTTCCCTGAGATTCACCCACTGCATAAAAGCGATTTTGAAGCTGGGTCGTACGCAGCAGCTAACTTCAGGCTATACAATGCAAAGTTCCGGTCGGGGGAGCGGGTCCGTAGGTTTTACCCTTGACCGAAACCCTCTTCTCAATAGTAGCTAGACTGAATTAGGTTTTACGTCTACACTTCTGAGTTCTCGGTAGACTAACCAGTACGGAAAGATTTAGACCCATG ATCTACAGGAGAATTAACCTCGAGACGGCGCATTCTAGGAGAAGGGTACCGACGAATACGTGAGAGACAGTAAGCTAGTCCATTCCAGATTTCCTGTGCTAGATGAGCAAGTGCCTATCGCCCGGTCGTTGGGTTCGATCATCTTCAAAGACCCAGTCGGATAGTGCGCCCAGTAAGTGAAATCGAGGTTGCTGATAAAAGGGTTCTCCTGAGTCCTCGCTGTTAGCGGAACAACCGGTGTCACCTTGCTCCAACTCTCATAAATATCAATAACTTGGCAAGACTGCACAACCCCCCCCTATGTATGGTCTTCTGGTTAG TACCCATTTTAAACCTGAGCTGACAATTATAGGTCTTGTTTGACAATGCTCAGCATGGGATCAAAGGATATACAAACTTCCGGTGGACGGTGCGCTGTTAGGGGGTCAACACAAAGCTTGTTGGAAGTTGTCGTGAATCCCTAGTGACAACTCCTGTGGAAACTTTTGTGCACGCCACTCAGAGGACATCATCAGATGGTTAATAGAACTCGCCGAGTCAGTCCTACGTGCATCCGATTCCCTTACCATCAGTACAGAAGTAGCGCAGTCGCCTAAATGCCCTATCCTACGCACCTGATTAACTCGTGTCTATCTACTGG CATCCGTAGCCCTAAGTCCCGGCGGAATGATTTGAACAGTTGTTGCGTTCGCGTCGTGGTCATCCACACTTAGAATTCGAACCTCGCCATGATGTTGACACACAGGGAGGAATTCCACCCGCTGTTGCTAAATACCTCCAAACGTAACTCTTGTAGTTCCAATGTTAATGTAAAGGAACCGCAGATTAAGGTCCTTGAACTAAAACGCCTTACGCCCGTTGTATCCGGCGGAACGCATTTGTCTTCTTTAGCCGCTGCATGACTCCCATCCTTTTGTTAATTTGACAACCAGCTTTGTCCATTCCTTACAGACCCTCTAT TAGGTCGTGGGAGACTCCGGTGCCATCTTAATTTTAGACCCAAGGTACCGAAACTTGGTTTTATCTTATCCCGCGCCAGTTTCTGGACTGGGAGCCTTCCCAAGCTGTAAATTTTTCACCATTCTGGTTGGGTCCATTACTGGTGGCCGTTGGACAACATACCTGCCAATACGTCACTCGAGCGTGGGCAGGCATTACTGAGGGGATAAGCTAGTTAATAGAAACCCCCTTTGGAGCCGATTTCCATCGGGGCCGCGATTAGCCCCGTTAACCCCAAGGGCCAAGAATATCCGAAGAGCGCCCCATTTACCAATGAGCCA", 15, 20, 2000)
    if score(tempmotif) < bestscore:
        bestmotif = tempmotif
        bestscore = score(tempmotif)
print(score(bestmotif))

for i in bestmotif:
    print(i, end=" ")

108
GTTTATTCTTCAGCC GTTTGAAGAACAACC CTTAATAGAACAATT GTTAATAGAGGGACC GAATATAGAACAACC GTCTGTAGAACAACC GTTAATAGAACAGTT GTTAATATGCCAACC GTTAATGCCACAACC CCCAATAGAACAACC GTCAATGGAATAACC GTTAATAGCCGAACC GTTAAACCAACAACC TATAATAGAACAACT GTTAAGGTAACAACC GTTACGGGAACAACC GTTAGCGGAACAACC GTTAATAGAACTCGC GTTAATTTGACAACC GTTAATAGAAACCCC 