In [1]:
import pandas as pd
import numpy as np
import random

In [376]:
def gen_profile_matrix(motif_matrix: list, pseudocount=False) -> np.ndarray:
    """
    Build the column-normalised profile matrix of a set of motifs.

    ``None`` entries in ``motif_matrix`` are skipped before anything is counted.

    :param motif_matrix: Sequence of equal-length motif strings (entries may be None)
    :param pseudocount: When true, 1 is added to every count before normalising
    :return: A 4 x len(motif) array; rows are A, C, G, T and each column is
             divided by its column total
    :raises ValueError: If no non-None motif is supplied
    """
    # Filter first, so the motif length and the number of rows both describe
    # the rows that are actually indexed below.
    motifs = [motif for motif in motif_matrix if motif is not None]
    if not motifs:
        raise ValueError("motif_matrix must contain at least one motif")

    nucleo_dict = {'A': 0, 'C': 1,
                   'G': 2, 'T': 3}
    seq_len = len(motifs[0])
    count_arr = np.zeros((4, seq_len))

    for col in range(seq_len):
        for motif in motifs:
            row = nucleo_dict.get(motif[col])
            # Symbols other than A/C/G/T are not counted.
            if row is not None:
                count_arr[row, col] += 1

    if pseudocount:
        count_arr = count_arr + 1
    # Divide every column by its own total.
    return count_arr / count_arr.sum(axis=0)
        
#         for nucleo in nucleo_dict.keys():
#             if pseudocount:
#                 occur = nucleo_col.count(nucleo)+1
#             else:
#                 occur = nucleo_col.count(nucleo)
#             count_arr[nucleo_dict[nucleo], col] = occur
#         count_arr_norm=count_arr/sum(count_arr)
#     return count_arr_norm

def ProfileMostProbableKmer(Text,k,prof):
    """
    Return the k-mer of ``Text`` with the highest probability under ``prof``.

    :param Text: String over the alphabet A, C, G, T
    :param k: Length of the k-mer to look for
    :param prof: Profile indexed as prof[row][column], rows ordered A, C, G, T
    :return: The first k-mer reaching the highest probability; the first k-mer
             of ``Text`` when every k-mer has probability 0; an empty string
             when ``Text`` is shorter than ``k``
    """
    nuc_dic = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    # Start from the first k-mer so an all-zero profile still yields a k-mer,
    # and always return a string (never a list) when no window exists.
    bestkmer = Text[0:k] if len(Text) >= k else ''
    score = 0
    for i in range(len(Text) - k + 1):
        pattern = Text[i:i + k]
        tmpscore = 1
        for j, p in enumerate(pattern):
            tmpscore = prof[nuc_dic[p]][j] * tmpscore
        # Strict comparison keeps the earliest k-mer on ties.
        if tmpscore > score:
            score = tmpscore
            bestkmer = pattern
    return bestkmer

def gen_score(motifs,profile_matrix):
    """
    Total Hamming distance between each motif and the consensus sequence
    derived from ``profile_matrix``.
    """
    reference = gen_consensus_sequence(profile_matrix)
    return sum(HammingDistance(reference, candidate) for candidate in motifs)
def gen_consensus_sequence(profile_matrix: np.ndarray) -> str:
    """
    Build the consensus string of a profile matrix.

    Each column contributes the nucleotide of its largest entry. Columns are
    evaluated independently of each other, and ties go to the first row in
    A, C, G, T order.

    :param profile_matrix: 2-D array-like whose rows are ordered A, C, G, T
    :return: One letter per column of ``profile_matrix``
    """
    row_nucleo_dict = {0: 'A', 1: 'C',
                       2: 'G', 3: 'T'}

    matrix = np.asarray(profile_matrix)
    if matrix.size == 0:
        return str()
    # argmax returns the first maximal row per column, so a column never
    # inherits the letter chosen for the column before it.
    return ''.join(row_nucleo_dict[int(row)] for row in matrix.argmax(axis=0))
def HammingDistance(texta,textb):
    """
    Count the positions at which the two sequences differ.

    Only the overlapping prefix is compared when the lengths differ.
    """
    return sum(1 for left, right in zip(texta, textb) if left != right)

def motif_score(motif,profile):
    """
    Multiply together the profile entries selected by each letter of ``motif``.

    ``profile`` is indexed as profile[row][column] with rows ordered A, C, G, T.
    An empty motif yields 1.
    """
    row_of = dict(zip('ACGT', range(4)))
    product = 1
    column = 0
    for letter in motif:
        product = profile[row_of[letter]][column] * product
        column += 1
    return product

# RandomizedMotifSearch

Since a single run of RandomizedMotifSearch may generate a rather poor set of motifs, bioinformaticians usually run this algorithm thousands of times. Each run begins from a new randomly selected set of k-mers, and the best-scoring set of k-mers found across all runs is kept.

RandomizedMotifSearch(Dna, k, t)
    randomly select k-mers Motifs = (Motif1, …, Motift) in each string from Dna
    BestMotifs ← Motifs
    while forever
        Profile ← Profile(Motifs)
        Motifs ← Motifs(Profile, Dna)
        if Score(Motifs) < Score(BestMotifs)
            BestMotifs ← Motifs
        else
            return BestMotifs

In [244]:
def RandomizedMotifSearch(Dna, k, t):
    """
    One run of randomized motif search.

    Starting from one randomly placed k-mer per string, the motifs are
    repeatedly replaced by the profile-most-probable k-mers of the current
    motifs' profile (built with pseudocounts) until the score stops improving.

    :param Dna: Sequence of DNA strings; the first ``t`` are searched
    :param k: Motif length
    :param t: Number of strings in ``Dna`` to use
    :return: Tuple ``(best_motifs, best_score)`` where ``best_score`` is the
             score of ``best_motifs`` against their own consensus
    """
    # Use the argument rather than a notebook-level global.
    sequences = [Dna[i] for i in range(t)]

    # Pick each start offset from that string's own length.
    motifs = []
    for text in sequences:
        start = random.randint(0, len(text) - k)
        motifs.append(text[start:start + k])

    best_motifs = motifs[:]
    best_score = gen_score(best_motifs,
                           gen_profile_matrix(best_motifs, pseudocount=True))

    # The score is a non-negative integer that must strictly decrease to
    # continue, so this loop terminates.
    while True:
        profile = gen_profile_matrix(motifs, pseudocount=True)
        motifs = [ProfileMostProbableKmer(text, k, profile) for text in sequences]
        # Score the new motifs against their own consensus, not the consensus
        # of the motifs they were derived from.
        score = gen_score(motifs, gen_profile_matrix(motifs, pseudocount=True))
        if score < best_score:
            best_motifs = motifs[:]
            best_score = score
        else:
            return best_motifs, best_score


In [386]:
# Sample input for the randomized search.
DNA = 'CGCCCCTCTCGGGGGTGTTCAGTAAACGGCCA GGGCGAGGTATGTGTAAGTGCCAAGGTGCCAG TAGTACCGAGACCGAAAGAAGTATACAGGCGT TAGATCAAGTTTCAGGTGCACGTCGGTGAACC AATCCACCAGCTCCACGTGCAATGTTGGCCTA'.split(' ')
# DNA=['TTACCTTAAC','GATGTCTGTC','ACGGCGTTAG','CCCTAACGAG','CGTCAGAGGT']
k = 8
t = 5

# Score of the motif set used for comparison with the search result.
motifs0 = ['TCTCGGGG', 'CCAAGGTG', 'TACAGGCG', 'TTCAGGTG', 'TCCACGTG']
profile0 = gen_profile_matrix(motifs0, pseudocount=True)
ans_score = gen_score(motifs0, profile0)
print(ans_score)

# One initial run seeds the running best; further restarts keep the lowest score.
lastmotifs, lastscore = RandomizedMotifSearch(DNA, k, t)

i = 0
while i <= 1000:
    bestmotifs, bestscore = RandomizedMotifSearch(DNA, k, t)
    if lastscore > bestscore:
        lastmotifs, lastscore = bestmotifs, bestscore
    i += 1
print(lastmotifs, lastscore)

9
['AACGGCCA', 'AAGTGCCA', 'TAGTACCG', 'AGGTGCAC', 'ACGTGCAA'] 10


In [248]:
def read_file(filename):
    """
    Read a motif-search dataset.

    The first line holds ``k`` and the number of DNA strings. Every following
    line contributes DNA strings, separated by any whitespace, so both the
    all-on-one-line and the one-string-per-line layouts are accepted.

    :param filename: Path of the dataset file
    :return: Tuple ``(DNA, k, num_of_DNAs)`` with ``DNA`` a list of strings
    :raises ValueError: If the first line does not hold at least two fields
    """
    with open(filename, "r") as dataset:
        lines = [line.strip() for line in dataset]

    header = lines[0].split() if lines else []
    if len(header) < 2:
        raise ValueError("first line must contain k and the number of DNA strings")
    k = int(header[0])
    num_of_DNAs = int(header[1])

    # split() without an argument drops the empty tokens that repeated
    # spaces or blank lines would otherwise produce.
    DNA = ' '.join(lines[1:]).split()
    return DNA, k, num_of_DNAs

# Load k, t and the DNA strings of the exercise dataset.
filename = 'C:/Users/Jofan/Downloads/dataset_161_5 (1).txt'
DNA, k, t = read_file(filename)

# One initial run seeds the running best; further restarts keep the lowest score.
lastmotifs, lastscore = RandomizedMotifSearch(DNA, k, t)

i = 0
while i <= 50:
    bestmotifs, bestscore = RandomizedMotifSearch(DNA, k, t)
    if lastscore > bestscore:
        lastmotifs, lastscore = bestmotifs, bestscore
    i += 1
print(lastmotifs, lastscore)
print(*lastmotifs)

['ATTGATTACCGTGGG', 'TCCACTATCCGTGCT', 'CTCGTGATCCGTGCC', 'AGAGTGATCCGTGCT', 'TCCCATATCCGTGCT', 'TCCGTGACTAGTGCT', 'TGGTTGATCCGTGCT', 'TCCGGAGTCCGTGCT', 'TCCGTGTAGCGTGCT', 'TCCGTGATCCGAATT', 'TCCGTGATCCGTCAA', 'TCCGGTTTCCGTGCT', 'TCCGTGTGGCGTGCT', 'TCCGTACCCCGTGCT', 'TCCGTGATCGCAGCT', 'TCGCGGATCCGTGCT', 'TCCGTGATTGCTGCT', 'TCCGTTGACCGTGCT', 'ACCGTGATCCGTGGG', 'TCCGTGATCCTATCT'] 66
ATTGATTACCGTGGG TCCACTATCCGTGCT CTCGTGATCCGTGCC AGAGTGATCCGTGCT TCCCATATCCGTGCT TCCGTGACTAGTGCT TGGTTGATCCGTGCT TCCGGAGTCCGTGCT TCCGTGTAGCGTGCT TCCGTGATCCGAATT TCCGTGATCCGTCAA TCCGGTTTCCGTGCT TCCGTGTGGCGTGCT TCCGTACCCCGTGCT TCCGTGATCGCAGCT TCGCGGATCCGTGCT TCCGTGATTGCTGCT TCCGTTGACCGTGCT ACCGTGATCCGTGGG TCCGTGATCCTATCT


# Gibbs sampling

GibbsSampler(Dna, k, t, N)
    randomly select k-mers Motifs = (Motif1, …, Motift) in each string from Dna
    BestMotifs ← Motifs
    for j ← 1 to N
        i ← Random(t)
        Profile ← profile matrix constructed from all strings in Motifs except for Motifi
        Motifi ← Profile-randomly generated k-mer in the i-th sequence
        if Score(Motifs) < Score(BestMotifs)
            BestMotifs ← Motifs
    return BestMotifs

In [400]:
def GibbsSampler(Dna, k, t, N):
    """
    One run of the Gibbs sampler for motif finding.

    Starting from one randomly placed k-mer per string, each of the ``N``
    steps hides one motif, builds a pseudocount profile from the remaining
    motifs, and redraws the hidden motif from its string with probability
    proportional to each k-mer's profile score.

    :param Dna: Sequence of DNA strings; the first ``t`` are searched
    :param k: Motif length
    :param t: Number of strings in ``Dna`` to use
    :param N: Number of sampling steps
    :return: Tuple ``(best_motifs, best_score)`` where ``best_score`` is the
             score of ``best_motifs`` against their own consensus
    """
    # Use the argument rather than a notebook-level global.
    sequences = [Dna[i] for i in range(t)]

    # Pick each start offset from that string's own length.
    motifs = []
    for text in sequences:
        start = random.randint(0, len(text) - k)
        motifs.append(text[start:start + k])

    best_motifs = motifs[:]
    best_score = gen_score(best_motifs,
                           gen_profile_matrix(best_motifs, pseudocount=True))

    for _ in range(N):
        row_hide = random.randint(0, t - 1)
        text = sequences[row_hide]

        # Profile of every motif except the hidden one.
        others = motifs[:row_hide] + motifs[row_hide + 1:]
        profile = gen_profile_matrix(others, pseudocount=True)

        # Weight every k-mer of the hidden string by its profile score.
        weights = np.array([motif_score(text[s:s + k], profile)
                            for s in range(len(text) - k + 1)], dtype=float)
        start = np.random.choice(len(weights), p=weights / weights.sum())
        motifs[row_hide] = text[start:start + k]

        # Score the full motif set against its own consensus so that the
        # returned best_score always belongs to best_motifs.
        score = gen_score(motifs, gen_profile_matrix(motifs, pseudocount=True))
        if score < best_score:
            best_motifs = motifs[:]
            best_score = score
    return best_motifs, best_score


In [401]:
# Sample input for the Gibbs sampler.
DNA = 'CGCCCCTCTCGGGGGTGTTCAGTAACCGGCCA GGGCGAGGTATGTGTAAGTGCCAAGGTGCCAG TAGTACCGAGACCGAAAGAAGTATACAGGCGT TAGATCAAGTTTCAGGTGCACGTCGGTGAACC AATCCACCAGCTCCACGTGCAATGTTGGCCTA'.split(' ')
k = 8
t = 5
N = 10

# Score of the motif set used for comparison with the sampler result.
motifs0 = ['TCTCGGGG', 'CCAAGGTG', 'TACAGGCG', 'TTCAGGTG', 'TCCACGTG']
profile0 = gen_profile_matrix(motifs0, pseudocount=True)
ans_score = gen_score(motifs0, profile0)
print(ans_score)

# One initial run seeds the running best; further restarts keep the lowest score.
lastmotifs, lastscore = GibbsSampler(DNA, k, t, N)

i = 0
while i <= 19:
    bestmotifs, bestscore = GibbsSampler(DNA, k, t, N)
    if lastscore > bestscore:
        lastmotifs, lastscore = bestmotifs, bestscore
    i += 1
print(lastmotifs, lastscore)

9
['TCTCGGGG', 'CCAAGGTG', 'TACAGGCG', 'TTCAGGTG', 'TCCACGTG'] 9


In [408]:
def read_file(filename):
    """
    Read a Gibbs-sampler dataset.

    The first line holds ``k``, the number of DNA strings and ``N``. Every
    following line contributes DNA strings, separated by any whitespace, so
    both the all-on-one-line and the one-string-per-line layouts are accepted.

    :param filename: Path of the dataset file
    :return: Tuple ``(DNA, k, num_of_DNAs, N)`` with ``DNA`` a list of strings
    :raises ValueError: If the first line does not hold at least three fields
    """
    with open(filename, "r") as dataset:
        lines = [line.strip() for line in dataset]

    header = lines[0].split() if lines else []
    if len(header) < 3:
        raise ValueError("first line must contain k, the number of DNA strings and N")
    k = int(header[0])
    num_of_DNAs = int(header[1])
    N = int(header[2])

    # split() without an argument drops the empty tokens that repeated
    # spaces or blank lines would otherwise produce.
    DNA = ' '.join(lines[1:]).split()
    return DNA, k, num_of_DNAs, N

# Load k, t, N and the DNA strings of the exercise dataset.
filename = 'C:/Users/Jofan/Downloads/dataset_163_4 (2).txt'
DNA, k, t, N = read_file(filename)
# N=N/10

# One initial run seeds the running best; further restarts keep the lowest score.
lastmotifs, lastscore = GibbsSampler(DNA, k, t, N)

i = 0
while i <= 19:
    bestmotifs, bestscore = GibbsSampler(DNA, k, t, N)
    if lastscore > bestscore:
        lastmotifs, lastscore = bestmotifs, bestscore
    i += 1
print(lastmotifs, lastscore)
print(*lastmotifs)

['AAGTGGACTGTTTAG', 'ATGACGACTTCTCCG', 'GTGGGTACTTCTCGC', 'ATGGCACCTTCTCCG', 'ATGGGTACTTCAATG', 'ATGGGTACGAATCCG', 'ATGGGTACTCAACCG', 'ATGGGTTGGTCTCCG', 'ATGGGGGGTTCTCCG', 'GGGGGTACTTCTCCT', 'ATGGGTTGCTCTCCG', 'ATGGGTACTTCTGAA', 'AACCGTACTTCTCCG', 'ATGGGTAGGGCTCCG', 'ATCCCTACTTCTCCG', 'ATGGTAGCTTCTCCG', 'ATGGGTACTTTCACG', 'ATGTTGACTTCTCCG', 'ATGGGCGGTTCTCCG', 'GATGGTACTTCTCCG'] 64
AAGTGGACTGTTTAG ATGACGACTTCTCCG GTGGGTACTTCTCGC ATGGCACCTTCTCCG ATGGGTACTTCAATG ATGGGTACGAATCCG ATGGGTACTCAACCG ATGGGTTGGTCTCCG ATGGGGGGTTCTCCG GGGGGTACTTCTCCT ATGGGTTGCTCTCCG ATGGGTACTTCTGAA AACCGTACTTCTCCG ATGGGTAGGGCTCCG ATCCCTACTTCTCCG ATGGTAGCTTCTCCG ATGGGTACTTTCACG ATGTTGACTTCTCCG ATGGGCGGTTCTCCG GATGGTACTTCTCCG
