https://stepik.org/lesson/240241/step/4?unit=214000


https://github.com/ivanov-v-v/rosalind-mipt-2019/blob/master/06%20%E2%80%94%20ba2d/main.py

In [1]:
import numpy as np
import pandas as pd
from collections import Counter
from scipy import stats

In [2]:
# Nucleotide alphabet and the row index each letter occupies in a profile matrix.
alphabet = [*'ACGT']
letter_to_id = dict(zip(alphabet, range(len(alphabet))))

In [3]:
def most_probable_kmer(text, k, profile_matrix):
    """
    Scan every window of length k in text and keep the first window whose
    probability under profile_matrix is strictly the highest.

    input:
        text: string to scan; its characters must be keys of letter_to_id
        k: window length
        profile_matrix: matrix indexed as [letter id][position]

    output:
        most probable kmer in text, as a np.array of single characters
    """
    top_kmer = None
    top_score = -np.inf

    window_count = len(text) - k + 1
    for start in range(window_count):
        window = text[start:start + k]
        rows = [letter_to_id[symbol] for symbol in window]

        # For position p, np.choose picks profile_matrix[rows[p]][p];
        # the product of those entries is the window's probability.
        score = np.choose(rows, profile_matrix).prod()

        # Strict comparison: on ties the earliest window is kept.
        if not score > top_score:
            continue
        top_kmer, top_score = window, score

    return np.array(list(top_kmer))

In [4]:
def get_profile_with_pseudocounts(motifs):
    """
    Build a profile matrix with Laplace pseudocounts from a motif matrix.

    input:
        motifs: np.array of single characters, one motif per row
                (number of motifs x motif length)

    output:
        profile matrix: float np.array with one row per nucleotide in the
        order A, C, G, T and one column per motif position
    """
    motifs = np.asarray(motifs)
    nucleotides = 'ACGT'

    # Count every nucleotide per column directly with NumPy. This avoids
    # wrapping Counter objects in an object array and round-tripping through
    # a pandas DataFrame just to obtain a 4 x k table of counts.
    count_motifs = np.array(
        [(motifs == nucleotide).sum(axis=0) for nucleotide in nucleotides],
        dtype=float,
    )

    # Laplace's Rule of Succession
    count_motifs = count_motifs + 1

    # One pseudocount was added per letter of the alphabet, so every column
    # now sums to (number of motifs + alphabet size).
    profile = count_motifs / (len(motifs) + len(nucleotides))

    return profile

In [5]:
# Alternative four-motif sample, kept commented out for manual experiments:
# motifs = [
#     "TAAC",
#     "GTCT",
#     "ACTA",
#     "AGGT",
# ]
# Single-motif sample that is actually assigned.
motifs = ['ACCT']

In [6]:
# Split every motif string into a row of single characters.
dna_arr = np.array(list(map(list, motifs)))
dna_arr

array([['A', 'C', 'C', 'T']], dtype='<U1')

In [7]:
# Build one Counter per column of dna_arr (axis 0 runs down the rows).
counter = np.apply_along_axis(Counter, 0, dna_arr)
# Turn each Counter into a plain dict so pandas can tabulate them.
counter = [dict(c) for c in counter]

In [8]:
# Tabulate the per-position counts with one column per nucleotide, fill
# letters that never occur with 0, then transpose so rows follow A, C, G, T.
count_motifs = pd.DataFrame(counter, columns=list('ACGT')).fillna(0).T.to_numpy()

In [9]:
# Add one pseudocount to every cell (Laplace's Rule of Succession).
count_motifs = count_motifs + 1
count_motifs

array([[2., 1., 1., 1.],
       [1., 2., 2., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 2.]])

In [10]:
# Pseudocount-smoothed profile of the sample motif matrix.
profile = get_profile_with_pseudocounts(dna_arr)
profile

array([[0.4, 0.2, 0.2, 0.2],
       [0.2, 0.4, 0.4, 0.2],
       [0.2, 0.2, 0.2, 0.2],
       [0.2, 0.2, 0.2, 0.4]])

In [11]:
# Upper-case the sample so every character is a key of letter_to_id.
text = "gATGTctgtc".upper()
text

'GATGTCTGTC'

In [12]:
# Most probable 4-mer of text under the profile.
most_probable_kmer(text, 4, profile)

array(['A', 'T', 'G', 'T'], dtype='<U1')

In [13]:
def get_scores(motifs, t):
    """
    Score a motif matrix: for every column, count the rows that differ from
    the column's most frequent letter, and add those counts up.

    input:
        motifs: np.array of single characters, one motif per row
        t: number of motifs (rows) the score is measured against

    output:
        score of given motifs matrix (lower means more conserved)
    """
    motifs = np.asarray(motifs)

    score = 0
    # Iterate over columns. np.unique handles character data, whereas
    # scipy.stats.mode rejects non-numeric arrays in newer SciPy releases.
    for column in motifs.T:
        _, letter_counts = np.unique(column, return_counts=True)
        score += t - letter_counts.max()

    return int(score)

In [14]:
def greedy_motif_search_with_pseudocounts(dna, k, t):
    """
    Greedy motif search using profiles with pseudocounts.

    For every k-mer of the first string, grow a motif set one string at a
    time: build a profile from the motifs chosen so far and take the most
    probable k-mer of the next string. The lowest-scoring set wins.

    input:
        dna: list of DNA strings
        k: motif length
        t: number of strings in dna to draw motifs from

    output:
        list of t motifs (strings), one per DNA string
    """
    dna_arr = np.array([list(text) for text in dna])

    # Baseline candidate: the first k-mer of every string.
    best_motifs = dna_arr[:, :k]
    best_score = get_scores(best_motifs, t)

    for i in range(dna_arr.shape[1] - k + 1):
        # Seed the candidate set with the i-th k-mer of the first string,
        # kept 2-D (1 x k) so further rows can be stacked below it.
        motifs_list = dna_arr[0, i:i + k][np.newaxis]

        for j in range(1, t):
            profile = get_profile_with_pseudocounts(motifs_list)
            most_proba_kmer = most_probable_kmer(dna[j], k, profile)

            # np.vstack replaces the deprecated alias np.row_stack.
            motifs_list = np.vstack([motifs_list, most_proba_kmer])

        score_motifs_list = get_scores(motifs_list, t)

        # Strict comparison: on ties the earlier candidate set is kept.
        if score_motifs_list < best_score:
            best_score = score_motifs_list
            best_motifs = motifs_list.copy()

    best_motifs = [''.join(motif_arr.tolist()) for motif_arr in best_motifs]

    return best_motifs

In [15]:
# k: motif length, t: number of DNA strings in the sample.
k, t = 3, 5

In [16]:
# Sample input: five DNA strings of equal length (12 characters each).
dna = [
    "GGCGTTCAGGCA", 
    "AAGAATCAGTCA", 
    "CAAGGAGTTCGC", 
    "CACGTCAATCAC", 
    "CAATAATATTCG"
]

In [17]:
# Run the greedy search on the sample input.
greedy_motif_search_with_pseudocounts(dna, k, t)

['TTC', 'ATC', 'TTC', 'ATC', 'TTC']

In [18]:
# Character matrix of the sample input: one row per DNA string.
dna_arr = np.array(list(map(list, dna)))

In [20]:
# (number of strings, string length)
dna_arr.shape

(5, 12)

In [21]:
# Draw t random k-mer start positions in [0, 12 - k], 12 being the string
# length. np.random.random_integers is deprecated; np.random.randint excludes
# its upper bound, hence the + 1 to keep the same inclusive range.
np.random.randint(0, 12 - k + 1, t)

  """Entry point for launching an IPython kernel.


array([0, 2, 3, 3, 8])

In [79]:
def main():
    """
    Read a Rosalind BA2E dataset from 'rosalind_ba2e.txt' and print the
    motifs found by the greedy search, one per line.

    The first line of the file holds the integers k and t; every following
    non-blank line is one DNA string.
    """
    # The context manager closes the file even if parsing raises.
    with open('rosalind_ba2e.txt', 'r') as file:
        k, t = map(int, next(file).split())

        # Skip blank lines (e.g. a trailing newline at the end of the file)
        # so they do not end up as empty DNA strings.
        dna = [line.strip() for line in file if line.strip()]

    print("\n".join(greedy_motif_search_with_pseudocounts(dna, k, t)))

In [80]:
# Run main() only when this module is executed as the entry point.
if __name__ == "__main__":
    main()

CAGAGGGTCTCA
AAGACGCTCGCA
TAGAAGATCACA
TAGACGATCCCA
TAGATGGTCGCA
AAGACGTTCCCA
GAGAGGATCGCA
CAGAGGTTCACA
CAGAAGCTCACA
AAGAAGGTCTCA
TAGACGCTCGCA
GAGACGGTCGCA
GAGAGGGTCTCA
CAGACGCTCACA
CAGACGGTCCCA
GAGACGATCTCA
TAGATGATCGCA
GAGAAGCTCTCA
GAGAGGGTCGCA
TAGATGGTCACA
AAGATGATCTCA
GAGAGGGTCCCA
TAGATGTTCCCA
GAGAAGATCCCA
TAGAAGCTCCCA
