https://stepik.org/lesson/240241/step/4?unit=214000


https://github.com/ivanov-v-v/rosalind-mipt-2019/blob/master/06%20%E2%80%94%20ba2d/main.py

In [1]:
import numpy as np
import pandas as pd
from collections import Counter
from scipy import stats

In [2]:
# Nucleotide alphabet; a letter's position here is the row it selects in a profile matrix.
alphabet = ['A', 'C', 'G', 'T']
letter_to_id = dict(zip(alphabet, range(len(alphabet))))

In [3]:
def most_probable_kmer(text, k, profile_matrix):
    """
    input:
        text: string over the letters known to letter_to_id
        k: int, length of the k-mers to examine
        profile_matrix: one row per letter (row order given by letter_to_id)
            and k columns, one per motif position

    output:
        most probable kmer in text for given profile matrix, returned as a
        np.array of single characters; among equally probable k-mers the
        leftmost one wins

    raises:
        ValueError: if text is shorter than k and therefore has no k-mer
    """

    if len(text) < k:
        raise ValueError(
            "text of length {} contains no k-mer of length {}".format(len(text), k)
        )

    best_proba = -np.inf
    most_proba_kmer = None

    for start in range(len(text) - k + 1):
        kmer = text[start:start + k]
        indexes = [letter_to_id[char] for char in kmer]

        # For position j np.choose takes profile_matrix[indexes[j]][j]; the
        # product of those entries is the probability of this k-mer.
        string_proba = np.choose(indexes, profile_matrix).prod()

        # Strict '>' keeps the first k-mer when several share the best value.
        if string_proba > best_proba:
            best_proba = string_proba
            most_proba_kmer = kmer

    return np.array(list(most_proba_kmer))

In [4]:
def get_profile_with_pseudocounts(motifs):
    """
    input:
        motifs: np.array of single characters, one row per motif and one
            column per motif position

    output:
        profile matrix as a float np.array with 4 rows (A, C, G, T, in that
        order) and one column per motif position
    """

    motifs = np.asarray(motifs)

    # Count every letter per column directly; the boolean comparison is
    # summed down the rows, giving one count per motif position.
    count_motifs = np.array([(motifs == letter).sum(axis=0) for letter in 'ACGT'])

    # Laplace's Rule of Succession
    count_motifs = count_motifs + 1

    # Plus 4, since for each letter in the alphabet (ACGT) we add 1
    profile = count_motifs / (len(motifs) + 4)

    return profile

In [5]:
def get_scores(motifs, t):
    """
    input:
        motifs: np.array of single characters, one row per motif and one
            column per motif position
        t: int, the value each column's most-frequent-symbol count is
            subtracted from (the callers pass the number of motifs)

    output:
        score of given motifs matrix: the sum over columns of
        t - (count of the most frequent symbol in that column)
    """

    motifs = np.asarray(motifs)

    # scipy.stats.mode rejects non-numeric arrays in current SciPy releases,
    # so the per-column mode count is taken from np.unique instead.
    score = 0
    for column in motifs.T:
        _, counts = np.unique(column, return_counts=True)
        score += t - counts.max()

    return score

In [6]:
def get_motifs_profile_dna(profile, dna, k):
    """
    input:
        profile: profile matrix, in the form most_probable_kmer accepts
        dna: iterable of strings
        k: int, motif length

    output:
        np.array with one row per string in dna; each row holds the
        characters of that string's most probable k-mer under profile
    """

    # np.vstack is used because np.row_stack is a deprecated alias of it
    # as of NumPy 2.0.
    return np.vstack([most_probable_kmer(text, k, profile) for text in dna])

In [7]:
# Five lowercase sample strings for the worked example.
dna = "ttaccttaac gatgtctgtc acggcgttag ccctaacgag cgtcagaggt".split()

In [8]:
# Upper-case the strings so their letters match the keys of letter_to_id.
dna = list(map(str.upper, dna))

In [9]:
dna

['TTACCTTAAC', 'GATGTCTGTC', 'ACGGCGTTAG', 'CCCTAACGAG', 'CGTCAGAGGT']

In [10]:
# Motif length passed to get_motifs_profile_dna in the worked example.
k = 4

In [11]:
# Worked-example profile: rows are indexed by letter (A, C, G, T order),
# columns by motif position. Counts out of five are scaled to fractions.
profile = np.array([
    [4, 0, 0, 1],
    [0, 3, 1, 0],
    [1, 1, 4, 0],
    [0, 1, 0, 4],
]) / 5
profile

array([[0.8, 0. , 0. , 0.2],
       [0. , 0.6, 0.2, 0. ],
       [0.2, 0.2, 0.8, 0. ],
       [0. , 0.2, 0. , 0.8]])

In [12]:
# Select, for each string in dna, its most probable k-mer under profile.
get_motifs_profile_dna(profile, dna, k)

array([['A', 'C', 'C', 'T'],
       ['A', 'T', 'G', 'T'],
       ['G', 'C', 'G', 'T'],
       ['A', 'C', 'G', 'A'],
       ['A', 'G', 'G', 'T']], dtype='<U1')

In [13]:
def randomized_greedy_motif_search(dna, k, t, best_so_far):
    """
    One run of randomized motif search with pseudocounts.

    input:
        dna: list of strings; they are stacked into a 2-D character array,
            so this presumably requires equal lengths (TODO confirm)
        k: int, motif length
        t: int, number of random start positions drawn; presumably must
            equal len(dna) (TODO confirm)
        best_so_far: score threshold a refined motif set has to beat

    output:
        (motifs, score): motifs is a list of strings. While each refinement
        scores strictly below the best seen (initially best_so_far), it is
        recorded and the loop continues; the first one that does not ends
        the run. If even the first refinement fails, the random initial
        motifs are returned together with best_so_far itself.
    """
    # Character grid of the input: one row per string.
    sequences = np.array([list(seq) for seq in dna])

    # Draw one random window start per row, then expand each start into the
    # k consecutive column indices of its window.
    n_starts = sequences.shape[1] - k + 1
    starts = np.random.randint(0, n_starts, t)
    windows = starts[:, None] + np.arange(k)

    # The random windows are the initial motifs.
    best_motifs = np.take_along_axis(sequences, windows, axis=1)
    best_score = best_so_far

    ## code Motifs(Profile, DNA): https://stepik.org/lesson/240243/step/2?unit=214002

    current = best_motifs.copy()
    improved = True
    while improved:
        profile = get_profile_with_pseudocounts(current)
        current = get_motifs_profile_dna(profile, dna, k)  # Motifs(Profile, Dna)

        current_score = get_scores(current, t)

        improved = current_score < best_score
        if improved:
            best_score = current_score
            best_motifs = current.copy()

    return [''.join(row.tolist()) for row in best_motifs], best_score

In [14]:
def find_best_approx(dna, k, t, n_times=1000):
    """
    Repeat randomized motif search and keep the best-scoring result.

    input:
        dna: list of strings
        k: int, motif length
        t: int, forwarded to randomized_greedy_motif_search
        n_times: int, number of independent random restarts

    output:
        the motifs (list of strings) with the lowest score over all
        restarts, or None when n_times is 0
    """

    best_motifs = None
    best_score = np.inf

    for _ in tqdm(range(n_times)):
        # Each restart gets an infinite threshold, so it keeps refining until
        # its own score stops improving. Passing the previous run's score
        # here ended most restarts after a single refinement step.
        curr_motifs, curr_score = randomized_greedy_motif_search(dna, k, t, np.inf)

        if curr_score < best_score:
            best_score = curr_score
            best_motifs = curr_motifs

    return best_motifs

In [15]:
from tqdm import tqdm

In [21]:
def main():
    """
    Read 'rosalind_ba2f.txt' and print the best motifs found.

    The first line of the file holds k and t; every following non-blank
    line is one DNA string. k, t and the list of strings are echoed before
    the resulting motifs are printed one per line.
    """

    # The context manager closes the file even if parsing raises.
    with open('rosalind_ba2f.txt', 'r') as file:
        k, t = map(int, next(file).split())

        print(k, t)

        # Blank lines (e.g. a trailing newline) would otherwise become empty
        # DNA strings.
        dna = [line.strip() for line in file if line.strip()]

    print(dna)

    print("\n".join(find_best_approx(dna, k, t)))

In [22]:
# Run the solver only when this file is executed as a script.
if __name__ == "__main__":
    main()

  0%|          | 0/1000 [00:00<?, ?it/s]

15 20
['GAAGCACTGAGTTACCAGGTACACAGGTCTCAGTGCACGGACTGCACCGGGCTCAGCCCTGTAGGCGTGAACCTTCCAACATCGACTGGGGATGATCTAATCTTCGCATCGTACGCACGTCAATAGCATAACGTCGGACACGTACCCTAGGCAACAACCACTTTTGACCTTCCTTCCGGCCGCAAGGCTCAACTTAGTGGAAGCACTGAGTTAC', 'CAGGTACACAGGTCTCAGTGCACGGACTGCACCGGGCTCAGCCCTGTAGGCGTGAACCTTCCAACATCGACTGGGGATGATCTAATCTTCGCATCGTACGCACGTCAATAGCATAACGTCGGACACGCCCCAATAGTCAGCGTACCCTAGGCAACAACCACTTTTGACCTTCCTTCCGGCCGCAAGGCTCAACTTAGTGGAAGCACTGAGTTAC', 'ATACTCTTAATCATTATAACATGAGACAGCCTATCACACGTACACGATCGCCAATGTTCCCCCGCTCGAATTAGACCGGTTTGGAATTGAGTTCCGTGGGGGTATGCGCCGCACAAACTGCTATAGTAAAACCACCCTTTTAGTCAGCGTATTGACATATTACAAGCGGCGTTTACTCCTTAGAAAGGCAATAAACGAACAAAACAGTACCCGC', 'ACTGGCTTCCAGAAAACACCAGGCCCCAATTTCGGCGGATGGATTGTTCACAATGCTTGATCGTGCGTGCGCCATATTGCTGCCGACCGCGGCCGGATATACCGGTGCCTTTCCTAGAAATTGTCGGATAATACTCCTGCCCCAATGAGTCAGCGTATACAGTACGGGACCCATGGTCAAGAACGTTAAGTATTCGTAGGGCGCGACACACAGT', 'GCCCAGGCCCTTCCCAGCTTAGCAGCGTCTGTCGATGTTCCTGCTGCGATGGCTCGCCGAGACTACAAGAGTGGAGCGAGCCGAAATATCAGAAGCACCGAGCATGAGCTCGGGCCGCGA

100%|██████████| 1000/1000 [02:41<00:00,  6.20it/s]

ACCAGGTACACAGGT
CCCCAATAGTCAGCG
CCCTTTTAGTCAGCG
CCCAATGAGTCAGCG
CCCAGCTTAGCAGCG
AGCAGCTAGTCAGCA
CCCAGCTAGATCGCG
CCCACTAAGTCAGCG
CCGGACTAGTCAGCG
CCCAGCGCATCAGCG
CCCAGCTATGTAGCG
CCCAGCTAGTAGTCG
TCCAGCTAGTCAGTT
CCCAGGGGGTCAGCG
TCGAAGATGTCAGCG
CCCAGGGTGTCAGCG
CCCAGCTAGTCACGA
CCCAGCAGCTCAGCG
CCCAGCTAGTCTATG
CAGGGCTAGTCAGCG





In [18]:
CATGGGGAAAACTGA
CCTCTCGATCACCGA
CCTATAGATCACCGA
CCGATTGATCACCGA
CCTTGTGCAGACCGA
CCTTGCCTTCACCGA
CCTTGTTGCCACCGA
ACTTGTGATCACCTT
CCTTGTGATCAATTA
CCTTGTGATCTGTGA
CCTTGTGATCACTCC
AACTGTGATCACCGA
CCTTAGTATCACCGA
CCTTGTGAAATCCGA
CCTTGTCGCCACCGA
TGTTGTGATCACCGC
CACCGTGATCACCGA
CCTTGGTTTCACCGA
CCTTTGCATCACCGA
CCTTGTGATTTACGA

NameError: name 'CATGGGGAAAACTGA' is not defined

In [None]:
ACGGAGATTTCTGGC
ACGGAGATTTCTGGC
CCCGCTGATTGTCTG
GCGACTGGTTCCTTA
CCGTAAATATCAGGA
CAGGACGTTTAGGTA
ACCTACGTTTCTGGA
GCGGACGGTGCCTGG
TGGTATAATTCTGTG
TCCGTAGGTTGTGGA
CAGGATGAATCTGGA
TGGGCGGATTCATGA
GGGTAAGATGCCGCA
CCGTACTGTTATGGT
AACGAGGATTGTTCA
CCCTTTTTTTCAGCA
ATGGAGTTTGACTCA
TCGGACATTCCTGGA
ATGGAGTTTCCATCC
TCGAAGGTTTCTTGG