In [1]:
import random

def profile_matrix_with_pseudocounts(k,motifs):
    profile = {'A': [1] * k, 'C': [1] * k, 'G': [1] * k, 'T': [1] * k}
    total_motifs = len(motifs) + 4  # pseudocounts for each nucleotide

    for i in range(k):
        for motif in motifs:
            profile[motif[i]][i] += 1

    for nucleotide in profile:
        for i in range(k):
            profile[nucleotide][i] /= total_motifs

    return profile

In [2]:
def hamming_distance(str1, str2):
    dist = 0
    for i in range (len(str1)):
        if str1[i] != str2[i]: 
            dist = dist + 1
    return dist
        

In [3]:
import math
def median_string(k, dna, motifs):
    distance = float('inf')
    
    for m in motifs:
        current_pattern = m
        current_distance = sum(min(hamming_distance(current_pattern, dna[seq][i:i+k]) for i in range(len(dna[seq]) - k + 1)) for seq in range(len(dna)))


    return current_distance

In [4]:
import numpy as np

def profile_most_Probable(text,profile, k):
    most_probable_kmer = []
    for y in range(len(text)):
        max_prob = -1
        for i in range(len(text[y]) - k + 1):
            kmer = text[y][i:i + k]
            prob = 1
            for j in range(k):
                prob *= profile[kmer[j]][j]
            if prob > max_prob:
                max_prob = prob
                most_probable_kmer.append(kmer)

    return most_probable_kmer

In [5]:
def score_motifs(motifs,k,t):
    score = 1
    for i in range(k):
        column = [motif[i] for motif in motifs]
        most_common = max(set(column), key=column.count)
        
        for nucleotide in column:
            if nucleotide != most_common:
                score += 1 

    return score

In [6]:
def randomized_motif_search(dna, k, t):

    best_motifs = [random.choice([sequence[i:i + k] for i in range(len(sequence) - k + 1)]) for sequence in dna]
    while 1:
        profile_matrix = profile_matrix_with_pseudocounts(k,best_motifs)
        current_motifs = profile_most_Probable(dna,profile_matrix ,k)

        if score_motifs(current_motifs,k,t) < score_motifs(best_motifs,k,t):
            best_motifs = current_motifs

        else: 
            return best_motifs



In [10]:
def iter_rand_motif_search(dna,k,t):
    min_score = 20
    min_result = []
    for i in range(1000):
        result = randomized_motif_search(dna, k, t)
#         ham_dis = median_string(k, dna, result)
        sc = score_motifs(result,k,t)
        print(result)
        if sc < min_score: 
            min_result.clear()
            min_score = sc
 
            min_result.append(result)

    return min_result

In [11]:
file_path = r'D:\Mobina\Desktop\input_1.txt'
with open(file_path, 'r') as file:
    lines = file.readlines()

dna_list = (lines[1:][0]).split(' ')
print(dna_list)
if dna_list:
    dna_list[-1] = dna_list[-1].rstrip()
k_value = 15
t_value = 20

results = iter_rand_motif_search(dna_list, k_value, t_value)


['CGCCCCTCTCGGGGGTGTTCAGTAAACGGCCA', 'GGGCGAGGTATGTGTAAGTGCCAAGGTGCCAG', 'TAGTACCGAGACCGAAAGAAGTATACAGGCGT', 'TAGATCAAGTTTCAGGTGCACGTCGGTGAACCAA', 'TCCACCAGCTCCACGTGCAATGTTGGCCTA']
['CGGGGGTGTTCAGTA', 'AGGTATGTGTAAGTG', 'ACCGAGACCGAAAGA', 'CACGTCGGTGAACCA', 'CCACGTGCAATGTTG']
['GGGGTGTTCAGTAAA', 'TAAGTGCCAAGGTGC', 'ACCGAGACCGAAAGA', 'AAGTTTCAGGTGCAC', 'TCCACCAGCTCCACG']
['CCTCTCGGGGGTGTT', 'TATGTGTAAGTGCCA', 'GAGACCGAAAGAAGT', 'TCAGGTGCACGTCGG', 'TGCAATGTTGGCCTA']
['CCCTCTCGGGGGTGT', 'AGGTATGTGTAAGTG', 'GAGACCGAAAGAAGT', 'TTCAGGTGCACGTCG', 'GCTCCACGTGCAATG']
['TGTTCAGTAAACGGC', 'GCGAGGTATGTGTAA', 'CCGAGACCGAAAGAA', 'TCAGGTGCACGTCGG', 'AGCTCCACGTGCAAT']
['GTTCAGTAAACGGCC', 'GTGTAAGTGCCAAGG', 'GTACCGAGACCGAAA', 'CACGTCGGTGAACCA', 'TCCACGTGCAATGTT']
['TTCAGTAAACGGCCA', 'GGTATGTGTAAGTGC', 'ACCGAAAGAAGTATA', 'GGTGCACGTCGGTGA', 'CGTGCAATGTTGGCC']
['CCTCTCGGGGGTGTT', 'AGTGCCAAGGTGCCA', 'AGAAGTATACAGGCG', 'AGATCAAGTTTCAGG', 'TCCACCAGCTCCACG']
['CCCCTCTCGGGGGTG', 'AGTGCCAAGGTGCCA', 'AGAAGTATACA

['CTCGGGGGTGTTCAG', 'GTAAGTGCCAAGGTG', 'AGAAGTATACAGGCG', 'AAGTTTCAGGTGCAC', 'CGTGCAATGTTGGCC']
['CGGGGGTGTTCAGTA', 'GAGGTATGTGTAAGT', 'TACCGAGACCGAAAG', 'GTTTCAGGTGCACGT', 'CTCCACGTGCAATGT']
['GTGTTCAGTAAACGG', 'GTAAGTGCCAAGGTG', 'CGAAAGAAGTATACA', 'AGGTGCACGTCGGTG', 'CCACCAGCTCCACGT']
['TCGGGGGTGTTCAGT', 'GTGCCAAGGTGCCAG', 'TAGTACCGAGACCGA', 'GATCAAGTTTCAGGT', 'ACGTGCAATGTTGGC']
['GTTCAGTAAACGGCC', 'AAGTGCCAAGGTGCC', 'AGAAGTATACAGGCG', 'TGCACGTCGGTGAAC', 'CGTGCAATGTTGGCC']
['GTTCAGTAAACGGCC', 'GTAAGTGCCAAGGTG', 'CCGAGACCGAAAGAA', 'TCAGGTGCACGTCGG', 'ACCAGCTCCACGTGC']
['CTCTCGGGGGTGTTC', 'ATGTGTAAGTGCCAA', 'GACCGAAAGAAGTAT', 'TGCACGTCGGTGAAC', 'TCCACCAGCTCCACG']
['CGCCCCTCTCGGGGG', 'TGTGTAAGTGCCAAG', 'GAAGTATACAGGCGT', 'ATCAAGTTTCAGGTG', 'CCAGCTCCACGTGCA']
['GTTCAGTAAACGGCC', 'GAGGTATGTGTAAGT', 'AAGAAGTATACAGGC', 'ATCAAGTTTCAGGTG', 'CGTGCAATGTTGGCC']
['GTGTTCAGTAAACGG', 'GGCGAGGTATGTGTA', 'AGTACCGAGACCGAA', 'CAGGTGCACGTCGGT', 'TGCAATGTTGGCCTA']
['GTGTTCAGTAAACGG', 'GGCGAGGTATGTGTA', '

['CCTCTCGGGGGTGTT', 'GTGCCAAGGTGCCAG', 'CCGAGACCGAAAGAA', 'ATCAAGTTTCAGGTG', 'TGCAATGTTGGCCTA']
['TCGGGGGTGTTCAGT', 'GTGCCAAGGTGCCAG', 'TACCGAGACCGAAAG', 'TCAGGTGCACGTCGG', 'CACCAGCTCCACGTG']
['GGGGTGTTCAGTAAA', 'AGTGCCAAGGTGCCA', 'GTACCGAGACCGAAA', 'GTGCACGTCGGTGAA', 'CCACCAGCTCCACGT']
['TTCAGTAAACGGCCA', 'TGTAAGTGCCAAGGT', 'GTACCGAGACCGAAA', 'GTTTCAGGTGCACGT', 'CCAGCTCCACGTGCA']
['CCCCTCTCGGGGGTG', 'TAAGTGCCAAGGTGC', 'AGACCGAAAGAAGTA', 'GTGCACGTCGGTGAA', 'CACCAGCTCCACGTG']
['TCTCGGGGGTGTTCA', 'GTGCCAAGGTGCCAG', 'AGACCGAAAGAAGTA', 'CAGGTGCACGTCGGT', 'GTGCAATGTTGGCCT']
['CGCCCCTCTCGGGGG', 'CGAGGTATGTGTAAG', 'CCGAGACCGAAAGAA', 'TCAGGTGCACGTCGG', 'CACGTGCAATGTTGG']
['CCTCTCGGGGGTGTT', 'GGTATGTGTAAGTGC', 'GAAGTATACAGGCGT', 'ACGTCGGTGAACCAA', 'CGTGCAATGTTGGCC']
['GCCCCTCTCGGGGGT', 'AAGTGCCAAGGTGCC', 'AGTACCGAGACCGAA', 'CACGTCGGTGAACCA', 'GCTCCACGTGCAATG']
['GGGGGTGTTCAGTAA', 'GTATGTGTAAGTGCC', 'CGAGACCGAAAGAAG', 'TCAGGTGCACGTCGG', 'CCACGTGCAATGTTG']
['CGGGGGTGTTCAGTA', 'TATGTGTAAGTGCCA', '

['GCCCCTCTCGGGGGT', 'GGGCGAGGTATGTGT', 'AGACCGAAAGAAGTA', 'AGATCAAGTTTCAGG', 'ACCAGCTCCACGTGC']
['CCTCTCGGGGGTGTT', 'TAAGTGCCAAGGTGC', 'GAGACCGAAAGAAGT', 'GATCAAGTTTCAGGT', 'TCCACGTGCAATGTT']
['CCCTCTCGGGGGTGT', 'CGAGGTATGTGTAAG', 'GTACCGAGACCGAAA', 'TTCAGGTGCACGTCG', 'GTGCAATGTTGGCCT']
['TCGGGGGTGTTCAGT', 'GGGCGAGGTATGTGT', 'CCGAGACCGAAAGAA', 'AAGTTTCAGGTGCAC', 'ACCAGCTCCACGTGC']
['TCTCGGGGGTGTTCA', 'GGTATGTGTAAGTGC', 'GACCGAAAGAAGTAT', 'TTTCAGGTGCACGTC', 'ACCAGCTCCACGTGC']
['TCGGGGGTGTTCAGT', 'AGTGCCAAGGTGCCA', 'CGAAAGAAGTATACA', 'TAGATCAAGTTTCAG', 'CCACGTGCAATGTTG']
['CCCTCTCGGGGGTGT', 'AGGTATGTGTAAGTG', 'GAGACCGAAAGAAGT', 'TCAAGTTTCAGGTGC', 'CACGTGCAATGTTGG']
['GTGTTCAGTAAACGG', 'GGTATGTGTAAGTGC', 'CGAAAGAAGTATACA', 'GTTTCAGGTGCACGT', 'ACGTGCAATGTTGGC']
['CTCGGGGGTGTTCAG', 'CGAGGTATGTGTAAG', 'CGAAAGAAGTATACA', 'CAGGTGCACGTCGGT', 'ACGTGCAATGTTGGC']
['GTTCAGTAAACGGCC', 'GCGAGGTATGTGTAA', 'AGAAGTATACAGGCG', 'TTCAGGTGCACGTCG', 'CACCAGCTCCACGTG']
['GGGGGTGTTCAGTAA', 'TGTGTAAGTGCCAAG', '

['CCTCTCGGGGGTGTT', 'TATGTGTAAGTGCCA', 'CCGAAAGAAGTATAC', 'AGTTTCAGGTGCACG', 'GTGCAATGTTGGCCT']
['CGCCCCTCTCGGGGG', 'GTATGTGTAAGTGCC', 'TACCGAGACCGAAAG', 'CACGTCGGTGAACCA', 'CCACCAGCTCCACGT']
['GCCCCTCTCGGGGGT', 'TATGTGTAAGTGCCA', 'AGACCGAAAGAAGTA', 'GGTGCACGTCGGTGA', 'CCACCAGCTCCACGT']
['CTCTCGGGGGTGTTC', 'GGGCGAGGTATGTGT', 'CGAGACCGAAAGAAG', 'AGTTTCAGGTGCACG', 'TGCAATGTTGGCCTA']
['GGGTGTTCAGTAAAC', 'GTATGTGTAAGTGCC', 'GTACCGAGACCGAAA', 'TGCACGTCGGTGAAC', 'CTCCACGTGCAATGT']
['TCGGGGGTGTTCAGT', 'TGTAAGTGCCAAGGT', 'GAAGTATACAGGCGT', 'ATCAAGTTTCAGGTG', 'GTGCAATGTTGGCCT']
['CTCGGGGGTGTTCAG', 'GGCGAGGTATGTGTA', 'TAGTACCGAGACCGA', 'GCACGTCGGTGAACC', 'TCCACGTGCAATGTT']
['TCTCGGGGGTGTTCA', 'CGAGGTATGTGTAAG', 'GTACCGAGACCGAAA', 'TTCAGGTGCACGTCG', 'CACCAGCTCCACGTG']
['CCCTCTCGGGGGTGT', 'GTAAGTGCCAAGGTG', 'ACCGAGACCGAAAGA', 'TCAAGTTTCAGGTGC', 'CGTGCAATGTTGGCC']
['CCCCTCTCGGGGGTG', 'TAAGTGCCAAGGTGC', 'GAAAGAAGTATACAG', 'TCAGGTGCACGTCGG', 'CAGCTCCACGTGCAA']
['GCCCCTCTCGGGGGT', 'TGTGTAAGTGCCAAG', '

In [9]:
print(*results)


