In [1]:
# Third-party and stdlib modules used by the cells below.
try:
  import numpy as np
  import pandas as pd
  import time
except Exception as e:
  # The f-prefix is required; without it the literal text "{e}" is printed
  # and the actual failure reason is lost.
  # NOTE(review): the error is only reported, not re-raised, so a failed
  # import leaves the names undefined and later cells fail with NameError.
  print(f'Error: {e}')

In [18]:
class Motifs:
  """Consensus-motif finder over fixed-length windows of DNA strings.

  Every window of ``motif_len`` characters is cut from each sequence, the
  per-position nucleotide counts of all windows are collected in a profile
  matrix, and the most frequent nucleotide of each position forms the
  resulting motif.

  Attributes:
    dna_sequence: iterable of DNA strings (upper or lower case).
    motif_len: window length used when cutting the sequences.
  """

  # Row order of the profile matrix; also used to decode rows back to letters.
  NUCLEOTIDES = "ACGT"

  def __init__(self, dna_sequence, motif_len):
    self.dna_sequence = dna_sequence
    self.motif_len = motif_len

  def generate_motifs(self):
    """Return every window of length ``motif_len`` from every sequence.

    Windows are listed sequence by sequence, left to right. A sequence
    shorter than ``motif_len`` contributes no windows.
    """
    return [
      seq[start:start + self.motif_len]
      for seq in self.dna_sequence
      for start in range(len(seq) - self.motif_len + 1)
    ]

  def generate_profile_matrix(self, motifs):
    """Count nucleotides per position over ``motifs``.

    Args:
      motifs: list of strings; the length of the first one fixes the
        number of columns.

    Returns:
      A float array with one row per nucleotide (A, C, G, T order) and one
      column per position. An empty ``motifs`` list yields a (4, 0) array
      instead of failing on ``motifs[0]``.

    Raises:
      KeyError: if a character is not one of A/C/G/T (case-insensitive).
    """
    row_of = {base: row for row, base in enumerate(self.NUCLEOTIDES)}
    if not motifs:
      # No window fits (e.g. all sequences shorter than motif_len).
      return np.zeros((len(row_of), 0))

    width = len(motifs[0])
    profile_matrix = np.zeros((len(row_of), width))
    for motif in motifs:
      for col in range(width):
        profile_matrix[row_of[motif[col].upper()], col] += 1
    return profile_matrix

  def get_best_motif(self, profile_matrix):
    """Build the consensus string and its score from a profile matrix.

    For each column the nucleotide with the highest count is chosen and its
    count is added to the score. Candidates are compared as
    ``(count, letter)`` tuples, so on equal counts the alphabetically later
    letter wins (T > G > C > A).

    Returns:
      ``(best_motif, best_score)``; ``("", 0)`` for a matrix with no columns.
    """
    best_motif = ""
    best_score = 0
    for pos in range(profile_matrix.shape[1]):
      max_score, max_nucleotide = max(zip(profile_matrix[:, pos], self.NUCLEOTIDES))
      best_motif += max_nucleotide
      best_score += max_score
    return best_motif, best_score

  def find_best_motif(self):
    """Run the full pipeline and return ``(best_motif, best_score)``.

    Prints the number of windows and the profile matrix along the way.
    """
    motifs = self.generate_motifs()
    # Each sequence of length N yields N - motif_len + 1 windows, e.g.
    # 26 sequences of length 40 with motif_len 6 give 35 * 26 = 910.
    print("Total Number Of Motifs:", len(motifs))
    profile_matrix = self.generate_profile_matrix(motifs)
    print(profile_matrix)
    return self.get_best_motif(profile_matrix)



In [23]:
# Load the sequences before starting the clock so that file I/O is not
# reported as part of the algorithm's execution time.
data = pd.read_csv('dna.csv', header=None)
# Flatten every cell of the CSV into one list of sequences.
dna_sequence = data.values.flatten().tolist()
motif_class = Motifs(
    dna_sequence = dna_sequence,
    motif_len = 6
)

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals; time.time() is a wall clock that may be adjusted mid-run.
start_time = time.perf_counter()
best_result = motif_class.find_best_motif()
end_time = time.perf_counter()

print("Best Motif:", best_result[0])
print("Best Score:", best_result[1])
print('Execution Time of the Algorithm : ', end_time-start_time, 'seconds')

Total Number Of Motifs: 910
[[265. 261. 267. 266. 261. 259.]
 [191. 193. 193. 191. 194. 197.]
 [272. 274. 270. 270. 267. 267.]
 [182. 182. 180. 183. 188. 187.]]
Best Motif: GGGGGG
Best Score: 1620.0
Execution Time of the Algorithm :  0.010611295700073242 seconds
