-
Notifications
You must be signed in to change notification settings - Fork 0
/
freqarray.py
257 lines (190 loc) · 7.92 KB
/
freqarray.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
# Filename: freqarray.py
DNA_ALPHABET = ['A', 'B', 'C', 'D']
def gen_freq_array(alphabet, k):
"""
Bioinformatics Algorithms 1 (2015) Coursera Class. Interactive Text Code Challenge
in the charging station topic, "The Frequency Array."
Input: The alphabet of symbols to use and the length k of each k-mer.
Output: A frequency array initialized to each possible k-mer with a frequency of 0.
"""
# # There are (len(alphabet))^k - 1 possible k-mers.
# #left = [ [alphabet[i]] * len(alphabet)**k/len(alphabet) for i in range(len(alphabet)) ]
# left = [ [alphabet[i]] * ((len(alphabet))**(k-1)) for i in range(len(alphabet)) ]
# left1 = sum(left, [])
# # Have to figure out how to get the middle values, and when I don't know how long
# # the alphabet is.
# right = alphabet * (alphalen**(k-1))
alphalen = len(alphabet)
#print(alphalen, k)
#print(alphalen**k)
mylist = []
temp = ''
#for i in range(alphalen**(k-1)):
for i in range(alphalen**k):
for j in range(k):
temp += alphabet[int(i/(4**j)) % alphalen]
mylist.append(temp)
temp = ''
#print(mylist)
mylist.sort()
#mydict = { mylist[i]:0 for i in range(len(mylist)) }
mydict = { mylist[i]:(i,0) for i in range(len(mylist)) }
#print(mydict)
return(mydict)
def get_freq_indx(freq_array, kmer):
"""
Bioinformatics Algorithms 1 (2015) Coursera Class. Interactive Text Code Challenge
in the charging station topic, "The Frequency Array."
Input: The frequency array and the kmer for which we wish to retrieve the index.
Output: The index of the kmer from the frequency array. Returns -1 if the kmer is not
found in the frequency array (actually implemented as a dictionary).
"""
indx = -1
if kmer in freq_array:
indx = freq_array[kmer][0]
else:
print("Error: That kmer is not in the frequency array.")
return indx
def get_kmer_from_index(freq_array, kmer_index):
#if kmer_index in freq_array.values()[0]:
# kmer = freq_array.values()[kmer_index]
kvpair = [ (k, v) for k, v in freq_array.items() if v[0] == kmer_index ]
#print(kvpair)
if len(kvpair) == 0:
print("Error: That value was not found in the frequency array.")
result = ''
elif len(kvpair) == 1:
#result = kvpair[0][1][0]
result = kvpair[0][0]
else:
print("Error: That index was not unique.")
result = ''
return result
def get_frequency(freq_array, kmer):
"""
Bioinformatics Algorithms 1 (2015) Coursera Class. Interactive Text Code Challenge
in the charging station topic, "The Frequency Array."
Input: The frequency array and the kmer for which we wish to retrieve the index.
Output: The frequency with which the supplied kmer occurs in the genome. Returns -1
if the kmer is not found in the frequency array (actually implemented as a dictionary).
"""
freq_value = -1
if kmer in freq_array:
freq_value = freq_array[kmer][1]
else:
print("Error: That kmer is not in the frequency array.")
return freq_value
def patterncount(text, pattern):
"""
Bioinformatics Algorithms 1 (2015) Coursera Class. Interactive Text Code Challenge
#1, in the first chapter, first section.
The task is to count the number of occurences of the pattern (can be overlapping)
that can be found in the string given by text.
"""
count = 0
#print(len(text), len(pattern))
for i in range(len(text) - len(pattern)):
#print(i)
#print(text[i:i + len(pattern)])
if text[i:i + len(pattern)] == pattern:
count += 1
return count
def find_frequencies(genome, freq_array, k):
"""
Bioinformatics Algorithms 1 (2015) Coursera Class. Interactive Text Code Challenge
in the charging station topic, "The Frequency Array."
Input: The genome to search, the frequency array with the frequency of each
possible k-mer initialized to 0, and the kmer length k.
Output: A frequency array containing each possible k-mer and the frequency with which
the k-mer is found in the given genome.
"""
# for kmer in freq_array.keys():
# # Find the frequency of that kmer in the genome
# freq_count = patterncount(genome, kmer)
# freq_array[kmer][1] = freq_count
for i in range(len(genome) - (k - 1)):
kmer = genome[i:(i + k)]
#freq_array[kmer][1] += 1 #Illegal in python
freq_array[kmer] = (freq_array[kmer][0], freq_array[kmer][1] + 1)
#print(freq_array)
def freq_array(genome, alphabet, k):
"""
Bioinformatics Algorithms 1 (2015) Coursera Class. Interactive Text Code Challenge
in the charging station topic, "The Frequency Array."
Input: The genome to search, the alphabet of symbols to use and the length k of each k-mer.
Output: A frequency array containing each possible k-mer and the frequency with which
the k-mer is found in the given genome.
"""
myfreqdict = gen_freq_array(alphabet, k)
#print(myfreqdict.values()["AAT"])
# print(get_freq_indx(myfreqdict, "AAT"))
# result = get_kmer_from_index(myfreqdict, 48)
# print(result)
# result = get_kmer_from_index(myfreqdict, 11)
# print(result)
find_frequencies(genome, myfreqdict, k)
#print(myfreqdict)
print_freq_array(myfreqdict)
return(myfreqdict)
def print_freq_array(myfreqdict):
"""
Helper function to print the frequencies in order of the index.
Input: myfreqdict, a dictionary that represents a "frequency array."
Output: Returns nothing, but prints the frequencies associated with each possible k-mer, in
the order of the k-mer's index in the frequency array.
"""
# for val in myfreqdict:
# print(myfreqdict[val].[1])
freqlist = [ myfreqdict[val] for val in myfreqdict ]
freqlist.sort()
for val in freqlist:
print(val[1])
def convert_pattern_to_number(pattern):
"""
DNA String pattern converter that uses the idea that it is the same as converting
from base 4 to base 10, and treating 'A'=0, 'C'=1, 'G'=2, 'T'=3.
Input: Pattern, a pattern of DNA symbols.
Output: The index of the pattern in the lexicographically-ordered frequency array.
"""
alphadict = {'A':0, 'C':1, 'G':2, 'T':3}
patlist = [ alphadict[symbol] for symbol in pattern ]
j = 0
num = 0
for i in range(-1, -(len(patlist) + 1), -1):
# Go through each digit, in reverse order (i.e. starting from the right and going
# left), and multiply each successive digit by an increasing power of 4 (which is
# the size of the alphabet).
num += 4**j * patlist[i]
#print(patlist[i], j, num)
j += 1
return(num)
def convert_number_to_pattern(num, k):
"""
Converts a number to a pattern. Successively divide num by 4 and get each remainder.
The remainder (i.e. mod 4) yields each successive symbol in pattern, starting from
the right side of pattern.
This process is equivalent to converting from base 10 to base 4, and then using the
symbols in alphadict to convert from numbers to digits.
Input: num, a number to convert into a DNA String pattern, and k, the length of the string.
Output: The pattern represented by num.
"""
numalphadict = {0:'A', 1:'C', 2:'G', 3:'T'}
#quotient = len(str(num))
quotient = num / 4
remainderlist = []
while (num > 0):
#quotient = num / 4
remainder = num % 4
#print(quotient, remainder)
remainderlist.append(remainder)
quotient = num / 4
num = int(quotient)
#print(len(remainderlist), k)
#while (len(remainderlist) <= k):
while (len(remainderlist) < k):
#print("Appending A's")
remainderlist.append(0)
remainlist = [ str(numalphadict[item]) for item in remainderlist ]
remainlist.reverse()
pattern = ''.join(remainlist)
return(pattern)