-
Notifications
You must be signed in to change notification settings - Fork 27
/
enrichr_functions.py
374 lines (272 loc) · 9.94 KB
/
enrichr_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
def add_enrichr_cats(df, inst_rc, run_enrichr, num_terms=10):
    """Add Enrichr term-membership categories to the row index of ``df``.

    The row labels of ``df`` are reduced to bare gene symbols, uploaded with
    ``post_request`` and enriched against the ``run_enrichr`` library with
    ``get_request``. For each of the first ``num_terms`` rows of the raw
    response, every row label gets one extra category string of the form
    ``'<term>: True<p> Pval <pval></p>'`` or
    ``'<term>: False<p> Pval <pval></p>'``, depending on whether the row's
    gene symbol appears in that term's gene list.

    Parameters
    ----------
    df : object exposing ``index``
        Only ``df.index`` is read. It is overwritten in place with a list of
        tuples ``(original_name, cat_1, cat_2, ...)``. When the existing
        labels are tuples only their first element is kept as the name.
    inst_rc :
        Not used by this function; kept so existing callers keep working.
    run_enrichr :
        Library identifier forwarded to ``get_request``.
    num_terms : int
        Number of leading response rows turned into categories.

    Returns
    -------
    tuple
        ``(df, bar_info)`` where ``bar_info`` lists the combined score
        (response column 4) of each term that was used.
    """
    row_names = [
        label[0] if isinstance(label, tuple) else label
        for label in df.index.tolist()
    ]

    # Nothing to enrich: avoid indexing into an empty list and a pointless
    # round trip to the service.
    if not row_names:
        return df, []

    # Strip titles/PTM suffixes per label (not based on the first label only)
    # so the uploaded symbols always match the ones used for the membership
    # test below.
    gene_symbols = [_enrichr_gene_symbol(name) for name in row_names]

    user_list_id = post_request(gene_symbols)

    # Only the raw response rows are needed here; max_terms merely limits the
    # first return value, which is discarded.
    _, response_list = get_request(run_enrichr, user_list_id, max_terms=20)

    # Response row layout:
    # 1: Term, 2: P-value, 3: Z-score, 4: Combined Score, 5: Genes, 6: pval_bh
    bar_info = []

    # Categories are accumulated as lists and converted to tuples at the end.
    cat_list = [[name] for name in row_names]

    for inst_enr in response_list[0:num_terms]:
        inst_term = inst_enr[1]
        inst_pval = inst_enr[2]
        inst_cs = inst_enr[4]
        inst_list = inst_enr[5]

        pval_string = '<p> Pval ' + str(inst_pval) + '</p>'
        bar_info.append(inst_cs)

        for inst_info, symbol in zip(cat_list, gene_symbols):
            membership = ': True' if symbol in inst_list else ': False'
            inst_info.append(inst_term + membership + pval_string)

    df.index = [tuple(x) for x in cat_list]

    return df, bar_info


def _enrichr_gene_symbol(row_name):
    """Reduce a row label such as ``'Gene: TP53_S15'`` to the symbol ``'TP53'``."""
    # strip titles
    if ': ' in row_name:
        row_name = row_name.split(': ')[1]

    # strip extra information (e.g. PTMs)
    for separator in ('_', ' ', '-'):
        row_name = row_name.split(separator)[0]

    return row_name
def clust_from_response(response_list):
    """Build a clustergrammer ``Network`` (genes x enriched terms) from an Enrichr response.

    Steps:

    1. Convert ``response_list`` with ``transfer_to_enr_dict`` and keep only
       terms whose combined score is positive.
    2. Compute three scores per term (combined score, ``-log(pval)`` and
       ``-zscore``), divide each by its maximum and sort descending.
    3. Keep every term that is in the top 10 (and, when at least 10 / 20
       terms exist, the top 20 / 30) for at least one score.
    4. Fill a binary gene-by-term matrix and cluster it with jaccard
       distance.
    5. For every (score type, top-N) pair, build a sub-network restricted to
       those terms, tag its views with ``N_col_sum`` and ``enr_score_type``,
       give each column node a ``rank`` and ``value``, and append the views
       to the main network.

    Parameters
    ----------
    response_list : list
        Raw Enrichr result rows in the layout expected by
        ``transfer_to_enr_dict``.

    Returns
    -------
    Network
        The network for the full matrix with all views in ``net.viz['views']``.
    """
    from collections import OrderedDict
    from copy import deepcopy
    import math

    import numpy as np
    import pandas as pd
    from clustergrammer import Network

    ini_enr = transfer_to_enr_dict(response_list)

    score_types = ['combined_score', 'pval', 'zscore']

    # Collect raw scores in ordered mappings first; a repeated term name
    # overwrites the earlier value while keeping its position, which is what
    # label assignment on a Series did.
    raw_scores = OrderedDict((score_type, OrderedDict()) for score_type in score_types)

    enr = []
    for inst_enr in ini_enr:
        if inst_enr['combined_score'] > 0:
            term_name = inst_enr['name']

            # All three scores are oriented so that larger means more enriched.
            # NOTE: math.log raises ValueError for a p-value of 0.
            raw_scores['combined_score'][term_name] = inst_enr['combined_score']
            raw_scores['pval'][term_name] = -math.log(inst_enr['pval'])
            raw_scores['zscore'][term_name] = -inst_enr['zscore']

            # keep enrichment values
            enr.append(inst_enr)

    # normalize by the maximum and sort descending; Series.sort() is gone from
    # current pandas, sort_values() returns the sorted copy instead
    scores = {}
    for score_type in score_types:
        raw = pd.Series(raw_scores[score_type], dtype=float)
        scores[score_type] = (raw / raw.max()).sort_values(ascending=False)

    number_of_enriched_terms = len(scores['combined_score'])

    if number_of_enriched_terms < 10:
        num_dict = {'ten': 10}
    elif number_of_enriched_terms < 20:
        num_dict = {'ten': 10, 'twenty': 20}
    else:
        num_dict = {'ten': 10, 'twenty': 20, 'thirty': 30}

    # gather lists of top scores
    top_terms = {}
    for enr_type in score_types:
        ranked_terms = scores[enr_type].index.tolist()
        top_terms[enr_type] = {}
        for num_terms in num_dict:
            top_terms[enr_type][num_terms] = ranked_terms[:num_dict[num_terms]]

    # gather the terms that should be kept - they are at the top of at least
    # one score list
    keep_terms = set()
    for inst_enr_score in top_terms:
        for tmp_num in num_dict:
            keep_terms.update(top_terms[inst_enr_score][tmp_num])

    keep_enr = [inst_enr for inst_enr in enr if inst_enr['name'] in keep_terms]

    # fill in full matrix
    #######################

    # enriched terms (columns) and the genes they intersect (rows)
    col_node_names = [inst_enr['name'] for inst_enr in keep_enr]
    row_node_names = sorted(
        {inst_gene for inst_enr in keep_enr for inst_gene in inst_enr['int_genes']}
    )

    net = Network()
    net.dat['nodes']['row'] = row_node_names
    net.dat['nodes']['col'] = col_node_names
    net.dat['mat'] = np.zeros([len(row_node_names), len(col_node_names)])

    # row names are unique, so a lookup table replaces repeated list.index()
    row_position = {gene: position for position, gene in enumerate(row_node_names)}

    for inst_enr in keep_enr:
        inst_term = inst_enr['name']
        col_index = col_node_names.index(inst_term)

        # use combined score for full matrix - will not be seen in viz
        tmp_score = scores['combined_score'][inst_term]
        net.dat['node_info']['col']['value'].append(tmp_score)

        for inst_gene in inst_enr['int_genes']:
            # save association
            net.dat['mat'][row_position[inst_gene], col_index] = 1

    # cluster full matrix
    #############################
    # do not make multiple views
    views = ['']

    if len(net.dat['nodes']['row']) > 1:
        net.make_clust(dist_type='jaccard', views=views, dendro=False)
    else:
        net.make_clust(dist_type='jaccard', views=views, dendro=False,
                       run_clustering=False)

    # get dataframe from full matrix
    df = net.dat_to_df()

    for score_type in score_types:
        for num_terms in num_dict:
            selected_terms = top_terms[score_type][num_terms]

            inst_df = deepcopy(df)
            inst_net = deepcopy(Network())

            # restrict the matrix to the top terms for this score / size
            inst_df['mat'] = inst_df['mat'][selected_terms]

            # load back into net
            inst_net.df_to_dat(inst_df)

            # make views
            if len(net.dat['nodes']['row']) > 1:
                inst_net.make_clust(dist_type='jaccard', views=['N_row_sum'],
                                    dendro=False)
            else:
                inst_net.make_clust(dist_type='jaccard', views=['N_row_sum'],
                                    dendro=False, run_clustering=False)

            inst_views = inst_net.viz['views']

            # add score_type to views
            for inst_view in inst_views:
                inst_view['N_col_sum'] = num_dict[num_terms]
                inst_view['enr_score_type'] = score_type

                # add values to col_nodes and order according to rank
                # (the best-scoring term gets the highest rank)
                for inst_col in inst_view['nodes']['col_nodes']:
                    inst_name = inst_col['name']
                    inst_col['rank'] = len(selected_terms) - selected_terms.index(inst_name)
                    inst_col['value'] = scores[score_type][inst_name]

            # add views to main network
            net.viz['views'].extend(inst_views)

    return net
# make the post request to enrichr using the requests library
# this is done before making the get request with the lib name
def post_request(input_genes, meta=''):
    """Upload a gene list to Enrichr and return its ``userListId`` as a string.

    ``input_genes`` is an iterable of strings; they are joined with newlines
    and posted to the ``addList`` endpoint together with an empty
    description. ``meta`` is accepted but not used. The returned id is what
    later requests use to reference the uploaded list.
    """
    import json
    import requests

    add_list_url = 'http://amp.pharm.mssm.edu/Enrichr/addList'

    # the list is sent as a single newline-separated string
    payload = {'list': '\n'.join(input_genes), 'description': ''}

    reply = requests.post(add_list_url, files=payload)

    # the response body is JSON; only the list id is needed
    parsed_reply = json.loads(reply.text)
    return str(parsed_reply['userListId'])
# make the get request to enrichr using the requests library
# this is done after submitting post request with the input gene list
def get_request(lib, userListId, max_terms=50):
    """Fetch Enrichr enrichment results for a previously uploaded gene list.

    The request is attempted up to 100 times. Another attempt is made while
    the last known status code is 400 or the request itself raised; any
    other status code ends the loop.

    Parameters
    ----------
    lib :
        Library name sent as the ``backgroundType`` query parameter.
    userListId :
        Identifier returned by ``post_request``; converted to ``str``.
    max_terms : int
        Upper bound on the number of terms in the first return value.

    Returns
    -------
    tuple
        ``(enr, response_list)`` where ``response_list`` is the raw list
        stored under the first key of the JSON response and ``enr`` is
        that list converted by ``transfer_to_enr_dict`` and truncated to
        ``max_terms``.

    Raises
    ------
    requests.exceptions.RequestException
        The last request error, when no attempt produced a response.
    """
    import json
    import requests

    get_url = 'http://amp.pharm.mssm.edu/Enrichr/enrich'
    params = {'backgroundType': lib, 'userListId': str(userListId)}

    max_tries = 100
    get_response = None
    last_error = None

    # start from 400 so the loop body runs at least once
    inst_status_code = 400
    num_try = 0
    while inst_status_code == 400 and num_try < max_tries:
        num_try += 1
        try:
            get_response = requests.get(get_url, params=params)
        except requests.exceptions.RequestException as err:
            # Only request failures are retried; KeyboardInterrupt and
            # programming errors propagate instead of being swallowed.
            last_error = err
            print('retry get request')
        else:
            inst_status_code = get_response.status_code

    # Without this check a run of failed attempts would surface as an
    # unrelated unbound-variable error on the line below.
    if get_response is None:
        print('get requests failed')
        raise last_error

    # load as dictionary
    resp_json = json.loads(get_response.text)

    # the results are stored under the first key of the response
    only_key = list(resp_json.keys())[0]
    response_list = resp_json[only_key]

    # transfer the response_list to the enr_dict
    enr = transfer_to_enr_dict(response_list, max_terms)

    # return the converted terms and the raw response rows
    return enr, response_list
# transfer the response_list to a list of dictionaries
def transfer_to_enr_dict(response_list, max_terms=50):
    """Convert raw Enrichr result rows into a list of dictionaries.

    Only the first ``max_terms`` rows are converted (all of them when the
    response is shorter; none when ``max_terms`` is zero or negative).

    Row layout, by position:
      1: Term, 2: P-value, 3: Z-score, 4: Combined Score, 5: Genes, 6: pval_bh

    Each output dictionary has the keys ``name``, ``pval``, ``zscore``,
    ``combined_score``, ``int_genes`` and ``pval_bh``.
    """
    # output key -> position of the value inside a response row
    field_positions = (
        ('name', 1),
        ('pval', 2),
        ('zscore', 3),
        ('combined_score', 4),
        ('int_genes', 5),
        ('pval_bh', 6),
    )

    # only keep the top terms; clamp at zero so a negative limit yields
    # nothing instead of slicing from the end
    keep_count = max(min(len(response_list), max_terms), 0)

    enr = []
    for row in response_list[:keep_count]:
        inst_dict = {}
        for key, position in field_positions:
            inst_dict[key] = row[position]
        enr.append(inst_dict)

    return enr