-
Notifications
You must be signed in to change notification settings - Fork 7
/
DeepTE.py
284 lines (210 loc) · 11.8 KB
/
DeepTE.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
#!/usr/bin/env python
##update 122520: add the prop threshold
##update 12.14: add argument to download model automatically
##update 9.26: distinguish the TE and no TE function
##DeepTE is to classify unknown TEs into different TE orders and families
##BUILT-IN MODULES
import os
import argparse
import sys
import time
import subprocess
from scripts import DeepTE_pipeline_no_modification as pipeline_no_m
from scripts import DeepTE_pipeline_yes_modification as pipeline_yes_m
from scripts import DeepTE_pipeline_unknown_sequence as pipeline_uns
from scripts import DeepTE_generate_CNN_dataset as generate_dataset
from scripts import DeepTE_combine_opt as combine_opt
def get_parsed_args(argv=None):
    """Build the DeepTE command-line parser and parse the arguments.

    Args:
        argv: Optional list of argument strings to parse. ``None`` (the
            default) lets argparse read the process command line, which is
            what the original zero-argument form of this function did.

    Returns:
        argparse.Namespace with the attributes ``working_dir``,
        ``output_dir``, ``ipt_seq``, ``sp_type``, ``model_name``,
        ``model_dir``, ``te_fam``, ``domain_file``, ``yes`` and ``prop_thr``.
        Only ``working_dir`` and ``output_dir`` carry a default ("./");
        every other attribute is ``None`` when its flag is not given.
        All values are kept as strings.
    """
    parser = argparse.ArgumentParser(description="DeepTE classify TEs via neural network")

    ##require files
    parser.add_argument(
        "-d", dest='working_dir', default="./",
        help="Working directory to store intermediate files of each step. "
             "Default: ./ ")
    parser.add_argument(
        "-o", dest='output_dir', default="./",
        help="Output directory to store the output files. "
             "Default: ./ ")
    parser.add_argument(
        "-i", dest='ipt_seq',
        help="Input sequences that are unknown TEs or DNA sequences")
    parser.add_argument(
        "-sp", dest='sp_type',
        help="Provide species type of the input sequence")

    ##optional files
    ##argument that allows users to download the model dir
    # NOTE: every help fragment below ends with an explicit separator because
    # adjacent string literals are joined without any whitespace.
    parser.add_argument(
        "-m", dest='model_name',
        help="Provide one of model names: -m P or -m M or -m F or -m O or -m U. "
             "This argument will directly download the model dir, "
             "so users do not need to initiate -m_dir.")
    parser.add_argument(
        "-m_dir", dest='model_dir',
        help="Provide model_dir that could be downloaded from website. "
             "If users have already downloaded the model dir, they can "
             "directly use this argument to indicate the model directory.")
    parser.add_argument(
        "-fam", dest='te_fam',
        help="Provide TE family name for the input te sequence. "
             "Default: All. "
             "All: the input sequence is unknown TEs; "
             "ClassI: the input sequence is ClassI TEs; "
             "ClassII: the input sequence is ClassII_sub1 TEs that do not contain Helitron TEs; "
             "LTR: the input sequence is LTR TEs; "
             "nLTR: the input sequence is nLTR TEs; "
             "LINE: the input sequence is LINE TEs; "
             "SINE: the input sequence is SINE TEs; "
             "Domain: the input sequence is Class II TEs with specified super families.")
    parser.add_argument(
        "-modify", dest='domain_file',
        help="If users set this argument, "
             "users need to provide domain file"
             " generated from another script")

    ##distinguish TEs and no TE function
    parser.add_argument(
        '-UNS', dest='yes',
        help="If users set this argument, "
             "users need to change the -i to the DNA sequences. "
             "This function will classify the sequences into TEs, CDS, or Intergenic sequences; "
             "-sp and -fam do not need to be provided.")
    parser.add_argument(
        '-prop_thr', dest='prop_thr',
        help='Specify a probability threshold that a TE is classified into one family. '
             'For example, a TE has probability of 0.6 to be ClassI. '
             'If users set 0.7 as the threshold, this TE will be labeled as unknown. '
             'Default: 0.6')

    ##parse of parameters
    return parser.parse_args(argv)
##update 12.14
##define a function that downloads and unpacks the requested model
def download_model(working_dir, model_name, google_drive_path):
    """Download and unpack a model archive into the working directory.

    A directory ``<working_dir>/download_<model_name>_model_dir`` is created
    if missing, the archive is fetched with ``wget`` into it as
    ``<model_name>_model.tar.gz`` and then extracted there with ``tar``.

    Args:
        working_dir: Directory under which the download directory is created.
        model_name: Model name used to build the directory and archive names.
        google_drive_path: File id inserted into the ``docs.google.com``
            download URL.

    Returns:
        The string ``<download dir>/*_model``. This is a glob-style pattern,
        not a resolved path; it is returned unchanged from the original
        implementation so callers keep receiving the same value.
    """
    # Local import: only this function needs it.
    import shlex

    ##create a dir in the working_dir to store the model dir
    download_model_dir = working_dir + '/download_' + model_name + '_model_dir'
    if not os.path.exists(download_model_dir):
        os.makedirs(download_model_dir)

    archive_path = download_model_dir + '/' + model_name + '_model.tar.gz'

    # The inner wget fetches the confirmation token and saves the session
    # cookies; the outer wget reuses both to download the archive. The
    # command substitution requires a shell, so the command stays a string
    # and the user-influenced output path is quoted to survive spaces and
    # shell metacharacters.
    cmd = (
        "wget --load-cookies /tmp/cookies.txt "
        "\"https://docs.google.com/uc?export=download&confirm="
        "$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies "
        "--no-check-certificate "
        "'https://docs.google.com/uc?export=download&id=" + google_drive_path + "' -O- | "
        "sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\\1\\n/p')&id=" + google_drive_path + "\" "
        "-O " + shlex.quote(archive_path) + " && rm -rf /tmp/cookies.txt"
    )
    if subprocess.call(cmd, shell=True) != 0:
        # Keep going as before, but do not hide the failure from the user.
        print('Warning: downloading the ' + model_name + ' model returned a non-zero exit status')

    # tar needs no shell features, so pass an argument list and avoid quoting
    # problems altogether.
    tar_status = subprocess.call(
        ['tar', '-xvzf', archive_path, '-C', download_model_dir + '/'])
    if tar_status != 0:
        print('Warning: extracting ' + archive_path + ' returned a non-zero exit status')

    target_download_model_dir = download_model_dir + '/*_model'
    return (target_download_model_dir)
##file ids used in the docs.google.com download URL, keyed by model name
_MODEL_DRIVE_IDS = {
    'P': '1hHkW5P_3UgdhCDHOTgDSVrrssziFl0BC',
    'M': '1ExRwC3szJ4XMa3ikxM9Ccu31lY79rdw9',
    'F': '1uvnm99ypauIKtqCxoybdtT-mEMdoupip',
    'O': '1Q6HW1NhNs0a6Ykrw7jGEKKPWxawpWiuM',
    'U': '1uXTEtNQtJc2DO-JpT0s4Kv1k2ogUjCLr',
}


def _can_open_for_reading(path):
    """Return True when *path* can be opened for reading, closing it again."""
    try:
        with open(path, 'r'):
            return True
    except IOError:
        return False


def main(argv=None):
    """Run the DeepTE command-line workflow.

    The command line is parsed with ``get_parsed_args()``, the inputs are
    validated, the model directory is either downloaded (``-m``) or taken
    from ``-m_dir``, the input fasta is converted to the CNN input file and
    then one of the classification pipelines is executed.

    Args:
        argv: Accepted for backward compatibility only. It is not used; the
            arguments are always read by ``get_parsed_args()``.

    Returns:
        None. When validation fails a message is printed and the function
        returns early.
    """
    args = get_parsed_args()

    ######################################
    ##Check whether the files are provided
    if args.ipt_seq is None:
        print ('Cannot find input sequence, please provide the file!')
        return ##import to close the script if not find the te lib
    if not _can_open_for_reading(args.ipt_seq):
        print('There was an error opening the file!')
        return

    if args.domain_file is not None and not _can_open_for_reading(args.domain_file):
        print('There was an error opening the domain_file!')
        return

    ##exactly one of -m / -m_dir has to be given
    if args.model_name is not None and args.model_dir is not None:
        print('Please only provide model_name \'-m\' or model_dir \'-m_dir\'')
        return
    if args.model_name is None and args.model_dir is None:
        print('Cannot find model dir, please provide the dir in \'-m_dir\' or the model name in \'-m\'!')
        return

    ##set threshold (kept as a string, exactly as given on the command line)
    prop_thr = args.prop_thr if args.prop_thr is not None else '0.6'

    ###########################################
    ##create the working and output directories
    working_dir = args.working_dir
    if not working_dir.endswith('/'):
        working_dir = working_dir + '/'

    output_dir = args.output_dir
    if not output_dir.endswith('/'):
        output_dir = output_dir + '/'

    ##check download model directory
    if args.model_name is not None:
        model_name = args.model_name
        if model_name not in _MODEL_DRIVE_IDS:
            print("Please use one of 'P', 'M', 'F', 'O', 'U' to be model name")
            return
        ##download the requested model into the working_dir
        model_dir = download_model(working_dir, model_name, _MODEL_DRIVE_IDS[model_name])
    else:
        ##users want to use an already downloaded model
        ##(model_dir cannot be None here because of the check above)
        model_dir = args.model_dir

    #################################
    ##set the input seq and defaults
    ipt_seq = args.ipt_seq
    sp_type = args.sp_type
    te_fam = args.te_fam if args.te_fam is not None else 'All'

    ##generate temp output in the working_dir
    temp_store_opt_dir = working_dir + '/store_temp_opt_dir'
    if not os.path.exists(temp_store_opt_dir):
        os.makedirs(temp_store_opt_dir)

    ##########################################
    ##transfer fasta data to CNN data inputset
    print('Step1: transfer fasta data to CNN input data')
    final_format_dic = generate_dataset.change_format_for_ncc(ipt_seq)
    final_format_line_dic = generate_dataset.generate_target_line(final_format_dic)
    input_CNN_data_file = working_dir + '/opt_input_CNN_data.txt'
    with open(input_CNN_data_file, 'w+') as opt:
        for eachid in final_format_line_dic:
            opt.write(final_format_line_dic[str(eachid)] + '\n')

    ##if users call UNS model
    if args.yes is not None:
        print('Step2: classify unknown sequences')
        pipeline_uns.classify_pipeline(model_dir, input_CNN_data_file, temp_store_opt_dir,prop_thr)
        ##write out the name and fasta files
        combine_opt.generate_fasta_UNS(ipt_seq, temp_store_opt_dir, output_dir)
        return

    print('Step2: classify TEs')
    if args.domain_file is not None:
        ##If users call domain argument
        print('Step2: 1) domain information is exist')
        domain_file = args.domain_file
        te_domain_pattern_dic = pipeline_yes_m.store_domain_pattern_infor(domain_file)
        pipeline_yes_m.classify_pipeline(model_dir, input_CNN_data_file, temp_store_opt_dir, sp_type,te_domain_pattern_dic,te_fam,prop_thr)
    else:
        ##If users do not call domain argument
        print('Step2: 2) domain information is not exist')
        ##run the DeepTE_pipeline
        pipeline_no_m.classify_pipeline(model_dir, input_CNN_data_file, temp_store_opt_dir, sp_type,te_fam,prop_thr)

    ##write out final results
    print('Step3: generate final output')
    ##write out the name file
    combine_opt.extract_combine_infor(temp_store_opt_dir, output_dir,sp_type)
    ##write out the fasta file
    combine_opt.generate_fasta(ipt_seq, output_dir + '/opt_DeepTE.txt', output_dir)
##script entry point: report the start timestamp, run main, report the elapsed time
if __name__ == "__main__":
    began_at = time.time()
    print('start time is ' + str(began_at))
    main()
    elapsed_seconds = time.time() - began_at
    print("--- %s seconds ---" % elapsed_seconds)