/
getAnnoFastaFromJoingenes.py
executable file
·241 lines (225 loc) · 11.4 KB
/
getAnnoFastaFromJoingenes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
#!/usr/bin/env python3
# Author: Katharina J. Hoff
# E-Mail: katharina.hoff@uni-greifswald.de
# Last modified on February 15th 2024
#
# This Python script extracts CDS features from a GTF file, excises
# corresponding sequence windows from a genome FASTA file, stitches the
# codingseq parts together, adds letters N at the ends if bases are
# annotated as missing by frame in the GTF file, makes reverse complement
# if required, and translates to protein sequence.
# Output files are:
# * file with protein sequences in FASTA format,
# * file with coding squences in FASTA format
# The script automatically checks for in-frame stop codons and prints a
# warning to STDOUT if such genes are in the GTF-file. The IDs of bad genes
# are printed to a file bad_genes.lst. Option -s allows to exclude bad genes
# from the FASTA output file, automatically.
# Beware: The script assumes that the gtf input file is sorted by coordinates!
# Assumptions on AUGUSTUS-specific GTF format are made.
try:
import argparse
except ImportError:
raise ImportError(
'Failed to import argparse. Try installing with \"pip3 install argparse\"')
try:
import re
except ImportError:
raise ImportError(
'Failed to import argparse. Try installing with \"pip3 install re\"')
try:
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
except ImportError:
raise ImportError(
'Failed to import biophython modules. Try installing with \"pip3 install biopython\"')
parser = argparse.ArgumentParser(
description='Generate *.codingseq and *.aa FASTA-format files from genes \
in a GTF-file produced by AUGUSTUS auxprogs tool joingenes \
and a corresponding genomic FASTA-file.')
parser.add_argument('-g', '--genome', required=True,
type=str, help='genome sequence file (FASTA-format)')
parser.add_argument('-o', '--out', required=True, type=str,
help="name stem pf output file with coding sequences and \
protein sequences (FASTA-format); will be extended by \
.codingseq/.aa")
parser.add_argument('-t', '--table', dest='translation_table', default=1,
type=int, help='Translational code table number (INT)')
parser.add_argument('-s', '--filter_out_invalid_stops', dest='filter',
type=bool, help='Suppress output of protein sequences \
that contain internal stop codons.', default=False)
parser.add_argument('-p', '--print_format_examples', required=False, action='store_true',
help="Print gtf/gff3 input format examples, do not perform analysis")
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument('-f', '--gtf',
type=str, help='file with CDS coordinates (GTF-format)')
group.add_argument('-3', '--gff3',
type=str, help='file with CDS coordinates (GFF3 format)')
args = parser.parse_args()
if args.print_format_examples:
print('This script requires an annotation of protein coding genes with CDS ' +
'features either in GTF or GFF3 format. Since both formats are sometimes ' +
'inhomogeneous accross different tools, we here provide compatible ' +
'examples. ' +
'This script will only process the CDS lines. The input data may contain other feature lines that ' +
'are ignored.')
print('\nGTF format example:\n')
print('NW_018027262_1\tAUGUSTUS\tCDS\t347\t525\t0.5\t-\t0\ttranscript_id "g1.t1"; gene_id "g1";\n' +
'NW_018027262_1\tAUGUSTUS\tCDS\t1791\t2242\t0.59\t-\t2\ttranscript_id "g1.t1"; gene_id "g1";\n' +
'NW_018027262_1\tAUGUSTUS\tCDS\t2451\t2682\t0.24\t-\t0\ttranscript_id "g1.t1"; gene_id "g1";\n')
print('GFF3 format example:\n')
print('NW_018027262_1\tAUGUSTUS\tCDS\t347\t525\t0.5\t-\t0\tID=g1.t1.CDS1;Parent=g1.t1;\n' +
'NW_018027262_1\tAUGUSTUS\tCDS\t1791\t2242\t0.59\t-\t2\tID=g1.t1.CDS2;Parent=g1.t1;\n' +
'NW_018027262_1\tAUGUSTUS\tCDS\t2451\t2682\t0.24\t-\t0\tID=g1.t1.CDS3;Parent=g1.t1;\n')
print('\nThis script has successfully been tested with GTF format produced by BRAKER, and with' +
'the GFF3 format produced by gtf2gff3.pl (Augustus/scripts).')
exit(0)
# output file names:
codingseqFile = args.out + ".codingseq"
proteinFile = args.out + ".aa"
# Read GTF file CDS entries for transcripts
tx2seq = {}
tx2str = {}
cds = {}
if args.gtf:
try:
with open(args.gtf, "r") as gtf_handle:
for line in gtf_handle:
if re.match(
r"\S+\t[^\t]+\tCDS\t\d+\t\d+\t\S+\t\S+\t\d\t.*transcript_id (\S+)", line):
seq_id, st, en, stx, fr, tx_id = re.match(
r"(\S+)\t[^\t]+\tCDS\t(\d+)\t(\d+)\t\S+\t(\S+)\t(\d)\t.*transcript_id (\S+)", line).groups()
tx_id = re.sub(r'\"(\S+)\"', r'\1', tx_id)
tx_id = re.sub(r';', r'', tx_id)
if seq_id not in cds:
cds[seq_id] = {}
if tx_id not in cds[seq_id]:
cds[seq_id][tx_id] = []
cds[seq_id][tx_id].append(
{'start': int(st), 'end': int(en), 'strand': stx,
'frame': int(fr)})
if not tx_id in tx2seq:
tx2seq[tx_id] = seq_id
tx2str[tx_id] = stx
except IOError:
print("Error: Failed to open file " + args.gtf + "!")
exit(1)
elif args.gff3:
try:
with open(args.gff3, "r") as gff3_handle:
for line in gff3_handle:
if re.match(r"\S+\t\S+\tCDS\t\d+\t\d+\t\S+\t\S+\t\d\tID=[^;]+;Parent=[^;]+;", line):
seq_id, st, en, stx, fr, tx_id = re.match(
r"(\S+)\t\S+\tCDS\t(\d+)\t(\d+)\t\S+\t(\S+)\t(\d)\tID=[^;]+;Parent=([^;]+);", line).groups()
if seq_id not in cds:
cds[seq_id] = {}
if tx_id not in cds[seq_id]:
cds[seq_id][tx_id] = []
cds[seq_id][tx_id].append(
{'start': int(st), 'end': int(en), 'strand': stx,
'frame': int(fr)})
if not tx_id in tx2seq:
tx2seq[tx_id] = seq_id
tx2str[tx_id] = stx
except IOError:
print("Error: Failed to open file " + args.gtf + "!")
exit(1)
else:
print("Error: Neither annotation file in GTF, nor in GFF3 was provided!")
exit(1)
# Read genome file (single FASTA entries are held in memory, only), extract
# CDS sequence windows, add N when frame information states missing nucleotides
# Replace all unusual nucleotides in genome by N
seq_len = {}
codingseq = {}
regex = re.compile("[^ATCGatcgNn]");
try:
with open(args.genome, "r") as genome_handle:
for record in SeqIO.parse(genome_handle, "fasta"):
seq_len[record.id] = len(record.seq)
record.seq = re.sub(regex, r'N', str(record.seq))
if record.id in cds:
for tx in cds[record.id]:
if tx not in codingseq:
codingseq[tx] = SeqRecord(
Seq(""), id=tx, description=tx)
nCDS = len(cds[record.id][tx])
for i in range(0, nCDS):
cds_line = cds[record.id][tx][i]
if i == 0 and cds_line['strand'] == '+' and cds_line['frame'] != 0:
codingseq[tx].seq += Seq((3 - cds_line['frame'])
* 'N')
codingseq[tx].seq += Seq(record.seq[cds_line['start'] -
1:cds_line['end']])
if i == (nCDS - 1):
if cds_line['strand'] == '+':
if (len(codingseq[tx].seq) % 3) != 0:
codingseq[tx].seq += Seq(
(3 - (len(codingseq[tx]) % 3)) * 'N')
if i == (nCDS - 1) and cds_line['strand'] == '-':
if cds_line['frame'] != 0:
codingseq[tx].seq += Seq((3 - cds_line['frame'])
* 'N')
if(len(codingseq[tx].seq) % 3) != 0:
codingseq[tx].seq = Seq(
(3 - (len(codingseq[tx].seq) % 3)) * 'N') + codingseq[tx].seq
codingseq[tx].seq = codingseq[tx].seq.reverse_complement()
except IOError:
print("Error: Failed to open file " + args.genome + "!")
exit(1)
# Print coding sequences to file
try:
with open(codingseqFile, "w") as codingseq_handle:
for tx_id, seq_rec in codingseq.items():
SeqIO.write(seq_rec, codingseq_handle, "fasta")
except IOError:
print("Error: Failed to open file " + codingseqFile + "!")
exit(1)
# Translate coding sequences, identify sequences with in-frame stop codons,
# print proteins in FASTA format
bad_tx = {}
try:
with open(proteinFile, "w") as protein_handle:
for tx_id, seq_rec in codingseq.items():
is_good = True
seq_rec.seq = seq_rec.seq.translate(table=args.translation_table)
match = re.search(r"(\*)(\w|\*)", str(seq_rec.seq))
if match:
# 1-based amino acid index of stop codon
bad_tx[tx_id] = match.start(0) + 1
if ((args.filter == True) and (tx_id not in bad_tx)) or args.filter == False:
SeqIO.write(seq_rec, protein_handle, "fasta")
except IOError:
print("Error: Failed to open file " + proteinFile + "!")
exit(1)
# Print IDs of genes with in-frame stop codons if any were found
if len(bad_tx) > 0:
try:
with open("bad_genes.lst", "w") as bad_handle:
bad_handle.write(
"# tx_id\tstop_in_amino_acid\tstop_in_mRNA_start\tstop_in_mRNA_end\tgenomic_transcript_start\tgenomic_transcript_end\tstrand\n")
for tx in bad_tx:
bad_handle.write(tx + "\t" + str(bad_tx[tx]) + "\t" + str(
bad_tx[tx] * 3 - 2) + "\t" + str(bad_tx[tx] * 3) + "\t")
left_most = -1
right_most = -1
for i in range(0, len(cds[tx2seq[tx]][tx])):
if (left_most == -1) or (cds[tx2seq[tx]][tx][i]['start'] <= left_most):
left_most = cds[tx2seq[tx]][tx][i]['start']
if (right_most == -1) or (cds[tx2seq[tx]][tx][i]['end'] >= right_most):
right_most = cds[tx2seq[tx]][tx][i]['end']
bad_handle.write(str(left_most) + '\t' + str(right_most) + '\t' + tx2str[tx] +
'\n')
except IOError:
print("Error: Failed to open file bad_genes.lst!")
exit(1)
print("WARNING: The GTF file contained " + str(len(bad_tx)) + " gene(s) " +
"with internal Stop codons (see file bad_genes.lst).")
if args.filter:
print("Note: Translations of the bad genes have already been excluded " +
"from the output file " + args.gtf + ".")
else:
print("Note: Translations of the bad genes have been included in the " +
"output file. If you wish to exclude them, run " +
"getAnnoFastaFromJoingenes.py with option --filter!")