-
Notifications
You must be signed in to change notification settings - Fork 52
/
agat_convert_genscan2gff.pl
executable file
·347 lines (268 loc) · 10.2 KB
/
agat_convert_genscan2gff.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
#!/usr/bin/env perl
use strict;
use warnings;
use Clone 'clone';
use Pod::Usage;
use Getopt::Long;
use Bio::Tools::Genscan;
use AGAT::AGAT;
my $header = get_agat_header();
my $config;
my $outfile = undef;
my $genscan = undef;
my $seq_id = "unknown";
my @list_exon_types = ("Init", "Intr", "Term", "Sngl");
my %hash_raw;
my %hash_uniqID;
my $verbose = undef;
my $help;
if( !GetOptions( 'c|config=s' => \$config,
"h|help" => \$help,
"g|genscan=s" => \$genscan,
"seqid=s" => \$seq_id,
"verbose|v!" => \$verbose,
"outfile|output|o|out|gff=s" => \$outfile ) )
{
pod2usage( { -message => "Failed to parse command line.\n",
-verbose => 1,
-exitval => 1 } );
}
# Print Help and exit
if ($help) {
pod2usage( {-message => "$header\n",
-verbose => 99,
-exitval => 0 } );
}
if ( ! (defined($genscan)) ){
pod2usage( {
-message => "$header\nAt least 1 parameter is mandatory:\nInput genscan file (--genscan).\n\n",
-verbose => 0,
-exitval => 1 } );
}
# --- Manage config ---
$config = get_agat_config({config_file_in => $config});
## Manage output file
my $gffout = prepare_gffout($config, $outfile);
## MAIN ##############################################################
read_genscan($genscan);
#use Data::Dumper; print Dumper (\%hash_raw);
convert_genscan();
## SUBROUTINES #######################################################
sub read_genscan {
open(INPUT, "<", $genscan) or die ("$!\n");
# Open Mfannot file for reading
while (<INPUT>) {
chomp;
$_=~ s/^\s+//; #removing leading spaces
$_=~ s/\s+$//; #removing leading spaces
if ($_ =~ /^\d/) {
my @splitline = split /\s+/, $_;
my @field1 = split /\./, $splitline[0];
$hash_raw{$field1[0]}{$splitline[1]}{$field1[1]} = $_;
}
}
close(INPUT);
}
sub convert_genscan{
my @list_exons;
foreach my $genscan_id ( sort { $a <=> $b } keys %hash_raw){
my $gene_id = create_uniq_id("gene");
my $mrna_id = create_uniq_id("mrna");
@list_exons=();
foreach my $tag ( keys %{$hash_raw{$genscan_id} } ){
# first create EXONS
#print $tag."\n";
if ( grep( /^$tag/, @list_exon_types ) ){
foreach my $nb (keys %{$hash_raw{$genscan_id}{$tag} } ){
my @splitline = split /\s+/, $hash_raw{$genscan_id}{$tag}{$nb};
my $start=$splitline[3];
my $end=$splitline[4];
if($start > $end){
my $tmp_start = $start;
$start = $end;
$end = $tmp_start;
}
# phase and frame from genscan cannot be used
my $strand=$splitline[2];
my $id = "fake";
my $score = $splitline[$#splitline];
my $feature = Bio::SeqFeature::Generic->new(-seq_id => $seq_id,
-source_tag => "genscan",
-primary_tag => "exon",
-start => $start,
-end => $end ,
-frame => ".",
-strand =>$strand,
-score =>$score,
- tag => {'ID' => $id, 'Parent' => $mrna_id}
) ;
push @list_exons, $feature;
}
}
else{
#Prom = Promoter (TATA box / initation site)
#PlyA = poly-A signal (consensus: AATAAA)
}
}
@list_exons = sort {$a->start <=> $b->start} @list_exons;
if (@list_exons) {
# CREATE GENE
my $gene = clone($list_exons[0]);
create_or_replace_tag($gene, 'ID', $gene_id);
$gene->remove_tag('Parent');
$gene->primary_tag('gene');
$gene->end($list_exons[$#list_exons]->end());
# CREATE MRNA
my $mrna = clone($gene);
create_or_replace_tag($mrna, 'Parent', $gene_id);
create_or_replace_tag($mrna, 'ID', $mrna_id);
$mrna->primary_tag('mRNA');
#print gene
$gffout->write_feature($gene);
#print mrna
$gffout->write_feature($mrna);
#print exons
my $strand = $list_exons[0]->strand;
foreach my $exon (@list_exons){
my $ordered_exon_id = create_uniq_id("exon");
create_or_replace_tag($exon, 'ID', $ordered_exon_id);
$gffout->write_feature($exon);
}
#-------------- print cds --------------
my @cds;
foreach my $exon (@list_exons){
my $cds_id = create_uniq_id("cds");
my $cds = clone($exon);
create_or_replace_tag($cds, 'ID', $cds_id);
$cds->primary_tag('CDS');
push @cds, $cds;
}
if(($strand eq "+") or ($strand eq "1")){
@cds=sort {$a->start <=> $b->start} @cds;
}else{
@cds=sort {$b->start <=> $a->start} @cds;
}
# compute the phase. Assuming it always start at 0. No fragmented prediction
my $phase = 0;
foreach my $cds_feature ( @cds) {
$cds_feature->frame($phase);
my $cds_length=$cds_feature->end-$cds_feature->start +1;
$phase=(3-(($cds_length-$phase)%3))%3; #second modulo allows to avoid the frame with 3. Instead we have 0.
}
foreach my $cds ( sort {$a->start <=> $b->start} @cds){
$gffout->write_feature($cds);
}
#-------------- END print cds --------------
}
}
}
sub create_uniq_id{
my ( $tag ) = @_;
my $uniqID;
if(! exists_keys(\%hash_uniqID,($tag) ) ){
$uniqID=$tag."_1";
$hash_uniqID{$tag}=1;
}
else{
$hash_uniqID{$tag}++;
$uniqID=$tag."_".$hash_uniqID{$tag};
}
return $uniqID;
}
__END__
genscan format Explanation
Gn.Ex : gene number, exon number (for reference)
Type : Init = Initial exon (ATG to 5' splice site)
Intr = Internal exon (3' splice site to 5' splice site)
Term = Terminal exon (3' splice site to stop codon)
Sngl = Single-exon gene (ATG to stop)
Prom = Promoter (TATA box / initation site)
PlyA = poly-A signal (consensus: AATAAA)
S : DNA strand (+ = input strand; - = opposite strand)
Begin : beginning of exon or signal (numbered on input strand)
End : end point of exon or signal (numbered on input strand)
Len : length of exon or signal (bp)
Fr : reading frame (a forward strand codon ending at x has frame x mod 3)
Ph : net phase of exon (exon length modulo 3)
I/Ac : initiation signal or 3' splice site score (tenth bit units)
Do/T : 5' splice site or termination signal score (tenth bit units)
CodRg : coding region score (tenth bit units)
P : probability of exon (sum over all parses containing exon)
Tscr : exon score (depends on length, I/Ac, Do/T and CodRg scores)
Comments
The SCORE of a predicted feature (e.g., exon or splice site) is a
log-odds measure of the quality of the feature based on local sequence
properties. For example, a predicted 5' splice site with
score > 100 is strong; 50-100 is moderate; 0-50 is weak; and
below 0 is poor (more than likely not a real donor site).
The PROBABILITY of a predicted exon is the estimated probability under
GENSCAN's model of genomic sequence structure that the exon is correct.
This probability depends in general on global as well as local sequence
properties, e.g., it depends on how well the exon fits with neighboring
exons. It has been shown that predicted exons with higher probabilities
are more likely to be correct than those with lower probabilities.
=head1 NAME
agat_convert_genscan2gff.pl
=head1 DESCRIPTION
The script takes a genscan file as input, and will translate it in gff format.
The genscan format is described here: http://genome.crg.es/courses/Bioinformatics2003_genefinding/results/genscan.html
/!\ vvv Known problem vvv /!\
You must have submited only DNA sequence, wihtout any header!!
Indeed the tool expects only DNA sequences and does not crash/warn if an header
is submited along the sequence.
e.g If you have an header ">seq" s-e-q are seen as the 3 first nucleotides of the sequence.
Then all prediction location are shifted accordingly.
(checked only on the online version http://argonaute.mit.edu/GENSCAN.html. I don't
know if there is the same pronlem elsewhere.)
/!\ ^^^ Known problem ^^^^ /!\
=head1 SYNOPSIS
agat_convert_genscan2gff.pl --genscan infile.bed [ -o outfile ]
agat_convert_genscan2gff.pl -h
=head1 OPTIONS
=over 8
=item B<--genscan> or B<-g>
Input genscan bed file that will be convert.
=item B<--source>
The source informs about the tool used to produce the data and is stored in 2nd field of a gff file.
Example: Stringtie,Maker,Augustus,etc. [default: data]
=item B<--primary_tag>
The primary_tag corresponf to the data type and is stored in 3rd field of a gff file.
Example: gene,mRNA,CDS,etc. [default: gene]
=item B<--inflate_off>
By default we inflate the block fields (blockCount, blockSizes, blockStarts) to create subfeatures
of the main feature (primary_tag). Type of subfeature created based on the
inflate_type parameter. If you don't want this inflating behaviour you can deactivate it
by using the option --inflate_off.
=item B<--inflate_type>
Feature type (3rd column in gff) created when inflate parameter activated [default: exon].
=item B<--verbose>
add verbosity
=item B<-o> , B<--output> , B<--out> , B<--outfile> or B<--gff>
Output GFF file. If no output file is specified, the output will be
written to STDOUT.
=item B<-c> or B<--config>
String - Input agat config file. By default AGAT takes as input agat_config.yaml file from the working directory if any,
otherwise it takes the orignal agat_config.yaml shipped with AGAT. To get the agat_config.yaml locally type: "agat config --expose".
The --config option gives you the possibility to use your own AGAT config file (located elsewhere or named differently).
=item B<-h> or B<--help>
Display this helpful text.
=back
=head1 FEEDBACK
=head2 Did you find a bug?
Do not hesitate to report bugs to help us keep track of the bugs and their
resolution. Please use the GitHub issue tracking system available at this
address:
https://github.com/NBISweden/AGAT/issues
Ensure that the bug was not already reported by searching under Issues.
If you're unable to find an (open) issue addressing the problem, open a new one.
Try as much as possible to include in the issue when relevant:
- a clear description,
- as much relevant information as possible,
- the command used,
- a data sample,
- an explanation of the expected behaviour that is not occurring.
=head2 Do you want to contribute?
You are very welcome, visit this address for the Contributing guidelines:
https://github.com/NBISweden/AGAT/blob/master/CONTRIBUTING.md
=cut
AUTHOR - Jacques Dainat