Skip to content

Commit

Permalink
add new script agat_sq_rename_seqid.pl. Fix #300 (#305)
Browse files Browse the repository at this point in the history
  • Loading branch information
Juke34 committed Nov 24, 2022
1 parent e5d3eb6 commit e39b239
Show file tree
Hide file tree
Showing 6 changed files with 321 additions and 7 deletions.
14 changes: 7 additions & 7 deletions bin/agat_sq_add_attributes_from_tsv.pl
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,12 @@
my $opt_help = 0;

Getopt::Long::Configure ('bundling');
if ( !GetOptions ('gff=s' => \$input_gff,
'o|output=s' => \$outputFile,
'tsv=s' => \$input_tsv,
'csv!' => \$csv,
'v|verbose!' => \$verbose,
'h|help!' => \$opt_help ) )
if ( !GetOptions ( 'gff=s' => \$input_gff,
'o|output=s' => \$outputFile,
'tsv=s' => \$input_tsv,
'csv!' => \$csv,
'v|verbose!' => \$verbose,
'h|help!' => \$opt_help ) )
{
pod2usage( { -message => 'Failed to parse command line',
-verbose => 1,
Expand Down Expand Up @@ -57,7 +57,7 @@
# Manage tsv input
open(INPUT, "<", $input_tsv) or die ("$!\n");

# Open Mfannot file for reading
# Open tsv file for reading
my $line = 0;
my %tsv;
my %header;
Expand Down
172 changes: 172 additions & 0 deletions bin/agat_sq_rename_seqid.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
#!/usr/bin/env perl

use strict;
use warnings;
use Carp;
use warnings;
use Pod::Usage;
use Getopt::Long;
use IO::File ;
use Bio::SeqIO;
use AGAT::AGAT;

# Banner and configuration come from AGAT; the start time is kept for the
# run-time report printed at the end of the script.
my $header    = get_agat_header();
my $config    = get_agat_config();
my $start_run = time();

# Holders for the command line options.
my ( $input_gff, $input_tsv, $outputFile, $verbose, $csv );
my $opt_help = 0;

# Parse the command line; short options may be bundled.
Getopt::Long::Configure('bundling');
my $options_parsed = GetOptions(
    'gff=s'      => \$input_gff,
    'o|output=s' => \$outputFile,
    'tsv=s'      => \$input_tsv,
    'csv!'       => \$csv,
    'v|verbose!' => \$verbose,
    'h|help!'    => \$opt_help,
);

# Unknown or malformed option: show the usage and exit with an error status.
unless ($options_parsed) {
    pod2usage(
        {
            -message => 'Failed to parse command line',
            -verbose => 1,
            -exitval => 1,
        }
    );
}

# Help requested: print the AGAT header followed by the POD, exit status 0.
pod2usage(
    {
        -verbose => 99,
        -exitval => 0,
        -message => "$header\n",
    }
) if $opt_help;

# Both the GFF file and the renaming table are mandatory.
unless ( $input_gff and $input_tsv ) {
    pod2usage(
        {
            -message => "$header\nAt least 2 input file are mandatory:\n".
                        "--gff input.gff\n--tsv input.tsv",
            -verbose => 0,
            -exitval => 1,
        }
    );
}

# Output writer, built by the AGAT helper from the configuration and the
# optional -o/--output value.
my $gffout = prepare_gffout( $config, $outputFile );

# GFF parser: use the version set in the configuration when there is one,
# otherwise let select_gff_format() choose it from the input file.
my $format = $config->{gff_output_version} || select_gff_format($input_gff);
my $gff_in = Bio::Tools::GFF->new(
    -file        => $input_gff,
    -gff_version => $format,
);

# Manage tsv input
# Load the renaming table into %tsv (original seqid => new seqid).
# Each line holds two values: the original sequence identifier and the new
# one. They are separated by a tabulation, or by a comma when --csv is set.
open( my $tsv_fh, '<', $input_tsv )
  or die "Cannot open renaming table '$input_tsv': $!\n";

my %tsv;
my $separator = $csv ? qr/,/ : qr/\t/;

while ( my $line = <$tsv_fh> ) {
    chomp $line;

    $line =~ s/^\s+//;    # removing leading spaces
    $line =~ s/\s+$//;    # removing trailing spaces (also drops a Windows "\r")

    # An empty line carries no renaming information.
    next if $line eq '';

    # split line; any column after the second one is ignored
    my ( $original_id, $new_id ) = split $separator, $line;

    # A line lacking one of the two identifiers would register an undefined
    # or empty replacement and blank the seqid of every matching feature,
    # so it is reported and skipped instead.
    if (   !defined $original_id
        or $original_id eq ''
        or !defined $new_id
        or $new_id eq '' )
    {
        warn "Skipping malformed line $. in $input_tsv: '$line'\n";
        next;
    }

    $tsv{$original_id} = $new_id;
}
close($tsv_fh);

# Stream the annotation one feature at a time: a feature whose seqid is
# listed in the renaming table gets the new identifier, every feature
# (renamed or not) is written to the output.
while ( my $feature = $gff_in->next_feature() ) {
    my $current_id = $feature->seq_id();

    $feature->seq_id( $tsv{$current_id} )
      if exists_keys( \%tsv, ($current_id) );

    $gffout->write_feature($feature);
}

# Report the elapsed wall-clock time.
my $run_time = time() - $start_run;
print "Job done in $run_time seconds\n";

__END__
=head1 NAME
agat_sq_rename_seqid.pl
=head1 DESCRIPTION
The script aims to modify the seqid (1st column) of a GFF/GTF file efficiently.
Indeed, when the number of chromosomes or scaffolds is large,
replacement using e.g. the sed command can be time-consuming.
You must provide a file (tsv or csv) without header and with
one renaming entry per line: the first value is the original sequence identifier (1st column of the GFF/GTF file),
the second value is the new sequence identifier to use.
=head1 SYNOPSIS
agat_sq_rename_seqid.pl --gff input.gff --tsv input.tsv [ -o output.gff3 ]
agat_sq_rename_seqid.pl --help
=head1 OPTIONS
=over 8
=item B<--gff>
STRING: Input GTF/GFF file.
=item B<--tsv>
STRING: Input tsv file
=item B<--csv>
BOOLEAN: Inform the script that the tsv input file is actually a csv (comma-separated).
=item B<-v> or B<--verbose>
BOOLEAN: Add verbosity
=item B<-o> or B<--output>
STRING: Output file. If no output file is specified, the output will be written
to STDOUT. The result is in tabulate format.
=item B<--help> or B<-h>
Display this helpful text.
=back
=head1 FEEDBACK
=head2 Did you find a bug?
Do not hesitate to report bugs to help us keep track of the bugs and their
resolution. Please use the GitHub issue tracking system available at this
address:
https://github.com/NBISweden/AGAT/issues
Ensure that the bug was not already reported by searching under Issues.
If you're unable to find an (open) issue addressing the problem, open a new one.
Try as much as possible to include in the issue when relevant:
- a clear description,
- as much relevant information as possible,
- the command used,
- a data sample,
- an explanation of the expected behaviour that is not occurring.
=head2 Do you want to contribute?
You are very welcome, visit this address for the Contributing guidelines:
https://github.com/NBISweden/AGAT/blob/master/CONTRIBUTING.md
=cut
AUTHOR - Jacques Dainat
9 changes: 9 additions & 0 deletions t/scripts_output.t
Original file line number Diff line number Diff line change
Expand Up @@ -699,6 +699,15 @@ system(" $script --gff $input_folder/1.gff -o $outtmp 2>&1 1>/dev/null");
ok( system("diff $result $outtmp") == 0, "output $script");
unlink $outtmp;

# --------check agat_sq_rename_seqid.pl-------------
# Runs the renaming script on the dedicated GFF / renaming-table fixtures and
# compares what it wrote with the stored reference output.

# Script under test and the reference file its output is compared against.
$script = $script_prefix."bin/agat_sq_rename_seqid.pl";
$result = "$output_folder/agat_sq_rename_seqid_1.gff";
# Write the result to the temporary file; the script's stdout is discarded
# (1>/dev/null) and its stderr is redirected to the test's stdout (2>&1).
system(" $script --gff $input_folder/agat_sq_rename_seqid/rename_seqid.gff --tsv $input_folder/agat_sq_rename_seqid/rename_table.tsv -o $outtmp 2>&1 1>/dev/null");
# Run the test: it passes when diff exits with 0, i.e. the generated file is
# identical to the reference.
ok( system("diff $result $outtmp") == 0, "output $script");
# Remove the temporary output file.
unlink $outtmp;

# --------check agat_sq_repeats_analyzer.pl-------------

# XXX
Expand Down
70 changes: 70 additions & 0 deletions t/scripts_output/in/agat_sq_rename_seqid/rename_seqid.gff
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
##gff-version 3
##sequence-region 1 1 43270923
#!genome-build RAP-DB seq1-1.0
#!genome-version seq1-1.0
#!genome-date 2015-10
#!genome-build-accession GCA_001433935.1
contig1 RAP-DB chromosome 1 43270923 . . . ID=chromosome:1;Alias=Chr1,AP014957.1,NC_029256.1
###
seq1 toolX repeat_region 2000 2100 . + . ID=fakeRepeat1
###
seq1 toolX gene 11218 12435 . + . ID=gene:Os01g0100200;biotype=protein_coding;description=Conserved hypothetical protein. (Os01t0100200-01);gene_id=Os01g0100200;logic_name=seq1v1.0-20170804-genes
seq1 toolX mRNA 11218 12435 . + . ID=transcript:Os01t0100200-01;Parent=gene:Os01g0100200;biotype=protein_coding;transcript_id=Os01t0100200-01
seq1 toolX five_prime_UTR 11218 11797 . + . Parent=transcript:Os01t0100200-01
seq1 toolX exon 11218 12060 . + . Parent=transcript:Os01t0100200-01;Name=Os01t0100200-01.exon1;constitutive=1;ensembl_end_phase=2;ensembl_phase=-1;exon_id=Os01t0100200-01.exon1;rank=1
seq1 toolX CDS 11798 12060 . + 0 ID=CDS:Os01t0100200-01;Parent=transcript:Os01t0100200-01;protein_id=Os01t0100200-01
seq1 toolX CDS 12152 12317 . + 1 ID=CDS:Os01t0100200-01;Parent=transcript:Os01t0100200-01;protein_id=Os01t0100200-01
seq1 toolX exon 12152 12435 . + . Parent=transcript:Os01t0100200-01;Name=Os01t0100200-01.exon2;constitutive=1;ensembl_end_phase=-1;ensembl_phase=2;exon_id=Os01t0100200-01.exon2;rank=2
seq1 toolX three_prime_UTR 12318 12435 . + . Parent=transcript:Os01t0100200-01
###
seq2 toolX gene 11372 12284 . - . ID=gene:Os01g0100300;biotype=protein_coding;description=Cytochrome P450 domain containing protein. (Os01t0100300-00);gene_id=Os01g0100300;logic_name=seq2v1.0-20170804-genes
seq2 toolX mRNA 11372 12284 . - . ID=transcript:Os01t0100300-00;Parent=gene:Os01g0100300;biotype=protein_coding;transcript_id=Os01t0100300-00
seq2 toolX exon 11372 12042 . - . Parent=transcript:Os01t0100300-00;Name=Os01t0100300-00.exon2;constitutive=1;ensembl_end_phase=0;ensembl_phase=1;exon_id=Os01t0100300-00.exon2;rank=2
seq2 toolX CDS 11372 12042 . - 2 ID=CDS:Os01t0100300-00;Parent=transcript:Os01t0100300-00;protein_id=Os01t0100300-00
seq2 toolX exon 12146 12284 . - . Parent=transcript:Os01t0100300-00;Name=Os01t0100300-00.exon1;constitutive=1;ensembl_end_phase=1;ensembl_phase=0;exon_id=Os01t0100300-00.exon1;rank=1
seq2 toolX CDS 12146 12284 . - 0 ID=CDS:Os01t0100300-00;Parent=transcript:Os01t0100300-00;protein_id=Os01t0100300-00
###
seq2 toolX gene 12721 15685 . + . ID=gene:Os01g0100400;biotype=protein_coding;description=Similar to Pectinesterase-like protein. (Os01t0100400-01);gene_id=Os01g0100400;logic_name=seq2v1.0-20170804-genes
seq2 toolX mRNA 12721 15685 . + . ID=transcript:Os01t0100400-01;Parent=gene:Os01g0100400;biotype=protein_coding;transcript_id=Os01t0100400-01
seq2 toolX five_prime_UTR 12721 12773 . + . Parent=transcript:Os01t0100400-01
seq2 toolX exon 12721 13813 . + . Parent=transcript:Os01t0100400-01;Name=Os01t0100400-01.exon1;constitutive=1;ensembl_end_phase=2;ensembl_phase=-1;exon_id=Os01t0100400-01.exon1;rank=1
seq2 toolX CDS 12774 13813 . + 0 ID=CDS:Os01t0100400-01;Parent=transcript:Os01t0100400-01;protein_id=Os01t0100400-01
seq2 toolX exon 13906 14271 . + . Parent=transcript:Os01t0100400-01;Name=Os01t0100400-01.exon2;constitutive=1;ensembl_end_phase=2;ensembl_phase=2;exon_id=Os01t0100400-01.exon2;rank=2
seq2 toolX CDS 13906 14271 . + 1 ID=CDS:Os01t0100400-01;Parent=transcript:Os01t0100400-01;protein_id=Os01t0100400-01
seq2 toolX exon 14359 14437 . + . Parent=transcript:Os01t0100400-01;Name=Os01t0100400-01.exon3;constitutive=1;ensembl_end_phase=0;ensembl_phase=2;exon_id=Os01t0100400-01.exon3;rank=3
seq2 toolX CDS 14359 14437 . + 1 ID=CDS:Os01t0100400-01;Parent=transcript:Os01t0100400-01;protein_id=Os01t0100400-01
seq2 toolX exon 14969 15171 . + . Parent=transcript:Os01t0100400-01;Name=Os01t0100400-01.exon4;constitutive=1;ensembl_end_phase=2;ensembl_phase=0;exon_id=Os01t0100400-01.exon4;rank=4
seq2 toolX CDS 14969 15171 . + 0 ID=CDS:Os01t0100400-01;Parent=transcript:Os01t0100400-01;protein_id=Os01t0100400-01
seq2 toolX CDS 15266 15359 . + 1 ID=CDS:Os01t0100400-01;Parent=transcript:Os01t0100400-01;protein_id=Os01t0100400-01
seq2 toolX exon 15266 15685 . + . Parent=transcript:Os01t0100400-01;Name=Os01t0100400-01.exon5;constitutive=1;ensembl_end_phase=-1;ensembl_phase=2;exon_id=Os01t0100400-01.exon5;rank=5
seq2 toolX three_prime_UTR 15360 15685 . + . Parent=transcript:Os01t0100400-01
###
seq3 toolX gene 12808 13978 . - . ID=gene:Os01g0100466;biotype=protein_coding;description=Hypothetical protein. (Os01t0100466-00);gene_id=Os01g0100466;logic_name=seq3v1.0-20170804-genes
seq3 toolX mRNA 12808 13978 . - . ID=transcript:Os01t0100466-00;Parent=gene:Os01g0100466;biotype=protein_coding;transcript_id=Os01t0100466-00
seq3 toolX three_prime_UTR 12808 12868 . - . Parent=transcript:Os01t0100466-00
seq3 toolX exon 12808 13782 . - . Parent=transcript:Os01t0100466-00;Name=Os01t0100466-00.exon2;constitutive=1;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=Os01t0100466-00.exon2;rank=2
seq3 toolX CDS 12869 13102 . - 0 ID=CDS:Os01t0100466-00;Parent=transcript:Os01t0100466-00;protein_id=Os01t0100466-00
seq3 toolX five_prime_UTR 13103 13782 . - . Parent=transcript:Os01t0100466-00
seq3 toolX exon 13880 13978 . - . Parent=transcript:Os01t0100466-00;Name=Os01t0100466-00.exon1;constitutive=1;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=Os01t0100466-00.exon1;rank=1
seq3 toolX five_prime_UTR 13880 13978 . - . Parent=transcript:Os01t0100466-00
###
seq3 toolX gene 16399 20144 . + . ID=gene:Os01g0100500;biotype=protein_coding;description=Immunoglobulin-like domain containing protein. (Os01t0100500-01);gene_id=Os01g0100500;logic_name=seq3v1.0-20170804-genes
seq3 toolX mRNA 16399 20144 . + . ID=transcript:Os01t0100500-01;Parent=gene:Os01g0100500;biotype=protein_coding;transcript_id=Os01t0100500-01
seq3 toolX five_prime_UTR 16399 16598 . + . Parent=transcript:Os01t0100500-01
seq3 toolX exon 16399 16976 . + . Parent=transcript:Os01t0100500-01;Name=Os01t0100500-01.exon1;constitutive=1;ensembl_end_phase=0;ensembl_phase=-1;exon_id=Os01t0100500-01.exon1;rank=1
seq3 toolX CDS 16599 16976 . + 0 ID=CDS:Os01t0100500-01;Parent=transcript:Os01t0100500-01;protein_id=Os01t0100500-01
seq3 toolX exon 17383 17474 . + . Parent=transcript:Os01t0100500-01;Name=Os01t0100500-01.exon2;constitutive=1;ensembl_end_phase=2;ensembl_phase=0;exon_id=Os01t0100500-01.exon2;rank=2
seq3 toolX CDS 17383 17474 . + 0 ID=CDS:Os01t0100500-01;Parent=transcript:Os01t0100500-01;protein_id=Os01t0100500-01
seq3 toolX exon 17558 18258 . + . Parent=transcript:Os01t0100500-01;Name=Os01t0100500-01.exon3;constitutive=1;ensembl_end_phase=1;ensembl_phase=2;exon_id=Os01t0100500-01.exon3;rank=3
seq3 toolX CDS 17558 18258 . + 1 ID=CDS:Os01t0100500-01;Parent=transcript:Os01t0100500-01;protein_id=Os01t0100500-01
seq3 toolX exon 18501 18571 . + . Parent=transcript:Os01t0100500-01;Name=Os01t0100500-01.exon4;constitutive=1;ensembl_end_phase=0;ensembl_phase=1;exon_id=Os01t0100500-01.exon4;rank=4
seq3 toolX CDS 18501 18571 . + 2 ID=CDS:Os01t0100500-01;Parent=transcript:Os01t0100500-01;protein_id=Os01t0100500-01
seq3 toolX exon 18968 19057 . + . Parent=transcript:Os01t0100500-01;Name=Os01t0100500-01.exon5;constitutive=1;ensembl_end_phase=0;ensembl_phase=0;exon_id=Os01t0100500-01.exon5;rank=5
seq3 toolX CDS 18968 19057 . + 0 ID=CDS:Os01t0100500-01;Parent=transcript:Os01t0100500-01;protein_id=Os01t0100500-01
seq3 toolX exon 19142 19321 . + . Parent=transcript:Os01t0100500-01;Name=Os01t0100500-01.exon6;constitutive=1;ensembl_end_phase=0;ensembl_phase=0;exon_id=Os01t0100500-01.exon6;rank=6
seq3 toolX CDS 19142 19321 . + 0 ID=CDS:Os01t0100500-01;Parent=transcript:Os01t0100500-01;protein_id=Os01t0100500-01
seq3 toolX CDS 19531 19593 . + 0 ID=CDS:Os01t0100500-01;Parent=transcript:Os01t0100500-01;protein_id=Os01t0100500-01
seq3 toolX exon 19531 19629 . + . Parent=transcript:Os01t0100500-01;Name=Os01t0100500-01.exon7;constitutive=1;ensembl_end_phase=-1;ensembl_phase=0;exon_id=Os01t0100500-01.exon7;rank=7
seq3 toolX three_prime_UTR 19594 19629 . + . Parent=transcript:Os01t0100500-01
seq3 toolX exon 19734 20144 . + . Parent=transcript:Os01t0100500-01;Name=Os01t0100500-01.exon8;constitutive=1;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=Os01t0100500-01.exon8;rank=8
seq3 toolX three_prime_UTR 19734 20144 . + . Parent=transcript:Os01t0100500-01
4 changes: 4 additions & 0 deletions t/scripts_output/in/agat_sq_rename_seqid/rename_table.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
seq1 seqA
seq2 seqB
seq3 seqC
seq4 seqD

0 comments on commit e39b239

Please sign in to comment.