Skip to content

Commit

Permalink
workflow added in toolshed
Browse files Browse the repository at this point in the history
  • Loading branch information
lee212 committed Feb 15, 2016
1 parent c6b49d0 commit ee791fc
Show file tree
Hide file tree
Showing 2 changed files with 302 additions and 0 deletions.
183 changes: 183 additions & 0 deletions galaxy-toolshed/packages/package_mgescan_3/mgescan.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
#!/bin/bash
# Galaxy wrapper that stages an input dataset, runs the MGEScan driver
# ($MGESCAN_SRC/mgescan/cmd.py) and copies its results to the output paths
# given on the command line.
#
# Example invocation (LTR mode):
# mgescan.sh $input $input.name 3 $output L None None None $ltr_gff3 None None $sw_rm "$scaffold" $min_dist $max_dist $min_len_ltr $max_len_ltr $ltr_sim_condition $cluster_sim_condition $len_condition $repeatmasker
#
# Environment:
#   MGESCAN_SRC  root of the MGEScan source tree (required)
if [ -z "$MGESCAN_SRC" ]
then
  # Report on stderr and fail explicitly so the caller sees a real error.
  echo "\$MGESCAN_SRC is not defined." >&2
  exit 1
fi

# Interpreter and entry point of the MGEScan driver.
# `command -v` is the portable, builtin replacement for `which`.
script_program=$(command -v python)
script=$MGESCAN_SRC/mgescan/cmd.py

#######################################
# Map the positional arguments onto the globals used by the rest of the script.
# Globals written:
#   input_file, input_file_name, hmmsearch_version, output_file, program,
#   clade, en, rt, ltr_gff3, nonltr_gff3, both_gff3, nmpi, mpi_enabled,
#   sw_rm, scaffold, min_dist, max_dist, min_len_ltr, max_len_ltr,
#   ltr_sim_condition, cluster_sim_condition, len_condition, repeatmasker
# Arguments:
#   $1       input dataset path
#   $2       input display name (ignored; the basename of $1 is used instead)
#   $3       hmmsearch major version (2 or 3)
#   $4       main output file
#   $5       program: N is nonLTR, L is LTR and B is both
#   $6-$8    optional nonLTR outputs (clade, en, rt)
#   $9-$11   GFF3 outputs (LTR, nonLTR, both)
#   $12-$21  LTR options, read only when $5 is L
#   $12      number of MPI processes, read only when exactly 12 arguments
#            are given
#######################################
parse_args() {
  input_file=$1
  #input_file_name=$2
  # Quoted so that a path containing whitespace stays a single operand.
  input_file_name=$(basename -- "$input_file")
  hmmsearch_version=$3
  output_file=$4
  program=$5
  clade=$6
  en=$7
  rt=$8
  ltr_gff3=$9
  nonltr_gff3=${10}
  both_gff3=${11}

  # Start from a clean slate so stray environment variables with the same
  # names cannot leak into the run.
  sw_rm="" scaffold="" min_dist="" max_dist=""
  min_len_ltr="" max_len_ltr=""
  ltr_sim_condition="" cluster_sim_condition="" len_condition=""
  repeatmasker=""
  nmpi=""
  mpi_enabled=""

  #### LTR options occupy ${12} through ${21}
  if [ "$program" == "L" ]
  then
    sw_rm=${12}
    scaffold=${13}
    min_dist=${14}
    max_dist=${15}
    min_len_ltr=${16}
    max_len_ltr=${17}
    ltr_sim_condition=${18}
    cluster_sim_condition=${19}
    len_condition=${20}
    repeatmasker=${21}
  fi

  # With exactly 12 arguments the last one is the MPI process count.
  if [ $# -eq 12 ]
  then
    nmpi=${12}
    # Accept only a plain positive integer; 10# forces base 10 so a value
    # such as "08" is not rejected as invalid octal.
    if [[ "$nmpi" =~ ^[0-9]+$ ]] && (( 10#$nmpi >= 1 ))
    then
      mpi_enabled="--mpi=$nmpi"
    fi
  fi
}

parse_args "$@"

# /nfs/nfs4/home/lee212/mgescan/galaxy-dist/tools/mgescan/find_ltr.sh /nfs/nfs4/home/lee212/mgescan/galaxy-dist/database/files/000/dataset_1.dat /nfs/nfs4/home/lee212/mgescan/galaxy-dist/database/files/000/dataset_3.dat

#set path for transeq
#export PATH=$user_dir/mgescan/EMBOSS/bin:/usr/bin:$PATH
# Probe for transeq (shipped with EMBOSS). Abort with a non-zero status when
# it cannot be run; a bare `exit` here would return the status of the
# preceding echo, i.e. 0, and hide the failure from the caller.
if ! transeq --version 2> /dev/null
then
  echo "EMBOSS is not available." >&2
  exit 1
fi

#move to the working directory
# Everything below uses paths relative to the directory holding $script.
work_dir=$(dirname -- "$script")
if ! cd "$work_dir"
then
  echo "Cannot change to working directory: $work_dir" >&2
  exit 1
fi

#create directory for input and output
# Each run gets its own scratch directory under ./input:
#   <t_dir>/seq   sequences handed to MGEScan
#   <t_dir>/data  results written by MGEScan
if ! mkdir -p input
then
  echo "Cannot create directory: $work_dir/input" >&2
  exit 1
fi
if ! t_dir=$(mktemp -p input -d) #relative path
then
  echo "Cannot create a temporary directory under $work_dir/input" >&2
  exit 1
fi
input_dir="$work_dir/$t_dir/seq" # full path
output_dir="$work_dir/$t_dir/data"
if ! mkdir -p -- "$input_dir" "$output_dir"
then
  echo "Cannot create $input_dir and $output_dir" >&2
  exit 1
fi

#make a copy of input
#/bin/cp $input_file $input_dir/$input_file_name

# Check tar.gz
# `tar tf` can list the input only when it is an archive; anything else is
# treated as a single sequence file. All paths are quoted so names containing
# whitespace or glob characters are passed through intact.
if tar tf "$input_file" &> /dev/null
then
  # It seems pre_process.pl creates ./data/genome directory and makes a copy of a genome file.
  # Due to this reason, extracts compressed inputs to output directory.
  # Try gzip-compressed first, then fall back to a plain extract.
  if ! tar xzf "$input_file" -C "$input_dir" 2> /dev/null
  then
    tar xf "$input_file" -C "$input_dir" 2> /dev/null
  fi
else
  # Not an archive: expose the file under its own name via a symlink.
  /bin/ln -s -- "$input_file" "$input_dir/$input_file_name"
fi

# Run hmmsearch once and look for the major version in its help text.
# stderr of hmmsearch itself is discarded so a missing binary yields only the
# message below.
hmmer_help=$(hmmsearch -h 2> /dev/null)
VERSION2=$(grep "HMMER 2" <<< "$hmmer_help")
VERSION3=$(grep "HMMER 3" <<< "$hmmer_help")
if [ "2" == "$hmmsearch_version" ] && [ -n "$VERSION2" ]
then
  echo "$VERSION2 selected."
elif [ "3" == "$hmmsearch_version" ] && [ -n "$VERSION3" ]
then
  echo "$VERSION3 selected."
else
  # Either hmmsearch is missing or it is not the requested major version.
  # Exit non-zero: a bare `exit` would return the status of echo (0).
  echo "HMMER is not available." >&2
  exit 1
fi

# Translate the one-letter program selector into the name handed to the
# MGEScan driver; anything other than L or N selects both programs.
case "$program" in
  L) program_name="ltr" ;;
  N) program_name="nonltr" ;;
  *) program_name="both" ;;
esac

#run
# Refuse to continue without the scratch locations: every path below is built
# from them, including the recursive delete at the end.
: "${work_dir:?work_dir is not set}" "${t_dir:?t_dir is not set}"
: "${input_dir:?input_dir is not set}" "${output_dir:?output_dir is not set}"

# ${mpi_enabled:+...} adds the --mpi option only when one was configured and
# passes no empty argument otherwise.
# Options of the older interface, kept for reference:
#   -hmmerv=$hmmsearch_version -sw_rm=${11} -scaffold=${12} -min_dist=${13} -max_dist=${14} -min_len_ltr=${15} -max_len_ltr=${16} -ltr_sim_condition=${17} -cluster_sim_condition=${18} -len_condition=${19}
"$script_program" "$script" "$program_name" "$input_dir/" --output="$output_dir/" ${mpi_enabled:+"$mpi_enabled"}
# Capture the status immediately; it decides at the end whether the scratch
# directory is removed or preserved, and becomes the exit status.
status=$?
if [ "$status" -ne 0 ]
then
  echo "MGEScan exited with status $status." >&2
fi
#/usr/bin/perl $script -genome=$input_dir/ -data=$output_dir/ -hmmerv=$hmmsearch_version -program=$program -sw_rm=${11} -scaffold=${12} -min_dist=${13} -max_dist=${14} -min_len_ltr=${15} -max_len_ltr=${16} -ltr_sim_condition=${17} -cluster_sim_condition=${18} -len_condition=${19}

#RES=`ssh -i $user_dir/.ssh/.internal silo.cs.indiana.edu "/usr/bin/perl $script -genome=$input_dir/ -data=$output_dir/ -hmmerv=$hmmsearch_version -program=$program > /dev/null"`

# Copy one result file ($1) to its destination ($2) and record any failure
# in the global `status`.
copy_result() {
  /bin/cp -- "$1" "$2" || status=1
}

#make a copy of output
# LTR results (program L or B).
if [ "$program" != "N" ]
then
  copy_result "$output_dir/ltr/ltr.out" "$output_file"
  if [ "$ltr_gff3" != "None" ]
  then
    copy_result "$output_dir/ltr/ltr.gff3" "$ltr_gff3"
  fi

  if [ "$repeatmasker" != "None" ] && [ "$repeatmasker" != "" ]
  then
    # chr2L.fa.cat.gz chr2L.fa.masked chr2L.fa.out chr2L.fa.out.pos chr2L.fa.tbl
    copy_result "$output_dir/repeatmasker/${input_file_name}.out" "$repeatmasker"
  fi
fi

# nonLTR results (program N or B).
if [ "$program" != "L" ]
then
  # $output_dir is unique to this run, so a fixed archive name is enough and
  # no extra temporary file is left behind.
  compressed_file="$output_dir/info.tar.gz"
  /bin/tar czfP "$compressed_file" "$output_dir/info" || status=1
  # These three copies are best effort: errors are discarded and do not
  # affect the exit status.
  /bin/cp -- "$compressed_file" "$clade" 2> /dev/null
  /bin/cp -- "$output_dir/info/validation/en" "$en" 2> /dev/null
  /bin/cp -- "$output_dir/info/validation/rt" "$rt" 2> /dev/null
  if [ "$nonltr_gff3" != "None" ]
  then
    copy_result "$output_dir/info/nonltr.gff3" "$nonltr_gff3"
    # nonltr.gff3
    ##gff-version 3
    #chr2L.fa MGEScan_nonLTR mobile_genetic_element 19670384 19676921 . . . ID=chr2L.fa_19670384
    #chr2L.fa MGEScan_nonLTR mobile_genetic_element 17689430 17695994 . . . ID=chr2L.fa_17689430
    #chr2L.fa MGEScan_nonLTR mobile_genetic_element 11897186 11903717 . . . ID=chr2L.fa_11897186
    #chr2L.fa MGEScan_nonLTR mobile_genetic_element 49574 56174 . . . ID=chr2L.fa_49574
  fi
fi

# Combined GFF3: LTR records followed by nonLTR records, appended to the
# destination file.
if [ "$program" == "B" ]
then
  #echo "track name=LTR description=\"MGEScan-LTR\" color=0,0,255," > $both_gff3
  /bin/cat -- "$output_dir/ltr/ltr.gff3" >> "$both_gff3" || status=1
  #echo "track name=nonLTR description=\"MGEScan-nonLTR\" color=255,0,0" >> $both_gff3
  /bin/cat -- "$output_dir/info/nonltr.gff3" >> "$both_gff3" || status=1
fi

# delete temp directory
# On success the scratch directory is removed; on any failure a copy is kept
# under error-cases/ for inspection.
if [ "$status" -eq 0 ]
then
  rm -rf -- "${work_dir:?}/${t_dir:?}"
else
  mkdir -p -- "$work_dir/error-cases"
  cp -pr -- "${work_dir:?}/${t_dir:?}" "$work_dir/error-cases/"
fi

exit "$status"
119 changes: 119 additions & 0 deletions galaxy-toolshed/packages/package_mgescan_3/mgescan.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
<?xml version="1.0"?>

<tool name="MGEScan" id="mgescan" version="0.0.2">
<description>
MGEScan
</description>
<!-- Positional arguments read by mgescan.sh:
       1 input path          2 input name         3 hmmsearch version
       4 main output         5 program (B/L/N)    6 clade
       7 en                  8 rt                 9 LTR GFF3
      10 nonLTR GFF3        11 combined GFF3     12 number of MPI processes
     Every value is single-quoted so that whitespace in a value cannot shift
     the positions of the arguments that follow it. -->
<command interpreter="bash">
    mgescan.sh '$input' '$input.name' 3 '$output' '$program' '$clade' '$qvalue_en' '$qvalue_rt' '$ltr_gff3' '$nonltr_gff3' '$both_gff3' '$mpi_yn.nmpi'
</command>
<inputs>
    <!-- Genome to scan. -->
    <param format="fasta,tabular,data" name="input" type="data" label="Input FASTA file(s)"/>
    <!--param name="hmmver" type="select" label="Hmmsearch version">
    <option selected="selected" value="3">3</option>
    <option value="2">2</option>
    </param-->
    <!-- Which MGEScan program(s) to run; the value is passed to mgescan.sh as-is. -->
    <param name="program" type="select" label="MGEScan">
        <option selected="selected" value="B">Both</option>
        <option value="L">LTR</option>
        <option value="N">nonLTR</option>
    </param>
    <!-- MPI switch. mgescan.sh enables MPI only when nmpi is at least 1, so the
         hidden value 0 in the "no_mpi" branch leaves it disabled. -->
    <conditional name="mpi_yn">
        <param name="mpi_select" type="select" label="Enable MPI">
            <option value="no_mpi">No</option>
            <option value="yes_mpi">Yes</option>
        </param>
        <when value="yes_mpi">
            <!-- A bounded integer instead of free text: the form itself rejects
                 anything that is not a positive process count. -->
            <param name="nmpi" type="integer" value="1" min="1" label="Number of MPI Processes"/>
        </when>
        <when value="no_mpi">
            <param name="nmpi" type="hidden" value="0"/>
        </when>
    </conditional>
</inputs>
<!-- Output datasets. Each <filter> expression tests the "program" select
     parameter (B = both, L = LTR, N = nonLTR). The dataset names are the
     variables substituted into the mgescan.sh command line. -->
<outputs>
<!-- LTR outputs: filtered on program != "N". -->
<data format="ltr.out" name="output" label="LTR Results (ltr.out)">
<filter>program != "N"</filter>
</data>
<!-- nonLTR outputs: filtered on program != "L". -->
<data format="fasta" name="clade" label="clade file (FASTA)">
<filter>program != "L"</filter>
</data>
<data format="qfile" name="qvalue_en" label="qvalue_en">
<filter>program != "L"</filter>
</data>
<data format="qfile" name="qvalue_rt" label="qvalue_rt">
<filter>program != "L"</filter>
</data>
<!-- GFF3 outputs: one per program, plus a combined file filtered on
     program == "B". -->
<data format="gff3" name="ltr_gff3" label="GFF3 for LTR">
<filter>program != "N"</filter>
</data>
<data format="gff3" name="nonltr_gff3" label="GFF3 for nonLTR">
<filter>program != "L"</filter>
</data>
<data format="gff3" name="both_gff3" label="GFF3 for LTR and nonLTR">
<filter>program == "B"</filter>
</data>

</outputs>
<help>
How to Run MGEScan
===================

* Select an input genome dataset from the select box and choose a program. By default, both the LTR and nonLTR programs of MGEScan are run.
* Click 'Execute' button.
* MPI will be enabled depending on your system support.

If you would like more options for running the LTR or nonLTR program, use the separate tools in the left panel.

For example, LTR > MGEScan-LTR offers preprocessing with RepeatMasker and lets you set other variables, e.g. the distance (bp) between LTRs.

Output
============

A. MGEScan_LTR:

Upon completion, MGEScan-LTR generates a file "ltr.out". This output file has information
about clusters and coordinates of LTR retrotransposons identified. Each cluster of LTR
retrotransposons starts with the head line of "[cluster_number]---------", followed by
the information of LTR retrotransposons in the cluster. The columns for LTR
retrotransposons are as follows.

1. LTR_id: unique id of the LTRs identified. It consists of two components, the sequence file name and the id in the file. For example, chr1_2 is the second LTR retrotransposon in the chr1 file.
2. start position of 5’ LTR.
3. end position of 5’ LTR.
4. start position of 3’ LTR.
5. end position of 3’ LTR.
6. strand: + or -.
7. length of 5’ LTR.
8. length of 3’ LTR.
9. length of the LTR retrotransposon.
10. TSD on the left side of the LTR retrotransposons.
11. TSD on the right side of the LTR retrotransposons.
12. di(tri)nucleotide on the left side of 5’LTR
13. di(tri)nucleotide on the right side of 5’LTR
14. di(tri)nucleotide on the left side of 3’LTR
15. di(tri)nucleotide on the right side of 3’LTR

B. MGEScan_nonLTR:

Upon completion, MGEScan-nonLTR generates a directory named "info" in the data directory you
specified. In this "info" directory, two sub-directories ("full" and "validation") are
generated.

* The "full" directory stores the sequences of the elements. Each subdirectory in "full"
is named after a clade. Each clade directory lists the DNA sequences of the nonLTRs
identified. Each sequence is in FASTA format. The header contains the position
information of the TEs identified: [genome_file_name]_[start position in the sequence]

For example, >chr1_333 means that this element starts at 333 bp in the "chr1" file.

* The "validation" directory is for storing Q values.
In the files "en" and "rt", the first column corresponds to the element name and the last column to the Q value.

License
============
Copyright 2015.
You may redistribute this software under the terms of the GNU General Public License.

</help>
</tool>

0 comments on commit ee791fc

Please sign in to comment.