workflow added in toolshed

MGEScan · Feb 15, 2016 · ee791fc · ee791fc
1 parent c6b49d0
commit ee791fc
Show file tree

Hide file tree

Showing 2 changed files with 302 additions and 0 deletions.
diff --git a/galaxy-toolshed/packages/package_mgescan_3/mgescan.sh b/galaxy-toolshed/packages/package_mgescan_3/mgescan.sh
@@ -0,0 +1,183 @@
+#!/bin/bash
+# mgescan.sh $input $input.name 3 $output L None None None $ltr_gff3 None None $sw_rm "$scaffold" $min_dist $max_dist $min_len_ltr $max_len_ltr $ltr_sim_condition $cluster_sim_condition $len_condition $repeatmasker
+# The location of cmd.py (and so the working directory) is built from $MGESCAN_SRC.
+if [ -z "$MGESCAN_SRC" ]; then
+	echo "\$MGESCAN_SRC is not defined." >&2
+	exit 1
+fi
+
+# Interpreter and entry point of the MGEScan command-line program.
+script_program=$(command -v python)
+script=$MGESCAN_SRC/mgescan/cmd.py
+# Positional arguments. $2 is not used; the staged name comes from the path in $1.
+input_file=$1
+input_file_name=$(basename -- "$input_file")
+hmmsearch_version=$3
+output_file=$4
+program=$5 # N is nonLTR, L is LTR and B is both
+# Output destinations for nonLTR (clade, en, rt) and for the GFF3 files.
+clade=$6
+en=$7
+rt=$8
+ltr_gff3=$9
+nonltr_gff3=${10}
+both_gff3=${11}
+# LTR-only settings, read from $12..$21 when the LTR program is selected.
+if [ "$program" == "L" ]; then
+	sw_rm=${12}
+	scaffold=${13}
+	min_dist=${14}
+	max_dist=${15}
+	min_len_ltr=${16}
+	max_len_ltr=${17}
+	ltr_sim_condition=${18}
+	cluster_sim_condition=${19}
+	len_condition=${20}
+	repeatmasker=${21}
+fi
+
+# MPI: when exactly 12 arguments are given, the 12th is the requested number
+# of MPI processes. Anything that is not a positive integer leaves MPI off.
+mpi_enabled=""
+if [ $# -eq 12 ]; then
+	nmpi=${12}
+	# Validate before the numeric test so a non-numeric value cannot make it error out.
+	if [[ "$nmpi" =~ ^[0-9]+$ ]] && [ "$nmpi" -ge 1 ]; then
+		mpi_enabled="--mpi=$nmpi"
+	fi
+fi
+
+# /nfs/nfs4/home/lee212/mgescan/galaxy-dist/tools/mgescan/find_ltr.sh /nfs/nfs4/home/lee212/mgescan/galaxy-dist/database/files/000/dataset_1.dat /nfs/nfs4/home/lee212/mgescan/galaxy-dist/database/files/000/dataset_3.dat
+
+# transeq is the EMBOSS program this check probes for; it has to be on PATH.
+# A site-specific PATH can be set here if EMBOSS is installed elsewhere:
+#export PATH=$user_dir/mgescan/EMBOSS/bin:/usr/bin:$PATH
+# Exit non-zero and report on stderr so the failure is not mistaken for a
+# successful run (a bare "exit" after echo would return status 0).
+if ! transeq --version 2> /dev/null; then
+	echo "EMBOSS is not available." >&2
+	exit 1
+fi
+
+# Run from the directory that contains cmd.py.
+work_dir=$(dirname -- "$script")
+cd "$work_dir" || { echo "Cannot change to $work_dir." >&2; exit 1; }
+
+# Per-run scratch area: input/<mktemp dir>/seq for sequences, .../data for results.
+mkdir -p input
+# mktemp is given the relative "input", so $t_dir is relative to $work_dir.
+t_dir=$(mktemp -p input -d) || { echo "Cannot create a scratch directory." >&2; exit 1; }
+input_dir="$work_dir/$t_dir/seq" # full path
+output_dir="$work_dir/$t_dir/data"
+mkdir -p "$input_dir" "$output_dir" || exit 1
+
+# Stage the input under $input_dir: tar archives are unpacked there, anything
+# else is linked in under the name of the input file.
+#/bin/cp $input_file $input_dir/$input_file_name
+
+# Check tar.gz
+# "tar tf" lists an archive without extracting; its exit status tells whether
+# $input_file can be read as a tar archive at all.
+if tar tf "$input_file" &> /dev/null; then
+	# It seems pre_process.pl creates ./data/genome directory and makes a copy of a genome file.
+	# Due to this reason, extracts compressed inputs to output directory.
+	# NOTE(review): the extraction target below is $input_dir — confirm the note above.
+	# Try gzip decompression first; on failure let tar read the file as-is.
+	if ! tar xzf "$input_file" -C "$input_dir" 2> /dev/null; then
+		tar xf "$input_file" -C "$input_dir" 2> /dev/null
+	fi
+else
+	/bin/ln -s "$input_file" "$input_dir/$input_file_name"
+fi
+
+# Require an hmmsearch on PATH whose "-h" banner matches the requested major version.
+hmmer_banner=$(hmmsearch -h 2> /dev/null)
+VERSION2=$(grep "HMMER 2" <<< "$hmmer_banner")
+VERSION3=$(grep "HMMER 3" <<< "$hmmer_banner")
+if [ "2" == "$hmmsearch_version" ] && [ -n "$VERSION2" ]; then
+	echo "$VERSION2 selected."
+elif [ "3" == "$hmmsearch_version" ] && [ -n "$VERSION3" ]; then
+	echo "$VERSION3 selected."
+else
+	echo "HMMER is not available." >&2
+	exit 1
+fi
+
+# Translate the one-letter selector into the sub-command name given to cmd.py.
+#   L             -> ltr
+#   N             -> nonltr
+#   anything else -> both
+case "$program" in
+	L) program_name="ltr" ;;
+	N) program_name="nonltr" ;;
+	*) program_name="both" ;;
+esac
+
+# Run MGEScan; the exit status is saved because the clean-up step at the end
+# depends on it. $mpi_enabled is left unquoted so that it vanishes when empty.
+$script_program "$script" "$program_name" "$input_dir/" --output="$output_dir/" $mpi_enabled
+run_status=$?
+# Earlier invocations, kept for reference:
+#/usr/bin/perl $script -genome=$input_dir/ -data=$output_dir/ -hmmerv=$hmmsearch_version -program=$program -sw_rm=${11} -scaffold=${12} -min_dist=${13} -max_dist=${14} -min_len_ltr=${15} -max_len_ltr=${16} -ltr_sim_condition=${17} -cluster_sim_condition=${18} -len_condition=${19}
+#RES=`ssh -i $user_dir/.ssh/.internal silo.cs.indiana.edu "/usr/bin/perl $script -genome=$input_dir/ -data=$output_dir/ -hmmerv=$hmmsearch_version -program=$program > /dev/null"`
+
+# Copy the LTR results to the destinations passed on the command line.
+if [ "$program" != "N" ]; then
+	/bin/cp "$output_dir/ltr/ltr.out" "$output_file"
+	if [ "$ltr_gff3" != "None" ]; then
+		/bin/cp "$output_dir/ltr/ltr.gff3" "$ltr_gff3"
+	fi
+
+	if [ "$repeatmasker" != "None" ] && [ -n "$repeatmasker" ]; then
+		# chr2L.fa.cat.gz  chr2L.fa.masked  chr2L.fa.out  chr2L.fa.out.pos  chr2L.fa.tbl
+		/bin/cp "$output_dir/repeatmasker/${input_file_name}.out" "$repeatmasker"
+	fi
+fi
+
+# Copy the nonLTR results; the whole "info" directory is shipped as a tar.gz.
+if [ "$program" != "L" ]; then
+	# $t_dir comes from mktemp, so $output_dir belongs to this run only and a
+	# fixed archive name suffices; no stray temp file is created for the name.
+	compressed_file="$output_dir/info.tar.gz"
+	/bin/tar czfP "$compressed_file" "$output_dir/info"
+	#/bin/cp $compressed_file $output_file
+	#RES=`/bin/cp $output_dir/info/full/*/* $clade 2> /dev/null`
+	# Errors from these three copies are discarded (2> /dev/null).
+	/bin/cp "$compressed_file" "$clade" 2> /dev/null
+	/bin/cp "$output_dir/info/validation/en" "$en" 2> /dev/null
+	/bin/cp "$output_dir/info/validation/rt" "$rt" 2> /dev/null
+	if [ "$nonltr_gff3" != "None" ]; then
+		/bin/cp "$output_dir/info/nonltr.gff3" "$nonltr_gff3"
+		# nonltr.gff3
+		##gff-version 3
+		#chr2L.fa        MGEScan_nonLTR  mobile_genetic_element  19670384        19676921        .       .       .       ID=chr2L.fa_19670384
+		#chr2L.fa        MGEScan_nonLTR  mobile_genetic_element  17689430        17695994        .       .       .       ID=chr2L.fa_17689430
+		#chr2L.fa        MGEScan_nonLTR  mobile_genetic_element  11897186        11903717        .       .       .       ID=chr2L.fa_11897186
+		#chr2L.fa        MGEScan_nonLTR  mobile_genetic_element  49574   56174   .       .       .       ID=chr2L.fa_49574
+	fi
+
+#else
+	# Both LTR, nonLTR executed
+	#compressed_file=$output_dir/$RANDOM.tar.gz
+	#/bin/tar czfP $compressed_file $output_dir
+	#/bin/cp $compressed_file $output_file
+fi
+
+# For "both", concatenate the LTR and nonLTR annotations into one GFF3 file.
+if [ "$program" == "B" ]; then
+	#echo "track name=LTR description=\"MGEScan-LTR\" color=0,0,255," > $both_gff3
+	/bin/cat "$output_dir/ltr/ltr.gff3" >> "$both_gff3"
+	#echo "track name=nonLTR description=\"MGEScan-nonLTR\" color=255,0,0" >> $both_gff3
+	/bin/cat "$output_dir/info/nonltr.gff3" >> "$both_gff3"
+fi
+
+# Delete the scratch directory after a successful run; otherwise keep a copy
+# under error-cases/. The decision uses the saved status of the MGEScan run,
+# not the status of whichever command happened to execute last.
+if [ "$run_status" -eq 0 ]; then
+	rm -rf -- "${work_dir:?}/${t_dir:?}"
+else
+	mkdir -p "$work_dir/error-cases"
+	cp -pr "$work_dir/$t_dir" "$work_dir/error-cases/"
+fi
+exit "$run_status"
diff --git a/galaxy-toolshed/packages/package_mgescan_3/mgescan.xml b/galaxy-toolshed/packages/package_mgescan_3/mgescan.xml
@@ -0,0 +1,119 @@
+<?xml version="1.0"?>
+
+<tool name="MGEScan" id="mgescan" version="0.0.2">
+	<description>
+		MGEScan
+	</description>
+	<command interpreter="bash">
+		mgescan.sh $input '$input.name' 3 $output $program $clade $qvalue_en $qvalue_rt $ltr_gff3 $nonltr_gff3 $both_gff3 $mpi_yn.nmpi
+		<!-- mgescan.sh $input $input.name $hmmver $output $program $clade $qvalue_en $qvalue_rt $ltr_gff3 $nonltr_gff3 -->
+	</command>
+	<inputs>
+		<param format="fasta,tabular,data" name="input" type="data" label="Input FASTA file(s)"/>
+		<!--param name="hmmver" type="select" label="Hmmsearch version">
+			<option selected="selected" value="3">3</option>
+			<option value="2">2</option>
+		</param-->
+		<param name="program" type="select" label="MGEScan">
+			<option selected="selected" value="B">Both</option>
+			<option value="L">LTR</option>
+			<option value="N">nonLTR</option>
+		</param>
+		<conditional name="mpi_yn">
+			<param name="mpi_select" type="select" label="Enable MPI">
+				<option value="no_mpi">No</option>
+				<option value="yes_mpi">Yes</option>
+			</param>
+			<when value="yes_mpi">
+				<param name="nmpi" format="txt" type="text" value="1" label="Number of MPI Processes"/>
+			</when>
+			<when value="no_mpi">
+				<param name="nmpi" type="hidden" value="0"/>
+			</when>
+		</conditional>
+	</inputs>
+	<outputs>
+		<data format="ltr.out" name="output" label="LTR Results (ltr.out)">
+			<filter>program != "N"</filter>
+		</data>
+		<data format="fasta" name="clade" label="clade file (FASTA)">
+			<filter>program != "L"</filter>
+		</data>
+		<data format="qfile" name="qvalue_en" label="qvalue_en">
+			<filter>program != "L"</filter>
+		</data>
+		<data format="qfile" name="qvalue_rt" label="qvalue_rt">
+			<filter>program != "L"</filter>
+		</data>
+		<data format="gff3" name="ltr_gff3" label="GFF3 for LTR">
+			<filter>program != "N"</filter>
+		</data>
+		<data format="gff3" name="nonltr_gff3" label="GFF3 for nonLTR">
+			<filter>program != "L"</filter>
+		</data>
+		<data format="gff3" name="both_gff3" label="GFF3 for LTR and nonLTR">
+			<filter>program == "B"</filter>
+		</data>
+
+	</outputs>
+	<help>
+How to Run MGEScan
+===================
+
+* Select an input genome dataset from the select box and choose a program. By default, both the LTR and nonLTR programs of MGEScan are run.
+* Click 'Execute' button.
+* MPI will be enabled depending on your system support.
+
+If you would like more options for running the LTR or nonLTR program, use the separate tools in the left panel.
+
+For example, LTR > MGEScan-LTR offers preprocessing with RepeatMasker and additional settings, e.g. the distance (bp) between LTRs.
+
+Output
+============
+
+A. MGEScan_LTR:
+
+Upon completion, MGEScan-LTR generates a file "ltr.out". This output file has information
+about clusters and coordinates of LTR retrotransposons identified. Each cluster of LTR
+retrotransposons starts with the head line of "[cluster_number]---------", followed by
+the information of LTR retrotransposons in the cluster. The columns for LTR
+retrotransposons are as follows.
+
+  1. LTR_id: unique id of LTRs identified. It consists of two components, the sequence file name and the id within that file. For example, chr1_2 is the second LTR retrotransposon in the chr1 file.
+  2. start position of 5’ LTR.
+  3. end position of 5’ LTR.
+  4. start position of 3’ LTR.
+  5. end position of 3’ LTR.
+  6. strand: + or -.
+  7. length of 5’ LTR.
+  8. length of 3’ LTR.
+  9. length of the LTR retrotransposon.
+  10. TSD on the left side of the LTR retrotransposons.
+  11. TSD on the right side of the LTR retrotransposons.
+  12. di(tri)nucleotide on the left side of 5’LTR
+  13. di(tri)nucleotide on the right side of 5’LTR
+  14. di(tri)nucleotide on the left side of 3’LTR
+  15. di(tri)nucleotide on the right side of 3’LTR 
+
+B. MGEScan_nonLTR:
+   Upon completion, MGEScan-nonLTR generates the directory, "info" in the data directory you
+   specified. In this "info" directory, two sub-directories ("full" and "validation") are
+   generated.
+
+   * The "full" directory is for storing sequences of elements. Each subdirectory in "full"
+   is named after a clade. In each clade directory, the DNA sequences of nonLTRs identified
+   are listed. Each sequence is in fasta format. The header contains the position
+   information of TEs identified: [genome_file_name]_[start position in the sequence]
+
+   For example, >chr1_333 means that this element starts at 333bp in the "chr1" file.
+
+   * The "validation" directory is for storing Q values. 
+   In the files "en" and "rt", the first column corresponds to the element name and the last column to the Q value.
+
+License
+============
+Copyright 2015.
+You may redistribute this software under the terms of the GNU General Public License.
+
+</help>
+</tool>