In [None]:
module avail

# Loading important variables

In [None]:
files=/input/genomic/fish/Chrysophrys/auratus/Resequencing/CAGRF15217/AGRF_CAGRF15217_H2JYTDMXX/
wd=/powerplant/workspace/hramzr/2_Phd_PROJECT/WGS/
ids=/input/genomic/fish/Chrysophrys/auratus/Resequencing/CAGRF15217/sample_and_individual_identification.txt
reference=/powerplant/output/genomic/fish/Chrysophrys/auratus/Genome/1000.Genome.v.1.0/Chrysophrys_auratus.v.1.0.chromosomes.male.map.fsa
reference_index=${reference}.fai
sup=/workspace/hramzr/2_Phd_PROJECT/WGS/support_files/

# Analysing startdata

In [None]:
#look for abberations in data
module load FastQC/0.11.7
module load multiqc/1.7
fqdir=/workspace/hramzr/2_Phd_PROJECT/WGS/trimming/FQ/
bsub \
-e ${fqdir}FASTQC.e -o ${fqdir}FASTQC.o -J "FQC" \
"fastqc --nogroup -q -t 2 ${files}*.gz -o $fqdir"

In [None]:
IN=/workspace/hramzr/2_Phd_PROJECT/WGS/trimming/FQ/
OUT=/workspace/hramzr/2_Phd_PROJECT/WGS/trimming/multiqc/
module load MultiQC/1.3
bsub \
-e ${OUT}FASTQC.e -o ${OUT}FASTQC.o -J "multiqc" \
"multiqc ${IN} -o ${OUT}"

# Trimming data

# Based on FASTQC:
* Now we will remove adapters, over-represented sequences, and poor-quality bases from the reads:
    * We remove the first 9 bases (biased reads)
    * We remove trailing bases if quality is less than 10
    * After clipping, the min length for a read will be 75 bp
* The `Clip_Seqs.fa` file contains the TruSeq adapter sequences and homo-polymer sequences to clip
    * This file needs to be edited to contain the appropriate sequences.
* The input for this step are the raw reads
* The output are the trimmed reads

In [None]:
#make list of FQS
up=/workspace/hramzr/2_Phd_PROJECT/WGS/trimming/unpaired/
p=/workspace/hramzr/2_Phd_PROJECT/WGS/trimming/
files=/input/genomic/fish/Chrysophrys/auratus/Resequencing/CAGRF15217/AGRF_CAGRF15217_H2JYTDMXX/
sup=/workspace/hramzr/2_Phd_PROJECT/WGS/support_files/
rm ${sup}trimfile
ls $files | egrep ".gz" | awk -F "_" '{print $3}' | sort | uniq -c | awk '{print $2}'>${sup}identifiers
while read -r id
do
var1=$(ls $files | egrep ".gz" | egrep "_${id}_" | \
awk '{print "/input/genomic/fish/Chrysophrys/auratus/Resequencing/CAGRF15217/AGRF_CAGRF15217_H2JYTDMXX/"$0}' | tr "\n" " " )

var2=$(echo "${up}${id}_unpaired_L1R1.fq.gz ${up}${id}_unpaired_L1R2.fq.gz \
${up}${id}_unpaired_L2R1.fq.gz ${up}${id}_unpaired_L2R2.fq.gz ${p}${id}_L1R1.fq.gz ${p}${id}_L1R2.fq.gz \
${p}${id}_L2R1.fq.gz ${p}${id}_L2R2.fq.gz")
echo $var1 $var2>>${sup}trimfile
done < ${sup}identifiers

In [None]:
cat ${sup}trimfile | awk '{print $3}'

In [None]:
files=/input/genomic/fish/Chrysophrys/auratus/Resequencing/CAGRF15217/AGRF_CAGRF15217_H2JYTDMXX/
TRPATH=/software/bioinformatics/Trimmomatic-0.36
TRPTH=/workspace/hramzr/2_Phd_PROJECT/WGS/trimming/
file_dir=${sup}fqlist
log=/workspace/hramzr/2_Phd_PROJECT/WGS/trimming/log/
CLIP=/workspace/hramzr/2_Phd_PROJECT/WGS/adapters/Clip_Seqs.fa
sup=/workspace/hramzr/2_Phd_PROJECT/WGS/support_files/

while read -r in1 in2 in3 in4 up1 up2 up3 up4 p1 p2 p3 p4
do
bsub \
-o ${log}trim.o -e ${log}trim.e -J "triml1" -n8  \
"java -jar ${TRPATH}/trimmomatic-0.36.jar \
PE -phred33 \
-threads 8 \
${in1} ${in2} \
${p1} ${up1} ${p2} ${up2} \
ILLUMINACLIP:${CLIP}:2:30:10 \
HEADCROP:9 \
TRAILING:10 \
SLIDINGWINDOW:5:20 \
MINLEN:75"

bsub \
-o ${log}trim.o -e ${log}trim.e -J "triml2" -n8  \
"java -jar ${TRPATH}/trimmomatic-0.36.jar \
PE -phred33 \
-threads 8 \
${in3} ${in4} \
${p3} ${up3} ${p4} ${up4} \
ILLUMINACLIP:${CLIP}:2:30:10 \
HEADCROP:9 \
TRAILING:10 \
SLIDINGWINDOW:5:20 \
MINLEN:75"
done < ${sup}trimfile

In [None]:
cat /powerplant/workspace/hramzr/2_Phd_PROJECT/Genomics/Fish/Resequencing/Chrysophrys_auratus/10565/metadata/additional_metadata/sample_and_individual_identification.txt | \
grep -w "SNF_gDNA_9"


# cat /powerplant/workspace/hramzr/2_Phd_PROJECT/Genomics/Fish/Resequencing/Chrysophrys_auratus/10565/metadata/additional_metadata/sample_and_individual_identification.txt 
echo ${id}
cat /powerplant/workspace/hramzr/2_Phd_PROJECT/Genomics/Fish/Resequencing/Chrysophrys_auratus/10565/metadata/additional_metadata/sample_and_individual_identification.txt | \
grep -w "SNF_gDNA_${id}}"

cat t | sort | uniq -c | wc -l

cat ${sup}identifiers

# Check trim quality with FASTQC

In [None]:
fqdir=/workspace/hramzr/2_Phd_PROJECT/WGS/trimming/FQ/

# Combine replicates

In [None]:
mergedir=/workspace/hramzr/2_Phd_PROJECT/WGS/merging/
trimdir=/workspace/hramzr/2_Phd_PROJECT/WGS/trimming/
log=/workspace/hramzr/2_Phd_PROJECT/WGS/merging/log/

while read -r id
do

bsub \
-e ${log}r1.e -o ${log}r1.o \
"cat ${trimdir}${id}_L1R1.fq.gz ${id}_L2R1.fq.gz > ${mergedir}${id}_R1.fq.gz"

bsub \
-e ${log}r2.e -o ${log}r2.o \
"cat ${trimdir}${id}_L1R2.fq.gz ${id}_L2R2.fq.gz > ${mergedir}${id}_R2.fq.gz"

done < ${sup}identifiers

# mark adapters add metadata

In [None]:
tmpdir=/workspace/hramzr/2_Phd_PROJECT/WGS/MIA/tmp/
METRICS=/workspace/hramzr/2_Phd_PROJECT/WGS/MIA/metrics/
PICARD=/software/bioinformatics/picard-tools-2.18.7/picard.jar
OUT=/workspace/hramzr/2_Phd_PROJECT/WGS/MIA
log=/workspace/hramzr/2_Phd_PROJECT/WGS/MIA/log/
IN=/workspace/hramzr/2_Phd_PROJECT/WGS/merging/
sup=/workspace/hramzr/2_Phd_PROJECT/WGS/support_files/
module load bwa/0.7.17
module load java
rm t
while read -r id
do
new_id=$(cat /powerplant/workspace/hramzr/2_Phd_PROJECT/Genomics/Fish/Resequencing/Chrysophrys_auratus/10565/metadata/additional_metadata/sample_and_individual_identification.txt | \
grep -w "SNF_gDNA_${id}" | awk '{print $3}')
file=$(ls /input/genomic/fish/Chrysophrys/auratus/Resequencing/CAGRF15217/AGRF_CAGRF15217_H2JYTDMXX/ | egrep "_${id}_" | head -n1)
NAME=`basename ${file}`
PREFIX=`echo ${NAME} | awk -F[_] '{print $1"_"$2"_"$3"_"$4"_"$5"_MIA"}'`
RGID="RGID_${new_id}"
SAMPLEID="SID_${new_id}"
LIB=`echo ${PREFIX} | awk -F[_] '{print $1"_"$2}'`
file1="${id}_R1.fq.gz"
file2="${id}_R2.fq.gz"
echo "${id}"
bsub \
-o ${log}mia.o -e ${log}mia.e -n 4 -J "${id}" \
-R "rusage[mem=10000] span[hosts=1]" \
"java -jar -Xmx48G -Djava.io.tmpdir=${tmpdir} $PICARD FastqToSam \
MAX_RECORDS_IN_RAM=1000000 \
VALIDATION_STRINGENCY=STRICT \
FASTQ=${IN}${file1} \
FASTQ2=${IN}${file2} \
OUTPUT=/dev/stdout \
READ_GROUP_NAME=${RGID} \
SAMPLE_NAME=${SAMPLEID} \
LIBRARY_NAME=${LIB} \
PLATFORM_UNIT=unit1 \
PLATFORM=illumina \
SEQUENCING_CENTER=AGRF \
RUN_DATE=2017-08-15T00:00:00-0400 | \
java -jar -Xmx32G -Djava.io.tmpdir=${tmpdir} $PICARD MarkIlluminaAdapters \
MAX_RECORDS_IN_RAM=1000000 \
VALIDATION_STRINGENCY=STRICT \
I=/dev/stdin \
O=${OUT}/${new_id}.bam \
M=${METRICS}/${new_id}.txt \
ADAPTERS=INDEXED \
ADAPTERS=DUAL_INDEXED \
ADAPTERS=PAIRED_END \
ADAPTERS=SINGLE_END \
ADAPTERS=ALTERNATIVE_SINGLE_END"
done < ${sup}identifiers

# Run BWA mem

In [None]:
module load bwa/0.7.17
OUT="/workspace/hramzr/2_Phd_PROJECT/snapper_genome/bwadir/"
LOG="/workspace/hramzr/2_Phd_PROJECT/WGS/aligning/log"
COMMAND="bwa index -p ${OUT}Chrysophrys_auratus.v.1.0.chromosomes.male.map ${OUT}Chrysophrys_auratus.v.1.0.chromosomes.male.map.fasta \
${OUT}Chrysophrys_auratus.v.1.0.chromosomes.male.map.fasta"
# echo ${COMMAND}
bsub -o ${LOG}/bwa_index.out -e ${LOG}/bwa_index.err \
     -J BWA_INDEX \
     -n 3 \
     -R "span[hosts=1]" \
     ${COMMAND}

In [19]:
module load bwa/0.7.17
IN="/workspace/hramzr/2_Phd_PROJECT/WGS/MIA"
OUT="/workspace/hramzr/2_Phd_PROJECT/WGS/aligning"
LOG="${OUT}/log"
OUT_BWA="/workspace/hramzr/2_Phd_PROJECT/WGS/out_bwa"
BAMS=`ls ${IN}/*.bam`
TEMP="/workspace/hramzr/2_Phd_PROJECT/WGS/tmp"
ref="/workspace/hramzr/2_Phd_PROJECT/snapper_genome/Chrysophrys_auratus.v.1.0.chromosomes.male.map.fasta"
PICARD="/software/bioinformatics/picard-tools-2.18.7/picard.jar"
fai="/workspace/hramzr/2_Phd_PROJECT/snapper_genome/Chrysophrys_auratus.v.1.0.chromosomes.male.map"
JAVA="java -jar -Xmx64G"
for bam in ${BAMS}
    do
      NAME=`basename -s .bam ${bam}`
      PREFIX=`echo ${NAME} | awk -F[.] '{print $1}'`
#       echo ${NAME}
#       echo ${PREFIX}
      ### Create Named Pipes: ###
      R1="${TEMP}/${PREFIX}_1"
      R2="${TEMP}/${PREFIX}_2"
      rm -f ${R1}
      rm -f ${R2}
      mkfifo ${R1}
      mkfifo ${R2}
#       echo ${R1}
#       echo ${R2}
      BWA="bwa mem -t 32 -M ${ref} ${R1} ${R2}"
      COMMAND="${JAVA} ${PICARD} SamToFastq \
                  I=${bam} \
                  F=${R1} \
                  F2=${R2} \
                  CLIPPING_ATTRIBUTE=XT \
                  CLIPPING_ACTION=2 \
                  INTERLEAVE=false \
                  NON_PF=true \
                  TMP_DIR=${TEMP} | \
               ${BWA} | \
               tee ${OUT_BWA}/${PREFIX}_aligned.bam | \
               ${JAVA} ${PICARD} MergeBamAlignment \
                  ALIGNED_BAM=/dev/stdin \
                  UNMAPPED_BAM=${bam} \
                  OUTPUT=${OUT}/${PREFIX}_MBA.bam \
                  R=${ref} \
                  ADD_MATE_CIGAR=true \
                  CLIP_ADAPTERS=true \
                  CLIP_OVERLAPPING_READS=true \
                  INCLUDE_SECONDARY_ALIGNMENTS=true \
                  MAX_INSERTIONS_OR_DELETIONS=-1 \
                  PRIMARY_ALIGNMENT_STRATEGY=BestMapq \
                  ATTRIBUTES_TO_RETAIN=XS \
                  MAX_RECORDS_IN_RAM=1000000 \
                  VALIDATION_STRINGENCY=STRICT \
                  SORT_ORDER=coordinate \
                  CREATE_INDEX=true \
                  TMP_DIR=${TEMP}"
      # echo ${COMMAND}
      bsub -o ${LOG}/${PREFIX}.out \
           -e ${LOG}/${PREFIX}.err \
           -J MBA \
           -n 6 \
           -R "rusage[mem=100000] span[hosts=1]" \
           ${COMMAND} 
    done

### MBA

Job <398498> is submitted to default queue <lowpriority>.
Job <398499> is submitted to default queue <lowpriority>.
Job <398500> is submitted to default queue <lowpriority>.
Job <398501> is submitted to default queue <lowpriority>.
Job <398502> is submitted to default queue <lowpriority>.
Job <398503> is submitted to default queue <lowpriority>.
Job <398504> is submitted to default queue <lowpriority>.
Job <398505> is submitted to default queue <lowpriority>.
Job <398506> is submitted to default queue <lowpriority>.
Job <398507> is submitted to default queue <lowpriority>.
Job <398508> is submitted to default queue <lowpriority>.
Job <398509> is submitted to default queue <lowpriority>.
Job <398510> is submitted to default queue <lowpriority>.
Job <398511> is submitted to default queue <lowpriority>.
Job <398512> is submitted to default queue <lowpriority>.
Job <398513> is submitted to default queue <lowpriority>.
Job <398514> is submitted to default queue <lowpriority>.
Job <398515> i

# Mark duplicates

In [30]:
in=/workspace/hramzr/2_Phd_PROJECT/WGS/aligning/
out=/workspace/hramzr/2_Phd_PROJECT/WGS/mark_duplicates
tmpdir=/workspace/hramzr/2_Phd_PROJECT/WGS/mark_duplicates/tmp/
log=/workspace/hramzr/2_Phd_PROJECT/WGS/mark_duplicates/log/
PICARD=/software/bioinformatics/picard-tools-2.18.7/picard.jar
metrics=/workspace/hramzr/2_Phd_PROJECT/WGS/mark_duplicates/metrics/
sup=/workspace/hramzr/2_Phd_PROJECT/WGS/support_files/
bams=$(ls /workspace/hramzr/2_Phd_PROJECT/WGS/aligning/*.bam)
for bam in $bams
do
NAME=`basename -s _MBA.bam ${bam}`
echo $NAME
bsub \
-o ${log}mdup.o -e ${log}mdup.e -J "mdup ${NAME}" \
-R "rusage[mem=80000] span[hosts=1]" \
"java -jar -Xmx32G -Djava.io.tmpdir=${tmpdir} $PICARD MarkDuplicates \
 MAX_FILE_HANDLES=1024 \
 I=$bam \
 MAX_RECORDS_IN_RAM=500000 \
 MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP=500000 \
 VALIDATION_STRINGENCY=STRICT \
 CREATE_INDEX=true \
 REMOVE_DUPLICATES=true \
 REMOVE_SEQUENCING_DUPLICATES=true \
 O=${out}/${NAME}_M.bam \
 M=${metrics}${NAME}_metrics.txt"
done



000475368
Job <398742> is submitted to default queue <lowpriority>.
008801547
Job <398743> is submitted to default queue <lowpriority>.
008808514
Job <398744> is submitted to default queue <lowpriority>.
008813278
Job <398745> is submitted to default queue <lowpriority>.
008819521
Job <398746> is submitted to default queue <lowpriority>.
008820870
Job <398747> is submitted to default queue <lowpriority>.
008821357
Job <398748> is submitted to default queue <lowpriority>.
008825059
Job <398749> is submitted to default queue <lowpriority>.
008825377
Job <398750> is submitted to default queue <lowpriority>.
008828555
Job <398751> is submitted to default queue <lowpriority>.
008828871
Job <398752> is submitted to default queue <lowpriority>.
008831557
Job <398753> is submitted to default queue <lowpriority>.
008831607
Job <398754> is submitted to default queue <lowpriority>.
008833125
Job <398755> is submitted to default queue <lowpriority>.
008841043
Job <398756> is submitted to default q

# check stats

In [9]:
pwd

/powerplant/workspace/hramzr/2_Phd_PROJECT/WGS/mark_duplicates


In [14]:
module load samtools/1.9
bams=$(ls /powerplant/workspace/hramzr/2_Phd_PROJECT/WGS/mark_duplicates/*.bam)
OUT=/workspace/hramzr/2_Phd_PROJECT/WGS/mark_duplicates/stats
LOG=/workspace/hramzr/2_Phd_PROJECT/WGS/mark_duplicates/log
for bam in $bams
do
NAME=`basename ${bam}`
PREFIX=`echo ${NAME} | awk -F. '{print $1}'`
COMMAND="samtools stats ${bam} > ${OUT}/${PREFIX}.stats"
bsub -o ${LOG}/stats.out -e ${LOG}/stats.err -J SAMSTATS ${COMMAND}
done

Job <399919> is submitted to default queue <lowpriority>.
Job <399920> is submitted to default queue <lowpriority>.
Job <399921> is submitted to default queue <lowpriority>.
Job <399922> is submitted to default queue <lowpriority>.
Job <399923> is submitted to default queue <lowpriority>.
Job <399924> is submitted to default queue <lowpriority>.
Job <399925> is submitted to default queue <lowpriority>.
Job <399926> is submitted to default queue <lowpriority>.
Job <399927> is submitted to default queue <lowpriority>.
Job <399928> is submitted to default queue <lowpriority>.
Job <399929> is submitted to default queue <lowpriority>.
Job <399930> is submitted to default queue <lowpriority>.
Job <399931> is submitted to default queue <lowpriority>.
Job <399932> is submitted to default queue <lowpriority>.
Job <399933> is submitted to default queue <lowpriority>.
Job <399934> is submitted to default queue <lowpriority>.
Job <399935> is submitted to default queue <lowpriority>.
Job <399936> i

In [29]:
STATS=`ls /workspace/hramzr/2_Phd_PROJECT/WGS/mark_duplicates/stats/*`
OUT="/workspace/hramzr/2_Phd_PROJECT/WGS/mark_duplicates/stats/plot"
module load samtools
mkdir -p ${OUT}
for stats in ${STATS}
    do
        NAME=`basename ${stats}`
        PREFIX=`echo ${NAME} | awk -F. '{print $1}'`
        COMMAND="plot-bamstats ${stats} -p ${OUT}/${PREFIX}/"
        bsub -J PLOT ${COMMAND}
    done

Job <400015> is submitted to default queue <lowpriority>.
Job <400016> is submitted to default queue <lowpriority>.
Job <400017> is submitted to default queue <lowpriority>.
Job <400018> is submitted to default queue <lowpriority>.
Job <400019> is submitted to default queue <lowpriority>.
Job <400020> is submitted to default queue <lowpriority>.
Job <400021> is submitted to default queue <lowpriority>.
Job <400022> is submitted to default queue <lowpriority>.
Job <400023> is submitted to default queue <lowpriority>.
Job <400024> is submitted to default queue <lowpriority>.
Job <400025> is submitted to default queue <lowpriority>.
Job <400026> is submitted to default queue <lowpriority>.
Job <400027> is submitted to default queue <lowpriority>.
Job <400028> is submitted to default queue <lowpriority>.
Job <400029> is submitted to default queue <lowpriority>.
Job <400030> is submitted to default queue <lowpriority>.
Job <400031> is submitted to default queue <lowpriority>.
Job <400032> i

In [31]:
cat /workspace/hramzr/2_Phd_PROJECT/WGS/mark_duplicates/stats/*stats | egrep "insert size average" | awk '{sum+=$NF; c++} END {print sum/c}'
cat /workspace/hramzr/2_Phd_PROJECT/WGS/mark_duplicates/stats/*stats | egrep "insert size standard deviation" | awk '{sum+=$NF; c++} END {print sum/c}'
cat /workspace/hramzr/2_Phd_PROJECT/WGS/mark_duplicates/stats/*stats | egrep "average length" | awk '{sum+=$NF; c++} END {print sum/c}'


333.696
89.22
139


In [36]:
module avail


---------- /software/OSutils/modules-4.4.1/share/Modules/modulefiles -----------
1SynChro/1.15
3d-dna/180922
A5-miseq/20160825
abyss/2.0.2
abyss/2.1.0
abyss/2.1.1
abyss/2.1.2
abyss/2.1.3
abyss/2.1.4
abyss/2.1.5
adapterremoval/2.2.2
AfterQC/0.9.1
AfterQC/0.9.5
allpathslg/44837
allpathslg/44913
allpathslg/49722
allpathslg/50191
allpathslg/52488
amos/3.1.0
angsd/0.913
angsd/0.917
angsd/0.917-116-g5d087b2
angsd/0.918
angsd/0.919
angsd/0.921
angsd/0.922
angsd/0.923
angsd/0.925
angsd-wrapper/7224db2
anytag/2.5.2
apsim/7.10-r49ace54f9c8a670190aef9d8d0fb9d5477bb1534
apsim/7.9-r4047
apsimx/2018.01.30.2253
apsimx/2018.09.28.3099
apsimx/2019.01.08.3392
apsimx/2019.04.03.3693
apsimx/2019.06.05.3920
apsimx/2019.07.18.4025
apsimx/2019.10.04.4236
apsimx/2020.04.09.5012
apsimx/2020.08.04.5350
arcs/1.0.6
art/MountRainier
aspera-cli/3.7.2.354.010c3b8
assemblathon_stats/14dfdab
assemblathon_stats/2011_10_13
asub/2.1
augustus/3.2.2
augustus/3.3
augustus/3.3.1
augustus/3.3.3-new
aws-cli/2.0.43
bam2fastx/1

perl/5.20.3
perl/5.20.3-thread-multi
perl/5.22.2
perl/5.22.2-thread-multi
perl/5.24.0
perl/5.24.0-thread-multi
perl/5.24.1
perl/5.24.1-thread-multi
perl/5.26.0
perl/5.26.0-thread-multi
perl/5.28.0
perl/5.28.0-thread-multi
perl/5.30.2
perl/5.30.2-thread-multi
perl/5.8.9
perl/5.8.9-thread-multi
perl/bio-extras
perl/bio-extras-5.24.0-legacy
perl/core
perl/ensembl
perl/jupyterhub
perlbrew/0.76
pfr-python2/2.7.13
pfr-python3/3.6.1
pfr-python3/3.6.5
pfr-python3/3.6.6
phylip/3.696
phylobayes/4.1c
PhyloSift/1.0.1
picard-tools/1.112
picard-tools/1.79
picard-tools/2.10.1
picard-tools/2.18.7
picard-tools/2.2.4
picard-tools/2.9.4
picrust2/2.3.0
pilon/1.18
pilon/1.20
pilon/1.23
pinfish/0.1.0
platanus/1.2.4
platanus_trim/1.0.7
Platypus/0.8.1
plink/1.07
plink/1.90b6.5
plink2/v2.00a2lm
plncpro/1.1
popoolation2/1201
porechop/0.2.3
portcullis/1.1.0
portcullis/1.1.1
portcullis/1.1.2
powerPlant/core
powerPlant/locate
pplacer/v1.1.alpha17-6-g5cecf99
ppsPCP/1.0
primer3/2.2.3
primer3/2.3.5
prinseq-lite/0.20.

In [40]:
module load gatk/3.8.0
bams=$(ls /powerplant/workspace/hramzr/2_Phd_PROJECT/WGS/mark_duplicates/*.bam)
OUT=/workspace/hramzr/2_Phd_PROJECT/WGS/RTC
LOG="${OUT}/logs"
GATK="/powerplant/workspace/cflcyd/software/GATK/gatk-3.8.1/GenomeAnalysisTK.jar"
JAVA="java -jar -Xmx64G"
ref="/workspace/hramzr/2_Phd_PROJECT/snapper_genome/Chrysophrys_auratus.v.1.0.chromosomes.male.map.fasta"
for bam in ${bams}
    do
        NAME=`basename ${bam}`
        PREFIX=`echo ${NAME} | awk -F[_] '{print $1}'`
        COMMAND="${JAVA} ${GATK} -T RealignerTargetCreator \
                    -R ${ref} \
                    -I ${bam} \
                    -maxInterval 420 \
                    -o ${OUT}/${PREFIX}.intervals"
                    
        bsub -o ${LOG}/${PREFIX}.out -e ${LOG}/${PREFIX}.err \
             -J "RTC ${PREFIX}" \
             -n 3 \
             -R "rusage[mem=36000] span[hosts=1]" \
             ${COMMAND}
    done
    


Job <400703> is submitted to default queue <lowpriority>.
Job <400704> is submitted to default queue <lowpriority>.
Job <400705> is submitted to default queue <lowpriority>.
Job <400706> is submitted to default queue <lowpriority>.
Job <400707> is submitted to default queue <lowpriority>.
Job <400708> is submitted to default queue <lowpriority>.
Job <400709> is submitted to default queue <lowpriority>.
Job <400710> is submitted to default queue <lowpriority>.
Job <400711> is submitted to default queue <lowpriority>.
Job <400712> is submitted to default queue <lowpriority>.
Job <400713> is submitted to default queue <lowpriority>.
Job <400714> is submitted to default queue <lowpriority>.
Job <400715> is submitted to default queue <lowpriority>.
Job <400716> is submitted to default queue <lowpriority>.
Job <400717> is submitted to default queue <lowpriority>.
Job <400718> is submitted to default queue <lowpriority>.
Job <400719> is submitted to default queue <lowpriority>.
Job <400720> i

In [43]:
bams=$(ls /powerplant/workspace/hramzr/2_Phd_PROJECT/WGS/mark_duplicates/*.bam)
INTERVALS="/workspace/hramzr/2_Phd_PROJECT/WGS/RTC"
OUT="/workspace/hramzr/2_Phd_PROJECT/WGS/IR"
LOG="${OUT}/logs"
GATK="/powerplant/workspace/cflcyd/software/GATK/gatk-3.8.1/GenomeAnalysisTK.jar"
JAVA="java -jar -Xmx64G"
ref="/workspace/hramzr/2_Phd_PROJECT/snapper_genome/Chrysophrys_auratus.v.1.0.chromosomes.male.map.fasta"
for bam in ${bams}
    do
        NAME=`basename ${bam}`
        PREFIX2=`echo ${NAME} | awk -F[_] '{print $1}'`
        COMMAND="${JAVA} ${GATK} -T IndelRealigner \
                     -R ${ref} \
                     -I ${bam} \
                     -maxIsize 780 \
                     -targetIntervals ${INTERVALS}/${PREFIX2}.intervals \
                     -o ${OUT}/${PREFIX2}_IR.bam"
        # echo ${COMMAND}
        bsub -o ${LOG}/${PREFIX2}.out -e ${LOG}/${PREFIX2}.err  \
              -J IR \
              -n 3 \
              -R "rusage[mem=48000] span[hosts=1]" \
              ${COMMAND} 
done

Job <400804> is submitted to default queue <lowpriority>.
Job <400805> is submitted to default queue <lowpriority>.
Job <400806> is submitted to default queue <lowpriority>.
Job <400807> is submitted to default queue <lowpriority>.
Job <400808> is submitted to default queue <lowpriority>.
Job <400809> is submitted to default queue <lowpriority>.
Job <400810> is submitted to default queue <lowpriority>.
Job <400811> is submitted to default queue <lowpriority>.
Job <400812> is submitted to default queue <lowpriority>.
Job <400813> is submitted to default queue <lowpriority>.
Job <400814> is submitted to default queue <lowpriority>.
Job <400815> is submitted to default queue <lowpriority>.
Job <400816> is submitted to default queue <lowpriority>.
Job <400817> is submitted to default queue <lowpriority>.
Job <400818> is submitted to default queue <lowpriority>.
Job <400819> is submitted to default queue <lowpriority>.
Job <400820> is submitted to default queue <lowpriority>.
Job <400821> i

In [118]:
bams=$(ls /powerplant/workspace/hramzr/2_Phd_PROJECT/WGS/mark_duplicates/{*008821357*.bam,*967038331*.bam})
INTERVALS="/workspace/hramzr/2_Phd_PROJECT/WGS/RTC"
OUT="/workspace/hramzr/2_Phd_PROJECT/WGS/IR"
LOG="${OUT}/logs"
GATK="/powerplant/workspace/cflcyd/software/GATK/gatk-3.8.1/GenomeAnalysisTK.jar"
JAVA="java -jar -Xmx64G"
ref="/workspace/hramzr/2_Phd_PROJECT/snapper_genome/Chrysophrys_auratus.v.1.0.chromosomes.male.map.fasta"
echo $bams
for bam in ${bams}
    do
        NAME=`basename ${bam}`
        PREFIX2=`echo ${NAME} | awk -F[_] '{print $1}'`
        COMMAND="${JAVA} ${GATK} -T IndelRealigner \
                     -R ${ref} \
                     -I ${bam} \
                     -maxIsize 780 \
                     -targetIntervals ${INTERVALS}/${PREFIX2}.intervals \
                     -o ${OUT}/${PREFIX2}_IR.bam"
        echo ${COMMAND}
        bsub -o ${LOG}/malf.out -e ${LOG}/malf.err  \
              -J IR \
              -n 3 \
              -R "rusage[mem=48000] span[hosts=1]" \
              ${COMMAND} 
done

/powerplant/workspace/hramzr/2_Phd_PROJECT/WGS/mark_duplicates/008821357_M.bam /powerplant/workspace/hramzr/2_Phd_PROJECT/WGS/mark_duplicates/967038331_M.bam
java -jar -Xmx64G /powerplant/workspace/cflcyd/software/GATK/gatk-3.8.1/GenomeAnalysisTK.jar -T IndelRealigner -R /workspace/hramzr/2_Phd_PROJECT/snapper_genome/Chrysophrys_auratus.v.1.0.chromosomes.male.map.fasta -I /powerplant/workspace/hramzr/2_Phd_PROJECT/WGS/mark_duplicates/008821357_M.bam -maxIsize 780 -targetIntervals /workspace/hramzr/2_Phd_PROJECT/WGS/RTC/008821357.intervals -o /workspace/hramzr/2_Phd_PROJECT/WGS/IR/008821357_IR.bam
Job <401518> is submitted to default queue <lowpriority>.
java -jar -Xmx64G /powerplant/workspace/cflcyd/software/GATK/gatk-3.8.1/GenomeAnalysisTK.jar -T IndelRealigner -R /workspace/hramzr/2_Phd_PROJECT/snapper_genome/Chrysophrys_auratus.v.1.0.chromosomes.male.map.fasta -I /powerplant/workspace/hramzr/2_Phd_PROJECT/WGS/mark_duplicates/967038331_M.bam -maxIsize 780 -targetIntervals /workspace/

In [44]:
module avail


---------- /software/OSutils/modules-4.4.1/share/Modules/modulefiles -----------
1SynChro/1.15
3d-dna/180922
A5-miseq/20160825
abyss/2.0.2
abyss/2.1.0
abyss/2.1.1
abyss/2.1.2
abyss/2.1.3
abyss/2.1.4
abyss/2.1.5
adapterremoval/2.2.2
AfterQC/0.9.1
AfterQC/0.9.5
allpathslg/44837
allpathslg/44913
allpathslg/49722
allpathslg/50191
allpathslg/52488
amos/3.1.0
angsd/0.913
angsd/0.917
angsd/0.917-116-g5d087b2
angsd/0.918
angsd/0.919
angsd/0.921
angsd/0.922
angsd/0.923
angsd/0.925
angsd-wrapper/7224db2
anytag/2.5.2
apsim/7.10-r49ace54f9c8a670190aef9d8d0fb9d5477bb1534
apsim/7.9-r4047
apsimx/2018.01.30.2253
apsimx/2018.09.28.3099
apsimx/2019.01.08.3392
apsimx/2019.04.03.3693
apsimx/2019.06.05.3920
apsimx/2019.07.18.4025
apsimx/2019.10.04.4236
apsimx/2020.04.09.5012
apsimx/2020.08.04.5350
arcs/1.0.6
art/MountRainier
aspera-cli/3.7.2.354.010c3b8
assemblathon_stats/14dfdab
assemblathon_stats/2011_10_13
asub/2.1
augustus/3.2.2
augustus/3.3
augustus/3.3.1
augustus/3.3.3-new
aws-cli/2.0.43
bam2fastx/1

perl/5.20.3
perl/5.20.3-thread-multi
perl/5.22.2
perl/5.22.2-thread-multi
perl/5.24.0
perl/5.24.0-thread-multi
perl/5.24.1
perl/5.24.1-thread-multi
perl/5.26.0
perl/5.26.0-thread-multi
perl/5.28.0
perl/5.28.0-thread-multi
perl/5.30.2
perl/5.30.2-thread-multi
perl/5.8.9
perl/5.8.9-thread-multi
perl/bio-extras
perl/bio-extras-5.24.0-legacy
perl/core
perl/ensembl
perl/jupyterhub
perlbrew/0.76
pfr-python2/2.7.13
pfr-python3/3.6.1
pfr-python3/3.6.5
pfr-python3/3.6.6
phylip/3.696
phylobayes/4.1c
PhyloSift/1.0.1
picard-tools/1.112
picard-tools/1.79
picard-tools/2.10.1
picard-tools/2.18.7
picard-tools/2.2.4
picard-tools/2.9.4
picrust2/2.3.0
pilon/1.18
pilon/1.20
pilon/1.23
pinfish/0.1.0
platanus/1.2.4
platanus_trim/1.0.7
Platypus/0.8.1
plink/1.07
plink/1.90b6.5
plink2/v2.00a2lm
plncpro/1.1
popoolation2/1201
porechop/0.2.3
portcullis/1.1.0
portcullis/1.1.1
portcullis/1.1.2
powerPlant/core
powerPlant/locate
pplacer/v1.1.alpha17-6-g5cecf99
ppsPCP/1.0
primer3/2.2.3
primer3/2.3.5
prinseq-lite/0.20.

# Call structural variants DELLY

In [103]:
# download docker for singularity
module load singularity/2.6.1
# singularity pull docker://sameerdcosta/parliament2
# bsub \
# -J "pull singularity docker" \
cd /workspace/hramzr/2_Phd_PROJECT/VarCallingWGS/
singularity pull --name dellySV.simg docker://dellytools/delly

ModuleCmd_Load.c(213):ERROR:105: Unable to locate a modulefile for 'singularity/2.6.1'
[31mFATAL:  [0m Image file already exists: "dellySV.simg" - will not overwrite


: 255

In [104]:
singularity run dellySV.simg \
delly 

**********************************************************************
Program: Delly
This is free software, and you are welcome to redistribute it under
certain conditions (BSD License); for license details use '-l'.
This program comes with ABSOLUTELY NO WARRANTY; for details use '-w'.

Delly (Version: 0.8.3)
Contact: Tobias Rausch (rausch@embl.de)
**********************************************************************

Usage: delly <command> <arguments>

Short-read commands:
    call         discover and genotype structural variants
    merge        merge structural variants across VCF/BCF files and within a single VCF/BCF file
    filter       filter somatic or germline structural variants

Long-read commands:
    lr           long-read SV discovery

Read-depth commands:
    rd           read-depth normalization




In [61]:
bash /workspace/hramzr/2_Phd_PROJECT/cnv_calling/delly/.

/workspace/hramzr/2_Phd_PROJECT/cnv_calling/delly/.: /workspace/hramzr/2_Phd_PROJECT/cnv_calling/delly/.: is a directory


: 126