## Make output directories

In [None]:
#######################################
Function:
Make folders for the arrow polisher output and the pbmm2 mapper output.

Output:

1. arrow_out (folder for Arrow output files)
2. pbmm2_out (folder for pbmm2 output files)
#######################################

In [8]:
# Create the output folders for the pbmm2 mapper and the Arrow polisher.
# arrow_out/canu_default is the folder the Arrow cell writes its results to,
# so it is created here as well.
# -p makes the cell safe to re-run: existing folders are left untouched and
# missing parent folders are created.
mkdir -p arrow_out/canu_default pbmm2_out

(base) (base) 

: 1

## Installing Anaconda 2, pbmm2 and ARROW

In [None]:
#######################################
Function:
Install Anaconda 2, which is a prerequisite to downloading tools with conda. Then download pbmm2 pacbio mapper for the 
mapping step, which is needed as a prerequisite for polishing with arrow.

Output:

1. pbmm2 (Installed)
2. Arrow (Installed)
3. Anaconda 2 (Installed)
#######################################

In [None]:
# First install Anaconda2 (ships with a built-in Python 2.7 interpreter).
# -b runs the installer in batch mode (no prompts; this implies accepting the
#    Anaconda license), because a notebook cell cannot answer interactive prompts.
# -p pins the install prefix to the location that is put on PATH below.
wget https://repo.continuum.io/archive/Anaconda2-4.2.0-Linux-x86_64.sh
bash Anaconda2-4.2.0-Linux-x86_64.sh -b -p /home/hramzr/anaconda2
rm Anaconda2-4.2.0-Linux-x86_64.sh

# Set PATH for local usage, because of user rights.
export PATH="/home/hramzr/anaconda2/bin:$PATH"

# Create the "pacbio" environment containing PacBio GenomicConsensus (Arrow).
# -y answers the confirmation prompt automatically.
conda create -y -c bioconda -n pacbio python=2.7 genomicconsensus

# Install pbmm2 into the SAME environment (-n pacbio). Without -n the package
# would be installed into the base environment, because "pacbio" is not
# activated yet at this point.
conda install -y -n pacbio -c bioconda pbmm2

# Enter the environment.
source activate pacbio

# Leave the environment again ("source deactivate" takes no environment name).
source deactivate

## Prep for pbmm2: merge the subreads, as only BAM input is accepted by the eventual GenomicConsensus (Arrow) step

In [None]:
#######################################
Input:
1. m54111_190529_173814.subreads.bam (at gscmnt)
2. m54111_190531_160028.subreads.bam (at gscmnt)
3. m54111_190601_021539.subreads.bam (at gscmnt)
4. m54111_190601_123140.subreads.bam (at gscmnt)

Function:
Merge subreads bam files to use them as input for pbmm2.

Output:

1. all_subreads.bam (bam file with all pacbio subreads)
#######################################

In [33]:
module load samtools/1.9

# Root folder of the SMRT Link run data; the four subreads BAM files live below it.
smrt_root=/workspace/hramzr/github/Trevally/PacBio/gscmnt/gc13036/production/smrtlink_data_root
r1=${smrt_root}/r54111_20190529_172127/1_A01/m54111_190529_173814.subreads.bam
r2A=${smrt_root}/r54111_20190531_145345/1_A01/m54111_190531_160028.subreads.bam
r2B=${smrt_root}/r54111_20190531_145345/2_B01/m54111_190601_021539.subreads.bam
r2C=${smrt_root}/r54111_20190531_145345/3_C01/m54111_190601_123140.subreads.bam

logdir=/workspace/hramzr/github/Trevally/PacBio/log/
merged_dir=/workspace/hramzr/github/Trevally/PacBio/merged_subreads/

# Number of LSF slots to reserve for the job.
slots=32

# Make sure the log and output folders exist before the job starts writing to them.
mkdir -p "${logdir}" "${merged_dir}"

# Merge the four subreads BAM files into a single BAM file.
# samtools merge is single-threaded by default; -@ gives it additional
# (de)compression threads so the reserved slots are actually used
# (slots - 1 additional threads + the main thread).
# The variables are expanded at submission time because the command is double-quoted.
bsub -o ${logdir}Smerge.out -e ${logdir}Smerge.err -J "merging" -n ${slots} -m wkoppg33 \
"samtools merge -@ $((slots - 1)) ${merged_dir}all_subreads.bam ${r1} ${r2A} ${r2B} ${r2C}"

Job <370034> is submitted to default queue <normal>.


## Index ref for pbmm2 and ARROW, after BLASR errors 

In [None]:
#######################################
Input:
1. fetreval.fasta.contigs.fasta (FASTA format reference sequence file)

Function:
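Index the reference sequence with pbmm2 index and samtools faidx; both indexes are needed to run pbmm2 and Arrow.

Output:

1. ref.mmi (Indexed reference file in minimap format)
2. fetreval.fasta.contigs.fasta.fai (FASTA index of the reference, written by samtools faidx)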
#######################################

In [30]:
# Set PATH for local usage and work from the polishing folder.
export PATH="/home/hramzr/anaconda2/bin:$PATH"
cd /workspace/hramzr/github/Trevally/PacBio/polishing

# logdir ends with a slash, so file names are appended directly (no extra "/").
logdir=/workspace/hramzr/github/Trevally/PacBio/log/
ref=/workspace/hramzr/github/Trevally/PacBio/canu_default/fetreval.fasta.contigs.fasta
ref_index=/workspace/hramzr/github/Trevally/PacBio/canu_default/ref.mmi

# Make sure both log folders used below exist: the shared one (${logdir}) and
# the "log" folder relative to the polishing folder.
mkdir -p "${logdir}" log

# 1) Build the minimap2-format index (.mmi) of the reference with pbmm2.
#    pbmm2 comes from the "pacbio" conda environment, so the job is submitted
#    while that environment is active.
source activate pacbio
bsub -o ${logdir}pbmm2i.out -e ${logdir}pbmm2i.err -m wkoppg33 -J "pbmm2_index" -q "normal" -n 64 \
"pbmm2 index ${ref} ${ref_index}"
source deactivate

# 2) Build the FASTA index (.fai) of the reference with samtools faidx.
module load samtools/1.9
bsub -o log/refind.out -e log/refind.err \
"samtools faidx ${ref}"

(pacbio) (pacbio) (pacbio) (pacbio) (pacbio) (pacbio) Job <369998> is submitted to queue <normal>.
(pacbio) 

## Align with pbmm2 and sort

In [None]:
#######################################
Input:
1. ref.mmi (Indexed reference file in minimap format)
2. all_subreads.bam (bam file with all pacbio subreads)

Function:
Align the subreads to the reference to create a mapped BAM file, then sort that BAM file with samtools.
Both steps are prerequisites for Arrow polishing.

Output:

1. pbmm2.sorted.bam (pbmm2 mapped subreads, sorted by samtools)
#######################################

In [36]:
# Set PATH for local usage and work from the polishing folder.
export PATH="/home/hramzr/anaconda2/bin:$PATH"
cd /workspace/hramzr/github/Trevally/PacBio/polishing

# samtools is needed for the sort step of the pipeline below; load it here so
# this cell does not depend on another cell having loaded the module.
module load samtools/1.9
source activate pacbio

bamfiles=/workspace/hramzr/github/Trevally/PacBio/merged_subreads/all_subreads.bam
# logdir and outdir end with a slash, so file names are appended directly.
logdir=/workspace/hramzr/github/Trevally/PacBio/log/
refmi=/workspace/hramzr/github/Trevally/PacBio/canu_default/ref.mmi
outdir=/workspace/hramzr/github/Trevally/PacBio/polishing/pbmm2_out/

# Make sure the log and output folders exist before the job writes to them.
mkdir -p "${logdir}" "${outdir}"

# Align the merged subreads against the indexed reference and sort the result.
# The pipeline runs under "bash -o pipefail" so that a failure of pbmm2 is
# reported as a job failure; in a plain pipeline only the exit status of
# "samtools sort" counts, and a truncated BAM file would look like a success.
# The variables are expanded at submission time (outer double quotes); the
# inner single quotes keep the pipe and redirect for the job's own shell.
bsub -o ${logdir}pbmm2a.out -e ${logdir}pbmm2a.err -m wkoppg33 -J "pbmm2_align" -q "normal" -n 64 \
"bash -o pipefail -c 'pbmm2 align ${refmi} ${bamfiles} --min-id-perc 70 | samtools sort > ${outdir}pbmm2.sorted.bam'"

source deactivate

(pacbio) (pacbio) (pacbio) (pacbio) (pacbio) (pacbio) (pacbio) Job <370274> is submitted to queue <normal>.
(pacbio) 

## Index mapped bamfile with pacbio index

In [None]:
#######################################
Input:
1. pbmm2.sorted.bam (pbmm2 mapped subreads, sorted by samtools)

Function:
Index pbmm2.sorted.bam file in pacbio index format.

Output:

1. pbmm2.sorted.bam.pbi (indexed pbmm2 mapped file)
#######################################

In [3]:
# Set PATH for local usage and work from the polishing folder.
export PATH="/home/hramzr/anaconda2/bin:$PATH"
cd /workspace/hramzr/github/Trevally/PacBio/polishing

# Sorted, mapped BAM file written by the pbmm2 alignment step. The same
# /workspace/... path is used as in the alignment and Arrow cells, so the
# .pbi index is created next to the file that Arrow reads.
sort_file=/workspace/hramzr/github/Trevally/PacBio/polishing/pbmm2_out/pbmm2.sorted.bam

# Make sure the relative log folder exists before the job writes to it.
mkdir -p log

# Create the PacBio index (pbmm2.sorted.bam.pbi) of the mapped BAM file.
source activate pacbio
bsub -o log/pbi.out -e log/pbi.err \
"pbindex ${sort_file}"
source deactivate

(pacbio) Job <374782> is submitted to default queue <normal>.
(pacbio) 

: 1

## Running Arrow

In [None]:
#######################################
Input:
1. pbmm2.sorted.bam (pbmm2 mapped subreads, sorted by samtools)
2. fetreval.fasta.contigs.fasta (FASTA format reference sequence file)

Function:
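Polish the assembly with Arrow, using the PacBio subreads mapped by pbmm2.

Output:

1. consensus_canu_default.fastq (Consensus polished assembly file in FASTQ format)
2. consensus_canu_default.fasta (Consensus polished assembly file in FASTA format)
3. consensus_canu_defaultvariants.gff (Variant calls in GFF format)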
#######################################

In [4]:
# Set PATH for local usage, because of user rights.
export PATH="/home/hramzr/anaconda2/bin:$PATH"
cd /workspace/hramzr/github/Trevally/PacBio/polishing
source activate pacbio

# Polish the Canu assembly with Arrow (GenomicConsensus), using the subreads
# that were mapped with pbmm2.
# NOTE(review): the original comment here referred to running BLASR with
# parameters from similar Nile tilapia research
# (https://bmcgenomics.biomedcentral.com/articles/10.1186/s12864-017-3723-5);
# this cell runs Arrow, not BLASR -- confirm whether that reference still applies.

# logdir and odir end with a slash, so file names are appended directly.
logdir=/workspace/hramzr/github/Trevally/PacBio/log/
ref=/workspace/hramzr/github/Trevally/PacBio/canu_default/fetreval.fasta.contigs.fasta
pbmm2_map=/workspace/hramzr/github/Trevally/PacBio/polishing/pbmm2_out/pbmm2.sorted.bam
odir=/workspace/hramzr/github/Trevally/PacBio/polishing/arrow_out/canu_default/

# Number of LSF slots to reserve; the same number is passed to Arrow as worker
# processes (-j), since Arrow does not use the reserved slots on its own.
threads=32

# Make sure the log and output folders exist before the job writes to them.
mkdir -p "${logdir}" "${odir}"

# stdout goes to arrow.out and stderr to arrow.err, so error messages are kept
# apart from the regular output.
# Arrow writes three outputs: the variants (GFF) and the polished consensus in
# FASTA and FASTQ format.
bsub -o ${logdir}arrow.out -e ${logdir}arrow.err -m wkoppg31 -J "arrow" -q "normal" -n ${threads} \
"arrow ${pbmm2_map} \
  -j ${threads} \
  -r ${ref} -o ${odir}consensus_canu_defaultvariants.gff \
  -o ${odir}consensus_canu_default.fasta -o ${odir}consensus_canu_default.fastq"

source deactivate


(pacbio) (pacbio) (pacbio) (pacbio) (pacbio) (pacbio) (pacbio) (pacbio) (pacbio) (pacbio) (pacbio) Job <374786> is submitted to queue <normal>.
(pacbio) (pacbio) 