# Remove contaminants from 5 draft haplotypes

| Assembly Tag | File path |
|--------------|-----------|
| progeny_M7_hap1 | /powerplant/workspace/hraczw/github/bioinf_blueberry_trio/M7.bp.hap1.p_ctg.fa |
| progeny_M7_hap2 | /powerplant/workspace/hraczw/github/bioinf_blueberry_trio/M7.bp.hap2.p_ctg.fa |
| progeny_Nui_hap1 | /powerplant/workspace/hraczw/github/bioinf_blueberry_trio/Nui.bp.hap1.p_ctg.fa |
| progeny_Nui_hap2 | /powerplant/workspace/hraczw/github/bioinf_blueberry_trio/Nui.bp.hap2.p_ctg.fa |
| M7xNui_min2k_Flye | /powerplant/workspace/hraijc/Blueberry/BB_M7xNui_Assembly/03_FLYE/M7xNui_min2k_Flye.fasta |

## Assembly-QC (v1) report

https://storage.powerplant.pfr.co.nz/workspace/hrasrb/Blueberry_trio/Contamination_check/report.html


# 00. Get bed file of each contig ID, start & end

In [2]:
mkdir /powerplant/workspace/hrasrb/Blueberry_trio/2023-09-01_remove_contamination
cd /powerplant/workspace/hrasrb/Blueberry_trio/2023-09-01_remove_contamination

cat <<EOF > Blueberry_haplotypes.txt
progeny_M7_hap1
progeny_M7_hap2
progeny_Nui_hap1
progeny_Nui_hap2
M7xNui_min2k_Flye
EOF

## create soft links

In [3]:
pwd

/powerplant/workspace/hrasrb/Blueberry_trio/2023-09-01_remove_contamination


In [4]:
ln -s /powerplant/workspace/hraczw/github/bioinf_blueberry_trio/M7.bp.hap1.p_ctg.fa progeny_M7_hap1.fa
ln -s /powerplant/workspace/hraczw/github/bioinf_blueberry_trio/M7.bp.hap2.p_ctg.fa progeny_M7_hap2.fa
ln -s /powerplant/workspace/hraczw/github/bioinf_blueberry_trio/Nui.bp.hap1.p_ctg.fa progeny_Nui_hap1.fa
ln -s /powerplant/workspace/hraczw/github/bioinf_blueberry_trio/Nui.bp.hap2.p_ctg.fa progeny_Nui_hap2.fa
ln -s /powerplant/workspace/hraijc/Blueberry/BB_M7xNui_Assembly/03_FLYE/M7xNui_min2k_Flye.fasta M7xNui_min2k_Flye.fa

## index FASTA

In [None]:
#!/bin/bash -e

#SBATCH -J RemoveContamination # change
#SBATCH --output=/powerplant/workspace/hrasrb/Blueberry_trio/2023-09-01_remove_contamination/log/hrasrb_%j.out
#SBATCH --error=/powerplant/workspace/hrasrb/Blueberry_trio/2023-09-01_remove_contamination/log/hrasrb_%j.err
#SBATCH --mail-user=Sarah.Bailey@plantandfood.co.nz
#SBATCH --mail-type=ALL
#SBATCH --time=00:15:00
#SBATCH --mem=500M

base=/powerplant/workspace/hrasrb/Blueberry_trio/2023-09-01_remove_contamination

file=Blueberry_haplotypes.txt

cd "${base}"

ml samtools/1.16

cat "${file}" | while read tag
do

    samtools faidx "${tag}.fa"
    cut -f 1,2 "${tag}.fa.fai" | awk '{print $1"\t1\t"$2}' > "./${tag}.fai.bed"

done


module unload samtools

# 01. Get contaminated contig & exclude contaminated contig from assembly
* .withAdaptor.fa
* .noAdaptor.fa

| tag | accession | length | action | range |
|--------------|--------------|-----------|--------------|-----------|
| M7xNui_min2k_Flye | contig_12707 | 7205 | ACTION_TRIM | 7174..7205 |
| progeny_Nui_hap1 | h1tg000466l | 31900 | ACTION_TRIM | 329..373 |

In [7]:
cd /workspace/hrasrb/Blueberry_trio/2023-09-01_remove_contamination

# get contaminated contigs
grep contig_12707 M7xNui_min2k_Flye.fai.bed | cut -f 1 - > M7xNui_min2k_Flye_withAdaptor.fai.bed
grep h1tg000466l progeny_Nui_hap1.fai.bed | cut -f 1 - > progeny_Nui_hap1_withAdaptor.fai.bed

In [8]:
cat M7xNui_min2k_Flye_withAdaptor.fai.bed

contig_12707


In [9]:
base=/powerplant/workspace/hrasrb/Blueberry_trio/2023-09-01_remove_contamination
log=${base}/log

cd ${base}

M7xNui_min2k_Flye=M7xNui_min2k_Flye.fa
id_M7xNui=$(basename ${M7xNui_min2k_Flye} .fa)

progeny_Nui_hap1=progeny_Nui_hap1.fa
id_progeny_Nui_hap1=$(basename ${progeny_Nui_hap1} .fa)

ml seqkit

# create and submit bash script
sbatch << EOF
#!/bin/bash -e

#SBATCH -J seqkit # change this
#SBATCH --output=${log}/hrasrb_%j.out
#SBATCH --error=${log}/hrasrb_%j.err
#SBATCH --mail-user=Sarah.Bailey@plantandfood.co.nz
#SBATCH --mail-type=ALL
#SBATCH --time=00:01:00 # Walltime # change this
#SBATCH --mem=1G # change this

# could you blast that contig containing adapter against NCBI for double checking? 
seqkit grep --pattern-file M7xNui_min2k_Flye_withAdaptor.fai.bed ${M7xNui_min2k_Flye} > "${id_M7xNui}.contig_12707.fa"
seqkit grep --pattern-file progeny_Nui_hap1_withAdaptor.fai.bed ${progeny_Nui_hap1} > "${id_progeny_Nui_hap1}.h1tg000466l.fa"

# remove contig with adaptor from the assemblies
seqkit grep --pattern-file M7xNui_min2k_Flye_withAdaptor.fai.bed --invert-match ${M7xNui_min2k_Flye} > "${id_M7xNui}.noAdaptor.fa"
seqkit grep --pattern-file progeny_Nui_hap1_withAdaptor.fai.bed --invert-match ${progeny_Nui_hap1} > "${id_progeny_Nui_hap1}.noAdaptor.fa"

EOF

module unload seqkit

Submitted batch job 2011006


In [12]:
base=/powerplant/workspace/hrasrb/Blueberry_trio/2023-09-01_remove_contamination

cd ${base}

ll "${id_M7xNui}.contig_12707.fa"
ll "${id_progeny_Nui_hap1}.h1tg000466l.fa"
ll "${id_M7xNui}.noAdaptor.fa"
ll "${id_progeny_Nui_hap1}.noAdaptor.fa"

-rw-rw-r--. 1 hrasrb powerplant 7340 Sep 15 10:08 M7xNui_min2k_Flye.contig_12707.fa
-rw-rw-r--. 1 hrasrb powerplant 32445 Sep 15 10:08 progeny_Nui_hap1.h1tg000466l.fa
-rw-rw-r--. 1 hrasrb powerplant 1775149697 Sep 15 10:08 M7xNui_min2k_Flye.noAdaptor.fa
-rw-rw-r--. 1 hrasrb powerplant 541017886 Sep 15 10:08 progeny_Nui_hap1.noAdaptor.fa


In [13]:
grep contig_12707 M7xNui_min2k_Flye.fai.bed | cut -f 1,2 - | awk '{print $1"\t"$2"\t7173"}' > M7xNui_min2k_Flye_edited.withAdaptor.fai.bed

In [16]:
grep h1tg000466l progeny_Nui_hap1.fai.bed | cut -f 1,3 - | awk '{print $1"\t374\t"$2}' > progeny_Nui_hap1_edited.withAdaptor.fai.bed

In [17]:
cat progeny_Nui_hap1_edited.withAdaptor.fai.bed

h1tg000466l	374	31900


In [18]:
base=/powerplant/workspace/hrasrb/Blueberry_trio/2023-09-01_remove_contamination
log=${base}/log

cd ${base}

ml seqkit

# create and submit bash script
sbatch << EOF
#!/bin/bash -e

#SBATCH -J seqkit # change this
#SBATCH --output=${log}/hrasrb_%j.out
#SBATCH --error=${log}/hrasrb_%j.err
#SBATCH --mail-user=Sarah.Bailey@plantandfood.co.nz
#SBATCH --mail-type=ALL
#SBATCH --time=00:01:00 # Walltime # change this
#SBATCH --mem=1G # change this

# remove adaptor
seqkit subseq --bed M7xNui_min2k_Flye_edited.withAdaptor.fai.bed M7xNui_min2k_Flye.contig_12707.fa >  M7xNui_min2k_Flye.contig_12707.removeAdaptor.subseq.fa
seqkit subseq --bed progeny_Nui_hap1_edited.withAdaptor.fai.bed progeny_Nui_hap1.h1tg000466l.fa > progeny_Nui_hap1.h1tg000466l.removeAdaptor.subseq.fa

EOF

module unload seqkit

Submitted batch job 2011008


In [19]:
ll *.removeAdaptor.subseq.fa

-rw-rw-r--. 1 hrasrb powerplant  7316 Sep 15 10:14 M7xNui_min2k_Flye.contig_12707.removeAdaptor.subseq.fa
-rw-rw-r--. 1 hrasrb powerplant 32078 Sep 15 10:14 progeny_Nui_hap1.h1tg000466l.removeAdaptor.subseq.fa


## Blast that contig containing adapter against NCBI for double checking

In [14]:
ml ncbi-blast

In [19]:
module list

Currently Loaded Modulefiles:
 1) [46mpowerPlant/core[0m    4) git/2.21.0        7) perl/5.36.0         
 2) texlive/20151117   5) [46mSlurm/21.08.8-2[0m   8) slurm-utils/latest  
 3) pandoc/1.19.2      6) perlbrew/0.76     9) ncbi-blast/2.11.0   

Key:
[46msticky[0m  


In [25]:
# gnl|uv|NGB00293.1 Multimer of EcoRI adaptor used in I.M.A.G.E. library Barstead MPL-RB15 and other libraries
# gnl|uv|NGB00972.1 Pacific Biosciences Blunt Adapter
echo ">gnl|uv|NGB00972.1:1-45 Pacific Biosciences Blunt Adapter
ATCTCTCTCTTTTCCTCCTCCTCCGTTGTTGTTGTTGAGAGAGAT" > Pacific_Biosciences_Blunt_Adapter.fa

In [21]:
pwd

/powerplant/workspace/hrasrb/Blueberry_trio/2023-09-01_remove_contamination


In [26]:
cat Pacific_Biosciences_Blunt_Adapter.fa

>gnl|uv|NGB00972.1:1-45 Pacific Biosciences Blunt Adapter
ATCTCTCTCTTTTCCTCCTCCTCCGTTGTTGTTGTTGAGAGAGAT


In [25]:
ml ncbi-blast/2.11.0

base=/powerplant/workspace/hrasrb/Blueberry_trio/2023-09-01_remove_contamination
LOG=${base}/log

M7xNui_min2k_Flye=${base}/M7xNui_min2k_Flye.contig_12707.fa

progeny_Nui_hap1=${base}/progeny_Nui_hap1.h1tg000466l.fa

WKDIR=${base}/2023-09-06_PB_adaptor_blast

# mkdir $WKDIR

cd $WKDIR

# create and submit bash script
sbatch << EOF
#!/bin/bash -e

#SBATCH -J BLAST # change this
#SBATCH --output=${LOG}/hrasrb_%j.out
#SBATCH --error=${LOG}/hrasrb_%j.err
#SBATCH --mail-user=Sarah.Bailey@plantandfood.co.nz
#SBATCH --mail-type=ALL
#SBATCH --time=00:01:00 # Walltime # change this
#SBATCH --mem=1M # change this

# makeblastdb -dbtype nucl -parse_seqids -in ${base}/Pacific_Biosciences_Blunt_Adapter.fa

blastn  -query ${M7xNui_min2k_Flye} -db ${base}/Pacific_Biosciences_Blunt_Adapter.fa -out M7xNui_min2k_Flye.contig_12707.html -html
blastn  -query ${progeny_Nui_hap1} -db ${base}/Pacific_Biosciences_Blunt_Adapter.fa -out progeny_Nui_hap1.h1tg000466l.html -html

EOF

module unload ncbi-blast
squeue -u hrasrb

Submitted batch job 2011261
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
           2011261      fast    BLAST   hrasrb PD       0:00      1 (None)


In [1]:
base=/powerplant/workspace/hrasrb/Blueberry_trio/2023-09-01_remove_contamination

WKDIR=${base}/2023-09-06_PB_adaptor_blast
cd $WKDIR
grep "Sequences producing significant alignments:" -A 15 *html

progeny_Nui_hap1.h1tg000466l.html:Sequences producing significant alignments:                          (Bits)  Value
progeny_Nui_hap1.h1tg000466l.html-
progeny_Nui_hap1.h1tg000466l.html-uv:NGB00972.1:1-45 Pacific Biosciences Blunt Adapter                  <a href=#uv:NGB00972.1:1-45>84.2</a>    5e-20
progeny_Nui_hap1.h1tg000466l.html-
progeny_Nui_hap1.h1tg000466l.html-
progeny_Nui_hap1.h1tg000466l.html->uv:NGB00972.1:1-45<a name=uv:NGB00972.1:1-45></a> Pacific Biosciences Blunt Adapter
progeny_Nui_hap1.h1tg000466l.html-Length=45
progeny_Nui_hap1.h1tg000466l.html-
progeny_Nui_hap1.h1tg000466l.html- Score = 84.2 bits (45),  Expect = 5e-20
progeny_Nui_hap1.h1tg000466l.html- Identities = 45/45 (100%), Gaps = 0/45 (0%)
progeny_Nui_hap1.h1tg000466l.html- Strand=Plus/Minus
progeny_Nui_hap1.h1tg000466l.html-
progeny_Nui_hap1.h1tg000466l.html-Query  329  ATCTCTCTCAACAACAACAACGGAGGAGGAGGAAAAGAGAGAGAT  373
progeny_Nui_hap1.h1tg000466l.html-            |||||||||||||||||||||||||||||||||||||||||||||

## Rename contigs & merge back into reference

In [14]:
wkdir=/powerplant//workspace/hrasrb/Blueberry_trio/2023-09-01_remove_contamination/no_adaptor_contamination
cd ${wkdir}

file="Blueberry_haplotypes.txt"
if [ -f "$file" ] ; then
    rm "$file"
fi

# contig_12707_2-7173:. contig_12707
cat <<EOF > Blueberry_haplotypes.txt
contig_12707 contig_12707_trimmed_1_7173
EOF

cat Blueberry_haplotypes.txt

contig_12707 contig_12707_trimmed_1_7173


In [15]:
wkdir=/powerplant/workspace/hrasrb/Blueberry_trio/2023-09-01_remove_contamination/no_adaptor_contamination
cd ${wkdir}

input_file="M7xNui_min2k_Flye.contig_12707.removeAdaptor.subseq.rename.fa"
output_file="M7xNui_min2k_Flye.contig_12707.removeAdaptor.subseq.rename.2.fa"
mapping_file="Blueberry_haplotypes.txt"

awk 'NR==FNR{id_map[$1]=$2; next} /^>/{print ">" id_map[substr($1,2)]; next} {print}' "$mapping_file" "$input_file" > "$output_file"

In [17]:
head M7xNui_min2k_Flye.contig_12707.removeAdaptor.subseq.rename.2.fa
# head progeny_Nui_hap1.h1tg000466l.removeAdaptor.subseq.rename.2.fa

>contig_12707_trimmed_1_7173
ATCAAAATGATCTTCATTGTTAAATCTTCCGAGTGAGAGCAAAAGGCTTTGTCTCTCACT
CGAGTTATAGTTGAGAGCCATTGAATATCCTTAGTCATGGGTGGTATGAACCTCTAGATC
TTGCCGCCTTTAATTTATAGTTTAAAACTTCATTTTCAATCGCATTAGAGGTAGAGAGAG
AATTCCACAAACAAACCAAACCAAAAGGTTGGTCGTCCTAACGCCGAAACCCCTACTGTA
AAGTAAACCTTTCACTAGCTCCGACCCTTCCCTGTGGATTCGACCTCGGACTTCCAAGTT
ATTATGCTACAACCGACCTAGTCCTACGCTTGGGGCGACGCTACTACGATACAAAGCTAG
GTCGCAAGCATTTTTGGCGCCATTGCCAGGGAAGGCAACAAATGGTGAGTTAGAAAGGAG
TTTGCTTTACTCCATTGCAAGTATATCCATCTTTTCATTTTTGTTTTGTTTGTTGTTTTC
AGCTTTTGTTTATTTTCAGGTCCTTTTAAAGAAAATCCTATTAAAAAAAATGGCTTGTTT


In [19]:
wkdir=/powerplant/workspace/hrasrb/Blueberry_trio/2023-09-01_remove_contamination
cd ${wkdir}
gzip ./no_adaptor_contamination/progeny_Nui_hap1.h1tg000466l.removeAdaptor.subseq.rename.2.fa.gz
gzip ./no_adaptor_contamination/M7xNui_min2k_Flye.contig_12707.removeAdaptor.subseq.rename.2.fa.gz

# 02. Exclude contaminated contigs

## make sure to use noAdaptor.fa for M7xNui_min2k_Flye & progeny_Nui_hap1

In [3]:
cd /powerplant/workspace/hrasrb/Blueberry_trio/2023-09-01_remove_contamination

cat <<EOF > Blueberry_haplotypes.txt
progeny_M7_hap1
progeny_M7_hap2
progeny_Nui_hap1
progeny_Nui_hap2
M7xNui_min2k_Flye
EOF

In [None]:
#!/bin/bash -e

#SBATCH -J ContaminationActions # change
#SBATCH --output=log/hrasrb_%j.out
#SBATCH --error=log/hrasrb_%j.err
#SBATCH --mail-user=Sarah.Bailey@plantandfood.co.nz
#SBATCH --mail-type=ALL
#SBATCH --time=00:05:00
#SBATCH --mem=1M

base=/powerplant/workspace/hrasrb/Blueberry_trio/2023-09-01_remove_contamination

file=${base}/Blueberry_haplotypes.txt

cd /powerplant/workspace/hrasrb/Blueberry_trio/Contamination_check/ncbi_fcs_gx/

cat "${file}" | while read tag
do
    grep -v "^#" "${tag}.fcs_gx_report.txt" | grep EXCLUDE | cut -f 1 > "${tag}.fcs_gx_report.EXCLUDE.txt"
    grep -v "^#" "${tag}.fcs_gx_report.txt" | grep REVIEW | cut -f 1,2,3 - | awk '{print $1"\t"$2"\t"$3}' > "${tag}.fcs_gx_report.REVIEW.txt"
    grep -v "^#" "${tag}.fcs_gx_report.txt" | grep TRIM | cut -f 1,2,3 - | awk '{print $1"\t"$2"\t"$3}' > "${tag}.fcs_gx_report.TRIM.txt"
    grep -v "^#" "${tag}.fcs_gx_report.txt" | grep FIX | cut -f 1,2,3 - | awk '{print $1"\t"$2"\t"$3}' > "${tag}.fcs_gx_report.FIX.txt"
    grep -v "^#" "${tag}.fcs_gx_report.txt" | grep REVIEW_RARE | cut -f 1,2,3 - | awk '{print $1"\t"$2"\t"$3}' > "${tag}.fcs_gx_report.REVIEW_RARE.txt"
done

In [4]:
cd /powerplant/workspace/hrasrb/Blueberry_trio/2023-09-01_remove_contamination

cat <<EOF > Blueberry_haplotypes.fofn
/powerplant/workspace/hrasrb/Blueberry_trio/2023-09-01_remove_contamination/M7xNui_min2k_Flye.noAdaptor.fa
/powerplant/workspace/hrasrb/Blueberry_trio/2023-09-01_remove_contamination/progeny_Nui_hap1.noAdaptor.fa
EOF

In [None]:
#!/bin/bash -e

#SBATCH -J ContaminationActions # change
#SBATCH --output=log/hrasrb_%j.out
#SBATCH --error=log/hrasrb_%j.err
#SBATCH --mail-user=Sarah.Bailey@plantandfood.co.nz
#SBATCH --mail-type=ALL
#SBATCH --time=00:24:00
#SBATCH --mem=1M

ml seqkit

base=/powerplant/workspace/hrasrb/Blueberry_trio/2023-09-01_remove_contamination

file=./Blueberry_haplotypes.fofn

cd "$base"

cat "${file}" | while read line
do
    tag=$(basename "${line}" .fa)
    seqkit grep --pattern-file "${tag}.fcs_gx_report.EXCLUDE.txt" --invert-match "${line}" -o "${tag}.noContamination.fa"
    gzip "${tag}.noContamination.fa"
done

In [18]:
base=/powerplant/workspace/hrasrb/Blueberry_trio/2023-09-01_remove_contamination
cd $base
grep contig_12707 *EXCLUDE.txt
grep h1tg000466l *EXCLUDE.txt

: 1

# Combine no adaptor and no contamination

In [None]:
wkdir=/powerplant/workspace/hrasrb/Blueberry_trio/2023-09-01_remove_contamination
cd ${wkdir}
cat ./no_contamination/progeny_Nui_hap1.noContamination.fa.gz ./no_adaptor_contamination/progeny_Nui_hap1.h1tg000466l.removeAdaptor.subseq.rename.2.fa.gz > ./no_contamination/progeny_Nui_hap1.noAdaptor.noContamination.fa.gz

In [None]:
cat ./no_contamination/M7xNui_min2k_Flye.noContamination.fa.gz ./no_adaptor_contamination/M7xNui_min2k_Flye.contig_12707.removeAdaptor.subseq.rename.2.fa.gz > ./no_contamination/M7xNui_min2k_Flye.noAdaptor.noContamination.fa.gz

# Move to blobtools to confirm contamination