Skip to content

Latest commit

 

History

History
89 lines (71 loc) · 3.55 KB

benchmark1.md

File metadata and controls

89 lines (71 loc) · 3.55 KB

Benchmarking NextPolish2 using simulated data

Requirement

Software commands

  1. Download reference
wget https://www.arabidopsis.org/download_files/Genes/Col-CEN%20genome%20assembly%20release/ColCEN.fasta
  1. Simulate diploid genome
pirs diploid --snp-rate 0.005 --indel-rate 0.002 --random-seed=1 ColCEN.fasta -o alta.simu
cat ColCEN.fasta alta.simu.snp.indel.inversion.fa > alta.sim.diploid.fa
  1. Simulate PacBio HiFi data
pbsim --strategy wgs --method errhmm --errhmm ERRHMM-SEQUEL.model --depth 30 --genome alta.sim.diploid.fa --length-mean 13000 --prefix alta.sim --id-prefix alta.sim --length-sd 2000 --pass-num 10
ls *sam|while read line;do samtools view -b $line > $line.bam && conda run ccs $line.bam $line.hifi.bam && samtools fasta $line.hifi.bam > $line.fa; done;
cat alta.sim.hifi*.sam.fa > alta.sim.diploid.hifi.fa
  1. Simulate Illumina data
art_illumina -ss HS25 -i ColCEN.fasta -p -l 150 -f 50 -m 300 -s 10 -o alta.sim.hap1.R
#output: alta.sim.hap1.R1.fq alta.sim.hap1.R2.fq 

art_illumina -ss HS25 -i alta.simu.snp.indel.inversion.fa -p -l 150 -f 50 -m 300 -s 10 -o alta.sim.hap2.R
#output: alta.sim.hap2.R1.fq alta.sim.hap2.R2.fq 
  1. Assemble reference
hifiasm -o alta.sim.diploid.hifiasm -t 20 alta.sim.diploid.hifi.fa
awk '{if ($1~/^S/ && length($3)>=1000000){print ">"$2;print $3}}' alta.sim.diploid.hifiasm.p_ctg.gfa > alta.sim.diploid.hifiasm.p_ctg.gfa.fa
  1. Polishing with Racon + Merfin
k=19
thread=5
# Construct k-mer db
meryl count k=${k} threads=${thread} alta.sim.hap*.fq output reads.meryl

# Collect histogram for GenomeScope
meryl histogram reads.meryl > reads.hist

# Exclude frequency = 1 k-mers
meryl greater-than 1 reads.meryl output reads.gt1.meryl

bash automated-polishing.sh ${thread} 1 alta.sim.diploid.hifiasm.p_ctg.gfa.fa alta.sim.diploid.hifi.fa reads.gt1.meryl racon.meryl
  1. Polishing with NextPolish2
thread=5
yak count -t ${thread} -k 21 -b 37 -o sr.k21.yak <(cat alta.sim.hap*.fq) <(cat alta.sim.hap*.fq)
yak count -t ${thread} -k 31 -b 37 -o sr.k31.yak <(cat alta.sim.hap*.fq) <(cat alta.sim.hap*.fq)

HiFi_mapping_file=racon.meryl.iter_1.winnowmap.bam # output by `Racon + Merfin`
samtools index ${HiFi_mapping_file}

nextPolish2 -t ${thread} -r ${HiFi_mapping_file} alta.sim.diploid.hifiasm.p_ctg.gfa.fa sr.k21.yak sr.k31.yak -o alta.sim.diploid.hifiasm.p_ctg.gfa.np2.fa
  1. Performance assessment
thread=5
yak count -t ${thread} -k 21 -b 37 -o sr.hap1.k21.yak <(cat alta.sim.hap1*.fq) <(cat alta.sim.hap1*.fq)
yak count -t ${thread} -k 21 -b 37 -o sr.hap2.k31.yak <(cat alta.sim.hap2*.fq) <(cat alta.sim.hap2*.fq)

asm=alta.sim.diploid.hifiasm.p_ctg.gfa.fa #alta.sim.diploid.hifiasm.p_ctg.gfa.np2.fa, racon.meryl.iter_1.consensus.fasta
yak qv sr.k21.yak ${asm} > ${asm}.qv
yak trioeval sr.hap1.k21.yak sr.hap2.k31.yak ${asm} > ${asm}.trioeval