# k-mer analysis of HiFi reads

# Blueberry trio data QC

## HiFi
```
/input/genomic/plant/Vaccinium/corymbosum/ExperimentRequestor10969_NuixM7_TrioBin/Blueberry_M7xNui/SMRTcell1/CCS_Data
/input/genomic/plant/Vaccinium/corymbosum/ExperimentRequestor10969_NuixM7_TrioBin/Blueberry_M7xNui/SMRTcell2/CCS_Data
```

* k-mer analysis (Jellyfish)


# Read set 1+2 : m64136_221113_041854 and m64136_221114_133241

In [1]:
# Decompress and concatenate the two HiFi read sets into one FASTQ via a Slurm job.

# set input file (directory holding the SMRTcell1/ and SMRTcell2/ CCS data)
FILE=/powerplant/input/genomic/plant/Vaccinium/corymbosum/ExperimentRequestor10969_NuixM7_TrioBin/Blueberry_M7xNui

# not referenced by the job below
PREFIX=NuixM7_HiFi_combined
WKDIR=/workspace/hrasrb/Blueberry_trio/data_qc/hifi/kmer_analysis
LOG=/workspace/hrasrb/log

# create and submit bash script
# The heredoc is unquoted, so ${FILE} and ${LOG} are expanded here at submit
# time. The job writes its output with a relative path, so only submit when the
# change into the working directory has succeeded.
cd "${WKDIR}" && sbatch << EOF
#!/bin/bash -e

#SBATCH -J Unzipcat
#SBATCH --output=${LOG}/hrasrb_%j.out
#SBATCH --error=${LOG}/hrasrb_%j.err
#SBATCH --mail-user=sarah.bailey@plantandfood.co.nz
#SBATCH --mail-type=ALL
#SBATCH --time=00:30:00 # Walltime
#SBATCH --mem=1G
#SBATCH --cpus-per-task=1

zcat "${FILE}/SMRTcell2/CCS_Data/m64136_221114_133241.hifi_reads.fastq.gz" "${FILE}/SMRTcell1/CCS_Data/m64136_221113_041854.hifi_reads.fastq.gz" > progeny_combinedHiFi.fastq

EOF

Submitted batch job 4481132


### k-mer analysis
#### hash size calculation

In [5]:

read_len=18872 # from nanostat
no_seq=2197381 # from nanostat
genome_size=600000000 # from https://github.com/GenomicsAotearoa/High-quality-genomes/tree/main/Blueberry/M7xNui_Assembly
error_rate=0.001

# cov=($read_len*($no_seq*1000000))/$genome_size
# estimated coverage
cov=$(bc -l <<< " ( $no_seq * $read_len ) / $genome_size " )

# hash=$genome_size+($genome_size*$cov*$error_rate*21)
# Hash size (s) = G + Gcek
s=$(bc -l <<< "$genome_size + ( $genome_size * $cov * $error_rate * 21 )" )

echo $cov 
echo $s


69.11495705333333333333
1470848458.87199999999995800000


#### Bloom counter

In [8]:
# Count canonical k-mers in the combined HiFi reads with Jellyfish (Bloom
# counter pass, count pass, histogram) as a single Slurm job.
# Jellyfish is loaded in this session, not inside the job script.
module load jellyfish/2.2.10

# set variables for directories
WKDIR=/powerplant/workspace/hrasrb/Blueberry_trio/data_qc/hifi/kmer_analysis
LOG=/powerplant/workspace/hrasrb/log


# set input file
INFILE=/powerplant/workspace/hrasrb/Blueberry_trio/data_qc/hifi/kmer_analysis/progeny_combinedHiFi.fastq
# NOTE(review): the same size is passed to both `jellyfish bc` and
# `jellyfish count`; the Jellyfish documentation sizes the two passes
# differently - confirm this is intended.
HASH=1500000000
# k-mer length shared by the bc and count passes
KMER=21
# one value drives both the Slurm CPU request and the Jellyfish thread count
THREADS=12
BASE=$(basename "${INFILE}" .fastq)

# create and submit bash script
# The heredoc is unquoted, so every variable below is expanded at submit time.
# Outputs are written with relative paths, so only submit when the change into
# the working directory has succeeded.
cd "${WKDIR}" && sbatch --dependency=afterok:4481132 << EOF
#!/bin/bash -e

#SBATCH -J Jellyfish
#SBATCH --output=${LOG}/%j.out
#SBATCH --error=${LOG}/%j.err
#SBATCH --mail-user=sarah.bailey@plantandfood.co.nz
#SBATCH --mail-type=ALL
#SBATCH --time=06:00:00 # Walltime
#SBATCH --mem=12G
#SBATCH --cpus-per-task=${THREADS}

pwd

echo "${INFILE}"

echo "${BASE}.bc"

# Bloom counter to filter
jellyfish bc -m ${KMER} -s ${HASH} -t ${THREADS} -o "${BASE}.bc" -C "${INFILE}"

# Frequency count
jellyfish count -m ${KMER} -s ${HASH} -t ${THREADS} --bc "${BASE}.bc" -o "${BASE}_${KMER}mer_counts.jf" -C "${INFILE}"

# Generate Histogram
jellyfish histo "${BASE}_${KMER}mer_counts.jf" > "${BASE}_Histogram.out"

EOF

module unload jellyfish

Submitted batch job 4481484


In [10]:
# Report Slurm resource usage for job 4481484 (the Jellyfish job submitted above)
seff 4481484

Job ID: 4481484
Cluster: powerplant
User/Group: hrasrb/hrasrb
State: COMPLETED (exit code 0)
Nodes: 1
Cores per node: 12
CPU Utilized: 11:57:46
CPU Efficiency: 96.14% of 12:26:36 core-walltime
Job Wall-clock time: 01:02:13
Memory Utilized: 10.59 GB
Memory Efficiency: 88.25% of 12.00 GB


In [12]:
# Run genomescope.R (k=21, -p 4) on the Jellyfish histogram as a Slurm job.
# apptainer is loaded in this session, not inside the job script.
ml apptainer

# set variables for directories
WKDIR=/powerplant/workspace/hrasrb/Blueberry_trio/data_qc/hifi/kmer_analysis
LOG=/powerplant/workspace/hrasrb/log

# set input file
INFILE=/powerplant/workspace/hrasrb/Blueberry_trio/data_qc/hifi/kmer_analysis/progeny_combinedHiFi.fastq

BASE=$(basename "${INFILE}" .fastq)

# container image, invoked directly as the executable with genomescope.R as its argument
GENOMESCOPE_IMG=/powerplant/workspace/hrasrb/Repo/genome-assembly-pipeline/pipeline/.snakemake/singularity/b217dacebe6c5100a0c2e2566108bdb3.simg

# create and submit bash script
# The heredoc is unquoted, so every variable below is expanded at submit time.
# The histogram and output names are relative paths, so only submit when the
# change into the working directory has succeeded.
cd "${WKDIR}" && sbatch << EOF
#!/bin/bash -e

#SBATCH -J genomescope
#SBATCH --output=${LOG}/hrasrb_%j.out
#SBATCH --error=${LOG}/hrasrb_%j.err
#SBATCH --mail-user=sarah.bailey@plantandfood.co.nz
#SBATCH --mail-type=ALL
#SBATCH --time=00:02:00 # Walltime
#SBATCH --mem=200M
#SBATCH --cpus-per-task=1

"${GENOMESCOPE_IMG}" genomescope.R -k 21 -i "${BASE}_Histogram.out" -o "${BASE}_genomescope_p4" -p 4

EOF

module unload apptainer

Submitted batch job 4482682


In [None]:
/output/genomic/plant/Vaccinium/corymbosum/2023-10-18_M7xNui_TrioBinned_HiFi_Assemblies 