# Hi-C Scaffolding MynaBird

In [1]:
# Set up: working directory for the Myna Hi-C run, plus the log folder used by the Slurm jobs.
WKDIR="/workspace/hraijc/HiC_trials/HiC23/NovaSeq/Myna"

mkdir -p "${WKDIR}/log"
cd "${WKDIR}"

In [None]:
# Steps to run BY HAND in an interactive session (use srun, not salloc).
# They are kept here for reference only. The block is a quoted here-document
# fed to the no-op ':' builtin, so running this cell expands nothing and
# executes nothing. (A Python-style triple-quoted string is not a comment in
# bash: it would be run as a command and fail.)
: <<'INTERACTIVE_STEPS'
srun --time 01:00:00 --cpus-per-task 1 --mem-per-cpu 4G --pty bash
cd $WKDIR
git clone git@github.com:ignacio3437/hic-scaffolding-pipeline.git
cd ${WKDIR}/hic-scaffolding-pipeline/
git checkout dev
cd ${WKDIR}/hic-scaffolding-pipeline/containers
sudo -E singularity build pipeline_container.sif pipeline_container.def

INTERACTIVE_STEPS

In [None]:
# Combine reads from both lanes (L002 and L003), one output file per read direction.
# The .fastq.gz files are joined with plain cat; concatenated gzip members form a
# valid gzip stream, so nothing is decompressed.
# The here-document is unquoted: WKDIR is substituted at submission time.

sbatch << EOF
#!/bin/bash
#SBATCH -J cat
#SBATCH --cpus-per-task=1
#SBATCH --mem=1G
#SBATCH --time=1:00:00
#SBATCH -o ${WKDIR}/log/%J.out
#SBATCH -e ${WKDIR}/log/%J.err

# Fail fast: without this, a failed cd or cat would still end with exit code 0
# from the final date call and leave empty or partial FASTQ files behind.
set -euo pipefail

cd /input/genomic/plant/Actinidia/chinensis/ExperimentRequestor11022/AGRF_CAGRF230414403_HFFHYDSX7
date
cat MBC3_C_HFFHYDSX7_CAGGCG_L002_R1.fastq.gz MBC3_C_HFFHYDSX7_CAGGCG_L003_R1.fastq.gz > /workspace/hraijc/TEMP/Myna_HiC23_R1.fastq.gz
cat MBC3_C_HFFHYDSX7_CAGGCG_L002_R2.fastq.gz MBC3_C_HFFHYDSX7_CAGGCG_L003_R2.fastq.gz > /workspace/hraijc/TEMP/Myna_HiC23_R2.fastq.gz
date
EOF


In [None]:
# Edit the pipeline config file as needed, then print it here to record what was used.

cat "${WKDIR}/hic-scaffolding-pipeline/nextflow.config"

In [None]:
# Launch the Nextflow scaffolding pipeline as a Slurm batch job.
# The here-document is unquoted, so WKDIR and USER are substituted when the
# cell is run (submission time), not inside the job.

sbatch <<EOF
#!/bin/bash -e

#SBATCH --job-name=hic_pipeline_${USER}
#SBATCH --output=${WKDIR}/log/%J.out
#SBATCH --error=${WKDIR}/log/%J.err
#SBATCH --time=167:00:00
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --mem=4G

module load nextflow/22.10.4
module load apptainer/1.1

cd ${WKDIR}/hic-scaffolding-pipeline

srun nextflow main.nf -resume
EOF





In [None]:
# Prefix used to build the SAM/BAM file names in the deduplication job below.
APREFIX=Atri_hic1

In [None]:
# Remove PCR duplicates to speed up analysis.
#
# The here-document below is unquoted, so ${WKDIR} and ${APREFIX} are expanded by
# the notebook shell when the cell is run, NOT inside the job. APREFIX is therefore
# set here, in the same cell, so the submitted script never ends up with empty
# file-name prefixes when the kernel state is stale. (An assignment placed inside
# the job script would have no effect on the already-expanded file names.)
APREFIX=Atri_hic1

sbatch << EOF
#!/bin/bash
#SBATCH -J hic
#SBATCH -o ${WKDIR}/log/%J.out
#SBATCH -e ${WKDIR}/log/%J.err
#SBATCH --cpus-per-task=12
#SBATCH --mem=10G
#SBATCH --time=15:00:00

module load samtools/1.16

# Fail fast: each step consumes the previous step's output, so stop at the
# first error instead of processing a truncated file.
set -euo pipefail

cd /workspace/hraijc/HiC_trials/HiC23/NovaSeq/Myna/hic-scaffolding-pipeline/work/d7/9f9231cf5d6ad9d69cf958c935a037/

# -F 2316 drops unmapped (4), mate-unmapped (8), secondary (256) and
# supplementary (2048) records. Input format is auto-detected, so the legacy
# -S flag is not needed.
samtools view -b -h -@ 12 -F 2316 ${APREFIX}_marked_byread.sam > ${APREFIX}_presort_marked.bam

# NOTE(review): 12 sort threads at the default per-thread memory sit close to
# the 10G job limit; lower -m or raise --mem if this step is killed.
samtools sort -@ 12 ${APREFIX}_presort_marked.bam -o ${APREFIX}_marked.bam

# -F 3340 is the filter above plus duplicate (1024).
samtools view -h -@ 12 -F 3340 ${APREFIX}_marked.bam > ${APREFIX}_dedup.sam
EOF


## YAHS

In [3]:
# Set up for the YAHS run: working directory, input assembly, Hi-C alignments and output prefix.
WKDIR="/workspace/hraijc/HiC_trials/HiC23/NovaSeq/Myna/yahs"
ASSEMBLY="/workspace/hraijc/HiC_trials/HiC23/Assemblies/Atri_polished_final_step3.purge.vecscreen.trim.filtered.fna"
BAMFILE="/workspace/hraijc/HiC_trials/HiC23/NovaSeq/Myna/hic-scaffolding-pipeline/results/bam/Atri_hic1_dedup.bam"
OUT_PREFIX="Atri_hic1"

mkdir -p "${WKDIR}/log"
cd "${WKDIR}"

In [None]:
## Just run YAHS
# WKDIR, ASSEMBLY, BAMFILE and OUT_PREFIX are substituted at submission time
# (unquoted here-document). The backslash-newline pairs are removed by the
# here-document, so the job script receives the yahs call as a single line.

sbatch <<EOF
#!/bin/bash
#SBATCH --job-name=YAHS
#SBATCH --output=${WKDIR}/log/%J.out
#SBATCH --error=${WKDIR}/log/%J.err
#SBATCH --cpus-per-task=1
#SBATCH --mem=12G
#SBATCH --time=23:00:00

cd ${WKDIR}

/workspace/hraijc/git_clones/yahs/yahs ${ASSEMBLY} ${BAMFILE} \
    --no-mem-check \
    -o ${OUT_PREFIX}_yahs \
    -r 20000,50000,100000,200000,500000,1000000,2000000,5000000,10000000,20000000,50000000,100000000,200000000,500000000
EOF


In [None]:
## Index new assembly.
# The scaffold FASTA name is built from OUT_PREFIX (set in the YAHS set-up cell)
# so it stays in step with the "-o ${OUT_PREFIX}_yahs" prefix given to YAHS.
# WKDIR and OUT_PREFIX are substituted at submission time (unquoted here-document).

sbatch << EOF
#!/bin/bash
#SBATCH -J faidx
#SBATCH -o ${WKDIR}/log/%J.out
#SBATCH -e ${WKDIR}/log/%J.err
#SBATCH --cpus-per-task=1
#SBATCH --mem=1G
#SBATCH --time=1:00:00

cd ${WKDIR}

# Same pinned samtools version as the deduplication job.
module load samtools/1.16
samtools faidx ${OUT_PREFIX}_yahs_scaffolds_final.fa

EOF


In [None]:
# Make heatmap of YAHS output
cd "${WKDIR}"

# Values handed to the contact-map script through sbatch --export.
out=Atri_hic1_yahs       # prefix of the output files produced by YAHS
contigs="${ASSEMBLY}"    # needs to be indexed, i.e. a .fai file in the same directory
wkdir="${WKDIR}"

sbatch \
    --job-name=hic_mapyahs \
    --output="${WKDIR}/log/%J.out" \
    --error="${WKDIR}/log/%J.err" \
    --cpus-per-task=8 \
    --mem=12G \
    --time=03:10:00 \
    --export="out=${out},contigs=${contigs},wkdir=${wkdir}" \
    /workspace/hraijc/Gitrepos/High-quality-genomes/Methods/DNase_HiC/notebooks/yahs_contactmapgen2.sh

In [4]:
##############
# Do the manual scaffolding cleanup in JuiceBox first, then run this cell.
# WKDIR and ASSEMBLY are substituted at submission time (unquoted here-document);
# the backslash-newline pairs are removed by the here-document, so the job
# script receives the juicer call as a single line.
##############

sbatch <<EOF
#!/bin/bash
#SBATCH --job-name=YAHS
#SBATCH --output=${WKDIR}/log/%J.out
#SBATCH --error=${WKDIR}/log/%J.err
#SBATCH --cpus-per-task=1
#SBATCH --mem=4G
#SBATCH --time=2:00:00

#module load samtools

cd ${WKDIR}

/workspace/hraijc/git_clones/yahs/juicer post \
    -o Atri_hic1_yahs3 \
    Atri_hic1_yahs_JBAT.review.assembly \
    Atri_hic1_yahs_JBAT.liftover.agp \
    ${ASSEMBLY}
EOF


Submitted batch job 1985628


### Subset unique mapped reads

In [None]:
# Point WKDIR at the pipeline results folder and select the BAM file to convert back to FASTQ.
WKDIR="/workspace/hraijc/HiC_trials/HiC23/NovaSeq/Myna/hic-scaffolding-pipeline/results"
BAMFILE="${WKDIR}/bam/Atri_hic1_yahs_dedup.bam"
mkdir -p "${WKDIR}/log"

In [None]:
##############
# Subset Hi-C reads: convert the already-filtered BAM back to paired FASTQ so that
# future iterations start from fewer reads.
#
# NOTE(review): this cell applies no filter of its own (no -F/-q); whether the
# resulting reads are "uniquely mapped" depends entirely on how BAMFILE was
# produced upstream - confirm.
#
# WKDIR and BAMFILE (including the %.bam suffix removal) are expanded at
# submission time because the here-document is unquoted.
##############

sbatch << EOF
#!/bin/bash
#SBATCH -J samtoolsfastq
#SBATCH -o ${WKDIR}/log/%J.out
#SBATCH -e ${WKDIR}/log/%J.err
#SBATCH --cpus-per-task=8
#SBATCH --mem=12G
#SBATCH --time=2:00:00

cd ${WKDIR}

# Same pinned samtools version as the deduplication job.
module load samtools/1.16

# Fail fast: do not run the FASTQ export on a missing or partial name-sorted BAM.
set -euo pipefail

# Name-sort so that both mates of a pair are adjacent for the FASTQ export.
samtools sort -@ 8 -n ${BAMFILE} -o ${BAMFILE%.bam}.nsorted.bam

# -1/-2 receive the paired reads; -0 (other) and -s (singletons) are discarded;
# -n keeps read names as they are (no /1 or /2 suffix).
samtools fastq -@ 8 ${BAMFILE%.bam}.nsorted.bam -1 Atri_hic1_umap_R1.fastq.gz -2 Atri_hic1_umap_R2.fastq.gz -0 /dev/null -s /dev/null -n
EOF

## ASSEMBLYQC

In [None]:
# One-time setup, kept commented out for reference: clone the assembly_qc pipeline.
#git clone git@github.com:PlantandFoodResearch/assembly_qc.git


In [None]:
# Set up for the assembly QC run: working directory and the assembly to assess.
WKDIR="/workspace/hraijc/HiC_trials/HiC23/NovaSeq/Myna/assembly_qc"
ASSEMBLY="/workspace/hraijc/HiC_trials/HiC23/NovaSeq/Myna/Atri_hic1_yahs2.fa"

mkdir -p "${WKDIR}/log"
cd "${WKDIR}"

In [None]:
# Print the config without any line that contains "//" (comment lines and lines with trailing comments).
grep -v "//" nextflow.config

In [None]:
# Submit the assembly QC batch script found in the current directory.
# NOTE(review): ASSEMBLY from the set-up cell is assigned without 'export', so the
# submitted script presumably takes its inputs from nextflow.config - confirm.
sbatch ./assembly_qc_pfr.sh


# Hi-C Scaffolding RunB

#### Run the full Hi-C scaffolding pipeline (all scaffolding tools) on only the uniquely mapped reads from the previous round, to speed things up.

In [1]:
# Set up (RunB): same working directory as the first run; make sure the Slurm log folder exists.
WKDIR="/workspace/hraijc/HiC_trials/HiC23/NovaSeq/Myna"

mkdir -p "${WKDIR}/log"
cd "${WKDIR}"

In [2]:
# Launch the Nextflow scaffolding pipeline (RunB) as a Slurm batch job.
# The here-document is unquoted, so WKDIR and USER are substituted when the
# cell is run (submission time), not inside the job.

sbatch <<EOF
#!/bin/bash -e

#SBATCH --job-name=hic_pipeline_${USER}
#SBATCH --output=${WKDIR}/log/%J.out
#SBATCH --error=${WKDIR}/log/%J.err
#SBATCH --time=167:00:00
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --mem=4G

module load nextflow/22.10.4
module load apptainer/1.1

cd ${WKDIR}/hic-scaffolding-pipeline

srun nextflow main.nf -resume
EOF


Submitted batch job 1953593


In [2]:
# Move into the pipeline checkout so the following cells can use relative paths.
cd "${WKDIR}/hic-scaffolding-pipeline"

In [4]:
# Print the configuration of the pipeline checkout in the current directory.
cat nextflow.config

includeConfig './conf/base.config'

params {

    // R1, R2 dataset
    reads_R1                = "/workspace/hraijc/TEMP/Atri_hic1_umap_R1.fastq.gz"
    reads_R2                = "/workspace/hraijc/TEMP/Atri_hic1_umap_R2.fastq.gz"

    // Assembly file
    assembly_fasta          = "/workspace/hraijc/HiC_trials/HiC23/Assemblies/Atri_polished_final_step3.purge.vecscreen.trim.filtered.fna" // SALSA requires fasta to not contain ":" character

    // Prefix for output files
    out_prefix              = "Atri_hic2"

    //STAG 1 
    stop_at_hicqc           = false

    //STAG 2: Include one specific scaffolder, report included
    only_run_yahs           = false
    only_run_salsa2         = false
    only_run_allhic         = false
    only_run_3D_DNA         = false
    

    /* https://github.com/c-zhou/yahs
    yahs_params{
        scaffolding_with_agp_file       = "" // for exmple: -a
        resolution                      = "" // for exmple: -r
        enzyme                     