# Hi-C Scaffolding Urchin

In [8]:
# Set up: paths and parameters shared by every cell in this notebook
WKDIR=/workspace/hraijc/HiC_trials/HiC23/NovaSeq/Urchin

# Contig assembly going in, and where the curated scaffolded assembly ends up
ASSEMBLY_PRESCAFF=/workspace/hraijc/HiC_trials/HiC23/Assemblies/Crodergsii_HiFi130423_purged.fa
ASSEMBLY_POSTSCAFF=/workspace/hraijc/HiC_trials/HiC23/Results/Crodergsii_HiC070623_v2.fa

# Hi-C read pairs (these files are written by the read-combining cell)
HiC_R1=/workspace/hraijc/TEMP/Urchin_HiC23_R1.fastq.gz
HiC_R2=/workspace/hraijc/TEMP/Urchin_HiC23_R2.fastq.gz

OUT_PREFIX=Crodergsii_HiC070623
# Telomere repeat; source: http://telomerase.asu.edu/sequences_telomere.html
TELOMERE=TTAGGG
GENOME_SIZE=880000000
NF_CONFIG=${WKDIR}/hic-scaffolding-pipeline/nextflow.config

# Job logs for every sbatch call in this notebook go to ${WKDIR}/log
mkdir -p "${WKDIR}/log"
cd "${WKDIR}"

In [11]:
# Combine reads from flowcells
# Lanes L002 and L003 of each read direction are concatenated into the single
# R1/R2 files named by HiC_R1/HiC_R2 (set-up cell). Concatenating gzip files
# yields a valid multi-member gzip stream, so no decompression is needed.
# The heredoc is unquoted: ${...} is expanded when this cell runs, not in the job.

# Refuse to submit if the set-up cell has not been run in this session.
: "${WKDIR:?run the set-up cell first}"
: "${HiC_R1:?run the set-up cell first}"
: "${HiC_R2:?run the set-up cell first}"

sbatch << EOF
#!/bin/bash
#SBATCH -J cat
#SBATCH --cpus-per-task=1
#SBATCH --mem=1G
#SBATCH --time=1:00:00
#SBATCH -o ${WKDIR}/log/%J.out
#SBATCH -e ${WKDIR}/log/%J.err

# Stop at the first error so a failed cd or a missing lane file fails the job
# instead of leaving a truncated output behind a "successful" exit status.
set -euo pipefail

cd /input/genomic/plant/Actinidia/chinensis/ExperimentRequestor11022/AGRF_CAGRF230414403_HFFHYDSX7

echo "start"
date
cat Urchin_sp_HFFHYDSX7_TAATCG_L002_R1.fastq.gz Urchin_sp_HFFHYDSX7_TAATCG_L003_R1.fastq.gz > ${HiC_R1}
cat Urchin_sp_HFFHYDSX7_TAATCG_L002_R2.fastq.gz Urchin_sp_HFFHYDSX7_TAATCG_L003_R2.fastq.gz > ${HiC_R2}
date

EOF


Submitted batch job 1361115


In [4]:
# IN INTERACTIVE SESSION (use srun, not salloc):
# The commands below are a record of steps typed by hand in an interactive
# shell; they are not meant to be run from this cell. They sit in a quoted
# heredoc fed to the no-op ':' so that executing the cell does nothing.
# (Bash has no triple-quoted comment: wrapping the text in """ makes the shell
# try to execute the whole string as a command name.)
: << 'INTERACTIVE_STEPS'
srun --time 01:00:00 --cpus-per-task 1 --mem-per-cpu 4G --pty bash
#WKDIR=/workspace/hraijc/HiC_trials/HiC23/NovaSeq/Urchin
cd $WKDIR
git clone git@github.com:PlantandFoodResearch/hic-scaffolding-pipeline.git
cd ${WKDIR}/hic-scaffolding-pipeline/
git checkout dev
cd ${WKDIR}/hic-scaffolding-pipeline/containers
sudo -E singularity build pipeline_container.sif pipeline_container.def

INTERACTIVE_STEPS

In [23]:
# Modify config file
# Replace the Kiwi (Ck69_01) values currently in the Nextflow config with this
# project's values, in a single in-place pass. Expression order matters: the
# two read-path substitutions must come before the Ck69_01_monoploid one,
# because that name also appears inside the read paths being matched.

sed -i \
    -e "s:/workspace/hraijc/Kiwi/Ck69_01_monoploid/HiC8_MiSeq/220622_M01815_0436/HiC8-lib_S1_L001_R1_001.fastq.gz:${HiC_R1}:g" \
    -e "s:/workspace/hraijc/Kiwi/Ck69_01_monoploid/HiC8_MiSeq/220622_M01815_0436/HiC8-lib_S1_L001_R2_001.fastq.gz:${HiC_R2}:g" \
    -e "s:/output/genomic/plant/Actinidia/chinensis/CK69_01m/Genome/Assembly/LATEST/Fasta/CK69_01_v2.scaffolds.fsa:${ASSEMBLY_PRESCAFF}:g" \
    -e "s:Ck69_01_monoploid:${OUT_PREFIX}:g" \
    -e "s:TTAGGG:${TELOMERE}:g" \
    -e "s:758000000:${GENOME_SIZE}:g" \
    ${NF_CONFIG}





In [5]:
# Setup slurm.sh
# Writes hic_pipeline_slurm.sh into the current directory. The heredoc is
# unquoted, so ${USER} and ${WKDIR} are expanded now, while the file is being
# written; the generated script contains the literal values.
# The generated job loads the nextflow and apptainer modules, changes into the
# pipeline checkout under ${WKDIR} and launches main.nf with -resume.

cat << EOF > hic_pipeline_slurm.sh
#!/bin/bash -e

#SBATCH -J Urchin_hic_pipeline_${USER}
#SBATCH --time=14-00:00:00
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=8
#SBATCH -o ${WKDIR}/log/%J.out
#SBATCH -e ${WKDIR}/log/%J.err
#SBATCH --mem=32G

ml nextflow/22.10.4
ml apptainer/1.1

cd ${WKDIR}/hic-scaffolding-pipeline

srun nextflow main.nf -resume
EOF





In [2]:
# Run pipeline: make the generated job script executable, then submit it
job_script=./hic_pipeline_slurm.sh
chmod u+x "${job_script}"
sbatch "${job_script}"

sbatch: It is not safe to run jobs over 7 days of walltime without checkpointing.
sbatch: The time limit for this job is 14 days.
Submitted batch job 1366144


In [None]:
# Just run YAHS
# Inputs: the contig assembly and the Hi-C alignment (BAM) against it.
# ASSEMBLY falls back to the pre-scaffolding assembly from the set-up cell.
# BAMFILE has no sensible default here and must be set before running this
# cell. Both are checked up front: the heredoc below is unquoted, so an unset
# variable would otherwise expand to an empty string at submit time and yahs
# would be launched with no inputs.
# NOTE(review): yahs presumably needs the assembly's .fai index alongside the
# FASTA (see the "need to be indexed" remark on `contigs` below) - confirm.
ASSEMBLY=${ASSEMBLY:-${ASSEMBLY_PRESCAFF}}
: "${WKDIR:?run the set-up cell first}"
: "${OUT_PREFIX:?run the set-up cell first}"
: "${ASSEMBLY:?set ASSEMBLY or run the set-up cell (ASSEMBLY_PRESCAFF)}"
: "${BAMFILE:?set BAMFILE to the Hi-C alignment BAM before running this cell}"

sbatch << EOF
#!/bin/bash
#SBATCH -J YAHS
#SBATCH -o ${WKDIR}/log/%J.out
#SBATCH -e ${WKDIR}/log/%J.err
#SBATCH --cpus-per-task=1
#SBATCH --mem=48G
#SBATCH --time=23:00:00

cd ${WKDIR}

/workspace/hraijc/git_clones/yahs/yahs ${ASSEMBLY} ${BAMFILE} --no-mem-check -o ${OUT_PREFIX}_yahs_10K -r 10000,20000,50000,100000,200000,500000,1000000,2000000,5000000,10000000,20000000,50000000,100000000,200000000,500000000


EOF



In [7]:
# Submit yahs_contactmapgen2.sh (presumably generates the contact map for the
# YAHS output - confirm against the script). The three lower-case variables
# are passed into the job's environment through --export.
out=${OUT_PREFIX}_yahs_10K      # prefix of the output files produced by YAHS
contigs=${ASSEMBLY_PRESCAFF}    # must be indexed, i.e. a .fai in the same directory
wkdir=${WKDIR}

sbatch \
    --job-name=hic_mapyahs \
    --output=${WKDIR}/log/%J.out \
    --error=${WKDIR}/log/%J.err \
    --cpus-per-task=8 \
    --mem=24G \
    --time=03:10:00 \
    --export=out=${out},contigs=${contigs},wkdir=${wkdir} \
    /workspace/hraijc/Gitrepos/High-quality-genomes/Methods/DNase_HiC/notebooks/yahs_contactmapgen2.sh

hraijc


In [7]:
##############
# Do the manual scaffolding cleanup (JBAT review) first, then run this cell.
# It submits `juicer post`, which takes the reviewed .assembly file, the
# liftover AGP and the contig FASTA, and writes output under ${OUT_PREFIX}_v2.
##############
out=${OUT_PREFIX}_yahs_10K # prefix of outfiles produced by YAHS.
contigs=${ASSEMBLY_PRESCAFF} # need to be indexed, i.e., test.fa.gz.fai in same directory
wkdir=${WKDIR}

# All file names below are built from the variables above rather than typed
# out, so they follow OUT_PREFIX. The reviewed assembly saved after manual
# curation is expected to be named ${out}v2_JBAT.review.assembly.
# The heredoc is unquoted: every ${...} is expanded at submit time.
sbatch << EOF
#!/bin/bash
#SBATCH -J juicer_post
#SBATCH -o ${wkdir}/log/%J.out
#SBATCH -e ${wkdir}/log/%J.err
#SBATCH --cpus-per-task=1
#SBATCH --mem=4G
#SBATCH --time=2:00:00

#module load samtools


cd ${wkdir}



/workspace/hraijc/git_clones/yahs/juicer post -o ${OUT_PREFIX}_v2 ${out}v2_JBAT.review.assembly ${out}_JBAT.liftover.agp ${contigs}



EOF


Submitted batch job 1553444


In [11]:
# Copy the curated assembly (${OUT_PREFIX}_v2.FINAL.fa, matching the
# `juicer post -o ${OUT_PREFIX}_v2` prefix) to its final location. The
# destination directory is created first because no other cell creates it.
mkdir -p "$(dirname "${ASSEMBLY_POSTSCAFF}")"
cp "${WKDIR}/${OUT_PREFIX}_v2.FINAL.fa" "${ASSEMBLY_POSTSCAFF}"

In [14]:
# Assembly statistics for three FASTA files, in this order: the input contigs,
# the YAHS scaffolds, and the curated post-scaffolding assembly.
out=${OUT_PREFIX}_yahs_10K # prefix of the output files produced by YAHS

cd ${WKDIR}
module load BBMap/38.33

# Comma-separated input list expected by statswrapper.sh's in= argument
stats_inputs=${ASSEMBLY_PRESCAFF},${WKDIR}/${out}_scaffolds_final.fa,${ASSEMBLY_POSTSCAFF}
statswrapper.sh in=${stats_inputs} format=3
module unload BBMap/38.33

java -ea -Xmx200m -cp /software/bioinformatics/BBMap-38.33/current/ jgi.AssemblyStatsWrapper format=3 in=/workspace/hraijc/HiC_trials/HiC23/Assemblies/Crodergsii_HiFi130423_purged.fa,/workspace/hraijc/HiC_trials/HiC23/NovaSeq/Urchin/Crodergsii_HiC070623_yahs_10K_scaffolds_final.fa,/workspace/hraijc/HiC_trials/HiC23/Results/Crodergsii_HiC070623_v2.fa format=3
n_scaffolds	n_contigs	scaf_bp	contig_bp	gap_pct	scaf_N50	scaf_L50	ctg_N50	ctg_L50	scaf_N90	scaf_L90	ctg_N90	ctg_L90	scaf_max	ctg_max	scaf_n_gt50K	scaf_pct_gt50K	gc_avg	gc_std	filename
2770	2887	883753165	883750474	0.000	498	559676	501	557004	1595	156350	1621	150094	4267868	4267868	2376	98.531	0.39359	0.00860	/powerplant/workspace/hraijc/HiC_trials/HiC23/Assemblies/Crodergsii_HiFi130423_purged.fa
337	3002	884262765	883750474	0.058	9	39820943	515	549938	18	32836090	1661	146243	76945733	4267868	167	99.575	0.39359	0.01835	/powerplant/workspace/hraijc/HiC_trials/HiC23/NovaSeq/Urchin/Crodergsii_HiC070623_yahs_10K_scaffolds_final.fa
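346	3010	884262565	883750474	0.058	11	38040201	517	549493	21	32277710	1666	145522	52699487	4267868	174	99.570	0.39359	0.01826	/powerplant/workspace/hraijc/HiC_trials/HiC23/Results/Crodergsii_HiC070623_v2.fa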

| n_scaffolds 	| n_contigs 	| scaf_bp 	| contig_bp 	| gap_pct 	| scaf_N50 	| scaf_L50 	| ctg_N50 	| ctg_L50 	| scaf_N90 	| scaf_L90 	| ctg_N90 	| ctg_L90 	| scaf_max 	| ctg_max 	| scaf_n_gt50K 	| scaf_pct_gt50K 	| gc_avg 	| gc_std 	| filename 	|
|---	|---	|---	|---	|---	|---	|---	|---	|---	|---	|---	|---	|---	|---	|---	|---	|---	|---	|---	|---	|
| 2770 	| 2887 	| 883753165 	| 883750474 	| 0.000 	| 498 	| 559676 	| 501 	| 557004 	| 1595 	| 156350 	| 1621 	| 150094 	| 4267868 	| 4267868 	| 2376 	| 98.531 	| 0.39359 	| 0.00860 	| /powerplant/workspace/hraijc/HiC_trials/HiC23/Assemblies/Crodergsii_HiFi130423_purged.fa 	|
| 337 	| 3002 	| 884262765 	| 883750474 	| 0.058 	| 9 	| 39820943 	| 515 	| 549938 	| 18 	| 32836090 	| 1661 	| 146243 	| 76945733 	| 4267868 	| 167 	| 99.575 	| 0.39359 	| 0.01835 	| /powerplant/workspace/hraijc/HiC_trials/HiC23/NovaSeq/Urchin/Crodergsii_HiC070623_yahs_10K_scaffolds_final.fa 	|
| 346 	| 3010 	| 884262565 	| 883750474 	| 0.058 	| 11 	| 38040201 	| 517 	| 549493 	| 21 	| 32277710 	| 1666 	| 145522 	| 52699487 	| 4267868 	| 174 	| 99.570 	| 0.39359 	| 0.01826 	| /powerplant/workspace/hraijc/HiC_trials/HiC23/Results/Crodergsii_HiC070623_v2.fa 	|

## AssemblyQC

In [None]:
#cd ${WKDIR}
#git clone git@github.com:PlantandFoodResearch/assembly_qc.git

In [15]:
# Change into the assembly_qc checkout under the working directory
# (presumably cloned once with the commented-out git clone in the cell above).
cd ${WKDIR}/assembly_qc


In [16]:
# Submit the AssemblyQC job script; the relative path relies on the current
# directory being ${WKDIR}/assembly_qc (set by the previous cell).
sbatch ./assembly_qc_pfr.sh


Submitted batch job 1554492
