# Testing DNase_HiC Libraries
MiSeq run of Hi-C libraries prepared following the protocol on protocols.io (TODO: add link).

Tried method on:
- Kiwifruit (*Actinidia chinensis*)
- *Eucalyptus regnans*
- Karaka (*Corynocarpus laevigatus*)
- Grape (*Vitis* sp.)
- *Wahlenbergia ceracea*


### Metadata
Input genomes here:
/workspace/hraijc/HiC_trials/Ash_MinION/Genomes

HiC MiSeq Data here:
/input/genomic/plant/Actinidia/PopulationsStudies/MiSeq/ExperimentRequestor10984/AG1002_dataset/

- Data sequenced by Auckland Genomics (AG1002)
- Experiment requestor number 10984

In [5]:
# Global paths shared by every cell in this notebook.
# Raw MiSeq Hi-C reads (read-only input area).
HiCDIR="/input/genomic/plant/Actinidia/PopulationsStudies/MiSeq/ExperimentRequestor10984/AG1002_dataset"
# Working area; everything below is derived from it.
WKDIR="/workspace/hraijc/HiC_trials/Ash_MinION"
GENOMEDIR="${WKDIR}/Genomes"
HiC_CLEAN="${WKDIR}/HiC_Clean"
LOGDIR="${WKDIR}/log"
TEMP="${WKDIR}/temp"

In [6]:
# Move into the working directory. The expansion is quoted so an unset WKDIR
# no longer silently turns this into a bare "cd" (which jumps to $HOME), and
# a failure is reported instead of passing unnoticed.
cd -- "${WKDIR}" || echo "ERROR: cannot cd to '${WKDIR}'" >&2

In [7]:
# Set up workspace: create the log, temp and cleaned-read directories.
# -p makes this safe to re-run; expansions are quoted so each path is always
# passed to mkdir as exactly one argument.
mkdir -p -- "$LOGDIR" "$TEMP" "$HiC_CLEAN"


In [2]:
# List the raw MiSeq delivery (FASTQ pairs plus per-sample JSON files).
# Quoted, with "--" so the path can never be parsed as an option.
ls -- "$HiCDIR"

AG1002-001_L001_ds.57aba33445fb4bebb603e3c75a661903.json
AG1002-002_L001_ds.457b47ab32984ccb98a3f9692f82f60f.json
AG1002-003_L001_ds.dcf02d3149ad40bb86e682abe14249ca.json
AG1002-004_L001_ds.68af5d965b0945d08f4a1229d1b5bb60.json
AG1002-005_L001_ds.596f6b2f812344daa91d4c7ce230b98f.json
AG1002-006_L001_ds.2226c11013764a2badf46cf2a1df29ab.json
AG1002-007_L001_ds.11f172f5523c4d32ae1cc4b7c9cab79b.json
Eucalyptus-Hi-C_S1_L001_R1_001.fastq.gz
Eucalyptus-Hi-C_S1_L001_R2_001.fastq.gz
Grape-Hi-C_S2_L001_R1_001.fastq.gz
Grape-Hi-C_S2_L001_R2_001.fastq.gz
Karaka-5-Hi-C_S3_L001_R1_001.fastq.gz
Karaka-5-Hi-C_S3_L001_R2_001.fastq.gz
TC-T1-SI-Amelanandra_S6_L001_R1_001.fastq.gz
TC-T1-SI-Amelanandra_S6_L001_R2_001.fastq.gz
TC-T1-SI-Russell_S5_L001_R1_001.fastq.gz
TC-T1-SI-Russell_S5_L001_R2_001.fastq.gz
TC-T2-Hi-C-Russell_S7_L001_R1_001.fastq.gz
TC-T2-Hi-C-Russell_S7_L001_R2_001.fastq.gz
Wahlenbergia-Hi-C_S4_L001_R1_001.fastq.gz
Wahlenbergia-Hi-C_S4_L001_R2_001.fastq.gz


In [None]:
# Create reference files for assemblies: for every FASTA in GENOMEDIR build a
# samtools .fai index and a two-column (name, length) .genome file.
#
# The previous version piped the loop into a job-script generator, but the
# loop executed samtools itself and wrote nothing to stdout, so an empty
# stream reached sbatch and no job was submitted. The work is now submitted
# as a real Slurm job, in the same heredoc style as the Hi-C workflow cell.
#
# NOTE: the heredoc delimiter is unquoted, so LOGDIR and GENOMEDIR expand at
# submission time; the loop variable is escaped so it is evaluated inside
# the job instead.

sbatch << EOF
#!/bin/bash
#SBATCH -J AssemblyPrep
#SBATCH -o ${LOGDIR}/AssemblyPrep_%J.out
#SBATCH -e ${LOGDIR}/AssemblyPrep_%J.err
#SBATCH --time=00:10:00
#SBATCH --mem=1G

module load samtools/0.1.19

for fasta in "${GENOMEDIR}"/*.fasta
do
    # Skip the literal pattern if the directory holds no .fasta files.
    [ -e "\${fasta}" ] || continue
    samtools faidx "\${fasta}"
    cut -f1,2 "\${fasta}.fai" > "\${fasta%.fasta}.genome"
done
EOF



## Wahlenbergia

In [4]:
# Wahlenbergia: select the sample that the Hi-C workflow cell will process.
APREFIX="Wahlenbergia_hic"                                    # output file prefix
ASSEMBLY="${GENOMEDIR}/wahlenbergia_ceracea_unphased.fasta"   # reference assembly
SPDIR="${WKDIR}/Wahlenbergia"                                 # per-species output dir
mkdir -p "$SPDIR"
# Shared stem of the paired FASTQ names; R1/R2 differ only in the digit after "_R".
HiC_RAW="${HiCDIR}/Wahlenbergia-Hi-C_S4_L001_R"
READ1="${HiC_RAW}1_001.fastq.gz"
READ2="${HiC_RAW}2_001.fastq.gz"


## Eucalyptus

In [6]:
# Eucalyptus: select the sample that the Hi-C workflow cell will process.
APREFIX="Eucalyptus_hic"                            # output file prefix
ASSEMBLY="${GENOMEDIR}/E_regnans_unphased.fasta"    # reference assembly
SPDIR="${WKDIR}/Eucalyptus"                         # per-species output dir
mkdir -p "$SPDIR"
# Shared stem of the paired FASTQ names; R1/R2 differ only in the digit after "_R".
HiC_RAW="${HiCDIR}/Eucalyptus-Hi-C_S1_L001_R"
READ1="${HiC_RAW}1_001.fastq.gz"
READ2="${HiC_RAW}2_001.fastq.gz"


## Grape

In [36]:
# Grape: select the sample that the Hi-C workflow cell will process.
APREFIX="Grape_hic"                                        # output file prefix
ASSEMBLY="${GENOMEDIR}/VITVvi_vZin03_v1.1.primary.fasta"   # reference assembly
SPDIR="${WKDIR}/Grape"                                     # per-species output dir
mkdir -p "$SPDIR"
# Shared stem of the paired FASTQ names; R1/R2 differ only in the digit after "_R".
HiC_RAW="${HiCDIR}/Grape-Hi-C_S2_L001_R"
READ1="${HiC_RAW}1_001.fastq.gz"
READ2="${HiC_RAW}2_001.fastq.gz"


## Karaka

In [10]:
# Karaka: select the sample that the Hi-C workflow cell will process.
APREFIX="Karaka_hic"                                          # output file prefix
ASSEMBLY="${GENOMEDIR}/karaka_phasing.hic.hap1.p_ctg.fasta"   # reference assembly
SPDIR="${WKDIR}/Karaka"                                       # per-species output dir
mkdir -p "$SPDIR"
# Shared stem of the paired FASTQ names; R1/R2 differ only in the digit after "_R".
HiC_RAW="${HiCDIR}/Karaka-5-Hi-C_S3_L001_R"
READ1="${HiC_RAW}1_001.fastq.gz"
READ2="${HiC_RAW}2_001.fastq.gz"


## Kiwifruit 2N

In [3]:
# DISABLED - target capture data, do NOT use with the Hi-C workflow.
# The settings are kept commented out for reference only. Note that HiC_RAW
# was left as the bare data directory with no file stem, so READ1/READ2 would
# not resolve to real FASTQ files even if this cell were re-enabled.
#SPDIR=${WKDIR}/Kiwifruit2n
#mkdir -p $SPDIR
#ASSEMBLY=${GENOMEDIR}/R_Haplotigs.fasta
#APREFIX=Kiwifruit2n_hic
#HiC_RAW=${HiCDIR}/
#READ1=${HiC_RAW}1_001.fastq.gz
#READ2=${HiC_RAW}2_001.fastq.gz


In [10]:
# QC run for Elena. The original note ends with "R" - presumably Russell,
# matching the read path below; TODO confirm.
# These reads come from a different sequencing request (AG0936), so HiC_RAW
# is an absolute path rather than being built from HiCDIR.
APREFIX="Kiwifruit2n_hic"                  # output file prefix
ASSEMBLY="${GENOMEDIR}/R_Primary.fasta"    # reference assembly
SPDIR="${WKDIR}/Kiwifruit2n"               # per-species output dir
mkdir -p "$SPDIR"
# Shared stem of the paired FASTQ names; R1/R2 differ only in the digit after "_R".
HiC_RAW="/input/genomic/plant/Actinidia/chinensis/Russell/Genome/HiC/ExperimentRequestor10968/AG0936_dataset/Russell5IIM270PBB_S1_L001_R"
READ1="${HiC_RAW}1_001.fastq.gz"
READ2="${HiC_RAW}2_001.fastq.gz"

In [26]:
# Kiwifruit XX: select the sample that the Hi-C workflow cell will process.
# Both the assembly and the reads live outside GENOMEDIR/HiCDIR, so they are
# given as absolute paths.
APREFIX="Kiwifruit_XX_hic"        # output file prefix
ASSEMBLY="/workspace/hraijc/Kiwi/XX/HiC8_Novaseq/RefGenome/CK69_01_v2_contigs_min1kb.fasta"
SPDIR="${WKDIR}/Kiwifruit_XX"     # per-species output dir
mkdir -p "$SPDIR"
# Shared stem of the paired FASTQ names; R1/R2 differ only in the digit after "_R".
HiC_RAW="/workspace/hraijc/Kiwi/XX/HiC8_MiSeq/220622_M01815_0436/HiC8-lib_S1_L001_R"
READ1="${HiC_RAW}1_001.fastq.gz"
READ2="${HiC_RAW}2_001.fastq.gz"

## Raspberry

In [8]:
# Raspberry (Wakefield): select the sample that the Hi-C workflow cell will process.
# The reads come from a separate MiSeq run, so HiC_RAW is an absolute path.
APREFIX="Wakefield_hic"                    # output file prefix
ASSEMBLY="${GENOMEDIR}/Wakefield.fasta"    # reference assembly
SPDIR="${WKDIR}/Wakefield"                 # per-species output dir
mkdir -p "$SPDIR"
# Shared stem of the paired FASTQ names; R1/R2 differ only in the digit after "_R".
HiC_RAW="/input/genomic/plant/Rubus/idaeus/Wakefield_genome/HIC/MiseqRun/RI-Hi-C_S2_L001_R"
READ1="${HiC_RAW}1_001.fastq.gz"
READ2="${HiC_RAW}2_001.fastq.gz"

In [16]:
#Run HiC workflow
#
# Submits one Slurm job that cleans, aligns, duplicate-marks, filters and QCs
# the Hi-C read pair selected by the most recently executed "Set Variables"
# cell. Reads: SPDIR, ASSEMBLY, APREFIX, READ1, READ2, HiC_CLEAN, TEMP, LOGDIR.
#
# NOTE: the heredoc delimiter is deliberately unquoted, so every variable
# below is expanded NOW, at submission time, and the job script contains
# literal paths. Re-run the species cell before re-submitting. Because of
# that expansion, do not put dollar signs or backticks in comments inside
# the heredoc.

sbatch << EOF
#!/bin/bash
#SBATCH -J hic
#SBATCH -o ${LOGDIR}/%J.out
#SBATCH -e ${LOGDIR}/%J.err
#SBATCH --cpus-per-task=12
#SBATCH --mem=10G
#SBATCH --time=5:00:00

module load bwa/0.7.17
module load samtools/1.16
module load conda
module load fastp/0.23.2

# Stop at the first failing step so later steps never consume stale or
# partial files left over from an earlier run.
set -eo pipefail

cd "${SPDIR}"

### Clean HiC Reads with fastp.###
fastp \
-i "${READ1}" \
-o "${HiC_CLEAN}/${APREFIX}_Clean_R1.fastq.gz" \
-I "${READ2}" \
-O "${HiC_CLEAN}/${APREFIX}_Clean_R2.fastq.gz" \
--trim_front1 15 \
--trim_front2 15 \
--qualified_quality_phred 20 \
--length_required 50 \
--thread 12


###Align reads with BWA. Use -5SP for Hi-C reads.###
# Build the index only when it is missing; the .sa file is written last by
# bwa index, so its presence indicates a complete index.
[ -f "${ASSEMBLY}.sa" ] || bwa index "${ASSEMBLY}"
# Options are placed before the positional arguments so parsing does not
# depend on getopt argument permutation.
bwa mem -5SP -t12 -o "${TEMP}/${APREFIX}.sam" "${ASSEMBLY}" "${HiC_CLEAN}/${APREFIX}_Clean_R1.fastq.gz" "${HiC_CLEAN}/${APREFIX}_Clean_R2.fastq.gz"


###Flag PCR Duplicates with SAMBLASTER###
/workspace/hraijc/git_clones/samblaster/samblaster -i "${TEMP}/${APREFIX}.sam" -o "${TEMP}/${APREFIX}_marked_byread.sam"


### Remove unmapped and non-primary aligned reads. Sort and Index bam files.###
# -F 2316 drops: 4 read unmapped, 8 mate unmapped, 256 secondary,
# 2048 supplementary. Duplicates stay in, flagged ("marked" set).
samtools view -b -h -@ 12 -F 2316 "${TEMP}/${APREFIX}_marked_byread.sam" > "${TEMP}/${APREFIX}_presort_marked.bam"

samtools sort -@ 12 "${TEMP}/${APREFIX}_presort_marked.bam" -o "${APREFIX}_marked.bam"
samtools sort -@ 12 -n "${TEMP}/${APREFIX}_presort_marked.bam" -o "${APREFIX}_marked_namesort.bam"

# -F 3340 = 2316 + 1024: additionally drops reads flagged as duplicates.
# Written to its own temp file instead of overwriting the "marked" one.
samtools view -b -h -@ 12 -F 3340 "${APREFIX}_marked.bam" > "${TEMP}/${APREFIX}_presort_dedup.bam"

samtools sort -@ 12 "${TEMP}/${APREFIX}_presort_dedup.bam" -o "${APREFIX}_dedup.bam"
samtools sort -@ 12 -n "${TEMP}/${APREFIX}_presort_dedup.bam" -o "${APREFIX}_dedup_namesort.bam"

samtools index -@ 12 "${APREFIX}_marked.bam"
samtools index -@ 12 "${APREFIX}_dedup.bam"

samtools flagstat -@ 12 "${APREFIX}_marked.bam" > flagstat.txt
samtools flagstat -@ 12 "${APREFIX}_dedup.bam" > flagstat_dedup.txt


# Everything below is reporting only. Run it best-effort so a failing QC
# tool (or a conda activation hook returning non-zero) does not abort the
# remaining reports.
set +e

### HiC QC from Phase genomics ###
conda activate hraijc_hic_qc2
python /workspace/hraijc/git_clones/hic_qc/hic_qc.py -n -1 -b "${APREFIX}_marked_namesort.bam" --outfile_prefix "${APREFIX}_marked"
python /workspace/hraijc/git_clones/hic_qc/hic_qc.py -n -1 -b "${APREFIX}_dedup_namesort.bam" --outfile_prefix "${APREFIX}_dedup"

conda deactivate


## Viz HiC Coverage #####
# -o is the short form of --plotFile, so only one of the two is given.
/home/hraijc/.local/bin/plotCoverage -b "${APREFIX}_marked.bam" -n 100000 -o HiC_Coverage_allreads.png --labels "${APREFIX}" --plotTitle "HiC_Coverage all reads"
/home/hraijc/.local/bin/plotCoverage -b "${APREFIX}_dedup.bam" -n 100000 -o HiC_Coverage_unique_reads.png --labels "${APREFIX}" --plotTitle "HiC_Coverage PCR duplicates removed"


### Run YAHS ###
#/workspace/hraijc/git_clones/yahs/yahs "${ASSEMBLY}" "${APREFIX}_dedup.bam" -o "${APREFIX}_yahs"


### YAHS without error correction. ###
#/workspace/hraijc/git_clones/yahs/yahs --no-contig-ec --no-scaffold-ec "${ASSEMBLY}" "${APREFIX}_dedup.bam" -o "${APREFIX}_yahsNoEC"

EOF


Submitted batch job 949349
