# Sub-sample from the 1-billion-read Omni-C data

## Sub-sample ~2 million read pairs for testing purposes

In [3]:
# ---- Inputs ---------------------------------------------------------------
# Directory holding the raw Hi-C FASTQ files (trailing slash kept as-is).
RAW=/input/genomic/fish/Macruronus/novaezelandiae/reference_genome/HIC/
# R1 / R2 FASTQ file names; later cells `cd $RAW` before using them.
R1=HH_Hi-C_S1_R1_001.fastq.gz
R2=HH_Hi-C_S1_R2_001.fastq.gz
# Assembly FASTA.
REF=/output/genomic/fish/Macruronus/novaezelandiae/Assembly/Mno_genome.FLYE.PILON.noBac.fasta

# ---- Outputs --------------------------------------------------------------
# Root of this analysis; each subsample gets its own work dir and hidden log dir.
baseDir=/workspace/hrachd/Fish/2021/HiC

workDir="${baseDir}/10_2M_data"
logDir="${workDir}/.log"

workDir2="${baseDir}/11_200M_data"
logDir2="${workDir2}/.log"

# Directory creation is left disabled, as in the original cell.
# NOTE(review): later bsub -o/-e paths point into these directories, so they
# presumably must already exist -- re-enable on a fresh checkout.
#mkdir -p $logDir $logDir2


In [4]:
# Load seqtk and move to the raw-data directory so $R1 / $R2 resolve.
module load seqtk/1.2
cd $RAW

# Output file names for the test subsample.
m2R1=HH_HiC2M_R1.fastq
m2R2=HH_HiC2M_R2.fastq

# Expected yield: 569125006*0.005 = 2,845,625 reads
# Both submissions use the same seed (-s100) and the same fraction (0.005).

# R1
bsub -q priority -P P/952006/04 \
     -o $logDir/01_subsampleR1.log \
     -e $logDir/01_subsampleR1.err \
     -J 2MR1 \
     "seqtk sample -s100 $R1 0.005 > $workDir/$m2R1"

# R2
bsub -q priority -P P/952006/04 \
     -o $logDir/01_subsampleR2.log \
     -e $logDir/01_subsampleR2.err \
     -J 2MR2 \
     "seqtk sample -s100 $R2 0.005 > $workDir/$m2R2"


Job <94052> is submitted to queue <priority>.
Job <94053> is submitted to queue <priority>.


In [5]:
# List current LSF jobs to check on the submissions.
bjobs 

JOBID   USER    STAT  QUEUE      FROM_HOST   EXEC_HOST   JOB_NAME   SUBMIT_TIME
90980   hrachd  SSUSP priority   aklppf31    wkoppb30    Hoki       Jul  7 09:59
                                             wkoppb30
                                             wkoppb30
                                             wkoppb30
                                             wkoppb30
                                             wkoppb30
                                             wkoppb30
                                             wkoppb30
                                             wkoppb30
                                             wkoppb30
                                             wkoppb30
                                             wkoppb30
                                             wkoppb30
                                             wkoppb30
                                             wkoppb30
                                             wkoppb30
                             

## Also create ~200 million read pairs for the actual run

In [7]:
# Output file names for the ~200M-pair subsample used for the actual run.
m200R1=HH_HiC200M_R1.fastq
# Fixed: this previously repeated the R1 name, so both jobs below wrote to the
# same file and no R2 output was produced.
m200R2=HH_HiC200M_R2.fastq
# 569125006*0.4 = 227,650,002 reads
# Both submissions use the same seed (-s80) and fraction (0.4).
# NOTE(review): relies on `module load seqtk/1.2` and `cd $RAW` from the 2M
# subsample cell having been run in this session.
bsub -q priority -P P/952006/04 -o $logDir2/01_subsampleR1.log -e $logDir2/01_subsampleR1.err -J 200MR1 "seqtk sample -s80 $R1 0.4 > $workDir2/$m200R1"
bsub -q priority -P P/952006/04 -o $logDir2/01_subsampleR2.log -e $logDir2/01_subsampleR2.err -J 200MR2 "seqtk sample -s80 $R2 0.4 > $workDir2/$m200R2"

Job <94056> is submitted to queue <priority>.
Job <94057> is submitted to queue <priority>.


## Prepare reference genome

In [8]:
# Source assembly and the local names derived from it.
assembly=/output/genomic/fish/Macruronus/novaezelandiae/Assembly/Mno_genome.FLYE.PILON.noBac.fasta
genome=Hoki_Mno.fasta
faiFile=$genome'.fai'
genomeFile=Hoki_Mno.genome
genomeDir=/workspace/hrachd/Fish/2021/HiC/00_genome
mkdir -p $genomeDir/.log

cd $genomeDir
# -f replaces an existing link so re-running this cell does not fail with
# "File exists".
ln -sf $assembly $genome
# NOTE(review): samtools version is unpinned here, while the mapping cell loads
# samtools/1.12 -- consider pinning for reproducibility.
module load samtools
# Index the FASTA, then write the first two columns of the .fai to the genome
# file. `&&` (instead of `;`) stops `cut` from running on a missing or stale
# .fai when faidx fails.
bsub -o .log/01.fai.log -e .log/01.fai.err -J fai "samtools faidx $genome && cut -f1,2 $faiFile > $genomeFile"

Job <94062> is submitted to default queue <lowpriority>.


In [10]:
# Submit BWA indexing of the linked genome FASTA.
# The log paths are relative, so this runs from the current directory
# (presumably still $genomeDir from the previous cell -- verify before re-running).
module load bwa/0.7.17
bsub -o .log/02.idx.log \
     -e .log/02.idx.err \
     -J idx \
     -q priority -P P/952006/04 \
     "bwa index $genome"

Job <94071> is submitted to queue <priority>.


In [11]:
# Show only the job lines that mention user hrachd (the header row is filtered out too).
bjobs | grep hrachd

90980   hrachd  SSUSP priority   aklppf31    wkoppb30    Hoki       Jul  7 09:59
89858   hrachd  RUN   priority   aklppb34    aklppb34    VB         Jul  5 10:48
94056   hrachd  RUN   priority   aklppj31    aklppb39    200MR1     Jul  9 11:22
94057   hrachd  RUN   priority   aklppj31    aklppb39    200MR2     Jul  9 11:22
94071   hrachd  RUN   priority   aklppj31    aklppg32    idx        Jul  9 11:33
90539   hrachd  PEND  lowpriorit aklppb34                *ene.gff3) Jul  6 10:46
90540   hrachd  PEND  lowpriorit aklppb34                *BI.fasta) Jul  6 10:46
90541   hrachd  PEND  lowpriorit aklppb34                *meta.csv) Jul  6 10:47


## From FastQ to Valid pairs: 2M pairs of TEST reads

Refer to [Omni-C, the latest doc](https://omni-c.readthedocs.io/en/latest/fastq_to_bam.html)

A request to [install the latest Omni-C tools](https://github.com/powerPlant/powerPlant/issues/2737) has been filed, but the installation is not available yet. Hence, try Amali's old installation.

In [13]:
# Sanity check: confirm the 2M-test log and work directories are still set in this session.
echo $logDir
echo $workDir

/workspace/hrachd/Fish/2021/HiC/10_2M_data/.log
/workspace/hrachd/Fish/2021/HiC/10_2M_data


In [24]:
# Map the 2M test read pairs and derive pairs / BAM / stats in one LSF job.
# Inputs : $genomeDir, $genome, $genomeFile, $workDir, $logDir from earlier cells.
# Outputs: $statsFile, $pairsFile, $mappedFile (+ index), written in $workDir.
referenceGenome=$genomeDir/$genome
thread=10
tmpDir=/workspace/hrachd/tmp/hoki/m2test
# pairtools sort is pointed at this scratch directory via --tmpdir; create it
# if missing (idempotent) so a fresh run does not depend on manual setup.
mkdir -p $tmpDir
m2R1=HH_HiC2M_R1.fastq
m2R2=HH_HiC2M_R2.fastq

statsFile=test.2MReads.stats.txt
pairsFile=test.2MReads.mapped.pairs
mappedFile=test.2MReads.mapped.PT.bam

## Use Amali's old installation
module load conda
conda activate hraaxt_pairtools
export PATH=/workspace/hraaxt/Tools/samblaster:/workspace/hraaxt/Tools/preseq_v2.0/:$PATH
module load bwa/0.7.17
module load samtools/1.12

# Read file names above are relative, so submit from the work directory.
cd $workDir

# The heredoc is unquoted on purpose: every $variable below is expanded now,
# at submission time, so the job script contains literal paths.
bsub << EOF
#!/bin/bash
#BSUB -J M2Bwa
#BSUB -o $logDir/03_bwa.log
#BSUB -e $logDir/03_bwa.err
#BSUB -n $thread
#BSUB -q priority 
#BSUB -P P/952006/04
#BSUB -R "span[hosts=1] rusage[mem=200000]" 

# Fail the job if any stage of the pipeline fails, not only the last one.
set -eo pipefail

# align -> parse -> sort -> dedup -> split -> BAM -> coordinate sort -> index
bwa mem -5SP -T0 -t $thread $referenceGenome $m2R1 $m2R2 |
pairtools parse --min-mapq 40 --walks-policy 5unique \
    --max-inter-align-gap 30 --nproc-in $thread --nproc-out $thread \
    --chroms-path $genomeDir/$genomeFile |
pairtools sort --tmpdir=$tmpDir --nproc $thread |
pairtools dedup --nproc-in $thread --nproc-out $thread \
    --mark-dups --output-stats $statsFile |
pairtools split --nproc-in $thread --nproc-out $thread \
    --output-pairs $pairsFile --output-sam - |
samtools view -bS -@ $thread |
samtools sort -@ $thread -o $mappedFile &&
samtools index $mappedFile

EOF

(/workspace/appscratch/miniconda/hraaxt_pairtools) (/workspace/appscratch/miniconda/hraaxt_pairtools) (/workspace/appscratch/miniconda/hraaxt_pairtools) (/workspace/appscratch/miniconda/hraaxt_pairtools) (/workspace/appscratch/miniconda/hraaxt_pairtools) (/workspace/appscratch/miniconda/hraaxt_pairtools) (/workspace/appscratch/miniconda/hraaxt_pairtools) (/workspace/appscratch/miniconda/hraaxt_pairtools) (/workspace/appscratch/miniconda/hraaxt_pairtools) (/workspace/appscratch/miniconda/hraaxt_pairtools) (/workspace/appscratch/miniconda/hraaxt_pairtools) (/workspace/appscratch/miniconda/hraaxt_pairtools) (/workspace/appscratch/miniconda/hraaxt_pairtools) (/workspace/appscratch/miniconda/hraaxt_pairtools) (/workspace/appscratch/miniconda/hraaxt_pairtools) (/workspace/appscratch/miniconda/hraaxt_pairtools) (/workspace/appscratch/miniconda/hraaxt_pairtools) (/workspace/appscratch/miniconda/hraaxt_pairtools) (/workspace/appscratch/miniconda/hraaxt_pairtools) (/workspace/appscratch/minicond

: 1

In [25]:
# Check that the M2Bwa mapping job appears among hrachd's jobs.
bjobs | grep hrachd

90980   hrachd  SSUSP priority   aklppf31    wkoppb30    Hoki       Jul  7 09:59
89858   hrachd  RUN   priority   aklppb34    aklppb34    VB         Jul  5 10:48
95029   hrachd  RUN   priority   aklppj31    aklppb34    M2Bwa      Jul  9 14:48
90539   hrachd  PEND  lowpriorit aklppb34                *ene.gff3) Jul  6 10:46
90540   hrachd  PEND  lowpriorit aklppb34                *BI.fasta) Jul  6 10:46
90541   hrachd  PEND  lowpriorit aklppb34                *meta.csv) Jul  6 10:47
(/workspace/appscratch/miniconda/hraaxt_pairtools) 

: 1

In [26]:
# Inspect the log directory: file sizes and timestamps of the job .log/.err files.
ls -l $logDir/

total 192
-rw-rw-r-- 1 hrachd powerplant    0 Jul  9 10:58 01_subsampleR1.err
-rw-rw-r-- 1 hrachd powerplant 1024 Jul  9 11:14 01_subsampleR1.log
-rw-rw-r-- 1 hrachd powerplant    0 Jul  9 10:58 01_subsampleR2.err
-rw-rw-r-- 1 hrachd powerplant 1024 Jul  9 11:18 01_subsampleR2.log
-rw-rw-r-- 1 hrachd powerplant    0 Jul  9 14:48 03_bwa.err
-rw-rw-r-- 1 hrachd powerplant    0 Jul  9 14:48 03_bwa.log
(/workspace/appscratch/miniconda/hraaxt_pairtools) 

: 1

In [27]:
# Re-check job status for user hrachd.
bjobs | grep hrachd

90980   hrachd  SSUSP priority   aklppf31    wkoppb30    Hoki       Jul  7 09:59
89858   hrachd  RUN   priority   aklppb34    aklppb34    VB         Jul  5 10:48
95029   hrachd  RUN   priority   aklppj31    aklppb34    M2Bwa      Jul  9 14:48
90539   hrachd  PEND  lowpriorit aklppb34                *ene.gff3) Jul  6 10:46
90540   hrachd  PEND  lowpriorit aklppb34                *BI.fasta) Jul  6 10:46
90541   hrachd  PEND  lowpriorit aklppb34                *meta.csv) Jul  6 10:47
(/workspace/appscratch/miniconda/hraaxt_pairtools) 

: 1