In [None]:
#######################################
Function: 
1. Make a directory for the BAM files that need to be aligned together.
2. Make a directory for the BAM files that need to be merged together.
3. Make a directory to store errors and output logs for the alignment process.
4. Make a directory to store errors and output logs for the merging process.
5. Make a directory to store errors and output logs for the duplication removal process.
6. Make a directory to store errors and output logs for adding read group IDS.
7. Make a directory to store temporary files that are created when running the script processes.
#######################################

In [95]:
mkdir -p 00.data/alignment_files
mkdir -p 00.data/merging_files
mkdir -p 02.alignment/log
mkdir -p 03.merged_bam_files/log
mkdir -p 04.mark_duplicates/log
mkdir -p 05.add_read_group_IDs/log
mkdir -p tmp

In [None]:
#######################################
Function: 
1. Extract data from the SQLite database of the reference and fastq data tables.
2. Make a ref.tab file to store the path of the reference and the reference filename from the extracted data after trimming 
these variables of enters and dashes so they are readable as input for process running.
3. Make a read_groups.tab file to store group identifiers in after formatting the FASTQ ID to group names.
4. Adding these group names to a samples.tab file, along with all trimmed read data from the fastq data table.
#######################################

In [None]:
# Extract ref and fastq data.
import sqlite3
con = sqlite3.connect(r"/powerplant/workspace/hramzr/DNAseq_mapping/db_mapping.db")
mycur = con.cursor() 
mycur.execute("SELECT fastq_id,read1_L001_trimmed,read2_L001_trimmed,read1_L002_trimmed,read2_L002_trimmed,trimmed_path FROM fastq_data")
dna_data_file=(mycur.fetchall())
mycur.execute("SELECT * FROM ref_data")
ref_data_file=(mycur.fetchall())

# Make REF tab.
f= open("00.data/ref.tab","w+")
ref_data_file = str(ref_data_file[0]).split(",")
IPATH = ref_data_file[1][2:66]
INDEX = ref_data_file[2][2:44]
f.write(IPATH + "\t" + INDEX +"\n")
f.close()
            

# Make groups tab.
f= open("00.data/read_groups.tab", "w+")
for line in dna_data_file:
    FID, R1L1T, R2L1T, R1L2T, R2L2T, TRPTH = line
    if FID < 10:
        f.write("0" + str(FID) + "_group" + "\t" + str(FID) +"\n")    
    else:
        f.write(str(FID) + "_group" + "\t" + str(FID) +"\n")  
f.close()            

# Retrieve group names.
name_list = []
with open("00.data/read_groups.tab") as f:
   for line in f:
        line = line.split("\t")
        name_list.append(line[0])

# Make samples tab.
f= open("00.data/samples.tab", "w+")
i=0
for line in dna_data_file:
    FID, R1L1T, R2L1T, R1L2T, R2L2T, TRPTH = line
    f.write(name_list[i] + "\t" + str(FID) + "\t" + R1L1T.strip() +  "\t" + R2L1T.strip() + "\t" + R1L2T.strip()
            + "\t" + R2L2T.strip() + "\t"+ TRPTH.strip()+"\n")
    i+=1
f.close()       

#### Generating merging config files

In [None]:
#######################################
Function: 
Remove existing merging files if they exist and
Generate new merging config files by using
the group and group number columns from the samples.tab file to store 
lines with the directory and file that contain the BAM files that need to be merged in order to create 1 animal group.
#######################################

In [7]:
rm 00.data/merging_files/*.txt 

while read -r ERRG FID R1L1T R2L1T R1L2T R2L2T TRPTH
do      
    echo "02.alignment/${FID}L1_${ERRG}.bam" >> 00.data/merging_files/${ERRG}.txt
    echo "02.alignment/${FID}L2_${ERRG}.bam" >> 00.data/merging_files/${ERRG}.txt
done < 00.data/samples.tab

#### Set variables and reference file

In [None]:
#######################################
Function: 
1. Load the samtools v1.7, the bwa v0.7.17 and the picard-tools v 2.18.7 modules to use for the alignment,
merging, duplication removal and adding read groups processes.
2. Set the align directory, for the alignment process.
3. Set the merge directory, for the merging process.
4. Set the mark duplicates directory, for the duplication removal process.
5. Set the add read group directory, for the add read group process.
6. Extract the reference sequence directory and filename from the ref.tab file. And set the ref seq directory,
to find the reference sequence in the alignment process.
#######################################

In [8]:
module load samtools/1.7
module load bwa/0.7.17
module load picard-tools/2.18.7

# Set job to unique 6 characters.

ALIGNDIR=02.alignment
MERGEDIR=03.merged_bam_files
MARKDUPDIR=04.mark_duplicates
ADDRGPSDIR=05.add_read_group_IDs

while read -r INDEX_DIR INDEX
do
    INDEX_OUT_TOP_DIR=$INDEX_DIR
    IDX_NAME=$INDEX
    REFSEQ=${INDEX_OUT_TOP_DIR}/${IDX_NAME}
done < 00.data/ref.tab



#### Run the process

In [None]:
#######################################
Function: 
Set a unique job name for running the specific jobs on a cluster.
#######################################

In [9]:
job=$(head /dev/urandom | tr -dc A-Za-z0-9 | head -c 3 ; echo '')
echo $job

ZcL


In [None]:
#######################################
Input: 
1. The reference sequence.
2. First trimmed reads of the first lane.
3. Second trimmed reads of the first lane.
4. First trimmed read of the second lane.
5. Second trimmed read of the second lane.

Function: 
1. Read the samples.tab file for its trimmed read data, then use this data to align these reads
to the reference sequence to create BAM files per lane 1 and 2 belonging to 1 animal group.

Output:
1. Lane 1 BAM file containing the aligned reads of lane 1.
2. Lane 2 BAM file containing the aligned reads of lane 2.
#######################################

In [1]:
#Alignment
jobnum=0

while read -r ERRG FID R1L1T R2L1T R1L2T R2L2T TRPTH
do
    bsub \
    -o ${ALIGNDIR}/log/${FID}L1_${ERRG}.out -e ${ALIGNDIR}/log/${FID}L1_${ERRG}.err -n 8 \
    -J "${job}B${jobnum}" \
    "bwa mem -t 8 \
    ${REFSEQ} ${TRPTH}/${R1L1T} ${TRPTH}/${R2L1T}\
    | samtools view -Su - | samtools sort - -o ${ALIGNDIR}/${FID}L1_${ERRG}.bam"
    jobnum=$((jobnum + 1))
    
    
    bsub \
    -o ${ALIGNDIR}/log/${FID}L2_${ERRG}.out -e ${ALIGNDIR}/log/${FID}L2_${ERRG}.err -n 8 \
    -J "${job}B${jobnum}" \
    "bwa mem -t 8 \
    ${REFSEQ} ${TRPTH}/${R1L2T} ${TRPTH}/${R2L2T}\
    | samtools view -Su - | samtools sort - -o ${ALIGNDIR}/${FID}L2_${ERRG}.bam"
    jobnum=$((jobnum + 1))
    
done < 00.data/samples.tab

bash: Input:: command not found
bash: 1.: command not found
bash: 2.: command not found
bash: 3.: command not found
bash: 4.: command not found
bash: 5.: command not found
bash: Function:: command not found
bash: 1.: command not found
bash: to: command not found
bash: Output:: command not found
bash: 1.: command not found
bash: 2.: command not found
bash: 00.data/samples.tab: No such file or directory


: 1

In [None]:
#######################################
Input: 
Lane 1 and lane 2 BAM files.

Function: 
Merge the two lane BAM files created in alignment in one BAM file named after the corresponding animal group.

Output:
Merged BAM file beloning to 1 animal group.
#######################################

In [None]:
# Merging, note that using -f to force in case directory isn't clear of previous files
while read -r ERRG LINE R1L1T R2L1T R1L2T R2L2T TRPTH
do
    FILE=00.data/merging_files/${ERRG}.txt
    bsub -w "done(${job}B*)" -o ${MERGEDIR}/log/${ERRG}.out -e ${MERGEDIR}/log/${ERRG}.err -n 8 -J "${job}C${LINE}" \
    "samtools merge -f -b $FILE ${MERGEDIR}/${ERRG}.bam"
done < 00.data/samples.tab

In [None]:
#######################################
Input: 
Animal group BAM files.

Function: 
Remove duplicates left over from PCR to mitigate variant call biases.

Output:
Animal group BAM files with duplicates removed.
#######################################

In [None]:
# Remove duplicates.
PICARD=/software/bioinformatics/picard-tools-2.18.7/picard.jar
while read -r ERRG LINE R1L1T R2L1T R1L2T R2L2T TRPTH
do
    bsub -w "done(${job}C${LINE})" -o ${MARKDUPDIR}/log/${ERRG}.out -e ${MARKDUPDIR}/log/${ERRG}.err -n 8 -J "${job}D${LINE}" \
    -R "rusage[mem=40800] span[hosts=1]" \
    "java -jar -Xmx32G -Djava.io.tmpdir=`pwd`/tmp $PICARD MarkDuplicates \
    INPUT=${MERGEDIR}/${ERRG}.bam \
    OUTPUT=${MARKDUPDIR}/${ERRG}.bam \
    AS=true \
    MAX_RECORDS_IN_RAM=500000 \
    MAX_FILE_HANDLES=1000 \
    M=${MARKDUPDIR}/${ERRG}.txt \
    OPTICAL_DUPLICATE_PIXEL_DISTANCE=2500 \
    TMP_DIR=`pwd`/tmp \
    REMOVE_DUPLICATES=true"
done < 00.data/samples.tab

In [None]:
#######################################
Input: 
Animal group BAM files with duplicates removed.

Function: 
Add read group data in order to distinguish read origin when variant calling.

Output:
Animal group BAM files with duplicates removed and read groups added.
#######################################

In [5]:
# Add read groups.
PICARD=/software/bioinformatics/picard-tools-2.18.7/picard.jar
while read -r ERRG LINE R1L1T R2L1T R1L2T R2L2T TRPTH
do
    bsub \
    -m wkoppb37 \
    -o ${ADDRGPSDIR}/log/${ERRG}.out -e ${ADDRGPSDIR}/log/${ERRG}.err -n 8 -J "${job}E${LINE}" \
    "java -jar -Xmx32G -Djava.io.tmpdir=`pwd`/tmp $PICARD AddOrReplaceReadGroups \
    I=${MARKDUPDIR}/${ERRG}.bam \
    O=${ADDRGPSDIR}/${ERRG}.bam \
    MAX_RECORDS_IN_RAM=500000 \
    RGID=${ERRG} \
    RGLB=lib1 \
    RGPL=Illumina \
    RGPU=unit1 \
    RGSM=${ERRG}"
done < 00.data/samples.tab

Job <40373> is submitted to default queue <normal>.
Job <40374> is submitted to default queue <normal>.
Job <40375> is submitted to default queue <normal>.
Job <40376> is submitted to default queue <normal>.
Job <40377> is submitted to default queue <normal>.
Job <40378> is submitted to default queue <normal>.
Job <40379> is submitted to default queue <normal>.
Job <40380> is submitted to default queue <normal>.
Job <40381> is submitted to default queue <normal>.
Job <40382> is submitted to default queue <normal>.
Job <40383> is submitted to default queue <normal>.
Job <40384> is submitted to default queue <normal>.
Job <40385> is submitted to default queue <normal>.


In [None]:
#######################################
Input: 
Animal group BAM files with duplicates removed and read groups added.

Function: 
Index the BAM files to use for further processing.

Output:
Indexed BAM files.
#######################################

In [10]:
while read -r ERRG LINE R1L1T R2L1T R1L2T R2L2T TRPTH
do
    bsub  -J "${job}F${LINE}" -o ${ADDRGPSDIR}/log/I${ERRG}.out -e ${ADDRGPSDIR}/log/I${ERRG}.err "samtools index ${ADDRGPSDIR}/${ERRG}.bam"
done < 00.data/samples.tab

Job <41411> is submitted to default queue <normal>.
Job <41412> is submitted to default queue <normal>.
Job <41413> is submitted to default queue <normal>.
Job <41414> is submitted to default queue <normal>.
Job <41415> is submitted to default queue <normal>.
Job <41416> is submitted to default queue <normal>.
Job <41417> is submitted to default queue <normal>.
Job <41418> is submitted to default queue <normal>.
Job <41419> is submitted to default queue <normal>.
Job <41420> is submitted to default queue <normal>.
Job <41421> is submitted to default queue <normal>.
Job <41422> is submitted to default queue <normal>.
Job <41423> is submitted to default queue <normal>.


In [None]:
#######################################
Function: 
Check the jobs running on the cluster to see the status of the script running.
#######################################

In [68]:
bjobs

JOBID   USER    STAT  QUEUE      FROM_HOST   EXEC_HOST   JOB_NAME   SUBMIT_TIME
30603   hramzr  RUN   normal     aklppj31    wkoppb43    iUdF1      Oct 10 10:36
30604   hramzr  RUN   normal     aklppj31    wkoppb43    iUdF2      Oct 10 10:36
30605   hramzr  RUN   normal     aklppj31    wkoppb43    iUdF3      Oct 10 10:36
30606   hramzr  RUN   normal     aklppj31    wkoppb43    iUdF4      Oct 10 10:36
30607   hramzr  RUN   normal     aklppj31    wkoppb43    iUdF5      Oct 10 10:36
30608   hramzr  RUN   normal     aklppj31    wkoppb43    iUdF6      Oct 10 10:36
30609   hramzr  RUN   normal     aklppj31    wkoppb43    iUdF7      Oct 10 10:36
30610   hramzr  RUN   normal     aklppj31    wkoppb43    iUdF8      Oct 10 10:36
30611   hramzr  RUN   normal     aklppj31    wkoppb43    iUdF9      Oct 10 10:36
30612   hramzr  RUN   normal     aklppj31    wkoppb43    iUdF10     Oct 10 10:36
30613   hramzr  RUN   normal     aklppj31    wkoppb37    iUdF11     Oct 10 10:36
30614   hramzr  RUN   normal 

In [None]:
#######################################
Function: 
Kill jobs, incase something goes wrong.
#######################################

In [None]:
bkill 0

In [None]:
#######################################
Function: 
Check available hosts to use.
#######################################

In [5]:
bhosts

HOST_NAME          STATUS       JL/U    MAX  NJOBS    RUN  SSUSP  USUSP    RSV 
aklppb31           closed          -     10     10     10      0      0      0
aklppb32           closed          -     10     10     10      0      0      0
aklppb34           ok              -    160     80     80      0      0      0
aklppb35           closed          -      8      8      8      0      0      0
aklppb36           ok              -     40     24     24      0      0      0
aklppb37           closed          -     10     10     10      0      0      0
aklppb40           ok              -     10      0      0      0      0      0
aklppb41           ok              -     10      1      1      0      0      0
aklppb42           ok              -     10      3      3      0      0      0
aklppb43           ok              -     10      3      3      0      0      0
aklppb44           ok              -     10      8      8      0      0      0
aklppf31           closed          -      0      0 

In [None]:
#######################################
Function: 
Check directory.
#######################################

In [88]:
pwd

/powerplant/workspace/hramzr/DNAseq_mapping/alignment
