In [41]:
%%writefile ./scATAC_pipe.sh

# ---- Run configuration ------------------------------------------------------
# Gzip-compressed read 1 / read 2 FASTQ files handed to ATAC_split_fastq.py.
read1="../../nobackup/170109/fastq/Undetermined_S0_R1_001.fastq.gz"
read2="../../nobackup/170109/fastq/Undetermined_S0_R2_001.fastq.gz"
# Root folder; every stage below creates its own sub-folder underneath it.
output_folder="../../nobackup/170109/scATAC_output/"
# P7 / P5 index lists handed to ATAC_split_fastq.py.
P7_index="./P7_index.txt"
P5_index="./P5_index.txt"
# Barcode list handed to samfile_split_permuted.sh.
barcode_file="/net/shendure/vol1/home/cao1025/analysis_script/ATAC_RNA_coassay_pipe/N7_N5_barcode_txt"
# Value used for `sem -j+$core` and for `bowtie2 -p $core`.
core=20
# Cutoff value handed to samfile_split_permuted.sh.
cutoff=500

# Folder holding the helper scripts, and the Python interpreter used to run them.
script_folder="/net/shendure/vol1/home/cao1025/analysis_script/ATAC_RNA_coassay_pipe/scATAC_seq/"
python="/net/shendure/vol1/home/cao1025/anaconda2/bin/python2.7"


# Demultiplex the raw reads: ATAC_split_fastq.py receives read1, read2, the
# output folder and the P7/P5 index files, and writes its per-sample FASTQ files
# together with sample_ID.txt into $output_folder/fastq.
fastq_folder=$output_folder/fastq
mkdir -p "$fastq_folder"
# Abort if the split fails: every later stage reads sample_ID.txt and the
# per-sample FASTQ files produced here.
"$python" "$script_folder/ATAC_split_fastq.py" "$read1" "$read2" "$fastq_folder" "$P7_index" "$P5_index" || { echo "ATAC_split_fastq.py failed; stopping the pipeline." >&2; exit 1; }
# -f replaces .gz files left over from an earlier run, so that stale data is
# never mixed with the freshly written FASTQ files.
gzip -f "$fastq_folder"/*.fastq

# Trim the reads with trim_galore, one job per sample queued through GNU sem.
echo
echo "Start trimming the files..."
echo $(date)
module load python/2.7.3
module load cutadapt/1.8.3
module load trim_galore/0.4.1
trimmed_fastq=$output_folder/trimmed_fastq
sample_ID=$output_folder/fastq/sample_ID.txt
# -p: do not fail when the folder already exists (e.g. when the script is re-run).
mkdir -p "$trimmed_fastq"
for sample in $(cat "$sample_ID"); do
    echo trimming $sample
    # The R1/R2 globs are expanded by this shell before sem receives the command.
    # NOTE(review): in GNU parallel "-j+N" means "number of CPU cores + N" - confirm
    # that this, rather than "-j $core", is the intended job limit.
    sem -j+$core trim_galore "$fastq_folder"/$sample*R1*.gz "$fastq_folder"/$sample*R2*.gz --paired -a CTGTCTCTTATA -a2 CTGTCTCTTATA --three_prime_clip_R1 1 --three_prime_clip_R2 1 -o "$trimmed_fastq"
done
# Barrier: block until every queued trimming job has finished, so that the
# alignment stage never starts on partially written trimmed files.
sem --wait
echo "All trimmed file generated."
module unload python/2.7.3


# Align the trimmed read pairs with bowtie2.
# For every sample listed in the sample ID file, bowtie2 runs in --local mode on
# the sample's R1/R2 files (-1/-2, i.e. paired-end) against $index and writes
# $bowtie_folder/<sample>.sam; bowtie2's stderr is appended to report.txt.
input_folder=$trimmed_fastq
sample_ID=$output_folder/fastq/sample_ID.txt
bowtie_folder=$output_folder/bowtie2_align
mkdir -p $bowtie_folder
index="/net/shendure/vol10/nobackup/shared/alignments/bowtie2-2.0.2/human_g1k_hs37d5/hs37d5"

# Report the settings of this stage.
echo
echo "Start the alignment using bowtie2"
echo input folder: $input_folder
echo sample ID file: $sample_ID
echo output folder: $bowtie_folder
echo index: $index

module load bowtie2/2.2.3
mkdir -p $output_folder

# Samples are aligned one after another; each bowtie2 run uses $core threads.
for sample in $(cat $sample_ID)
do
    echo aligning $sample
    bowtie2 -p $core --local -X 2000 -x $index -1 $input_folder/$sample*R1*.gz -2 $input_folder/$sample*R2*.gz -S $bowtie_folder/$sample.sam 2>>$bowtie_folder/report.txt
done
echo "all alignment done"


# Filter the bowtie2 alignments and remove duplicates.
#   1. mapped_bam/<sample>.bam : records kept by `samtools view -F 4 -q 30`, with
#      lines whose 3rd column is "MT" or "chrM" dropped, then sorted.
#   2. rmdup_bam/<sample>.bam  : output of Picard MarkDuplicates run with
#      REMOVE_DUPLICATES=true on the mapped BAM.
input_folder=$bowtie_folder
sample_ID=$output_folder/fastq/sample_ID.txt
filtered_folder=$output_folder/filtered_bam
# Each Picard job gets its own metrics file. A single shared METRICS_FILE would
# be overwritten concurrently by all jobs running in parallel.
metrics_folder=$filtered_folder/picard_metrics
# -p: creating the folders must not fail when they already exist (re-runs).
mkdir -p "$filtered_folder/mapped_bam" "$filtered_folder/rmdup_bam" "$metrics_folder"

# filter the files and generate the mapped bam file
for sample in $(cat "$sample_ID"); do
    echo "generating mapped bam file" $sample
    # NOTE(review): only the first `samtools view` is handed to sem; the remaining
    # pipeline stages run in this shell, so the samples are presumably processed
    # one at a time - confirm whether parallel execution was intended.
    sem -j+$core samtools view -h -F 4 -q 30 $input_folder/$sample.sam |awk '$3 != "MT" && $3 != "chrM"' -|samtools view -bh -|samtools sort -|samtools view -bh>$filtered_folder/mapped_bam/$sample.bam
done
# Barrier: wait for every queued job before the BAM files are read again.
sem --wait
echo All mapped file generated~

# remove duplicates using Picard
echo Remove duplicates
for sample in $(cat "$sample_ID"); do
    echo removing duplicates $sample
    sem -j+$core java -Xmx1G -jar /net/gs/vol3/software/modules-sw/picard/1.105/Linux/RHEL6/x86_64/MarkDuplicates.jar INPUT=$filtered_folder/mapped_bam/$sample.bam OUTPUT=$filtered_folder/rmdup_bam/$sample.bam REMOVE_DUPLICATES=true ASSUME_SORTED=True METRICS_FILE=$metrics_folder/$sample.metrics.txt VALIDATION_STRINGENCY=LENIENT
done
# Barrier: all duplicate-removed BAM files must be complete before conversion.
sem --wait
echo All remove duplicated files generated~ 

# Convert the duplicate-removed BAM files to SAM, split them by barcode with
# samfile_split_permuted.sh, then run the human/mouse and fragment-size reports.
bam_folder=$filtered_folder/rmdup_bam
sample_list=$output_folder/fastq/sample_ID.txt
splitted_folder=$output_folder/splitted_sam
# first convert the bam files into sam files
echo convert bam files to sam files...
for sample in $(cat "$sample_list"); do
    echo converting $sample
    # The redirection applies to sem itself; the queued samtools job writes to it.
    sem -j+$core samtools view -h $bam_folder/$sample.bam>$bam_folder/$sample.sam
done
# Barrier: the split below reads the SAM files, so every conversion job must have
# finished first.
sem --wait
echo All bam files are converted to sam files.

echo Start splitting the sam files based on the barcode...
mkdir -p "$splitted_folder"
bash "$script_folder/samfile_split_permuted.sh" "$bam_folder" "$sample_list" "$splitted_folder" "$barcode_file" "$cutoff" "$core"

# Both report scripts receive the split SAM folder and the barcode_samples.txt
# list that samfile_split_permuted.sh copies into $output_folder.
echo "Start calculating the human and mouse reads number..."
bash "$script_folder/report_human_mouse_fraction.sh" "$splitted_folder" "$output_folder/barcode_samples.txt" "$output_folder/report/human_mouse_read_number"

echo "Calculate the fragment size distribution...."
bash "$script_folder/frag_size.sh" "$splitted_folder" "$output_folder/barcode_samples.txt" "$output_folder/report/frag_size" "$core"
echo "All fragment size calculation is done."


# Count the reads that survive each stage of the pipeline.
# One row per sample is appended to $report_folder/read_number.csv holding:
#   - reads in the demultiplexed fastq   (lines of the R2 file / 4)
#   - reads in the trimmed fastq         (lines of the R2 file / 4)
#   - records of the bowtie2 SAM kept by `samtools view -F 4 -q 30`
#   - records in the mapped BAM
#   - records in the duplicate-removed BAM
parental_folder=$output_folder
sample_ID=$output_folder/fastq/sample_ID.txt
fastq_folder=$parental_folder/fastq
trimmed_folder=$parental_folder/trimmed_fastq
alignment=$bowtie_folder
filtered_sam=$filtered_folder/mapped_bam
rm_dup_sam=$filtered_folder/rmdup_bam
#split_sam=$parental_folder/splited_sam
report_folder=$parental_folder/report/read_number

echo
echo "Start calculating the reads number..."
# make the report folder
mkdir -p $report_folder

# header row first (this truncates any earlier report), then one row per sample
echo sample,total reads,after trimming,all mapped reads, uniquely aligned reads,After remove duplicates>$report_folder/read_number.csv
for sample in $(cat $sample_ID)
do
    echo calculating $sample
    n_fastq=$(expr $(zcat $fastq_folder/$sample*R2*.gz|wc -l) / 4)
    n_trimmed=$(expr $(zcat $trimmed_folder/$sample*R2*.gz|wc -l) / 4)
    n_q30=$(samtools view -F 4 -q 30 $alignment/$sample.sam|wc -l)
    n_mapped=$(samtools view $filtered_sam/$sample.bam|wc -l)
    n_rmdup=$(samtools view $rm_dup_sam/$sample.bam|wc -l)
    echo $sample,$n_fastq,$n_trimmed, $n_q30,$n_mapped,$n_rmdup>>$report_folder/read_number.csv
done
echo "Read number calculation is done."

Overwriting ./scATAC_pipe.sh


In [9]:
%%writefile ./ATAC_split_fastq.py
# python ATAC_split_fastq.py read1 read2 output_folder P7_index P5_index
'''
This script accepts a read1 file, a read2 file, an output folder, and P7 and P5
barcode files. It reads the read1 and read2 files, splits them by P5/P7 barcode
combination, and attaches the adaptor (Tn5) barcode to the name of every read1
and read2 record.
'''

'''
First, the output files for the P7-P5 index combinations are created.

Then read1 is scanned record by record: for each header line, the index is
extracted and matched to the combination it belongs to, and the read1 and read2
records are both written to that combination's output files.
'''
import subprocess
import sys
from Levenshtein import distance
import gzip

def extract_barcode(index):
    '''
    Split a "<first>+<second>" index string into its P and N barcode strings.

    The part before the '+' contributes its first 8 characters to the N index
    and the remainder to the P index; the part after the '+' contributes its
    first 10 characters to the P index and the remainder to the N index.

    Returns the tuple (P_index, N_index), each formatted as "<a>.<b>".
    '''
    fields = index.split('+')
    first, second = fields[0], fields[1]

    P_index = '.'.join((first[8:], second[:10]))
    N_index = '.'.join((first[:8], second[10:]))

    return (P_index, N_index)

'''
def P7_P5_list(P7_file, P5_file):
    
    #this function accept a text including P7 index and a text file including a 
    #P5 index and then generate a list for all the P7.P5 barcode combinations
    
    P7 = open(P7_file)
    P5 = open(P5_file)
    index1 = P7.readlines()
    index2 = P5.readlines()
    index1 = map(lambda x: x.strip(), index1)
    index2 = map(lambda x: x.strip(), index2)
    all_indexes = []
    for i1 in index1:
        for i2 in index2:
            all_indexes.append(i1 + '.' + i2)
    
    P7.close()
    P5.close()
    return all_indexes
'''

def P7_P5_list(P7_file, P5_file):
    '''
    Build the list of "P7.P5" barcode names by pairing the two index files
    line by line (line i of P7_file with line i of P5_file).

    P7_file, P5_file: paths of text files holding one index per line;
                      surrounding whitespace of each line is stripped.

    Returns a list with one "P7.P5" string per line of P7_file. Extra lines in
    P5_file are ignored. Raises IndexError when P5_file has fewer lines than
    P7_file.
    '''
    # Context managers close both files even when reading fails.
    with open(P7_file) as P7:
        index1 = [line.strip() for line in P7]
    with open(P5_file) as P5:
        index2 = [line.strip() for line in P5]

    # Real lists (not map objects) are indexed here, so this behaves the same
    # on Python 2 and Python 3.
    return [index1[i] + '.' + index2[i] for i in range(len(index1))]


def find_P7_P5_barcode(index, P7_P5_barcodes):
    '''
    Look up the barcode that an observed index belongs to.

    The candidates in P7_P5_barcodes are tried in order and the first one
    whose Levenshtein distance to index is below 2 is returned.
    Returns -1 when no candidate is that close.
    '''
    for candidate in P7_P5_barcodes:
        if distance(index, candidate) < 2:
            return candidate
    return -1
    
def split_fastq(read1, read2, output_folder, P7_P5_barcodes): 
    '''
    This function accept a read1 file, a read2 file, a list of P7-P5 barcodes, create the output files,
    and then go through each line of the read1 and read2 file,
    and then check the barcode in the read1, compare it with the P7_P5 barcodes,
    if the distance < 2, then output the reads into the output files
    '''
    
    # generate the output files
    output_read1 = {}
    output_read2 = {}
    sample_ID = open(output_folder + "/sample_ID.txt", 'w')
    barcode_count = {}
    for barcode in P7_P5_barcodes:
        output_R1 = output_folder + '/' + barcode + '.R1.fastq'
        output_R2 = output_folder + '/' + barcode + '.R2.fastq'
        output_read1[barcode] = open(output_R1, 'w')
        output_read2[barcode] = open(output_R2, 'w')
        sample_ID.write(barcode + '\n')
        barcode_count[barcode] = 0
        
    sample_ID.close()
    # open the read1 and read2 file, and then for each read, check the barcode and output the matched read into the
    # output file
    
    f1 = gzip.open(read1, 'rb')
    f2 = gzip.open(read2, 'rb')
    line_n = 0
    total_read=0
    
    while(True):
        line1 = f1.readline()
        line2 = f2.readline()
        line_n += 1
        if (not line1):
            break
        if (line_n % 4 == 1):
            index = line1.strip().split(':')[-1]
            #print "index: ", index
            index = extract_barcode(index)
            P_index = index[0]
            N_index = index[1]
            P_index = find_P7_P5_barcode(P_index, P7_P5_barcodes)
            total_read+=1
            if (P_index != -1):
                output_R1 = output_read1[P_index]
                output_R2 = output_read2[P_index]
                barcode_count[P_index] += 1
                line1 = '@' + N_index + ',' + line1[1:]
                line2 = '@' + N_index + ',' + line2[1:]
                output_R1.write(line1)
                output_R2.write(line2)
                
                # output the other lines into the files
                line1 = f1.readline()
                line2 = f2.readline()
                output_R1.write(line1)
                output_R2.write(line2)
                line_n += 1
                
                line1 = f1.readline()
                line2 = f2.readline()
                output_R1.write(line1)
                output_R2.write(line2)
                line_n += 1
                
                line1 = f1.readline()
                line2 = f2.readline()
                output_R1.write(line1)
                output_R2.write(line2)
                line_n += 1
                
            else:
                line1 = f1.readline()
                line2 = f2.readline()
                line_n += 1
                
                line1 = f1.readline()
                line2 = f2.readline()
                line_n += 1
                
                line1 = f1.readline()
                line2 = f2.readline()
                line_n += 1
    # close the files
    f1.close()
    f2.close()
    print "Total read count: ", total_read
    asigned_read = 0
    for barcode in P7_P5_barcodes:
        output_read1[barcode].close()
        output_read2[barcode].close()
        print "Read_count: ", barcode, barcode_count[barcode]
        asigned_read += barcode_count[barcode]
    print "Total asigned read: ", asigned_read

def split_fastq_main(read1, read2, output_folder, P7_file, P5_file):
    '''
    this is the main function that accept the read1 file, the read2 file, the output folder,
    the files including P7 barcode, P5 barcode and then split the 
    the read1 and read2 files based on the P5 and P7 combinations
    '''
    print "Start splitting the fastq files..."
    P7_P5_barcodes = P7_P5_list(P7_file, P5_file)
    print "Number of output files: ", len(P7_P5_barcodes)
    
    split_fastq(read1, read2, output_folder, P7_P5_barcodes)
    print "fastq files splitted~"

if __name__ == "__main__":
    # Command line: ATAC_split_fastq.py read1 read2 output_folder P7_index P5_index
    # The five positional arguments are forwarded unchanged to split_fastq_main.
    read1 = sys.argv[1]
    read2 = sys.argv[2]
    output_folder = sys.argv[3]
    P7_file = sys.argv[4]
    P5_file = sys.argv[5]
    split_fastq_main(read1, read2, output_folder, P7_file, P5_file)


Overwriting ./ATAC_split_fastq.py


In [None]:
# %load ./bowtie2_PE.sh
#!/bin/bash

# Align paired-end reads with bowtie2 in local mode.
# Usage: bowtie2_PE.sh <input_folder> <sample_ID_file> <output_folder> <index> [threads]
#   input_folder   folder holding <sample>*R1*.gz and <sample>*R2*.gz
#   sample_ID_file text file listing the sample names
#   output_folder  receives <sample>.sam and report.txt (bowtie2's stderr, appended)
#   index          bowtie2 index prefix
#   threads        optional bowtie2 thread count (-p); defaults to 10
if [ "$#" -lt 4 ]; then
    echo "Usage: $0 <input_folder> <sample_ID_file> <output_folder> <index> [threads]" >&2
    exit 1
fi

input_folder=$1
sample_ID=$2
output_folder=$3
index=$4
# Keeps the former hard-coded value when the caller passes only four arguments.
threads=${5:-10}

echo
echo "Start the alignment using bowtie2"
echo input folder: $input_folder
echo sample ID file: $sample_ID
echo output folder: $output_folder
echo index: $index
module load bowtie2/latest
mkdir -p "$output_folder"
for sample in $(cat "$sample_ID"); do
    echo aligning $sample
    # The globs stay outside the quotes so that the shell still expands them.
    bowtie2 -p "$threads" --local -X 2000 -x "$index" -1 "$input_folder"/$sample*R1*.gz -2 "$input_folder"/$sample*R2*.gz -S "$output_folder/$sample.sam" 2>>"$output_folder/report.txt"
done
echo "all alignment done"



In [None]:
# %load /net/shendure/vol1/home/cao1025/analysis_script/ATAC_RNA_coassay_pipe/scATAC_seq/bamfile_split_permuted.sh
#! /bin/bash

# Convert every <sample>.bam in the BAM folder to <sample>.sam (written next to
# it, header included), then hand the same arguments to samfile_split_permuted.sh,
# which splits the SAM files by barcode.
# Arguments: <bam_folder> <sample_list> <output_folder> <barcode_file> <cutoff>

bam_folder=$1
sample_list=$2
output_folder=$3
barcode_file=$4
cutoff=$5

# step 1: BAM -> SAM, one sample after another
echo convert bam files to sam files...
for sample in $(cat $sample_list)
do
    echo converting $sample
    samtools view -h $bam_folder/$sample.bam>$bam_folder/$sample.sam
done
echo All bam files are converted to sam files.

# step 2: split the SAM files by barcode
echo Start split the sam files based on the barcode...
/net/shendure/vol1/home/cao1025/analysis_script/ATAC_RNA_coassay_pipe/samfile_split_permuted.sh $bam_folder $sample_list $output_folder $barcode_file $cutoff


In [27]:
%%writefile /net/shendure/vol1/home/cao1025/analysis_script/ATAC_RNA_coassay_pipe/scATAC_seq/samfile_split_permuted.sh
#!/bin/bash

# Run sam_split_permuted.py on every <sample>.sam of a folder and collect the
# results.
# Arguments: <sam_folder> <sample_list> <output_folder> <barcode_file> <cutoff> [core]
#   The split SAM files are written to output_folder. The per-sample
#   *sample_list.txt files are concatenated into All_samples.txt, which is also
#   copied to ../barcode_samples.txt. All *.txt and *.png files are finally
#   moved to ../report/barcode_read_distribution.

sam_folder=$1
sample_list=$2
output_folder=$3
barcode_file=$4
cutoff=$5
# Optional: value appended to "sem -j+". When omitted, 0 is used so that the
# option stays well-formed ("-j+0" instead of a bare "-j+").
core=${6:-0}

#define the location of the python
python_path="/net/shendure/vol1/home/cao1025/anaconda2/bin/python2.7"
script_path="/net/shendure/vol1/home/cao1025/analysis_script/ATAC_RNA_coassay_pipe/scATAC_seq/"

echo
echo "Start splitting the sam file..."
echo samfile folder: $sam_folder
echo sample list: $sample_list
echo ouput folder: $output_folder
echo barcode file: $barcode_file
echo cutoff value: $cutoff

# -p: the caller may already have created the folder; that must not be an error.
mkdir -p "$output_folder"
for sample in $(cat "$sample_list"); do
    echo Now splitting $sample
    sem -j+$core $python_path $script_path/sam_split_permuted.py $sam_folder/$sample.sam $barcode_file $output_folder $cutoff
done
# Wait until every queued split job has finished before collecting its output.
sem --wait
cat "$output_folder"/*sample_list.txt>"$output_folder/All_samples.txt"
cp "$output_folder/All_samples.txt" "$output_folder/../barcode_samples.txt"

# move the reports into the report/barcode_read_distribution folder
mkdir -p "$output_folder/../report/barcode_read_distribution"
mv "$output_folder"/*.txt "$output_folder/../report/barcode_read_distribution/"
mv "$output_folder"/*.png "$output_folder/../report/barcode_read_distribution/"

echo
echo "All sam file splitted."

Overwriting /net/shendure/vol1/home/cao1025/analysis_script/ATAC_RNA_coassay_pipe/scATAC_seq/samfile_split_permuted.sh


In [None]:
# %load /net/shendure/vol1/home/cao1025/analysis_script/ATAC_RNA_coassay_pipe/scATAC_seq/sam_split_permuted.py
import sys
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import numpy as np
from Levenshtein import distance

def find_P7_P5_barcode(index, P7_P5_barcodes):
    '''
    Look up the barcode that an observed index belongs to.

    The candidates in P7_P5_barcodes are tried in order and the first one
    whose Levenshtein distance to index is below 2 is returned.
    Returns -1 when no candidate is that close.
    '''
    for candidate in P7_P5_barcodes:
        if distance(index, candidate) < 2:
            return candidate
    return -1

def samfile_barcode_count(sam_file, barcode_file):
    '''
    Count the SAM records belonging to each barcode.

    barcode_file lists one barcode per line. For every non-header line of
    sam_file, the text before the first ',' of the read name is matched
    against the barcodes with find_P7_P5_barcode; a match adds one to that
    barcode's count.

    Returns a dict barcode -> count that contains every listed barcode.
    '''
    # load the barcode list; every barcode starts with a count of zero
    known_barcodes = []
    tally = {}
    with open(barcode_file) as listing:
        for entry in listing:
            code = entry.strip()
            known_barcodes.append(code)
            tally[code] = 0

    # scan the alignment records, skipping the '@' header lines
    with open(sam_file) as records:
        for record in records:
            if record[0] == '@':
                continue
            read_tag = record.split('\t')[0].split(',')[0]
            hit = find_P7_P5_barcode(read_tag, known_barcodes)
            if hit != -1:
                tally[hit] += 1
    return tally

def permute_samples(sam_file, barcode_count, barcode_list, output_folder):
    '''
    Write one randomly sampled ("permuted") SAM file per barcode.

    sam_file:      path of the input SAM file; the output names start with the
                   first two '.'-separated parts of its file name.
    barcode_count: dict barcode -> number of reads to draw for that barcode.
    barcode_list:  barcodes to generate files for.
    output_folder: folder receiving <name>.<barcode>.permuted.sam.

    The non-header lines of sam_file are shuffled once and dealt out in
    consecutive, non-overlapping slices, so no line is written to two files.
    Every output file starts with the leading '@' header lines of the input.
    '''
    name_parts = (sam_file.split('/')[-1]).split('.')
    file_name = name_parts[0] + '.' + name_parts[1]

    with open(sam_file) as input_file:
        all_lines = input_file.readlines()

    # the header is the leading run of '@' lines
    header_number = 0
    for line in all_lines:
        if line[0] != '@':
            break
        header_number += 1
    header_lines = all_lines[:header_number]

    # Shuffle line positions instead of the lines themselves: permuting a list
    # of strings first converts it into a fixed-width NumPy string array.
    order = np.random.permutation(len(all_lines) - header_number)

    first_line = 0
    for barcode in barcode_list:
        end_line = first_line + barcode_count[barcode]
        output_file = output_folder + '/' + file_name + '.' + barcode + '.permuted.sam'
        # One output file is open at a time, and it is closed even on errors.
        with open(output_file, 'w') as output:
            output.writelines(header_lines)
            for position in order[first_line:end_line]:
                output.write(all_lines[header_number + position])
        first_line = end_line
    
def split_samfile(sam_file, barcode_file, output_folder, cutoff):
    '''
    this script accept a sam file, a barcode file, a output_file, a cutoff value,
    then it will call the samfile_barcode_count function and get the total read count per barcode,
    then it use the cutoff value to filter the barcode,
    and generate the output samfile for single cells, generate the sample_ID.txt in the output folder,
    generate the reads distribution in the output folder/read_distribution_barcode;
    '''
    
    # generate the count per barcode
    barcode_count = samfile_barcode_count(sam_file, barcode_file)
    
    # plot the barcode reads distribution and save the result to the ouput folder
    plot_name = (sam_file.split('/')[-1]).split('.')
    plot_name = plot_name[0] + '.' + plot_name[1]
    fig = plt.figure()
    plt.hist(barcode_count.values(), bins=100)
    plt.ylabel('frequency')
    plt.xlabel('Number of unique reads')
    fig_output = output_folder + '/' + plot_name + '.png'
    
    fig.savefig(fig_output)

    #also output the barcode number and distribution to the output folder
    read_dist = open(output_folder + '/' + plot_name + '.txt', 'w')
    for barcode in barcode_count:
        line = barcode + ', %d\n' %(barcode_count[barcode])
        read_dist.write(line)
    read_dist.close()
    
    #filter the barcode based on the cutoff value
    barcode_filtered = []
    for barcode in barcode_count:
        if barcode_count[barcode] >= cutoff:
            barcode_filtered.append(barcode)
    print barcode_filtered
     
    # for the barcode in the barcode filter list, generate permuted sample in the output folder
    print "Generatign permuted sequences..."
    permute_samples(sam_file, barcode_count, barcode_filtered, output_folder)
    
    #generate the output sam file and sample_list file
    sample_list_file = open(output_folder + '/' + plot_name + '.' + 'sample_list.txt', 'w')
    output_files = {}
    for barcode in barcode_filtered:
        output_file = output_folder + '/' + plot_name + '.' + barcode + '.sam'
        output_files[barcode] = open(output_file, 'w')
        sample_list_file.write(plot_name + '.' + barcode + '\n')
    
    # output the each read to the output sam file
    sam = open(sam_file)
    for line in sam:
        if (line[0] == '@'):
            for barcode in barcode_filtered:
                output_files[barcode].write(line)
        else:
            barcode = (((line.split('\t'))[0]).split(','))[0]
            barcode = find_P7_P5_barcode(barcode, barcode_filtered)
            if (barcode != -1):
                output_files[barcode].write(line)
    
    #close the files:
    sample_list_file.close()
    sam.close()
    for barcode in barcode_filtered:
        output_files[barcode].close()

if __name__ == '__main__':
    # Positional arguments: sam_file barcode_file output_folder cutoff
    sam_file = sys.argv[1]
    barcode_file = sys.argv[2]
    output_folder = sys.argv[3]
    # The cutoff arrives as text and is converted to an integer before use.
    cutoff = int(sys.argv[4])
    split_samfile(sam_file, barcode_file, output_folder, cutoff)


In [20]:
# %load /net/shendure/vol1/home/cao1025/analysis_script/ATAC_RNA_coassay_pipe/scATAC_seq/sam_split_permuted.py
import sys
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import numpy as np
from Levenshtein import distance

def find_P7_P5_barcode(index, P7_P5_barcodes):
    '''
    Look up the barcode that an observed index belongs to.

    The candidates in P7_P5_barcodes are tried in order and the first one
    whose Levenshtein distance to index is below 2 is returned.
    Returns -1 when no candidate is that close.
    '''
    for candidate in P7_P5_barcodes:
        if distance(index, candidate) < 2:
            return candidate
    return -1

def samfile_barcode_count(sam_file, barcode_file):
    '''
    Count the SAM records belonging to each barcode.

    barcode_file lists one barcode per line. For every non-header line of
    sam_file, the text before the first ',' of the read name is matched
    against the barcodes with find_P7_P5_barcode; a match adds one to that
    barcode's count.

    Returns a dict barcode -> count that contains every listed barcode.
    '''
    # load the barcode list; every barcode starts with a count of zero
    known_barcodes = []
    tally = {}
    with open(barcode_file) as listing:
        for entry in listing:
            code = entry.strip()
            known_barcodes.append(code)
            tally[code] = 0

    # scan the alignment records, skipping the '@' header lines
    with open(sam_file) as records:
        for record in records:
            if record[0] == '@':
                continue
            read_tag = record.split('\t')[0].split(',')[0]
            hit = find_P7_P5_barcode(read_tag, known_barcodes)
            if hit != -1:
                tally[hit] += 1
    return tally

def permute_samples(sam_file, barcode_count, barcode_list, output_folder):
    '''
    Write one randomly sampled ("permuted") SAM file per barcode.

    sam_file:      path of the input SAM file; the output names start with the
                   first two '.'-separated parts of its file name.
    barcode_count: dict barcode -> number of reads to draw for that barcode.
    barcode_list:  barcodes to generate files for.
    output_folder: folder receiving <name>.<barcode>.permuted.sam.

    The non-header lines of sam_file are shuffled once and dealt out in
    consecutive, non-overlapping slices, so no line is written to two files.
    Every output file starts with the leading '@' header lines of the input.
    '''
    name_parts = (sam_file.split('/')[-1]).split('.')
    file_name = name_parts[0] + '.' + name_parts[1]

    with open(sam_file) as input_file:
        all_lines = input_file.readlines()

    # the header is the leading run of '@' lines
    header_number = 0
    for line in all_lines:
        if line[0] != '@':
            break
        header_number += 1
    header_lines = all_lines[:header_number]

    # Shuffle line positions instead of the lines themselves: permuting a list
    # of strings first converts it into a fixed-width NumPy string array.
    order = np.random.permutation(len(all_lines) - header_number)

    first_line = 0
    for barcode in barcode_list:
        end_line = first_line + barcode_count[barcode]
        output_file = output_folder + '/' + file_name + '.' + barcode + '.permuted.sam'
        # One output file is open at a time, and it is closed even on errors.
        with open(output_file, 'w') as output:
            output.writelines(header_lines)
            for position in order[first_line:end_line]:
                output.write(all_lines[header_number + position])
        first_line = end_line
    
def split_samfile(sam_file, barcode_file, output_folder, cutoff):
    '''
    this script accept a sam file, a barcode file, a output_file, a cutoff value,
    then it will call the samfile_barcode_count function and get the total read count per barcode,
    then it use the cutoff value to filter the barcode,
    and generate the output samfile for single cells, generate the sample_ID.txt in the output folder,
    generate the reads distribution in the output folder/read_distribution_barcode;
    '''
    
    # generate the count per barcode
    barcode_count = samfile_barcode_count(sam_file, barcode_file)
    
    # plot the barcode reads distribution and save the result to the ouput folder
    plot_name = (sam_file.split('/')[-1]).split('.')
    plot_name = plot_name[0] + '.' + plot_name[1]
    fig = plt.figure()
    plt.hist(barcode_count.values(), bins=100)
    plt.ylabel('frequency')
    plt.xlabel('Number of unique reads')
    fig_output = output_folder + '/' + plot_name + '.png'
    
    fig.savefig(fig_output)

    #also output the barcode number and distribution to the output folder
    read_dist = open(output_folder + '/' + plot_name + '.txt', 'w')
    for barcode in barcode_count:
        line = barcode + ', %d\n' %(barcode_count[barcode])
        read_dist.write(line)
    read_dist.close()
    
    #filter the barcode based on the cutoff value
    barcode_filtered = []
    for barcode in barcode_count:
        if barcode_count[barcode] >= cutoff:
            barcode_filtered.append(barcode)
    print barcode_filtered
     
    # for the barcode in the barcode filter list, generate permuted sample in the output folder
    print "Generatign permuted sequences..."
    permute_samples(sam_file, barcode_count, barcode_filtered, output_folder)
    
    #generate the output sam file and sample_list file
    sample_list_file = open(output_folder + '/' + plot_name + '.' + 'sample_list.txt', 'w')
    output_files = {}
    for barcode in barcode_filtered:
        output_file = output_folder + '/' + plot_name + '.' + barcode + '.sam'
        output_files[barcode] = open(output_file, 'w')
        sample_list_file.write(plot_name + '.' + barcode + '\n')
    
    # output the each read to the output sam file
    sam = open(sam_file)
    for line in sam:
        if (line[0] == '@'):
            for barcode in barcode_filtered:
                output_files[barcode].write(line)
        else:
            barcode = (((line.split('\t'))[0]).split(','))[0]
            barcode = find_P7_P5_barcode(barcode, barcode_filtered)
            if (barcode != -1):
                output_files[barcode].write(line)
    
    #close the files:
    sample_list_file.close()
    sam.close()
    for barcode in barcode_filtered:
        output_files[barcode].close()

if __name__ == '__main__':
    # Positional arguments: sam_file barcode_file output_folder cutoff
    sam_file = sys.argv[1]
    barcode_file = sys.argv[2]
    output_folder = sys.argv[3]
    # The cutoff arrives as text and is converted to an integer before use.
    cutoff = int(sys.argv[4])
    split_samfile(sam_file, barcode_file, output_folder, cutoff)



In [28]:
%%writefile /net/shendure/vol1/home/cao1025/analysis_script/ATAC_RNA_coassay_pipe/scATAC_seq/sam_split_permuted.py
import sys
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import numpy as np
from Levenshtein import distance

def find_P7_P5_barcode(index, P7_P5_barcodes):
    '''
    Look up the barcode that an observed index belongs to.

    The candidates in P7_P5_barcodes are tried in order and the first one
    whose Levenshtein distance to index is below 2 is returned.
    Returns -1 when no candidate is that close.
    '''
    for candidate in P7_P5_barcodes:
        if distance(index, candidate) < 2:
            return candidate
    return -1

def samfile_barcode_count(sam_file, barcode_file):
    '''
    Count the SAM records belonging to each barcode.

    barcode_file lists one barcode per line. For every non-header line of
    sam_file, the text before the first ',' of the read name is matched
    against the barcodes with find_P7_P5_barcode; a match adds one to that
    barcode's count.

    Returns a dict barcode -> count that contains every listed barcode.
    '''
    # load the barcode list; every barcode starts with a count of zero
    known_barcodes = []
    tally = {}
    with open(barcode_file) as listing:
        for entry in listing:
            code = entry.strip()
            known_barcodes.append(code)
            tally[code] = 0

    # scan the alignment records, skipping the '@' header lines
    with open(sam_file) as records:
        for record in records:
            if record[0] == '@':
                continue
            read_tag = record.split('\t')[0].split(',')[0]
            hit = find_P7_P5_barcode(read_tag, known_barcodes)
            if hit != -1:
                tally[hit] += 1
    return tally

def permute_samples(sam_file, barcode_count, barcode_list, output_folder):
    '''
    Write one randomly sampled ("permuted") sam file per barcode.

    sam_file: path of the input sam file; its name must contain at least two '.'-separated
        fields, the first two of which form the prefix of the output file names.
    barcode_count: dictionary mapping a barcode to its number of reads n.
    barcode_list: the barcodes to generate a permuted file for.
    output_folder: folder receiving <prefix>.<barcode>.permuted.sam.

    The reads (all lines after the leading '@' header lines) are shuffled once; the barcodes
    then take consecutive, non-overlapping slices of n reads each from the shuffled order.
    Every output file starts with a copy of the header. Nothing is returned.
    '''
    # Output names reuse the first two dot-separated fields of the sam file name
    file_name = (sam_file.split('/')[-1]).split('.')
    file_name = file_name[0] + '.' + file_name[1]

    # read the whole sam file, then separate the leading header lines from the reads
    with open(sam_file) as input_file:
        all_lines = input_file.readlines()
    header_number = 0
    for line in all_lines:
        if line[0] != '@':
            break
        header_number += 1
    header_lines = all_lines[:header_number]
    read_lines = all_lines[header_number:]

    # Shuffle positions instead of the lines themselves: permuting the strings makes numpy
    # build a fixed-width string array in which every line is padded to the longest one.
    order = np.random.permutation(len(read_lines))

    # for each barcode, output the header followed by its slice of the shuffled reads;
    # files are opened one at a time so the number of barcodes is not limited by open handles
    first_line = 0
    for barcode in barcode_list:
        end_line = first_line + barcode_count[barcode]
        output_file = output_folder + '/' + file_name + '.' + barcode + '.permuted.sam'
        with open(output_file, 'w') as output:
            output.writelines(header_lines)
            for position in order[first_line:end_line]:
                output.write(read_lines[position])
        first_line = end_line
    
def split_samfile(sam_file, barcode_file, output_folder, cutoff):
    '''
    Split a sam file into one sam file per barcode that has enough reads.

    sam_file: path of the input sam file; its name must contain at least two '.'-separated
        fields, the first two of which form the <prefix> of every output file.
    barcode_file: path of the barcode list passed to samfile_barcode_count.
    output_folder: existing folder receiving the output files.
    cutoff: a barcode is kept when its read count is >= cutoff.

    Files written to output_folder:
      <prefix>.txt              one "barcode, count" line for every counted barcode
      <prefix>.sample_list.txt  one "<prefix>.<barcode>" line per kept barcode
      <prefix>.<barcode>.sam    the sam header plus the reads assigned to that barcode

    The kept barcodes are also printed to stdout. Nothing is returned.
    '''

    # generate the count per barcode
    barcode_count = samfile_barcode_count(sam_file, barcode_file)

    # common prefix of all output files
    plot_name = (sam_file.split('/')[-1]).split('.')
    plot_name = plot_name[0] + '.' + plot_name[1]
    prefix = output_folder + '/' + plot_name

    # Disabled: plot the barcode reads distribution and save it to the output folder
    # fig = plt.figure()
    # plt.hist(barcode_count.values(), bins=100)
    # plt.ylabel('frequency')
    # plt.xlabel('Number of unique reads')
    # fig_output = output_folder + '/' + plot_name + '.png'
    # fig.savefig(fig_output)

    #also output the barcode number and distribution to the output folder
    with open(prefix + '.txt', 'w') as read_dist:
        for barcode in barcode_count:
            read_dist.write(barcode + ', %d\n' % (barcode_count[barcode]))

    #filter the barcode based on the cutoff value
    barcode_filtered = [barcode for barcode in barcode_count
                        if barcode_count[barcode] >= cutoff]
    # parenthesised single argument: prints the same text under Python 2 and Python 3
    print(barcode_filtered)

    # Disabled: for the barcode in the barcode filter list, generate permuted sample in the output folder
    # print "Generatign permuted sequences..."
    # permute_samples(sam_file, barcode_count, barcode_filtered, output_folder)

    output_files = {}
    try:
        #generate the output sam file and sample_list file
        with open(prefix + '.' + 'sample_list.txt', 'w') as sample_list_file:
            for barcode in barcode_filtered:
                output_files[barcode] = open(prefix + '.' + barcode + '.sam', 'w')
                sample_list_file.write(plot_name + '.' + barcode + '\n')

        # output the each read to the output sam file
        with open(sam_file) as sam:
            for line in sam:
                if line[0] == '@':
                    # header lines are copied into every per-barcode file
                    for barcode in barcode_filtered:
                        output_files[barcode].write(line)
                else:
                    read_barcode = line.split('\t')[0].split(',')[0]
                    matched = find_P7_P5_barcode(read_barcode, barcode_filtered)
                    if matched != -1:
                        output_files[matched].write(line)
    finally:
        # close every per-barcode file, even when reading or writing failed part way
        for handle in output_files.values():
            handle.close()

if __name__ == '__main__':
    # Command line: <sam_file> <barcode_file> <output_folder> <cutoff>
    # cutoff is the minimum read count (converted to an integer) a barcode needs to be kept.
    sam_file, barcode_file, output_folder = sys.argv[1], sys.argv[2], sys.argv[3]
    cutoff = int(sys.argv[4])
    split_samfile(sam_file, barcode_file, output_folder, cutoff)

Overwriting /net/shendure/vol1/home/cao1025/analysis_script/ATAC_RNA_coassay_pipe/scATAC_seq/sam_split_permuted.py


In [None]:
# %load /net/shendure/vol1/home/cao1025/analysis_script/ATAC_RNA_coassay_pipe/scATAC_seq/samfile_split_permuted.sh
#!/bin/bash

# Run sam_split_permuted.py on every sam file named in a sample list and collect the results.
#
# Arguments:
#   $1 sam_folder     folder holding <sample>.sam
#   $2 sample_list    text file with the sample names
#   $3 output_folder  folder receiving the split sam files
#   $4 barcode_file   barcode list passed to sam_split_permuted.py
#   $5 cutoff         cutoff passed to sam_split_permuted.py
#   $6 core           number appended to "sem -j+"
#
# Besides the split sam files this writes <output_folder>/../barcode_samples.txt and moves
# all .txt (and .png, if any) files to <output_folder>/../report/barcode_read_distribution.
#
# NOTE(review): GNU parallel documents "-j +N" as "number of CPU threads plus N" -
# confirm that this is intended rather than "-j N".

sam_folder=$1
sample_list=$2
output_folder=$3
barcode_file=$4
cutoff=$5
core=$6

#define the location of the python
python_path="/net/shendure/vol1/home/cao1025/anaconda2/bin/python2.7"
script_path="/net/shendure/vol1/home/cao1025/analysis_script/ATAC_RNA_coassay_pipe/scATAC_seq/"
report_folder="$output_folder/../report/barcode_read_distribution"

echo
echo "Start splitting the sam file..."
echo "samfile folder: $sam_folder"
echo "sample list: $sample_list"
echo "ouput folder: $output_folder"
echo "barcode file: $barcode_file"
echo "cutoff value: $cutoff"

# -p: do not fail when the folder is left over from an earlier run
mkdir -p "$output_folder"
for sample in $(cat "$sample_list"); do
    echo Now splitting $sample
    sem -j+"$core" "$python_path" "$script_path/sam_split_permuted.py" "$sam_folder/$sample.sam" "$barcode_file" "$output_folder" "$cutoff"
done
# block until every job started through sem has finished
sem --wait
cat "$output_folder"/*sample_list.txt > "$output_folder/All_samples.txt"
cp "$output_folder/All_samples.txt" "$output_folder/../barcode_samples.txt"

# output the report the report/barcode_read_distribution folder
mkdir -p "$report_folder"
mv "$output_folder"/*.txt "$report_folder/"
# the python script currently has its plotting disabled, so there may be no png at all
for png in "$output_folder"/*.png; do
    [ -e "$png" ] || continue
    mv "$png" "$report_folder/"
done

echo
echo "All sam file splitted."

In [32]:
%%writefile /net/shendure/vol1/home/cao1025/analysis_script/ATAC_RNA_coassay_pipe/scATAC_seq/report_human_mouse_fraction.sh
#! /bin/bash

# Count, for every sample, the reads assigned to human and to mouse.
#
# Arguments:
#   $1 input_folder   folder holding <sample>.sam
#   $2 sample_ID      text file with the sample names
#   $3 output_folder  report folder (created if missing)
#
# Output: <output_folder>/human_mouse_fraction.txt with the columns sample,human_reads,mouse_reads.
# An alignment line containing the text 'chr' anywhere is counted as mouse, every other line
# as human (presumably only the mouse reference uses chr-prefixed names - TODO confirm).
# human_mouse_fraction_permuted.txt only receives its header: the permuted loop is disabled.
input_folder=$1
sample_ID=$2
output_folder=$3

echo 
echo "Start calculating the mouse and human fraction..."
mkdir -p "$output_folder"
echo sample,human_reads,mouse_reads > "$output_folder/human_mouse_fraction.txt"
echo sample,human_reads,mouse_reads > "$output_folder/human_mouse_fraction_permuted.txt"
for sample in $(cat "$sample_ID"); do
    echo Processing $sample
    # one pass over the alignments: lines matching 'chr' are mouse, the remainder is human
    counts=$(samtools view "$input_folder/$sample.sam" | awk '/chr/ {mouse++} END {printf "%d,%d", NR - mouse, mouse}')
    echo "$sample,$counts" >> "$output_folder/human_mouse_fraction.txt"
done

#for sample in $(cat $sample_ID); do echo Processing permuted $sample; echo $sample.permuted,$(samtools view $input_folder/$sample.permuted.sam|grep 'chr' -v|wc -l),$(samtools view $input_folder/$sample.permuted.sam|grep 'chr'|wc -l)>>$output_folder/human_mouse_fraction_permuted.txt; done
echo "Calculation done."



Overwriting /net/shendure/vol1/home/cao1025/analysis_script/ATAC_RNA_coassay_pipe/scATAC_seq/report_human_mouse_fraction.sh


In [None]:
# %load /net/shendure/vol1/home/cao1025/analysis_script/ATAC_RNA_coassay_pipe/scATAC_seq/bam_fragment_size.py
"""
Created on Mon Aug 10 09:07:28 2015

@author: Junyue
"""

'''
python bam_fragment_size.py input_folder sampleID output_folder

This script accepts an input folder that holds all the bam files, a sample ID file,
and an output folder; it then calculates the fragment sizes of every sample and writes them to the output folder.

'''
import sys
import subprocess

def bam_fragment_size(input_folder, sampleID, output_folder):
    '''
    Extract the fragment sizes of every sample listed in a sample ID file.

    input_folder: folder holding <sample>.bam; the intermediate sam files are written to
        <input_folder>/sam_file/<sample>.sam.
    sampleID: path of a text file with one sample name per line (blank lines are ignored).
    output_folder: folder (created if missing) receiving <sample>_frag_size.tsv.

    Each bam file is converted with "samtools view"; awk then prints, for every alignment
    whose 9th column is > 0, that column followed by the sample name.
    Raises IOError when the samtools or the awk command exits with a non-zero status.
    '''
    #create the output folder
    subprocess.call('mkdir -p ' + output_folder, shell=True)
    #make a folder for the samfile (once, instead of once per sample)
    subprocess.call('mkdir -p ' + input_folder + '/sam_file', shell = True)

    # read the sample names up front so the sample ID file is closed again right away
    with open(sampleID, 'r') as f1:
        samples = [line.strip() for line in f1]

    #transform bam file to sam file and calculate the fragment size for each file
    # in the input folder according to the sample ID
    for sample in samples:
        if not sample:
            # a blank line would otherwise be processed as a sample with an empty name
            continue
        input_file = input_folder + '/' + sample + '.bam'
        sam_file = input_folder + '/sam_file/' + sample + '.sam'
        frag_size_file = output_folder + '/' + sample + '_frag_size.tsv'

        print('''
        ***********calculate the fragment size******************
        sample: %s
        input_file: %s
        output_file: %s''' %(sample, input_file, frag_size_file))

        #transform the file to sam file
        bamtosam = 'samtools view ' + input_file + ' >' + sam_file
        err = subprocess.call(bamtosam, shell=True)
        if err != 0:
            raise IOError("!!!!!!!!bam file cannot be transformed to samfile!!!!!!!!")

        #calculate the fragment size of the sam file
        frag_size_cal = 'awk -v pat=' + "'" + sample + "'" + ''' '{if ($9 > 0) {print $9,pat}}' ''' + sam_file \
        + ' >' + frag_size_file

        print('command for fragment size calculation: %s' %(frag_size_cal))

        err = subprocess.call(frag_size_cal, shell=True)

        if err != 0:
            raise IOError("!!!!!!!!!!fragment size calculation met error!!!!!!!!!")
        else:
            print("~~~~~~~~~fragment size calculation for %s is done~~~~~~~~~~" %(sample))

    print("~~~~~~~~~~~~~~~~~all fragment size calculation is done ~~~~~~~~~~~~~")


if __name__ == "__main__":
    # Command line: <input_folder> <sampleID> <output_folder>
    input_folder, sampleID, output_folder = sys.argv[1], sys.argv[2], sys.argv[3]
    bam_fragment_size(input_folder, sampleID, output_folder)


In [36]:
%%writefile frag_size.sh

# Write the fragment sizes of every sample to <output_folder>/<sample>.csv.
#
# Arguments:
#   $1 input_folder   folder holding <sample>.sam
#   $2 sample_ID      text file with the sample names
#   $3 output_folder  folder receiving the csv files (created if missing)
#   $4 core           number appended to "sem -j+"
#
# Each output line holds the 9th column of an alignment (only when it is > 0) followed by
# the sample name.
input_folder=$1
sample_ID=$2
output_folder=$3
core=$4

mkdir -p "$output_folder"
for sample in $(cat "$sample_ID"); do
    echo Processing $sample
    # The complete pipeline is passed to sem as one command string. With the pipe and the
    # redirect outside of sem only samtools is handed to sem and the loop blocks on awk,
    # so the samples are processed one after another.
    sem -j+"$core" "samtools view '$input_folder/$sample.sam' | awk -v pat='$sample' '{if (\$9 > 0) {print \$9,pat}}' - > '$output_folder/$sample.csv'"
done
# block until every job started through sem has finished
sem --wait

Overwriting frag_size.sh


In [None]:
# %load ./dnase_intersection.sh
#!/bin/bash

# For every sample, count the reads of <input_folder>/<sample>.bed and how many of them
# overlap each of two DNase peak sets.
#
# Arguments:
#   $1 input_folder   folder holding <sample>.bed
#   $2 sample_ID      text file with the sample names
#   $3 output_folder  folder receiving read_distribution.csv (created if missing)
#
# The two peak sets are the fixed paths below, not arguments. Judging by their paths they are
# hg19 DNase hotspots (reported in the "293T" column) and mm10 3T3-L1 DNase peaks ("3T3" column).

input_folder=$1
sample_ID=$2
output_folder=$3
human_dnase_bed=~/reference/bed_reference/hg19/dnase_hotspot/rmchr_dnase_hotspot.bed
mouse_dnase_bed=~/reference/bed_reference/mm10/dnas_hot_spot/dnase_3T3_L1/3T3_L1.dnase.peak.bed

echo
echo "Start read distribution calculation.."
# -p: do not fail when the folder already exists
mkdir -p "$output_folder"
echo sample,total_reads,293T,3T3 > "$output_folder/read_distribution.csv"
for sample in $(cat "$sample_ID"); do
    echo Calculation for $sample
    bed="$input_folder/$sample.bed"
    total=$(cat "$bed" | wc -l)
    # -u reports every read of the sample at most once, however many peaks it overlaps
    in_human=$(bedtools intersect -a "$bed" -b "$human_dnase_bed" -u | wc -l)
    in_mouse=$(bedtools intersect -a "$bed" -b "$mouse_dnase_bed" -u | wc -l)
    echo "$sample,$total,$in_human,$in_mouse" >> "$output_folder/read_distribution.csv"
done

echo "Read distribution calculation finished~"


In [None]:
# %load ../../ATACseq_pipe/report_reads_num.sh
#!/bin/bash

# Report the number of reads left after each step of the pipeline.
#
# Arguments:
#   $1 parental_folder  pipeline output folder containing fastq, trimmed_fastq,
#                       alignment_bowtie2 and filtered_sam
#   $2 sample_ID        text file with the sample names
#
# Output: <parental_folder>/report/read_number/read_number.csv with one row per sample:
#   total reads              lines of the R2 fastq.gz in fastq / 4
#   after trimming           lines of the R2 fastq.gz in trimmed_fastq / 4
#   all mapped reads         samtools view -F 4 -q 30 on alignment_bowtie2/<sample>.sam
#   uniquely aligned reads   alignments in filtered_sam/mapped_bam/<sample>.bam
#   After remove duplicates  alignments in filtered_sam/rmdup_bam/<sample>.bam

parental_folder=$1
sample_ID=$2
fastq_folder=$parental_folder/fastq
trimmed_folder=$parental_folder/trimmed_fastq
alignment=$parental_folder/alignment_bowtie2
filtered_sam=$parental_folder/filtered_sam/mapped_bam
rm_dup_sam=$parental_folder/filtered_sam/rmdup_bam
#split_sam=$parental_folder/splited_sam
report_folder=$parental_folder/report/read_number

echo
echo "Start calculating the reads number..."
#make the report folder
mkdir -p "$report_folder"
#calculate the read number and output the read number into the report folder

echo "sample,total reads,after trimming,all mapped reads, uniquely aligned reads,After remove duplicates" > "$report_folder/read_number.csv"

for sample in $(cat "$sample_ID"); do
    echo calculating $sample
    # a fastq record spans four lines
    total=$(( $(zcat "$fastq_folder/$sample"*R2*.gz | wc -l) / 4 ))
    trimmed=$(( $(zcat "$trimmed_folder/$sample"*R2*.gz | wc -l) / 4 ))
    mapped=$(samtools view -F 4 -q 30 "$alignment/$sample.sam" | wc -l)
    filtered=$(samtools view "$filtered_sam/$sample.bam" | wc -l)
    deduplicated=$(samtools view "$rm_dup_sam/$sample.bam" | wc -l)
    echo "$sample,$total,$trimmed, $mapped,$filtered,$deduplicated" >> "$report_folder/read_number.csv"
done
echo "Read number calculation is done."
