/
Expression_count.py
84 lines (69 loc) · 2.88 KB
/
Expression_count.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import argparse
import os
import re

import pysam


def _parse_args():
    """Parse command-line options (interface unchanged from the original script)."""
    parser = argparse.ArgumentParser()
    parser.add_argument("-f", "--file_in", help="Input bam file name", required=True)
    parser.add_argument("-s", "--sample_name", help="Sample name", required=True)
    parser.add_argument("-c", "--coordinates", help="Input file with coordinates", required=True)
    parser.add_argument("-o", "--file_out", help="Output results file name", default="Expression_results.tsv")
    parser.add_argument("-q", "--filter_file", help="File with query names for each primer generated by Filter_RACEbam", required=True)
    parser.add_argument("-m", "--min_mq", help="Minimum mapping quality to count a read", type=int, default=10)
    return parser.parse_args()


def _load_filter_dict(path):
    """Read the Filter_RACEbam file: one line per primer, space-separated as
    "<primer> <name1,name2,...>".

    Returns a dict mapping primer name -> *set* of allowed query names.
    Sets give O(1) membership tests in the per-read loop below (the original
    kept lists, making every read an O(n) scan).
    """
    filter_dict = {}
    with open(path, "rt") as fh:
        for line in fh:
            fields = line.strip("\n").split(" ")
            filter_dict[fields[0]] = set(fields[1].split(","))
    return filter_dict


def _count_region(bamfile, region, allowed_names, min_mq):
    """Count distinct query names overlapping one region that pass all filters.

    A read counts when it is a mapped, proper-pair, primary alignment that is
    not a duplicate / QC-fail / secondary / supplementary record, meets the
    mapping-quality threshold, and whose query name appears in
    ``allowed_names`` (the primer's whitelist).

    NOTE: the original also tested ``read.is_mapped``; that attribute is just
    the negation of ``is_unmapped`` (and only exists in pysam >= 0.22), so the
    single ``is_unmapped`` check is kept.
    """
    names = set()
    fetched = bamfile.fetch(
        contig=region["Chr"],
        start=int(region["Start"]),
        stop=int(region["End"]),
    )
    for read in fetched:
        if (read.is_unmapped or read.is_duplicate or read.is_qcfail
                or read.is_secondary or read.is_supplementary):
            continue
        if not read.is_proper_pair:
            continue
        if read.mapping_quality < min_mq:
            continue
        if read.query_name in allowed_names:
            names.add(read.query_name)
    return len(names)


def main():
    """Count filtered reads per primer region in a BAM file and append one
    tab-separated result row (with a header row on first creation) to the
    output file."""
    args = _parse_args()

    # Primer -> set of query names allowed to count toward that primer.
    filter_dict = _load_filter_dict(args.filter_file)

    print_header = ["Sample"]
    print_res = [args.sample_name]

    with pysam.AlignmentFile(args.file_in, "rb") as bamfile:
        # Does the BAM use "chr"-prefixed contig names? Used to normalize the
        # coordinate file's naming to match the BAM header.
        bam_chr = "chr1" in bamfile.references

        with open(args.coordinates, "rt") as coords:
            header = coords.readline().strip("\n").split("\t")
            for line in coords:
                region = dict(zip(header, line.strip("\n").split("\t")))
                # Reconcile "chr1" vs "1" style contig names with the BAM.
                if region["Chr"].startswith("chr") and not bam_chr:
                    region["Chr"] = region["Chr"][3:]
                elif not region["Chr"].startswith("chr") and bam_chr:
                    region["Chr"] = "chr" + region["Chr"]

                # KeyError here (unknown primer) is intentionally not caught:
                # a coordinate row without a matching filter entry is an input
                # error worth crashing on.
                count = _count_region(
                    bamfile, region, filter_dict[region["Primer"]], args.min_mq
                )
                print_header.append(region["Name"])
                print_res.append(str(count))

        # Distinct query names across the whole (indexed) BAM; build the set
        # directly instead of materializing every name in a list first.
        total_queries = len({read.query_name for read in bamfile.fetch()})
        print_header.append("Total_queries")
        print_res.append(str(total_queries))

    for name, value in zip(print_header, print_res):
        print(name, ":", value)

    # Append the sample's row; write the header line only when creating the
    # file for the first time.
    is_new_file = not os.path.exists(args.file_out)
    with open(args.file_out, "wt" if is_new_file else "at") as res_file:
        if is_new_file:
            print("\t".join(print_header), file=res_file)
        print("\t".join(print_res), file=res_file)


if __name__ == "__main__":
    main()