-
Notifications
You must be signed in to change notification settings - Fork 0
/
verify_reference.py
62 lines (56 loc) · 2.17 KB
/
verify_reference.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#!/usr/bin/env python
"""
verify fasta output from build_reference.py
with Ensembl's transcript fasta file,
exit after spotting one different record
"""
import argparse
import sys
from Bio import SeqIO
def get_cds_from_fasta_description(description):
""" parse CDS range from fasta header """
words = description.split("|")
for w in words:
if w.startswith("CDS:"):
start, end = w.lstrip("CDS:").split("-")
return int(start), int(end)
return None
def make_arg_parser():
""" command line arguments """
parser = argparse.ArgumentParser(prog="verify_reference.py", add_help=True, \
description="compare built transcripts with ground truth")
parser.add_argument("-t", "--true_fa", required=True, help="true transcript fasta from Ensembl")
parser.add_argument("-c", "--case_fa", required=True, help="transcript fasta built by build_reference.py")
return parser
def main():
"""
compare built fasta with ground truth
exit after finding one different record
"""
parser = make_arg_parser()
if len(sys.argv) == 1:
parser.print_help()
sys.exit(1)
args = parser.parse_args()
truth = SeqIO.index(args.true_fa, "fasta")
test = SeqIO.index(args.case_fa, "fasta")
for tid in test:
if tid in truth:
description = test[tid].description
cds_range = get_cds_from_fasta_description(description)
if not cds_range:
print("fasta not generated by ribodeblur! no CDS range found! ABORT!")
sys.exit(1)
true_seq = truth[tid].seq
test_seq = test[tid].seq[cds_range[0]:cds_range[1]]
if true_seq != test_seq:
print("oops sequence not the same as ground truth!")
print(tid, "true len", len(true_seq),
"case cds len", len(test_seq), cds_range,
"case total len", len(test[tid].seq))
print(description)
print(true_seq)
print(test_seq)
sys.exit(1)
print("Congrats! all sequences matched with truth!")
if __name__ == "__main__": main()