Skip to content
This repository has been archived by the owner on May 3, 2024. It is now read-only.

Commit

Permalink
updated sam_to_gff3.py
Browse files Browse the repository at this point in the history
  • Loading branch information
Magdoll committed Oct 16, 2018
1 parent 72eee8a commit a1c6f32
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 16 deletions.
6 changes: 4 additions & 2 deletions README.md
@@ -1,10 +1,10 @@
# cDNA_Cupcake

Last Updated: 10/12/2018
Last Updated: 10/15/2018

**cDNA_Cupcake** is a miscellaneous collection of Python and R scripts used for analyzing sequencing data. Most of the scripts only require [Biopython](http://biopython.org/wiki/Download). For scripts that require additional libraries, it will be specified in documentation.

Current version: 5.10
Current version: 5.11

## Python Requirements
* Python >= 2.7
Expand Down Expand Up @@ -63,6 +63,8 @@ A brief list of currently listed scripts are:

## Version Changes

2018.10.15 updated to v5.11. `sam_to_gff3.py` updated to allow `source` param.

2018.10.12 updated to v5.10. collapse scripts further handles isoseq3 with mapping formats.

2018.08.30 updated to v5.9. have collapse script handle isoseq3 formats correctly in get_fl_from_id().
Expand Down
20 changes: 14 additions & 6 deletions post_isoseq_cluster/demux_isoseq_with_genome.py
Expand Up @@ -49,9 +49,11 @@
from collections import defaultdict, Counter
from Bio import SeqIO

hq1_id_rex = re.compile('i\d+_HQ_\S+\|(\S+)\/f\d+p\d+\/\d+')
hq2_id_rex = re.compile('HQ_\S+\|(\S+)\/f\d+p\d+\/\d+')
hq3_id_rex = re.compile('[\S_]+(transcript/\d+)')
#hq1_id_rex = re.compile('i\d+_HQ_\S+\|(\S+)\/f\d+p\d+\/\d+')
#hq2_id_rex = re.compile('HQ_\S+\|(\S+)\/f\d+p\d+\/\d+')
#hq3_id_rex = re.compile('[\S_]+(transcript/\d+)')

mapped_id_rex = re.compile('(PB.\d+.\d+)')

def link_files(src_dir, out_dir='./'):
"""
Expand Down Expand Up @@ -128,6 +130,9 @@ def read_classify_csv(classify_csv):
else: # isoseq1 or 2
p = r['primer']
primer_list.add(p)
if r['id'] in info:
raise Exception, "{0} listed more than once in {1}!".format(\
r['id'], classify_csv)
info[r['id']] = p
return primer_list, info

Expand Down Expand Up @@ -158,9 +163,9 @@ def main(job_dir=None, mapped_fastq=None, read_stat=None, classify_csv=None, out

# info: dict of hq_isoform --> primer --> FL count
print >> sys.stderr, "Reading {0}....".format(classify_csv)
primer_list, classify_csv = read_classify_csv(classify_csv)
primer_list, classify_info = read_classify_csv(classify_csv)
print >> sys.stderr, "Reading {0}....".format(read_stat)
info = read_read_stat(read_stat, classify_csv)
info = read_read_stat(read_stat, classify_info)

primer_list = list(primer_list)
primer_list.sort()
Expand All @@ -177,7 +182,10 @@ def main(job_dir=None, mapped_fastq=None, read_stat=None, classify_csv=None, out
f.write("id,{0}\n".format(",".join(primer_names.values())))
print >> sys.stderr, "Reading {0}....".format(mapped_fastq)
for r in SeqIO.parse(open(mapped_fastq), 'fastq'):
pbid = r.id.split('|')[0]
m = mapped_id_rex.match(r.id) # expected ID: PB.X.Y|xxxx.....
if m is None:
raise Exception, "Expected ID format PB.X.Y but found {0}!".format(r.id)
pbid = m.group(1)
f.write(pbid)
for p in primer_names:
f.write(",{0}".format(info[pbid][p]))
Expand Down
16 changes: 8 additions & 8 deletions sequence/sam_to_gff3.py
Expand Up @@ -30,7 +30,7 @@
from Bio.SeqFeature import SeqFeature, FeatureLocation
from cupcake.io.BioReaders import GMAPSAMReader

def convert_sam_rec_to_gff3_rec(r, qid_index_dict=None):
def convert_sam_rec_to_gff3_rec(r, source, qid_index_dict=None):
"""
:param r: GMAPSAMRecord record
:param qid_seen: list of qIDs processed so far -- if redundant, we have to put a unique suffix
Expand All @@ -53,8 +53,8 @@ def convert_sam_rec_to_gff3_rec(r, qid_index_dict=None):
r.qID += '_dup' + str(qid_index_dict[r.qID])
else: qid_index_dict[r.qID] += 1

gene_qualifiers = {"source": args.source, "ID": r.qID+'.gene', "Name": r.qID} # for gene record
mRNA_qualifiers = {"source": args.source, "ID": r.qID, "Name": r.qID, "Parent": r.qID+'.gene',
gene_qualifiers = {"source": source, "ID": r.qID+'.gene', "Name": r.qID} # for gene record
mRNA_qualifiers = {"source": source, "ID": r.qID, "Name": r.qID, "Parent": r.qID+'.gene',
"coverage": "{0:.2f}".format(r.qCoverage*10**2) if r.qCoverage is not None else "NA",
"identity": "{0:.2f}".format(r.identity*10**2),
"matches": matches, "mismatches": mismatches, "indels": indels}
Expand All @@ -65,15 +65,15 @@ def convert_sam_rec_to_gff3_rec(r, qid_index_dict=None):
top_feature.sub_features = [SeqFeature(FeatureLocation(r.sStart, r.sEnd), type="mRNA", strand=strand, qualifiers=mRNA_qualifiers)]
# exon lines, as many exons per record
for i,e in enumerate(r.segments):
exon_qual = {"source": args.source, "ID": "{0}.exon{1}".format(r.qID,i+1), "Name": r.qID, "Parent": r.qID}
exon_qual = {"source": source, "ID": "{0}.exon{1}".format(r.qID,i+1), "Name": r.qID, "Parent": r.qID}
top_feature.sub_features.append(SeqFeature(FeatureLocation(e.start, e.end), type="exon", strand=strand, qualifiers=exon_qual))
rec.features = [top_feature]
return rec

def convert_sam_to_gff3(sam_filename, output_gff3, q_dict=None):
def convert_sam_to_gff3(sam_filename, output_gff3, source, q_dict=None):
qid_index_dict = Counter()
with open(output_gff3, 'w') as f:
recs = [convert_sam_rec_to_gff3_rec(r0, qid_index_dict) for r0 in GMAPSAMReader(sam_filename, True, query_len_dict=q_dict)]
recs = [convert_sam_rec_to_gff3_rec(r0, source, qid_index_dict) for r0 in GMAPSAMReader(sam_filename, True, query_len_dict=q_dict)]
BCBio_GFF.write(filter(lambda x: x is not None, recs), f)

def main():
Expand All @@ -82,7 +82,7 @@ def main():
parser = ArgumentParser("Convert SAM to GFF3 format using BCBio GFF")
parser.add_argument("sam_filename")
parser.add_argument("-i", "--input_fasta", default=None, help="(Optional) input fasta. If given, coverage will be calculated.")
parser.add_argument("-s", "--source", required=True, help="source name (ex: hg38, mm10)")
parser.add_argument("-s", "--source", required=True, help="source name (ex: hg38, mm10)")

args = parser.parse_args()

Expand All @@ -98,7 +98,7 @@ def main():
q_dict = dict((r.id, len(r.seq)) for r in SeqIO.parse(open(args.input_fasta), 'fasta'))

with open(output_gff3, 'w') as f:
recs = [convert_sam_rec_to_gff3_rec(r0) for r0 in GMAPSAMReader(args.sam_filename, True, query_len_dict=q_dict)]
recs = [convert_sam_rec_to_gff3_rec(r0, args.source) for r0 in GMAPSAMReader(args.sam_filename, True, query_len_dict=q_dict)]
BCBio_GFF.write(filter(lambda x: x is not None, recs), f)


Expand Down

0 comments on commit a1c6f32

Please sign in to comment.