From e8657a5cc27abf64d19dc8f6a40203e5a8745201 Mon Sep 17 00:00:00 2001
From: JTFouquier
Date: Mon, 27 Aug 2018 21:05:49 -0600
Subject: [PATCH] add transformer; rna to dna; plugin setup fixes

---
 q2_ghost_tree/_transformer.py | 74 +++++++++++++++++++++++++++++++++++
 q2_ghost_tree/plugin_setup.py | 12 +++++--
 2 files changed, 83 insertions(+), 3 deletions(-)
 create mode 100644 q2_ghost_tree/_transformer.py

diff --git a/q2_ghost_tree/_transformer.py b/q2_ghost_tree/_transformer.py
new file mode 100644
index 0000000..e5d7f8c
--- /dev/null
+++ b/q2_ghost_tree/_transformer.py
@@ -0,0 +1,74 @@
+from io import StringIO
+from q2_ghost_tree.plugin_setup import plugin
+
+from q2_types.feature_data import AlignedDNAFASTAFormat, \
+    AlignedDNASequencesDirectoryFormat
+from ._aligned_rna_sequences import AlignedRNAFASTAFormat
+from q2_types.feature_data import FeatureData, Sequence, AlignedSequence
+
+
+def parse_fasta(f, trim_desc=False):
+    # TODO this is from Kyle B. Cite properly before release
+    """Parse a FASTA format file.
+
+    Parameters
+    ----------
+    f : File object or iterator returning lines in FASTA format.
+    trim_desc : bool, optional
+        If True, keep only the first whitespace-delimited token of each
+        description line.
+
+    Returns
+    -------
+    An iterator of tuples containing two strings
+        First string is the sequence description, second is the
+        sequence.
+
+    Notes
+    -----
+    This function removes whitespace in the sequence and translates
+    "U" to "T", in order to accommodate FASTA files downloaded from
+    SILVA and the Living Tree Project. Empty input yields nothing.
+    """
+    f = iter(f)
+    try:
+        desc = next(f).strip()[1:]
+    except StopIteration:
+        # Under PEP 479 a StopIteration escaping a generator body becomes
+        # RuntimeError, so finish the generator explicitly on empty input.
+        return
+    if trim_desc:
+        desc = desc.split()[0]
+    seq = StringIO()
+    for line in f:
+        line = line.strip()
+        if line.startswith(">"):
+            yield desc, seq.getvalue()
+            desc = line[1:]
+            if trim_desc:
+                desc = desc.split()[0]
+            seq = StringIO()
+        else:
+            seq.write(line.replace(" ", "").replace("U", "T"))
+    yield desc, seq.getvalue()
+
+
+def write_fasta(f, seqs):
+    """Write (description, sequence) pairs to f as two-line FASTA records."""
+    for desc, seq in seqs:
+        f.write(">{0}\n{1}\n".format(desc, seq))
+
+
+# TODO change silly function name
+# NOTE(review): the original author suspected the registered data format
+# is wrong for this transformer -- confirm before release.
+@plugin.register_transformer
+def _my_great_transformer(ff: AlignedRNAFASTAFormat) -> \
+        AlignedDNAFASTAFormat:
+    """Convert aligned RNA FASTA to aligned DNA FASTA ("U" becomes "T")."""
+    ff2 = AlignedDNAFASTAFormat()
+    # Format objects are not file handles: they cannot be iterated or
+    # written to directly, so open both before parsing and writing.
+    with ff.open() as rna_fh, ff2.open() as dna_fh:
+        write_fasta(dna_fh, parse_fasta(rna_fh))
+    return ff2
diff --git a/q2_ghost_tree/plugin_setup.py b/q2_ghost_tree/plugin_setup.py
index a6f481c..b2c763b 100644
--- a/q2_ghost_tree/plugin_setup.py
+++ b/q2_ghost_tree/plugin_setup.py
@@ -1,7 +1,7 @@
 import qiime2.plugin
 
 from q2_types.feature_data import FeatureData, Sequence, AlignedSequence, \
-    Taxonomy
+    Taxonomy, AlignedDNASequencesDirectoryFormat
 from q2_types.tree import Phylogeny, Rooted, Unrooted
 
 import q2_ghost_tree
@@ -214,23 +214,29 @@ plugin.register_semantic_type_to_format(
     SilvaTaxonomy,
     artifact_format=SilvaTaxonomyDirectoryFormat)
 
+# TODO
+# Changing Silva dependent functions to only require DNA
 AlignedRNASequences = qiime2.plugin.SemanticType('AlignedRNASequences')
 plugin.register_formats(AlignedRNAFASTAFormat, AlignedRNAFASTADirectoryFormat)
 plugin.register_semantic_types(AlignedRNASequences)
 plugin.register_semantic_type_to_format(
     AlignedRNASequences,
     artifact_format=AlignedRNAFASTADirectoryFormat)
+# TODO
+plugin.register_semantic_type_to_format(
+    FeatureData[AlignedSequence],
+    artifact_format=AlignedRNAFASTADirectoryFormat)
 plugin.methods.register_function(
     function=extract_fungi,
     inputs={
-        'aligned_silva_file': AlignedRNASequences,
+        'aligned_silva_file': FeatureData[AlignedSequence],
         'accession_file': SilvaAccession,  # Silva semantic type
         'taxonomy_file': SilvaTaxonomy,  # Silva semantic type
     },
     parameters={
     },
     outputs=[
-        ('aligned_seqs', AlignedRNASequences),
+        ('aligned_seqs', FeatureData[AlignedSequence]),
     ],
     input_descriptions={
         'aligned_silva_file': 'TODO',