From e8657a5cc27abf64d19dc8f6a40203e5a8745201 Mon Sep 17 00:00:00 2001
From: JTFouquier <jennietf@gmail.com>
Date: Mon, 27 Aug 2018 21:05:49 -0600
Subject: [PATCH] add transformer; rna to dna; plugin setup fixes

---
 q2_ghost_tree/_transformer.py | 67 +++++++++++++++++++++++++++++++++++
 q2_ghost_tree/plugin_setup.py | 12 +++++--
 2 files changed, 76 insertions(+), 3 deletions(-)
 create mode 100644 q2_ghost_tree/_transformer.py

diff --git a/q2_ghost_tree/_transformer.py b/q2_ghost_tree/_transformer.py
new file mode 100644
index 0000000..e5d7f8c
--- /dev/null
+++ b/q2_ghost_tree/_transformer.py
@@ -0,0 +1,67 @@
+from io import StringIO
+from q2_ghost_tree.plugin_setup import plugin
+
+from q2_types.feature_data import AlignedDNAFASTAFormat, \
+    AlignedDNASequencesDirectoryFormat
+from ._aligned_rna_sequences import AlignedRNAFASTAFormat
+from q2_types.feature_data import FeatureData, Sequence, AlignedSequence
+
+
+def parse_fasta(f, trim_desc=False):
+    # TODO this is from Kyle B. Cite properly before release
+    """Parse a FASTA format file.
+
+    Parameters
+    ----------
+    f : File object or iterator returning lines in FASTA format.
+    trim_desc : If True, keep only the first word of each description.
+
+    Returns
+    -------
+    An iterator of (description, sequence) string tuples; empty if `f`
+    has no lines. Spaces in the sequence are removed and "U" becomes "T",
+    to accommodate FASTA files from SILVA and the Living Tree Project.
+    """
+    f = iter(f)
+    first = next(f, None)
+    if first is None:
+        # Empty input: a bare next(f) would raise StopIteration, which
+        # Python 3.7+ (PEP 479) turns into RuntimeError in a generator.
+        return
+    desc = first.strip()[1:]
+    if trim_desc:
+        desc = desc.split()[0]
+    seq = StringIO()
+    for line in f:
+        line = line.strip()
+        if line.startswith(">"):
+            yield desc, seq.getvalue()
+            desc = line[1:]
+            if trim_desc:
+                desc = desc.split()[0]
+            seq = StringIO()
+        else:
+            seq.write(line.replace(" ", "").replace("U", "T"))
+    yield desc, seq.getvalue()
+
+
+
+def write_fasta(f, seqs):
+    """Write (description, sequence) pairs to `f` as FASTA records."""
+    f.writelines(">{0}\n{1}\n".format(d, s) for d, s in seqs)
+
+
+# TODO change silly function name
+# NOTE(review): format objects are not file handles; assumes both formats
+# expose .open() (qiime2 TextFileFormat); confirm for the RNA format.
+@plugin.register_transformer
+def _my_great_transformer(ff: AlignedRNAFASTAFormat) -> \
+        AlignedDNAFASTAFormat:
+    """Convert aligned RNA FASTA to aligned DNA FASTA ("U" -> "T").
+
+    Returns a new AlignedDNAFASTAFormat; `ff` is only read from.
+    """
+    ff2 = AlignedDNAFASTAFormat()
+    with ff.open() as rna_fh, ff2.open() as dna_fh:
+        write_fasta(dna_fh, parse_fasta(rna_fh))
+    return ff2
diff --git a/q2_ghost_tree/plugin_setup.py b/q2_ghost_tree/plugin_setup.py
index a6f481c..b2c763b 100644
--- a/q2_ghost_tree/plugin_setup.py
+++ b/q2_ghost_tree/plugin_setup.py
@@ -1,7 +1,7 @@
 import qiime2.plugin
 
 from q2_types.feature_data import FeatureData, Sequence, AlignedSequence, \
-    Taxonomy
+    Taxonomy, AlignedDNASequencesDirectoryFormat
 from q2_types.tree import Phylogeny, Rooted, Unrooted
 
 import q2_ghost_tree
@@ -214,23 +214,29 @@
 plugin.register_semantic_type_to_format(
     SilvaTaxonomy, artifact_format=SilvaTaxonomyDirectoryFormat)
 
+# TODO
+# Changing Silva dependent functions to only require DNA
 AlignedRNASequences = qiime2.plugin.SemanticType('AlignedRNASequences')
 plugin.register_formats(AlignedRNAFASTAFormat, AlignedRNAFASTADirectoryFormat)
 plugin.register_semantic_types(AlignedRNASequences)
 plugin.register_semantic_type_to_format(
     AlignedRNASequences, artifact_format=AlignedRNAFASTADirectoryFormat)
+# TODO(review): q2-types already gives this type a DNA dir format; confirm.
+plugin.register_semantic_type_to_format(
+    FeatureData[AlignedSequence],
+    artifact_format=AlignedRNAFASTADirectoryFormat)
 
 plugin.methods.register_function(
     function=extract_fungi,
     inputs={
-        'aligned_silva_file': AlignedRNASequences,
+        'aligned_silva_file': FeatureData[AlignedSequence],
         'accession_file': SilvaAccession,  # Silva semantic type
         'taxonomy_file': SilvaTaxonomy,  # Silva semantic type
         },
     parameters={
     },
     outputs=[
-        ('aligned_seqs', AlignedRNASequences),
+        ('aligned_seqs', FeatureData[AlignedSequence]),
     ],
     input_descriptions={
         'aligned_silva_file': 'TODO',