Merge pull request #104 from McTavishLab/bootstrap

Bootstrap
McTavishLab · Jun 9, 2020 · d6bde1f · d6bde1f
2 parents 0aa5245 + 86e656b
commit d6bde1f
Show file tree

Hide file tree

Showing 13 changed files with 202 additions and 96 deletions.
diff --git a/bin/physcraper_run.py b/bin/physcraper_run.py
@@ -14,6 +14,7 @@
 parser.add_argument("-as","--aln_schema", help="alignment schema (nexus or fasta)")
 parser.add_argument("-db", "--blast_db", help="local download of blast database")
 parser.add_argument("-o","--output", help="path to output directory")
+parser.add_argument("-bs","--bootstrap_reps", help="number of bootstrap reps")
 parser.add_argument("-tx","--taxonomy", help="path to taxonomy")
 parser.add_argument("-c","--configfile", help="path to config file")
 parser.add_argument("-e","--email", help="email address for ncbi balst searches")
@@ -166,7 +167,7 @@
                                                 aln_schema = aln_schema,
                                                 workdir = workdir,
                                                 configfile = conf,
-                                                ingroup_mrca = ott_id)
+                                                search_taxon = ott_id)
         scraper = physcraper.PhyscraperScrape(data_obj, ids)
     else:
         scraper = scraper_from_opentree(study_id =study_id, 
@@ -176,11 +177,6 @@
                                         workdir = workdir,
                                         configfile = conf)
     sys.stdout.write("{} taxa in alignment and tree\n".format(len(scraper.data.aln)))
-    scraper.data.write_files()
-    scraper.data.write_files(treefilename = "before_otu_{}.tre".format(scraper.data.tag),
-                             alnfilename = "before_otu_{}.aln".format(scraper.data.tag))
-    scraper.data.write_labelled(filename="before_labelled_{}".format(scraper.data.tag), label='^ot:ottTaxonName')
-    scraper.data.write_otus(schema="json")
 
 if args.reload_files:
     if args.tag:
@@ -193,7 +189,12 @@
     sys.stdout.write("Reloaded {} taxa in alignment and tree\n".format(len(scraper.data.aln)))
 
 
+if args.bootstrap_reps:
+    boot_reps = args.bootstrap_reps
+else:
+    boot_reps = 100
+
 if not args.no_estimate_tree:
 #scraper.read_blast_wrapper()
-    scraper.est_full_tree()
+    scraper.calculate_final_tree(boot_reps = boot_reps)
     scraper.data.write_labelled(label='^ot:ottTaxonName')
diff --git a/bin/rf_distance.py b/bin/rf_distance.py
@@ -0,0 +1,104 @@
+import sys
+import os
+import json
+import argparse
+import dendropy
+import copy
+import physcraper
+from opentree import OT
+from dendropy.calculate import treecompare
+### Example
+# python ../physcraper/bin/rf_distance.py -t1 pg_238/inputs_pg_238tree109_RPB2/physcraper_pg_238tree109_RPB2.tre -t2 pg_238/outputs_pg_238tree109_RPB2/physcraper_pg_238tree109_RPB2.tre -otu pg_238/run_pg_238tree109_RPB2/otu_info_pg_238tree109_RPB2.json 
+
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument("-t1","--original_tree", help="Original tree")
+parser.add_argument("-t2","--updated_tree", help="Updated Tree")
+parser.add_argument("-otu", "--otu_info", help="File with taxon information JSON")
+
+schema = "newick"
+
+
+
+args = parser.parse_args()
+
+tns = dendropy.TaxonNamespace()
+tree1 = dendropy.Tree.get_from_path(args.original_tree,
+                                    schema,
+                                    taxon_namespace=tns,
+                                    preserve_underscores=True)
+tree2 = dendropy.Tree.get_from_path(args.updated_tree,
+                                    schema,
+                                    taxon_namespace=tns,
+                                    preserve_underscores=True)
+
+leaves_t1 = set([leaf.taxon for leaf in tree1.leaf_nodes()])
+leaves_t2 = set([leaf.taxon for leaf in tree2.leaf_nodes()])
+
+new_tips = len(leaves_t1) - len(leaves_t2)
+sys.stdout.write("{} new tips were added\n".format(new_tips))
+
+otu_dict = json.load(open(args.otu_info, "r"))
+
+old_spp = set()
+new_spp = set()
+
+## Prune trees to same leaf set
+
+for leaf in leaves_t1:
+    species = otu_dict[leaf.label]['^ot:ottId']
+    old_spp.add(species)
+
+for leaf in leaves_t2:
+    species = otu_dict[leaf.label]['^ot:ottId']
+    new_spp.add(species)
+
+
+sys.stdout.write("There were {} taxa in tree1, {} taxa in tree2\n".format(len(old_spp), len(new_spp)))
+
+
+prune = leaves_t2.symmetric_difference(leaves_t1)
+
+unpruned_tree2 = copy.deepcopy(tree2)
+tree2.prune_taxa(prune)
+
+
+RF = dendropy.calculate.treecompare.unweighted_robinson_foulds_distance(tree1, tree2)
+
+#Weighted RF
+weightedrf = dendropy.calculate.treecompare.weighted_robinson_foulds_distance(tree1, tree2)
+
+tree2.write(path = "pruned_updated.tre", schema="newick")
+print("The RobinsonFoulds distance between these trees is {} and the weighted RF is {}".format(RF, weightedrf))
+
+
+##Write out t2 for conflict with opentree
+workdir = os.getcwd()
+
+
+def write_conflict_tree(inputtree, otu_dict):
+        tmp_tree = copy.deepcopy(inputtree)
+        new_names = set()
+        i = 1
+        for node in tmp_tree:
+            i+=1
+            if node.taxon:
+                otu = otu_dict[node.taxon.label]
+                ottid = otu['^ot:ottId']
+                new_label = "_nd{}_ott{}".format(i, ottid)
+                node.taxon.label = new_label
+            else:
+                node.label = "_nd{}_".format(i)
+        return tmp_tree.as_string(schema="newick")
+
+treestr_updated = write_conflict_tree(unpruned_tree2, otu_dict)
+treestr_orig = write_conflict_tree(tree1, otu_dict)
+
+
+resp_updated = OT.conflict_str(treestr_updated, 'ott')
+resp_orig = OT.conflict_str(treestr_orig, 'ott')
+
+print(resp_orig.response_dict)
+
+
diff --git a/docs/ATT.md b/docs/ATT.md
@@ -10,7 +10,7 @@ Hypothetically, all the keys in the  otu_dict should be clean.
   * **newick**: dendropy.tre.as_string(schema=schema_trf) object
   * **otu_dict**: json file including the otu_dict information generated earlier
   * **alignment**: dendropy DNACharacterMatrix object
-  * **ingroup_mrca**: OToL identifier of the group of interest, either subclade as defined by user or of
+  * **search_taxon**: OToL identifier of the group of interest, either subclade as defined by user or of
                     all tiplabels in the phylogeny
   * **workdir**: the path to the corresponding working directory
   * **schema**: optional argument to define tre file schema, if different from "newick"

diff --git a/docs/examples/minimal.py b/docs/examples/minimal.py
@@ -42,7 +42,7 @@
                              treefile=tre_fi,
                              tree_schema = "newick", #schema_trf
                              otu_json=otu_jsonfi,
-                             ingroup_mrca=mrca)
+                             search_taxon=mrca)
 
 data_obj.tag = "minEx"
 
@@ -60,7 +60,7 @@
 
 sys.stdout.write("Running align_query_seqs()...\n")
 scraper.align_new_seqs()
-scraper.est_full_tree()
+scraper.calculate_final_tree(boot_reps=30)
 scraper.data.write_labelled(label="^ncbi:taxon", filename="updated_ncbi_id", norepeats=False, direc = scraper.outputsdir)
 
 

diff --git a/physcraper/aligntreetax.py b/physcraper/aligntreetax.py
@@ -11,7 +11,7 @@
 from physcraper.helpers import standardize_label, to_string, debug
 
 _VERBOSE = 1
-_DEBUG = 0
+_DEBUG = 1
 
 def generate_ATT_from_files(workdir,
                             configfile,
@@ -20,7 +20,7 @@ def generate_ATT_from_files(workdir,
                             treefile,
                             otu_json,
                             tree_schema,
-                            ingroup_mrca=None):
+                            search_taxon=None):
     """Build an ATT object without phylesystem, use your own files instead.
 
     Spaces vs underscores kept being an issue, so all spaces are coerced to underscores when data are read in.
@@ -34,7 +34,7 @@ def generate_ATT_from_files(workdir,
     :param treefile: path to phylogeny
     :param otu_json: path to json file containing the translation of tip names to taxon names, generated with OtuJsonDict()
     :param tree_schema: a string defining the format of the input phylogeny
-    :param ingroup_mrca: optional - OToL ID of the mrca of the clade of interest. If no ingroup mrca ott_id is provided, will use all taxa in tree to calc mrca.
+    :param search_taxon: optional - OToL ID of the mrca of the clade of interest. If no search mrca ott_id is provided, will use all taxa in tree to calc mrca.
 
     :return: object of class ATT
     """
@@ -45,14 +45,14 @@ def generate_ATT_from_files(workdir,
         os.makedirs(workdir)
     # use replaced aln as input
     otu_dict = json.load(open(otu_json, "r"))
-    if ingroup_mrca:
-        mrca_ott = int(ingroup_mrca)
+    if search_taxon:
+        mrca_ott = int(search_taxon)
     else:
         ott_ids = [otu_dict[otu].get(u'^ot:ottId', ) for otu in otu_dict]
         ott_ids = filter(None, ott_ids)
         ott_ids = set(ott_ids)
         mrca_ott = get_mrca_ott(ott_ids)
-    return AlignTreeTax(treefile, otu_dict, alnfile, ingroup_mrca=mrca_ott, workdir=workdir,
+    return AlignTreeTax(treefile, otu_dict, alnfile, search_taxon=mrca_ott, workdir=workdir,
                         configfile=configfile, tree_schema=tree_schema)
 
 def generate_ATT_from_run(workdir, start_files='output', tag=None, configfile=None):
@@ -78,7 +78,7 @@ def generate_ATT_from_run(workdir, start_files='output', tag=None, configfile=No
         assert(os.path.exists(otu_json))
         otu_dict = json.load(open(otu_json, "r"))
         mrca_ott = mrca_ott = int(open("{}/mrca.txt".format(inputsdir)).readline().split()[-1])
-        return AlignTreeTax(tree = treefile, otu_dict= otu_dict, alignment = alnfi, ingroup_mrca=mrca_ott, workdir=workdir,
+        return AlignTreeTax(tree = treefile, otu_dict= otu_dict, alignment = alnfi, search_taxon=mrca_ott, workdir=workdir,
                     configfile=configfile, tag=tag, tree_schema='newick')
     except AssertionError:
         sys.stdout.write("No output files found in {}, loading files from {}\n".format(outputsdir, inputsdir))
@@ -90,7 +90,7 @@ def generate_ATT_from_run(workdir, start_files='output', tag=None, configfile=No
         assert(os.path.exists(otu_json)), otu_json
         otu_dict = json.load(open(otu_json, "r"))
         mrca_ott = mrca_ott = int(open("{}/mrca.txt".format(inputsdir)).readline().split()[-1])
-        return AlignTreeTax(tree = treefile, otu_dict= otu_dict, alignment = alnfi, ingroup_mrca=mrca_ott, workdir=workdir,
+        return AlignTreeTax(tree = treefile, otu_dict= otu_dict, alignment = alnfi, search_taxon=mrca_ott, workdir=workdir,
                 configfile=configfile, tag=tag, tree_schema='newick')
     #except AssertionError:
        #     sys.stdout.write("No run files found in {} or {}. Data not loaded\n".format(outputsdir, inputsdir))
@@ -129,7 +129,7 @@ class AlignTreeTax(object):
           * **newick**: dendropy.tre.as_string(schema=schema_trf) object
           * **otu_dict**: json file including the otu_dict information generated earlier
           * **alignment**: dendropy :class:`DnaCharacterMatrix <dendropy.datamodel.charmatrixmodel.DnaCharacterMatrix>` object
-          * **ingroup_mrca**: OToL identifier of the group of interest, either subclade as defined by user or of all tip labels in the phylogeny
+          * **search_taxon**: OToL identifier of the group of interest, either subclade as defined by user or of all tip labels in the phylogeny
           * **workdir**: the path to the corresponding working directory
           * **config_obj**: Config class
           * **schema**: optional argument to define tre file schema, if different from "newick"
@@ -194,7 +194,7 @@ class AlignTreeTax(object):
           * self.aln, self.tre and self.otu_dict, self.ps_otu, self.gi_dict
     """
 
-    def __init__(self, tree, otu_dict, alignment, ingroup_mrca, workdir, configfile=None,
+    def __init__(self, tree, otu_dict, alignment, search_taxon, workdir, configfile=None,
                  tree_schema='newick',aln_schema ='fasta',taxon_namespace=None, tag=None):
         debug("build ATT class")
         if tag == None:
@@ -231,8 +231,8 @@ def __init__(self, tree, otu_dict, alignment, ingroup_mrca, workdir, configfile=
         self._reconcile()
         self._reconcile_names()
 
-        assert int(ingroup_mrca), ("your ingroup_mrca '%s' is not an integer." % ingroup_mrca)
-        self.mrca_ott = ingroup_mrca  # ott_ingroup mrca can be pulled directly from phylesystem
+        assert int(search_taxon), ("your search_taxon '%s' is not an integer." % search_taxon)
+        self.mrca_ott = search_taxon  # ott mrca can be pulled directly from phylesystem
         self.orig_seqlen = []  # will get filled in later...
         self.gb_dict = {}  # has all info about new blast seq
         self._reconciled = False
@@ -726,7 +726,7 @@ def write_otus(self, filename = "otu_info", schema="table", direc='workdir'):
             #    all_keys.update(self.otu_dict[otu].keys())
             #keys = list(all_key)
             #keys.sort()
-            keys = ['^ot:ottTaxonName','^ot:ottId','^ncbi:taxon','^ncbi:accession','^ncbi:gi','^physcraper:last_blasted','^physcraper:ingroup','^physcraper:status','^ot:originalLabel','^ncbi:title']
+            keys = ['^ot:ottTaxonName','^ot:ottId','^ncbi:taxon','^ncbi:accession','^physcraper:last_blasted','^physcraper:ingroup','^physcraper:status','^ot:originalLabel','^ncbi:title']
             header = ["otu_id"] + keys
             with open("{}/{}_{}.csv".format(direc, filename, self.tag), "w") as outfile:
                 outfile.write("\t".join(header)+"\n")

diff --git a/physcraper/helpers.py b/physcraper/helpers.py
@@ -58,7 +58,7 @@ def cd(path):
                       # configfi='config.out',
                       # treefile='physcraper.tre',
                       # schema_trf='newick',
-                      # ingroup_mrca = 'mrca.txt'):
+                      # search_taxon = 'mrca.txt'):
 
 
 def write_filterblast_db(workdir, seq_name, seq, fn):

diff --git a/physcraper/opentree_helpers.py b/physcraper/opentree_helpers.py
@@ -141,7 +141,7 @@ def generate_ATT_from_phylesystem(alnfile,
                                   study_id,
                                   tree_id,
                                   phylesystem_loc='api',
-                                  ingroup_mrca=None,
+                                  search_taxon=None,
                                   tip_label='^ot:originalLabel'):
     """gathers together tree, alignment, and study info - forces names to otu_ids.
 
@@ -155,7 +155,7 @@ def generate_ATT_from_phylesystem(alnfile,
     :param study_id: OToL study id of the corresponding phylogeny which shall be updated
     :param tree_id: OToL corresponding tree ID as some studies have several phylogenies
     :param phylesystem_loc: access the github version of the OpenTree data store, or a local clone
-    :param ingroup_mrca: optional.  OToL identifier of the mrca of the clade that shall be updated (can be subset of the phylogeny)
+    :param search_taxon: optional.  OToL identifier of the mrca of the clade that shall be updated (can be subset of the phylogeny)
     :return: object of class ATT
     """
     assert(tip_label in ['^ot:originalLabel', 'otu', "^ot:ottTaxonName", "^ot:ottId"])
@@ -196,12 +196,12 @@ def generate_ATT_from_phylesystem(alnfile,
         treed_taxa[orig] = otu_dict[otu_id].get(u"^ot:ottId")
     # need to prune tree to seqs and seqs to tree...
     ott_mrca = None
-    if ingroup_mrca:
-        if type(ingroup_mrca) == list:
-            ott_ids = set(ingroup_mrca)
+    if search_taxon:
+        if type(search_taxon) == list:
+            ott_ids = set(search_taxon)
             ott_mrca = get_mrca_ott(ott_ids)
         else:
-            ott_mrca = int(ingroup_mrca)
+            ott_mrca = int(search_taxon)
     if ott_mrca == None:
         ingroup_ott_ids = set()
         for otu_id in otu_dict:
@@ -212,7 +212,7 @@ def generate_ATT_from_phylesystem(alnfile,
         assert(len(ingroup_ott_ids)>=1)
         ott_mrca = get_mrca_ott(ingroup_ott_ids)
     otu_newick = tree_obj.as_string(schema="newick")
-    return physcraper.aligntreetax.AlignTreeTax(tree = otu_newick, otu_dict =otu_dict, alignment=alnfile, aln_schema = aln_schema, ingroup_mrca=ott_mrca, workdir=workdir, configfile=configfile)
+    return physcraper.aligntreetax.AlignTreeTax(tree = otu_newick, otu_dict =otu_dict, alignment=alnfile, aln_schema = aln_schema, search_taxon=ott_mrca, workdir=workdir, configfile=configfile)
     # newick should be bare, but alignment should be DNACharacterMatrix