Merge pull request #139 from McTavishLab/no-ingroup-fix

No ingroup fix
McTavishLab · Jul 29, 2020 · fe8f64e · fe8f64e
2 parents 2db611f + 9693a61
commit fe8f64e
Show file tree

Hide file tree

Showing 5 changed files with 745 additions and 51 deletions.
diff --git a/bin/physcraper_run.py b/bin/physcraper_run.py
@@ -267,20 +267,39 @@
         sys.stdout.write("Excluding accession numbers {}\n".format(','.join(scraper.blocklist)))
 
 
-run = 1
+run = 0
 if args.repeat:
+    besttreepath = None
     to_be_blasted = ['first_pass']
+    rundir_base = scraper.rundir
     while len(to_be_blasted) >= 1:
         run += 1
-        print(scraper.blast_subdir)
         scraper.run_blast_wrapper()
         besttreepath = scraper.est_full_tree()
-        scraper.replace_tre(besttreepath)
-        scraper.data.write_labelled(filename="run_{}".format(run), label='^ot:ottTaxonName', direc=scraper.outputsdir)
-        scraper.data.write_otus(schema='table', direc=scraper.inputsdir)
-        scraper.data.write_otus(schema='json', direc=scraper.rundir)
-        to_be_blasted = [otu.label for otu in scraper.data.aln if ((scraper.data.otu_dict[otu.label]['^physcraper:ingroup'] == True) and (scraper.data.otu_dict[otu.label]['^physcraper:last_blasted']==None))]
-    scraper.calculate_final_tree(boot_reps = boot_reps)
+        if besttreepath:
+            prev_besttreepath = besttreepath
+            scraper.replace_tre(besttreepath)
+            scraper.data.write_labelled(filename="run_{}".format(run), label='^ot:ottTaxonName', direc=scraper.outputsdir)
+            scraper.data.write_otus(schema='table', direc=scraper.inputsdir)
+            scraper.data.write_otus(schema='json', direc=scraper.rundir)
+            new_rundir = "{}_run{}".format(rundir_base, run)
+            prev_rundir = scraper.rundir
+            scraper.rundir = new_rundir
+            os.mkdir(scraper.rundir)
+            to_be_blasted = [otu.label for otu in scraper.data.aln if ((scraper.data.otu_dict[otu.label]['^physcraper:ingroup'] == True) and (scraper.data.otu_dict[otu.label]['^physcraper:last_blasted']==None))]
+        elif besttreepath is None:
+            os.rmdir(scraper.rundir)
+            scraper.rundir = prev_rundir
+            updated_alnfi = "{}/physcraper_{}.fas".format(prev_rundir, scraper.data.tag)
+            bootpath = scraper.calculate_bootstrap(alignment = updated_alnfi, num_reps = boot_reps)
+            sumtreepath = scraper.summarize_boot(prev_besttreepath, bootpath)
+            scraper.replace_tre(sumtreepath, schema="nexus")
+            scraper.data.write_files(direc=scraper.outputsdir)
+            scraper.data.write_otus(schema='table', direc=scraper.inputsdir)
+            scraper.data.write_labelled(filename='updated_taxonname',label='^ot:ottTaxonName', direc = scraper.outputsdir)
+            to_be_blasted =  []
+        else:
+            sys.stderr.write("unexpected error")
 elif not args.no_estimate:
 #scraper.read_blast_wrapper()
     scraper.calculate_final_tree(boot_reps = boot_reps)

diff --git a/physcraper/opentree_helpers.py b/physcraper/opentree_helpers.py
@@ -167,27 +167,32 @@ def bulk_tnrs_load(filename):
     otu_dict = {}
     with open(filename) as data_file:
         input_dict = json.load(data_file)
-    for name in input_dict["names"]:
-        i = 1
-        otu = "otu" + name['id'].strip('name')
-        while otu in otu_dict.keys():
-            otu = "{}_{}".format(otu, i)
-            i += 1
-        otu_dict[otu] = {"^ot:originalLabel":name["originalLabel"]}
-        if name.get("ottTaxonName"):
-            otu_dict[otu]["^ot:ottTaxonName"] = name["ottTaxonName"]
-        if name.get("ottId"):
-            otu_dict[otu]["^ot:ottId"] = name["ottId"]
-        for source in name.get("taxonomicSources", []):
-            #debug(source)
-            if source:
-                taxsrc = source.split(":")
-                assert len(taxsrc) == 2, taxsrc
-                otu_dict[otu]["^{}:taxon".format(taxsrc[0])] = taxsrc[1]
-    for otu in otu_dict:
-        otu_dict[otu]["^physcraper:status"] = "original"
-        otu_dict[otu]["^physcraper:last_blasted"] = None
-        otu_dict[otu]["^physcraper:ingroup"] = "unknown"
+    if "names" in input_dict.keys():
+        for name in input_dict["names"]:
+            i = 1
+            otu = "otu" + name['id'].strip('name')
+            while otu in otu_dict.keys():
+                otu = "{}_{}".format(otu, i)
+                i += 1
+            otu_dict[otu] = {"^ot:originalLabel":name["originalLabel"]}
+            if name.get("ottTaxonName"):
+                otu_dict[otu]["^ot:ottTaxonName"] = name["ottTaxonName"]
+            if name.get("ottId"):
+                otu_dict[otu]["^ot:ottId"] = name["ottId"]
+            for source in name.get("taxonomicSources", []):
+                #debug(source)
+                if source:
+                    taxsrc = source.split(":")
+                    assert len(taxsrc) == 2, taxsrc
+                    otu_dict[otu]["^{}:taxon".format(taxsrc[0])] = taxsrc[1]
+        for otu in otu_dict:
+            otu_dict[otu]["^physcraper:status"] = "original"
+            otu_dict[otu]["^physcraper:last_blasted"] = None
+            otu_dict[otu]["^physcraper:ingroup"] = "unknown"
+    else:
+        for otu in input_dict:
+            assert input_dict[otu]["^physcraper:status"], otu_dict[otu]
+            otu_dict = input_dict
     return otu_dict
 
 
@@ -311,10 +316,13 @@ def generate_ATT_from_phylesystem(alnfile,
     otu_dict = {tn.taxon.otu:{} for tn in tree_obj.leaf_node_iter()}
     orig_lab_to_otu = {}
     treed_taxa = {}
-    ingroup_otus = set(nexson_helpers.get_subtree_otus(study_nexson,
+    ingroup_otus = nexson_helpers.get_subtree_otus(study_nexson,
                                                        tree_id=tree_id,
                                                        subtree_id="ingroup",
-                                                       return_format="otu_id"))
+                                                       return_format="otu_id")
+    if not ingroup_otus:
+        sys.stdout.write("No ingroup annotation found in tree; using all taxa.\n \
+                          Please update tree annotation through OpenTree curation app.\n")
     for leaf in tree_obj.leaf_node_iter():
         tn = leaf.taxon
         otu_id = tn.otu
@@ -323,10 +331,13 @@ def generate_ATT_from_phylesystem(alnfile,
         otu_dict[otu_id]["^ot:originalLabel"] = tn.original_label.replace(" ", "_")
         otu_dict[otu_id]["^physcraper:status"] = "original"
         otu_dict[otu_id]["^physcraper:last_blasted"] = None
-        if otu_id in ingroup_otus:
-            otu_dict[otu_id]["^physcraper:ingroup"] = True
+        if ingroup_otus:
+            if otu_id in set(ingroup_otus):
+                otu_dict[otu_id]["^physcraper:ingroup"] = True
+            else:
+                otu_dict[otu_id]["^physcraper:ingroup"] = False
         else:
-            otu_dict[otu_id]["^physcraper:ingroup"] = False
+            otu_dict[otu_id]["^physcraper:ingroup"] = 'unknown'
         orig = otu_dict[otu_id].get(u"^ot:originalLabel").replace(" ", "_")
         orig_lab_to_otu[orig] = otu_id
         if tip_label == 'otu':
@@ -345,7 +356,10 @@ def generate_ATT_from_phylesystem(alnfile,
     if ott_mrca is None:
         ingroup_ott_ids = set()
         for otu_id in otu_dict:
-            if otu_dict[otu_id]["^physcraper:ingroup"] == True:
+            if ingroup_otus:
+                if otu_dict[otu_id]["^physcraper:ingroup"] == True:
+                    ingroup_ott_ids.add(otu_dict[otu_id].get(u"^ot:ottId"))
+            else:
                 ingroup_ott_ids.add(otu_dict[otu_id].get(u"^ot:ottId"))
         if None in ingroup_ott_ids:
             ingroup_ott_ids.remove(None)

diff --git a/physcraper/scrape.py b/physcraper/scrape.py
@@ -284,7 +284,8 @@ def run_blast_wrapper(self):
                     file_ending = "txt"
                 else:
                     file_ending = "xml"
-                fn_path = "{}/{}.{}".format(self.blast_subdir, taxon.label, file_ending)
+                idtag = "{}{}".format(taxon.label, self.data.otu_dict[taxon.label].get('^ncbi:accession', ''))
+                fn_path = "{}/{}.{}".format(self.blast_subdir, idtag, file_ending)
                 # if _DEBUG:
                 #     sys.stdout.write("attempting to write {}\n".format(fn_path))
                 if not os.path.isfile(fn_path):
@@ -309,8 +310,6 @@ def run_blast_wrapper(self):
                 if _VERBOSE:
                     sys.stdout.write("otu {} was last blasted on {}, {} days ago and is not being re-blasted. "
                                      "Use run_blast_wrapper(delay = 0) to force a search.\n".format(otu_id, last_blast, time_passed))
-    #except KeyboardInterrupt:
-           # sys.exit()
         self._blasted = 1
 
 
@@ -517,7 +516,8 @@ def read_blast_wrapper(self, blast_dir=None):
                 file_ending = "txt"
             else:
                 file_ending = "xml"
-            fn_path = "{}/{}.{}".format(self.blast_subdir, taxon.label, file_ending)
+            idtag = "{}{}".format(taxon.label, self.data.otu_dict[taxon.label].get('^ncbi:accession', ''))
+            fn_path = "{}/{}.{}".format(self.blast_subdir, idtag, file_ending)
             if _DEBUG:
                 sys.stdout.write("reading {}\n".format(fn_path))
             if os.path.isfile(fn_path):
@@ -533,8 +533,9 @@ def read_blast_wrapper(self, blast_dir=None):
         if len(self.new_seqs) == 0:
             sys.stderr.write("no new sequences found in blast. Exiting")
             sys.exit()
-        self.remove_identical_seqs()
+        ret = self.remove_identical_seqs()
         self._blast_read = 1
+        return ret
 
 
     def seq_dict_build(self, seq, new_otu_label, seq_dict):
@@ -681,14 +682,13 @@ def remove_identical_seqs(self):
         self.new_seqs_otu_id = filter_dict  # renamed new seq to their otu_ids from GI's, but all info is in self.otu_dict
         self.new_seqs = {} #Wipe clean
 #        debug("len new seqs otu dict after remove identical{}".format(len(self.new_seqs_otu_id)))
-        sys.stdout.write("**** Found {} new sequences****\n".format(len(self.new_seqs_otu_id)))
+        sys.stdout.write("**** Found {} new sequences after removing identical seqs****\n".format(len(self.new_seqs_otu_id)))
         if len(self.new_seqs_otu_id)==0:
-            pass
-#            sys.exit()
+            return None
         with open(self.logfile, "a") as log:
             log.write("{} new sequences added from Genbank after removing identical seq, "
                       "of {} before filtering\n".format(len(self.new_seqs_otu_id), len(self.new_seqs)))
-#        self.data.dump()
+        return len(self.new_seqs_otu_id)
 
     def filter_seqs(self, tmp_dict, type="random", threshold=None):
         if threshold == None:
@@ -776,8 +776,11 @@ def select_seq_at_random(self, otu_list, count):
     def write_new_seqs(self, filename='date'):
         """writes out the query sequence file"""
         debug("write query seq")
+        new_seqs = len(self.new_seqs_otu_id.keys())
         if not self._blast_read:
-            self.read_blast_wrapper()
+            new_seqs = self.read_blast_wrapper()
+        if new_seqs is None:
+            return None
         if filename == 'date':
             self.newseqs_file = "NEW{}_{}.fasta".format(self.date, self.data.tag)
         else:
@@ -794,14 +797,19 @@ def write_new_seqs(self, filename='date'):
 
 
     def align_new_seqs(self, aligner = 'muscle'):
+        ret = len(self.new_seqs_otu_id)
         if not self._blast_read:
-            self.read_blast_wrapper()
+           ret = self.read_blast_wrapper()
+        if ret is None:
+            return None
         assert aligner in ['muscle', 'papara']
         if aligner == 'papara':
             self.run_papara()
         if aligner == 'muscle':
             self.run_muscle()
         alnfi = self.data.write_aln(direc=self.rundir)
+        if alnfi is None:
+            return None
         self.data.write_otus(schema='table', direc=self.outputsdir)
         self.data.write_otus(schema='json', direc=self.rundir)
         return alnfi
@@ -884,14 +892,16 @@ def run_muscle(self, input_aln_path = None, new_seqs_path = None, outname = 'all
         return(outpath_ALL)
 
 
-    def est_full_tree(self, alignment = None, startingtree = None, method = "raxml"):
+    def est_full_tree(self, alignment = 'default', startingtree = None, method = "raxml"):
         """Full raxml run from the placement tree as starting tree.
         The PTHREAD version is the faster one, hopefully people install it. if not it falls back to the normal raxml.
         """
         cwd = os.getcwd()
-        if alignment == None:
+        if alignment == 'default':
             debug("call align query seqs from est full tree, self._blast_read is {}".format(self._blast_read))
             alignment = self.align_new_seqs()
+        if alignment is None:
+            return None
         if startingtree == None:
             startingtree = os.path.abspath(self.data.write_random_resolve_tre(direc=self.rundir))
         debug("est full tree")
@@ -914,7 +924,7 @@ def est_full_tree(self, alignment = None, startingtree = None, method = "raxml")
 #
 
 
-    def calculate_bootstrap(self, alignment = None, num_reps = "10"):
+    def calculate_bootstrap(self, alignment = 'default', num_reps = "10"):
         """Calculates bootstrap and consensus trees.
 
         -p: random seed
@@ -927,9 +937,12 @@ def calculate_bootstrap(self, alignment = None, num_reps = "10"):
 
         """
         debug("calculate bootstrap")
-        if alignment == None:
+        if alignment == 'default':
             debug("call align query seqs from est full tree, self._blast_read is {}".format(self._blast_read))
             alignment = self.align_new_seqs()
+        if alignment is None:
+            sys.stderr.write("no default alignemnt found for bootstrap\n")
+            return None
         with cd(self.rundir):
             ntasks = os.environ.get('SLURM_NTASKS_PER_NODE')
             nnodes = os.environ.get("SLURM_JOB_NUM_NODES")
@@ -981,6 +994,8 @@ def calculate_final_tree(self, boot_reps = 100):
         debug("calculate final tree")
         debug("current alignment length {}".format(len(self.data.aln)))
         besttreepath = self.est_full_tree()
+        if besttreepath is None:
+            return None
         bootpath = self.calculate_bootstrap(num_reps = boot_reps)
         sumtreepath = self.summarize_boot(besttreepath, bootpath)
         self.replace_tre(sumtreepath, schema="nexus")