Skip to content

Commit

Permalink
Merge pull request #139 from McTavishLab/no-ingroup-fix
Browse files Browse the repository at this point in the history
No ingroup fix
  • Loading branch information
snacktavish committed Jul 29, 2020
2 parents 2db611f + 9693a61 commit fe8f64e
Show file tree
Hide file tree
Showing 5 changed files with 745 additions and 51 deletions.
35 changes: 27 additions & 8 deletions bin/physcraper_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,20 +267,39 @@
sys.stdout.write("Excluding accession numbers {}\n".format(','.join(scraper.blocklist)))


run = 1
run = 0
if args.repeat:
besttreepath = None
to_be_blasted = ['first_pass']
rundir_base = scraper.rundir
while len(to_be_blasted) >= 1:
run += 1
print(scraper.blast_subdir)
scraper.run_blast_wrapper()
besttreepath = scraper.est_full_tree()
scraper.replace_tre(besttreepath)
scraper.data.write_labelled(filename="run_{}".format(run), label='^ot:ottTaxonName', direc=scraper.outputsdir)
scraper.data.write_otus(schema='table', direc=scraper.inputsdir)
scraper.data.write_otus(schema='json', direc=scraper.rundir)
to_be_blasted = [otu.label for otu in scraper.data.aln if ((scraper.data.otu_dict[otu.label]['^physcraper:ingroup'] == True) and (scraper.data.otu_dict[otu.label]['^physcraper:last_blasted']==None))]
scraper.calculate_final_tree(boot_reps = boot_reps)
if besttreepath:
prev_besttreepath = besttreepath
scraper.replace_tre(besttreepath)
scraper.data.write_labelled(filename="run_{}".format(run), label='^ot:ottTaxonName', direc=scraper.outputsdir)
scraper.data.write_otus(schema='table', direc=scraper.inputsdir)
scraper.data.write_otus(schema='json', direc=scraper.rundir)
new_rundir = "{}_run{}".format(rundir_base, run)
prev_rundir = scraper.rundir
scraper.rundir = new_rundir
os.mkdir(scraper.rundir)
to_be_blasted = [otu.label for otu in scraper.data.aln if ((scraper.data.otu_dict[otu.label]['^physcraper:ingroup'] == True) and (scraper.data.otu_dict[otu.label]['^physcraper:last_blasted']==None))]
elif besttreepath is None:
os.rmdir(scraper.rundir)
scraper.rundir = prev_rundir
updated_alnfi = "{}/physcraper_{}.fas".format(prev_rundir, scraper.data.tag)
bootpath = scraper.calculate_bootstrap(alignment = updated_alnfi, num_reps = boot_reps)
sumtreepath = scraper.summarize_boot(prev_besttreepath, bootpath)
scraper.replace_tre(sumtreepath, schema="nexus")
scraper.data.write_files(direc=scraper.outputsdir)
scraper.data.write_otus(schema='table', direc=scraper.inputsdir)
scraper.data.write_labelled(filename='updated_taxonname',label='^ot:ottTaxonName', direc = scraper.outputsdir)
to_be_blasted = []
else:
sys.stderr.write("unexpected error")
elif not args.no_estimate:
#scraper.read_blast_wrapper()
scraper.calculate_final_tree(boot_reps = boot_reps)
Expand Down
68 changes: 41 additions & 27 deletions physcraper/opentree_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,27 +167,32 @@ def bulk_tnrs_load(filename):
otu_dict = {}
with open(filename) as data_file:
input_dict = json.load(data_file)
for name in input_dict["names"]:
i = 1
otu = "otu" + name['id'].strip('name')
while otu in otu_dict.keys():
otu = "{}_{}".format(otu, i)
i += 1
otu_dict[otu] = {"^ot:originalLabel":name["originalLabel"]}
if name.get("ottTaxonName"):
otu_dict[otu]["^ot:ottTaxonName"] = name["ottTaxonName"]
if name.get("ottId"):
otu_dict[otu]["^ot:ottId"] = name["ottId"]
for source in name.get("taxonomicSources", []):
#debug(source)
if source:
taxsrc = source.split(":")
assert len(taxsrc) == 2, taxsrc
otu_dict[otu]["^{}:taxon".format(taxsrc[0])] = taxsrc[1]
for otu in otu_dict:
otu_dict[otu]["^physcraper:status"] = "original"
otu_dict[otu]["^physcraper:last_blasted"] = None
otu_dict[otu]["^physcraper:ingroup"] = "unknown"
if "names" in input_dict.keys():
for name in input_dict["names"]:
i = 1
otu = "otu" + name['id'].strip('name')
while otu in otu_dict.keys():
otu = "{}_{}".format(otu, i)
i += 1
otu_dict[otu] = {"^ot:originalLabel":name["originalLabel"]}
if name.get("ottTaxonName"):
otu_dict[otu]["^ot:ottTaxonName"] = name["ottTaxonName"]
if name.get("ottId"):
otu_dict[otu]["^ot:ottId"] = name["ottId"]
for source in name.get("taxonomicSources", []):
#debug(source)
if source:
taxsrc = source.split(":")
assert len(taxsrc) == 2, taxsrc
otu_dict[otu]["^{}:taxon".format(taxsrc[0])] = taxsrc[1]
for otu in otu_dict:
otu_dict[otu]["^physcraper:status"] = "original"
otu_dict[otu]["^physcraper:last_blasted"] = None
otu_dict[otu]["^physcraper:ingroup"] = "unknown"
else:
for otu in input_dict:
assert input_dict[otu]["^physcraper:status"], otu_dict[otu]
otu_dict = input_dict
return otu_dict


Expand Down Expand Up @@ -311,10 +316,13 @@ def generate_ATT_from_phylesystem(alnfile,
otu_dict = {tn.taxon.otu:{} for tn in tree_obj.leaf_node_iter()}
orig_lab_to_otu = {}
treed_taxa = {}
ingroup_otus = set(nexson_helpers.get_subtree_otus(study_nexson,
ingroup_otus = nexson_helpers.get_subtree_otus(study_nexson,
tree_id=tree_id,
subtree_id="ingroup",
return_format="otu_id"))
return_format="otu_id")
if not ingroup_otus:
sys.stdout.write("No ingroup annotation found in tree; using all taxa.\n \
Please update tree annotation through OpenTree curation app.\n")
for leaf in tree_obj.leaf_node_iter():
tn = leaf.taxon
otu_id = tn.otu
Expand All @@ -323,10 +331,13 @@ def generate_ATT_from_phylesystem(alnfile,
otu_dict[otu_id]["^ot:originalLabel"] = tn.original_label.replace(" ", "_")
otu_dict[otu_id]["^physcraper:status"] = "original"
otu_dict[otu_id]["^physcraper:last_blasted"] = None
if otu_id in ingroup_otus:
otu_dict[otu_id]["^physcraper:ingroup"] = True
if ingroup_otus:
if otu_id in set(ingroup_otus):
otu_dict[otu_id]["^physcraper:ingroup"] = True
else:
otu_dict[otu_id]["^physcraper:ingroup"] = False
else:
otu_dict[otu_id]["^physcraper:ingroup"] = False
otu_dict[otu_id]["^physcraper:ingroup"] = 'unknown'
orig = otu_dict[otu_id].get(u"^ot:originalLabel").replace(" ", "_")
orig_lab_to_otu[orig] = otu_id
if tip_label == 'otu':
Expand All @@ -345,7 +356,10 @@ def generate_ATT_from_phylesystem(alnfile,
if ott_mrca is None:
ingroup_ott_ids = set()
for otu_id in otu_dict:
if otu_dict[otu_id]["^physcraper:ingroup"] == True:
if ingroup_otus:
if otu_dict[otu_id]["^physcraper:ingroup"] == True:
ingroup_ott_ids.add(otu_dict[otu_id].get(u"^ot:ottId"))
else:
ingroup_ott_ids.add(otu_dict[otu_id].get(u"^ot:ottId"))
if None in ingroup_ott_ids:
ingroup_ott_ids.remove(None)
Expand Down
45 changes: 30 additions & 15 deletions physcraper/scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,8 @@ def run_blast_wrapper(self):
file_ending = "txt"
else:
file_ending = "xml"
fn_path = "{}/{}.{}".format(self.blast_subdir, taxon.label, file_ending)
idtag = "{}{}".format(taxon.label, self.data.otu_dict[taxon.label].get('^ncbi:accession', ''))
fn_path = "{}/{}.{}".format(self.blast_subdir, idtag, file_ending)
# if _DEBUG:
# sys.stdout.write("attempting to write {}\n".format(fn_path))
if not os.path.isfile(fn_path):
Expand All @@ -309,8 +310,6 @@ def run_blast_wrapper(self):
if _VERBOSE:
sys.stdout.write("otu {} was last blasted on {}, {} days ago and is not being re-blasted. "
"Use run_blast_wrapper(delay = 0) to force a search.\n".format(otu_id, last_blast, time_passed))
#except KeyboardInterrupt:
# sys.exit()
self._blasted = 1


Expand Down Expand Up @@ -517,7 +516,8 @@ def read_blast_wrapper(self, blast_dir=None):
file_ending = "txt"
else:
file_ending = "xml"
fn_path = "{}/{}.{}".format(self.blast_subdir, taxon.label, file_ending)
idtag = "{}{}".format(taxon.label, self.data.otu_dict[taxon.label].get('^ncbi:accession', ''))
fn_path = "{}/{}.{}".format(self.blast_subdir, idtag, file_ending)
if _DEBUG:
sys.stdout.write("reading {}\n".format(fn_path))
if os.path.isfile(fn_path):
Expand All @@ -533,8 +533,9 @@ def read_blast_wrapper(self, blast_dir=None):
if len(self.new_seqs) == 0:
sys.stderr.write("no new sequences found in blast. Exiting")
sys.exit()
self.remove_identical_seqs()
ret = self.remove_identical_seqs()
self._blast_read = 1
return ret


def seq_dict_build(self, seq, new_otu_label, seq_dict):
Expand Down Expand Up @@ -681,14 +682,13 @@ def remove_identical_seqs(self):
self.new_seqs_otu_id = filter_dict # renamed new seq to their otu_ids from GI's, but all info is in self.otu_dict
self.new_seqs = {} #Wipe clean
# debug("len new seqs otu dict after remove identical{}".format(len(self.new_seqs_otu_id)))
sys.stdout.write("**** Found {} new sequences****\n".format(len(self.new_seqs_otu_id)))
sys.stdout.write("**** Found {} new sequences after removing identical seqs****\n".format(len(self.new_seqs_otu_id)))
if len(self.new_seqs_otu_id)==0:
pass
# sys.exit()
return None
with open(self.logfile, "a") as log:
log.write("{} new sequences added from Genbank after removing identical seq, "
"of {} before filtering\n".format(len(self.new_seqs_otu_id), len(self.new_seqs)))
# self.data.dump()
return len(self.new_seqs_otu_id)

def filter_seqs(self, tmp_dict, type="random", threshold=None):
if threshold == None:
Expand Down Expand Up @@ -776,8 +776,11 @@ def select_seq_at_random(self, otu_list, count):
def write_new_seqs(self, filename='date'):
"""writes out the query sequence file"""
debug("write query seq")
new_seqs = len(self.new_seqs_otu_id.keys())
if not self._blast_read:
self.read_blast_wrapper()
new_seqs = self.read_blast_wrapper()
if new_seqs is None:
return None
if filename == 'date':
self.newseqs_file = "NEW{}_{}.fasta".format(self.date, self.data.tag)
else:
Expand All @@ -794,14 +797,19 @@ def write_new_seqs(self, filename='date'):


def align_new_seqs(self, aligner = 'muscle'):
ret = len(self.new_seqs_otu_id)
if not self._blast_read:
self.read_blast_wrapper()
ret = self.read_blast_wrapper()
if ret is None:
return None
assert aligner in ['muscle', 'papara']
if aligner == 'papara':
self.run_papara()
if aligner == 'muscle':
self.run_muscle()
alnfi = self.data.write_aln(direc=self.rundir)
if alnfi is None:
return None
self.data.write_otus(schema='table', direc=self.outputsdir)
self.data.write_otus(schema='json', direc=self.rundir)
return alnfi
Expand Down Expand Up @@ -884,14 +892,16 @@ def run_muscle(self, input_aln_path = None, new_seqs_path = None, outname = 'all
return(outpath_ALL)


def est_full_tree(self, alignment = None, startingtree = None, method = "raxml"):
def est_full_tree(self, alignment = 'default', startingtree = None, method = "raxml"):
"""Full raxml run from the placement tree as starting tree.
The PTHREAD version is the faster one, hopefully people install it. if not it falls back to the normal raxml.
"""
cwd = os.getcwd()
if alignment == None:
if alignment == 'default':
debug("call align query seqs from est full tree, self._blast_read is {}".format(self._blast_read))
alignment = self.align_new_seqs()
if alignment is None:
return None
if startingtree == None:
startingtree = os.path.abspath(self.data.write_random_resolve_tre(direc=self.rundir))
debug("est full tree")
Expand All @@ -914,7 +924,7 @@ def est_full_tree(self, alignment = None, startingtree = None, method = "raxml")
#


def calculate_bootstrap(self, alignment = None, num_reps = "10"):
def calculate_bootstrap(self, alignment = 'default', num_reps = "10"):
"""Calculates bootstrap and consensus trees.
-p: random seed
Expand All @@ -927,9 +937,12 @@ def calculate_bootstrap(self, alignment = None, num_reps = "10"):
"""
debug("calculate bootstrap")
if alignment == None:
if alignment == 'default':
debug("call align query seqs from est full tree, self._blast_read is {}".format(self._blast_read))
alignment = self.align_new_seqs()
if alignment is None:
sys.stderr.write("no default alignemnt found for bootstrap\n")
return None
with cd(self.rundir):
ntasks = os.environ.get('SLURM_NTASKS_PER_NODE')
nnodes = os.environ.get("SLURM_JOB_NUM_NODES")
Expand Down Expand Up @@ -981,6 +994,8 @@ def calculate_final_tree(self, boot_reps = 100):
debug("calculate final tree")
debug("current alignment length {}".format(len(self.data.aln)))
besttreepath = self.est_full_tree()
if besttreepath is None:
return None
bootpath = self.calculate_bootstrap(num_reps = boot_reps)
sumtreepath = self.summarize_boot(besttreepath, bootpath)
self.replace_tre(sumtreepath, schema="nexus")
Expand Down

0 comments on commit fe8f64e

Please sign in to comment.