Skip to content

Commit

Permalink
Merge pull request #104 from McTavishLab/bootstrap
Browse files Browse the repository at this point in the history
Bootstrap
  • Loading branch information
snacktavish committed Jun 9, 2020
2 parents 0aa5245 + 86e656b commit d6bde1f
Show file tree
Hide file tree
Showing 13 changed files with 202 additions and 96 deletions.
15 changes: 8 additions & 7 deletions bin/physcraper_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
parser.add_argument("-as","--aln_schema", help="alignment schema (nexus or fasta)")
parser.add_argument("-db", "--blast_db", help="local download of blast database")
parser.add_argument("-o","--output", help="path to output directory")
parser.add_argument("-bs","--bootstrap_reps", help="number of bootstrap reps")
parser.add_argument("-tx","--taxonomy", help="path to taxonomy")
parser.add_argument("-c","--configfile", help="path to config file")
parser.add_argument("-e","--email", help="email address for ncbi balst searches")
Expand Down Expand Up @@ -166,7 +167,7 @@
aln_schema = aln_schema,
workdir = workdir,
configfile = conf,
ingroup_mrca = ott_id)
search_taxon = ott_id)
scraper = physcraper.PhyscraperScrape(data_obj, ids)
else:
scraper = scraper_from_opentree(study_id =study_id,
Expand All @@ -176,11 +177,6 @@
workdir = workdir,
configfile = conf)
sys.stdout.write("{} taxa in alignment and tree\n".format(len(scraper.data.aln)))
scraper.data.write_files()
scraper.data.write_files(treefilename = "before_otu_{}.tre".format(scraper.data.tag),
alnfilename = "before_otu_{}.aln".format(scraper.data.tag))
scraper.data.write_labelled(filename="before_labelled_{}".format(scraper.data.tag), label='^ot:ottTaxonName')
scraper.data.write_otus(schema="json")

if args.reload_files:
if args.tag:
Expand All @@ -193,7 +189,12 @@
sys.stdout.write("Reloaded {} taxa in alignment and tree\n".format(len(scraper.data.aln)))


if args.bootstrap_reps:
boot_reps = args.bootstrap_reps
else:
boot_reps = 100

if not args.no_estimate_tree:
#scraper.read_blast_wrapper()
scraper.est_full_tree()
scraper.calculate_final_tree(boot_reps = boot_reps)
scraper.data.write_labelled(label='^ot:ottTaxonName')
104 changes: 104 additions & 0 deletions bin/rf_distance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
import sys
import os
import json
import argparse
import dendropy
import copy
import physcraper
from opentree import OT
from dendropy.calculate import treecompare
### Example
# python ../physcraper/bin/rf_distance.py -t1 pg_238/inputs_pg_238tree109_RPB2/physcraper_pg_238tree109_RPB2.tre -t2 pg_238/outputs_pg_238tree109_RPB2/physcraper_pg_238tree109_RPB2.tre -otu pg_238/run_pg_238tree109_RPB2/otu_info_pg_238tree109_RPB2.json



parser = argparse.ArgumentParser()
parser.add_argument("-t1","--original_tree", help="Original tree")
parser.add_argument("-t2","--updated_tree", help="Updated Tree")
parser.add_argument("-otu", "--otu_info", help="File with taxon information JSON")

schema = "newick"



args = parser.parse_args()

tns = dendropy.TaxonNamespace()
tree1 = dendropy.Tree.get_from_path(args.original_tree,
schema,
taxon_namespace=tns,
preserve_underscores=True)
tree2 = dendropy.Tree.get_from_path(args.updated_tree,
schema,
taxon_namespace=tns,
preserve_underscores=True)

leaves_t1 = set([leaf.taxon for leaf in tree1.leaf_nodes()])
leaves_t2 = set([leaf.taxon for leaf in tree2.leaf_nodes()])

new_tips = len(leaves_t1) - len(leaves_t2)
sys.stdout.write("{} new tips were added\n".format(new_tips))

otu_dict = json.load(open(args.otu_info, "r"))

old_spp = set()
new_spp = set()

## Prune trees to same leaf set

for leaf in leaves_t1:
species = otu_dict[leaf.label]['^ot:ottId']
old_spp.add(species)

for leaf in leaves_t2:
species = otu_dict[leaf.label]['^ot:ottId']
new_spp.add(species)


sys.stdout.write("There were {} taxa in tree1, {} taxa in tree2\n".format(len(old_spp), len(new_spp)))


prune = leaves_t2.symmetric_difference(leaves_t1)

unpruned_tree2 = copy.deepcopy(tree2)
tree2.prune_taxa(prune)


RF = dendropy.calculate.treecompare.unweighted_robinson_foulds_distance(tree1, tree2)

#Weighted RF
weightedrf = dendropy.calculate.treecompare.weighted_robinson_foulds_distance(tree1, tree2)

tree2.write(path = "pruned_updated.tre", schema="newick")
print("The RobinsonFoulds distance between these trees is {} and the weighted RF is {}".format(RF, weightedrf))


##Write out t2 for conflict with opentree
workdir = os.getcwd()


def write_conflict_tree(inputtree, otu_dict):
tmp_tree = copy.deepcopy(inputtree)
new_names = set()
i = 1
for node in tmp_tree:
i+=1
if node.taxon:
otu = otu_dict[node.taxon.label]
ottid = otu['^ot:ottId']
new_label = "_nd{}_ott{}".format(i, ottid)
node.taxon.label = new_label
else:
node.label = "_nd{}_".format(i)
return tmp_tree.as_string(schema="newick")

treestr_updated = write_conflict_tree(unpruned_tree2, otu_dict)
treestr_orig = write_conflict_tree(tree1, otu_dict)


resp_updated = OT.conflict_str(treestr_updated, 'ott')
resp_orig = OT.conflict_str(treestr_orig, 'ott')

print(resp_orig.response_dict)


2 changes: 1 addition & 1 deletion docs/ATT.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ Hypothetically, all the keys in the otu_dict should be clean.
* **newick**: dendropy.tre.as_string(schema=schema_trf) object
* **otu_dict**: json file including the otu_dict information generated earlier
* **alignment**: dendropy DNACharacterMatrix object
* **ingroup_mrca**: OToL identifier of the group of interest, either subclade as defined by user or of
* **search_taxon**: OToL identifier of the group of interest, either subclade as defined by user or of
all tiplabels in the phylogeny
* **workdir**: the path to the corresponding working directory
* **schema**: optional argument to define tre file schema, if different from "newick"
Expand Down
4 changes: 2 additions & 2 deletions docs/examples/minimal.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
treefile=tre_fi,
tree_schema = "newick", #schema_trf
otu_json=otu_jsonfi,
ingroup_mrca=mrca)
search_taxon=mrca)

data_obj.tag = "minEx"

Expand All @@ -60,7 +60,7 @@

sys.stdout.write("Running align_query_seqs()...\n")
scraper.align_new_seqs()
scraper.est_full_tree()
scraper.calculate_final_tree(boot_reps=30)
scraper.data.write_labelled(label="^ncbi:taxon", filename="updated_ncbi_id", norepeats=False, direc = scraper.outputsdir)


Expand Down
26 changes: 13 additions & 13 deletions physcraper/aligntreetax.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from physcraper.helpers import standardize_label, to_string, debug

_VERBOSE = 1
_DEBUG = 0
_DEBUG = 1

def generate_ATT_from_files(workdir,
configfile,
Expand All @@ -20,7 +20,7 @@ def generate_ATT_from_files(workdir,
treefile,
otu_json,
tree_schema,
ingroup_mrca=None):
search_taxon=None):
"""Build an ATT object without phylesystem, use your own files instead.
Spaces vs underscores kept being an issue, so all spaces are coerced to underscores when data are read in.
Expand All @@ -34,7 +34,7 @@ def generate_ATT_from_files(workdir,
:param treefile: path to phylogeny
:param otu_json: path to json file containing the translation of tip names to taxon names, generated with OtuJsonDict()
:param tree_schema: a string defining the format of the input phylogeny
:param ingroup_mrca: optional - OToL ID of the mrca of the clade of interest. If no ingroup mrca ott_id is provided, will use all taxa in tree to calc mrca.
:param search_taxon: optional - OToL ID of the mrca of the clade of interest. If no search mrca ott_id is provided, will use all taxa in tree to calc mrca.
:return: object of class ATT
"""
Expand All @@ -45,14 +45,14 @@ def generate_ATT_from_files(workdir,
os.makedirs(workdir)
# use replaced aln as input
otu_dict = json.load(open(otu_json, "r"))
if ingroup_mrca:
mrca_ott = int(ingroup_mrca)
if search_taxon:
mrca_ott = int(search_taxon)
else:
ott_ids = [otu_dict[otu].get(u'^ot:ottId', ) for otu in otu_dict]
ott_ids = filter(None, ott_ids)
ott_ids = set(ott_ids)
mrca_ott = get_mrca_ott(ott_ids)
return AlignTreeTax(treefile, otu_dict, alnfile, ingroup_mrca=mrca_ott, workdir=workdir,
return AlignTreeTax(treefile, otu_dict, alnfile, search_taxon=mrca_ott, workdir=workdir,
configfile=configfile, tree_schema=tree_schema)

def generate_ATT_from_run(workdir, start_files='output', tag=None, configfile=None):
Expand All @@ -78,7 +78,7 @@ def generate_ATT_from_run(workdir, start_files='output', tag=None, configfile=No
assert(os.path.exists(otu_json))
otu_dict = json.load(open(otu_json, "r"))
mrca_ott = mrca_ott = int(open("{}/mrca.txt".format(inputsdir)).readline().split()[-1])
return AlignTreeTax(tree = treefile, otu_dict= otu_dict, alignment = alnfi, ingroup_mrca=mrca_ott, workdir=workdir,
return AlignTreeTax(tree = treefile, otu_dict= otu_dict, alignment = alnfi, search_taxon=mrca_ott, workdir=workdir,
configfile=configfile, tag=tag, tree_schema='newick')
except AssertionError:
sys.stdout.write("No output files found in {}, loading files from {}\n".format(outputsdir, inputsdir))
Expand All @@ -90,7 +90,7 @@ def generate_ATT_from_run(workdir, start_files='output', tag=None, configfile=No
assert(os.path.exists(otu_json)), otu_json
otu_dict = json.load(open(otu_json, "r"))
mrca_ott = mrca_ott = int(open("{}/mrca.txt".format(inputsdir)).readline().split()[-1])
return AlignTreeTax(tree = treefile, otu_dict= otu_dict, alignment = alnfi, ingroup_mrca=mrca_ott, workdir=workdir,
return AlignTreeTax(tree = treefile, otu_dict= otu_dict, alignment = alnfi, search_taxon=mrca_ott, workdir=workdir,
configfile=configfile, tag=tag, tree_schema='newick')
#except AssertionError:
# sys.stdout.write("No run files found in {} or {}. Data not loaded\n".format(outputsdir, inputsdir))
Expand Down Expand Up @@ -129,7 +129,7 @@ class AlignTreeTax(object):
* **newick**: dendropy.tre.as_string(schema=schema_trf) object
* **otu_dict**: json file including the otu_dict information generated earlier
* **alignment**: dendropy :class:`DnaCharacterMatrix <dendropy.datamodel.charmatrixmodel.DnaCharacterMatrix>` object
* **ingroup_mrca**: OToL identifier of the group of interest, either subclade as defined by user or of all tip labels in the phylogeny
* **search_taxon**: OToL identifier of the group of interest, either subclade as defined by user or of all tip labels in the phylogeny
* **workdir**: the path to the corresponding working directory
* **config_obj**: Config class
* **schema**: optional argument to define tre file schema, if different from "newick"
Expand Down Expand Up @@ -194,7 +194,7 @@ class AlignTreeTax(object):
* self.aln, self.tre and self.otu_dict, self.ps_otu, self.gi_dict
"""

def __init__(self, tree, otu_dict, alignment, ingroup_mrca, workdir, configfile=None,
def __init__(self, tree, otu_dict, alignment, search_taxon, workdir, configfile=None,
tree_schema='newick',aln_schema ='fasta',taxon_namespace=None, tag=None):
debug("build ATT class")
if tag == None:
Expand Down Expand Up @@ -231,8 +231,8 @@ def __init__(self, tree, otu_dict, alignment, ingroup_mrca, workdir, configfile=
self._reconcile()
self._reconcile_names()

assert int(ingroup_mrca), ("your ingroup_mrca '%s' is not an integer." % ingroup_mrca)
self.mrca_ott = ingroup_mrca # ott_ingroup mrca can be pulled directly from phylesystem
assert int(search_taxon), ("your search_taxon '%s' is not an integer." % search_taxon)
self.mrca_ott = search_taxon # ott mrca can be pulled directly from phylesystem
self.orig_seqlen = [] # will get filled in later...
self.gb_dict = {} # has all info about new blast seq
self._reconciled = False
Expand Down Expand Up @@ -726,7 +726,7 @@ def write_otus(self, filename = "otu_info", schema="table", direc='workdir'):
# all_keys.update(self.otu_dict[otu].keys())
#keys = list(all_key)
#keys.sort()
keys = ['^ot:ottTaxonName','^ot:ottId','^ncbi:taxon','^ncbi:accession','^ncbi:gi','^physcraper:last_blasted','^physcraper:ingroup','^physcraper:status','^ot:originalLabel','^ncbi:title']
keys = ['^ot:ottTaxonName','^ot:ottId','^ncbi:taxon','^ncbi:accession','^physcraper:last_blasted','^physcraper:ingroup','^physcraper:status','^ot:originalLabel','^ncbi:title']
header = ["otu_id"] + keys
with open("{}/{}_{}.csv".format(direc, filename, self.tag), "w") as outfile:
outfile.write("\t".join(header)+"\n")
Expand Down
2 changes: 1 addition & 1 deletion physcraper/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def cd(path):
# configfi='config.out',
# treefile='physcraper.tre',
# schema_trf='newick',
# ingroup_mrca = 'mrca.txt'):
# search_taxon = 'mrca.txt'):


def write_filterblast_db(workdir, seq_name, seq, fn):
Expand Down
14 changes: 7 additions & 7 deletions physcraper/opentree_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ def generate_ATT_from_phylesystem(alnfile,
study_id,
tree_id,
phylesystem_loc='api',
ingroup_mrca=None,
search_taxon=None,
tip_label='^ot:originalLabel'):
"""gathers together tree, alignment, and study info - forces names to otu_ids.
Expand All @@ -155,7 +155,7 @@ def generate_ATT_from_phylesystem(alnfile,
:param study_id: OToL study id of the corresponding phylogeny which shall be updated
:param tree_id: OToL corresponding tree ID as some studies have several phylogenies
:param phylesystem_loc: access the github version of the OpenTree data store, or a local clone
:param ingroup_mrca: optional. OToL identifier of the mrca of the clade that shall be updated (can be subset of the phylogeny)
:param search_taxon: optional. OToL identifier of the mrca of the clade that shall be updated (can be subset of the phylogeny)
:return: object of class ATT
"""
assert(tip_label in ['^ot:originalLabel', 'otu', "^ot:ottTaxonName", "^ot:ottId"])
Expand Down Expand Up @@ -196,12 +196,12 @@ def generate_ATT_from_phylesystem(alnfile,
treed_taxa[orig] = otu_dict[otu_id].get(u"^ot:ottId")
# need to prune tree to seqs and seqs to tree...
ott_mrca = None
if ingroup_mrca:
if type(ingroup_mrca) == list:
ott_ids = set(ingroup_mrca)
if search_taxon:
if type(search_taxon) == list:
ott_ids = set(search_taxon)
ott_mrca = get_mrca_ott(ott_ids)
else:
ott_mrca = int(ingroup_mrca)
ott_mrca = int(search_taxon)
if ott_mrca == None:
ingroup_ott_ids = set()
for otu_id in otu_dict:
Expand All @@ -212,7 +212,7 @@ def generate_ATT_from_phylesystem(alnfile,
assert(len(ingroup_ott_ids)>=1)
ott_mrca = get_mrca_ott(ingroup_ott_ids)
otu_newick = tree_obj.as_string(schema="newick")
return physcraper.aligntreetax.AlignTreeTax(tree = otu_newick, otu_dict =otu_dict, alignment=alnfile, aln_schema = aln_schema, ingroup_mrca=ott_mrca, workdir=workdir, configfile=configfile)
return physcraper.aligntreetax.AlignTreeTax(tree = otu_newick, otu_dict =otu_dict, alignment=alnfile, aln_schema = aln_schema, search_taxon=ott_mrca, workdir=workdir, configfile=configfile)
# newick should be bare, but alignment should be DNACharacterMatrix


Expand Down

0 comments on commit d6bde1f

Please sign in to comment.