Skip to content

Commit

Permalink
Merge pull request #121 from McTavishLab/tutorial
Browse files Browse the repository at this point in the history
Tutorial
  • Loading branch information
snacktavish committed Jun 30, 2020
2 parents bc20409 + 8b04ca4 commit 707b1f1
Show file tree
Hide file tree
Showing 40 changed files with 707 additions and 596 deletions.
24 changes: 7 additions & 17 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,35 +1,25 @@
<img align="left" width="250" src="https://cdn.rawgit.com/snacktavish/physcraper/main/docs/physcraper.svg">

# physcraper
# Physcraper

[![Build Status](https://travis-ci.org/McTavishLab/physcraper.svg?branch=main)](https://travis-ci.org/McTavishLab/physcraper)[![Documentation](https://readthedocs.org/projects/physcraper/badge/?version=latest&style=flat)](https://physcraper.readthedocs.io/en/latest/)[![codecov](https://codecov.io/gh/McTavishLab/physcraper/branch/dev/graph/badge.svg)](https://codecov.io/gh/McTavishLab/physcraper)
[![Build Status](https://travis-ci.org/McTavishLab/physcraper.svg?branch=main)](https://travis-ci.org/McTavishLab/physcraper)[![Documentation](https://readthedocs.org/projects/physcraper/badge/?version=latest&style=flat)](https://physcraper.readthedocs.io/en/latest/)[![codecov](https://codecov.io/gh/McTavishLab/physcraper/branch/main/graph/badge.svg)](https://codecov.io/gh/McTavishLab/physcraper)


<p></p>

<p></p>

### Continual gene tree updating!
## Continual gene tree updating!

Use a tree (from the literature, a synthetic tree from Open Tree of Life, or your own tree) and an alignment (of any size?) to find and add homologous sequences to (hopefully) improve and advance phylogenetic inference in a group.
Use a tree (from the literature, a synthetic tree from Open Tree of Life, or your own tree) and an alignment (of any size?) to find and add homologous sequences to (hopefully) improve and advance phylogenetic inference in a group.


The tool is under current development in the McTavish Lab.
Please contact ejmctavish@gmail if you need any help or have feedback.

Users:

- [Installation](mds/INSTALL.md)
- [Run](mds/running.md)
- [More examples](mds/examples.md)

Developers:

- [Documentation](https://physcraper.readthedocs.io/en/latest/)
- [Tests](mds/testing.md)



This is the code repository, please refer to Physcraper's [documentation website](https://physcraper.readthedocs.io/en/latest/) for more details on how to install it and run!

Here, some emojis for you :bowtie: :sparkles: :notes:


:hamster: :palm_tree: :frog: :ear_of_rice: :panda_face: :tulip: :octopus: :blossom: :whale: :mushroom: :ant: :cactus: :fish: :maple_leaf: :water_buffalo: 🦠 :shell: :bug: :octocat:
38 changes: 27 additions & 11 deletions bin/find_trees.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#!/usr/bin/env python
import sys
import argparse
from opentree import OT

Expand All @@ -12,21 +13,44 @@
args = parser.parse_args()


args = parser.parse_args()
if len(sys.argv)==1:
parser.print_help(sys.stderr)
sys.exit(1)


assert(args.taxon_name or args.ottid), "A taxon name or an OTT id are required for search."

if args.taxon_name:
try:
ottid = OT.get_ottid_from_name(args.taxon_name)
except:
sys.stdout.write("no match to taxon name. Try finding your taxon on tree.opentreeoflife.org and inputting the taxon id using -ott")
sys.stdout.write("no match to taxon name. Try finding your taxon on tree.opentreeoflife.org and inputting the taxon id using -ott\n")
sys.exit()


if args.ott_id:
ottid = args.ott_id

#sys.stdout.write("OTT id of {}")
sys.stdout.write("OTT id {}\n".format(ottid))
phylesystem_studies_resp = OT.find_trees(ottid, search_property ='ot:ottId')



cites_phyl = "Members of {} present in the following studies in the OpenTree Phylesystem\n".format(args.taxon_name)


if args.treebase:
cites_phyl = cites_phyl + "Only returning studies with TreeBase links\n"


studies = dict()
trees = dict()
treebase_studies = set()
sys.stdout.write("Gathering references (slow)\n")
for study in phylesystem_studies_resp.response_dict['matched_studies']:
sys.stdout.write('.')
sys.stdout.flush()
study_id = study['ot:studyId']
studies[study_id] = dict()
study_info = OT.get_study(study_id)
Expand All @@ -46,15 +70,6 @@
studies[study_id]['opentree_url'] = "https://tree.opentreeoflife.org/curator/study/view/{}".format(study_id)
studies[study_id]['reference'] = nexson['nexml'].get('^ot:studyPublicationReference', 'no ref')
studies[study_id]['doi'] = nexson['nexml'].get('^ot:studyPublication', 'no study pub')


cites_phyl = "Members of {} present in the following studies in the OpenTree Phylesystem\n".format(args.taxon_name)


if args.treebase:
cites_phyl = cites_phyl + "Only returning studies with TreeBase links\n"

for study_id in studies:
if args.treebase and study_id not in treebase_studies:
continue
cites_phyl = cites_phyl + "\nStudy {} tree(s) {}\n".format(study_id, ', '.join(trees[study_id]))
Expand All @@ -63,6 +78,7 @@
cites_phyl = cites_phyl + "Data Deposit URL: " + studies[study_id]['data_deposit_url'] + '\n'



if args.output:
ofi = open(args.output, 'w')
ofi.write(cites_phyl)
Expand Down
80 changes: 49 additions & 31 deletions bin/physcraper_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,34 +9,46 @@
parser = argparse.ArgumentParser()
parser.add_argument("-s","--study_id", help="OpenTree study id")
parser.add_argument("-t","--tree_id", help="OpenTree tree id")
parser.add_argument("-tl", "--tree_link", help="Link to tree to update on OpenTree")
parser.add_argument("-a","--alignment", help="path to alignment")
parser.add_argument("-as","--aln_schema", help="alignment schema (nexus or fasta)")
parser.add_argument("-db", "--blast_db", help="local download of blast database")
parser.add_argument("-o","--output", help="path to output directory")
parser.add_argument("-bs","--bootstrap_reps", help="number of bootstrap reps")
parser.add_argument("-tx","--taxonomy", help="path to taxonomy")
parser.add_argument("-c","--configfile", help="path to config file")
parser.add_argument("-e","--email", help="email address for ncbi blast searches")
parser.add_argument("-re","--reload_files", help="reload files and configureation from dir")
parser.add_argument("-r","--repeat", action='store_true', help="repeat search until no no sequences are found")
parser.add_argument("-tag","--tag", help="gene name or other specifier")
parser.add_argument("-tb","--treebase", action="store_true", help="download alignment from treebase")
parser.add_argument("-no_est","--no_estimate_tree", action='store_true', help="don't estimate tree")
parser.add_argument("-re","--reload_files", help="reload files and configuration from dir")



parser.add_argument("-tag","--tag", help="gene name or other specifier")
parser.add_argument("-st","--search_taxon", help="taxonomic id to constrain blast search. format ott:123 or ncbi:123. Deafult will use ingroup of tree on OpenTree, or MRCA of input tips ")

parser.add_argument("-spn","--species_number", help="max number of seqs to include per species")
parser.add_argument("-tp","--trim_perc", help="minimum percentage of seqs end of alignemnts")
parser.add_argument("-rlmax","--relative_length_max", help="max relative length of added seqs, compared to input seq len")
parser.add_argument("-rlmin","--relative_length_min", help="min relative length of added seqs, compared to input seq len")

parser.add_argument("-r","--repeat", action='store_true', help="repeat search until no no sequences are found")
parser.add_argument("-db", "--blast_db", help="local download of blast database")
parser.add_argument("-u", "--blast_url", help="a URL for your own mirrored blast database")
parser.add_argument("-e","--email", help="email address for ncbi blast searches")
parser.add_argument("-ev","--eval", help="blast evalue cutoff")
parser.add_argument("-hl","--hitlist_len", help="number of blast searches to save per taxon")
parser.add_argument("-tp","--trim_perc", help="minimum percentage of seqs end of alignemnts")
parser.add_argument("-rl","--relative_length", help="max relative length of added seqs, compared to input aln len")
parser.add_argument("-spn","--species_number", help="max number of seqs to include per species")

parser.add_argument("-nt","--num_threads", help="number of threads to use in processing")
parser.add_argument("-de","--delay", help="how long to wait before blasting the same sequence again")
parser.add_argument("-st","--search_taxon", help="taxonomic id to constrain blast search. format ott:123 or ncbi:123. Deafult will use ingroup of tree on OpenTree, or MRCA of input tips ")

parser.add_argument("-no_est","--no_estimate_tree", action='store_true', help="don't estimate tree")

parser.add_argument("-bs","--bootstrap_reps", help="number of bootstrap reps")

parser.add_argument("-tx","--taxonomy", help="path to taxonomy")

parser.add_argument("-v","--verbose", help="OpenTree study id")



#Not yet implemented
#parser.add_argument("-tl", "--tree_link", help="Link to tree to update on OpenTree")

#parser.add_argument("-bl","--blast_sequence", action='store_true', help="run blast search, and align but do not estimate tree")
#parser.add_argument("-d","--download_data", action='store_true', help="write out tree and alignment, without blasting")
#parser.add_argument("-bs","--bootstrap", help="number of bootstrap reps")
Expand All @@ -47,6 +59,12 @@


args = parser.parse_args()


if args.verbose:
physcraper.scrape.set_verbose()


if len(sys.argv)==1:
parser.print_help(sys.stderr)
sys.exit(1)
Expand Down Expand Up @@ -90,8 +108,11 @@
if args.trim_perc:
conf.trim_perc = args.trim_perc

if args.relative_length:
conf.maxlen = args.relative_length
if args.relative_length_max:
conf.maxlen = args.relative_length_max

if args.relative_length_min:
conf.minlen = args.relative_length_min

if args.species_number:
conf.spp_threshold = int(args.species_number)
Expand All @@ -109,11 +130,11 @@
sys.stdout.write(conf.config_str()+"\n")

study_id = None
if args.tree_link:
linkl = args.tree_link.split("/")
assert(linkl[5]=="view")
study_id = linkl[6]
tree_id = linkl[-1].split("=")[1]
#if args.tree_link:
# linkl = args.tree_link.split("/")
# assert(linkl[5]=="view")
# study_id = linkl[6]
# tree_id = linkl[-1].split("=")[1]


if args.study_id or args.tree_id:
Expand Down Expand Up @@ -198,23 +219,20 @@
boot_reps = 100



run = 1
if args.repeat:
rundir_path = scraper.rundir
blast_dir ="{}/blast_run_{}".format(workdir, scraper.data.tag)
if not os.path.exists(blast_dir):
os.mkdir(blast_dir)
scraper.blast_subdir = blast_dir
scraper.calculate_final_tree(boot_reps = boot_reps)
to_be_blasted = [otu.label for otu in scraper.data.aln if ((scraper.data.otu_dict[otu.label]['^physcraper:ingroup'] == True) and (scraper.data.otu_dict[otu.label]['^physcraper:last_blasted']==None))]
to_be_blasted = ['first_pass']
while len(to_be_blasted) >= 1:
run += 1
os.rename(scraper.rundir, rundir_path+"_"+str(run))
os.mkdir(scraper.rundir)
print(scraper.blast_subdir)
scraper.run_blast_wrapper()
scraper.calculate_final_tree(boot_reps = boot_reps)
besttreepath = scraper.est_full_tree()
scraper.replace_tre(besttreepath)
scraper.data.write_labelled(filename="run_{}".format(run), label='^ot:ottTaxonName', direc=scraper.outputsdir)
scraper.data.write_otus(schema='table', direc=scraper.inputsdir)
scraper.data.write_otus(schema='json', direc=scraper.rundir)
to_be_blasted = [otu.label for otu in scraper.data.aln if ((scraper.data.otu_dict[otu.label]['^physcraper:ingroup'] == True) and (scraper.data.otu_dict[otu.label]['^physcraper:last_blasted']==None))]
scraper.calculate_final_tree(boot_reps = boot_reps)
elif not args.no_estimate_tree:
#scraper.read_blast_wrapper()
scraper.calculate_final_tree(boot_reps = boot_reps)
Expand Down
69 changes: 41 additions & 28 deletions bin/tree_comparison.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
parser.add_argument("-t1","--original_tree", help="Original tree")
parser.add_argument("-t2","--updated_tree", help="Updated Tree")
parser.add_argument("-otu", "--otu_info", help="File with taxon information JSON")
parser.add_argument("-og", "--outgroup", nargs='+', help="otu ids of outgroup taxa for rooting")
parser.add_argument("-o", "--outputdir", help="Name for output directory")


Expand All @@ -40,11 +41,11 @@


if args.results_dir:
assert(os.path_exists(args.results_dir)), "Results directory {} not found\n".format(args.results_dir)
assert(os.path.exists(args.results_dir)), "Results directory {} not found\n".format(args.results_dir)
else:
assert(os.path_exists(args.original_tree)), "Original tree {} not found\n".format(args.original_tree)
assert(os.path_exists(args.updated_tree)), "Updated tree {} not found\n".format(args.updated_tree)
assert(os.path_exists(args.otu_info)), "Otu_info {} not found\n".format(args.otu_info)
assert(os.path.exists(args.original_tree)), "Original tree {} not found\n".format(args.original_tree)
assert(os.path.exists(args.updated_tree)), "Updated tree {} not found\n".format(args.updated_tree)
assert(os.path.exists(args.otu_info)), "Otu_info {} not found\n".format(args.otu_info)


tns = dendropy.TaxonNamespace()
Expand Down Expand Up @@ -77,28 +78,54 @@
taxon_namespace=tns,
preserve_underscores=True)



otu_dict = json.load(open(otu_json_path, "r"))


leaves_t1 = set([leaf.taxon for leaf in tree1.leaf_nodes()])
leaves_t2 = set([leaf.taxon for leaf in tree2.leaf_nodes()])
leaves_t1 = set([leaf.taxon.label for leaf in tree1.leaf_nodes()])
leaves_t2 = set([leaf.taxon.label for leaf in tree2.leaf_nodes()])

old_spp = set()
new_spp = set()


for leaf in leaves_t1:
species = otu_dict[leaf.label]['^ot:ottId']
species = otu_dict[leaf]['^ot:ottId']
old_spp.add(species)

for leaf in leaves_t2:
species = otu_dict[leaf.label]['^ot:ottId']
species = otu_dict[leaf]['^ot:ottId']
new_spp.add(species)

if None in old_spp:
old_spp.remove(None)


if None in new_spp:
new_spp.remove(None)

new_tips = len(leaves_t2) - len(leaves_t1)
sys.stdout.write("{} new tips were added\n".format(new_tips))
tree2.write(path = "{}/before_rooting.tre".format(comparisondir), schema="newick")
#['otu376420','otu376439','otu376452']
if args.outgroup:
outgroup = args.outgroup
sys.stdout.write("Rooting tree using {} as outgroup\n".format(", ".join(outgroup)))
for tip in outgroup:
assert(tip in leaves_t2), "label {} not found in updated tree {}\n".format(tip, tree2_path)
mrca = tree2.mrca(taxon_labels=outgroup)
tree2.reroot_at_node(mrca, update_bipartitions=True)
else:
try:
rooted = root_tree_from_synth(ree2, otu_dict, base='ott')
##Write out t2 for conflict with opentree
except:
sys.stdout.write("Auto-rooting unsuccessful, conflict results may be spurious\n")

tree2.write(path = "{}/after_rooting.tre".format(comparisondir), schema="newick")



## THIS SECTION LOOKS AT TAXA

Expand All @@ -107,15 +134,15 @@
ottids_in_synth = ottids_in_synth()


sys.stdout.write("There were {} new taxa in the updated tree\n".format(len(new_spp) - len(old_spp)))
sys.stdout.write("\nThere were {} new taxa in the updated tree\n".format(len(new_spp) - len(old_spp)))
sys.stdout.write("Of the {} taxa in original tree {} are not included in synthesis phylogenies,\n".format(len(old_spp), len(old_spp.difference(ottids_in_synth))))
sys.stdout.write("Of the {} taxa in updated tree {} are not included in synthesis phylogenies \n".format(len(new_spp), len(new_spp.difference(ottids_in_synth))))
sys.stdout.write("Of the {} taxa in updated tree {} are not included in synthesis phylogenies \n\n".format(len(new_spp), len(new_spp.difference(ottids_in_synth))))


ids = physcraper.IdDicts()
sys.stdout.write("Taxa with only taxonomic information in the OpenTree synthetic tree (so far!) are:\n")
for tax in new_spp.difference(ottids_in_synth):
taxname = ids.ott_to_name[tax]
taxname = ids.ott_to_name.get(tax, '-')
sys.stdout.write("ott{}: {}\n".format(tax, taxname))

## This section does tree comparison
Expand All @@ -136,29 +163,15 @@

for tax in tns:
if tax.label in otu_dict:
tax.label = tax.label + "_" + otu_dict[tax.label]['^ot:ottTaxonName']
tax.label = tax.label + "_" + str(otu_dict[tax.label].get('^ot:ottTaxonName'))

## write put with tip labels that have taxon names
tree1.write(path = "{}/original.tre".format(comparisondir), schema="newick")
tree2.write(path = "{}/pruned_updated.tre".format(comparisondir), schema="newick")

sys.stdout.write("The RobinsonFoulds distance between these trees is {} and the weighted RF is {}\n".format(RF, weightedrf))

unpruned_tree2.write(path = "{}/before_rooting.tre".format(comparisondir), schema="newick")

rooted = root_tree_from_synth(unpruned_tree2, otu_dict, base='ott')
##Write out t2 for conflict with opentree
unpruned_tree2.write(path = "{}/rooting1.tre".format(comparisondir), schema="newick")

#CHECK ROOTING ON NEW TREE

unpruned_tree2.write(path = "{}/after_rooting.tre".format(comparisondir), schema="newick")
sys.stdout.write("\n\nThe RobinsonFoulds distance between the matched tips in the trees is {} and the weighted RF is {}\n".format(RF, weightedrf))

workdir = comparisondir




tree_updated = conflict_tree(unpruned_tree2, otu_dict)
tree_orig = conflict_tree(unpruned_tree1, otu_dict)

Expand All @@ -176,7 +189,7 @@
witness = conflict_orig[node]['witness_name']
orig_conf_taxa.add(witness)

sys.stdout.write("Original tree conflicts with {} taxa in the OpenTree taxonomy:\n".format(len(orig_conf_taxa)))
sys.stdout.write("\nOriginal tree conflicts with {} taxa in the OpenTree taxonomy:\n".format(len(orig_conf_taxa)))
for tax in orig_conf_taxa:
sys.stdout.write("{}\n".format(tax))

Expand Down

0 comments on commit 707b1f1

Please sign in to comment.