Skip to content

Commit

Permalink
Merge pull request #122 from McTavishLab/ownfile
Browse files Browse the repository at this point in the history
Ownfile
  • Loading branch information
snacktavish committed Jul 1, 2020
2 parents 707b1f1 + 6d7da98 commit 6400df5
Show file tree
Hide file tree
Showing 10 changed files with 223 additions and 53 deletions.
36 changes: 34 additions & 2 deletions bin/physcraper_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
import os
import sys
import physcraper
from physcraper.opentree_helpers import get_tree_from_study, scraper_from_opentree, get_max_match_aln, count_match_tree_to_aln, generate_ATT_from_phylesystem
from physcraper.aligntreetax import generate_ATT_from_run
from physcraper.opentree_helpers import get_tree_from_study, scraper_from_opentree, get_max_match_aln, count_match_tree_to_aln, generate_ATT_from_phylesystem, bulk_tnrs_load
from physcraper.aligntreetax import generate_ATT_from_run, generate_ATT_from_files

parser = argparse.ArgumentParser()
parser.add_argument("-s","--study_id", help="OpenTree study id")
Expand All @@ -16,6 +16,10 @@
parser.add_argument("-tb","--treebase", action="store_true", help="download alignment from treebase")
parser.add_argument("-re","--reload_files", help="reload files and configuration from dir")

parser.add_argument("-tf","--tree_file", help="a path to a tree file")
parser.add_argument("-tfs","--tree_schema", help="tree file format schema")
parser.add_argument("-ti","--taxon_info", help="taxon info file from OpenTree TNRS")



parser.add_argument("-tag","--tag", help="gene name or other specifier")
Expand Down Expand Up @@ -201,6 +205,34 @@
configfile = conf)
sys.stdout.write("{} taxa in alignment and tree\n".format(len(scraper.data.aln)))


if args.tree_file:
    # Build the scraper from a user-supplied tree file (plus alignment and an
    # OpenTree TNRS name mapping) instead of from an OpenTree study/tree id.
    treefile = args.tree_file
    # Validate the companion arguments with parser.error() rather than assert:
    # asserts are stripped under `python -O`, and parser.error() prints the
    # usage message and exits with status 2 instead of a raw traceback.
    if not args.tree_schema:
        parser.error("When passing in a treefile, a tree schema is required")
    if not args.taxon_info:
        parser.error("When passing in a treefile, a taxon mapping from "
                     "https://tree.opentreeoflife.org/curator/tnrs/ is required")
    if args.taxon_info.split('.')[-1] != 'json':
        parser.error("JSON format file required for taxon info from "
                     "https://tree.opentreeoflife.org/curator/tnrs/")
    if args.tag:
        tag = args.tag
    elif args.alignment:
        # Default tag: alignment file name without directory or extension.
        tag = args.alignment.split('/')[-1].split('.')[0]
    # Convert the bulk TNRS JSON download into an otu dictionary.
    otu_dict = bulk_tnrs_load(args.taxon_info)
    search_taxon = args.search_taxon if args.search_taxon else None
    data_obj = generate_ATT_from_files(workdir=workdir,
                                       configfile=conf,
                                       alnfile=alnfile,
                                       aln_schema=aln_schema,
                                       treefile=treefile,
                                       otu_json=otu_dict,
                                       tree_schema=args.tree_schema,
                                       search_taxon=search_taxon)
    ids = physcraper.IdDicts(conf)
    scraper = physcraper.PhyscraperScrape(data_obj, ids)
    # sys.stdout.write("Read in tree {} taxa in alignment and tree\n".format(len(scraper.data.aln)))


if args.reload_files:
if args.tag:
tag = args.tag
Expand Down
85 changes: 85 additions & 0 deletions docs/PhyscraperRun.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,17 @@ Tree information (required)
-t TREE_ID, --tree_id TREE_ID
OpenTree tree id

OR

-tf TREE_FILE, --tree_file TREE_FILE
a path to a tree file
-tfs TREE_SCHEMA, --tree_schema TREE_SCHEMA
tree file format schema
-ti TAXON_INFO, --taxon_info TAXON_INFO
taxon info file from OpenTree TNRS



Alignment information (required)

-a ALIGNMENT, --alignment ALIGNMENT
Expand Down Expand Up @@ -135,3 +146,77 @@ You can use your own blast database, for example set up on an AWS server.

-tx TAXONOMY, --taxonomy TAXONOMY
path to taxonomy


Example commands:



The simplest (but slowest) run is to choose a tree from opentree, and `physcraper` gets the alignment for you from treebase (argument `-tb`), using web blast:

physcraper_run.py -s pg_55 -t tree5864 -tb -o pg55_treebase


The fastest way is to choose a tree from opentree, give the path to the corresponding downloaded alignment (argument `-a`) and a local blast directory (argument `-db`). To set up the local blast DB see [Local DB](setting_up_local_database):

physcraper_run.py -s ot_350 -t Tr53297 -a docs/examples/ot_350Tr53297.aln -as "nexus" -db ~/ncbi/localblastdb/ -o ot_350


To check tree download and the matching of names across tree and alignment without running the blast and tree estimation steps, use the flag (-no_est):

physcraper_run.py -s pg_55 -t tree5864 --treebase -db ~/ncbi/localblastdb/ -no_est -o pg_55_C

Take a look at the tree, the alignment and the out_info csv file. It will list all taxa by their unique identifiers.


To then run a blast search and estimate an updated tree from that tree and alignment, you can re-load from that directory. It will use your same config settings (which were automatically written out to outputdir/run.config).

If the run completed, re-run will use the final output tree and alignment. If the run was not complete, it will reload the input files.


physcraper_run.py -re pg_55_C -o pg_55_C


To re-run with a different configuration file,

physcraper_run.py -re pg_55_C/ -c alt_config -o pg_55_D


Configuration parameters can be either set in a configuration file using -c (e.g. data.config)

physcraper_run.py -s ot_350 -t Tr53297 -a ot_350Tr53297.aln -nt 8 -spn 3 -hl 20 -as "nexus" -c data.config -o output4


Or they can be modified in the command line arguments. If you combine a configuration file with command line configuration parameters, the command line arguments will be used.

physcraper_run.py -s pg_55 -t tree5864 -a treebase_alns/pg_55tree5864_ndhf.aln -nt 8 -spn 3 -hl 20 -as "nexus" -db ~/ncbi/localblastdb/ -o output4


To run on local files, not on trees in Open Tree, you need to match the labels to taxa using https://tree.opentreeoflife.org/curator/tnrs/

physcraper_run.py -tf tests/data/tiny_test_example/test.tre -tfs newick -a tests/data/tiny_test_example/test.fas --taxon_info tests/data/tiny_test_example/main.json -as fasta -o owndata


The current configuration settings are written to standard out, and saved in the output directory as run.config
e.g.

[blast]
Entrez.email = None
e_value_thresh = 1e-05
hitlist_size = 20
location = local
localblastdb = /home/ejmctavish/ncbi/localblastdb/
url_base = None
num_threads = 8
delay = 90
[physcraper]
spp_threshold = 3
seq_len_perc = 0.8
trim_perc = 0.8
min_len = 0.8
max_len = 1.2
taxonomy_path = /home/ejmctavish/projects/otapi/physcraper/taxonomy




2 changes: 1 addition & 1 deletion docs/source/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@ Welcome to Physcraper's documentation!
physcraper_run
data_exploration
setting_up_local_database
apidocs
apidocs
11 changes: 10 additions & 1 deletion physcraper/aligntreetax.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,16 @@ def generate_ATT_from_files(workdir,
if not os.path.exists(workdir):
os.makedirs(workdir)
# use replaced aln as input
otu_dict = json.load(open(otu_json, "r"))
if isinstance(otu_json, dict):
otu_dict = otu_json
elif isinstance(otu_json, str):
assert os.path.exists(otu_json)
with open(otu_json) as data_file:
input_dict = json.load(data_file)
if input_dict.keys() == set(['mappingHints', 'names', 'metadata']):
otu_dict = bulk_tnrs_load(otu_json)
else:
otu_dict = input_dict
if search_taxon:
mrca_ott = int(search_taxon)
else:
Expand Down
3 changes: 2 additions & 1 deletion physcraper/opentree_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ def bulk_tnrs_load(filename, ids_obj = None):
input_dict = json.load(data_file)
for name in input_dict["names"]:
i = 1
otu = "Otu" + name['id']
otu = "Otu" + name['id'].strip('name')
while otu in otu_dict.keys():
otu = "{}_{}".format(otu, i)
i+=1
Expand All @@ -151,6 +151,7 @@ def bulk_tnrs_load(filename, ids_obj = None):
for otu in otu_dict:
otu_dict[otu]["^physcraper:status"] = "original"
otu_dict[otu]["^physcraper:last_blasted"] = None
otu_dict[otu]["^physcraper:ingroup"] = "unknown"
return otu_dict


Expand Down
2 changes: 1 addition & 1 deletion physcraper/scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,7 @@ def run_blast_wrapper(self): # TODO Should this be happening elsewhere?
else:
time_passed = abs((datetime.datetime.strptime(today, "%Y/%m/%d") - datetime.datetime.strptime(
last_blast, "%Y/%m/%d")).days)
if self.data.otu_dict[otu_id].get("^physcraper:ingroup") != True:
if self.data.otu_dict[otu_id].get("^physcraper:ingroup") == False:
sys.stdout.write("tip {} not in ingroup. Will not blast, \n".format(otu_id))
continue
if time_passed > delay:
Expand Down
1 change: 1 addition & 0 deletions tests/data/bulk_tnrs.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"metadata":{"name":"Untitled nameset","description":"","authors":[],"date_created":"2019-09-19T19:46:59.958Z","last_saved":"2019-09-19T19:49:03.778Z","save_count":1,"previous_filename":null,"latest_ott_version":null},"mappingHints":{"description":"Aids for mapping listed names to OTT taxa","searchContext":"All life","useFuzzyMatching":false,"substitutions":[{"active":true,"old":"","new":"","valid":true}]},"names":[{"id":"name1","originalLabel":"protopterus","selectedForAction":false,"defaultSortOrder":0,"ottId":199350,"ottTaxonName":"Protopterus","taxonomicSources":["ncbi:7885","gbif:2441252","irmng:1295830"]},{"id":"name2","originalLabel":"Anolis","selectedForAction":false,"defaultSortOrder":1,"ottId":705358,"ottTaxonName":"Anolis","taxonomicSources":["ncbi:28376","gbif:2468081","irmng:1301983"]},{"id":"name3","originalLabel":"Gallus","selectedForAction":false,"defaultSortOrder":2,"ottId":153562,"ottTaxonName":"Gallus","taxonomicSources":["ncbi:9030","gbif:2473720","irmng:1278118"]},{"id":"name4","originalLabel":"Homo","selectedForAction":false,"defaultSortOrder":3,"ottId":770309,"ottTaxonName":"Homo","taxonomicSources":["ncbi:9605","gbif:2436435","irmng:1160674","irmng:1035772"]},{"id":"name5","originalLabel":"Monodelphis","selectedForAction":false,"defaultSortOrder":4,"ottId":122359,"ottTaxonName":"Monodelphis","taxonomicSources":["ncbi:13615","gbif:7967492","irmng:1325350"]},{"id":"name6","originalLabel":"Ornithorhynchus","selectedForAction":false,"defaultSortOrder":5,"ottId":962391,"ottTaxonName":"Ornithorhynchus","taxonomicSources":["ncbi:9257","gbif:2433375","irmng:1107086"]},{"id":"name7","originalLabel":"Taeniopygia","selectedForAction":false,"defaultSortOrder":6,"ottId":708325,"ottTaxonName":"Taeniopygia","taxonomicSources":["ncbi:59728","gbif:2493632","irmng:1265687"]},{"id":"name8","originalLabel":"Xenopus","selectedForAction":false,"defaultSortOrder":7,"ottId":465090,"ottTaxonName":"Xenopus (genus in 
Deuterostomia)","taxonomicSources":["ncbi:8353","irmng:1382944"]},{"id":"name9","originalLabel":"alligator","selectedForAction":false,"defaultSortOrder":8,"ottId":335593,"ottTaxonName":"Alligator","taxonomicSources":["ncbi:8495","gbif:2441367","irmng:1039645"]},{"id":"name10","originalLabel":"emys_orbicularis","selectedForAction":false,"defaultSortOrder":9,"ottId":733093,"ottTaxonName":"Emys orbicularis","taxonomicSources":["ncbi:82168","gbif:5220538","irmng:11010173"]},{"id":"name11","originalLabel":"phrynops","selectedForAction":false,"defaultSortOrder":10,"ottId":66456,"ottTaxonName":"Phrynops","taxonomicSources":["ncbi:8462","gbif:2442114","irmng:1287776","irmng:1201383"]},{"id":"name12","originalLabel":"caiman","selectedForAction":false,"defaultSortOrder":11,"ottId":335589,"ottTaxonName":"Caiman","taxonomicSources":["ncbi:8497","gbif:5220195","irmng:1010136"]},{"id":"name13","originalLabel":"caretta","selectedForAction":false,"defaultSortOrder":12,"ottId":66463,"ottTaxonName":"Caretta","taxonomicSources":["ncbi:8466","worms:137066","gbif:2442177","irmng:1324374"]},{"id":"name14","originalLabel":"python","selectedForAction":false,"defaultSortOrder":13,"ottId":675102,"ottTaxonName":"Python","taxonomicSources":["ncbi:37579","gbif:2454645","irmng:1031494"]},{"id":"name15","originalLabel":"chelonoidis_nigra","selectedForAction":false,"defaultSortOrder":14,"ottId":284917,"ottTaxonName":"Chelonoidis nigra","taxonomicSources":["ncbi:66189","gbif:7661602","gbif:5220266"]},{"id":"name16","originalLabel":"podarcis","selectedForAction":false,"defaultSortOrder":15,"ottId":937560,"ottTaxonName":"Podarcis","taxonomicSources":["ncbi:42163","gbif:2468993","irmng:1304163"]}]}
1 change: 1 addition & 0 deletions tests/data/tiny_test_example/main.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"metadata":{"name":"Untitled nameset","description":"","authors":[],"date_created":"2019-08-09T18:42:40.146Z","last_saved":"2019-08-09T18:44:09.460Z","save_count":1,"previous_filename":null,"latest_ott_version":null},"mappingHints":{"description":"Aids for mapping listed names to OTT taxa","searchContext":"All life","useFuzzyMatching":false,"substitutions":[{"active":true,"old":"S_","new":"Senecio ","valid":true}]},"names":[{"id":"name1","originalLabel":"2029_doronicum","selectedForAction":false,"defaultSortOrder":0,"ottId":318436,"ottTaxonName":"Senecio doronicum","taxonomicSources":["ncbi:462523"]},{"id":"name2","originalLabel":"S_doronicum","selectedForAction":false,"defaultSortOrder":1,"ottId":318436,"ottTaxonName":"Senecio doronicum","taxonomicSources":["ncbi:462523"]},{"id":"name3","originalLabel":"S_lagascanus","selectedForAction":false,"defaultSortOrder":2,"ottId":640718,"ottTaxonName":"Senecio lagascanus","taxonomicSources":["ncbi:1268580"]},{"id":"name4","originalLabel":"S_lopezii","selectedForAction":false,"defaultSortOrder":3,"ottId":688688,"ottTaxonName":"Senecio lopezii","taxonomicSources":["ncbi:1268581"]},{"id":"name5","originalLabel":"S_scopolii","selectedForAction":false,"defaultSortOrder":4,"ottId":688671,"ottTaxonName":"Senecio scopolii","taxonomicSources":["ncbi:1268589"]}]}
130 changes: 86 additions & 44 deletions tests/test_owndata.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,71 +3,113 @@
import json
from physcraper import generate_ATT_from_files, AlignTreeTax, OtuJsonDict, ConfigObj, IdDicts
from pytest import mark
from physcraper.opentree_helpers import bulk_tnrs_load



web = mark.web


def test_owndata():
seqaln= "tests/data/tiny_test_example/test.fas"
mattype="fasta"
trfn= "tests/data/tiny_test_example/test.tre"
schema_trf = "newick"
workdir="tests/output/owndata"
configfi = "tests/data/test.config"
id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
otu_jsonfi = "{}/otu_dict.json".format(workdir)

"""Tests if your own input files will generate a data object of class AlignTreeTax
"""
seqaln= "tests/data/tiny_test_example/test.fas"
mattype="fasta"
trfn= "tests/data/tiny_test_example/test.tre"
schema_trf = "newick"
workdir="tests/output/owndata"
configfi = "tests/data/test.config"
id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
otu_jsonfi = "{}/otu_dict.json".format(workdir)

if not os.path.exists("{}".format(workdir)):
os.makedirs("{}".format(workdir))
"""Tests if your own input files will generate a data object of class AlignTreeTax
"""

conf = ConfigObj(configfi)
ids = IdDicts(configfi)
if not os.path.exists("{}".format(workdir)):
os.makedirs("{}".format(workdir))

if os.path.exists(otu_jsonfi):
print("load json")
otu_json = json.load(open(otu_jsonfi))
else:
otu_json = OtuJsonDict(id_to_spn, ids)
json.dump(otu_json, open(otu_jsonfi,"w"))
ids = IdDicts()
if os.path.exists(otu_jsonfi):
print("load json")
otu_json = json.load(open(otu_jsonfi))
else:
otu_json = OtuJsonDict(id_to_spn, ids)
json.dump(otu_json, open(otu_jsonfi,"w"))

data_obj = generate_ATT_from_files(alnfile=seqaln,
aln_schema=mattype,
workdir=workdir,
configfile=configfi,
treefile=trfn,
tree_schema = schema_trf,
otu_json=otu_jsonfi,
search_taxon=None)
data_obj = generate_ATT_from_files(alnfile=seqaln,
aln_schema=mattype,
workdir=workdir,
configfile=configfi,
treefile=trfn,
tree_schema = schema_trf,
otu_json=otu_jsonfi,
search_taxon=None)


assert isinstance(data_obj, AlignTreeTax)
assert isinstance(data_obj, AlignTreeTax)


import physcraper
from dendropy import DnaCharacterMatrix

@web
def test_opentree():
# Use OpenTree phylesystem identifiers to get study and tree
study_id = "pg_873"
tree_id = "tree1679"
seqaln = "tests/data/minitest.fas"
mattype = "fasta"
workdir = "tests/output/opentree"
configfi = "tests/data/remotencbi.config"

sys.stdout.write("\nTesting 'opentree scrape (1 round)'\n")
conf = physcraper.ConfigObj(configfi, interactive=False)
# Use OpenTree phylesystem identifiers to get study and tree
study_id = "pg_873"
tree_id = "tree1679"
seqaln = "tests/data/minitest.fas"
mattype = "fasta"
workdir = "tests/output/opentree"
configfi = "tests/data/remotencbi.config"

sys.stdout.write("\nTesting 'opentree scrape (1 round)'\n")
conf = physcraper.ConfigObj(configfi, interactive=False)
# print "1. {}".format(conf.email)

aln = DnaCharacterMatrix.get(path=seqaln, schema=mattype)
data_obj = physcraper.generate_ATT_from_phylesystem(alnfile=aln,
workdir=workdir,
aln = DnaCharacterMatrix.get(path=seqaln, schema=mattype)
data_obj = physcraper.generate_ATT_from_phylesystem(alnfile=aln,
workdir=workdir,
configfile=conf,
study_id=study_id,
tree_id=tree_id)
assert isinstance(data_obj, AlignTreeTax)
assert isinstance(data_obj, AlignTreeTax)

def test_load_json():
    """Check that bulk_tnrs_load reads every name from the bulk TNRS fixture.

    The fixture tests/data/bulk_tnrs.json lists 16 names, so the resulting
    otu dictionary is expected to hold 16 entries.
    """
    inputfi = "tests/data/bulk_tnrs.json"
    otu_dict = bulk_tnrs_load(inputfi)
    assert len(otu_dict) == 16
    print(otu_dict)


# The function was previously named `load_json`, which pytest does not collect
# (no `test` prefix), so its assertion never ran. Keep the old name as an alias
# for anything that still calls it directly.
load_json = test_load_json



def test_owndata_bulktnrs():
    """Tests if your own input files will generate a data object of class AlignTreeTax

    The otu dictionary is built with bulk_tnrs_load from a bulk TNRS JSON
    file and handed to generate_ATT_from_files as a dict rather than a path.
    """
    seqaln = "tests/data/tiny_test_example/test.fas"
    mattype = "fasta"
    trfn = "tests/data/tiny_test_example/test.tre"
    schema_trf = "newick"
    workdir = "tests/output/owndata"
    configfi = "tests/data/test.config"
    otu_jsonfi = "tests/data/tiny_test_example/main.json"

    if not os.path.exists(workdir):
        os.makedirs(workdir)

    # NOTE(review): `conf` is not used below (the config *path* is passed on);
    # kept in case constructing ConfigObj is relied on -- confirm and remove.
    conf = ConfigObj(configfi)

    otu_dict = bulk_tnrs_load(otu_jsonfi)
    print(otu_dict)
    data_obj = generate_ATT_from_files(alnfile=seqaln,
                                       aln_schema=mattype,
                                       workdir=workdir,
                                       configfile=configfi,
                                       treefile=trfn,
                                       tree_schema=schema_trf,
                                       otu_json=otu_dict,
                                       search_taxon=None)

    assert isinstance(data_obj, AlignTreeTax)


# Only run the test directly when this file is executed as a script; an
# unguarded call would execute the test on every import, including during
# pytest collection.
if __name__ == "__main__":
    test_owndata_bulktnrs()

0 comments on commit 6400df5

Please sign in to comment.