Skip to content

Commit

Permalink
Merge pull request #75 from McTavishLab/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
snacktavish committed May 5, 2020
2 parents f4d3e50 + 25cc3c7 commit d157c99
Show file tree
Hide file tree
Showing 14 changed files with 284 additions and 132 deletions.
31 changes: 0 additions & 31 deletions bin/opentree_scrape.py

This file was deleted.

98 changes: 98 additions & 0 deletions bin/physcraper_run.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
#!/usr/bin/env python
import argparse
import os
import sys
import physcraper
from physcraper.opentree_helpers import get_tree_from_study, scraper_from_opentree, get_max_match_aln, count_match_tree_to_aln

parser = argparse.ArgumentParser()
parser.add_argument("-s","--study_id", help="OpenTree study id")
parser.add_argument("-t","--tree_id", help="OpenTree tree id")
parser.add_argument("-tl", "--tree_link", help="Link to tree to update on OpenTree")
parser.add_argument("-a","--alignment", help="path to alignment")
parser.add_argument("-as","--aln_schema", help="alignment schema (nexus or fasta)")
parser.add_argument("-db", "--blast_db", help="local download of blast database")
parser.add_argument("-o","--output", help="path to output directory")
parser.add_argument("-tx","--taxonomy", help="path to taxonomy")
parser.add_argument("-c","--config_file", help="path to config file")
parser.add_argument("-tb","--treebase", action="store_true", help="download alignment from treebase")
parser.add_argument("-no_est","--no_estimate_tree", action='store_true', help="run blast search and estimate tree")


#Not yet implemented
parser.add_argument("-bl","--blast_sequence", action='store_true', help="run blast search, and align but do not estimate tree")
parser.add_argument("-d","--download_data", action='store_true', help="write out tree and alignment, without blasting")
parser.add_argument("-bs","--bootstrap", help="number of bootstrap reps")
parser.add_argument("-gt","--get_tree", action='store_true', help="get tree from opentree")
parser.add_argument("-ga","--get_aln", action='store_true', help="get alignment from opentree")
parser.add_argument("-tf", "--tree_file", help="path to your tree")
parser.add_argument("-l","--linker_file", help="path to .csv linking tip labels to taxon names")


args = parser.parse_args()

assert(args.output), "Output directory (-o) is required."
workdir = args.output

if args.config_file:
conf = physcraper.ConfigObj(args.configfile)
else:
conf = physcraper.ConfigObj()

if args.taxonomy:
conf.taxonomy_dir = args.taxonomy


if args.blast_db:
conf.blast_loc = "local"
conf.blastdb = args.blast_db
conf.set_local()

study_id = None
if args.tree_link:
linkl = args.tree_link.split("/")
assert(linkl[4]=="view")
study_id == linkl[5]
tree_id = linkl[-1].split("=")[1]


if args.study_id:
study_id = args.study_id
tree_id = args.tree_id

if args.alignment:
alnfile = args.alignment
assert(args.aln_schema)
aln_schema = args.aln_schema


if args.treebase:
aln_schema = "nexus"
alnfile = "{}/{}{}.aln".format(workdir, study_id, tree_id)
if not os.path.exists(workdir):
os.makedirs(workdir)

tre, cite = get_tree_from_study(study_id, tree_id)
sys.stdout.write("downloading best match alignment from treebase")
tre.write(path="{}/{}{}.tre".format(workdir, study_id, tree_id), schema="nexus")
if not os.path.exists(alnfile):
sys.stdout.write("downloading best match alignment from treebase")
dataset = physcraper.opentree_helpers.get_dataset_from_treebase(study_id)
aln = get_max_match_aln(tre, dataset)
aln.write(path=alnfile, schema = aln_schema)
else:
sys.stdout.write("Using alignment file found at {}.".format(alnfile))

if study_id:
scraper = scraper_from_opentree(study_id =study_id,
tree_id = tree_id,
alnfile = alnfile,
aln_schema = aln_schema,
workdir = workdir,
configfile = conf)
sys.stdout.write("{} taxa in alignment and tree\n".format(len(scraper.data.aln)))

if not args.no_estimate_tree:
#scraper.read_blast_wrapper()
scraper.est_full_tree()
scraper.data.write_labelled(label='^ot:ottTaxonName')
29 changes: 29 additions & 0 deletions cond_env.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@

name: physcraper_env
channels:
- bioconda
- conda-forge
dependencies:
- python>=3.5
- pip

- biopython
- configparser
- coverage
- dendropy
- numpy

- pandas
- requests
- sh
- urllib3>=1.23
- pytest
- pytest-cov
- pytest-xdist
- recommonmark

- raxml
- blast
- muscle

##datetime and opentree need install seperately
14 changes: 0 additions & 14 deletions dev/peyotl.config

This file was deleted.

59 changes: 55 additions & 4 deletions mds/INSTALL.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,21 @@
git clone git@github.com:McTavishLab/physcraper.git
```

# Install using conda
Install anaconda

```
conda env create -f cond_env.yml
conda activate physcraper_env
# This next step is temprary until opentree changes are uploaded to pypi
pip install -e git+https://github.com/OpenTreeOfLife/python-opentree@get-tree#egg=opentree
```


# INstall using Virtual Env
## Create a python virtual environment

Recommended install procedure is using a virtual environment (are there any other ways?). From your terminal do:

```
virtualenv venv-physcraper
Expand Down Expand Up @@ -41,10 +53,50 @@ which raxmlHPC
```


#### Python packages:
# Databases

The tool can be run locally using databases, which can be downloaded and updated from the National Center for Biotechnology Information ([NCBI](https://www.ncbi.nlm.nih.gov/)).

To blast locally you will need to install blast command line tools.
Instructions at
https://www.ncbi.nlm.nih.gov/books/NBK279671/
https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/


e.g. on linux:
wget https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ncbi-blast-2.10.0+-x64-linux.tar.gz
tar -xzvf ncbi-blast-2.10.0+-x64-linux.tar.gz

The binaries are in /bin


If you want to download the blast database and taxonomy for faster local searches
NOTE: this download can take several hours, depending on your internet connection.

```
mkdir local_blast_db
cd local_blast_db
update_blastdb nt
cat *.tar.gz | tar -xvzf - -i
update_blastdb taxdb
gunzip -cd taxdb.tar.gz | (tar xvf - )
```

# Download the the nodes and names dowloads in tothe physcraper/taxonomy directory

```
cd physcraper/taxonomy
wget 'ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz'
gunzip -f -cd taxdump.tar.gz | (tar xvf - names.dmp nodes.dmp)
```





# Python packages:
These will all be installed if you install physcraper using `python setup.py install`

(but note, if you are using virtualenv there are some weird interactions with setuptools and python 2.7.6)

- Dendropy https://pythonhosted.org/DendroPy/
- Peyotl https://github.com/OpenTreeOfLife/peyotl (currently needs to be on physcraper branch)
Expand All @@ -53,7 +105,6 @@ These will all be installed if you install physcraper using `python setup.py ins

## Databases

The tool can be run locally using databases, which can be downloaded and updated from the National Center for Biotechnology Information ([NCBI](https://www.ncbi.nlm.nih.gov/)).

[Previous: Back home](../README.md)

Expand Down
9 changes: 9 additions & 0 deletions physcraper/aligntreetax.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,9 @@ def __init__(self, tree, otu_dict, alignment, ingroup_mrca, workdir, configfile=
self.config = ConfigObj(configfile)
if not os.path.exists("{}/run.config".format(self.workdir)):
shutil.copyfile(self.config.configfi, "{}/run.config".format(self.workdir))
else:
assert(isinstance(configfile, ConfigObj)),type(configfile)
self.config = configfile
assert(self.config)
## Read Alignment
assert (self.tre.taxon_namespace is self.aln.taxon_namespace), "tre and aln taxon_namespace are not identical"
Expand Down Expand Up @@ -231,8 +234,14 @@ def read_in_aln(self, alignment, aln_schema, namespace=None):
assert os.path.exists(alignment)
##Check namespace
self.aln = DnaCharacterMatrix.get(path=alignment, schema=aln_schema, taxon_namespace = self.tns)
empty = set()
for tax, seq in self.aln.items():
tax.label = tax.label.replace(" ","_")
if len(str(seq).replace("?","").replace("-","")) == 0:
empty.add(tax)
self.aln.remove_sequences(empty)
msg = ", ".join([str(tax) for tax in list(empty)])
sys.stdout.write("All gap taxa {}\n".format(msg))
#elif isinstance(alignment, datamodel.charmatrixmodel.DnaCharacterMatrix):
# self.aln = alignment
assert isinstance(self.aln, datamodel.charmatrixmodel.DnaCharacterMatrix), \
Expand Down
63 changes: 32 additions & 31 deletions physcraper/configobj.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import os
import datetime
import configparser
import shutil

from physcraper import ncbi_data_parser # is the ncbi data parser class and associated functions

Expand Down Expand Up @@ -65,8 +66,8 @@ class ConfigObj(object):
* if blastloc == local:
* self.blastdb: this defines the path to the local blast database
* self.ncbi_parser_nodes_fn: path to 'nodes.dmp' file, that contains the hierarchical information
* self.ncbi_parser_names_fn: path to 'names.dmp' file, that contains the different ID's
* self.ncbi_nodes: path to 'nodes.dmp' file, that contains the hierarchical information
* self.ncbi_names: path to 'names.dmp' file, that contains the different ID's
"""

def __init__(self, configfile = None, interactive=False):
Expand Down Expand Up @@ -119,40 +120,13 @@ def read_config(self, configfi, interactive):
assert os.path.isfile(self.ott_ncbi), (
"file `%s` does not exists" % self.ott_ncbi
)


self.blast_loc = config["blast"]["location"]
assert self.blast_loc in ["local", "remote"], (
"your blast location `%s` is not remote or local" % self.email
)
if self.blast_loc == "local":
self.blastdb = config["blast"]["localblastdb"]
self.ncbi_parser_nodes_fn = "{}/nodes.dmp".format(self.taxonomy_dir)
self.ncbi_parser_names_fn = "{}/names.dmp".format(self.taxonomy_dir)
if not os.path.isdir(self.blastdb):
sys.stderr.write("Local Blast DB not found at {}, please use a remote search, or update as described in 'taxonomy/update_blast_db'\n".format(self.blastdb))
sys.exit()
if not os.path.exists("{}/nt.60.nhr".format(self.blastdb)):
sys.stderr.write("Errors with local Blast DB at {}, please use a remote search, or update as described in 'taxonomy/update_blast_db'\n".format(self.blastdb))
sys.exit()
else:
download_date = os.path.getmtime("{}/nt.60.nhr".format(self.blastdb))
download_date = datetime.datetime.fromtimestamp(download_date)
today = datetime.datetime.now()
time_passed = (today - download_date).days
if time_passed >= 90:
sys.stderr.write("Your databases might not be up to date anymore. You downloaded them {} days ago. Continuing, but perhaps use a remote search, or update as decribed in 'taxonomy/update_blast_db'\n".format(time_passed))
if not os.path.exists(self.ncbi_parser_nodes_fn):
sys.stderr.write("NCBI taxonomy not found at {} - please update nodes and names.dmp, as described in 'taxonomy/update_blast_db'\n".format(self.ncbi_parser_nodes_fn))
sys.exit()
else:
download_date = os.path.getmtime(self.ncbi_parser_nodes_fn)
download_date = datetime.datetime.fromtimestamp(download_date)
today = datetime.datetime.now()
time_passed = (today - download_date).days
if time_passed >= 90:
sys.stderr.write("Your taxonomy databases from NCBI were dowloaded {} days ago. Please update nodes and names.dmp, as described in 'taxonomy/update_blast_db'\n".format(time_passed))
self.url_base = None
self.set_local()
if self.blast_loc == "remote":
self.url_base = config["blast"].get("url_base")
if _DEBUG:
Expand Down Expand Up @@ -190,4 +164,31 @@ def read_config(self, configfi, interactive):
assert 1 < self.maxlen, (
"value `%s` is not larger than 1" % self.maxlen
)

def set_local(self):
self.ncbi_nodes = "{}/nodes.dmp".format(self.taxonomy_dir)
self.ncbi_names = "{}/names.dmp".format(self.taxonomy_dir)
if not os.path.isdir(self.blastdb):
sys.stderr.write("Local Blast DB not found at {}, please use a remote search, or update as described in 'taxonomy/update_blast_db'\n".format(self.blastdb))
sys.exit()
if not os.path.exists("{}/nt.23.nhr".format(self.blastdb)):
sys.stderr.write("Errors with local Blast DB at {}, may be incomplete. please use a remote search, or update as described in 'taxonomy/update_blast_db'\n".format(self.blastdb))
sys.exit()
else:
download_date = os.path.getmtime("{}/nt.23.nhr".format(self.blastdb))
download_date = datetime.datetime.fromtimestamp(download_date)
today = datetime.datetime.now()
time_passed = (today - download_date).days
if time_passed >= 90:
sys.stderr.write("Your databases might not be up to date anymore. You downloaded them {} days ago. Continuing, but perhaps use a remote search, or update as decribed in 'taxonomy/update_blast_db'\n".format(time_passed))
if not os.path.exists(self.ncbi_nodes):
sys.stderr.write("NCBI taxonomy not found at {} - please update nodes and names.dmp, as described in 'taxonomy/update_blast_db'\n".format(self.ncbi_nodes))
sys.exit()
else:
download_date = os.path.getmtime(self.ncbi_nodes)
download_date = datetime.datetime.fromtimestamp(download_date)
today = datetime.datetime.now()
time_passed = (today - download_date).days
if time_passed >= 90:
sys.stderr.write("Your taxonomy databases from NCBI were dowloaded {} days ago. Please update nodes and names.dmp, as described in 'taxonomy/update_blast_db'\n".format(time_passed))
assert(shutil.which("blastn")), "blastn not found in path"
self.url_base = None

0 comments on commit d157c99

Please sign in to comment.