Skip to content

Commit

Permalink
Merge pull request #79 from McTavishLab/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
snacktavish committed May 8, 2020
2 parents d157c99 + c026122 commit 0024799
Show file tree
Hide file tree
Showing 69 changed files with 6,218 additions and 12,783 deletions.
31 changes: 28 additions & 3 deletions bin/physcraper_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import sys
import physcraper
from physcraper.opentree_helpers import get_tree_from_study, scraper_from_opentree, get_max_match_aln, count_match_tree_to_aln
from physcraper.aligntreetax import generate_ATT_from_run

parser = argparse.ArgumentParser()
parser.add_argument("-s","--study_id", help="OpenTree study id")
Expand All @@ -15,10 +16,14 @@
parser.add_argument("-o","--output", help="path to output directory")
parser.add_argument("-tx","--taxonomy", help="path to taxonomy")
parser.add_argument("-c","--config_file", help="path to config file")
# FIX: "balst" -> "blast" in user-facing help text
parser.add_argument("-e","--email", help="email address for ncbi blast searches")
# FIX: "configureation" -> "configuration"
parser.add_argument("-re","--reload_files", help="reload files and configuration from dir")
parser.add_argument("-tag","--tag", help="gene name or other specifier")
parser.add_argument("-tb","--treebase", action="store_true", help="download alignment from treebase")
# FIX: help described the opposite of what the flag does; setting this flag
# skips the tree-estimation step (see the `if not args.no_estimate_tree:` guard below)
parser.add_argument("-no_est","--no_estimate_tree", action='store_true', help="run blast search and alignment, but do not estimate a tree")



#Not yet implemented
parser.add_argument("-bl","--blast_sequence", action='store_true', help="run blast search, and align but do not estimate tree")
parser.add_argument("-d","--download_data", action='store_true', help="write out tree and alignment, without blasting")
Expand All @@ -34,8 +39,13 @@
assert(args.output), "Output directory (-o) is required."
workdir = args.output


# Pick the run configuration: an explicit --config_file wins, then the config
# saved by a previous run (--reload_files), otherwise library defaults.
if args.config_file:
    # BUG FIX: argparse stores "--config_file" as args.config_file;
    # the original `args.configfile` raised AttributeError at runtime.
    conf = physcraper.ConfigObj(args.config_file)
elif args.reload_files:
    configfile = "{}/run.config".format(args.reload_files)
    conf = physcraper.ConfigObj(configfile)
    sys.stdout.write("Using config file {}\n".format(configfile))
else:
    conf = physcraper.ConfigObj()

Expand All @@ -55,6 +65,8 @@
study_id == linkl[5]
tree_id = linkl[-1].split("=")[1]

if args.email:
conf.email = args.email

if args.study_id:
study_id = args.study_id
Expand All @@ -73,15 +85,14 @@
os.makedirs(workdir)

tre, cite = get_tree_from_study(study_id, tree_id)
sys.stdout.write("downloading best match alignment from treebase")
tre.write(path="{}/{}{}.tre".format(workdir, study_id, tree_id), schema="nexus")
if not os.path.exists(alnfile):
sys.stdout.write("downloading best match alignment from treebase")
sys.stdout.write("downloading best match alignment from treebase\n")
dataset = physcraper.opentree_helpers.get_dataset_from_treebase(study_id)
aln = get_max_match_aln(tre, dataset)
aln.write(path=alnfile, schema = aln_schema)
else:
sys.stdout.write("Using alignment file found at {}.".format(alnfile))
sys.stdout.write("Using alignment file found at {}.\n".format(alnfile))

if study_id:
scraper = scraper_from_opentree(study_id =study_id,
Expand All @@ -91,6 +102,20 @@
workdir = workdir,
configfile = conf)
sys.stdout.write("{} taxa in alignment and tree\n".format(len(scraper.data.aln)))

scraper.data.write_files()
scraper.data.write_otus(schema="json")

if args.reload_files:
    # Determine the run tag: explicit --tag wins, else derive it from the
    # alignment file name; leave None to let generate_ATT_from_run autodetect.
    tag = None
    if args.tag:
        tag = args.tag
    elif args.alignment:
        tag = args.alignment.split('/')[-1].split('.')[0]
    # BUG FIX: `tag` was computed but never passed on, so the reload always
    # relied on autodetection (and `tag` could be unbound if neither
    # --tag nor --alignment was set).
    data_obj = generate_ATT_from_run(args.reload_files, tag=tag, configfile=conf)
    ids = physcraper.IdDicts(conf)
    scraper = physcraper.PhyscraperScrape(data_obj, ids)
    sys.stdout.write("Reloaded {} taxa in alignment and tree\n".format(len(scraper.data.aln)))


if not args.no_estimate_tree:
#scraper.read_blast_wrapper()
Expand Down
18 changes: 1 addition & 17 deletions docs/examples/example.config
Original file line number Diff line number Diff line change
Expand Up @@ -49,21 +49,5 @@ trim_perc = 0.75
# max length for values to add to aln
max_len = 2.5


#######
## INTERNAL PHYSCRAPER SETTINGS
#---------------------------------------------------------------------------------
#Things below here you should not need to change!

#Only required if blast location is local
[ncbi_parser]
nodes_fn = ./taxonomy/nodes.dmp
names_fn = ./taxonomy/names.dmp

[phylesystem]
location = api
#local or api, leave set to api unless you have installed phylesystem locally

[taxonomy]
#You should not need to change any of these!
#path = taxonomy
taxonomy_path = taxonomy
95 changes: 55 additions & 40 deletions mds/INSTALL.md
Original file line number Diff line number Diff line change
@@ -1,17 +1,15 @@
[Back home](../README.md)


# Installing `physcraper`
# I. Installing physcraper

## Preinstallation requirements

## Download `physcraper`
## 1. Downloading `physcraper`

```
git clone git@github.com:McTavishLab/physcraper.git
```

# Install using conda
## 2A. Install using conda
Install anaconda

```
Expand All @@ -23,24 +21,48 @@ Install anaconda
```


# INstall using Virtual Env
## Create a python virtual environment
## 2B. Install using Virtual Env
### 1. Create a python virtual environment


```
virtualenv venv-physcraper
```


## Activate the installed virtual environment
### 2. Activate the installed virtual environment

Once you have a venv-physcraper directory, **_activate_** it with:

```
source venv-physcraper/bin/activate
```
Remember that you will have to activate the virtual environment every time you want to run `physcraper`.

### 3. Install `physcraper` inside the virtual environment with

```
python setup.py install
```

## Dependencies
This will install the following python packages also:

- Dendropy https://pythonhosted.org/DendroPy/
- Peyotl https://github.com/OpenTreeOfLife/peyotl (currently needs to be on physcraper branch)
- Biopython http://biopython.org/wiki/Download
- ConfigParser


### 4. Come out of the virtual environment:

```
deactivate
```

Do this after you are finished working with physcraper.


# II. Checking for dependencies

Currently complete phylogenetic updating WITH `physcraper` requires
[raxmlHPC](http://sco.h-its.org/exelixis/web/software/raxml/index.html) and [MUSCLE](install-muscle.md) to be installed and in the path.
Expand All @@ -53,36 +75,44 @@ which raxmlHPC
```


# Databases
# III. Local Databases

The tool can be run using local databases, which can be downloaded and updated from the National Center for Biotechnology Information ([NCBI](https://www.ncbi.nlm.nih.gov/)).

The tool can be run locally using databases, which can be downloaded and updated from the National Center for Biotechnology Information ([NCBI](https://www.ncbi.nlm.nih.gov/)).
### 1. Installing blast command line tools

To blast locally you will need to install blast command line tools.
Instructions at
To blast locally you will need to install blast command line tools first.
Find general instructions at
https://www.ncbi.nlm.nih.gov/books/NBK279671/
https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/


e.g. on linux:
e.g. installing blast command line tools on linux:

```
wget https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ncbi-blast-2.10.0+-x64-linux.tar.gz
tar -xzvf ncbi-blast-2.10.0+-x64-linux.tar.gz

The binaries are in /bin
```

The binaries/scripts/executables will be installed in the `/bin` folder.

### 2. Downloading the NCBI database

If you want to download the NCBI blast database and taxonomy for faster local searches
note that the download can take several hours, depending on your internet connection.

If you want to download the blast database and taxonomy for faster local searches
NOTE: this download can take several hours, depending on your internet connection.
This is what you should do:

```
mkdir local_blast_db
cd local_blast_db
update_blastdb nt
cat *.tar.gz | tar -xvzf - -i
update_blastdb taxdb
gunzip -cd taxdb.tar.gz | (tar xvf - )
mkdir local_blast_db # create the folder to save the database
cd local_blast_db # move to the newly created folder
update_blastdb nt # download the NCBI nucleotide databases
cat *.tar.gz | tar -xvzf - -i # unzip the nucleotide databases
update_blastdb taxdb # download the NCBI taxonomy database
gunzip -cd taxdb.tar.gz | (tar xvf - ) # unzip the taxonomy database
```

# Download the the nodes and names dowloads in tothe physcraper/taxonomy directory
### 3. Downloading the nodes and names into the physcraper/taxonomy directory

```
cd physcraper/taxonomy
Expand All @@ -91,21 +121,6 @@ NOTE: this download can take several hours, depending on your internet connectio
```





# Python packages:
These will all be installed if you install physcraper using `python setup.py install`


- Dendropy https://pythonhosted.org/DendroPy/
- Peyotl https://github.com/OpenTreeOfLife/peyotl (currently needs to be on physcraper branch)
- Biopython http://biopython.org/wiki/Download
- ConfigParser

## Databases


[Previous: Back home](../README.md)

[Next: Running `physcraper`](running.md)
48 changes: 34 additions & 14 deletions physcraper/aligntreetax.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,23 +55,27 @@ def generate_ATT_from_files(workdir,
return AlignTreeTax(treefile, otu_dict, alnfile, ingroup_mrca=mrca_ott, workdir=workdir,
configfile=configfile, tree_schema=tree_schema)

def generate_ATT_from_run(workdir, tag=None, configfile=None):
    """Rebuild an ATT object from the files written by a previous physcraper run.

    :param workdir: directory holding physcraper_<tag>.fas / .tre,
        otu_info_<tag>.json, mrca.txt and (optionally) run.config
    :param tag: run tag used in the output file names; if None it is
        autodetected from the first 'physcraper_*' file found in workdir
    :param configfile: path to a config file; if None, workdir/run.config is
        used when present
    :return: object of class AlignTreeTax
    """
    assert os.path.exists(workdir)
    # BUG FIX: the directory scan used to clobber an explicitly passed tag;
    # only autodetect when the caller did not supply one.
    if tag is None:
        for fname in os.listdir(workdir):
            if fname.startswith('physcraper_'):
                tag = fname.split('.')[0].replace('physcraper_', '')
    sys.stdout.write("Reloading files with tag {}\n".format(tag))
    if configfile is None:
        if os.path.exists("{}/run.config".format(workdir)):
            configfile = "{}/run.config".format(workdir)
    # Use the replaced (tagged) alignment/tree/otu files as input.
    alnfi = "{}/physcraper_{}.fas".format(workdir, tag)
    treefile = "{}/physcraper_{}.tre".format(workdir, tag)
    otu_json = "{}/otu_info_{}.json".format(workdir, tag)
    with open(otu_json, "r") as otu_fh:  # FIX: close file handles (was a leak)
        otu_dict = json.load(otu_fh)
    # FIX: removed duplicated `mrca_ott = mrca_ott =` assignment.
    with open("{}/mrca.txt".format(workdir)) as mrca_fh:
        mrca_ott = int(mrca_fh.readline().split()[-1])
    return AlignTreeTax(tree=treefile, otu_dict=otu_dict, alignment=alnfi, ingroup_mrca=mrca_ott,
                        workdir=workdir, configfile=configfile, tag=tag, tree_schema='newick')


#def concatenate_ATTs(att_list, number_per_taxon='max', level='spp'):
Expand Down Expand Up @@ -164,8 +168,13 @@ class AlignTreeTax(object):
"""

def __init__(self, tree, otu_dict, alignment, ingroup_mrca, workdir, configfile=None,
tree_schema='newick',aln_schema ='fasta',taxon_namespace=None):
tree_schema='newick',aln_schema ='fasta',taxon_namespace=None, tag=None):
debug("build ATT class")
if tag == None:
self.tag = alignment.split('/')[-1].split('.')[0]
else:
self.tag = tag
print("alignment tag is {}".format(self.tag))
self.workdir = os.path.abspath(workdir)
if not os.path.exists(self.workdir):
os.makedirs(self.workdir)
Expand Down Expand Up @@ -241,7 +250,8 @@ def read_in_aln(self, alignment, aln_schema, namespace=None):
empty.add(tax)
self.aln.remove_sequences(empty)
msg = ", ".join([str(tax) for tax in list(empty)])
sys.stdout.write("All gap taxa {}\n".format(msg))
if len(empty) >= 1:
sys.stdout.write("All gap taxa {}\n".format(msg))
#elif isinstance(alignment, datamodel.charmatrixmodel.DnaCharacterMatrix):
# self.aln = alignment
assert isinstance(self.aln, datamodel.charmatrixmodel.DnaCharacterMatrix), \
Expand Down Expand Up @@ -563,16 +573,22 @@ def write_random_resolve_tre(self, treefilename='random_resolve.tre'):
fi.close()
return treepath

def write_aln(self, alnname=None, alnschema="fasta"):
    """Write the current alignment into workdir and return its absolute path.

    :param alnname: file name for the alignment; defaults to
        'physcraper_<tag>.fas' using this object's tag
    :param alnschema: schema to write the alignment in (default 'fasta')
    :return: absolute path of the written alignment file
    """
    if alnname is None:  # idiom fix: identity comparison with None
        alnname = "physcraper_{}.fas".format(self.tag)
    alnpath = "{}/{}".format(self.workdir, alnname)
    self.aln.write(path=alnpath, schema=alnschema)
    return os.path.abspath(alnpath)

def write_files(self, treepath="physcraper.tre", treeschema="newick", alnpath="physcraper.fas", alnschema="fasta"):
def write_files(self, treepath=None, treeschema="newick", alnpath=None, alnschema="fasta"):
"""Outputs both the streaming files, labeled with OTU ids.
Can be mapped to original labels using otu_dict.json or otu_seq_info.csv"""
#debug("write_files")
if alnpath == None:
alnpath = "physcraper_{}.fas".format(self.tag)
if treepath == None:
treepath = "physcraper_{}.tre".format(self.tag)
self.tre.write(path="{}/{}".format(self.workdir, treepath),
schema=treeschema, unquoted_underscores=True)
self.aln.write(path="{}/{}".format(self.workdir, alnpath),
Expand All @@ -598,8 +614,12 @@ def write_labelled(self, label, filename = "labelled", direc='workdir', norepeat
#debug("write labelled files")
if direc == 'workdir':
direc = self.workdir
treepath = "{}/{}".format(direc, "{}.tre".format(filename))
alnpath = "{}/{}".format(direc, '{}.fas'.format(filename))
if filename == "labelled":
treepath = "{}/{}_{}.tre".format(direc, filename, self.tag)
alnpath = "{}/{}_{}.fas".format(direc, filename, self.tag)
else:
treepath = "{}/{}.tre".format(direc, filename)
alnpath = "{}/{}.fas".format(direc, filename)
debug(treepath)
assert label in ['^ot:ottTaxonName', '^user:TaxonName', '^physcraper:TaxonName',
"^ot:originalLabel", "^ot:ottId", "^ncbi:taxon"]
Expand Down Expand Up @@ -652,15 +672,15 @@ def write_otus(self, filename = "otu_info", schema="table"):

assert schema in ["table", "json"]
if schema == "json":
with open("{}/{}.json".format(self.workdir, filename), "w") as outfile:
with open("{}/{}_{}.json".format(self.workdir, filename, self.tag), "w") as outfile:
json.dump(self.otu_dict, outfile)
if schema == "table":
all_keys = set()
for otu in self.otu_dict:
all_keys.update(self.otu_dict[otu].keys())
keys = list(all_keys)
header = ["otu_id"] + keys
with open("{}/{}.csv".format(self.workdir, filename), "w") as outfile:
with open("{}/{}_{}.csv".format(self.workdir, filename, self.tag), "w") as outfile:
outfile.write("\t".join(header)+"\n")
for otu in self.otu_dict:
vals = [str(self.otu_dict[otu].get(key, "-")) for key in keys]
Expand Down

0 comments on commit 0024799

Please sign in to comment.