Skip to content

Commit

Permalink
Merge pull request #79 from McTavishLab/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
snacktavish committed May 8, 2020
2 parents d157c99 + c026122 commit 0024799
Show file tree
Hide file tree
Showing 69 changed files with 6,218 additions and 12,783 deletions.
31 changes: 28 additions & 3 deletions bin/physcraper_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import sys
import physcraper
from physcraper.opentree_helpers import get_tree_from_study, scraper_from_opentree, get_max_match_aln, count_match_tree_to_aln
from physcraper.aligntreetax import generate_ATT_from_run

parser = argparse.ArgumentParser()
parser.add_argument("-s","--study_id", help="OpenTree study id")
Expand All @@ -15,10 +16,14 @@
parser.add_argument("-o","--output", help="path to output directory")
parser.add_argument("-tx","--taxonomy", help="path to taxonomy")
parser.add_argument("-c","--config_file", help="path to config file")
# FIX: "balst" -> "blast" in user-facing help text
parser.add_argument("-e","--email", help="email address for ncbi blast searches")
# FIX: "configureation" -> "configuration"
parser.add_argument("-re","--reload_files", help="reload files and configuration from dir")
parser.add_argument("-tag","--tag", help="gene name or other specifier")
parser.add_argument("-tb","--treebase", action="store_true", help="download alignment from treebase")
# FIX: help described the opposite of what the flag does; setting this flag
# skips the tree-estimation step (see the `if not args.no_estimate_tree:` guard below)
parser.add_argument("-no_est","--no_estimate_tree", action='store_true', help="run blast search and alignment, but do not estimate a tree")



#Not yet implemented
parser.add_argument("-bl","--blast_sequence", action='store_true', help="run blast search, and align but do not estimate tree")
parser.add_argument("-d","--download_data", action='store_true', help="write out tree and alignment, without blasting")
Expand All @@ -34,8 +39,13 @@
assert(args.output), "Output directory (-o) is required."
workdir = args.output


# Pick the run configuration: an explicit --config_file wins, then the config
# saved by a previous run (--reload_files), otherwise library defaults.
if args.config_file:
    # BUG FIX: argparse stores "--config_file" as args.config_file;
    # the original `args.configfile` raised AttributeError at runtime.
    conf = physcraper.ConfigObj(args.config_file)
elif args.reload_files:
    configfile = "{}/run.config".format(args.reload_files)
    conf = physcraper.ConfigObj(configfile)
    sys.stdout.write("Using config file {}\n".format(configfile))
else:
    conf = physcraper.ConfigObj()

Expand All @@ -55,6 +65,8 @@
study_id == linkl[5]
tree_id = linkl[-1].split("=")[1]

if args.email:
conf.email = args.email

if args.study_id:
study_id = args.study_id
Expand All @@ -73,15 +85,14 @@
os.makedirs(workdir)

tre, cite = get_tree_from_study(study_id, tree_id)
sys.stdout.write("downloading best match alignment from treebase")
tre.write(path="{}/{}{}.tre".format(workdir, study_id, tree_id), schema="nexus")
if not os.path.exists(alnfile):
sys.stdout.write("downloading best match alignment from treebase")
sys.stdout.write("downloading best match alignment from treebase\n")
dataset = physcraper.opentree_helpers.get_dataset_from_treebase(study_id)
aln = get_max_match_aln(tre, dataset)
aln.write(path=alnfile, schema = aln_schema)
else:
sys.stdout.write("Using alignment file found at {}.".format(alnfile))
sys.stdout.write("Using alignment file found at {}.\n".format(alnfile))

if study_id:
scraper = scraper_from_opentree(study_id =study_id,
Expand All @@ -91,6 +102,20 @@
workdir = workdir,
configfile = conf)
sys.stdout.write("{} taxa in alignment and tree\n".format(len(scraper.data.aln)))

scraper.data.write_files()
scraper.data.write_otus(schema="json")

if args.reload_files:
    # Determine the run tag: explicit --tag wins, else derive it from the
    # alignment file name; leave None to let generate_ATT_from_run autodetect.
    tag = None
    if args.tag:
        tag = args.tag
    elif args.alignment:
        tag = args.alignment.split('/')[-1].split('.')[0]
    # BUG FIX: `tag` was computed but never passed on, so the reload always
    # relied on autodetection (and `tag` could be unbound if neither
    # --tag nor --alignment was set).
    data_obj = generate_ATT_from_run(args.reload_files, tag=tag, configfile=conf)
    ids = physcraper.IdDicts(conf)
    scraper = physcraper.PhyscraperScrape(data_obj, ids)
    sys.stdout.write("Reloaded {} taxa in alignment and tree\n".format(len(scraper.data.aln)))


if not args.no_estimate_tree:
#scraper.read_blast_wrapper()
Expand Down
18 changes: 1 addition & 17 deletions docs/examples/example.config
Original file line number Diff line number Diff line change
Expand Up @@ -49,21 +49,5 @@ trim_perc = 0.75
# max length for values to add to aln
max_len = 2.5


#######
## INTERNAL PHYSCRAPER SETTINGS
#---------------------------------------------------------------------------------
#Things below here you should not need to change!

#Only required if blast location is local
[ncbi_parser]
nodes_fn = ./taxonomy/nodes.dmp
names_fn = ./taxonomy/names.dmp

[phylesystem]
location = api
#local or api, leave set to api unless you have installed phylesystem locally

[taxonomy]
#You should not need to change any of these!
#path = taxonomy
taxonomy_path = taxonomy
95 changes: 55 additions & 40 deletions mds/INSTALL.md
Original file line number Diff line number Diff line change
@@ -1,17 +1,15 @@
[Back home](../README.md)


# Installing `physcraper`
# I. Installing physcraper

## Preinstallation requirements

## Download `physcraper`
## 1. Downloading `physcraper`

```
git clone git@github.com:McTavishLab/physcraper.git
```

# Install using conda
## 2A. Install using conda
Install anaconda

```
Expand All @@ -23,24 +21,48 @@ Install anaconda
```


# INstall using Virtual Env
## Create a python virtual environment
## 2B. Install using Virtual Env
### 1. Create a python virtual environment


```
virtualenv venv-physcraper
```


## Activate the installed virtual environment
### 2. Activate the installed virtual environment

Once you have a venv-physcraper directory, **_activate_** it with:

```
source venv-physcraper/bin/activate
```
Remember that you will have to activate the virtual environment every time you want to run `physcraper`.

### 3. Install `physcraper` inside the virtual environment with

```
python setup.py install
```

## Dependencies
This will install the following python packages also:

- Dendropy https://pythonhosted.org/DendroPy/
- Peyotl https://github.com/OpenTreeOfLife/peyotl (currently needs to be on physcraper branch)
- Biopython http://biopython.org/wiki/Download
- ConfigParser


### 4. Come out of the virtual environment:

```
deactivate
```

Do this after you are finished working with physcraper.


# II. Checking for dependencies

Currently complete phylogenetic updating WITH `physcraper` requires
[raxmlHPC](http://sco.h-its.org/exelixis/web/software/raxml/index.html) and [MUSCLE](install-muscle.md) to be installed and in the path.
Expand All @@ -53,36 +75,44 @@ which raxmlHPC
```


# Databases
# III. Local Databases

The tool can be run using local databases, which can be downloaded and updated from the National Center for Biotechnology Information ([NCBI](https://www.ncbi.nlm.nih.gov/)).

The tool can be run locally using databases, which can be downloaded and updated from the National Center for Biotechnology Information ([NCBI](https://www.ncbi.nlm.nih.gov/)).
### 1. Installing blast command line tools

To blast locally you will need to install blast command line tools.
Instructions at
To blast locally you will need to install blast command line tools first.
Find general instructions at
https://www.ncbi.nlm.nih.gov/books/NBK279671/
https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/


e.g. on linux:
e.g. installing blast command line tools on linux:

```
wget https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ncbi-blast-2.10.0+-x64-linux.tar.gz
tar -xzvf ncbi-blast-2.10.0+-x64-linux.tar.gz

The binaries are in /bin
```

The binaries/scripts/executables will be installed in the `/bin` folder.

### 2. Downloading the NCBI database

If you want to download the NCBI blast database and taxonomy for faster local searches
note that the download can take several hours, depending on your internet connection.

If you want to download the blast database and taxonomy for faster local searches
NOTE: this download can take several hours, depending on your internet connection.
This is what you should do:

```
mkdir local_blast_db
cd local_blast_db
update_blastdb nt
cat *.tar.gz | tar -xvzf - -i
update_blastdb taxdb
gunzip -cd taxdb.tar.gz | (tar xvf - )
mkdir local_blast_db # create the folder to save the database
cd local_blast_db # move to the newly created folder
update_blastdb nt # download the NCBI nucleotide databases
cat *.tar.gz | tar -xvzf - -i # unzip the nucleotide databases
update_blastdb taxdb # download the NCBI taxonomy database
gunzip -cd taxdb.tar.gz | (tar xvf - ) # unzip the taxonomy database
```

# Download the the nodes and names dowloads in tothe physcraper/taxonomy directory
### 3. Downloading the nodes and names into the physcraper/taxonomy directory

```
cd physcraper/taxonomy
Expand All @@ -91,21 +121,6 @@ NOTE: this download can take several hours, depending on your internet connectio
```





# Python packages:
These will all be installed if you install physcraper using `python setup.py install`


- Dendropy https://pythonhosted.org/DendroPy/
- Peyotl https://github.com/OpenTreeOfLife/peyotl (currently needs to be on physcraper branch)
- Biopython http://biopython.org/wiki/Download
- ConfigParser

## Databases


[Previous: Back home](../README.md)

[Next: Running `physcraper`](running.md)
48 changes: 34 additions & 14 deletions physcraper/aligntreetax.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,23 +55,27 @@ def generate_ATT_from_files(workdir,
return AlignTreeTax(treefile, otu_dict, alnfile, ingroup_mrca=mrca_ott, workdir=workdir,
configfile=configfile, tree_schema=tree_schema)

def generate_ATT_from_run(workdir, tag=None, configfile=None):
    """Rebuild an ATT object from the files written by a previous physcraper run.

    :param workdir: directory holding physcraper_<tag>.fas / .tre,
        otu_info_<tag>.json, mrca.txt and (optionally) run.config
    :param tag: run tag used in the output file names; if None it is
        autodetected from the first 'physcraper_*' file found in workdir
    :param configfile: path to a config file; if None, workdir/run.config is
        used when present
    :return: object of class AlignTreeTax
    """
    assert os.path.exists(workdir)
    # BUG FIX: the directory scan used to clobber an explicitly passed tag;
    # only autodetect when the caller did not supply one.
    if tag is None:
        for fname in os.listdir(workdir):
            if fname.startswith('physcraper_'):
                tag = fname.split('.')[0].replace('physcraper_', '')
    sys.stdout.write("Reloading files with tag {}\n".format(tag))
    if configfile is None:
        if os.path.exists("{}/run.config".format(workdir)):
            configfile = "{}/run.config".format(workdir)
    # Use the replaced (tagged) alignment/tree/otu files as input.
    alnfi = "{}/physcraper_{}.fas".format(workdir, tag)
    treefile = "{}/physcraper_{}.tre".format(workdir, tag)
    otu_json = "{}/otu_info_{}.json".format(workdir, tag)
    with open(otu_json, "r") as otu_fh:  # FIX: close file handles (was a leak)
        otu_dict = json.load(otu_fh)
    # FIX: removed duplicated `mrca_ott = mrca_ott =` assignment.
    with open("{}/mrca.txt".format(workdir)) as mrca_fh:
        mrca_ott = int(mrca_fh.readline().split()[-1])
    return AlignTreeTax(tree=treefile, otu_dict=otu_dict, alignment=alnfi, ingroup_mrca=mrca_ott,
                        workdir=workdir, configfile=configfile, tag=tag, tree_schema='newick')


#def concatenate_ATTs(att_list, number_per_taxon='max', level='spp'):
Expand Down Expand Up @@ -164,8 +168,13 @@ class AlignTreeTax(object):
"""

def __init__(self, tree, otu_dict, alignment, ingroup_mrca, workdir, configfile=None,
tree_schema='newick',aln_schema ='fasta',taxon_namespace=None):
tree_schema='newick',aln_schema ='fasta',taxon_namespace=None, tag=None):
debug("build ATT class")
if tag == None:
self.tag = alignment.split('/')[-1].split('.')[0]
else:
self.tag = tag
print("alignment tag is {}".format(self.tag))
self.workdir = os.path.abspath(workdir)
if not os.path.exists(self.workdir):
os.makedirs(self.workdir)
Expand Down Expand Up @@ -241,7 +250,8 @@ def read_in_aln(self, alignment, aln_schema, namespace=None):
empty.add(tax)
self.aln.remove_sequences(empty)
msg = ", ".join([str(tax) for tax in list(empty)])
sys.stdout.write("All gap taxa {}\n".format(msg))
if len(empty) >= 1:
sys.stdout.write("All gap taxa {}\n".format(msg))
#elif isinstance(alignment, datamodel.charmatrixmodel.DnaCharacterMatrix):
# self.aln = alignment
assert isinstance(self.aln, datamodel.charmatrixmodel.DnaCharacterMatrix), \
Expand Down Expand Up @@ -563,16 +573,22 @@ def write_random_resolve_tre(self, treefilename='random_resolve.tre'):
fi.close()
return treepath

def write_aln(self, alnname=None, alnschema="fasta"):
    """Write the current alignment into workdir and return its absolute path.

    :param alnname: file name for the alignment; defaults to
        'physcraper_<tag>.fas' using this object's tag
    :param alnschema: schema to write the alignment in (default 'fasta')
    :return: absolute path of the written alignment file
    """
    if alnname is None:  # idiom fix: identity comparison with None
        alnname = "physcraper_{}.fas".format(self.tag)
    alnpath = "{}/{}".format(self.workdir, alnname)
    self.aln.write(path=alnpath, schema=alnschema)
    return os.path.abspath(alnpath)

def write_files(self, treepath="physcraper.tre", treeschema="newick", alnpath="physcraper.fas", alnschema="fasta"):
def write_files(self, treepath=None, treeschema="newick", alnpath=None, alnschema="fasta"):
"""Outputs both the streaming files, labeled with OTU ids.
Can be mapped to original labels using otu_dict.json or otu_seq_info.csv"""
#debug("write_files")
if alnpath == None:
alnpath = "physcraper_{}.fas".format(self.tag)
if treepath == None:
treepath = "physcraper_{}.tre".format(self.tag)
self.tre.write(path="{}/{}".format(self.workdir, treepath),
schema=treeschema, unquoted_underscores=True)
self.aln.write(path="{}/{}".format(self.workdir, alnpath),
Expand All @@ -598,8 +614,12 @@ def write_labelled(self, label, filename = "labelled", direc='workdir', norepeat
#debug("write labelled files")
if direc == 'workdir':
direc = self.workdir
treepath = "{}/{}".format(direc, "{}.tre".format(filename))
alnpath = "{}/{}".format(direc, '{}.fas'.format(filename))
if filename == "labelled":
treepath = "{}/{}_{}.tre".format(direc, filename, self.tag)
alnpath = "{}/{}_{}.fas".format(direc, filename, self.tag)
else:
treepath = "{}/{}.tre".format(direc, filename)
alnpath = "{}/{}.fas".format(direc, filename)
debug(treepath)
assert label in ['^ot:ottTaxonName', '^user:TaxonName', '^physcraper:TaxonName',
"^ot:originalLabel", "^ot:ottId", "^ncbi:taxon"]
Expand Down Expand Up @@ -652,15 +672,15 @@ def write_otus(self, filename = "otu_info", schema="table"):

assert schema in ["table", "json"]
if schema == "json":
with open("{}/{}.json".format(self.workdir, filename), "w") as outfile:
with open("{}/{}_{}.json".format(self.workdir, filename, self.tag), "w") as outfile:
json.dump(self.otu_dict, outfile)
if schema == "table":
all_keys = set()
for otu in self.otu_dict:
all_keys.update(self.otu_dict[otu].keys())
keys = list(all_keys)
header = ["otu_id"] + keys
with open("{}/{}.csv".format(self.workdir, filename), "w") as outfile:
with open("{}/{}_{}.csv".format(self.workdir, filename, self.tag), "w") as outfile:
outfile.write("\t".join(header)+"\n")
for otu in self.otu_dict:
vals = [str(self.otu_dict[otu].get(key, "-")) for key in keys]
Expand Down

0 comments on commit 0024799

Please sign in to comment.