Merge branch 'dev' into master
mkandziora committed Dec 4, 2018
2 parents bf91e23 + 5ca3cfc commit 88a385a
Showing 53 changed files with 865 additions and 875 deletions.
7 changes: 7 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -19,3 +19,10 @@ tests/data/precooked/fixed/local-blast/Senecio_scopolii_subsp._scopolii_db.nhr
tests/data/precooked/fixed/local-blast/Senecio_scopolii_subsp._scopolii_db.nsq
shared_runs*
runs/*
docs/example_scripts/output/*
MS3*
runs_for_*
tests/data/tmp/*
tests/debugging/*
tests/new_implementations/*
backbone*
44 changes: 43 additions & 1 deletion .travis.yml
@@ -2,13 +2,55 @@ language: python
python:
- "2.7"
# command to install dependencies

before_install:
- sudo apt-get update

### install blast+
# this often fails with connection errors
- sudo apt-get install ncbi-blast+


#### install papara
- wget 'https://sco.h-its.org/exelixis/resource/download/software/papara_nt-2.5-static_x86_64.tar.gz'
- gunzip -cd papara_nt-2.5-static_x86_64.tar.gz | (tar xvf - )
- mv papara_static_x86_64 papara
- export PATH="$PATH:$(pwd)"
- papara

##### to use RAxML we need conda
- wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh
- chmod +x miniconda.sh
- bash miniconda.sh -b -p $HOME/miniconda
- export PATH="$HOME/miniconda/bin:$PATH"
- conda config --set always_yes yes --set changeps1 no
- conda update -q conda
- conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION
- source activate test-environment
- conda install -c bioconda raxml
- python setup.py install

#### ete currently fails because of conda
- conda install -c etetoolkit ete2
- export PATH=~/anaconda_ete/bin:$PATH

#### check that the documentation builds with sphinx
- pip install sphinx
- make -C docs


install:
# install requirements of physcraper
- pip install --quiet -r requirements.txt
- pip install --quiet .


# command to run tests
script:
- wget 'https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz'
- gunzip -cd taxdump.tar.gz | (tar xvf - names.dmp nodes.dmp)
- mv *.dmp tests/data/
- echo 'no' | python tests/testfilesetup.py
- py.test tests/ --setup-only
- sh tests/run_tests.sh
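
The `before_install` steps above pull in three external executables (BLAST+, PAPaRa, and RAxML) through a mix of apt, wget, and conda. A small sanity check that they actually landed on `PATH` could look like the following sketch. It is Python 3 (`shutil.which`; on the Python 2.7 pinned above, `distutils.spawn.find_executable` would be the equivalent), and the tool names are assumptions based on the install commands, not confirmed by this repository.

```python
import shutil

# Executable names are assumptions based on the install steps above:
# BLAST+ ships `blastn`, the PAPaRa binary was renamed to `papara`,
# and the bioconda raxml package typically installs `raxmlHPC`.
REQUIRED_TOOLS = ["blastn", "papara", "raxmlHPC"]


def missing_tools(tools=REQUIRED_TOOLS):
    """Return the subset of `tools` that cannot be found on PATH."""
    return [t for t in tools if shutil.which(t) is None]


if __name__ == "__main__":
    missing = missing_tools()
    if missing:
        print("Missing executables: " + ", ".join(missing))
    else:
        print("All required executables found.")
```

Running this right after the install steps fails the build early with a clear message instead of a confusing error deep inside the test suite.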

35 changes: 19 additions & 16 deletions How_to_start.md
@@ -69,7 +69,8 @@ Depending on the size of your tree to be updated, there are things to consider.
* install the taxonomy database:

Install the NCBI taxonomy database into the same directory as your blastdb from the previous step; it is used to retrieve taxon information from BLAST searches.


* `cd /to/the/folder/of/your/blastdb`
* `wget 'ftp://ftp.ncbi.nlm.nih.gov/blast/db/taxdb.tar.gz'` # Download the taxdb archive
* `gunzip -cd taxdb.tar.gz | (tar xvf - )` # Install it in the BLASTDB directory
@@ -78,6 +79,7 @@ Depending on the size of your tree to be updated, there are things to consider.
* `wget 'ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz'`
* `gunzip -cd taxdump.tar.gz | (tar xvf - names.dmp nodes.dmp)`
* move files into `tests/data/`

* updating the databases:

The databases need to be updated regularly; the program checks the dates of your databases and will ask you to update them after 60 days. If your databases are older than that, you will be prompted to decide whether you want to update them.
@@ -138,25 +140,26 @@ There is an example config file in `tests/data/localblast.config`
There is an example file in `docs/example.py`; it is based on the wrapper function `standard_run()`.

To obtain the study and tree IDs for an OToL run, either go to the website and query your lineage, or run `find_studies.py` from the terminal: `python ./path/to/file/find_studies.py LINEAGENAME`. If a study is available, it will return a study ID and a tree ID.

* **study_id**: the ID of the corresponding study from OToL
* **tree_id**: the ID of the corresponding tree from OToL
* **seqaln**: path to your alignment file; must be a single-gene alignment
* **mattype**: file format of your alignment - currently supported: “fasta”, “newick”, “nexus”, “nexml”, “phylip”
* **workdir**: path to your working directory, the folder where intermediate and result files will be stored
* **configfi**: path to your config file, which was edited in step 1
* **otu_jsonfi**: path to the OTU JSON file, which will contain all the information about the sequences retrieved during the run. Usually it does not need to be edited.
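
Before launching a run it can be worth validating these settings in plain Python. The sketch below only checks two of the parameters; the supported-format list is copied from the bullet points above, and `standard_run()` itself is not called:

```python
import os

# Alignment formats listed in the parameter description above.
SUPPORTED_FORMATS = {"fasta", "newick", "nexus", "nexml", "phylip"}


def check_settings(settings):
    """Return a list of problems with the run settings; an empty list means OK."""
    problems = []
    if settings["mattype"] not in SUPPORTED_FORMATS:
        problems.append("unsupported alignment format: %s" % settings["mattype"])
    if not os.path.exists(settings["seqaln"]):
        problems.append("alignment file not found: %s" % settings["seqaln"])
    return problems
```

Extending the same pattern to `workdir`, `configfi`, and the OToL IDs catches path typos before any BLAST queries are sent.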

b) using your own files:

There is an example file in `tests/tiny_standard_ownfile.py`; it comes with a tiny sample dataset in `tests/data/tiny_example`. The corresponding wrapper function to use in your file setup is `own_data_run()`.
* **seqaln**: path to your alignment file; must be a single-gene alignment
* **mattype**: file format of your alignment - currently supported: “fasta”, “newick”, “nexus”, “nexml”, “phylip”
* **trfn**: path to the file containing the corresponding phylogeny; all tips must also be present in the alignment file
* **schema_trf**: file format of your phylogeny file - currently supported: “fasta”, “newick”, “nexus”, “nexml”, “phylip”
* **id_to_spn**: path to a comma-delimited file mapping tip labels to species names; an example can be found in `tests/data/tiny_test_example/test_nicespl.csv`
* **workdir**: path to your working directory, the folder where intermediate and result files will be stored
* **configfi**: path to your config file, which was edited in step 1
* **otu_jsonfi**: path to the OTU JSON file, which will contain all the information about the sequences retrieved during the run. Usually it does not need to be edited.

2. filter run:

4 changes: 0 additions & 4 deletions README.md
@@ -59,7 +59,3 @@ The Documentation about the different classes can be found [here](./docs/).
### Tests

There are some tests [here](./test/) and [here](./ws-test/) that cover the major functionality of the code. To check that the code works on your machine, run `python tests/testfilesetup.py`, then `sh tests/run_test.sh` and `sh ws-tests/run_ws-tests.sh`.




6 changes: 4 additions & 2 deletions docs/example_scripts/OToL_filter_run.py
@@ -4,8 +4,9 @@
study_id = "pg_873"
tree_id = "tree1679"
seqaln = "tests/data/minitest.fas"
workdir="example_output"
configfi = "tests/data/test.config"
mattype = "fasta"
workdir="docs/example_scripts/output/OToL_filter"
configfi = "tests/data/localblast.config"

threshold = 2
selectby = "blast"
@@ -20,6 +21,7 @@
wrappers.filter_OTOL(study_id,
                     tree_id,
                     seqaln,
                     mattype,
                     workdir,
                     configfi,
                     threshold,
4 changes: 2 additions & 2 deletions docs/example_scripts/OToL_standard_run.py
@@ -5,8 +5,8 @@
tree_id = "tree1679"
seqaln = "tests/data/minitest.fas"
mattype="fasta"
workdir="example_output"
configfi = "tests/data/test.config"
workdir="docs/example_scripts/output/OToL_standard"
configfi = "tests/data/localblast.config"

wrappers.standard_run(study_id,
                      tree_id,
37 changes: 37 additions & 0 deletions docs/example_scripts/_own_data_standard_ncbi.py
@@ -0,0 +1,37 @@
import sys
import os
import json
from physcraper import wrappers, OtuJsonDict, ConfigObj, IdDicts


seqaln = "tests/data/tiny_test_example/test.fas"
mattype = "fasta"
trfn = "tests/data/tiny_test_example/test.tre"
schema_trf = "newick"
workdir = "docs/example_scripts/output/own_ncbi"
configfi = "tests/data/test.config"
# configfi = "tests/data/aws.config"
id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
otu_jsonfi = "{}/otu_dict.json".format(workdir)

if not os.path.exists("{}".format(workdir)):
    os.makedirs("{}".format(workdir))

conf = ConfigObj(configfi)
ids = IdDicts(conf, workdir=workdir)


if os.path.exists(otu_jsonfi):
    print("load json")
    otu_json = json.load(open(otu_jsonfi))
else:
    otu_json = OtuJsonDict(id_to_spn, ids)
    json.dump(otu_json, open(otu_jsonfi, "w"))

wrappers.own_data_run(seqaln,
                      mattype,
                      trfn,
                      schema_trf,
                      workdir,
                      otu_jsonfi,
                      configfi)
2 changes: 2 additions & 0 deletions docs/example_scripts/concat_example.py
@@ -8,6 +8,8 @@
from copy import deepcopy


# run tiny_comb_... files first

workdir_ITS = "./MS3_data/output/ITS_filter"
workdir_ETS = "./MS3_data/output/ETS_expand"
email = "mk@xy.zt"
55 changes: 55 additions & 0 deletions docs/example_scripts/own_data_filter_blast.py
@@ -0,0 +1,55 @@
import sys
import os
import json
from physcraper import wrappers, OtuJsonDict


seqaln = "tests/data/tiny_test_example/test.fas"
mattype = "fasta"
trfn = "tests/data/tiny_test_example/test.tre"
schema_trf = "newick"
workdir = "docs/example_scripts/output/own_data_filter"
configfi = "tests/data/localblast.config"
id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
otu_jsonfi = "{}/otu_dict.json".format(workdir)

threshold = 2
selectby = "blast"

ingroup_mrca = None # must be OToL ID
shared_blast_folder = None # location to share blast runs across runs, see documentation
downtorank = None
blacklist = None
add_unpubl_seq = None
id_to_spn_addseq_json = None

if not os.path.exists(workdir):
    os.mkdir(workdir)


if os.path.exists(otu_jsonfi):
    otu_json = json.load(open(otu_jsonfi))
else:
    otu_json = OtuJsonDict(id_to_spn, configfi)
    json.dump(otu_json, open(otu_jsonfi, "w"))



wrappers.filter_data_run(seqaln,
                         mattype,
                         trfn,
                         schema_trf,
                         workdir,
                         threshold,
                         otu_jsonfi,
                         configfi,
                         downtorank=downtorank,
                         selectby=selectby,
                         blacklist=blacklist,
                         add_unpubl_seq=add_unpubl_seq,
                         id_to_spn_addseq_json=id_to_spn_addseq_json,
                         ingroup_mrca=ingroup_mrca,
                         shared_blast_folder=shared_blast_folder)

44 changes: 44 additions & 0 deletions docs/example_scripts/own_data_standard_local.py
@@ -0,0 +1,44 @@
import sys
import os
import json
from physcraper import wrappers, OtuJsonDict, ConfigObj, IdDicts



seqaln = "tests/data/tiny_test_example/test.fas"
mattype = "fasta"
trfn = "tests/data/tiny_test_example/test.tre"
schema_trf = "newick"
workdir = "docs/example_scripts/output/own_local"
configfi = "tests/data/localblast.config"
# configfi = "tests/data/aws.config"
id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
otu_jsonfi = "{}/otu_dict.json".format(workdir)


ingroup_mrca = None
shared_blast_folder = None

if not os.path.exists("{}".format(workdir)):
    os.makedirs("{}".format(workdir))

conf = ConfigObj(configfi)
ids = IdDicts(conf, workdir=workdir)

if os.path.exists(otu_jsonfi):
    print("load json")
    otu_json = json.load(open(otu_jsonfi))
else:
    otu_json = OtuJsonDict(id_to_spn, ids)
    json.dump(otu_json, open(otu_jsonfi, "w"))

wrappers.own_data_run(seqaln,
                      mattype,
                      trfn,
                      schema_trf,
                      workdir,
                      otu_jsonfi,
                      configfi,
                      ingroup_mrca=ingroup_mrca,
                      shared_blast_folder=shared_blast_folder)


File renamed without changes.
@@ -10,7 +10,7 @@
schema_trf = "newick"
id_to_spn = r"tests/data/tiny_comb_its/nicespl.csv"

workdir = "tiny_comb_its"
workdir = "runs/tiny_comb_its"
configfi = "tests/data/localblast.config"
otu_jsonfi = "{}/otu_dict.json".format(workdir)
threshold = 2
53 changes: 0 additions & 53 deletions docs/example_scripts/tiny_filter_ownfile.py

This file was deleted.
