Merge pull request #53 from McTavishLab/JanMerge
Jan merge - pulled commits from dev up to Jan 22.
snacktavish committed Jan 24, 2019
2 parents 88a385a + 6c2ff89 commit 07376a3
Showing 77 changed files with 26,345 additions and 2,296 deletions.
9 changes: 8 additions & 1 deletion .travis.yml
@@ -28,6 +28,7 @@ before_install:
- conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION
- source activate test-environment
- conda install -c bioconda raxml
# - pip install pytest-cov pytest-xdist
- python setup.py install

#### ete fails now, because of conda
@@ -38,7 +39,10 @@ before_install:
- pip install sphinx
- make -C docs


#### needed for mpi4py which is in requirements
#- sudo apt install libopenmpi-dev
- sudo apt install mpich
- export MPICC=mpiicc
install:
# install requirements of physcraper
- pip install --quiet -r requirements.txt
@@ -54,3 +58,6 @@ script:
#- py.test tests/ --setup-only
- sh tests/run_tests.sh

after_success:
- curl -s https://codecov.io/bash | bash
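
The mpich/MPICC lines added to before_install above exist because mpi4py is pulled in through requirements.txt. A minimal sketch for checking that the MPI toolchain actually works once the environment is built; the script name and the check itself are illustrative, not part of this commit:

# check_mpi.py -- hypothetical sanity check, not part of this commit
from mpi4py import MPI  # needs the MPI compiler set up in before_install (mpich)

comm = MPI.COMM_WORLD
print("MPI rank {} of {}".format(comm.Get_rank(), comm.Get_size()))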

266 changes: 148 additions & 118 deletions How_to_start.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion README.md
@@ -1,6 +1,6 @@
# physcraper

[![Build Status](https://travis-ci.org/McTavishLab/physcraper.svg?branch=master)](https://travis-ci.org/McTavishLab/physcraper)[![Documentation](https://readthedocs.org/projects/physcraper/badge/?version=latest&style=flat)](https://physcraper.readthedocs.io/en/latest/)
[![Build Status](https://travis-ci.org/McTavishLab/physcraper.svg?branch=dev)](https://travis-ci.org/McTavishLab/physcraper)[![Documentation](https://readthedocs.org/projects/physcraper/badge/?version=latest&style=flat)](https://physcraper.readthedocs.io/en/latest/)[![codecov](https://codecov.io/gh/McTavishLab/physcraper/branch/dev/graph/badge.svg)](https://codecov.io/gh/McTavishLab/physcraper)

Continual gene tree updating.
Uses a tree from the Open Tree of Life (or your own tree) and an alignment to search for and add homologous sequences to a phylogenetic inference.
16 changes: 0 additions & 16 deletions docs/example.py

This file was deleted.

16 changes: 10 additions & 6 deletions docs/example_scripts/OToL_filter_run.py
@@ -8,16 +8,20 @@
workdir="docs/example_scripts/output/OToL_filter"
configfi = "tests/data/localblast.config"

threshold = 2
selectby = "blast"
downtorank = "species"
ingroup_mrca = None

blacklist = None
threshold = 2  # number of sequences kept by FilterBlast
selectby = "blast"  # how to select sequences in FilterBlast, either "length" or "blast"

ingroup_mrca = None # must be OToL ID
shared_blast_folder = None # location to share blast runs across runs, see documentation

downtorank = None  # filter rank, e.g. "species" or "genus"; if not set, filtering goes down to variety/subspecies
blacklist = None  # list of accession numbers, e.g. [XXX.1, YYY.1]
add_unpubl_seq = None
id_to_spn_addseq_json = None
shared_blast_folder = None


## Filters the blast results; if you want to keep all sequences found by blast, use standard_run() instead.
wrappers.filter_OTOL(study_id,
                     tree_id,
                     seqaln,
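
The call above is truncated in this diff view. The threshold/selectby pair set earlier controls FilterBlast: keep at most threshold sequences per group at the downtorank level, chosen either by blast score or by sequence length. A purely illustrative sketch of that selection idea, assuming a hypothetical input structure; it is not physcraper's FilterBlast implementation:

# illustrative only -- mimics the threshold/selectby idea, not physcraper's internals
def keep_per_taxon(seqs_by_taxon, threshold=2, selectby="blast"):
    """seqs_by_taxon: {taxon: [(seq_id, blast_score, seq_len), ...]} (hypothetical structure)."""
    kept = {}
    for taxon, seqs in seqs_by_taxon.items():
        # rank by blast score or by length, then keep the top `threshold` entries
        key = (lambda s: s[1]) if selectby == "blast" else (lambda s: s[2])
        kept[taxon] = sorted(seqs, key=key, reverse=True)[:threshold]
    return kept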
13 changes: 7 additions & 6 deletions docs/example_scripts/concat_example.py
@@ -10,18 +10,19 @@

# run tiny_comb_... files first

workdir_ITS = "./MS3_data/output/ITS_filter"
workdir_ETS = "./MS3_data/output/ETS_expand"
workdir_ITS = "tests/data/PS_tiny_comb_its"
workdir_ETS = "tests/data/PS_tiny_comb_ets"
email = "mk@xy.zt"
percentage = 0.4

pickle_fn = "scrape_checkpoint.p"
num_threads = 4 # number of threads to use, to make it run faster

workdir_comb = ".example/output/nr"
pickle_fn = "final_ATT_checkpoint.p"

workdir_comb = "docs/example_scripts/output/nr_concat"
genelist = {"ITS": {"workdir": workdir_ITS, "pickle": pickle_fn},
"ETS": {"workdir": workdir_ETS, "pickle": pickle_fn}
}

conc = wrappers.concat(genelistdict=genelist, workdir_comb=workdir_comb,
email=email, percentage=percentage, user_concat_fn=None)

email=email, num_threads=num_threads, percentage=percentage, user_concat_fn=None, backbone=None)
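
Each entry in genelist maps a locus name to the workdir and checkpoint pickle of a finished single-gene run, so adding another locus follows the same pattern (the third locus below is hypothetical):

# hypothetical third locus; its single-gene run must have written the same checkpoint file
genelist["trnL"] = {"workdir": "tests/data/PS_tiny_comb_trnl", "pickle": pickle_fn}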
12 changes: 6 additions & 6 deletions docs/example_scripts/own_data_filter_blast.py
@@ -14,13 +14,13 @@
id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
otu_jsonfi = "{}/otu_dict.json".format(workdir)

threshold = 2
selectby = "blast"
threshold = 2  # number of sequences kept by FilterBlast
selectby = "blast"  # how to select sequences in FilterBlast, either "length" or "blast"

ingroup_mrca = None # must be OToL ID
shared_blast_folder = None # location to share blast runs across runs, see documentation
downtorank = None
blacklist = None
downtorank = None  # filter rank, e.g. "species" or "genus"
blacklist = None  # list of accession numbers, e.g. [XXX.1, YYY.1]
add_unpubl_seq = None
id_to_spn_addseq_json = None

@@ -35,7 +35,7 @@
    json.dump(otu_json, open(otu_jsonfi, "w"))



## Filters the blast results; if you want to keep all sequences found by blast, use own_data_run() instead.
wrappers.filter_data_run(seqaln,
                         mattype,
                         trfn,
@@ -46,7 +46,7 @@
                         configfi,
                         downtorank=downtorank,
                         selectby=selectby,
                         blacklist=blacklist,
                         blacklist=blacklist,
                         add_unpubl_seq=add_unpubl_seq,
                         id_to_spn_addseq_json=id_to_spn_addseq_json,
                         ingroup_mrca=ingroup_mrca,
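
The blacklist comment in this script uses placeholder accessions; a concrete value is simply a Python list of GenBank accession strings (the accessions below are made up):

blacklist = ["JX895419.1", "JX895420.1"]  # made-up GenBank accessions to exclude from the run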
63 changes: 63 additions & 0 deletions docs/example_scripts/own_data_localdb.py
@@ -0,0 +1,63 @@
from physcraper import wrappers, OtuJsonDict, ConfigObj, IdDicts
import os
import json

#################################
seqaln = "tests/data/tiny_comb_its/tiny_comb_its.fasta"
mattype = "fasta"
trfn = "tests/data/tiny_comb_its/tiny_comb_its.tre"
schema_trf = "newick"
blacklist = None
workdir="tests/output/addLocal"

id_to_spn = r"tests/data/tiny_comb_its/nicespl.csv"
otu_jsonfi = "{}/otu_dict.json".format(workdir)
otu_jsonfi_local = "{}/otu_dict_local.json".format(workdir)

configfi = "tests/data/localblast.config"
threshold=10
selectby="blast"
downto= None
ingroup_mrca = None
add_unpubl_seq = "tests/data/local_seqs"
id_to_spn_addseq = "tests/data/tipnTOspn_localAdd.csv"


if not os.path.exists("{}".format(workdir)):
    os.makedirs("{}".format(workdir))

conf = ConfigObj(configfi)
ids = IdDicts(conf, workdir=workdir, mrca=ingroup_mrca)


if os.path.exists(otu_jsonfi):
    print("load json")
    otu_json = json.load(open(otu_jsonfi))
else:
    otu_json = OtuJsonDict(id_to_spn, ids)
    json.dump(otu_json, open(otu_jsonfi, "w"))

if os.path.exists(otu_jsonfi_local):
    print("load json local")
    otu_json_local = json.load(open(otu_jsonfi_local))
    print(otu_json_local)
else:
    otu_json_local = OtuJsonDict(id_to_spn_addseq, ids)
    json.dump(otu_json_local, open(otu_jsonfi_local, "w"))
    print(otu_json_local)

# print(id_to_spn_addseq_json)

wrappers.filter_data_run(seqaln,
                         mattype,
                         trfn,
                         schema_trf,
                         workdir,
                         threshold,
                         otu_jsonfi,
                         configfi,
                         selectby=selectby,
                         downtorank=downto,
                         ingroup_mrca=ingroup_mrca,
                         add_unpubl_seq=add_unpubl_seq,
                         id_to_spn_addseq_json=otu_json_local)
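
The load-or-build pattern for the two otu JSON files is repeated verbatim in the script above; a small hypothetical helper (not part of physcraper) that wraps it could look like this:

def load_or_build_otu_json(json_path, csv_path, ids):
    """Reuse an existing otu_dict JSON, or build it from the tip-label-to-species CSV. Hypothetical helper."""
    if os.path.exists(json_path):
        print("load json", json_path)
        with open(json_path) as fh:
            return json.load(fh)
    otu_dict = OtuJsonDict(csv_path, ids)
    with open(json_path, "w") as fh:
        json.dump(otu_dict, fh)
    return otu_dict

# usage, mirroring the script above:
# otu_json = load_or_build_otu_json(otu_jsonfi, id_to_spn, ids)
# otu_json_local = load_or_build_otu_json(otu_jsonfi_local, id_to_spn_addseq, ids)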
4 changes: 2 additions & 2 deletions docs/example_scripts/own_data_standard_local.py
@@ -9,9 +9,8 @@
mattype = "fasta"
trfn = "tests/data/tiny_test_example/test.tre"
schema_trf = "newick"
workdir = "docs/example_scripts/output/own_local"
workdir = "docs/example_scripts/output/own_standard_local"
configfi = "tests/data/localblast.config"
# configfi = "tests/data/aws.config"
id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
otu_jsonfi = "{}/otu_dict.json".format(workdir)

@@ -31,6 +30,7 @@
    otu_json = OtuJsonDict(id_to_spn, ids)
    json.dump(otu_json, open(otu_jsonfi, "w"))

# this function keeps all sequences found by blast that belong to the mrca; if you want to filter, use filter_data_run()
wrappers.own_data_run(seqaln,
                      mattype,
                      trfn,
67 changes: 45 additions & 22 deletions docs/example_scripts/test.config
@@ -1,54 +1,77 @@
### never have in-line comments in this file

[blast]
e_value_thresh = 0.001
#The statistical cutoff for matches
unmapped = keep
# unmapped remove: remove tips, keep = set to id of mrca

Entrez.email = xz@xz.com
#######
## BLAST SETTINGS

[blast]
#Use your email address, please, this is just for NCBI records
Entrez.email = ejmctavish@gmail.com

#The statistical cutoff for matches
e_value_thresh = 0.001

hitlist_size = 100
#hitlist_size =5000
#the max number of matches for each search
# the max number of matches for each blast search
hitlist_size = 25

# define location for blast database, should be local for the moment.
location = local
#Options [local, remote]
#Unless you have set up a local blast database, leave as remote

#url_base =
#default url_base is ncbi, to run on AWS set url here
# if location = local:
localblastdb = /shared/localblastdb_meta/
#localblastdb = /home/mkandziora/blastdb_ncbi/
#localblastdb = /home/blubb/local_blast_db/
# localblastdb path must end with a trailing '/'

# if location = remote:
#default url_base is ncbi, to run on AWS set url here
#url_base =

# number of cores to use
num_threads = 8

#Only required if blast location is local
num_threads = 2
# use the GenBank identifier as the blast output file name; if False, the otu ID is used
gb_id_filename = True

# when to reblast the included sequences
delay = 90

#######
## PHYSCRAPER SETTINGS

[physcraper]
# what to do with tips from OToL that are not mapped: remove = remove tips, keep = set to the id of the mrca
unmapped = keep

#This is how much shorter new sequences are allowed to be, compared to your original sequence lengths, when added to the aln. It is used both when adding new sequences and when removing sequences that are too short.
seq_len_perc = 0.8
#This is how much shorter new sequences are allowed to be compared to your original sequence lengths.

[ncbi_parser]
nodes_fn = ./tests/data/nodes.dmp
names_fn = ./tests/data/names.dmp
# fraction of sequences that need to be present before the beginning and end of the alignment are trimmed
trim_perc = 0.75

# max length for sequences to add to the aln
max_len = 2.5


#######
## INTERNAL PHYSCRAPER SETTINGS
#---------------------------------------------------------------------------------
#Things below here you should not need to change!

#Only required if blast location is local
[ncbi_parser]
nodes_fn = ./tests/data/nodes.dmp
names_fn = ./tests/data/names.dmp

[phylesystem]
location = api
#local or api, leave set to api unless you have installed phylesystem locally


[taxonomy]
#You should not need to change any of these!
ott_ncbi = taxonomy/ott_ncbi
get_ncbi_taxonomy = taxonomy/get_ncbi_taxonomy.sh
ncbi_dmp = taxonomy/gi_taxid_nucl.dmp
#acc2taxid = taxonomy/nucl_gb.accession2taxid.gz
#rankedlineages = rankedlineage.dmp.gz
id_pickle = taxonomy/id_dmp.p
#You should not need to change any of these!
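
physcraper loads this file through its ConfigObj class (see the example scripts earlier in this diff). Because it is a plain INI-style file, a quick sanity check of the values can also be done with Python's standard configparser; the two settings printed below are just the ones shown above:

import configparser

cfg = configparser.ConfigParser()
cfg.read("docs/example_scripts/test.config")
print(cfg["blast"]["e_value_thresh"])      # 0.001
print(cfg["physcraper"]["seq_len_perc"])   # 0.8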
