Merge pull request #53 from McTavishLab/JanMerge
Jan merge - pulled commits from dev up to Jan 22.
snacktavish committed Jan 24, 2019
2 parents 88a385a + 6c2ff89 commit 07376a3
Showing 77 changed files with 26,345 additions and 2,296 deletions.
9 changes: 8 additions & 1 deletion .travis.yml
@@ -28,6 +28,7 @@ before_install:
- conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION
- source activate test-environment
- conda install -c bioconda raxml
# - pip install pytest-cov pytest-xdist
- python setup.py install

#### ete fails now, because of conda
@@ -38,7 +39,10 @@ before_install:
- pip install sphinx
- make -C docs


#### needed for mpi4py which is in requirements
#- sudo apt install libopenmpi-dev
- sudo apt install mpich
- export MPICC=mpiicc
install:
# install requirements of physcraper
- pip install --quiet -r requirements.txt
@@ -54,3 +58,6 @@ script:
#- py.test tests/ --setup-only
- sh tests/run_tests.sh

after_success:
- curl -s https://codecov.io/bash | bash
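
The mpich/MPICC lines added to before_install above exist because mpi4py is pulled in through requirements.txt. A minimal sketch for checking that the MPI toolchain actually works once the environment is built; the script name and the check itself are illustrative, not part of this commit:

# check_mpi.py -- hypothetical sanity check, not part of this commit
from mpi4py import MPI  # needs the MPI compiler set up in before_install (mpich)

comm = MPI.COMM_WORLD
print("MPI rank {} of {}".format(comm.Get_rank(), comm.Get_size()))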

266 changes: 148 additions & 118 deletions How_to_start.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion README.md
@@ -1,6 +1,6 @@
# physcraper

[![Build Status](https://travis-ci.org/McTavishLab/physcraper.svg?branch=master)](https://travis-ci.org/McTavishLab/physcraper)[![Documentation](https://readthedocs.org/projects/physcraper/badge/?version=latest&style=flat)](https://physcraper.readthedocs.io/en/latest/)
[![Build Status](https://travis-ci.org/McTavishLab/physcraper.svg?branch=dev)](https://travis-ci.org/McTavishLab/physcraper)[![Documentation](https://readthedocs.org/projects/physcraper/badge/?version=latest&style=flat)](https://physcraper.readthedocs.io/en/latest/)[![codecov](https://codecov.io/gh/McTavishLab/physcraper/branch/dev/graph/badge.svg)](https://codecov.io/gh/McTavishLab/physcraper)

Continual gene tree updating.
Uses a tree from the Open Tree of Life (or your own tree) and an alignment to search for and add homologous sequences to a phylogenetic inference.
16 changes: 0 additions & 16 deletions docs/example.py

This file was deleted.

16 changes: 10 additions & 6 deletions docs/example_scripts/OToL_filter_run.py
@@ -8,16 +8,20 @@
workdir="docs/example_scripts/output/OToL_filter"
configfi = "tests/data/localblast.config"

threshold = 2
selectby = "blast"
downtorank = "species"
ingroup_mrca = None

blacklist = None
threshold = 2  # number of sequences kept by FilterBlast
selectby = "blast"  # how to select sequences in FilterBlast, either "length" or "blast"

ingroup_mrca = None # must be OToL ID
shared_blast_folder = None # location to share blast runs across runs, see documentation

downtorank = None  # filter rank, e.g. "species" or "genus"; if not set, filtering goes down to variety/subspecies
blacklist = None  # list of accession numbers, e.g. [XXX.1, YYY.1]
add_unpubl_seq = None
id_to_spn_addseq_json = None
shared_blast_folder = None


## Filters the blast results; if you want to keep all sequences found by blast, use standard_run() instead.
wrappers.filter_OTOL(study_id,
                     tree_id,
                     seqaln,
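
The call above is truncated in this diff view. The threshold/selectby pair set earlier controls FilterBlast: keep at most threshold sequences per group at the downtorank level, chosen either by blast score or by sequence length. A purely illustrative sketch of that selection idea, assuming a hypothetical input structure; it is not physcraper's FilterBlast implementation:

# illustrative only -- mimics the threshold/selectby idea, not physcraper's internals
def keep_per_taxon(seqs_by_taxon, threshold=2, selectby="blast"):
    """seqs_by_taxon: {taxon: [(seq_id, blast_score, seq_len), ...]} (hypothetical structure)."""
    kept = {}
    for taxon, seqs in seqs_by_taxon.items():
        # rank by blast score or by length, then keep the top `threshold` entries
        key = (lambda s: s[1]) if selectby == "blast" else (lambda s: s[2])
        kept[taxon] = sorted(seqs, key=key, reverse=True)[:threshold]
    return kept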
13 changes: 7 additions & 6 deletions docs/example_scripts/concat_example.py
@@ -10,18 +10,19 @@

# run tiny_comb_... files first

workdir_ITS = "./MS3_data/output/ITS_filter"
workdir_ETS = "./MS3_data/output/ETS_expand"
workdir_ITS = "tests/data/PS_tiny_comb_its"
workdir_ETS = "tests/data/PS_tiny_comb_ets"
email = "mk@xy.zt"
percentage = 0.4

pickle_fn = "scrape_checkpoint.p"
num_threads = 4 # number of threads to use, to make it run faster

workdir_comb = ".example/output/nr"
pickle_fn = "final_ATT_checkpoint.p"

workdir_comb = "docs/example_scripts/output/nr_concat"
genelist = {"ITS": {"workdir": workdir_ITS, "pickle": pickle_fn},
"ETS": {"workdir": workdir_ETS, "pickle": pickle_fn}
}

conc = wrappers.concat(genelistdict=genelist, workdir_comb=workdir_comb,
email=email, percentage=percentage, user_concat_fn=None)

email=email, num_threads=num_threads, percentage=percentage, user_concat_fn=None, backbone=None)
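
Each entry in genelist maps a locus name to the workdir and checkpoint pickle of a finished single-gene run, so adding another locus follows the same pattern (the third locus below is hypothetical):

# hypothetical third locus; its single-gene run must have written the same checkpoint file
genelist["trnL"] = {"workdir": "tests/data/PS_tiny_comb_trnl", "pickle": pickle_fn}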
12 changes: 6 additions & 6 deletions docs/example_scripts/own_data_filter_blast.py
@@ -14,13 +14,13 @@
id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
otu_jsonfi = "{}/otu_dict.json".format(workdir)

threshold = 2
selectby = "blast"
threshold = 2  # number of sequences kept by FilterBlast
selectby = "blast"  # how to select sequences in FilterBlast, either "length" or "blast"

ingroup_mrca = None # must be OToL ID
shared_blast_folder = None # location to share blast runs across runs, see documentation
downtorank = None
blacklist = None
downtorank = None  # filter rank, e.g. "species" or "genus"
blacklist = None  # list of accession numbers, e.g. [XXX.1, YYY.1]
add_unpubl_seq = None
id_to_spn_addseq_json = None

@@ -35,7 +35,7 @@
    json.dump(otu_json, open(otu_jsonfi, "w"))



## Filters the blast results; if you want to keep all sequences found by blast, use own_data_run() instead.
wrappers.filter_data_run(seqaln,
                         mattype,
                         trfn,
@@ -46,7 +46,7 @@
                         configfi,
                         downtorank=downtorank,
                         selectby=selectby,
                         blacklist=blacklist,
                         blacklist=blacklist,
                         add_unpubl_seq=add_unpubl_seq,
                         id_to_spn_addseq_json=id_to_spn_addseq_json,
                         ingroup_mrca=ingroup_mrca,
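
The blacklist comment in this script uses placeholder accessions; a concrete value is simply a Python list of GenBank accession strings (the accessions below are made up):

blacklist = ["JX895419.1", "JX895420.1"]  # made-up GenBank accessions to exclude from the run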
63 changes: 63 additions & 0 deletions docs/example_scripts/own_data_localdb.py
@@ -0,0 +1,63 @@
from physcraper import wrappers, OtuJsonDict, ConfigObj, IdDicts
import os
import json

#################################
seqaln = "tests/data/tiny_comb_its/tiny_comb_its.fasta"
mattype = "fasta"
trfn = "tests/data/tiny_comb_its/tiny_comb_its.tre"
schema_trf = "newick"
blacklist = None
workdir="tests/output/addLocal"

id_to_spn = r"tests/data/tiny_comb_its/nicespl.csv"
otu_jsonfi = "{}/otu_dict.json".format(workdir)
otu_jsonfi_local = "{}/otu_dict_local.json".format(workdir)

configfi = "tests/data/localblast.config"
threshold=10
selectby="blast"
downto= None
ingroup_mrca = None
add_unpubl_seq = "tests/data/local_seqs"
id_to_spn_addseq = "tests/data/tipnTOspn_localAdd.csv"


if not os.path.exists("{}".format(workdir)):
    os.makedirs("{}".format(workdir))

conf = ConfigObj(configfi)
ids = IdDicts(conf, workdir=workdir, mrca=ingroup_mrca)


if os.path.exists(otu_jsonfi):
    print("load json")
    otu_json = json.load(open(otu_jsonfi))
else:
    otu_json = OtuJsonDict(id_to_spn, ids)
    json.dump(otu_json, open(otu_jsonfi, "w"))

if os.path.exists(otu_jsonfi_local):
    print("load json local")
    otu_json_local = json.load(open(otu_jsonfi_local))
    print(otu_json_local)
else:
    otu_json_local = OtuJsonDict(id_to_spn_addseq, ids)
    json.dump(otu_json_local, open(otu_jsonfi_local, "w"))
    print(otu_json_local)

# print(id_to_spn_addseq_json)

wrappers.filter_data_run(seqaln,
                         mattype,
                         trfn,
                         schema_trf,
                         workdir,
                         threshold,
                         otu_jsonfi,
                         configfi,
                         selectby=selectby,
                         downtorank=downto,
                         ingroup_mrca=ingroup_mrca,
                         add_unpubl_seq=add_unpubl_seq,
                         id_to_spn_addseq_json=otu_json_local)
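
The load-or-build pattern for the two otu JSON files is repeated verbatim in the script above; a small hypothetical helper (not part of physcraper) that wraps it could look like this:

def load_or_build_otu_json(json_path, csv_path, ids):
    """Reuse an existing otu_dict JSON, or build it from the tip-label-to-species CSV. Hypothetical helper."""
    if os.path.exists(json_path):
        print("load json", json_path)
        with open(json_path) as fh:
            return json.load(fh)
    otu_dict = OtuJsonDict(csv_path, ids)
    with open(json_path, "w") as fh:
        json.dump(otu_dict, fh)
    return otu_dict

# usage, mirroring the script above:
# otu_json = load_or_build_otu_json(otu_jsonfi, id_to_spn, ids)
# otu_json_local = load_or_build_otu_json(otu_jsonfi_local, id_to_spn_addseq, ids)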
4 changes: 2 additions & 2 deletions docs/example_scripts/own_data_standard_local.py
@@ -9,9 +9,8 @@
mattype = "fasta"
trfn = "tests/data/tiny_test_example/test.tre"
schema_trf = "newick"
workdir = "docs/example_scripts/output/own_local"
workdir = "docs/example_scripts/output/own_standard_local"
configfi = "tests/data/localblast.config"
# configfi = "tests/data/aws.config"
id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
otu_jsonfi = "{}/otu_dict.json".format(workdir)

@@ -31,6 +30,7 @@
    otu_json = OtuJsonDict(id_to_spn, ids)
    json.dump(otu_json, open(otu_jsonfi, "w"))

# this function keeps all sequences found by blast that belong to the mrca; if you want to filter, use filter_data_run()
wrappers.own_data_run(seqaln,
                      mattype,
                      trfn,
67 changes: 45 additions & 22 deletions docs/example_scripts/test.config
@@ -1,54 +1,77 @@
### never have in-line comments in this file

[blast]
e_value_thresh = 0.001
#The statistical cutoff for matches
unmapped = keep
# unmapped remove: remove tips, keep = set to id of mrca

Entrez.email = xz@xz.com
#######
## BLAST SETTINGS

[blast]
#Use your email address, please, this is just for NCBI records
Entrez.email = ejmctavish@gmail.com

#The statistical cutoff for matches
e_value_thresh = 0.001

hitlist_size = 100
#hitlist_size =5000
#the max number of matches for each search
# the max number of matches for each blast search
hitlist_size = 25

# define location for blast database, should be local for the moment.
location = local
#Options [local, remote]
#Unless you have set up a local blast database, leave as remote

#url_base =
#default url_base is ncbi, to run on AWS set url here
# if location = local:
localblastdb = /shared/localblastdb_meta/
#localblastdb = /home/mkandziora/blastdb_ncbi/
#localblastdb = /home/blubb/local_blast_db/
# localblastdb path must end with a trailing '/'

# if location = remote:
#default url_base is ncbi, to run on AWS set url here
#url_base =

# number of cores to use
num_threads = 8

#Only required if blast location is local
num_threads = 2
# use the GenBank identifier as the blast output file name; if False, the otu ID is used
gb_id_filename = True

# when to reblast the included sequences
delay = 90

#######
## PHYSCRAPER SETTINGS

[physcraper]
# what to do with tips from OToL that are not mapped: remove = remove tips, keep = set to the id of the mrca
unmapped = keep

#This is how much shorter new sequences are allowed to be, compared to your original sequence lengths, when added to the aln. It is used both when adding new sequences and when removing sequences that are too short.
seq_len_perc = 0.8
#This is how much shorter new sequences are allowed to be compared to your original sequence lengths.

[ncbi_parser]
nodes_fn = ./tests/data/nodes.dmp
names_fn = ./tests/data/names.dmp
# fraction of sequences that need to be present before the beginning and end of the alignment are trimmed
trim_perc = 0.75

# max length for sequences to add to the aln
max_len = 2.5


#######
## INTERNAL PHYSCRAPER SETTINGS
#---------------------------------------------------------------------------------
#Things below here you should not need to change!

#Only required if blast location is local
[ncbi_parser]
nodes_fn = ./tests/data/nodes.dmp
names_fn = ./tests/data/names.dmp

[phylesystem]
location = api
#local or api, leave set to api unless you have installed phylesystem locally


[taxonomy]
#You should not need to change any of these!
ott_ncbi = taxonomy/ott_ncbi
get_ncbi_taxonomy = taxonomy/get_ncbi_taxonomy.sh
ncbi_dmp = taxonomy/gi_taxid_nucl.dmp
#acc2taxid = taxonomy/nucl_gb.accession2taxid.gz
#rankedlineages = rankedlineage.dmp.gz
id_pickle = taxonomy/id_dmp.p
#You should not need to change any of these!
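
physcraper loads this file through its ConfigObj class (see the example scripts earlier in this diff). Because it is a plain INI-style file, a quick sanity check of the values can also be done with Python's standard configparser; the two settings printed below are just the ones shown above:

import configparser

cfg = configparser.ConfigParser()
cfg.read("docs/example_scripts/test.config")
print(cfg["blast"]["e_value_thresh"])      # 0.001
print(cfg["physcraper"]["seq_len_perc"])   # 0.8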
