
Merge pull request #127 from McTavishLab/relabel
Relabel
snacktavish committed Jul 8, 2020
2 parents 2dddf65 + d1664ad commit 5a3118a
Showing 7 changed files with 61 additions and 49 deletions.
7 changes: 7 additions & 0 deletions bin/physcraper_run.py
@@ -26,6 +26,8 @@
parser.add_argument("-st","--search_taxon", help="taxonomic id to constrain blast search. format ott:123 or ncbi:123. Deafult will use ingroup of tree on OpenTree, or MRCA of input tips ")

parser.add_argument("-spn","--species_number", help="max number of seqs to include per species")
parser.add_argument("-bl","--block_list", nargs='+', help="ncbi accession numbers to exclude")

parser.add_argument("-tp","--trim_perc", help="minimum percentage of seqs end of alignemnts")
parser.add_argument("-rlmax","--relative_length_max", help="max relative length of added seqs, compared to input seq len")
parser.add_argument("-rlmin","--relative_length_min", help="min relative length of added seqs, compared to input seq len")
@@ -255,6 +257,11 @@
else:
boot_reps = 100

if args.block_list:
for acc in args.block_list:
scraper.blocklist.append(acc)
sys.stdout.write("Excluding accession numbers {}\n".format(','.join(scraper.blocklist)))
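
For illustration, the new flag takes one or more accession numbers on the command line. A hedged sketch of an invocation (the study, tree, and output flags are placeholders for whatever arguments the run already uses; only -bl comes from this change, and JX895264.1 is the accession excluded in the new test below):

    physcraper_run.py -s <study_id> -t <tree_id> -o <output_dir> -bl JX895264.1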


run = 1
if args.repeat:
6 changes: 5 additions & 1 deletion docs/mds/PhyscraperRun.md
@@ -123,9 +123,13 @@ You can use your own blast database, for example set up on an AWS server.

-tp TRIM_PERC, --trim_perc TRIM_PERC
minimum percentage of seqs required at ends of alignment
-rl RELATIVE_LENGTH, --relative_length RELATIVE_LENGTH
-rlmax RELATIVE_LENGTH_MAX, --relative_length_max RELATIVE_LENGTH_MAX
max relative length of added seqs, compared to input
aln len (blast matches not within length cutoffs are stored in outputs/seqlen_mismatch.txt)
-rlmin RELATIVE_LENGTH_MIN, --relative_length_min RELATIVE_LENGTH_MIN
min relative length of added seqs, compared to input
aln len
(blast matches not within length cutoffs are stored in outputs/seqlen_mismatch.txt)
-spn SPECIES_NUMBER, --species_number SPECIES_NUMBER
max number of seqs to include per species
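
As a minimal sketch of the filter that -rlmax and -rlmin control (assuming the 80%/120% defaults mentioned in the tutorial; the variable names are illustrative, not the library's):

    # illustrative only: how relative-length cutoffs become absolute length cutoffs
    avg_len = 500                    # hypothetical mean length of the input alignment seqs
    rl_min, rl_max = 0.8, 1.2        # --relative_length_min / --relative_length_max
    seq_len_min = avg_len * rl_min   # 400.0: shorter blast matches are excluded
    seq_len_max = avg_len * rl_max   # 600.0: longer blast matches are excluded

    def within_cutoffs(seq):
        # matches failing this check are logged to outputs/seqlen_mismatch.txt
        return seq_len_min < len(seq) < seq_len_max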

3 changes: 2 additions & 1 deletion docs/mds/Tutorial.md
@@ -90,13 +90,14 @@ The structure consists of:
-- blast results for each tip in both the tree and the alignment

- run
-- This is where intermediate processing files, and the json formatted otu information are stored
-- This is where intermediate processing files and the json-formatted otu information are stored. Many of these files are re-used if the run crashes and is restarted. Make sure you use a new output directory, or otherwise empty this folder, if you want to change run parameters.

- outputs
-- final tree and alignment

-- CSV file with information about each sequence

-- By default, sequences shorter than 80% or longer than 120% of the average input sequence length (these values are configurable) are not added to the alignment, as they can cause alignment errors. The accession numbers, taxa, and sequence lengths are written out to seqlen_mismatch.txt.
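
-- Each line of seqlen_mismatch.txt follows the "taxon: ..., ncbi: ..., acc: ..., len: ..." pattern written in physcraper/scrape.py (visible later in this diff), so it can be summarised with a few lines of Python. A sketch, assuming no field value contains a comma:

    # illustrative parser for outputs/seqlen_mismatch.txt
    with open("outputs/seqlen_mismatch.txt") as fi:
        for line in fi:
            fields = dict(part.split(": ", 1) for part in line.strip().split(", "))
            print(fields["acc"], fields["len"])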


### Compare your new tree to existing relationships
11 changes: 8 additions & 3 deletions physcraper/aligntreetax.py
@@ -121,13 +121,18 @@ def write_otu_file(treetax, filepath, schema="table"):
if schema == "json":
with open(filepath, "w") as outfile:
json.dump(treetax.otu_dict, outfile)
leaves = [tax.label for tax, seq in treetax.aln.items()]
if schema == "table":
keys = ['^ot:ottTaxonName','^ot:ottId','^ncbi:taxon','^ncbi:accession','^physcraper:last_blasted','^physcraper:ingroup','^physcraper:status','^ot:originalLabel','^ncbi:title']
header = ["otu_id"] + keys
header = ["otu_id"] + keys + ['^physcraper:in_current_aln']
with open(filepath, "w") as outfile:
outfile.write("\t".join(header)+"\n")
for otu in treetax.otu_dict:
vals = [str(treetax.otu_dict[otu].get(key, "-")) for key in keys]
if otu in leaves:
vals.append('True')
else:
vals.append('False')
outfile.write("\t".join([to_string(otu)]+vals)+"\n")
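
A hedged usage sketch for the new '^physcraper:in_current_aln' column: since the table is tab-separated with these exact headers, the otus present in the current alignment can be pulled out as follows (the file path is hypothetical):

    import csv

    # illustrative only: list otu_ids flagged as present in the current alignment
    with open("outputs/otu_info.csv") as fi:
        reader = csv.DictReader(fi, delimiter="\t")
        in_aln = [row["otu_id"] for row in reader if row["^physcraper:in_current_aln"] == "True"]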


@@ -149,7 +154,7 @@ def write_labelled_tree(treetax, label, filepath, schema = "newick", norepeats=T
"""
#debug("write labelled files")
assert label in ['^ot:ottTaxonName', '^user:TaxonName', '^physcraper:TaxonName',
"^ot:originalLabel", "^ot:ottId", "^ncbi:taxon"]
"^ot:originalLabel", "^ot:ottId", "^ncbi:taxon", '^ncbi:accession']
tmp_newick = treetax.tre.as_string(schema="newick")
tmp_tre = Tree.get(data=tmp_newick,
schema="newick",
@@ -193,7 +198,7 @@ def write_labelled_aln(aligntreetax, label, filepath, schema = "fasta", norepeat
"""
#debug("write labelled files")
assert label in ['^ot:ottTaxonName', '^user:TaxonName', '^physcraper:TaxonName',
"^ot:originalLabel", "^ot:ottId", "^ncbi:taxon"]
"^ot:originalLabel", "^ot:ottId", "^ncbi:taxon", '^ncbi:accession']
tmp_fasta = aligntreetax.aln.as_string(schema="fasta")
tmp_aln = DnaCharacterMatrix.get(data=tmp_fasta,
schema="fasta")
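
With '^ncbi:accession' now an accepted label, a usage sketch (the AlignTreeTax object and output path are placeholders):

    # relabel alignment tips by GenBank accession instead of taxon name
    write_labelled_aln(aligntreetax, '^ncbi:accession', "outputs/aln_by_accession.fas", schema="fasta")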
5 changes: 2 additions & 3 deletions physcraper/ids.py
@@ -122,7 +122,6 @@ def get_ncbiid_from_acc(self, acc):


#removed function find_tax_id because it wasn't being used

def get_tax_seq_acc(self, acc):
if not os.path.exists(self.full_seq_path):
os.makedirs(self.full_seq_path)
@@ -142,7 +141,7 @@ def get_tax_seq_acc(self, acc):
#try:
assert(header.split()[1].startswith('taxname:'))
tax_name = header.split()[1].strip('taxname:')
ncbi_id = header.split()[2].strip('ncbi_id:')
ncbi_id = header.split()[2].strip('ncbi:')
seq = "".join(fi.readlines())
# except IndexError:
# print("IndexError")
@@ -204,7 +203,7 @@ def entrez_efetch(self, gb_id):
tries = 10
Entrez.email = self.config.email
if self.config.api_key:
Entrez.api_key = self.config.apikey
Entrez.api_key = self.config.api_key
handle = None

# method needs delay because of ncbi settings
49 changes: 13 additions & 36 deletions physcraper/scrape.py
@@ -77,7 +77,7 @@ class PhyscraperScrape(object):
* **self.date**: Date of the run - may lag behind real date!
* **self.repeat**: either 1 or 0, it is used to determine if we continue updating the tree, no new seqs found = 0
* **self.newseqs_acc**: list of all gi_ids that were passed into remove_identical_seq(). Used to speed up adding process
* **self.blacklist**: list of gi_id of sequences that shall not be added or need to be removed. Supplied by user.
* **self.blocklist**: list of gi_id of sequences that shall not be added or need to be removed. Supplied by user.
* **self.seq_filter**: list of words that may occur in otu_dict.status and which shall not be used in the building of FilterBlast.sp_d (that's the main function), but it is also used as assert statement to make sure unwanted seqs are not added.
* **self.unpublished**: True/False. Used to look for local unpublished seq that shall be added if True.
* **self.path_to_local_seq:** Usually False, contains path to unpublished sequences if option is used.
@@ -150,7 +150,7 @@ def __init__(self, data_obj, ids_obj=None, search_taxon=None):
self.threshold = self.config.spp_threshold
#markers for status
#holders for new data
self.blacklist = []
self.blocklist = []

def map_taxa_to_ncbi(self):
for otu in self.data.otu_dict:
@@ -719,13 +719,17 @@ def remove_identical_seqs(self):
# debug("gb_id is {}".format(gb_id) )
assert seq
if seq_len_min < len(seq) < seq_len_max:
if self.blacklist is not None and gb_id in self.blacklist:
debug("gb_id {} in blacklist, not added".format(gb_id))
if self.blocklist is not None and gb_id in self.blocklist:
debug("gb_id {} in blocklist, not added".format(gb_id))
pass
else:
otu_id = self.data.add_otu(gb_id, self.ids)
tmp_dict = self.seq_dict_build(seq, otu_id, tmp_dict)
else:
lr = open("{}/seqlen_mismatch.txt".format(self.outputsdir),"a")
taxid,taxname, seq = self.ids.get_tax_seq_acc(gb_id)
lr.write("taxon: {}, ncbi: {}, acc: {}, len: {}\n".format(taxname, taxid, gb_id, len(seq)))
lr.close()
debug("\nlen {}:{} was not between {} and {}\n".format(gb_id, len(seq), seq_len_min, seq_len_max))
otu_in_aln = set([taxon.label for taxon in self.data.aln])
for otu in otu_in_aln:
@@ -781,6 +785,8 @@ def filter_seqs(self, tmp_dict, type="random", threshold=None):
for otu in tmp_dict:
if otu in selected_otus:
filtered_dict[otu] = tmp_dict[otu]
else:
self.data.otu_dict[otu]['^physcraper:status'] = "removed in sampling down to {} per spp.".format(self.threshold)
return filtered_dict


@@ -851,13 +857,9 @@ def select_seq_by_length(self, otu_list, seq_dict, count):
lens.append(len(seq_dict[otu]))
lens.sort(reverse=True)
cutoff = lens[count]
# debug("cutoff is {}".format(cutoff))
# debug("lengths is {}".format(lens))
selected_otus = []
for otu in otu_len_dict:
# debug("otu {} has len {}".format(otu, otu_len_dict[otu]))
if otu_len_dict[otu] >= cutoff:
# debug("selected!")
selected_otus.append(otu)
if len(selected_otus) == count:
return selected_otus
@@ -885,31 +887,6 @@ def select_seq_at_random(self, otu_list, count):
return selected_otus



# def pickle_dump(self, filename=None, recursion=100000):
# """writes out class to pickle file.
# We need to increase the recursion depth here, as it currently fails with the standard run.
#
# :param filename: optional filename
# :param recursion: pickle often failed with recursion depth, that's why it's increased here
# :return: writes out file
# """
# current = sys.getrecursionlimit()
# sys.setrecursionlimit(recursion)#
#
# if filename:
# ofi = open(filename, "wb")
# else:
# ofi = open("{}/scrape_checkpoint.p".format(self.workdir), "wb")
# pickle.dump(self, ofi, pickle.HIGHEST_PROTOCOL)
# sys.setrecursionlimit(current)

def dump(self):
self.data.write_otus('otu_dict.json')
with open("{}/{}".format(self.rundir, 'config.json'), "w") as outfile:
json.dump(self.config.__dict__, outfile)


def write_new_seqs(self, filename='date'):
"""writes out the query sequence file"""
debug("write query seq")
@@ -1230,16 +1207,16 @@ def calculate_final_tree(self, boot_reps = 100):



def remove_blacklistitem(self):
"""This removes items from aln, and tree, if the corresponding Genbank identifer were added to the blacklist.
def remove_blocklistitem(self):
"""This removes items from aln, and tree, if the corresponding Genbank identifer were added to the blocklist.
Note, that seq that were not added because they were similar to the one being removed here, are lost
(that should not be a major issue though, as in a new blast_run, new seqs from the taxon can be added.)
"""
for tax in self.data.aln.taxon_namespace:
gi_id = self.data.otu_dict[tax.label].get("^ncbi:gi")
acc = self.data.otu_dict[tax.label].get("^ncbi:accession")
if gi_id in self.blacklist or acc in self.blacklist:
if gi_id in self.blocklist or acc in self.blocklist:
self.data.remove_taxa_aln_tre(tax.label)
self.data.otu_dict[tax.label]['^physcraper:status'] = "deleted, Genbank identifier is part of blocklist"
# this should not need to happen here: prune_short; instead...
29 changes: 24 additions & 5 deletions tests/test_multi.py
@@ -117,6 +117,26 @@ def test_remove_identical_seqs():
assert len(scraper.new_seqs_otu_id) == 14
#Now that we are pulling the full remote sequences, we don't have any identical sequences in the test.


def test_blocklist():
ids = copy.deepcopy(ids_base)
data_obj = copy.deepcopy(data_obj_base)
# print("start")
scraper = PhyscraperScrape(data_obj, ids)
scraper.blocklist.append('JX895264.1')
scraper._blasted = 1
blast_dir = "tests/data/precooked/fixed/tte_blast_files"
#scraper.gi_list_mrca = pickle.load(open("tests/data/precooked/gi_list_mrca.p", 'rb'))
scraper.read_blast_wrapper(blast_dir=blast_dir)
#print scraper.ncbi_mrca
assert(len(scraper.new_seqs) == 0)
assert(len(scraper.data.aln) == 5)
assert len(scraper.new_seqs_otu_id) == 13
aln_path1 = scraper.data.write_aln()
scraper.align_new_seqs()
assert len(scraper.data.aln) == 18
os.remove(aln_path1)

#TODO find an example where we do get identical sequences and need to discard them


@@ -142,9 +162,7 @@ def test_remove_identical_seqs():
# status = scraper.data.otu_dict[taxon.label].get(u'^physcraper:status')
# assert(status in ('original', 'query'))

aln_path1 = scraper.data.write_aln()
scraper.align_new_seqs()
assert len(scraper.data.aln) == 19




@@ -277,7 +295,7 @@ def test_run_align():
# scraper.generate_streamed_alignment()
#assert os.path.exists("{}/RAxML_bestTree.{}".format(scraper.workdir, scraper.date))

test_run_align()


@mark.xfail
def test_mpi():
@@ -390,4 +408,5 @@ def test_write_labelled():
assert os.path.isfile('tests/data/tmp/otu_info_test.json')
assert os.path.isfile('tests/data/tmp/otu_info_test.csv')
os.remove('tests/data/tmp/otu_info_test.json')
os.remove('tests/data/tmp/otu_info_test.csv')
os.remove('tests/data/tmp/otu_info_test.csv')
