small updates
mkandziora committed Nov 28, 2018
1 parent 1093d92 commit f5d0563
Showing 1 changed file with 44 additions and 23 deletions.
67 changes: 44 additions & 23 deletions physcraper/__init__.py
@@ -194,7 +194,6 @@ def __init__(self, configfi, interactive=True):
"your unmapped statement `%s` in the config file is not remove or keep"
% self.unmapped
)
-
debug("shared blast folder?")
debug(self.gb_id_filename)
debug("check db file status?")
@@ -216,7 +215,7 @@ def _download_localblastdb(self):
# the next lines of code exist to have interactive mode enabled while testing
# this allows tests to run without an actual local ncbi database downloaded
if not os.path.isfile("{}/empty_local_db_for_testing.nhr".format(self.blastdb)):
- if not os.path.isfile("{}/nt.01.nhr".format(self.blastdb)):
+ if not os.path.isfile("{}/nt.60.nhr".format(self.blastdb)):
print("Do you want to download the blast nt databases from ncbi? Note: "
"This is a US government website! You agree to their terms")
x = get_raw_input()
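The change above tests for the last nt volume (nt.60) rather than the first, so the prompt only fires when the set is incomplete. A minimal sketch of that completeness check, assuming NCBI's numbered-volume naming; the helper name is hypothetical, and input() stands in for the module's own get_raw_input():

    import os

    def local_nt_db_complete(blastdb, last_volume=60):
        # NCBI distributes nt in numbered volumes; if the index file of the
        # final volume is present, assume the full set has been downloaded.
        return os.path.isfile("{}/nt.{:02d}.nhr".format(blastdb, last_volume))

    if not local_nt_db_complete("/tmp/blastdb"):
        answer = input("Download the blast nt databases from ncbi? (yes/no) ")
        if answer.strip().lower() in ("yes", "y"):
            pass  # kick off the download here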
@@ -764,7 +763,8 @@ def prune_short(self, min_seqlen_perc=0.75):
fi.close()
for tax in prune:
self.otu_dict[tax.label]["^physcraper:status"] = "deleted in prune short"
- assert self.aln.taxon_namespace == self.tre.taxon_namespace
+ # commented out the next line, as it does not hold if we prune aln before placing new seqs in tre
+ # assert self.aln.taxon_namespace == self.tre.taxon_namespace
assert treed_taxa.issubset(aln_ids)
self.orig_seqlen = [len(self.aln[tax].symbols_as_string().replace("-", "").replace("N", "")) for tax in self.aln]
self.trim()
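For context, the pruning criterion compares each sequence's ungapped length against a fraction of the original sequences' average length, as the orig_seqlen line above suggests. A self-contained sketch with illustrative names, not the module's API:

    def ungapped_len(seq):
        # length without alignment gaps ("-") or hard-masked bases ("N")
        return len(seq.replace("-", "").replace("N", ""))

    def too_short(seq, orig_seqlen, min_seqlen_perc=0.75):
        # compare against the average ungapped length of the original sequences
        avg = sum(orig_seqlen) / float(len(orig_seqlen))
        return ungapped_len(seq) < min_seqlen_perc * avg

    print(too_short("ACGT" * 150, [900, 1000, 1100]))  # True: 600 < 0.75 * 1000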
@@ -851,6 +851,15 @@ def add_otu(self, gb_id, ids_obj):
tax_name = self.gb_dict[gb_id]["^ot:ottTaxonName"]
ncbi_id = self.gb_dict[gb_id]["^ncbi:taxon"]
ott_id = self.gb_dict[gb_id]["^ot:ottId"]
+ if tax_name is None:
+ tax_name = self.gb_dict[gb_id][u'^user:TaxonName']
+ if ncbi_id is None:
+ debug(tax_name.split(" ")[0])
+ tax_lin_name = tax_name.split(" ")[0]
+ tax_lin_name = tax_lin_name.split("_")[0]
+ print(tax_lin_name)
+ ncbi_id = ids_obj.ncbi_parser.get_id_from_name(tax_lin_name)
+ # ncbi_id = 00000
elif len(gb_id.split(".")) >= 2: # used to figure out if gb_id is from Genbank
if gb_id in self.gb_dict.keys() and "staxids" in self.gb_dict[gb_id].keys():
tax_name = self.gb_dict[gb_id]["sscinames"]
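The added fallback resolves a missing ncbi_id from the first word of the taxon label, i.e. the genus. A standalone sketch of that normalization, with a stand-in lookup dict and an illustrative id:

    def genus_level_ncbi_id(tax_name, name_to_id):
        # "Senecio_vulgaris" or "Senecio vulgaris" -> "Senecio"
        genus = tax_name.split(" ")[0].split("_")[0]
        return name_to_id.get(genus)

    print(genus_level_ncbi_id("Senecio_vulgaris", {"Senecio": 12345}))  # 12345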
@@ -1061,7 +1070,7 @@ def get_mrca_ott(ott_ids):
tree_of_life.mrca(ott_ids=[ott], wrap_response=False)
synth_tree_ott_ids.append(ott)
except:
- # except HTTPError as err: # TODO: this is not working, program fails with HTTPError
+ # except HTTPError as err:  # TODO: this is not working, program fails with HTTPError
# sys.stderr.write(err)
debug("except")
ott_ids_not_in_synth.append(ott)
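The TODO above says catching HTTPError fails; one guess is that the imported exception class does not match the one the OpenTree client actually raises. A hedged sketch of a narrower handler, assuming (unverified) that the call surfaces requests' HTTPError; the OTT ids are illustrative:

    import sys
    from peyotl.api import APIWrapper
    from requests.exceptions import HTTPError  # assumption: client surfaces requests' HTTPError

    tree_of_life = APIWrapper().tree_of_life
    synth_tree_ott_ids, ott_ids_not_in_synth = [], []
    for ott in [770315, 417950]:  # illustrative OTT ids
        try:
            tree_of_life.mrca(ott_ids=[ott], wrap_response=False)  # call as in the source
            synth_tree_ott_ids.append(ott)
        except HTTPError as err:
            sys.stderr.write("ott{} not in synth: {}\n".format(ott, err))
            ott_ids_not_in_synth.append(ott)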
@@ -1103,7 +1112,7 @@ class IdDicts(object):
"""Class contains different taxonomic identifiers and helps to find the corresponding ids between ncbi and OToL
To build the class the following is needed:
* **config_obj**: Object of class config (see above)
* **workdir**: the path to the assigned working directory
* **mrca**: mrca as defined by input, can be a class
@@ -1159,6 +1168,7 @@ def __init__(self, config_obj, workdir, mrca=None):
self.spn_to_ncbiid = {} # spn to ncbi_id, it's only fed by the ncbi_data_parser, but makes it faster
self.ncbiid_to_spn = {}
self.mrca_ott = mrca # mrca_list
+ assert type(self.mrca_ott) in [int, list] or self.mrca_ott is None
self.mrca_ncbi = set() # corresponding ids for mrca_ott list
fi = open(config_obj.ott_ncbi)
for lin in fi:
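The loop above feeds mrca_ncbi from the ott_ncbi mapping file. A sketch of that parse, assuming the file is a comma-separated table of OTT id, NCBI id per line (the exact column layout is an assumption here):

    ott_to_ncbi = {}
    with open("ott_ncbi") as fi:  # the source opens config_obj.ott_ncbi
        for lin in fi:
            parts = lin.strip().split(",")
            ott_to_ncbi[int(parts[0])] = int(parts[1])

    mrca_ott = [770315]  # illustrative
    mrca_ncbi = {o_id for o, o_id in ott_to_ncbi.items() if o in mrca_ott}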
@@ -1203,7 +1213,6 @@ def ott_id_to_ncbiid(self, ott_id):
ncbi_id = self.otu_rank[ott_name]["taxon id"]
else:
ncbi_id = self.ncbi_parser.get_id_from_name(ott_name)
-
else:
tx = APIWrapper().taxomachine
nms = tx.taxon(ott_id)
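This method falls back from the local ncbi taxonomy parser to the OpenTree taxomachine web service. A condensed sketch of that two-path resolution; the response key "ncbi_id" is an assumption, not a verified part of the taxomachine reply:

    from peyotl.api import APIWrapper

    def ott_to_ncbi_id(ott_name, ott_id, ncbi_parser=None):
        if ncbi_parser is not None:              # fast path: local taxonomy dump, no network
            return ncbi_parser.get_id_from_name(ott_name)
        tx = APIWrapper().taxomachine            # slow path: OpenTree web service, as in the source
        nms = tx.taxon(ott_id)
        return nms.get("ncbi_id")                # assumed response key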
@@ -1667,7 +1676,7 @@ def run_blast_wrapper(self, delay=14):
if self.config.blast_loc == 'local':
self.run_local_blast_cmd(query, taxon.label, fn_path)
if self.config.blast_loc == 'remote':
- if len(self.ids.mrca_ncbi) >=2:
+ if len(self.ids.mrca_ncbi) >= 2:
len_ncbi = len(self.ids.mrca_ncbi)
equery = ''
for ncbi_id in self.ids.mrca_ncbi:
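When two or more MRCA taxon ids are set, the remote BLAST is restricted with an Entrez query that ORs the taxa together. A sketch of building such a string with NCBI's txid...[orgn] organism syntax; the exact separator the source uses may differ:

    def build_equery(ncbi_ids):
        # e.g. {18794, 4232} -> "txid4232[orgn] OR txid18794[orgn]"
        return " OR ".join("txid{}[orgn]".format(i) for i in sorted(ncbi_ids))

    print(build_equery({18794, 4232}))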
@@ -1764,6 +1773,11 @@ def read_local_blast_query(self, fn_path):
if gb_acc not in self.data.gb_dict: # skip ones we already have
self.new_seqs[gb_acc] = query_dict[key]["sseq"]
self.data.gb_dict[gb_acc] = query_dict[key]
+ else:
+ fn = open("{}/blast_threshold_not_passed.csv".format(self.workdir), "a")
+ fn.write("blast_threshold_not_passed:\n")
+ fn.write("{}, {}, {}".format(query_dict[key]["sscinames"], query_dict[key]["accession"], query_dict[key]["evalue"]))
+ fn.close()

def read_unpublished_blast_query(self):
"""
@@ -1854,7 +1868,7 @@ def read_blast_wrapper(self, blast_dir=None):
file_ending = "xml"
if self.config.gb_id_filename is True:
fn = self.data.otu_dict[taxon.label].get('^ncbi:accession', taxon.label)
- if fn == None:
+ if fn is None:
fn = self.data.otu_dict[taxon.label].get('^user:TaxonName', taxon.label)
fn_path = "{}/{}.{}".format(self.blast_subdir, fn, file_ending)
else:
@@ -2046,8 +2060,19 @@ def remove_identical_seqs(self):
# get rank to delimit seq to ingroup_mrca
# get name first
if gb_id[:6] == "unpubl":
- tax_name = self.data.gb_dict[gb_id]["^ot:ottTaxonName"]
- ncbi_id = self.data.gb_dict[gb_id]["^ncbi:taxon"]
+ debug("unpubl data")
+ debug(self.data.gb_dict[gb_id])
+ tax_name = self.data.gb_dict[gb_id][u"^ot:ottTaxonName"]
+ ncbi_id = self.data.gb_dict[gb_id][u"^ncbi:taxon"]
+ if tax_name is None:
+ tax_name = self.data.gb_dict[gb_id][u'^user:TaxonName']
+ if ncbi_id is None:
+ debug(tax_name.split(" ")[0])
+ tax_lin_name = tax_name.split(" ")[0]
+ tax_lin_name = tax_lin_name.split("_")[0]
+ print(tax_lin_name)
+ ncbi_id = self.ids.ncbi_parser.get_id_from_name(tax_lin_name)
+ # ncbi_id = 00000
elif len(gb_id.split(".")) >= 2:
if gb_id in self.data.gb_dict.keys() and 'staxids' in self.data.gb_dict[gb_id].keys():
tax_name = self.data.gb_dict[gb_id]['sscinames']
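This block repeats the name/id fallback introduced in add_otu above. A hedged refactor would hoist it into one helper so both call sites stay in sync; the function name here is hypothetical:

    def fill_missing_ids(gb_entry, ncbi_parser):
        # fall back to the user-supplied name, then to a genus-level NCBI lookup
        tax_name = gb_entry.get(u"^ot:ottTaxonName") or gb_entry.get(u"^user:TaxonName")
        ncbi_id = gb_entry.get(u"^ncbi:taxon")
        if ncbi_id is None and tax_name is not None:
            genus = tax_name.split(" ")[0].split("_")[0]
            ncbi_id = ncbi_parser.get_id_from_name(genus)
        return tax_name, ncbi_id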
@@ -2238,7 +2263,6 @@ def est_full_tree(self):
os.rename(filename, "{}/{}_tmp".format(self.workdir, filename.split("/")[-1]))
try:
num_threads = int(self.config.num_threads)
-
if self.backbone is not True:
subprocess.call(["raxmlHPC-PTHREADS", "-T", "{}".format(num_threads), "-m", "GTRCAT",
"-s", "papara_alignment.extended",
@@ -2368,7 +2392,7 @@ def generate_streamed_alignment(self):
self.write_query_seqs()
self.align_query_seqs()
self.place_query_seqs()
- self.prune_short()
+ self.data.prune_short()
self.est_full_tree()
self.data.tre = Tree.get(path="{}/RAxML_bestTree.{}".format(self.workdir, self.date),
schema="newick",
@@ -2407,7 +2431,6 @@ def generate_streamed_alignment(self):
sys.stdout.write("No new sequences after filtering.\n")
self.repeat = 0
self.calculate_bootstrap()
-
else:
if _VERBOSE:
sys.stdout.write("No new sequences found.\n")
@@ -2466,12 +2489,13 @@ class FilterBlast(PhyscraperScrape):
The second option (downtorank), which is optional, allows filtering according to taxonomic level, e.g. getting
a number of representative sequences for a genus or lineage; it can also be used to deal with subspecies.
- existing self objects are:
+ Existing self objects are:
self.sp_d: dictionary
key = species name/id
value = list with otu_dict entry
self.sp_seq_d: dictionary
key = species name/id
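The sp_d structure documented here maps a taxon id to all of its otu_dict entries. A sketch of building such a dict with defaultdict; the entries are illustrative:

    from collections import defaultdict

    otu_dict = {  # illustrative otu entries
        "otu1": {"^ncbi:taxon": 18794, "^physcraper:status": "original"},
        "otu2": {"^ncbi:taxon": 18794, "^physcraper:status": "query"},
        "otu3": {"^ncbi:taxon": 4232, "^physcraper:status": "query"},
    }

    sp_d = defaultdict(list)
    for otu_id, info in otu_dict.items():
        sp_d[info["^ncbi:taxon"]].append(info)  # key = taxon id, value = list of entries

    print(sorted(sp_d.keys()))  # [4232, 18794]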
@@ -2505,7 +2529,6 @@ def add_setting_to_self(self, downtorank, threshold):
self.threshold = threshold
self.downtorank = downtorank

-
def sp_dict(self, downtorank=None):
"""Takes the information from the Physcraper otu_dict and makes a dict with species name as key and
the corresponding seq information from aln and blast seq; it returns self.sp_d.
@@ -2822,7 +2845,7 @@ def count_num_seq(self, tax_id):
query_count += 1
if item_split == "added," or item_split == "original":
seq_present += 1
- if new_taxon == False:
+ if new_taxon is False:
assert seq_present != 0
count_dict = {
"seq_present": seq_present,
@@ -2849,7 +2872,7 @@ def how_many_sp_to_keep(self, threshold, selectby):
seq_present = count_dict["seq_present"]
query_count = count_dict["query_count"]
new_taxon = count_dict["new_taxon"]
- if seq_present <= threshold: # add seq to aln
+ if seq_present <= threshold:  # add seq to aln
if seq_present + query_count <= threshold: # add all stuff to self.filtered_seq[gi_n]
self.add_all(tax_id)
else: # filter number of sequences
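The branch above keys off how many sequences a taxon already has (seq_present) versus how many new hits arrived (query_count). A worked sketch of the same decision with threshold = 5; the function name is illustrative:

    def decide(seq_present, query_count, threshold):
        if seq_present <= threshold:                  # taxon still under the cap
            if seq_present + query_count <= threshold:
                return "add all"                      # everything fits under the cap
            return "filter"                           # must select among the new hits
        return "skip"                                 # taxon already at/over the cap

    for args in [(2, 2), (3, 4), (6, 1)]:
        print(args, decide(args[0], args[1], threshold=5))
    # (2, 2) -> add all, (3, 4) -> filter, (6, 1) -> skip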
@@ -2873,12 +2896,12 @@ def how_many_sp_to_keep(self, threshold, selectby):
elif len(self.sp_seq_d[tax_id]) + seq_present < threshold:
self.add_all(tax_id)
elif 1 <= seq_present < threshold and new_taxon is False and query_count != 0:
- # species is not new in alignment, make blast with existing seq
+     # species is not new in alignment, make blast with existing seq
debug("old_taxon")
debug([seq_present, query_count])
if query_count + seq_present > threshold:
taxonfn = self.loop_for_write_blast_files(tax_id)
- debug([tax_id,taxonfn])
+ debug([tax_id, taxonfn])
if self.downtorank is not None:
taxonfn = tax_id
local_blast.run_filter_blast(self.workdir, taxonfn, taxonfn)
@@ -2910,11 +2933,9 @@ def replace_new_seq(self):
if '^ncbi:accession' in self.data.otu_dict[key]:
if self.data.otu_dict[key]['^ncbi:accession'] == gb_id:
self.data.otu_dict[key]['^physcraper:last_blasted'] = "1900/01/01"
-
- print(self.data.otu_dict[key]['^physcraper:status'])
- if self.data.otu_dict[key]['^physcraper:status'] == "query" or self.data.otu_dict[key]['^physcraper:status'].split(" ")[0] == 'new' :
+ debug(self.data.otu_dict[key]['^physcraper:status'])
+ if self.data.otu_dict[key]['^physcraper:status'] == "query" or self.data.otu_dict[key]['^physcraper:status'].split(" ")[0] == 'new':
self.data.otu_dict[key]['^physcraper:status'] = 'not added, there are enough seq per sp in tre'
-
for gb_id in keylist:
if len(gb_id.split(".")) == 1:
debug(gb_id)