Skip to content

Commit

Permalink
Merge pull request #87 from McTavishLab/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
snacktavish committed May 22, 2020
2 parents 0df4ad7 + 442d870 commit 21007d4
Show file tree
Hide file tree
Showing 19 changed files with 134 additions and 310 deletions.
3 changes: 2 additions & 1 deletion bin/physcraper_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@
conf.email = args.email

sys.stdout.write("Configuration Settings\n")
sys.stdout.write(conf.config_str())
sys.stdout.write(conf.config_str()+"\n")

study_id = None
if args.tree_link:
Expand Down Expand Up @@ -154,6 +154,7 @@
configfile = conf)
sys.stdout.write("{} taxa in alignment and tree\n".format(len(scraper.data.aln)))
scraper.data.write_files()
scraper.data.write_files(treepath = "before_otu_{}.tre".format(scraper.data.tag), alnpath = "before_otu_{}.aln".format(scraper.data.tag))
scraper.data.write_labelled(filename="before_labelled_{}".format(scraper.data.tag), label='^ot:ottTaxonName')
scraper.data.write_otus(schema="json")

Expand Down
2 changes: 1 addition & 1 deletion docs/ConfigObj.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ Pulls out the configuration information from
* **self.e_value_thresh**: the defined threshold for the e-value during Blast searches, check out:
https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=FAQ
* **self.hitlist_size**: the maximum number of sequences retrieved by a single blast search
* **self.seq_len_perc**: value from 0 to 1. Defines how much shorter new seq can be compared to input
* **self.minlen**: value from 0 to 1. Defines how much shorter new seq can be compared to input
* **self.get_ncbi_taxonomy**: Path to sh file doing something...
* **self.ncbi_dmp**: path to file that has gi numbers and the corresponding ncbi tax id's
* **self.phylesystem_loc**: defines which phylesystem for OpenTree datastore is used.
Expand Down
7 changes: 2 additions & 5 deletions docs/examples/example.config
Original file line number Diff line number Diff line change
Expand Up @@ -42,13 +42,10 @@ unmapped = keep
spp_threshold = 5

#This is how much shorter new sequences are allowed to be compared to your original sequence lengths when added to aln. Is used in during the process of adding new seqs as well as removing seq that are too short
seq_len_perc = 0.8

# value that determines how many seq need to be present before the beginning and end of alignment will be trimmed
trim_perc = 0.75
min_length = 0.8

# max length for values to add to aln
max_len = 2.5
max_length = 1.5

#You should not need to change any of these!
taxonomy_path = taxonomy
100 changes: 49 additions & 51 deletions physcraper/aligntreetax.py
Original file line number Diff line number Diff line change
Expand Up @@ -353,7 +353,7 @@ def prune_short(self):
self.aln]
avg_seqlen = sum(self.orig_seqlen) / len(self.orig_seqlen)
#debug("average sequence length is {}".format(avg_seqlen))
seq_len_cutoff = avg_seqlen * self.config.seq_len_perc
seq_len_cutoff = avg_seqlen * self.config.minlen
prune = []
aln_ids = set()
for tax, seq in self.aln.items():
Expand All @@ -376,56 +376,6 @@ def prune_short(self):
self.check_tre_in_aln()
self._reconciled = 1

def trim(self):
""" It removes bases at the start and end of alignments, if they are represented by less than the value
specified in the config file. E.g. 0.75 given in config means, that 75% of the sequences need to have a
base present
Ensures, that not whole chromosomes get dragged in. It's cutting the ends of long sequences.
has test: test_trim.py
"""
# debug('in trim')
taxon_missingness = self.config.trim_perc
i = 0
seqlen = len(self.aln[i])
while seqlen == 0:
i = i + 1
seqlen = len(self.aln[i])
for tax in self.aln:
if len(self.aln[tax]) != seqlen:
sys.stderr.write("can't trim un-aligned inputs, moving on")
return
start = 0
stop = seqlen
cutoff = len(self.aln) * taxon_missingness
for i in range(seqlen):
counts = {"?": 0, "-": 0}
for tax in self.aln:
call = self.aln[tax][i].label
if call in ["?", "-"]:
counts[call] += 1
if counts['?'] + counts['-'] <= cutoff: # first ok column
start = i
break
for i in range(seqlen, 0, -1): # previously seqlen-1, that cuts off last character of aln, I changed it.
counts = {'?': 0, '-': 0}
for tax in self.aln:
call = self.aln[tax][i - 1].label # changing seqlen-1 to seqlen requires that we have here i-1
if call in ['?', '-']:
counts[call] += 1
if counts['?'] + counts['-'] <= cutoff:
stop = i
break
# here alignment gets shortened to start:stop
for taxon in self.aln:
self.aln[taxon] = self.aln[taxon][start:stop]
# make sure that tre is presented in aln
self.check_tre_in_aln()
if _VERBOSE:
sys.stdout.write("trimmed alignment ends to < {} missing taxa, "
"start {}, stop {}\n".format(taxon_missingness, start, stop))
return


def check_tre_in_aln(self):
Expand Down Expand Up @@ -474,6 +424,54 @@ def remove_taxa_aln_tre(self, taxon_label):
else:
self.otu_dict[taxon_label]['^physcraper:status'] = "deleted, updated otu_dict but was never in tre or aln!"

def trim(self, min_taxon_perc):
""" It removes bases at the start and end of alignments, if they are represented by less than the value
specified. E.g. 0.75 that 75% of the sequences need to have a base present
Ensures, that not whole chromosomes get dragged in. It's cutting the ends of long sequences.
has test: test_trim.py
"""
# debug('in trim')
i = 0
seqlen = len(self.aln[i])
while seqlen == 0:
i = i + 1
seqlen = len(self.aln[i])
for tax in self.aln:
if len(self.aln[tax]) != seqlen:
sys.stderr.write("can't trim un-aligned inputs, moving on")
return
start = 0
stop = seqlen
cutoff = len(self.aln) * min_taxon_perc
for i in range(seqlen):
counts = {"?": 0, "-": 0}
for tax in self.aln:
call = self.aln[tax][i].label
if call in ["?", "-"]:
counts[call] += 1
if counts['?'] + counts['-'] <= cutoff: # first ok column
start = i
break
for i in range(seqlen, 0, -1): # previously seqlen-1, that cuts off last character of aln, I changed it.
counts = {'?': 0, '-': 0}
for tax in self.aln:
call = self.aln[tax][i - 1].label # changing seqlen-1 to seqlen requires that we have here i-1
if call in ['?', '-']:
counts[call] += 1
if counts['?'] + counts['-'] <= cutoff:
stop = i
break
# here alignment gets shortened to start:stop
for taxon in self.aln:
self.aln[taxon] = self.aln[taxon][start:stop]
# make sure that tre is presented in aln
self.check_tre_in_aln()
if _VERBOSE:
sys.stdout.write("trimmed alignment ends to < {} missing taxa, "
"start {}, stop {}\n".format(min_taxon_perc, start, stop))
return

def get_otu_for_acc(self, gb_id):
if gb_id in set([self.otu_dict[otu].get("^ncbi:accession",'UNK') for otu in self.otu_dict]):
Expand Down
27 changes: 10 additions & 17 deletions physcraper/configobj.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ class ConfigObj(object):
* **self.e_value_thresh**: the defined threshold for the e-value during Blast searches, check out: https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=FAQ
* **self.hitlist_size**: the maximum number of sequences retrieved by a single blast search
* **self.seq_len_perc**: value from 0 to 1. Defines how much shorter new seq can be compared to input
* **self.minlen**: value from 0 to 1. Defines how much shorter new seq can be compared to input
* **self.trim_perc**: value that determines how many seq need to be present before the beginning and end of alignment will be trimmed
* **self.maxlen**: max length for values to add to aln
* **self.get_ncbi_taxonomy**: Path to sh file doing something...
Expand Down Expand Up @@ -86,8 +86,7 @@ def set_defaults(self):
self.num_threads = 4
self.delay = 90
self.spp_threshold = 5
self.seq_len_perc = 0.8
self.trim_perc = 0.8
self.minlen = 0.8
self.maxlen = 1.2
self.url_base = None
self.taxonomy_dir = "{}/taxonomy".format(physcraper_dir)
Expand All @@ -104,9 +103,8 @@ def config_str(self):
delay = {delay}
[physcraper]
spp_threshold = {sppt}
seq_len_perc = {perc}
trim_perc = {t_perc}
max_len = {max_len}
min_length = {perc}
max_length = {maxlen}
taxonomy_path = {taxonomy}'''.format(
email=self.email,
e_val=self.e_value_thresh,
Expand All @@ -117,9 +115,8 @@ def config_str(self):
nt=self.num_threads,
delay=self.delay,
sppt=self.spp_threshold,
perc=self.seq_len_perc,
t_perc=self.trim_perc,
max_len=self.maxlen,
perc=self.minlen,
maxlen=self.maxlen,
taxonomy = self.taxonomy_dir)
return(config_text)
def write_file(self, workdir, filename = "run.config"):
Expand Down Expand Up @@ -184,16 +181,12 @@ def read_config(self, configfi, interactive):
)
# #############
# read in physcraper settings
self.seq_len_perc = float(config["physcraper"]["seq_len_perc"])
assert 0 < self.seq_len_perc <= 1, (
"value `%s` is not between 0 and 1" % self.seq_len_perc
self.minlen = float(config["physcraper"]["min_length"])
assert 0 < self.minlen <= 1, (
"value `%s` is not between 0 and 1" % self.minlen
)
self.spp_threshold = int(config["physcraper"]["spp_threshold"])
self.trim_perc = float(config["physcraper"]["trim_perc"])
assert 0 < self.trim_perc < 1, (
"value `%s` is not between 0 and 1" % self.trim_perc
)
self.maxlen = float(config["physcraper"]["max_len"])
self.maxlen = float(config["physcraper"]["max_length"])
assert 1 < self.maxlen, (
"value `%s` is not larger than 1" % self.maxlen
)
Expand Down
25 changes: 17 additions & 8 deletions physcraper/ids.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,17 +130,24 @@ def get_tax_seq_acc(self, acc):
if len(gb_id.split(".")) == 1:
debug("accession number {} not recognized".format(gb_id))
return None, None, None
seq_path = "{}/{}.json".format(self.full_seq_path, gb_id)
seq_path = "{}/{}_remote.fasta".format(self.full_seq_path, gb_id)
ncbi_id = tax_name = seq = None
if gb_id in self.acc_tax_seq_dict:
tax_name = self.acc_tax_seq_dict[gb_id]["taxname"]
ncbi_id = self.acc_tax_seq_dict[gb_id]["^ncbi:taxon"]
ncbi_id = self.acc_tax_seq_dict[gb_id]["^ncbi:"]
seq = self.acc_tax_seq_dict[gb_id]["seq"]
elif os.path.exists(seq_path):
self.acc_tax_seq_dict[gb_id] = json.load(open(seq_path))
tax_name = self.acc_tax_seq_dict[gb_id]["taxname"]
ncbi_id = self.acc_tax_seq_dict[gb_id]["^ncbi:taxon"]
seq = self.acc_tax_seq_dict[gb_id]["seq"]
else:
fi = open(seq_path)
header = fi.readline().strip('>')
#try:
assert(header.split()[1].startswith('taxname:'))
tax_name = header.split()[1].strip('taxname:')
ncbi_id = header.split()[2].strip('ncbi_id:')
seq = "".join(fi.readlines())
# except IndexError:
# print("IndexError")
# pass
if seq == None:
read_handle = self.entrez_efetch(gb_id)
tax_name = ncbi_data_parser.get_ncbi_tax_name(read_handle)
ncbi_id = ncbi_data_parser.get_ncbi_tax_id(read_handle)
Expand All @@ -149,7 +156,9 @@ def get_tax_seq_acc(self, acc):
self.ncbiid_to_spn[ncbi_id] = tax_name
self.acc_ncbi_dict[gb_id] = ncbi_id
self.acc_tax_seq_dict[gb_id] = {'taxname':tax_name, "^ncbi:taxon":ncbi_id, 'seq':seq} #This is going to be a memory hog...
json.dump(self.acc_tax_seq_dict[gb_id], open(seq_path, 'w'))
with open(seq_path, 'w') as fi:
fi.write("> {} taxname:{} ncbi:{}\n".format(gb_id, tax_name, ncbi_id))
fi.write(self.acc_tax_seq_dict[gb_id]['seq'])
assert ncbi_id is not None
return ncbi_id, tax_name, seq

Expand Down
2 changes: 1 addition & 1 deletion physcraper/opentree_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,7 @@ def OtuJsonDict(id_to_spn, id_dict):
else:
ncbi_spn = id_dict.ott_to_ncbi[ottid]
sp_info_dict[otu_id] = {
"^ncbi:taxon": ncbiid,
"^ncbi:taxon": int(ncbiid),
"^ot:ottTaxonName": ottname,
"^ot:ottId": ottid,
"^ot:originalLabel": tipname,
Expand Down
17 changes: 7 additions & 10 deletions physcraper/scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,8 +178,6 @@ def run_local_blast_cmd(self, query, taxon_label, fn_path):
outfmt = "6 sseqid staxids sscinames pident evalue bitscore sseq salltitles sallseqid"
# outfmt = " -outfmt 5" # format for xml file type
# TODO query via stdin
#blastn -query /home/ejmctavish/projects/otapi/data_physcraper/output_pg55_local/current_blast_run/tmp.fas -db /home/ejmctavish/ncbi/localblastdb//nt -out /home/ejmctavish/projects/otapi/data_physcraper/output_pg55_local/current_blast_run/otu376447.txt -outfmt '6 sseqid staxids sscinames pident evalue bitscore sseq salltitles sallseqid' -num_threads 2 -max_target_seqs 10 -max_hsps 10

blastcmd = ["blastn",
"-query",
"{}/tmp.fas".format(abs_blastdir),
Expand Down Expand Up @@ -616,18 +614,18 @@ def seq_dict_build(self, seq, new_otu_label, seq_dict):
#debug("seq {} is shorter than {}".format(new_otu_label, otu_lab))
if new_seq in inc_seq:# if seq is identical and shorter
#debug("seq is identical and shorter")
if existing_tax_id != tax_new_seq: # different taxa
if int(existing_tax_id) != int(tax_new_seq): # different taxa
if _VERBOSE or _DEBUG:
sys.stdout.write("\nseq {} is subsequence of {}, "
sys.stdout.write("\nseq {} is identical and/or subsequence of {}, "
"but different species name \n".format(new_otu_label, otu_lab))
reason = "new seq added; subsequence of {}, but different taxon".format(otu_lab)
reason = "new seq added; identical and/or subsequence of {}, but different taxon".format(otu_lab)
#debug("otu {}, tmp status {}".format(new_otu_label, reason))
#still should be added, but need to check other samples
else: # subseq of same otu
reason = "seq {} is subsequence or identical to {}, not added ".format(new_otu_label, otu_lab)
if _VERBOSE:
sys.stdout.write("\n{}\n".format(reason))
self.data.otu_dict[new_otu_label]['^physcraper:status'] = "subsequence, not added"
self.data.otu_dict[new_otu_label]['^physcraper:status'] = "identical and/or subsequence of {}, not added".format(otu_lab)
#debug("{} not added, subseq of {}".format(new_otu_label, otu_lab))
#debug("{} was NOT added to seq_dict: {}".format(new_otu_label, reason))
return seq_dict
Expand All @@ -652,7 +650,7 @@ def seq_dict_build(self, seq, new_otu_label, seq_dict):
del seq_dict[otu_lab]
seq_dict[new_otu_label] = seq
self.data.remove_taxa_aln_tre(otu_lab)
reason = "\nseq {} is supersequence of {}, {} added and {} removed ".format(new_otu_label, otu_lab, new_otu_label, otu_lab)
reason = "seq {} is supersequence of {}, {} added and {} removed ".format(new_otu_label, otu_lab, new_otu_label, otu_lab)
if _VERBOSE or _DEBUG:
sys.stdout.write("\n{}\n".format(reason))
self.data.otu_dict[otu_lab]['^physcraper:status'] = "deleted, {} is supersequence ".format(new_otu_label)
Expand Down Expand Up @@ -686,7 +684,7 @@ def remove_identical_seqs(self):
self.data.orig_seqlen = [len(self.data.aln[tax].symbols_as_string().replace("-", "").replace("N", "")) for tax in
self.data.aln]
avg_seqlen = sum(self.data.orig_seqlen) / len(self.data.orig_seqlen) # HMMMMMMMM
seq_len_min = avg_seqlen * self.config.seq_len_perc
seq_len_min = avg_seqlen * self.config.minlen
seq_len_max = avg_seqlen * self.config.maxlen
#debug("we already have {}".format(all_added_gi))
for gb_id, seq in self.new_seqs.items():
Expand All @@ -706,6 +704,7 @@ def remove_identical_seqs(self):
for otu in otu_in_aln:
del tmp_dict[otu]
filter_dict = self.filter_seqs(tmp_dict, type='random', threshold = self.config.spp_threshold)
##EJM TODO add to notes in OTU_DICT
self.new_seqs_otu_id = filter_dict # renamed new seq to their otu_ids from GI's, but all info is in self.otu_dict
self.new_seqs = {} #Wipe clean
# debug("len new seqs otu dict after remove identical{}".format(len(self.new_seqs_otu_id)))
Expand Down Expand Up @@ -912,7 +911,6 @@ def align_new_seqs(self, aligner = 'muscle'):
self.run_papara()
if aligner == 'muscle':
self.run_muscle()
self.data.trim()
alnfi = self.data.write_aln()
self.data.write_otus(schema="table")
self.data.write_otus(schema="json")
Expand Down Expand Up @@ -1207,7 +1205,6 @@ def calculate_final_tree(self):
debug("calculate final tree")
self.data.write_files(treepath="physcraper_final_notrim.tre", alnpath="physcraper_final_notrim.fas")
self.data.prune_short()
self.data.trim()
self.data.write_files(treepath="physcraper_final_trim.tre", alnpath="physcraper_final_trim.fas")
if os.path.exists("[]/previous_run".format(self.workdir)):
self.est_full_tree(path="previous_run")
Expand Down
29 changes: 2 additions & 27 deletions tests/data/aws.config
Original file line number Diff line number Diff line change
Expand Up @@ -43,33 +43,8 @@ delay = 90
unmapped = keep

#This is how much shorter new sequences are allowed to be compared to your original sequence lengths when added to aln. Is used in during the process of adding new seqs as well as removing seq that are too short
seq_len_perc = 0.8

# value that determines how many seq need to be present before the beginning and end of alignment will be trimmed
trim_perc = 0.75
min_length = 0.8

# max length for values to add to aln
max_len = 2.5


#######
## INTERNAL PHYSCRAPER SETTINGS
#---------------------------------------------------------------------------------
#Things below here you should not need to change!

#Only required if blast location is local
[ncbi_parser]
nodes_fn = ./taxonomy/nodes.dmp
names_fn = ./taxonomy/names.dmp

[phylesystem]
location = api
#local or api, leave set to api unless you have installed phylesystem locally

[taxonomy]
#You should not need to change any of these!
ott_ncbi = taxonomy/ott_ncbi
get_ncbi_taxonomy = taxonomy/get_ncbi_taxonomy.sh
ncbi_dmp = taxonomy/gi_taxid_nucl.dmp
id_pickle = taxonomy/id_dmp.p
max_length = 2.5

0 comments on commit 21007d4

Please sign in to comment.