Skip to content

Commit

Permalink
Merge pull request #100 from McTavishLab/conflict
Browse files Browse the repository at this point in the history
Conflict
  • Loading branch information
snacktavish committed Jun 4, 2020
2 parents c2522c3 + 43b6e2a commit 0aa5245
Show file tree
Hide file tree
Showing 21 changed files with 857 additions and 11,473 deletions.
12 changes: 8 additions & 4 deletions bin/physcraper_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,12 @@
if args.configfile:
conf = physcraper.ConfigObj(args.configfile)
elif args.reload_files:
configfile = "{}/run.config".format(args.reload_files)
files = [f for f in os.listdir(workdir)]
for file in files:
if file.startswith('run_'):
tag = file.split('.')[0].replace('run_', '')
configfile = "{}/run_{}/run.config".format(args.reload_files, tag)
conf = physcraper.ConfigObj(configfile)
sys.stdout.write("Using config file {}\n".format(configfile))
else:
conf = physcraper.ConfigObj()

Expand Down Expand Up @@ -105,10 +108,11 @@
study_id = None
if args.tree_link:
linkl = args.tree_link.split("/")
assert(linkl[4]=="view")
study_id == linkl[5]
assert(linkl[5]=="view")
study_id = linkl[6]
tree_id = linkl[-1].split("=")[1]


if args.study_id or args.tree_id:
try:
study_id = args.study_id
Expand Down
65 changes: 46 additions & 19 deletions physcraper/aligntreetax.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,27 +55,54 @@ def generate_ATT_from_files(workdir,
return AlignTreeTax(treefile, otu_dict, alnfile, ingroup_mrca=mrca_ott, workdir=workdir,
configfile=configfile, tree_schema=tree_schema)

def generate_ATT_from_run(workdir, tag=None, configfile=None):
def generate_ATT_from_run(workdir, start_files='output', tag=None, configfile=None):
"""Build an ATT object without phylesystem, use your own files instead.
:return: object of class ATT
"""
files = [f for f in os.listdir(workdir)]
for file in files:
if file.startswith('physcraper_'):
tag = file.split('.')[0].replace('physcraper_', '')
sys.stdout.write("Reloading files with tag {}\n".format(tag))
if file.startswith('inputs_'):
tag = file.split('.')[0].replace('inputs_', '')
assert os.path.exists(workdir)
# use replaced aln as input
rundir = "{}/run_{}".format(workdir, tag)
outputsdir = "{}/outputs_{}".format(workdir, tag)
inputsdir = "{}/inputs_{}".format(workdir, tag)
if configfile == None:
if os.path.exists("{}/run.config".format(workdir)):
configfile = "{}/run.config".format(workdir)
alnfi = "{}/physcraper_{}.fas".format(workdir, tag)
treefile = "{}/physcraper_{}.tre".format(workdir, tag)
otu_json = "{}/otu_info_{}.json".format(workdir, tag)
otu_dict = json.load(open(otu_json, "r"))
mrca_ott = mrca_ott = int(open("{}/mrca.txt".format(workdir)).readline().split()[-1])
return AlignTreeTax(tree = treefile, otu_dict= otu_dict, alignment = alnfi, ingroup_mrca=mrca_ott, workdir=workdir,
configfile=configfile, tag=tag, tree_schema='newick')
configfile = "{}/run.config".format(rundir)
try:
alnfi = "{}/physcraper_{}.fas".format(outputsdir, tag)
treefile = "{}/physcraper_{}.tre".format(inputsdir, tag)
otu_json = "{}/otu_info_{}.json".format(rundir, tag)
assert(os.path.exists(alnfi))
assert(os.path.exists(treefile))
assert(os.path.exists(otu_json))
otu_dict = json.load(open(otu_json, "r"))
mrca_ott = mrca_ott = int(open("{}/mrca.txt".format(inputsdir)).readline().split()[-1])
return AlignTreeTax(tree = treefile, otu_dict= otu_dict, alignment = alnfi, ingroup_mrca=mrca_ott, workdir=workdir,
configfile=configfile, tag=tag, tree_schema='newick')
except AssertionError:
sys.stdout.write("No output files found in {}, loading files from {}\n".format(outputsdir, inputsdir))
alnfi = "{}/physcraper_{}.fas".format(inputsdir, tag)
treefile = "{}/physcraper_{}.tre".format(inputsdir, tag)
otu_json = "{}/otu_info_{}.json".format(rundir, tag)
assert(os.path.exists(alnfi)), alnfi
assert(os.path.exists(treefile)), treefile
assert(os.path.exists(otu_json)), otu_json
otu_dict = json.load(open(otu_json, "r"))
mrca_ott = mrca_ott = int(open("{}/mrca.txt".format(inputsdir)).readline().split()[-1])
return AlignTreeTax(tree = treefile, otu_dict= otu_dict, alignment = alnfi, ingroup_mrca=mrca_ott, workdir=workdir,
configfile=configfile, tag=tag, tree_schema='newick')
#except AssertionError:
# sys.stdout.write("No run files found in {} or {}. Data not loaded\n".format(outputsdir, inputsdir))



# use replaced aln as input







#def concatenate_ATTs(att_list, number_per_taxon='max', level='spp'):
Expand Down Expand Up @@ -522,6 +549,7 @@ def add_otu(self, gb_id, ids_obj):
self.otu_dict[otu_id]["^physcraper:status"] = "query"
self.otu_dict[otu_id]["^ot:ottTaxonName"] = ott_name
self.otu_dict[otu_id]["^physcraper:last_blasted"] = None
self.otu_dict[otu_id]["^physcraper:ingroup"] = True
if gb_id[:6] == "unpubl":
self.otu_dict[otu_id]["^physcraper:status"] = "local seq"
self.otu_dict[otu_id]["^ot:originalLabel"] = self.gb_dict[gb_id]["localID"]
Expand Down Expand Up @@ -638,7 +666,7 @@ def write_labelled(self, label, filename = "labelled", direc='workdir', norepeat
alnpath = "{}/{}.fas".format(direc, filename)
debug(treepath)
assert label in ['^ot:ottTaxonName', '^user:TaxonName', '^physcraper:TaxonName',
"^ot:originalLabel", "^ot:ottId", "^ncbi:taxon"]
"^ot:originalLabel", "^ot:ottId", "^ncbi:taxon", 'name_and_id']
tmp_newick = self.tre.as_string(schema="newick")
tmp_tre = Tree.get(data=tmp_newick,
schema="newick",
Expand All @@ -654,8 +682,7 @@ def write_labelled(self, label, filename = "labelled", direc='workdir', norepeat
if self.otu_dict[taxon.label].get("^ot:originalLabel"):
new_label = "orig_{}".format(self.otu_dict[taxon.label]["^ot:originalLabel"])
else:
new_label = "ncbi_{}_ottname_{}".format(self.otu_dict[taxon.label].get("^ncbi:taxon", "unk"),
self.otu_dict[taxon.label].get('^physcraper:TaxonName', "unk"))
new_label = taxon.label
new_label = str(new_label).replace(' ', '_')
if add_gb_id:
gb_id = self.otu_dict[taxon.label].get('^ncbi:accession')
Expand All @@ -667,7 +694,7 @@ def write_labelled(self, label, filename = "labelled", direc='workdir', norepeat
new_label = "_".join([new_label, str(sp_counter)])
sp_counter += 1
else:
if new_label in new_names and norepeats:
if norepeats:
new_label = "_".join([new_label, taxon.label])
taxon.label = new_label
new_names.add(new_label)
Expand Down Expand Up @@ -699,7 +726,7 @@ def write_otus(self, filename = "otu_info", schema="table", direc='workdir'):
# all_keys.update(self.otu_dict[otu].keys())
#keys = list(all_key)
#keys.sort()
keys = ['^ot:ottTaxonName','^ot:ottId','^ncbi:taxon','^ncbi:accession','^ncbi:gi','^physcraper:last_blasted','^physcraper:status','^ot:originalLabel','^ncbi:title']
keys = ['^ot:ottTaxonName','^ot:ottId','^ncbi:taxon','^ncbi:accession','^ncbi:gi','^physcraper:last_blasted','^physcraper:ingroup','^physcraper:status','^ot:originalLabel','^ncbi:title']
header = ["otu_id"] + keys
with open("{}/{}_{}.csv".format(direc, filename, self.tag), "w") as outfile:
outfile.write("\t".join(header)+"\n")
Expand Down
13 changes: 6 additions & 7 deletions physcraper/configobj.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,26 +98,23 @@ def config_str(self):
hitlist_size = {hls}
location = {bl}
localblastdb = {db}
url_base = {ul}
num_threads = {nt}
delay = {delay}
[physcraper]
spp_threshold = {sppt}
min_length = {perc}
max_length = {maxlen}
taxonomy_path = {taxonomy}'''.format(
'''.format(
email=self.email,
e_val=self.e_value_thresh,
hls=self.hitlist_size,
bl=self.blast_loc,
db=self.blastdb,
ul=self.url_base,
nt=self.num_threads,
delay=self.delay,
sppt=self.spp_threshold,
perc=self.minlen,
maxlen=self.maxlen,
taxonomy = self.taxonomy_dir)
maxlen=self.maxlen)
return(config_text)
def write_file(self, direc, filename = "run.config"):
config_text = self.config_str()
Expand Down Expand Up @@ -163,10 +160,12 @@ def read_config(self, configfi, interactive):
self.set_local()
if self.blast_loc == "remote":
self.url_base = config["blast"].get("url_base")
if self.url_base == 'None':
self.url_base = None
if _DEBUG:
sys.stdout.write("{}\n".format(self.email))
if self.blast_loc == "remote":
sys.stdout.write("url base = {}\n".format(self.url_base))
#if self.blast_loc == "remote":
# sys.stdout.write("url base = {}\n".format(self.url_base))
sys.stdout.write("{}\n".format(self.blast_loc))
if self.blast_loc == "local":
sys.stdout.write("local blast db {}\n".format(self.blastdb))
Expand Down
25 changes: 16 additions & 9 deletions physcraper/opentree_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,10 @@ def generate_ATT_from_phylesystem(alnfile,
otu_dict = {tn.taxon.otu:{} for tn in tree_obj.leaf_node_iter()}
orig_lab_to_otu = {}
treed_taxa = {}
ingroup_otus = set(nexson_helpers.get_subtree_otus(study_nexson,
tree_id=tree_id,
subtree_id="ingroup",
return_format="otu_id"))
for leaf in tree_obj.leaf_node_iter():
tn = leaf.taxon
otu_id = tn.otu
Expand All @@ -179,6 +183,10 @@ def generate_ATT_from_phylesystem(alnfile,
otu_dict[otu_id]["^ot:originalLabel"] = tn.original_label.replace(" ", "_")
otu_dict[otu_id]["^physcraper:status"] = "original"
otu_dict[otu_id]["^physcraper:last_blasted"] = None
if otu_id in ingroup_otus:
otu_dict[otu_id]["^physcraper:ingroup"] = True
else:
otu_dict[otu_id]["^physcraper:ingroup"] = False
orig = otu_dict[otu_id].get(u"^ot:originalLabel").replace(" ", "_")
orig_lab_to_otu[orig] = otu_id
if tip_label == 'otu':
Expand All @@ -187,10 +195,6 @@ def generate_ATT_from_phylesystem(alnfile,
tn.label = otu_dict[otu_id].get(tip_label)
treed_taxa[orig] = otu_dict[otu_id].get(u"^ot:ottId")
# need to prune tree to seqs and seqs to tree...
ott_ids = nexson_helpers.get_subtree_otus(study_nexson,
tree_id=tree_id,
subtree_id="ingroup",
return_format="ottid")
ott_mrca = None
if ingroup_mrca:
if type(ingroup_mrca) == list:
Expand All @@ -199,11 +203,14 @@ def generate_ATT_from_phylesystem(alnfile,
else:
ott_mrca = int(ingroup_mrca)
if ott_mrca == None:
ott_ids = set([otu_dict[otu_id].get(u"^ot:ottId") for otu_id in otu_dict])
if None in ott_ids:
ott_ids.remove(None)
assert(len(ott_ids)>=1)
ott_mrca = get_mrca_ott(ott_ids)
ingroup_ott_ids = set()
for otu_id in otu_dict:
if otu_dict[otu_id]["^physcraper:ingroup"] == True:
ingroup_ott_ids.add(otu_dict[otu_id].get(u"^ot:ottId"))
if None in ingroup_ott_ids:
ingroup_ott_ids.remove(None)
assert(len(ingroup_ott_ids)>=1)
ott_mrca = get_mrca_ott(ingroup_ott_ids)
otu_newick = tree_obj.as_string(schema="newick")
return physcraper.aligntreetax.AlignTreeTax(tree = otu_newick, otu_dict =otu_dict, alignment=alnfile, aln_schema = aln_schema, ingroup_mrca=ott_mrca, workdir=workdir, configfile=configfile)
# newick should be bare, but alignment should be DNACharacterMatrix
Expand Down
11 changes: 10 additions & 1 deletion physcraper/scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -938,6 +938,10 @@ def replace_tre(self, filename, schema = 'newick'):
preserve_underscores=True,
taxon_namespace = self.data.aln.taxon_namespace)
aln_tax = [taxon.label for taxon in self.data.aln]
outgroup = [otu_id for otu_id in self.data.otu_dict if self.data.otu_dict[otu_id].get('^physcraper:ingroup', False) == False]
debug("rerooting tree using {} as outgroup".format(outgroup))
mrca = newtre.mrca(taxon_labels = outgroup)
newtre.reroot_at_node(mrca)
for taxon in newtre.leaf_nodes():
assert taxon.taxon.label in self.data.otu_dict, taxon.taxon.label
assert taxon.taxon.label in aln_tax
Expand Down Expand Up @@ -980,8 +984,13 @@ def run_muscle(self, input_aln_path = None, new_seqs_path = None, outname = 'all
sys.stderr.write("error code {}, {}".format(grepexc.returncode, grepexc.output))
f = open('{}/muscle.log'.format(self.rundir), 'a')
try:
cleaned_align_path = "{}/original_cleaned.fas".format(self.rundir)
cleaned_align_file = open(cleaned_align_path, 'w')
subprocess.check_call(["sed", "s/?/-/g",
input_aln_path], stdout=cleaned_align_file, stderr=f)
cleaned_align_file.close()
subprocess.check_call(["muscle", "-profile",
"-in1", input_aln_path,
"-in1", cleaned_align_path,
"-in2", outpath_NEW,
"-out", outpath_ALL], stdout=f, stderr=subprocess.STDOUT)
if _VERBOSE:
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
otu_id ^ot:ottTaxonName ^ot:ottId ^ncbi:taxon ^ncbi:accession ^ncbi:gi ^physcraper:last_blasted ^physcraper:status ^ot:originalLabel ^ncbi:title
Tl805473 Infundibulomyces cupulata 751650 415312 - - None original Infundibulomyces_cupulata_EF113979 -
Tl805456 Tainosphaeria crassiparies 368152 197737 - - None original Tainosphaeria_crassiparies_AF466089 -
Tl805444 Dinemasporium decipiens 878252 1196447 - - None original Dinemasporium_decipiens_CBS_592.73 -
Tl805443 Dinemasporium pseudostrigosum 509897 1196448 - - None original Dinemasporium_pseudostrigosum_CBS_825.91 -
Tl805445 Phomatospora dinemasporium 5308896 0 - - None original Dinemasporium_strigosum_CBS_828.84 -
Tl805424 Dinemasporium americana 1029188 1196450 - - None original Dinemasporium_americana_CBS_127127 -
Tl805447 Dinemasporium polygonum 1029184 1196451 - - None original Dinemasporium_polygonum_CBS_516.95 -
Tl805442 Dinemasporium morbidum 637763 1196453 - - None original Dinemasporium_morbidum_CBS_995.97 -
Tl805436 Dinemasporium morbidum 637763 1196453 - - None original Dinemasporium_morbidum_CBS_129.66 -
Tl805432 Dinemasporium pseudoindicum 1029200 1196452 - - None original Dinemasporium_pseudoindicum_CBS_127402 -
Tl805431 Dinemasporium pseudostrigosum 509897 1196448 - - None original Dinemasporium_pseudostrigosum_CBS_717.85 -
Tl805450 Phomatospora dinemasporium 5308896 0 - - None original Dinemasporium_strigosum_CBS_520.78 -
Tl805452 Phomatospora dinemasporium 5308896 0 - - None original Dinemasporium_strigosum_CPC_18898 -
Tl805470 Rattania setulifera 63948 858408 - - None original Rattania_setulifera_HM171322 -
Tl805467 Chaetosphaeria vermicularioides 597718 2082110 - - None original Melanopsammella_vermicularioides_AF466087 -
Tl805455 Melanopsammella gonytrichii 368155 2571400 - - None original Melanopsammella_gonytrichii_AF466085 -
Tl805469 Melanochaeta aotearoae 368150 1293526 - - None original Melanochaeta_aotearoae_AF466082 -
Tl805463 Melanochaeta hemipsila 368149 1866870 - - None original Melanochaeta_hemipsila_AY346292 -
Tl805448 Pyrigemmula aurantiaca 910985 871251 - - None original Pyrigemmula_aurantiaca_HM241692 -
Tl805453 Lecythothecium duriligni 1026131 156755 - - None original Lecythothecium_duriligni_AF261071 -
Tl805475 Ellisembia brachypus 1008023 2571403 - - None original Ellisembia_brachypus_DQ408563 -
Tl805454 Chaetosphaeria luquillensis 943806 197725 - - None original Chaetosphaeria_luquillensis_AF466074 -
Tl805474 Chaetosphaeria innumera 250265 2082109 - - None original Chaetosphaeria_innumera_AY017375 -
Tl805468 Chloridium lignicola 300282 139962 - - None original Chloridium_lignicola_AF178544 -
Tl805458 Chaetosphaeria tropicalis 368151 197730 - - None original Chaetosphaeria_tropicalis_AF466080 -
Tl805464 Chaetosphaeria capitata 418010 197718 - - None original Chaetosphaeria_capitata_AF466061 -
Tl805471 Umbrinosphaeria caesariata 1026125 156753 - - None original Umbrinosphaeria_caesariata_AF261069 -
Tl805477 Chaetosphaeria conirostris 943809 197720 - - None original Chaetosphaeria_conirostris_AF466066 -
Tl805478 Chaetosphaeria fuegiana 1088172 426109 - - None original Chaetosphaeria_fuegiana_EF063574 -
Tl805466 Chaetosphaeria callimorpha 250260 139944 - - None original Chaetosphaeria_callimorpha_AF466062 -
Tl805457 Chaetosphaeria preussii 773202 139952 - - None original Chaetosphaeria_preussii_AF178561 -
Tl805476 Striatosphaeria codinaeaphora 773196 0 - - None original Striatosphaeria_codinaeophora_AF466088 -
Tl805441 Dendrophoma cytisporoides 590921 1196583 - - None original Dendrophoma_cytisporoides_CBS_223.95 -
Tl805465 Chaetosphaeria ovoidea 301828 1296267 - - None original Chaetosphaeria_ovoidea_AF064641 -
Tl805459 Chaetosphaeria ciliata 420569 796322 - - None original Chaetosphaeria_ciliata_GU180637 -
Tl805462 Chaetosphaeria pulviscula 773200 2075150 - - None original Zignoella_pulviscula_AF466091 -
Tl805460 Chaetosphaeria pulviscula 773200 2075150 - - None original Zignoella_pulviscula_AF466090 -
Tl805433 Dictyochaetopsis gonytrichoides 300280 139964 - - None original Codinaeopsis_gonytrichoides_AF178556 -
Tl805440 Pseudolachnea fraxini 1029202 1196455 - - None original Pseudolachnea_fraxini_CBS_113701 -
Tl805434 Brunneodinemasporium brasiliense 209299 1196445 - - None original Brunneodinemasporium_brasiliense_CBS_112007 -
Tl805461 Thozetella nivea 169746 557972 - - None original Thozetella_nivea_EU825200 -
Tl805472 Saccharomyces cerevisiae 908549 0 - - None original Saccharomyces_cerevisiae_Z73326 -

0 comments on commit 0aa5245

Please sign in to comment.