Skip to content

Commit

Permalink
added alternative ftp url for silva reference data, in case https url…
Browse files Browse the repository at this point in the history
… does not work (fixes #33)

fixed error messages for makedb. bumped version

simplified overview-report a bit (fixes #34)
  • Loading branch information
jvollme committed Aug 12, 2022
1 parent 592b08e commit ad9ae06
Show file tree
Hide file tree
Showing 5 changed files with 25 additions and 38 deletions.
2 changes: 1 addition & 1 deletion mdmcleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ def main():
if args.get_pub_data:
args.outdir = "./db"
else:
assert "db_basedir" in configs.settings, ("\n\nERROR: either 'outdir' must be specified as argument or 'db_basedir' needs to be specified in config file!\n\n")
assert "db_basedir" in configs.settings and len(configs.settings["db_basedir"]) != 0, ("\n\nERROR: either 'outdir' must be specified as argument or 'db_basedir' needs to be specified in config file!\n\n")
args.outdir = os.path.join(configs.settings["db_basedir"][0], configs.settings["db_type"][0]) # todo: read_gtdb_taxonomy should only get basedir as target-dir and simply assume the gtdb part!
elif not args.get_pub_data:
args.outdir = os.path.join(args.outdir, configs.settings["db_type"][0]) # todo: read_gtdb_taxonomy should only get basedir as target-dir and simply assume the gtdb part!
Expand Down
2 changes: 1 addition & 1 deletion mdmcleaner/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.8.3"
__version__ = "0.8.4"
2 changes: 1 addition & 1 deletion mdmcleaner/mdmcleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ def main():
if args.get_pub_data:
args.outdir = "./db"
else:
assert "db_basedir" in configs.settings, ("\n\nERROR: either 'outdir' must be specified as argument or 'db_basedir' needs to be specified in config file!\n\n")
assert "db_basedir" in configs.settings and len(configs.settings["db_basedir"]) != 0, ("\n\nERROR: either 'outdir' must be specified as argument or 'db_basedir' needs to be specified in config file!\n\n")
args.outdir = os.path.join(configs.settings["db_basedir"][0], configs.settings["db_type"][0]) # todo: read_gtdb_taxonomy should only get basedir as target-dir and simply assume the gtdb part!
elif not args.get_pub_data:
args.outdir = os.path.join(args.outdir, configs.settings["db_type"][0]) # todo: read_gtdb_taxonomy should only get basedir as target-dir and simply assume the gtdb part!
Expand Down
28 changes: 19 additions & 9 deletions mdmcleaner/read_gtdb_taxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,13 @@
"gtdb_vs_ncbi_lookup" : { "url" : "{}/auxillary_files".format(gtdb_server), "pattern" : "*_vs_*.xlsx" } } #todo: remove gtdb_vs_ncbi_lookuptables

silva_server = "https://www.arb-silva.de/fileadmin/silva_databases/current"
silva_source_dict = { "silva_version" : { "url" : "{}/".format(silva_server), "wishlist" : [ "VERSION.txt" ]}, \
"silva_taxfiles" : { "url" : "{}/Exports/taxonomy/".format(silva_server), "wishlist" : ["taxmap_slv_lsu_ref_nr_{}.txt.gz", "taxmap_slv_lsu_ref_nr_{}.txt.gz.md5", "taxmap_slv_ssu_ref_nr_{}.txt.gz", "taxmap_slv_ssu_ref_nr_{}.txt.gz.md5"] }, \
"silva_fastas" : { "url" : "{}/Exports/".format(silva_server), "wishlist" : ["SILVA_{}_LSURef_NR99_tax_silva.fasta.gz", "SILVA_{}_LSURef_NR99_tax_silva.fasta.gz.md5", "SILVA_{}_SSURef_NR99_tax_silva.fasta.gz", "SILVA_{}_SSURef_NR99_tax_silva.fasta.gz.md5"] } } #currently, silva does not seem to allow recursive downloads based on filename-patterns --> Using this workaround instead. format function will need to replace '{}' with the database version later
ALT_silva_server = "ftp://arb-silva.de/current" # apparently one or the other of the silva servers is sometimes not reachable; this ftp url is the fallback that is tried when the https url fails

silva_source_dict = { "silva_version" : { "url" : "{}/".format(silva_server), "alturl" : "{}/".format(ALT_silva_server), "wishlist" : [ "VERSION.txt" ]}, \
"silva_taxfiles" : { "url" : "{}/Exports/taxonomy/".format(silva_server), "alturl" : "{}/Exports/taxonomy/".format(ALT_silva_server), "wishlist" : ["taxmap_slv_lsu_ref_nr_{}.txt.gz", "taxmap_slv_lsu_ref_nr_{}.txt.gz.md5", "taxmap_slv_ssu_ref_nr_{}.txt.gz", "taxmap_slv_ssu_ref_nr_{}.txt.gz.md5"] }, \
"silva_fastas" : { "url" : "{}/Exports/".format(silva_server), "alturl" : "{}/Exports/".format(ALT_silva_server), "wishlist" : ["SILVA_{}_LSURef_NR99_tax_silva.fasta.gz", "SILVA_{}_LSURef_NR99_tax_silva.fasta.gz.md5", "SILVA_{}_SSURef_NR99_tax_silva.fasta.gz", "SILVA_{}_SSURef_NR99_tax_silva.fasta.gz.md5"] } } #currently, silva does not seem to allow recursive downloads based on filename-patterns --> Using this workaround instead. format function will need to replace '{}' with the database version later


# --> Consider grepping and filtering only eukaryote sequences from these --> merge with gtdb dataset OR merge them all (if not too large) and make sure the taxonomy is updated accordingly!

taxdb_outfilebasename = "gtdb_taxonomy_br.json.gz" #todo: "_br" still stands for "binrefiner". change that!
Expand Down Expand Up @@ -328,12 +332,17 @@ def getsilvaversion(urldict, targetfolder): #yes I know. This could go easier wi
#if os.path.exists(versionfilename):
# sys.stderr.write("\nDeleting pre-existing {}\n".format(versionfilename))
# os.remove(versionfilename)
_download_unixwget(urldict["url"] + urldict["wishlist"][0], targetdir=targetfolder, verbose=verbose)

url = "url"
returncode = _download_unixwget(urldict[url] + urldict["wishlist"][0], targetdir=targetfolder, verbose=verbose)
if returncode != 0:
sys.stderr.write("\thttps url not reachable. trying ftp-url\n")
url = "alturl"
returncode = _download_unixwget(urldict[url] + urldict["wishlist"][0], targetdir=targetfolder, verbose=verbose)
if returncode != 0:
sys.exit("\n\tERROR: can't reach silva database. please try again later\n")
with open(versionfilename) as versionfile:
version = versionfile.read().strip()
#print("wtf")
return version
return version, url
# end of nested subfunctions

def get_download_dict(prelim_downloadlist, wishdict):
Expand All @@ -345,7 +354,7 @@ def get_download_dict(prelim_downloadlist, wishdict):
download_dict[x] = okdownloadlist
return download_dict

version = getsilvaversion(sourcedict["silva_version"], targetfolder)
version , url = getsilvaversion(sourcedict["silva_version"], targetfolder)
prelim_downloadlist = [os.path.join(targetfolder, sourcedict["silva_version"]["wishlist"][0])]
wishdict = {silvacat : [ w.format(version) for w in sourcedict[silvacat]["wishlist"] ] for silvacat in sourcedict }
download_dict = None
Expand All @@ -356,7 +365,8 @@ def get_download_dict(prelim_downloadlist, wishdict):
for silvacat in ["silva_taxfiles", "silva_fastas"]:
for wish in wishdict[silvacat]:
sys.stderr.write("\n\tNow downloading from silva: \"{}\" (attempt {})...\n".format(wish, trycounter +1))
returncode = _download_unixwget(sourcedict[silvacat]["url"] + wish, pattern = None, targetdir=targetfolder, verbose=verbose)
# ~ sys.stderr.write(sourcedict[silvacat][url] + wish + "\n")
returncode = _download_unixwget(sourcedict[silvacat][url] + wish, pattern = None, targetdir=targetfolder, verbose=verbose)
if returncode != 0:
sys.stderr.write("\nWARNING: wget returned non-zero returncode '{}' after downloading {} \n".format(returncode, wish))
prelim_downloadlist.append(os.path.join(targetfolder, wish))
Expand Down
29 changes: 3 additions & 26 deletions mdmcleaner/reporting.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,31 +174,20 @@ def write_dictlines(indict, outfile):
total_proteins = 0

print_dict = { "binname" : binname, \
"completeness_before" : trna_completeness_before,\
"completeness_after" : trna_completeness_after,\
"completeness_before[%]" : trna_completeness_before,\
"contamination_before[%]" : round(fraction_delete * 100, 2),\
"completeness_after[%]" : trna_completeness_after,\
"totalbincontigs" : totalbincontigs,\
"totalbinbp" : totalbinbp,\
"majortaxpath" : majortaxpath,\

"fraction_trustedbp" : fraction_trustedbp, \
"fraction_unknownbp" : fraction_unknownbp, \
"fraction_untrustedbp" : fraction_untrustedbp, \
"fraction_keep" : fraction_keep, \
"fraction_evaluate_low" : fraction_evaluate_low , \
"fraction_evaluate_high" : fraction_evaluate_high, \
"fraction_delete" : fraction_delete, \
"bin_trust" : bin_trust, \
"bin_trust_ignoring_viral":bin_trust_ignoring_viral, \

"fraction_different_species":fraction_different_species, \
"fraction_different_genus":fraction_different_genus, \
"fraction_different_family":fraction_different_family, \
"fraction_different_order":fraction_different_order, \
"fraction_different_class":fraction_different_class, \
"fraction_different_phylum":fraction_different_phylum, \
"fraction_different_domain":fraction_different_domain, \
"fraction_viral":fraction_viral, \

"fraction_refdb_ambiguity" : fraction_refdb_ambiguity, \
"fraction_ignored_refdb_ambiguity" : fraction_ignored_refdb_ambiguity, \
"fraction_nocoding" : fraction_nocoding, \
Expand All @@ -210,18 +199,6 @@ def write_dictlines(indict, outfile):
"sum_fraction_different_family" : sum_fraction_different_family, \
"sum_fraction_different_genus" : sum_fraction_different_genus, \
"sum_fraction_different_species" : sum_fraction_different_species, \

"fraction_trust0":fraction_trust0, \
"fraction_trust1":fraction_trust1, \
"fraction_trust2":fraction_trust2, \
"fraction_trust3":fraction_trust3, \
"fraction_trust4":fraction_trust4, \
"fraction_trust5":fraction_trust5, \
"fraction_trust6":fraction_trust6, \
"fraction_trust7":fraction_trust7, \
"fraction_trust8":fraction_trust8, \
"fraction_trust9":fraction_trust9, \
"fraction_trust10":fraction_trust10, \

"total_16SrRNA":total_16SrRNA, \
"total_23SrRNA":total_23SrRNA, \
Expand Down

0 comments on commit ad9ae06

Please sign in to comment.