added alternative ftp url for silva reference data, in case https url…

… does not work (fixes #33). Fixed error messages for makedb. Bumped version. Simplified overview-report a bit (fixes #34).
KIT-IBG-5 · Aug 12, 2022 · ad9ae06 · ad9ae06
1 parent 592b08e
commit ad9ae06
Show file tree

Hide file tree

Showing 5 changed files with 25 additions and 38 deletions.
diff --git a/mdmcleaner.py b/mdmcleaner.py
@@ -225,7 +225,7 @@ def main():
 			if args.get_pub_data:
 				args.outdir = "./db"
 			else:
-				assert "db_basedir" in configs.settings, ("\n\nERROR: either 'outdir' must be specified as argument or 'db_basedir' needs to be specified in config file!\n\n")
+				assert "db_basedir" in configs.settings and len(configs.settings["db_basedir"]) != 0, ("\n\nERROR: either 'outdir' must be specified as argument or 'db_basedir' needs to be specified in config file!\n\n")
 				args.outdir = os.path.join(configs.settings["db_basedir"][0], configs.settings["db_type"][0]) # todo: read_gtdb_taxonomy should only get basedir as target-dir and simply assume the gtdb part!
 		elif not args.get_pub_data:
 			args.outdir = os.path.join(args.outdir, configs.settings["db_type"][0]) # todo: read_gtdb_taxonomy should only get basedir as target-dir and simply assume the gtdb part!

diff --git a/mdmcleaner/_version.py b/mdmcleaner/_version.py
@@ -1 +1 @@
-__version__ = "0.8.3"
+__version__ = "0.8.4"
diff --git a/mdmcleaner/mdmcleaner.py b/mdmcleaner/mdmcleaner.py
@@ -225,7 +225,7 @@ def main():
 			if args.get_pub_data:
 				args.outdir = "./db"
 			else:
-				assert "db_basedir" in configs.settings, ("\n\nERROR: either 'outdir' must be specified as argument or 'db_basedir' needs to be specified in config file!\n\n")
+				assert "db_basedir" in configs.settings and len(configs.settings["db_basedir"]) != 0, ("\n\nERROR: either 'outdir' must be specified as argument or 'db_basedir' needs to be specified in config file!\n\n")
 				args.outdir = os.path.join(configs.settings["db_basedir"][0], configs.settings["db_type"][0]) # todo: read_gtdb_taxonomy should only get basedir as target-dir and simply assume the gtdb part!
 		elif not args.get_pub_data:
 			args.outdir = os.path.join(args.outdir, configs.settings["db_type"][0]) # todo: read_gtdb_taxonomy should only get basedir as target-dir and simply assume the gtdb part!

diff --git a/mdmcleaner/read_gtdb_taxonomy.py b/mdmcleaner/read_gtdb_taxonomy.py
@@ -20,9 +20,13 @@
 					 "gtdb_vs_ncbi_lookup" : { "url" : "{}/auxillary_files".format(gtdb_server), "pattern" : "*_vs_*.xlsx" } } #todo: remove gtdb_vs_ncbi_lookuptables
 
 silva_server = "https://www.arb-silva.de/fileadmin/silva_databases/current"
-silva_source_dict = { "silva_version" : { "url" : "{}/".format(silva_server), "wishlist" : [ "VERSION.txt" ]}, \
-					  "silva_taxfiles" : { "url" : "{}/Exports/taxonomy/".format(silva_server), "wishlist" : ["taxmap_slv_lsu_ref_nr_{}.txt.gz", "taxmap_slv_lsu_ref_nr_{}.txt.gz.md5", "taxmap_slv_ssu_ref_nr_{}.txt.gz", "taxmap_slv_ssu_ref_nr_{}.txt.gz.md5"] }, \
-					  "silva_fastas" : { "url" : "{}/Exports/".format(silva_server), "wishlist" : ["SILVA_{}_LSURef_NR99_tax_silva.fasta.gz", "SILVA_{}_LSURef_NR99_tax_silva.fasta.gz.md5", "SILVA_{}_SSURef_NR99_tax_silva.fasta.gz", "SILVA_{}_SSURef_NR99_tax_silva.fasta.gz.md5"] } } #currently, silva does not seem to allow recursive downloads based on filename-patterns --> Using this workaround instead. format function will need to replace '{}' with the database version later
+ALT_silva_server = "ftp://arb-silva.de/current" # apparently sometimes one or the other of the silva servers is not reachable. therefore always trying both alternately
+
+silva_source_dict = { "silva_version" : { "url" : "{}/".format(silva_server), "alturl" : "{}/".format(ALT_silva_server), "wishlist" : [ "VERSION.txt" ]}, \
+					  "silva_taxfiles" : { "url" : "{}/Exports/taxonomy/".format(silva_server), "alturl" : "{}/Exports/taxonomy/".format(ALT_silva_server), "wishlist" : ["taxmap_slv_lsu_ref_nr_{}.txt.gz", "taxmap_slv_lsu_ref_nr_{}.txt.gz.md5", "taxmap_slv_ssu_ref_nr_{}.txt.gz", "taxmap_slv_ssu_ref_nr_{}.txt.gz.md5"] }, \
+					  "silva_fastas" : { "url" : "{}/Exports/".format(silva_server), "alturl" : "{}/Exports/".format(ALT_silva_server), "wishlist" : ["SILVA_{}_LSURef_NR99_tax_silva.fasta.gz", "SILVA_{}_LSURef_NR99_tax_silva.fasta.gz.md5", "SILVA_{}_SSURef_NR99_tax_silva.fasta.gz", "SILVA_{}_SSURef_NR99_tax_silva.fasta.gz.md5"] } } #currently, silva does not seem to allow recursive downloads based on filename-patterns --> Using this workaround instead. format function will need to replace '{}' with the database version later
+
+
 #    --> Consider grepping and filtering only eukaryote sequences from these --> merge with gtdb dataset OR merge them all (if not too large) and make sure taxonomy is updated accordingly!
 
 taxdb_outfilebasename = "gtdb_taxonomy_br.json.gz" #todo: "_br" still stands for "binrefiner". change that!
@@ -328,12 +332,17 @@ def getsilvaversion(urldict, targetfolder): #yes I know. This could go easier wi
 		#if os.path.exists(versionfilename):
 		#	sys.stderr.write("\nDeleting pre-existing {}\n".format(versionfilename))
 		#	os.remove(versionfilename)
-		_download_unixwget(urldict["url"] + urldict["wishlist"][0], targetdir=targetfolder, verbose=verbose)
-
+		url = "url"
+		returncode = _download_unixwget(urldict[url] + urldict["wishlist"][0], targetdir=targetfolder, verbose=verbose)
+		if returncode != 0:
+			sys.stderr.write("\thttps url not reachable. trying ftp-url\n")
+			url = "alturl"
+			returncode = _download_unixwget(urldict[url] + urldict["wishlist"][0], targetdir=targetfolder, verbose=verbose)
+			if returncode != 0:
+				sys.exit("\n\tERROR: can't reach silva database. please try again later\n")	
 		with open(versionfilename) as versionfile:
 			version = versionfile.read().strip()
-			#print("wtf")
-		return version
+		return version, url
 	# end of nested subfunctions
 
 	def get_download_dict(prelim_downloadlist, wishdict):
@@ -345,7 +354,7 @@ def get_download_dict(prelim_downloadlist, wishdict):
 			download_dict[x] = okdownloadlist
 		return download_dict
 
-	version = getsilvaversion(sourcedict["silva_version"], targetfolder)
+	version , url = getsilvaversion(sourcedict["silva_version"], targetfolder)
 	prelim_downloadlist = [os.path.join(targetfolder, sourcedict["silva_version"]["wishlist"][0])]
 	wishdict = {silvacat : [ w.format(version) for w in sourcedict[silvacat]["wishlist"] ] for silvacat in sourcedict }
 	download_dict = None
@@ -356,7 +365,8 @@ def get_download_dict(prelim_downloadlist, wishdict):
 		for silvacat in ["silva_taxfiles", "silva_fastas"]:
 			for wish in wishdict[silvacat]:
 				sys.stderr.write("\n\tNow downloading from silva: \"{}\" (attempt {})...\n".format(wish, trycounter +1))
-				returncode = _download_unixwget(sourcedict[silvacat]["url"] + wish, pattern = None, targetdir=targetfolder, verbose=verbose)
+				# ~ sys.stderr.write(sourcedict[silvacat][url] + wish + "\n")
+				returncode = _download_unixwget(sourcedict[silvacat][url] + wish, pattern = None, targetdir=targetfolder, verbose=verbose)
 				if returncode != 0:
 					sys.stderr.write("\nWARNING: wget returned non-zero returncode '{}' after downloading {} \n".format(returncode, wish))
 				prelim_downloadlist.append(os.path.join(targetfolder, wish))

diff --git a/mdmcleaner/reporting.py b/mdmcleaner/reporting.py
@@ -174,31 +174,20 @@ def write_dictlines(indict, outfile):
 		total_proteins = 0
 
 	print_dict = {  "binname" : binname, \
-							"completeness_before" : trna_completeness_before,\
-							"completeness_after" : trna_completeness_after,\
+							"completeness_before[%]" : trna_completeness_before,\
+							"contamination_before[%]" : round(fraction_delete * 100, 2),\
+							"completeness_after[%]" : trna_completeness_after,\
 							"totalbincontigs" : totalbincontigs,\
 							"totalbinbp" : totalbinbp,\
 							"majortaxpath" : majortaxpath,\
 
-							"fraction_trustedbp" : fraction_trustedbp, \
-							"fraction_unknownbp" : fraction_unknownbp, \
-							"fraction_untrustedbp" : fraction_untrustedbp, \
 							"fraction_keep" : fraction_keep, \
 							"fraction_evaluate_low" : fraction_evaluate_low	, \
 							"fraction_evaluate_high" : fraction_evaluate_high, \
 							"fraction_delete" : fraction_delete, \
 							"bin_trust" : bin_trust, \
 							"bin_trust_ignoring_viral":bin_trust_ignoring_viral, \
 
-							"fraction_different_species":fraction_different_species, \
-							"fraction_different_genus":fraction_different_genus, \
-							"fraction_different_family":fraction_different_family, \
-							"fraction_different_order":fraction_different_order, \
-							"fraction_different_class":fraction_different_class, \
-							"fraction_different_phylum":fraction_different_phylum, \
-							"fraction_different_domain":fraction_different_domain, \
-							"fraction_viral":fraction_viral, \
-
 							"fraction_refdb_ambiguity" : fraction_refdb_ambiguity, \
 							"fraction_ignored_refdb_ambiguity" : fraction_ignored_refdb_ambiguity, \
 							"fraction_nocoding" : fraction_nocoding, \
@@ -210,18 +199,6 @@ def write_dictlines(indict, outfile):
 							"sum_fraction_different_family" : sum_fraction_different_family, \
 							"sum_fraction_different_genus" : sum_fraction_different_genus, \
 							"sum_fraction_different_species" : sum_fraction_different_species, \
-
-							"fraction_trust0":fraction_trust0, \
-							"fraction_trust1":fraction_trust1, \
-							"fraction_trust2":fraction_trust2, \
-							"fraction_trust3":fraction_trust3, \
-							"fraction_trust4":fraction_trust4, \
-							"fraction_trust5":fraction_trust5, \
-							"fraction_trust6":fraction_trust6, \
-							"fraction_trust7":fraction_trust7, \
-							"fraction_trust8":fraction_trust8, \
-							"fraction_trust9":fraction_trust9, \
-							"fraction_trust10":fraction_trust10, \
 
 							"total_16SrRNA":total_16SrRNA, \
 							"total_23SrRNA":total_23SrRNA, \