Skip to content

Commit

Permalink
Merge pull request #27 from MatthewRalston/metadata_block_end_symbols
Browse files Browse the repository at this point in the history
Update to the view method, and improvements to the header parsing and writing bits, officially there is a metadata block-end delimiter.
  • Loading branch information
MatthewRalston committed Jan 18, 2021
2 parents 3bcf4d8 + 979e839 commit 0700153
Show file tree
Hide file tree
Showing 3 changed files with 94 additions and 24 deletions.
87 changes: 70 additions & 17 deletions kmerdb/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -405,8 +405,8 @@ def get_matrix(arguments):
final_df.to_csv(sys.stdout, sep=arguments.output_delimiter, index=arguments.with_index)
logger.info("Done printing {0} matrix to STDOUT".format(arguments.method))

logger.info("Beginning distribution analysis in R...")
logger.warn("Not implemented in Python...")
#logger.info("Beginning distribution analysis in R...")
#logger.warn("Not implemented in Python...")

sys.stderr.write(config.DONE)

Expand Down Expand Up @@ -703,23 +703,74 @@ def header(arguments):
else:
yaml.add_representer(OrderedDict, util.represent_ordereddict)
print(yaml.dump(kdb.header))

print(config.header_delimiter)

def view(arguments):
from kmerdb import fileutil

from kmerdb.config import VERSION
if os.path.splitext(arguments.kdb)[-1] != ".kdb":
raise IOError("Viewable .kdb filepath does not end in '.kdb'")

with fileutil.open(arguments.kdb, mode='r') as kdb:
if kdb.header["version"] != VERSION:
logger.warning("KDB version is out of date, may be incompatible with current KDBReader class")
from kmerdb import fileutil, config
import json
header = None
if type(arguments.kdb_in) is not str: # Read from STDIN
logger.warning("Interpreting data from STDIN as uncompressed .kdb input")
header = ''
kdb_in = None
while type(header) is str:
line = sys.stdin.readline().rstrip()
if line != config.end_header_line:
header += line
continue
else:
header_dict = yaml.safe_parse(header)
if type(header_dict) is not dict:
logger.debug("Tried to parse:\n{0}\n".format(header))
raise ValueError("Could not parse YAML formatted header")
else:
if header["version"] != config.VERSION:
logger.warning("KDB version is out of date, may be incompatible with current KDBReader class")
kdb_in = sys.stdin
header=header_dict
else:
assert type(arguments.kdb_in) is str, "kdb_in must be a str"
if os.path.splitext(arguments.kdb_in)[-1] != ".kdb": # A filepath with invalid suffix
raise IOError("Viewable .kdb filepath does not end in '.kdb'")
elif not os.path.exists(arguments.kdb_in):
raise IOError("Viewable .kdb filepath '{0}' does not exist on the filesystem".format(arguments.kdb_in))
kdb_in = fileutil.open(arguments.kdb_in, mode='r')
header = kdb_in.header
if header["version"] != config.VERSION:
logger.warning("KDB version is out of date, may be incompatible with current KDBReader class")
if arguments.kdb_out is None or arguments.kdb_out == "/dev/stdout" or arguments.kdb_out == "STDOUT": # Write to stdout, uncompressed
if arguments.header:
print(yaml.dump(kdb.header))
for line in kdb:
for line in kdb_in:
print(line.rstrip())
elif arguments.kdb_out is not None and arguments.compress: # Can't yet write compressed to stdout
logger.error("Can't write kdb to stdout! We need to use a Bio.bgzf filehandle.")
sys.exit(1)
elif type(arguments.kdb_out) is not str:
raise ValueError("Cannot write a file to an argument that isn't a string")
elif os.path.exists(arguments.kdb_out):
logger.warning("Overwriting '{0}'...".format(arguments.kdb_out))
elif not os.path.exists(arguments.kdb_out):
logger.debug("Creating '{0}'...".format(arguments.kdb_out))
with fileutil.open(arguments.kdb_out, header=header, mode='wb') as kdb_out:
try:
line = None
for line in kdb_in:
line = line.rstrip()
kmer_id, count, metadata = line.split("\t")
kmer_id, count = int(kmer_id), int(count)
kdb_out.write("{0}\t{1}\t{2}\n".format(kmer_id, count, json.dumps(metadata)))
except StopIteration as e:
logger.error(e)
raise e
finally:
#kdb_out._write_block(kdb_out._buffer)
#kdb_out._handle.flush()
#kdb_out._handle.close()
sys.stderr.write(config.DONE)



def profile(arguments):
import math
from kmerdb import parse, fileutil, kmer
Expand Down Expand Up @@ -751,8 +802,8 @@ def profile(arguments):
header["metadata_blocks"] = math.ceil( sys.getsizeof(header_bytes) / ( 2**16 ) ) # Second estimate
#header["metadata_blocks"] = 2
logger.info("Collapsing the k-mer counts across the various input files into the final kdb file '{0}'".format(arguments.kdb))
kdb_out = fileutil.open(arguments.kdb, 'wb', header)
try:
kdb_out = fileutil.open(arguments.kdb, 'wb', header)
iterating = True
while iterating:
try:
Expand All @@ -768,8 +819,8 @@ def profile(arguments):
pass
else:
seq = kmer.id_to_kmer(i, arguments.k)
neighbors = kmer.neighbors(seq, arguments.k)
kdb_out.write("{0}\t{1}\t{2}\n".format(i, count, json.dumps(neighbors)))
metadata = kmer.neighbors(seq, arguments.k) # metadata is initialized by the neighbors
kdb_out.write("{0}\t{1}\t{2}\n".format(i, count, json.dumps(metadata)))
else:
iterating = False
except StopIteration as e:
Expand Down Expand Up @@ -890,7 +941,9 @@ def cli():
view_parser = subparsers.add_parser("view", help="View the contents of the .kdb file")
view_parser.add_argument("-v", "--verbose", help="Prints warnings to the console by default", default=0, action="count")
view_parser.add_argument("-H", "--header", action="store_true", help="Include header in the output")
view_parser.add_argument("kdb", type=str, help="A k-mer database file (.kdb)")
view_parser.add_argument("-c", "--compress", action="store_true", help="Print compressed output")
view_parser.add_argument("kdb_in", type=str, nargs="?", default=None, help="A k-mer database file (.kdb) to be read (Optional)")
view_parser.add_argument("kdb_out", type=str, nargs="?", default=None, help="A k-mer database file (.kdb) to be written to (Optional)")
view_parser.set_defaults(func=view)


Expand Down
2 changes: 2 additions & 0 deletions kmerdb/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@


VERSION="0.0.7"
end_header_line = " kdb: can someone get me some sentries"
header_delimiter = end_header_line + "\n" + ("="*24) + "\n"

header_schema = {
"type": "object",
Expand Down
29 changes: 22 additions & 7 deletions kmerdb/fileutil.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from collections import deque, OrderedDict
import psutil
import numpy as np
import math

#import pdb

Expand Down Expand Up @@ -83,12 +84,13 @@ def _s3_file_download(self, seqpath, temporary=True):
return filepath.name


def open(filepath, mode="r", *args):
def open(filepath, mode="r", header=None):

if type(filepath) is not str:
raise TypeError("kmerdb.fileutil.open expects a str as its first positional argument")
elif type(mode) is not str:
raise TypeError("kmerdb.fileutil.open expects the keyword argument 'mode' to be a str")
elif "w" in mode and (len(args) != 1 or not isinstance(args[0], OrderedDict)):
elif ("w" in mode or "x" in mode) and (header is None or not isinstance(header, OrderedDict)):
raise TypeError("kmerdb.fileutil.open expects an additional header dictionary")
modes = set(mode)
if modes - set("xrwbt") or len(mode) > len(modes):
Expand All @@ -109,8 +111,8 @@ def open(filepath, mode="r", *args):

if "r" in mode.lower():
return KDBReader(filename=filepath, mode=mode)
elif "w" in mode.lower() or "a" in mode.lower():
return KDBWriter(args[0], filename=filepath, mode=mode)
elif "w" in mode.lower() or "x" in mode.lower():
return KDBWriter(header, filename=filepath, mode=mode)
else:
raise ValueError("Bad mode %r" % mode)

Expand Down Expand Up @@ -150,6 +152,8 @@ def __init__(self, filename:str=None, fileobj:io.IOBase=None, mode:str="r", max_
# 0th block
logger.info("Loading the 0th block from '{0}'...".format(self._filepath))
self._load_block(self._handle.tell())

self._buffer = self._buffer.rstrip(config.header_delimiter)
header_data = OrderedDict(yaml.safe_load(self._buffer))
num_header_blocks = None
if type(header_data) is str:
Expand All @@ -167,7 +171,7 @@ def __init__(self, filename:str=None, fileobj:io.IOBase=None, mode:str="r", max_
else:
for i in range(header_data["metadata_blocks"] - 1):
self._load_block(self._handle.tell())
addtl_header_data = yaml.safe_load(self._buffer)
addtl_header_data = yaml.safe_load(self._buffer.rstrip(config.header_delimiter))
if type(addtl_header_data) is str:
logger.error(addtl_header_data)
raise TypeError("kmerdb.fileutil.KDBReader determined the data in the {0} block of the header data from '{1}' was not YAML formatted".format(i, self._filepath))
Expand Down Expand Up @@ -357,12 +361,23 @@ def __init__(self, header:OrderedDict, filename=None, mode="w", fileobj=None, co
"""
Write the header to the file
"""




logger.info("Constructing a new .kdb file '{0}'...".format(self._handle.name))
yaml.add_representer(OrderedDict, util.represent_ordereddict)

header_bytes = bgzf._as_bytes(yaml.dump(self.header, sort_keys=False))
header_plus_delimiter_in_bytes = header_bytes + bgzf._as_bytes(config.header_delimiter)
self.header["metadata_blocks"] = math.ceil( sys.getsizeof(header_plus_delimiter_in_bytes) / ( 2**16 ) ) # First estimate
header_bytes = bgzf._as_bytes(yaml.dump(self.header, sort_keys=False))
header_bytes = header_bytes + bgzf._as_bytes(config.header_delimiter)
self.header["metadata_blocks"] = math.ceil( sys.getsizeof(header_bytes) / ( 2**16 ) ) # Second estimate
logger.info("Writing the {0} header blocks to the new file".format(self.header["metadata_blocks"]))
logger.debug(self.header)
logger.debug("Header is being written as follows:\n{0}".format(yaml.dump(self.header, sort_keys=False)))
yaml.add_representer(OrderedDict, util.represent_ordereddict)
header_bytes = bgzf._as_bytes(yaml.dump(self.header, sort_keys=False))

for i in range(self.header["metadata_blocks"]):
header_slice = header_bytes[:65536]
header_bytes = header_bytes[65536:]
Expand Down

0 comments on commit 0700153

Please sign in to comment.