Skip to content

Commit

Permalink
Fixes #10. Also adds verbosity feature with k-mer count summing in mu…
Browse files Browse the repository at this point in the history
…ltiprofile, and uses fileutil.open instead of calling the class directly. Makes major changes to the distance method: it now prints an improved TSV with full symmetry emulation (the matrix is filled in symmetrically) and a header row (filename => os.path.basename(f).split('.')[0]).
  • Loading branch information
MatthewRalston committed Jan 3, 2021
1 parent 030dc50 commit b4388ce
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 17 deletions.
32 changes: 25 additions & 7 deletions bin/kdb
Original file line number Diff line number Diff line change
Expand Up @@ -25,29 +25,46 @@ def index_file(arguments):
index._write_line_index(arguments.kdbi, line_index)

def distances(arguments):
import pandas as pd
import numpy as np
if len(arguments.kdb) < 2:
logger.error("kdb distance requires at least two .kdb files.")
sys.exit(1)

from kdb import distance
n = len(arguments.kdb)
filenames = list(map(lambda f: os.path.basename(f).split(".")[0], arguments.kdb))
data = [['' for x in range(n)] for y in range(n)]

for i in range(n):
for j in range(n):
if i == j:
data[i][j] = distance.identity[arguments.metric]
elif i > j:
data[i][j] = ''
else:
data[i][j] = None
elif i < j and arguments.metric == "correlation":
logger.info("Calculating correlation distance between '{0}' and '{1}'...".format(arguments.kdb[i], arguments.kdb[j]))
data[i][j] = distance.correlation(arguments.kdb[i], arguments.kdb[j])
else:
logger.error("Other distances are not implemented yet")
sys.exit(1)

# This double loop quickly identifies empty cells and sets the data correctly from the permutation above
for i in range(n):
for j in range(n):
if data[i][j] is None:
data[i][j] = data[j][i]
#print(data)
logger.info("Printing distance matrix...")

if len(arguments.kdb) == 2:
logger.info("Printing single distance")
print(data[0][1])
else:
for x in range(len(arguments.kdb)):
print("\t".join(map(str, data[x])))
logger.info("Printing distance matrix...")
df = pd.DataFrame(np.array(data), columns=filenames)
df.to_csv(sys.stdout, sep=arguments.delimiter, index=False)
# for x in range(n):
# print("\t".join(map(str, data[x])))

def get_matrix(arguments):
logging.getLogger('matplotlib.font_manager').disabled = True
Expand Down Expand Up @@ -369,7 +386,7 @@ def profile(arguments):
#header["metadata_blocks"] = 2
logger.info("Collapsing the k-mer counts across the various input files into the final kdb file '{0}'".format(arguments.kdb))
try:
kdb_out = fileutil.KDBWriter(header, fileobj=open(arguments.kdb, 'wb'))
kdb_out = fileutil.open(arguments.kdb, 'wb', header)
iterating = True
while iterating:
# The 0th element is the count
Expand All @@ -380,7 +397,8 @@ def profile(arguments):
i = kmer_counts_per_file[0][0] - 1 # Remove 1 for the Sqlite zero-based indexing
count = sum([x[1] for x in kmer_counts_per_file]) # The 1th element is the k-mer count
#sys.stderr.write("\r")
sys.stderr.write("K-mer counts: {0} = {1}\n".format(list(map(lambda x: x[1], kmer_counts_per_file)), count))
if arguments.verbose == 2:
sys.stderr.write("K-mer counts: {0} = {1}\n".format(list(map(lambda x: x[1], kmer_counts_per_file)), count))
kdb_out.write("{0}\t{1}\n".format(i, count))
else:
iterating = False
Expand Down
26 changes: 16 additions & 10 deletions kdb/fileutil.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
sys.path.append('..')

import config
from kdb import kmer, database
from kdb import kmer, database, util

# Logging configuration
import logging
Expand Down Expand Up @@ -65,15 +65,19 @@ def _s3_file_download(self, seqpath, temporary=True):
return filepath.name


def open(filepath, mode="r"):
def open(filepath, mode="r", *args):
if type(filepath) is not str:
raise TypeError("kdb.fileutil.open expects a str as its first positional argument")
elif type(mode) is not str:
raise TypeError("kdb.fileutil.open expects the keyword argument 'mode' to be a str")
elif "w" in mode and (len(args) != 1 or not isinstance(args[0], OrderedDict)):
raise TypeError("kdb.fileutil.open expects an additional header dictionary")
modes = set(mode)
if modes - set("xrwbt") or len(mode) > len(modes):
raise ValueError("invalid mode: {}".format(mode))



creating = "x" in modes
reading = "r" in modes
writing = "w" in modes
Expand All @@ -86,9 +90,9 @@ def open(filepath, mode="r"):
raise ValueError("must have exactly one or read/write")

if "r" in mode.lower():
return KDBReader(filepath, mode=mode)
return KDBReader(filename=filepath, mode=mode)
elif "w" in mode.lower() or "a" in mode.lower():
return KDBWriter(filepath, mode=mode)
return KDBWriter(args[0], filename=filepath, mode=mode)
else:
raise ValueError("Bad mode %r" % mode)

Expand Down Expand Up @@ -127,11 +131,11 @@ def __init__(self, filename:str=None, fileobj:io.IOBase=None, mode:str="r", max_
# 0th block
logger.info("Loading the 0th block from '{0}'...".format(self._filepath))
self._load_block(handle.tell())
header_data = yaml.safe_load(self._buffer)
header_data = OrderedDict(yaml.safe_load(self._buffer))
num_header_blocks = None
if type(header_data) is str:
raise TypeError("kdb.fileutil.KDBReader could not parse the YAML formatted metadata in the first blocks of the file")
elif type(header_data) is dict:
elif type(header_data) is OrderedDict:
logger.info("Successfully parsed the 0th block of the file, which is expected to be the first block of YAML formatted metadata")
if "version" not in header_data.keys():
raise TypeError("kdb.fileutil.KDBReader couldn't validate the header YAML")
Expand Down Expand Up @@ -277,7 +281,7 @@ def setup_yaml():


class KDBWriter(bgzf.BgzfWriter):
def __init__(self, header:dict, filename=None, mode="w", fileobj=None, compresslevel=6):
def __init__(self, header:OrderedDict, filename=None, mode="w", fileobj=None, compresslevel=6):
"""Initilize the class."""
if not isinstance(header, OrderedDict):
raise TypeError("kdb.fileutil.KDBWriter expects a valid header object as its first positional argument")
Expand Down Expand Up @@ -311,9 +315,11 @@ def __init__(self, header:dict, filename=None, mode="w", fileobj=None, compressl
Write the header to the file
"""
logger.info("Constructing a new kdb file '{0}'...".format(self._handle.name))
logger.debug("Writing the {0} header blocks to the new file".format(self.header["metadata_blocks"]))
setup_yaml()
header_bytes = bgzf._as_bytes(yaml.dump(self.header))
logger.info("Writing the {0} header blocks to the new file".format(self.header["metadata_blocks"]))
logger.debug(self.header)
logger.debug("Header is being written as follows:\n{0}".format(yaml.dump(self.header, sort_keys=False)))
yaml.add_representer(OrderedDict, util.represent_ordereddict)
header_bytes = bgzf._as_bytes(yaml.dump(self.header, sort_keys=False))
for i in range(self.header["metadata_blocks"]):
header_slice = header_bytes[:65536]
header_bytes = header_bytes[65536:]
Expand Down

0 comments on commit b4388ce

Please sign in to comment.