Merge 1e5f60f into e61106a

MatthewRalston · Jan 18, 2021 · 392c603 · 392c603
2 parents e61106a + 1e5f60f
commit 392c603
Show file tree

Hide file tree

Showing 9 changed files with 184 additions and 115 deletions.
diff --git a/TODO.org b/TODO.org
@@ -91,8 +91,8 @@ The final prototype would be .bgzf format from biopython
 ** Results
 *** Distribution fitting / model selection
 *** PCA
-*** 
-
+*** kmerdb shuf on 3 of 30 metagenomes for k=1:12 + kPAL figure 
+*** Median "distance" between profiles of pairwise comparison
 ** Distribution analysis
 ** Accurately describe kdb counting algorithm
 *** althought the algorithm differs in its approach to fastq k-mer counting from fasta k-mer counting,

diff --git a/examples/example_report/bibliography.bib b/examples/example_report/bibliography.bib
@@ -331,6 +331,17 @@ @article{simpson2014exploring
   year={2014},
   publisher={Oxford University Press}
 }
+@article{caporaso2011moving,
+  title={Moving pictures of the human microbiome},
+  author={Caporaso, J Gregory and Lauber, Christian L and Costello, Elizabeth K and Berg-Lyons, Donna and Gonzalez, Antonio and Stombaugh, Jesse and Knights, Dan and Gajer, Pawel and Ravel, Jacques and Fierer, Noah and others},
+  journal={Genome biology},
+  volume={12},
+  number={5},
+  pages={1--8},
+  year={2011},
+  publisher={BioMed Central}
+}
+
 
 
 
diff --git a/examples/example_report/index.Rmd b/examples/example_report/index.Rmd
@@ -100,7 +100,9 @@ Researchers are interested in k-mers and alignment-free methods because sequence
 
 First, there was Jellyfish (\parencite{marccais2011fast}), designed to keep k-mer count histograms in a specialized file with remarkable language bindings and profiling of the algorithm. At the center of its innovation to the number of data structures used to store k-mer counts was a lock-free hash table used for concurrent access to the k-mer profile. To my knowledge, this is the origin of the convention of encoding each k-mer as an integer, which saves on memory and file size. The only downside to the paper was the extreme focus on performance, for the rather narrow group of bioinformaticians who want to explore the subject through the lens of real-world datasets and biological insights. That said, I enjoyed the implementation, it's attention to language bindings, and the excellent focus.
 
-Next we have the k-mer Profile Analysis Library (\{anvar2014determining}), where Anvar points out that software is an analytical tool that we use can use on real world and simulated data. The authors note the primary questions involved in exploration of these subsequence spaces are the modalities of certain spectra, and the characteristic curve i.e. the over-representation and underrepresentation along the spectrum for any particular input. But comparison of multiple spectra also plays a role and the authors chose a single-distance interface using their multi-set and Euclidean distances.
+Next we have the k-mer Profile Analysis Library (\{anvar2014determining}), where Anvar points out that software is an analytical tool that we use can use on real world and simulated data. The authors note the primary questions involved in exploration of these subsequence spaces are the modalities of certain spectra, and the characteristic curve i.e. the over-representation and underrepresentation along the spectrum for any particular input. The authors note that a choice of k of 10 seemed to be optimal for them to establish maximal separation between simulated metagenomes and their randomly permuted counterparts. I look to confirm these findings, as perhaps there are general patterns for microbial genome complexity and they could be specified better with model criterion in k-mer space.
+
+
 
 ## What are "alignment-free" methods?
 

diff --git a/kmerdb/__init__.py b/kmerdb/__init__.py
@@ -51,11 +51,37 @@ def citation(arguments):
 def index_file(arguments):
     from kmerdb import fileutil, index
     from kmerdb.config import DONE
+
     with fileutil.open(arguments.kdb, mode='r') as kdb:
-        k = kdb.header['k']
+        k = kdb.metadata['k']
     line_index = index.open(arguments.kdb, mode="xt", k=k)
     sys.stderr.write(DONE)
 
+
+def shuf(arguments):
+    import random
+
+    from kmerdb import fileutil, index
+    from kmerdb.config import DONE
+
+    if os.path.splitext(arguments.kdb_in)[-1] != ".kdb":
+        raise ValueError("kdb shuf requires an indexed .kdb file. Received '{0}'".format(arguments.kdb))
+    else:
+        arguments.kdb_index = arguments.kdb_in + "i"
+
+        with fileutil.open(arguments.kdb_in, mode='r') as kdb:
+            with index.open(arguments.kdb_index, mode='r') as kdbi:
+                shuffled = list(range(len(kdbi.index)))
+                random.shuffle(shuffled)
+                random.shuffle(shuffled)
+                with fileutil.open(arguments.kdb_out, mode='w', metadata=kdb.metadata) as kdb_out:
+                    x = 1
+                    for i in shuffled:
+                        kmer_id, count, kmer_metadata = index.read_line(kdb, kdbi, i)
+                        kdb_out.write("{0}\t{1}\t{2}".format(x, count, kmer_metadata))
+                        x += 1
+    sys.stderr.write(DONE)
+
 def markov_probability(arguments):
     import pandas as pd
     import numpy as np
@@ -71,7 +97,7 @@ def markov_probability(arguments):
         #df = pd.DataFrame([], columns=["SequenceID", "Log_Odds_ratio", "p_of_seq"])
         profiles = np.array([], dtype="int64")
         with fileutil.open(arguments.kdb, 'r') as kdb:
-            k = kdb.header['k']
+            k = kdb.metadata['k']
             with index.open(arguments.kdbi, 'r') as kdbi:
                 with seqparser.SeqParser(arguments.seqfile, arguments.fastq_block_size, k) as seqprsr:
                     recs = [r for r in seqprsr]
@@ -696,19 +722,19 @@ def header(arguments):
         raise IOError("Viewable .kdb filepath does not end in '.kdb'")
 
     with fileutil.open(arguments.kdb, mode='r') as kdb:
-        if kdb.header["version"] != config.VERSION:
+        if kdb.metadata["version"] != config.VERSION:
             logger.warning("KDB version is out of date, may be incompatible with current KDBReader class")
         if arguments.json:
-            print(dict(kdb.header))
+            print(dict(kdb.metadata))
         else:
             yaml.add_representer(OrderedDict, util.represent_ordereddict)
-            print(yaml.dump(kdb.header))
+            print(yaml.dump(kdb.metadata))
             print(config.header_delimiter)
 
 def view(arguments):
     from kmerdb import fileutil, config, util
     import json
-    header = None
+    metadata = None
 
     def get_header(line, header):
         """
@@ -730,23 +756,23 @@ def get_header(line, header):
 
     if type(arguments.kdb_in) is None or arguments.kdb_in == "STDIN" or arguments.kdb_in == "/dev/stdin": # Read from STDIN
         logger.warning("Interpreting data from STDIN as uncompressed .kdb input")
-        header = ''
+        metadata = ''
         kdb_in = None
         if arguments.decompress is True:
             ifile = gzip.open(sys.stdin.buffer, 'rt')
-            while type(header) is str:
+            while type(metadata) is str:
                 #logger.debug("looping in gzip for header")
                 line = ifile.readline()
-                header = get_header(line, header)
+                metadata = get_header(line, metadata)
             kdb_in = ifile
         else:
             i = 0
-            while type(header) is str:
+            while type(metadata) is str:
                 line = sys.stdin.readline()
                 #logger.debug("looping in stdin for header")
                 #print(line)
 
-                header = get_header(line, header)
+                metadata = get_header(line, metadata)
                 i += 1
 
                 if i == 50:
@@ -759,13 +785,13 @@ def get_header(line, header):
         elif not os.path.exists(arguments.kdb_in):
             raise IOError("Viewable .kdb filepath '{0}' does not exist on the filesystem".format(arguments.kdb_in))
         kdb_in = fileutil.open(arguments.kdb_in, mode='r')
-        header = kdb_in.header
-    if header["version"] != config.VERSION:
+        metadata = kdb_in.metadata
+    if metadata["version"] != config.VERSION:
         logger.warning("KDB version is out of date, may be incompatible with current KDBReader class")
     if arguments.kdb_out is None or arguments.kdb_out == "/dev/stdout" or arguments.kdb_out == "STDOUT": # Write to stdout, uncompressed
         if arguments.header:
             yaml.add_representer(OrderedDict, util.represent_ordereddict)
-            print(yaml.dump(header, sort_keys=False))
+            print(yaml.dump(metadata, sort_keys=False))
             print(config.header_delimiter)
         if kdb_in is None: # Read from STDIN, since there was no kdb_in file.
             logger.info("Reading from STDIN...")
@@ -794,20 +820,20 @@ def get_header(line, header):
     elif not os.path.exists(arguments.kdb_out):
         logger.debug("Creating '{0}'...".format(arguments.kdb_out))
     else:
-        with fileutil.open(arguments.kdb_out, header=header, mode='wb') as kdb_out:
+        with fileutil.open(arguments.kdb_out, metadata=metadata, mode='wb') as kdb_out:
             try:
                 line = None
                 if kdb_in is None:
                     while line is None:
                         line = sys.stdin.readline().rstrip()
-                        kmer_id, count, metadata = fileutil.parse_line(line)
-                        kdb_out.write("{0}\t{1}\t{2}\n".format(kmer_id, count, metadata))
+                        kmer_id, count, kmer_metadata = fileutil.parse_line(line)
+                        kdb_out.write("{0}\t{1}\t{2}\n".format(kmer_id, count, kmer_metadata))
                         line = None
                 else:
                     for line in kdb_in:
                         line = line.rstrip()
-                        kmer_id, count, metadata = fileutil.parse_line(line)
-                        kdb_out.write("{0}\t{1}\t{2}\n".format(kmer_id, count, metadata))
+                        kmer_id, count, kmer_metadata = fileutil.parse_line(line)
+                        kdb_out.write("{0}\t{1}\t{2}\n".format(kmer_id, count, kmer_metadata))
 
             except StopIteration as e:
                 logger.error(e)
@@ -829,29 +855,29 @@ def profile(arguments):
     if os.path.splitext(arguments.kdb)[-1] != ".kdb":
         raise IOError("Destination .kdb filepath does not end in '.kdb'")
 
-    metadata = []
+    file_metadata = []
     tempdbs = []
     for f in arguments.seqfile:
         db, m = parse.parsefile(f, arguments.k, p=arguments.parallel, b=arguments.fastq_block_size, stranded=arguments.not_strand_specific)
-        metadata.append(m)
+        file_metadata.append(m)
         tempdbs.append(db)
-    header=OrderedDict({
+    metadata=OrderedDict({
         "version": VERSION,
         "metadata_blocks": 1,
         "k": arguments.k,
         "metadata": False,
         "tags": [],
-        "files": metadata
+        "files": file_metadata
     })
 
-    header_bytes = bgzf._as_bytes(yaml.dump(header))
+    metadata_bytes = bgzf._as_bytes(yaml.dump(metadata))
     # The number of metadata blocks is the number of bytes of the header block(s) / the number of bytes per block in the BGZF specification
-    header["metadata_blocks"] = math.ceil( sys.getsizeof(header_bytes) / ( 2**16 ) ) # First estimate
-    header_bytes = bgzf._as_bytes(yaml.dump(header))
-    header["metadata_blocks"] = math.ceil( sys.getsizeof(header_bytes) / ( 2**16 ) ) # Second estimate
-    #header["metadata_blocks"] = 2
+    metadata["metadata_blocks"] = math.ceil( sys.getsizeof(metadata_bytes) / ( 2**16 ) ) # First estimate
+    metadata_bytes = bgzf._as_bytes(yaml.dump(metadata))
+    metadata["metadata_blocks"] = math.ceil( sys.getsizeof(metadata_bytes) / ( 2**16 ) ) # Second estimate
+
     logger.info("Collapsing the k-mer counts across the various input files into the final kdb file '{0}'".format(arguments.kdb))    
-    kdb_out = fileutil.open(arguments.kdb, 'wb', header)
+    kdb_out = fileutil.open(arguments.kdb, 'wb', metadata=metadata)
     try:
         iterating = True
         while iterating:
@@ -868,8 +894,8 @@ def profile(arguments):
                         pass
                     else:
                         seq = kmer.id_to_kmer(i, arguments.k)
-                        metadata = kmer.neighbors(seq, arguments.k) # metadata is initialized by the neighbors
-                        kdb_out.write("{0}\t{1}\t{2}\n".format(i, count, metadata))
+                        kmer_metadata = kmer.neighbors(seq, arguments.k) # metadata is initialized by the neighbors
+                        kdb_out.write("{0}\t{1}\t{2}\n".format(i, count, kmer_metadata))
                 else:
                     iterating = False
             except StopIteration as e:
@@ -1080,6 +1106,13 @@ def cli():
     index_parser.add_argument("kdb", type=str, help="A k-mer database file (.kdb)")
     index_parser.set_defaults(func=index_file)
 
+    shuf_parser = subparsers.add_parser("shuf", help="Create a shuffled .kdb file")
+    shuf_parser.add_argument("-v", "--verbose", help="Prints warnings to the console by default", default=0, action="count")
+    shuf_parser.add_argument("kdb_in", type=str, help="An indexed k-mer database file (.kdb)")
+    shuf_parser.add_argument("kdb_out", type=str, help="The output shuffled k-mer database file (.kdb)")
+    shuf_parser.set_defaults(func=shuf)
+
+
     markov_probability_parser = subparsers.add_parser("probability", help=u"""
 Calculate the log-odds ratio of the Markov probability of a given sequence from the product (pi) of the transition probabilities(aij) times the frequency of the first k-mer (P(X1)), given the entire k-mer profile of a species.
 
@@ -1092,7 +1125,7 @@ def cli():
     markov_probability_parser.add_argument("-d", "--delimiter", type=str, default="\t", help="The delimiter to use when reading the csv.")
     markov_probability_parser.add_argument("-b", "--fastq-block-size", type=int, default=100000, help="Number of reads to load in memory at once for processing")
     markov_probability_parser.add_argument("seqfile", type=str, metavar="<.fasta|.fastq>", default=None, help="Sequences to calculate standard Markov-chain probabilities from, either .fasta or .fastq")
-    markov_probability_parser.add_argument("kdb", type=str, help="A k-mer database file (.kdb)")
+    markov_probability_parser.add_argument("kdb", type=str, help="An indexed k-mer database file (.kdb)")
     markov_probability_parser.set_defaults(func=markov_probability)
 
     citation_parser = subparsers.add_parser("citation", help="Silence the citation notice on further runs")

diff --git a/kmerdb/config.py b/kmerdb/config.py
@@ -21,7 +21,7 @@
 end_header_line = "    kdb: can someone get me some sentries"
 header_delimiter = end_header_line + "\n" + ("="*24) + "\n"
 
-header_schema = {
+metadata_schema = {
     "type": "object",
     "properties": {
         "version": {"type": "string"},

diff --git a/kmerdb/distance.py b/kmerdb/distance.py
@@ -46,14 +46,14 @@ def correlation(fname1, fname2):
     with fileutil.open(fname1, mode='r') as kdb1:
         with fileutil.open(fname2, mode='r') as kdb2:
             if k is None:
-                k = kdb1.header['k']
-            if k != kdb1.header['k']:
-                raise Exception("File '{0}' reported k = {1} instead of k = {2}".format(f, kdb1.header['k'], k))
-            elif k != kdb2.header['k']:
-                raise Exception("File '{0}' reported k = {1} instead of k = {2}".format(f, kdb2.header['k'], k))
+                k = kdb1.metadata['k']
+            if k != kdb1.metadata['k']:
+                raise Exception("File '{0}' reported k = {1} instead of k = {2}".format(f, kdb1.metadata['k'], k))
+            elif k != kdb2.metadata['k']:
+                raise Exception("File '{0}' reported k = {1} instead of k = {2}".format(f, kdb2.metadata['k'], k))
             N = 4 ** k
-            x_bar = functools.reduce(lambda a,b: a+b, map(lambda x: x['total_kmers'], kdb1.header['files']), 0) / N
-            y_bar = functools.reduce(lambda a,b: a+b, map(lambda y: y['total_kmers'], kdb2.header['files']), 0) / N
+            x_bar = functools.reduce(lambda a,b: a+b, map(lambda x: x['total_kmers'], kdb1.metadata['files']), 0) / N
+            y_bar = functools.reduce(lambda a,b: a+b, map(lambda y: y['total_kmers'], kdb2.metadata['files']), 0) / N
             ## CALCULATE CORRELATION
             ssxx = 0
             ssyy = 0
@@ -88,11 +88,11 @@ def euclidean(fname1, fname2):
     with fileutil.open(fname1, mode='r') as kdb1:
         with fileutil.open(fname2, mode='r') as kdb2:
             if k is None:
-                k = kdb1.header['k']
-            if k != kdb1.header['k']:
-                raise Exception("File '{0}' reported k = {1} instead of k = {2}".format(f, kdb1.header['k'], k))
-            elif k != kdb2.header['k']:
-                raise Exception("File '{0}' reported k = {1} instead of k = {2}".format(f, kdb2.header['k'], k))
+                k = kdb1.meatadata['k']
+            if k != kdb1.metadata['k']:
+                raise Exception("File '{0}' reported k = {1} instead of k = {2}".format(f, kdb1.metadata['k'], k))
+            elif k != kdb2.metdata['k']:
+                raise Exception("File '{0}' reported k = {1} instead of k = {2}".format(f, kdb2.metadata['k'], k))
             N = 4 ** k
 
             for kmer_id in range(N):