Commit 9d02878
Merge a62e0d2 into c1bbfc6
2 parents c1bbfc6 + a62e0d2
MatthewRalston committed Jan 2, 2021
Showing 8 changed files with 583 additions and 8 deletions.
1 change: 1 addition & 0 deletions .travis.yml
@@ -2,6 +2,7 @@ language: python
python:
- "3.7.4"
install:
- pip install Cython
- pip install -r requirements.txt
- pip install pytest pytest-cov coveralls
script:
20 changes: 20 additions & 0 deletions README.md
@@ -105,3 +105,23 @@ Distributed under the GPL v3.0 license. See `LICENSE.txt` for the copy distribut
4. Push to the branch (`git push origin feature/fooBar`)
5. Create a new Pull Request

## Acknowledgements

Thank you to the authors of kPAL and Jellyfish for the early inspiration, and thank you to others, who shall remain nameless, for the encouragement along the way. I wanted this library to offer a sound strategy for assessing k-mer profiles: one that is aware of the cost of the analytical tasks at hand, stores exact profiles in sync with the current assemblies, and updates the k-mer databases only when needed to generate sufficient spectral signature information.

The intention is that more developers will want to add functionality to the codebase, or simply use it downstream, building directly on numpy and scipy/scikit as needed; the library suggests the basic infrastructure for the ML problems and modeling approaches that could be applied to such datasets. This project began under GPL v3.0 and will hopefully gain some interest.

More on the flip-side. Literally. And figuratively. It's so complex with technology these days.

<!--
Thanks to my former mentors BC, MR, IN, CR, and my newer bosses PJ and KL.
Thanks to the Pap lab for the first dataset that I continue to use.
Thank you to Ryan for the food and stuff.
Thanks to Blahah for tolerating someone snooping and imitating his Ruby style.
Thanks to Erin for getting my feet wet in this new field.
Thanks to Rachel for the good memories and friendship.
Thanks to Yasmeen for the usual banter.
Thanks to Max, Robin, and Robert for the halfway decent memories in St. Louis.
And thanks to my family and friends.
Go Blue Hens
-->
22 changes: 22 additions & 0 deletions TODO.org
@@ -4,6 +4,28 @@
The initial prototype would be plaintext
The final prototype would be .bgzf format from biopython
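
A minimal sketch of the planned .bgzf storage using biopython's Bio.bgzf; the text-mode handling and record layout below are assumptions, not the final kdb format:

#+BEGIN_SRC python
from Bio import bgzf

# Write a toy profile: one 'kmer_id<TAB>count' line per k-mer
w = bgzf.open("example.kdb", "w")
for kmer_id, count in enumerate([4, 0, 7, 1]):
    w.write("{0}\t{1}\n".format(kmer_id, count))
w.close()

# BGZF is block-gzipped, so readers can later seek to virtual offsets
r = bgzf.open("example.kdb", "r")
print(r.readline().rstrip())
r.close()
#+END_SRC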


* MASTHEADS
** DEBUG_MASTHEAD
*** other figure filepaths
*** Memory issue regarding matrix formation
**** Can we calculate for them how much memory would be required?
**** Can we print that as the final line of the error?

* [[https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html#sklearn.decomposition.PCA][PCA]] (scikit-learn)
** Uses the LAPACK implementation
* Manifold learning
** Isomap (derived from multidimensional scaling (MDS) or Kernel PCA)
*** Lower dimensional projection of the data preserving geodesic distances between all points
** (Modified) Locally Linear Embedding
*** Lower dimensional projection of the data preserving local neighborhood distances
*** locally_linear_embedding or LocallyLinearEmbedding with method="modified"
** t-SNE
*** While isomap, LLE, and variants are best suited to unfolding a single continuous low-dimensional manifold,
*** t-SNE will focus on the local structure of the data and will tend to extract clustered local groups of samples.
*** This ability to group samples based on local structure can help visually disentangle a dataset that comprises several manifolds at once (a minimal sketch follows below).
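
A minimal sketch comparing the two approaches on a toy k-mer count matrix, assuming scikit-learn; the array shapes and parameters are illustrative only:

#+BEGIN_SRC python
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

profiles = np.random.poisson(5, size=(10, 4**8)).astype(float)  # 10 samples, k=8

pca_coords = PCA(n_components=2).fit_transform(profiles)    # global structure, LAPACK-backed SVD
tsne_coords = TSNE(n_components=2, perplexity=5).fit_transform(profiles)  # local neighborhoods
#+END_SRC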

* Comment code
** TODO fileutil
** profile
308 changes: 303 additions & 5 deletions bin/kdb

Large diffs are not rendered by default.

181 changes: 181 additions & 0 deletions config.py
@@ -51,3 +51,184 @@
},
"required": ["version", "metadata_blocks", "k", "tags", "files"]
}
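
# --- Hypothetical validation sketch (not in the original config.py) ---
# 'metadata_schema' is a stand-in name for the schema dict ending above;
# its real variable name is truncated in this diff.
#
#     import jsonschema
#
#     example_header = {"version": "0.0.1", "metadata_blocks": 1, "k": 8,
#                       "tags": [], "files": []}
#     jsonschema.validate(instance=example_header, schema=metadata_schema)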


pca_variance_fig_filepath = "PCA_variance_accumulation.png"
kmeans_elbow_graph_fig_filepath = "kmeans_on_pca_elbow_graph.png"
kmeans_clustering_fig_filepath = "kmeans_clustering_of_kmer_profiles.png"
ecopy_rarefaction_fig_filepath = "ecopy_rarefaction_curve.png"
files = (pca_variance_fig_filepath, kmeans_elbow_graph_fig_filepath, kmeans_clustering_fig_filepath, ecopy_rarefaction_fig_filepath)

DEFAULT_MASTHEAD = """
==========================================
k d b
==========================================
Thank you for using kdb. Please feel free
to submit issues and requests to
https://github.com/MatthewRalston/kdb
Copyright 2020 Matt Ralston (mrals89@gmail.com)
# First links
https://matthewralston.github.io/blog/kmer-database-format-part-1
https://github.com/MatthewRalston/kdb
Please cite my repository in your work!
Feel free to e-mail me or reach out!
"""
DONE = """
==========================================
----------------D O N E-------------------
==========================================
"""


MATRIX_MASTHEAD = """
==========================================
./bin/kdb matrix
==========================================
Matrix generation beginning!
Matrix will be written to STDOUT as this is the first step of the pipeline.
"""
for i in range(42, 1, -1):
MATRIX_MASTHEAD += i*"=" + "\n"


CLUSTER_MASTHEAD = """
==========================================
./bin/kdb cluster
==========================================
K-means clustering beginning!
"""
for i in range(42, 1, -1):
CLUSTER_MASTHEAD += i*"=" + "\n"

RAREFY_MASTHEAD = """
==========================================
./bin/kdb rarefy
==========================================
Rarefaction beginning!
"""
for i in range(42, 1, -1):
RAREFY_MASTHEAD += i*"=" + "\n"





DEBUG_MASTHEAD = '''
The workflow is roughly as follows:
# # # # # # # # # # #
# profile generation #
# # # # # # # # # # #
# I have included -p and -b parameters that influence the rate of profile generation.
# The block size (-b) is primarily for the number of .fastq(.gz) records to read at a time.
# The -p parallel feature works within fastq parsing to process k-mer shredding/counting.
#
# -k $K is the parameter that specifies the k-mer resolution
#
# This command uses SQLite3 behind the scenes for on-disk k-mer counting
# since memory is rate limiting for profile generation when dealing
# with biologically conventional choices of k (20 < k < 35).
parallel 'kdb profile -k $K {{}} {{.}}.$K.kdb' ::: $(/bin/ls test/data/*.fasta.gz)
# # # # # # #
# analysis #
# # # # # # #
################################
# W A R N I N G : M E M O R Y #
################################
# The first step of either rarefaction or clustering is to generate the k-mer profile matrix
# The matrix is not a new concept; it is simply the sample k-mer profiles arranged as columns.
# Without metadata or graphical information embedded in the format,
# we can create a matrix and do data science with the information.
# However, because the profiles are exponential in k, a linear increase in k
# hypothetically results in an improvement of resolution that is at least superlinear in k.
# Therefore, the amount of memory can be calculated by the integer size times the profile resolution
# times the number of samples.
#
#
##################
# Cluster analysis
##################
# The first step ('kdb matrix') generates one from different profiles with the same choice of k.
# This command uses ecopy to normalize between sample k-mer total counts before PCA/tSNE.
# -n $N is the dimensionality of either PCA or tSNE. A good choice for tSNE is 2.
# If the command is run with PCA without a selected dimensionality, an elbow graph
# will be produced named '{0}'. Please use this graph to select
# the number of principal components to use.
# The pipeline will not continue until -n $N is selected by the user.
# It is not recommended to feed Unnormalized or Normalized matrices directly to 'kdb cluster'
#
# The PCA/tSNE matrix will be $N x N, where $N is the reduced dimensionality and N is the number of samples/files/profiles.
#
# And finally, a k-means clustering will be done on the reduced dimensionality dataset
# Note here that the -k $K is not related to the choice of substring length 'k' for profile generation.
# The 'kdb cluster' command produces two figures, first is an elbow graph looking at up to N clusters.
# This elbow graph will be written to '{1}'.
# The second is the more typical scatterplot of the first two reduced dimensions
# and the k-means clustering labels shown over the scatter.
# This file will be written to '{2}'.
kdb matrix [-n $N] [ PCA | tSNE ] test/data/*.$K.kdb | kdb cluster -k $K
#
# If you want to save a matrix from 'kdb matrix' for your own use,
# consider gzip compressing it if it is the Normalized or Unnormalized matrix,
# which is used downstream in the rarefaction analytical pathway.
#
##################
# Rarefaction
##################
# The Unnormalized and Normalized matrices go to ecopy's rarefy function to produce a rarefaction plot
# This plot will be written to '{3}'.
kdb matrix [ Unnormalized | Normalized ] test/data/*.$K.kdb | kdb rarefy
'''.format(*files)
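
# --- Hypothetical sketch (not in the original config.py): making the memory
# warning in DEBUG_MASTHEAD concrete. Estimated bytes = dtype item size
# times the profile length (4^k) times the number of samples.
def estimated_matrix_bytes(k, num_samples, dtype="int32"):
    import numpy as np
    return np.dtype(dtype).itemsize * (4 ** k) * num_samples

# e.g. estimated_matrix_bytes(12, 10) -> 671,088,640 bytes (~640 MiB),
# while k=25 with the same ten profiles would need roughly 45 PB.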

34 changes: 32 additions & 2 deletions kdb/fileutil.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@
import tempfile
import yaml, json
from collections import deque, OrderedDict
import psutil
import numpy as np

#import pdb

from builtins import open as _open

@@ -234,7 +236,35 @@ def __next__(self):
# self._handle.close()
# raise StopIteration
# return self._buffer


def slurp(self, dtype: str = "int32"):
    """
    Read an entire .kdb file into memory as a numpy array of k-mer counts
    """
    if type(dtype) is not str:
        raise TypeError("kdb.fileutil.KDBReader.slurp expects the dtype keyword argument to be a str")

    try:
        np.dtype(dtype)
    except TypeError as e:
        logger.error(e)
        logger.error("kdb.fileutil.KDBReader.slurp encountered a TypeError while assessing the numpy datatype '{0}'...".format(dtype))
        raise TypeError("kdb.fileutil.KDBReader.slurp expects the dtype keyword argument to be a valid numpy data type")

    # First calculate the amount of memory required by the array
    N = 4**self.k # The dimension of the k-space, i.e. the number of elements of the array
    num_bytes = np.dtype(dtype).itemsize * N # Bytes required: element size of the requested dtype times the number of k-mers
    vmem = psutil.virtual_memory()
    if vmem.available > num_bytes:
        # Do the slurp: one tab-delimited 'kmer_id<TAB>count' record per k-mer
        self.profile = np.zeros(N, dtype=dtype)
        for kmer_id in range(N):
            line = next(self)
            _, count = (int(_count) for _count in line.rstrip().split("\t"))
            self.profile[kmer_id] = count
    else:
        raise OSError("The dimensionality at k={0} (4^k = {1} elements) exceeds the amount of available memory (bytes): {2}".format(self.k, N, vmem.available))
    return self.profile
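
# Hypothetical usage sketch for slurp(); the reader construction below is an
# assumption, since KDBReader's constructor is not shown in this diff:
#
#     from kdb import fileutil
#     kdb_in = fileutil.open("test/data/example.8.kdb", mode="r")
#     profile = kdb_in.slurp(dtype="int32")   # full 4^k count vector
#     print(profile.shape, int(profile.sum()))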

def setup_yaml():
"""
11 changes: 11 additions & 0 deletions requirements.txt
@@ -1,9 +1,20 @@
biopython==1.74
boto3==1.10.8
botocore==1.13.8
Cython==0.29.21
distlib==0.3.0
docutils==0.15.2
-e git://github.com/MatthewRalston/ecopy.git#egg=ecopy
jsonschema==3.1.1
matplotlib==3.1.3
more-itertools==8.2.0
numba==0.52.0
numpy==1.18.1
pandas==1.2.0
psutil==5.8.0
PyYAML==5.1.2
SQLAlchemy==1.3.13
Sphinx==3.1.2
sphinx_rtd_theme==0.5.0
urllib3==1.25.8
virtualenv==20.0.8
14 changes: 13 additions & 1 deletion setup.py
@@ -87,11 +87,23 @@ def can_import(module_name):
'biopython==1.74',
'boto3==1.10.8',
'botocore==1.13.8',
'Cython==0.29.21',
'distlib==0.3.0',
'docutils==0.15.2',
'ecopy @ git+git://github.com/MatthewRalston/ecopy.git',  # pip does not accept '-e' VCS strings inside install_requires
'jsonschema==3.1.1',
'matplotlib==3.1.3',
'more-itertools==8.2.0',
'numba==0.52.0',
'numpy==1.18.1',
'pandas==1.2.0',
'psutil==5.8.0',
'PyYAML==5.1.2',
'SQLAlchemy==1.3.13',
'Sphinx==2.2.0']
'Sphinx==3.1.2',
'sphinx_rtd_theme==0.5.0',
'urllib3==1.25.8',
'virtualenv==20.0.8']

# What packages are optional?
EXTRAS = {
