Merge branch 'dev' of github.com:KrishnaswamyLab/graphtools into dev
scottgigante committed Oct 12, 2019
2 parents e827d60 + 2e8d8ff commit 96d253d
Showing 5 changed files with 315 additions and 35 deletions.
21 changes: 18 additions & 3 deletions graphtools/api.py
@@ -11,6 +11,7 @@

def Graph(data,
n_pca=None,
rank_threshold=None,
sample_idx=None,
adaptive_k=None,
precomputed=None,
@@ -53,14 +54,28 @@ def Graph(data,
----------
data : array-like, shape=[n_samples,n_features]
accepted types: `numpy.ndarray`, `scipy.sparse.spmatrix`.
TODO: accept pandas dataframes
n_pca : `int` or `None`, optional (default: `None`)
n_pca : {`int`, `None`, `bool`, 'auto'}, optional (default: `None`)
number of PC dimensions to retain for graph building.
If `None`, uses the original data.
If n_pca in `[None, False, 0]`, uses the original data.
If `True`, estimates the number of dimensions using a singular value threshold.
Note: if data is sparse, uses SVD instead of PCA
TODO: should we subtract and store the mean?
rank_threshold : `float` or 'auto', optional (default: 'auto')
threshold to use when estimating rank for
`n_pca in [True, 'auto']`.
Note that the default kwarg is `None`, which is parsed
to 'auto' internally.
If 'auto', this threshold is
smax * np.finfo(data.dtype).eps * max(data.shape)
where smax is the maximum singular value of the data matrix.
For reference, see, e.g.
W. Press, S. Teukolsky, W. Vetterling and B. Flannery,
“Numerical Recipes (3rd edition)”,
Cambridge University Press, 2007, page 795.
knn : `int`, optional (default: 5)
Number of nearest neighbors (including self) to use to build the graph
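A minimal usage sketch of the new parameters (assuming graphtools at this commit; the toy matrix and parameter values are illustrative only):

    import numpy as np
    import graphtools

    X = np.random.RandomState(42).normal(size=(100, 30))
    # estimate the number of PCs from a singular value threshold
    G = graphtools.Graph(X, n_pca=True, rank_threshold='auto', knn=5)
    # or request a fixed number of components, as before
    G_fixed = graphtools.Graph(X, n_pca=10, knn=5)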
174 changes: 151 additions & 23 deletions graphtools/base.py
@@ -1,5 +1,6 @@
from future.utils import with_metaclass
from builtins import super
from copy import copy as shallow_copy
import numpy as np
import abc
import pygsp
@@ -82,12 +83,26 @@ class Data(Base):
accepted types: `numpy.ndarray`, `scipy.sparse.spmatrix`.
`pandas.DataFrame`, `pandas.SparseDataFrame`.
n_pca : `int` or `None`, optional (default: `None`)
n_pca : {`int`, `None`, `bool`, 'auto'}, optional (default: `None`)
number of PC dimensions to retain for graph building.
If `None`, uses the original data.
If n_pca in `[None, False, 0]`, uses the original data.
If `True`, estimates the number of dimensions using a singular value threshold.
Note: if data is sparse, uses SVD instead of PCA
TODO: should we subtract and store the mean?
rank_threshold : `float` or 'auto', optional (default: 'auto')
threshold to use when estimating rank for
`n_pca in [True, 'auto']`.
Note that the default kwarg is `None`, which is parsed
to 'auto' internally.
If 'auto', this threshold is
smax * np.finfo(data.dtype).eps * max(data.shape)
where smax is the maximum singular value of the data matrix.
For reference, see, e.g.
W. Press, S. Teukolsky, W. Vetterling and B. Flannery,
“Numerical Recipes (3rd edition)”,
Cambridge University Press, 2007, page 795.
random_state : `int` or `None`, optional (default: `None`)
Random state for random PCA
@@ -105,15 +120,12 @@ class Data(Base):
sklearn PCA operator
"""

def __init__(self, data, n_pca=None, random_state=None, **kwargs):
def __init__(self, data, n_pca=None, rank_threshold=None,
random_state=None, **kwargs):

self._check_data(data)
if n_pca is not None and np.min(data.shape) <= n_pca:
warnings.warn("Cannot perform PCA to {} dimensions on "
"data with min(n_samples, n_features) = {}".format(
n_pca, np.min(data.shape)),
RuntimeWarning)
n_pca = None
n_pca, rank_threshold = self._parse_n_pca_threshold(
data, n_pca, rank_threshold)
try:
if isinstance(data, pd.SparseDataFrame):
data = data.to_coo()
@@ -134,10 +146,76 @@ def __init__(self, data, n_pca=None, random_state=None, **kwargs):
pass
self.data = data
self.n_pca = n_pca
self.rank_threshold = rank_threshold
self.random_state = random_state
self.data_nu = self._reduce_data()
super().__init__(**kwargs)

def _parse_n_pca_threshold(self, data, n_pca, rank_threshold):
if isinstance(n_pca, str):
n_pca = n_pca.lower()
if n_pca != "auto":
raise ValueError("n_pca must be an integer "
"0 <= n_pca < min(n_samples,n_features), "
"or in [None,False,True,'auto'].")
else:
n_pca = True
if isinstance(n_pca, numbers.Number):
if not float(n_pca).is_integer():  # fractional: round to the nearest integer
n_pcaR = np.round(n_pca).astype(int)
warnings.warn(
"Cannot perform PCA to fractional {} dimensions. "
"Rounding to {}".format(
n_pca, n_pcaR), RuntimeWarning)
n_pca = n_pcaR

if n_pca < 0:
raise ValueError(
"n_pca cannot be negative. "
"Please supply an integer "
"0 <= n_pca < min(n_samples,n_features) or None")
elif np.min(data.shape) <= n_pca:
warnings.warn(
"Cannot perform PCA to {} dimensions on "
"data with min(n_samples, n_features) = {}".format(
n_pca, np.min(
data.shape)), RuntimeWarning)
n_pca = 0

if n_pca in [0, False, None]: # cast 0, False to None.
n_pca = None
elif n_pca is True: # notify that we're going to estimate rank.
tasklogger.log_info("Estimating n_pca from matrix rank. "
"Supply an integer n_pca "
"for fixed amount.")
if not any([isinstance(n_pca, numbers.Number),
n_pca is None,
n_pca is True]):
raise ValueError(
"n_pca was not an instance of numbers.Number, "
"could not be cast to False, and not None. "
"Please supply an integer "
"0 <= n_pca < min(n_samples,n_features) or None")
if rank_threshold is not None and n_pca is not True:
warnings.warn("n_pca = {}, therefore rank_threshold of {} "
"will not be used. To use rank thresholding, "
"set n_pca = True".format(n_pca, rank_threshold),
RuntimeWarning)
if n_pca is True:
if isinstance(rank_threshold, str):
rank_threshold = rank_threshold.lower()
if rank_threshold is None:
rank_threshold = 'auto'
if isinstance(rank_threshold, numbers.Number):
if rank_threshold <= 0:
raise ValueError(
"rank_threshold must be positive float or 'auto'. ")
else:
if rank_threshold != 'auto':
raise ValueError(
"rank_threshold must be positive float or 'auto'. ")
return n_pca, rank_threshold
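To summarize the parsing rules above, a sketch of accepted inputs (assuming graphtools at this commit; the toy data is illustrative only):

    import numpy as np
    from graphtools.base import Data

    X = np.random.RandomState(0).normal(size=(50, 20))
    Data(X, n_pca=None)     # no PCA; 0 and False behave identically
    Data(X, n_pca=10)       # fixed number of components
    Data(X, n_pca=10.4)     # warns, rounds to 10
    Data(X, n_pca='auto')   # same as n_pca=True: rank estimated from the data
    Data(X, n_pca=True, rank_threshold=1e-10)  # explicit singular value cutoff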

def _check_data(self, data):
if len(data.shape) != 2:
msg = "ValueError: Expected 2D array, got {}D array " \
@@ -154,6 +232,7 @@ def _reduce_data(self):
If data is dense, uses randomized PCA. If data is sparse, uses
randomized SVD.
TODO: should we subtract and store the mean?
TODO: Fix the rank estimation so we do not compute the full SVD.
Returns
-------
@@ -166,13 +245,47 @@ def _reduce_data(self):
isinstance(self.data, sparse.lil_matrix) or \
isinstance(self.data, sparse.dok_matrix):
self.data = self.data.tocsr()
self.data_pca = TruncatedSVD(self.n_pca,
random_state=self.random_state)
if self.n_pca is True:
self.data_pca = TruncatedSVD(self.data.shape[1] - 1,
random_state=self.random_state)
else:
self.data_pca = TruncatedSVD(self.n_pca,
random_state=self.random_state)
else:
self.data_pca = PCA(self.n_pca,
svd_solver='randomized',
random_state=self.random_state)
if self.n_pca is True:
self.data_pca = PCA(self.data.shape[1] - 1,
svd_solver='randomized',
random_state=self.random_state)
else:
self.data_pca = PCA(self.n_pca,
svd_solver='randomized',
random_state=self.random_state)
self.data_pca.fit(self.data)
if self.n_pca is True:
s = self.data_pca.singular_values_
smax = s.max()
if self.rank_threshold == 'auto':
threshold = smax * \
np.finfo(self.data.dtype).eps * max(self.data.shape)
self.rank_threshold = threshold
threshold = self.rank_threshold
gate = np.where(s >= threshold)[0]
self.n_pca = gate.shape[0]
if self.n_pca == 0:
raise ValueError("Supplied threshold {} was greater than "
"maximum singular value {} "
"for the data matrix".format(threshold, smax))
tasklogger.log_info(
"Using rank estimate of {} as n_pca".format(
self.n_pca))
# reset the sklearn operator
op = self.data_pca  # alias for line-width brevity
op.components_ = op.components_[gate, :]
op.explained_variance_ = op.explained_variance_[gate]
op.explained_variance_ratio_ = op.explained_variance_ratio_[
gate]
op.singular_values_ = op.singular_values_[gate]
self.data_pca = op  # likely a no-op: op is the same object as self.data_pca, mutated in place
data_nu = self.data_pca.transform(self.data)
tasklogger.log_complete("PCA")
return data_nu
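The gating above is a hard threshold on the singular value spectrum. A standalone sketch of the same computation in plain numpy (illustrative variable names, not graphtools code):

    import numpy as np

    rng = np.random.RandomState(42)
    # a rank-5 matrix: 100 samples confined to a 5-dimensional subspace of 30 features
    X = rng.normal(size=(100, 5)) @ rng.normal(size=(5, 30))
    s = np.linalg.svd(X, compute_uv=False)  # singular values, descending
    threshold = s.max() * np.finfo(X.dtype).eps * max(X.shape)  # the 'auto' rule
    n_pca = int((s >= threshold).sum())  # expected to recover 5 here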
@@ -263,6 +376,7 @@ def inverse_transform(self, Y, columns=None):
----------
Y : array-like, shape=[n_samples_y, n_pca]
n_features must be the same as `self.data_nu`.
columns : list-like
list of integers referring to column indices in the original data
space to be returned. Avoids recomputing the full matrix where only
@@ -660,14 +774,13 @@ def to_pickle(self, path):
path : str
File path where the pickled object will be stored.
"""
if int(sys.version.split(".")[1]) < 7 and isinstance(self, pygsp.graphs.Graph):
# python 3.5, 3.6
logger = self.logger
self.logger = logger.name
pickle_obj = shallow_copy(self)
is_oldpygsp = all([isinstance(self, pygsp.graphs.Graph),
int(sys.version.split(".")[1]) < 7])
if is_oldpygsp:
pickle_obj.logger = pickle_obj.logger.name
with open(path, 'wb') as f:
pickle.dump(self, f, protocol=pickle.HIGHEST_PROTOCOL)
if int(sys.version.split(".")[1]) < 7 and isinstance(self, pygsp.graphs.Graph):
self.logger = logger
pickle.dump(pickle_obj, f, protocol=pickle.HIGHEST_PROTOCOL)
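The rewrite pickles a shallow copy so the live object's logger is never mutated. The same pattern in miniature (a generic sketch, not graphtools code):

    import pickle
    from copy import copy as shallow_copy

    class Thing:
        def __init__(self):
            self.logger = lambda msg: None  # stand-in for an unpicklable attribute

    t = Thing()
    p = shallow_copy(t)     # new object, same attribute references
    p.logger = 'name-only'  # swap the unpicklable attribute on the copy only
    blob = pickle.dumps(p)  # succeeds; t.logger is untouched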

def _check_shortest_path_distance(self, distance):
if distance == 'data' and self.weighted:
@@ -817,10 +930,25 @@ class DataGraph(with_metaclass(abc.ABCMeta, Data, BaseGraph)):
data : array-like, shape=[n_samples,n_features]
accepted types: `numpy.ndarray`, `scipy.sparse.spmatrix`.
n_pca : `int` or `None`, optional (default: `None`)
n_pca : {`int`, `None`, `bool`, 'auto'}, optional (default: `None`)
number of PC dimensions to retain for graph building.
If `None`, uses the original data.
If n_pca in `[None, False, 0]`, uses the original data.
If `True`, estimates the number of dimensions using a singular value threshold.
Note: if data is sparse, uses SVD instead of PCA
TODO: should we subtract and store the mean?
rank_threshold : `float` or 'auto', optional (default: 'auto')
threshold to use when estimating rank for
`n_pca in [True, 'auto']`.
Note that the default kwarg is `None`, which is parsed
to 'auto' internally.
If 'auto', this threshold is
smax * np.finfo(data.dtype).eps * max(data.shape)
where smax is the maximum singular value of the data matrix.
For reference, see, e.g.
W. Press, S. Teukolsky, W. Vetterling and B. Flannery,
“Numerical Recipes (3rd edition)”,
Cambridge University Press, 2007, page 795.
random_state : `int` or `None`, optional (default: `None`)
Random state for random PCA and graph building
26 changes: 20 additions & 6 deletions graphtools/graphs.py
@@ -86,7 +86,7 @@ def __init__(self, data, knn=5, decay=None,
"n_samples ({n}). Setting knn={n}".format(
k=knn, n=data.shape[0] - 2))
knn = data.shape[0] - 2
if n_pca is None and data.shape[1] > 500:
if n_pca in [None, 0, False] and data.shape[1] > 500:
warnings.warn("Building a kNNGraph on data of shape {} is "
"expensive. Consider setting n_pca.".format(
data.shape), UserWarning)
@@ -700,11 +700,25 @@ class TraditionalGraph(DataGraph):
distance metric for building kNN graph.
TODO: actually sklearn.neighbors has even more choices
n_pca : `int` or `None`, optional (default: `None`)
n_pca : {`int`, `None`, `bool`, 'auto'}, optional (default: `None`)
number of PC dimensions to retain for graph building.
If `None`, uses the original data.
Note: if data is sparse, uses SVD instead of PCA.
Only one of `precomputed` and `n_pca` can be set.
If n_pca in `[None, False, 0]`, uses the original data.
If `True`, estimates the number of dimensions using a singular value threshold.
Note: if data is sparse, uses SVD instead of PCA
TODO: should we subtract and store the mean?
rank_threshold : `float` or 'auto', optional (default: 'auto')
threshold to use when estimating rank for
`n_pca in [True, 'auto']`.
Note that the default kwarg is `None`, which is parsed
to 'auto' internally.
If 'auto', this threshold is
smax * np.finfo(data.dtype).eps * max(data.shape)
where smax is the maximum singular value of the data matrix.
For reference, see, e.g.
W. Press, S. Teukolsky, W. Vetterling and B. Flannery,
“Numerical Recipes (3rd edition)”,
Cambridge University Press, 2007, page 795.
thresh : `float`, optional (default: `1e-4`)
Threshold above which to calculate alpha decay kernel.
@@ -731,7 +745,7 @@ def __init__(self, data,
raise ValueError(
"`decay` must be provided for a "
"TraditionalGraph. For kNN kernel, use kNNGraph.")
if precomputed is not None and n_pca is not None:
if precomputed is not None and n_pca not in [None, 0, False]:
# the data itself is a matrix of distances / affinities
n_pca = None
warnings.warn("n_pca cannot be given on a precomputed graph."
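For the precomputed branch, a hedged usage sketch (it assumes the 'distance' mode that graphtools documents for `precomputed`; the toy data is illustrative only):

    import numpy as np
    from scipy.spatial.distance import pdist, squareform
    import graphtools

    X = np.random.RandomState(0).normal(size=(50, 10))
    D = squareform(pdist(X))  # symmetric pairwise distance matrix
    # n_pca is ignored (with a warning) when the input is precomputed;
    # decay is required for a TraditionalGraph
    G = graphtools.Graph(D, precomputed='distance', decay=10, knn=5)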
4 changes: 2 additions & 2 deletions setup.py
@@ -43,8 +43,8 @@
setup(name='graphtools',
version=version,
description='graphtools',
author='Jay Stanley and Scott Gigante, Krishnaswamy Lab, Yale University',
author_email='jay.stanley@yale.edu',
author='Scott Gigante, Daniel Burkhardt, and Jay Stanley, Yale University',
author_email='scott.gigante@yale.edu',
packages=['graphtools', ],
license='GNU General Public License Version 2',
install_requires=install_requires,
