Merge branch 'dev' of github.com:KrishnaswamyLab/graphtools into dev
scottgigante committed Oct 12, 2019
2 parents e827d60 + 2e8d8ff commit 96d253d
Showing 5 changed files with 315 additions and 35 deletions.
21 changes: 18 additions & 3 deletions graphtools/api.py
@@ -11,6 +11,7 @@

def Graph(data,
n_pca=None,
rank_threshold=None,
sample_idx=None,
adaptive_k=None,
precomputed=None,
@@ -53,14 +54,28 @@ def Graph(data,
----------
data : array-like, shape=[n_samples,n_features]
accepted types: `numpy.ndarray`, `scipy.sparse.spmatrix`.
TODO: accept pandas dataframes
n_pca : `int` or `None`, optional (default: `None`)
n_pca : {`int`, `None`, `bool`, 'auto'}, optional (default: `None`)
number of PC dimensions to retain for graph building.
If `None`, uses the original data.
If n_pca in `[None, False, 0]`, uses the original data.
If `True`, estimates the number of dimensions using a singular value threshold.
Note: if data is sparse, uses SVD instead of PCA
TODO: should we subtract and store the mean?
rank_threshold : `float` or 'auto', optional (default: 'auto')
threshold to use when estimating rank for
`n_pca in [True, 'auto']`.
Note that the default kwarg is `None`, which is parsed
to 'auto' internally.
If 'auto', this threshold is
smax * np.finfo(data.dtype).eps * max(data.shape)
where smax is the maximum singular value of the data matrix.
For reference, see, e.g.
W. Press, S. Teukolsky, W. Vetterling and B. Flannery,
“Numerical Recipes (3rd edition)”,
Cambridge University Press, 2007, page 795.
knn : `int`, optional (default: 5)
Number of nearest neighbors (including self) to use to build the graph
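A minimal usage sketch of the new parameters (assuming graphtools at this commit; the toy matrix and parameter values are illustrative only):

    import numpy as np
    import graphtools

    X = np.random.RandomState(42).normal(size=(100, 30))
    # estimate the number of PCs from a singular value threshold
    G = graphtools.Graph(X, n_pca=True, rank_threshold='auto', knn=5)
    # or request a fixed number of components, as before
    G_fixed = graphtools.Graph(X, n_pca=10, knn=5)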
174 changes: 151 additions & 23 deletions graphtools/base.py
@@ -1,5 +1,6 @@
from future.utils import with_metaclass
from builtins import super
from copy import copy as shallow_copy
import numpy as np
import abc
import pygsp
@@ -82,12 +83,26 @@ class Data(Base):
accepted types: `numpy.ndarray`, `scipy.sparse.spmatrix`.
`pandas.DataFrame`, `pandas.SparseDataFrame`.
n_pca : `int` or `None`, optional (default: `None`)
n_pca : {`int`, `None`, `bool`, 'auto'}, optional (default: `None`)
number of PC dimensions to retain for graph building.
If `None`, uses the original data.
If n_pca in `[None, False, 0]`, uses the original data.
If `True`, estimates the number of dimensions using a singular value threshold.
Note: if data is sparse, uses SVD instead of PCA
TODO: should we subtract and store the mean?
rank_threshold : `float` or 'auto', optional (default: 'auto')
threshold to use when estimating rank for
`n_pca in [True, 'auto']`.
Note that the default kwarg is `None`, which is parsed
to 'auto' internally.
If 'auto', this threshold is
smax * np.finfo(data.dtype).eps * max(data.shape)
where smax is the maximum singular value of the data matrix.
For reference, see, e.g.
W. Press, S. Teukolsky, W. Vetterling and B. Flannery,
“Numerical Recipes (3rd edition)”,
Cambridge University Press, 2007, page 795.
random_state : `int` or `None`, optional (default: `None`)
Random state for random PCA
@@ -105,15 +120,12 @@ class Data(Base):
sklearn PCA operator
"""

def __init__(self, data, n_pca=None, random_state=None, **kwargs):
def __init__(self, data, n_pca=None, rank_threshold=None,
random_state=None, **kwargs):

self._check_data(data)
if n_pca is not None and np.min(data.shape) <= n_pca:
warnings.warn("Cannot perform PCA to {} dimensions on "
"data with min(n_samples, n_features) = {}".format(
n_pca, np.min(data.shape)),
RuntimeWarning)
n_pca = None
n_pca, rank_threshold = self._parse_n_pca_threshold(
data, n_pca, rank_threshold)
try:
if isinstance(data, pd.SparseDataFrame):
data = data.to_coo()
@@ -134,10 +146,76 @@ def __init__(self, data, n_pca=None, random_state=None, **kwargs):
pass
self.data = data
self.n_pca = n_pca
self.rank_threshold = rank_threshold
self.random_state = random_state
self.data_nu = self._reduce_data()
super().__init__(**kwargs)

def _parse_n_pca_threshold(self, data, n_pca, rank_threshold):
if isinstance(n_pca, str):
n_pca = n_pca.lower()
if n_pca != "auto":
raise ValueError("n_pca must be an integer "
"0 <= n_pca < min(n_samples,n_features), "
"or in [None,False,True,'auto'].")
else:
n_pca = True
if isinstance(n_pca, numbers.Number):
if not float(n_pca).is_integer():  # fractional: round to the nearest integer
n_pcaR = np.round(n_pca).astype(int)
warnings.warn(
"Cannot perform PCA to fractional {} dimensions. "
"Rounding to {}".format(
n_pca, n_pcaR), RuntimeWarning)
n_pca = n_pcaR

if n_pca < 0:
raise ValueError(
"n_pca cannot be negative. "
"Please supply an integer "
"0 <= n_pca < min(n_samples,n_features) or None")
elif np.min(data.shape) <= n_pca:
warnings.warn(
"Cannot perform PCA to {} dimensions on "
"data with min(n_samples, n_features) = {}".format(
n_pca, np.min(
data.shape)), RuntimeWarning)
n_pca = 0

if n_pca in [0, False, None]: # cast 0, False to None.
n_pca = None
elif n_pca is True: # notify that we're going to estimate rank.
tasklogger.log_info("Estimating n_pca from matrix rank. "
"Supply an integer n_pca "
"for fixed amount.")
if not any([isinstance(n_pca, numbers.Number),
n_pca is None,
n_pca is True]):
raise ValueError(
"n_pca was not an instance of numbers.Number, "
"could not be cast to False, and not None. "
"Please supply an integer "
"0 <= n_pca < min(n_samples,n_features) or None")
if rank_threshold is not None and n_pca is not True:
warnings.warn("n_pca = {}, therefore rank_threshold of {} "
"will not be used. To use rank thresholding, "
"set n_pca = True".format(n_pca, rank_threshold),
RuntimeWarning)
if n_pca is True:
if isinstance(rank_threshold, str):
rank_threshold = rank_threshold.lower()
if rank_threshold is None:
rank_threshold = 'auto'
if isinstance(rank_threshold, numbers.Number):
if rank_threshold <= 0:
raise ValueError(
"rank_threshold must be positive float or 'auto'. ")
else:
if rank_threshold != 'auto':
raise ValueError(
"rank_threshold must be positive float or 'auto'. ")
return n_pca, rank_threshold
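To summarize the parsing rules above, a sketch of accepted inputs (assuming graphtools at this commit; the toy data is illustrative only):

    import numpy as np
    from graphtools.base import Data

    X = np.random.RandomState(0).normal(size=(50, 20))
    Data(X, n_pca=None)     # no PCA; 0 and False behave identically
    Data(X, n_pca=10)       # fixed number of components
    Data(X, n_pca=10.4)     # warns, rounds to 10
    Data(X, n_pca='auto')   # same as n_pca=True: rank estimated from the data
    Data(X, n_pca=True, rank_threshold=1e-10)  # explicit singular value cutoff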

def _check_data(self, data):
if len(data.shape) != 2:
msg = "ValueError: Expected 2D array, got {}D array " \
@@ -154,6 +232,7 @@ def _reduce_data(self):
If data is dense, uses randomized PCA. If data is sparse, uses
randomized SVD.
TODO: should we subtract and store the mean?
TODO: Fix the rank estimation so we do not compute the full SVD.
Returns
-------
@@ -166,13 +245,47 @@ def _reduce_data(self):
isinstance(self.data, sparse.lil_matrix) or \
isinstance(self.data, sparse.dok_matrix):
self.data = self.data.tocsr()
self.data_pca = TruncatedSVD(self.n_pca,
random_state=self.random_state)
if self.n_pca is True:
self.data_pca = TruncatedSVD(self.data.shape[1] - 1,
random_state=self.random_state)
else:
self.data_pca = TruncatedSVD(self.n_pca,
random_state=self.random_state)
else:
self.data_pca = PCA(self.n_pca,
svd_solver='randomized',
random_state=self.random_state)
if self.n_pca is True:
self.data_pca = PCA(self.data.shape[1] - 1,
svd_solver='randomized',
random_state=self.random_state)
else:
self.data_pca = PCA(self.n_pca,
svd_solver='randomized',
random_state=self.random_state)
self.data_pca.fit(self.data)
if self.n_pca is True:
s = self.data_pca.singular_values_
smax = s.max()
if self.rank_threshold == 'auto':
threshold = smax * \
np.finfo(self.data.dtype).eps * max(self.data.shape)
self.rank_threshold = threshold
threshold = self.rank_threshold
gate = np.where(s >= threshold)[0]
self.n_pca = gate.shape[0]
if self.n_pca == 0:
raise ValueError("Supplied threshold {} was greater than "
"maximum singular value {} "
"for the data matrix".format(threshold, smax))
tasklogger.log_info(
"Using rank estimate of {} as n_pca".format(
self.n_pca))
# reset the sklearn operator
op = self.data_pca  # alias for line-width brevity
op.components_ = op.components_[gate, :]
op.explained_variance_ = op.explained_variance_[gate]
op.explained_variance_ratio_ = op.explained_variance_ratio_[
gate]
op.singular_values_ = op.singular_values_[gate]
self.data_pca = op  # likely a no-op: op is the same object as self.data_pca, mutated in place
data_nu = self.data_pca.transform(self.data)
tasklogger.log_complete("PCA")
return data_nu
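The gating above is a hard threshold on the singular value spectrum. A standalone sketch of the same computation in plain numpy (illustrative variable names, not graphtools code):

    import numpy as np

    rng = np.random.RandomState(42)
    # a rank-5 matrix: 100 samples confined to a 5-dimensional subspace of 30 features
    X = rng.normal(size=(100, 5)) @ rng.normal(size=(5, 30))
    s = np.linalg.svd(X, compute_uv=False)  # singular values, descending
    threshold = s.max() * np.finfo(X.dtype).eps * max(X.shape)  # the 'auto' rule
    n_pca = int((s >= threshold).sum())  # expected to recover 5 here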
@@ -263,6 +376,7 @@ def inverse_transform(self, Y, columns=None):
----------
Y : array-like, shape=[n_samples_y, n_pca]
n_features must be the same as `self.data_nu`.
columns : list-like
list of integers referring to column indices in the original data
space to be returned. Avoids recomputing the full matrix where only
@@ -660,14 +774,13 @@ def to_pickle(self, path):
path : str
File path where the pickled object will be stored.
"""
if int(sys.version.split(".")[1]) < 7 and isinstance(self, pygsp.graphs.Graph):
# python 3.5, 3.6
logger = self.logger
self.logger = logger.name
pickle_obj = shallow_copy(self)
is_oldpygsp = all([isinstance(self, pygsp.graphs.Graph),
int(sys.version.split(".")[1]) < 7])
if is_oldpygsp:
pickle_obj.logger = pickle_obj.logger.name
with open(path, 'wb') as f:
pickle.dump(self, f, protocol=pickle.HIGHEST_PROTOCOL)
if int(sys.version.split(".")[1]) < 7 and isinstance(self, pygsp.graphs.Graph):
self.logger = logger
pickle.dump(pickle_obj, f, protocol=pickle.HIGHEST_PROTOCOL)
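The rewrite pickles a shallow copy so the live object's logger is never mutated. The same pattern in miniature (a generic sketch, not graphtools code):

    import pickle
    from copy import copy as shallow_copy

    class Thing:
        def __init__(self):
            self.logger = lambda msg: None  # stand-in for an unpicklable attribute

    t = Thing()
    p = shallow_copy(t)     # new object, same attribute references
    p.logger = 'name-only'  # swap the unpicklable attribute on the copy only
    blob = pickle.dumps(p)  # succeeds; t.logger is untouched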

def _check_shortest_path_distance(self, distance):
if distance == 'data' and self.weighted:
@@ -817,10 +930,25 @@ class DataGraph(with_metaclass(abc.ABCMeta, Data, BaseGraph)):
data : array-like, shape=[n_samples,n_features]
accepted types: `numpy.ndarray`, `scipy.sparse.spmatrix`.
n_pca : `int` or `None`, optional (default: `None`)
n_pca : {`int`, `None`, `bool`, 'auto'}, optional (default: `None`)
number of PC dimensions to retain for graph building.
If `None`, uses the original data.
If n_pca in `[None, False, 0]`, uses the original data.
If `True`, estimates the number of dimensions using a singular value threshold.
Note: if data is sparse, uses SVD instead of PCA
TODO: should we subtract and store the mean?
rank_threshold : `float` or 'auto', optional (default: 'auto')
threshold to use when estimating rank for
`n_pca in [True, 'auto']`.
Note that the default kwarg is `None`, which is parsed
to 'auto' internally.
If 'auto', this threshold is
smax * np.finfo(data.dtype).eps * max(data.shape)
where smax is the maximum singular value of the data matrix.
For reference, see, e.g.
W. Press, S. Teukolsky, W. Vetterling and B. Flannery,
“Numerical Recipes (3rd edition)”,
Cambridge University Press, 2007, page 795.
random_state : `int` or `None`, optional (default: `None`)
Random state for random PCA and graph building
26 changes: 20 additions & 6 deletions graphtools/graphs.py
@@ -86,7 +86,7 @@ def __init__(self, data, knn=5, decay=None,
"n_samples ({n}). Setting knn={n}".format(
k=knn, n=data.shape[0] - 2))
knn = data.shape[0] - 2
if n_pca is None and data.shape[1] > 500:
if n_pca in [None, 0, False] and data.shape[1] > 500:
warnings.warn("Building a kNNGraph on data of shape {} is "
"expensive. Consider setting n_pca.".format(
data.shape), UserWarning)
@@ -700,11 +700,25 @@ class TraditionalGraph(DataGraph):
distance metric for building kNN graph.
TODO: actually sklearn.neighbors has even more choices
n_pca : `int` or `None`, optional (default: `None`)
n_pca : {`int`, `None`, `bool`, 'auto'}, optional (default: `None`)
number of PC dimensions to retain for graph building.
If `None`, uses the original data.
Note: if data is sparse, uses SVD instead of PCA.
Only one of `precomputed` and `n_pca` can be set.
If n_pca in `[None, False, 0]`, uses the original data.
If `True`, estimates the number of dimensions using a singular value threshold.
Note: if data is sparse, uses SVD instead of PCA
TODO: should we subtract and store the mean?
rank_threshold : `float` or 'auto', optional (default: 'auto')
threshold to use when estimating rank for
`n_pca in [True, 'auto']`.
Note that the default kwarg is `None`, which is parsed
to 'auto' internally.
If 'auto', this threshold is
smax * np.finfo(data.dtype).eps * max(data.shape)
where smax is the maximum singular value of the data matrix.
For reference, see, e.g.
W. Press, S. Teukolsky, W. Vetterling and B. Flannery,
“Numerical Recipes (3rd edition)”,
Cambridge University Press, 2007, page 795.
thresh : `float`, optional (default: `1e-4`)
Threshold above which to calculate alpha decay kernel.
@@ -731,7 +745,7 @@ def __init__(self, data,
raise ValueError(
"`decay` must be provided for a "
"TraditionalGraph. For kNN kernel, use kNNGraph.")
if precomputed is not None and n_pca is not None:
if precomputed is not None and n_pca not in [None, 0, False]:
# the data itself is a matrix of distances / affinities
n_pca = None
warnings.warn("n_pca cannot be given on a precomputed graph."
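For the precomputed branch, a hedged usage sketch (it assumes the 'distance' mode that graphtools documents for `precomputed`; the toy data is illustrative only):

    import numpy as np
    from scipy.spatial.distance import pdist, squareform
    import graphtools

    X = np.random.RandomState(0).normal(size=(50, 10))
    D = squareform(pdist(X))  # symmetric pairwise distance matrix
    # n_pca is ignored (with a warning) when the input is precomputed;
    # decay is required for a TraditionalGraph
    G = graphtools.Graph(D, precomputed='distance', decay=10, knn=5)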
4 changes: 2 additions & 2 deletions setup.py
@@ -43,8 +43,8 @@
setup(name='graphtools',
version=version,
description='graphtools',
author='Jay Stanley and Scott Gigante, Krishnaswamy Lab, Yale University',
author_email='jay.stanley@yale.edu',
author='Scott Gigante, Daniel Burkhardt, and Jay Stanley, Yale University',
author_email='scott.gigante@yale.edu',
packages=['graphtools', ],
license='GNU General Public License Version 2',
install_requires=install_requires,
