Skip to content

Commit

Permalink
improve n_pca='auto' docs
Browse files Browse the repository at this point in the history
  • Loading branch information
scottgigante committed Oct 12, 2019
1 parent 360c785 commit fa0d586
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 42 deletions.
21 changes: 11 additions & 10 deletions graphtools/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,23 +58,18 @@ def Graph(data,
n_pca : {`int`, `None`, `bool`, 'auto'}, optional (default: `None`)
number of PC dimensions to retain for graph building.
If n_pca in `[None,False,0]`, uses the original data.
If `True` then estimate using a singular value threshold
If n_pca in `[None, False, 0]`, uses the original data.
If 'auto' or `True` then estimate using a singular value threshold
Note: if data is sparse, uses SVD instead of PCA
TODO: should we subtract and store the mean?
rank_threshold : `float`, 'auto', optional (default: 'auto')
threshold to use when estimating rank for
`n_pca in [True, 'auto']`.
Note that the default kwarg is `None` for this parameter.
It is subsequently parsed to 'auto' if necessary.
If 'auto', this threshold is
smax * np.finfo(data.dtype).eps * max(data.shape)
where smax is the maximum singular value of the data matrix.
For reference, see, e.g.
W. Press, S. Teukolsky, W. Vetterling and B. Flannery,
“Numerical Recipes (3rd edition)”,
Cambridge University Press, 2007, page 795.
s_max * eps * max(n_samples, n_features)
where s_max is the maximum singular value of the data matrix
and eps is numerical precision. [press2007]_.
knn : `int`, optional (default: 5)
Number of nearest neighbors (including self) to use to build the graph
Expand Down Expand Up @@ -171,6 +166,12 @@ def Graph(data,
Raises
------
ValueError : if selected parameters are incompatible.
References
----------
.. [press2007] W. Press, S. Teukolsky, W. Vetterling and B. Flannery,
“Numerical Recipes (3rd edition)”,
Cambridge University Press, 2007, page 795.
"""
tasklogger.set_level(verbose)
if sample_idx is not None and len(np.unique(sample_idx)) == 1:
Expand Down
48 changes: 16 additions & 32 deletions graphtools/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,23 +85,18 @@ class Data(Base):
n_pca : {`int`, `None`, `bool`, 'auto'}, optional (default: `None`)
number of PC dimensions to retain for graph building.
If n_pca in `[None,False,0]`, uses the original data.
If `True` then estimate using a singular value threshold
If n_pca in `[None, False, 0]`, uses the original data.
If 'auto' or `True` then estimate using a singular value threshold
Note: if data is sparse, uses SVD instead of PCA
TODO: should we subtract and store the mean?
rank_threshold : `float`, 'auto', optional (default: 'auto')
threshold to use when estimating rank for
`n_pca in [True, 'auto']`.
Note that the default kwarg is `None` for this parameter.
It is subsequently parsed to 'auto' if necessary.
If 'auto', this threshold is
smax * np.finfo(data.dtype).eps * max(data.shape)
where smax is the maximum singular value of the data matrix.
For reference, see, e.g.
W. Press, S. Teukolsky, W. Vetterling and B. Flannery,
“Numerical Recipes (3rd edition)”,
Cambridge University Press, 2007, page 795.
s_max * eps * max(n_samples, n_features)
where s_max is the maximum singular value of the data matrix
and eps is numerical precision. [press2007]_.
random_state : `int` or `None`, optional (default: `None`)
Random state for random PCA
Expand Down Expand Up @@ -158,8 +153,6 @@ def _parse_n_pca_threshold(self, data, n_pca, rank_threshold):
raise ValueError("n_pca must be an integer "
"0 <= n_pca < min(n_samples,n_features), "
"or in [None,False,True,'auto'].")
else:
n_pca = True
if isinstance(n_pca, numbers.Number):
if not float(n_pca).is_integer(): # cast it to integer
n_pcaR = np.round(n_pca).astype(int)
Expand All @@ -185,23 +178,24 @@ def _parse_n_pca_threshold(self, data, n_pca, rank_threshold):
if n_pca in [0, False, None]: # cast 0, False to None.
n_pca = None
elif n_pca is True: # notify that we're going to estimate rank.
n_pca = 'auto'
tasklogger.log_info("Estimating n_pca from matrix rank. "
"Supply an integer n_pca "
"for fixed amount.")
if not any([isinstance(n_pca, numbers.Number),
n_pca is None,
n_pca is True]):
n_pca == 'auto']):
raise ValueError(
"n_pca was not an instance of numbers.Number, "
"could not be cast to False, and not None. "
"Please supply an integer "
"0 <= n_pca < min(n_samples,n_features) or None")
if rank_threshold is not None and n_pca is not True:
if rank_threshold is not None and n_pca != 'auto':
warnings.warn("n_pca = {}, therefore rank_threshold of {} "
"will not be used. To use rank thresholding, "
"set n_pca = True".format(n_pca, rank_threshold),
RuntimeWarning)
if n_pca is True:
if n_pca == 'auto':
if isinstance(rank_threshold, str):
rank_threshold = rank_threshold.lower()
if rank_threshold is None:
Expand Down Expand Up @@ -240,28 +234,19 @@ def _reduce_data(self):
"""
if self.n_pca is not None and self.n_pca < self.data.shape[1]:
tasklogger.log_start("PCA")
n_pca = self.data.shape[1] - 1 if self.n_pca == 'auto' else self.n_pca
if sparse.issparse(self.data):
if isinstance(self.data, sparse.coo_matrix) or \
isinstance(self.data, sparse.lil_matrix) or \
isinstance(self.data, sparse.dok_matrix):
self.data = self.data.tocsr()
if self.n_pca is True:
self.data_pca = TruncatedSVD(self.data.shape[1] - 1,
random_state=self.random_state)
else:
self.data_pca = TruncatedSVD(self.n_pca,
random_state=self.random_state)
self.data_pca = TruncatedSVD(n_pca, random_state=self.random_state)
else:
if self.n_pca is True:
self.data_pca = PCA(self.data.shape[1] - 1,
svd_solver='randomized',
random_state=self.random_state)
else:
self.data_pca = PCA(self.n_pca,
svd_solver='randomized',
random_state=self.random_state)
self.data_pca = PCA(n_pca,
svd_solver='randomized',
random_state=self.random_state)
self.data_pca.fit(self.data)
if self.n_pca is True:
if self.n_pca == 'auto':
s = self.data_pca.singular_values_
smax = s.max()
if self.rank_threshold == 'auto':
Expand All @@ -276,8 +261,7 @@ def _reduce_data(self):
"maximum singular value {} "
"for the data matrix".format(threshold, smax))
tasklogger.log_info(
"Using rank estimate of {} as n_pca".format(
self.n_pca))
"Using rank estimate of {} as n_pca".format(self.n_pca))
# reset the sklearn operator
op = self.data_pca # for line-width brevity..
op.components_ = op.components_[gate, :]
Expand Down

0 comments on commit fa0d586

Please sign in to comment.