Skip to content

Commit

Permalink
Merge bf4d451 into 0494ccb
Browse files Browse the repository at this point in the history
  • Loading branch information
scottgigante committed Nov 11, 2019
2 parents 0494ccb + bf4d451 commit 21aae57
Show file tree
Hide file tree
Showing 4 changed files with 137 additions and 28 deletions.
14 changes: 9 additions & 5 deletions graphtools/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,21 +14,22 @@ def Graph(
data,
n_pca=None,
rank_threshold=None,
sample_idx=None,
adaptive_k=None,
precomputed=None,
knn=5,
decay=40,
bandwidth=None,
bandwidth_scale=1.0,
knn_max=None,
anisotropy=0,
distance="euclidean",
thresh=1e-4,
kernel_symm="+",
theta=None,
precomputed=None,
beta=1,
sample_idx=None,
adaptive_k=None,
n_landmark=None,
n_svd=100,
beta=1,
n_jobs=-1,
verbose=False,
random_state=None,
Expand Down Expand Up @@ -90,6 +91,9 @@ def Graph(
bandwidth_scale : `float`, optional (default : 1.0)
Rescaling factor for bandwidth.
knn_max : `int` or `None`, optional (default : `None`)
Maximum number of neighbors with nonzero affinity
anisotropy : float, optional (default: 0)
Level of anisotropy between 0 and 1
(alpha in Coifman & Lafon, 2006)
Expand Down Expand Up @@ -193,7 +197,7 @@ def Graph(
elif decay is None:
# knn kernel
graphtype = "knn"
elif thresh == 0 or callable(bandwidth):
elif (thresh == 0 and knn_max is None) or callable(bandwidth):
# compute full distance matrix
graphtype = "exact"
else:
Expand Down
87 changes: 65 additions & 22 deletions graphtools/graphs.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ def __init__(
data,
knn=5,
decay=None,
knn_max=None,
search_multiplier=20,
bandwidth=None,
bandwidth_scale=1.0,
distance="euclidean",
Expand All @@ -77,11 +79,15 @@ def __init__(
**kwargs
):

if decay is not None and thresh <= 0:
raise ValueError(
"Cannot instantiate a kNNGraph with `decay=None` "
"and `thresh=0`. Use a TraditionalGraph instead."
)
if decay is not None:
if thresh <= 0 and knn_max is None:
raise ValueError(
"Cannot instantiate a kNNGraph with `decay=None`, "
"`thresh=0` and `knn_max=None`. Use a TraditionalGraph instead."
)
elif thresh < np.finfo(float).eps:
thresh = np.finfo(float).eps

if callable(bandwidth):
raise NotImplementedError(
"Callable bandwidth is only supported by"
Expand All @@ -100,6 +106,12 @@ def __init__(
"n_samples ({n}). Setting knn={n}".format(k=knn, n=data.shape[0] - 2)
)
knn = data.shape[0] - 2
if knn_max is not None and knn_max < knn:
warnings.warn(
"Cannot set knn_max ({knn_max}) to be less than "
"knn ({knn}). Setting knn_max={knn}".format(knn=knn, knn_max=knn_max)
)
knn_max = knn
if n_pca in [None, 0, False] and data.shape[1] > 500:
warnings.warn(
"Building a kNNGraph on data of shape {} is "
Expand All @@ -108,6 +120,8 @@ def __init__(
)

self.knn = knn
self.knn_max = knn_max
self.search_multiplier = search_multiplier
self.decay = decay
self.bandwidth = bandwidth
self.bandwidth_scale = bandwidth_scale
Expand All @@ -125,6 +139,7 @@ def get_params(self):
"decay": self.decay,
"bandwidth": self.bandwidth,
"bandwidth_scale": self.bandwidth_scale,
"knn_max": self.knn_max,
"distance": self.distance,
"thresh": self.thresh,
"n_jobs": self.n_jobs,
Expand All @@ -145,6 +160,7 @@ def set_params(self, **params):
- verbose
Invalid parameters: (these would require modifying the kernel matrix)
- knn
- knn_max
- decay
- bandwidth
- bandwidth_scale
Expand All @@ -161,6 +177,8 @@ def set_params(self, **params):
"""
if "knn" in params and params["knn"] != self.knn:
raise ValueError("Cannot update knn. Please create a new graph")
if "knn_max" in params and params["knn_max"] != self.knn:
raise ValueError("Cannot update knn_max. Please create a new graph")
if "decay" in params and params["decay"] != self.decay:
raise ValueError("Cannot update decay. Please create a new graph")
if "bandwidth" in params and params["bandwidth"] != self.bandwidth:
Expand Down Expand Up @@ -237,7 +255,8 @@ def build_kernel(self):
symmetric matrix with ones down the diagonal
with no negative entries.
"""
K = self.build_kernel_to_data(self.data_nu, knn=self.knn + 1)
knn_max = self.knn_max + 1 if self.knn_max else None
K = self.build_kernel_to_data(self.data_nu, knn=self.knn + 1, knn_max=knn_max)
return K

def _check_duplicates(self, distances, indices):
Expand Down Expand Up @@ -272,7 +291,9 @@ def _check_duplicates(self, distances, indices):
RuntimeWarning,
)

def build_kernel_to_data(self, Y, knn=None, bandwidth=None, bandwidth_scale=None):
def build_kernel_to_data(
self, Y, knn=None, knn_max=None, bandwidth=None, bandwidth_scale=None
):
"""Build a kernel from new input data `Y` to the `self.data`
Parameters
Expand Down Expand Up @@ -314,8 +335,13 @@ def build_kernel_to_data(self, Y, knn=None, bandwidth=None, bandwidth_scale=None
if knn > self.data.shape[0]:
warnings.warn(
"Cannot set knn ({k}) to be greater than "
"n_samples ({n}). Setting knn={n}".format(k=knn, n=self.data.shape[0])
"n_samples ({n}). Setting knn={n}".format(
k=knn, n=self.data_nu.shape[0]
)
)
knn = self.data_nu.shape[0]
if knn_max is None:
knn_max = self.data_nu.shape[0]

Y = self._check_extension_shape(Y)
if self.decay is None or self.thresh == 1:
Expand All @@ -328,7 +354,7 @@ def build_kernel_to_data(self, Y, knn=None, bandwidth=None, bandwidth_scale=None
with _logger.task("KNN search"):
# sparse fast alpha decay
knn_tree = self.knn_tree
search_knn = min(knn * 20, self.data_nu.shape[0])
search_knn = min(knn * self.search_multiplier, knn_max)
distances, indices = knn_tree.kneighbors(Y, n_neighbors=search_knn)
self._check_duplicates(distances, indices)
with _logger.task("affinities"):
Expand All @@ -348,12 +374,13 @@ def build_kernel_to_data(self, Y, knn=None, bandwidth=None, bandwidth_scale=None
if len(update_idx) > 0:
distances = [d for d in distances]
indices = [i for i in indices]
# increase the knn search
search_knn = min(search_knn * self.search_multiplier, knn_max)
while (
len(update_idx) > Y.shape[0] // 10
and search_knn < self.data_nu.shape[0] / 2
and search_knn < knn_max
):
# increase the knn search
search_knn = min(search_knn * 20, self.data_nu.shape[0])
dist_new, ind_new = knn_tree.kneighbors(
Y[update_idx], n_neighbors=search_knn
)
Expand All @@ -375,22 +402,38 @@ def build_kernel_to_data(self, Y, knn=None, bandwidth=None, bandwidth_scale=None
search_knn, len(update_idx)
)
)
# increase the knn search
search_knn = min(search_knn * self.search_multiplier, knn_max)
if search_knn > self.data_nu.shape[0] / 2:
knn_tree = NearestNeighbors(
search_knn, algorithm="brute", n_jobs=self.n_jobs
).fit(self.data_nu)
if len(update_idx) > 0:
_logger.debug("radius search on {}".format(len(update_idx)))
# give up - radius search
dist_new, ind_new = knn_tree.radius_neighbors(
Y[update_idx, :],
radius=radius
if isinstance(bandwidth, numbers.Number)
else np.max(radius[update_idx]),
)
for i, idx in enumerate(update_idx):
distances[idx] = dist_new[i]
indices[idx] = ind_new[i]
if search_knn == knn_max:
_logger.debug(
"knn search to knn_max ({}) on {}".format(
knn_max, len(update_idx)
)
)
# give up - search out to knn_max
dist_new, ind_new = knn_tree.kneighbors(
Y[update_idx], n_neighbors=search_knn
)
for i, idx in enumerate(update_idx):
distances[idx] = dist_new[i]
indices[idx] = ind_new[i]
else:
_logger.debug("radius search on {}".format(len(update_idx)))
# give up - radius search
dist_new, ind_new = knn_tree.radius_neighbors(
Y[update_idx, :],
radius=radius
if isinstance(bandwidth, numbers.Number)
else np.max(radius[update_idx]),
)
for i, idx in enumerate(update_idx):
distances[idx] = dist_new[i]
indices[idx] = ind_new[i]
if isinstance(bandwidth, numbers.Number):
data = np.concatenate(distances) / bandwidth
else:
Expand Down
63 changes: 62 additions & 1 deletion test/test_knn.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import print_function, division
from sklearn.utils.graph import graph_shortest_path
from scipy.spatial.distance import pdist, squareform
import warnings
from load_tests import (
graphtools,
np,
Expand Down Expand Up @@ -57,6 +58,11 @@ def test_k_too_large():
build_graph(data, n_pca=20, decay=10, knn=len(data) - 1, thresh=1e-4)


@warns(UserWarning)
def test_knnmax_too_large():
    """Expect a UserWarning when `knn_max` is smaller than `knn`.

    Despite the test's name, the case exercised here is `knn_max=9`
    being below `knn=10`.
    """
    graph_kwargs = dict(n_pca=20, decay=10, knn=10, knn_max=9, thresh=1e-4)
    build_graph(data, **graph_kwargs)


@warns(UserWarning)
def test_bandwidth_no_decay():
build_graph(data, n_pca=20, decay=None, bandwidth=3, thresh=1e-4)
Expand Down Expand Up @@ -159,6 +165,59 @@ def test_sparse_alpha_knn_graph():
assert isinstance(G2, graphtools.graphs.kNNGraph)


def test_knnmax():
    """Check that `knn_max` caps the neighbors with nonzero affinity.

    First builds an unsymmetrized alpha-decay graph with `thresh=0` and
    asserts that every kernel row has exactly `k_max` positive entries.
    Then rebuilds the expected weight matrix from the dense pairwise
    distances and compares it with the PyGSP-backed graph.
    """
    data = datasets.make_swiss_roll()[0]
    k = 5
    k_max = 10
    a = 0.45
    thresh = 0

    # Only the "K should be symmetric" RuntimeWarning is silenced here
    # (presumably triggered by kernel_symm=None -- confirm against the library).
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", "K should be symmetric", RuntimeWarning)
        G_capped = build_graph(
            data,
            n_pca=None,
            decay=a,
            knn=k - 1,
            knn_max=k_max - 1,
            thresh=thresh,
            random_state=42,
            kernel_symm=None,
        )
        assert np.all((G_capped.K > 0).sum(axis=1) == k_max)

    # Reference kernel computed from the full distance matrix.
    pdx = squareform(pdist(data, metric="euclidean"))
    # per-row bandwidth: largest of the k smallest distances in each row
    nearest = np.partition(pdx, k, axis=1)[:, :k]
    epsilon = np.max(nearest, axis=1)
    # per-row cutoff: largest of the k_max smallest distances in each row
    capped = np.partition(pdx, k_max, axis=1)[:, :k_max]
    knn_max_dist = np.max(capped, axis=1)
    pdx_scale = (pdx.T / epsilon).T
    affinity = np.exp(-1 * pdx_scale ** a)
    K = np.where(pdx <= knn_max_dist[:, None], affinity, 0)
    # symmetrize by averaging with the transpose, then drop self-loops
    W = np.divide(K + K.T, 2)
    np.fill_diagonal(W, 0)
    G_expected = pygsp.graphs.Graph(W)

    G2 = build_graph(
        data,
        n_pca=None,
        decay=a,
        knn=k - 1,
        knn_max=k_max - 1,
        thresh=thresh,
        random_state=42,
        use_pygsp=True,
    )
    assert isinstance(G2, graphtools.graphs.kNNGraph)
    assert G_expected.N == G2.N
    assert np.all(G_expected.dw == G2.dw)
    assert (G_expected.W - G2.W).nnz == 0


def test_thresh_small():
    """A `thresh` of 1e-30 should be stored on the graph as float epsilon."""
    data = datasets.make_swiss_roll()[0]
    tiny_thresh = 1e-30
    G = graphtools.Graph(data, thresh=tiny_thresh)
    machine_eps = np.finfo("float").eps
    assert G.thresh == machine_eps


def test_knn_graph_fixed_bandwidth():
k = None
decay = 5
Expand Down Expand Up @@ -422,14 +481,15 @@ def test_set_params():
"theta": None,
"anisotropy": 0,
"knn": 3,
"knn_max": None,
"decay": None,
"bandwidth": None,
"bandwidth_scale": 1,
"distance": "euclidean",
"thresh": 0,
"n_jobs": -1,
"verbose": 0,
}
}, G.get_params()
G.set_params(n_jobs=4)
assert G.n_jobs == 4
assert G.knn_tree.n_jobs == 4
Expand All @@ -439,6 +499,7 @@ def test_set_params():
assert G.verbose == 2
G.set_params(verbose=0)
assert_raises(ValueError, G.set_params, knn=15)
assert_raises(ValueError, G.set_params, knn_max=15)
assert_raises(ValueError, G.set_params, decay=10)
assert_raises(ValueError, G.set_params, distance="manhattan")
assert_raises(ValueError, G.set_params, thresh=1e-3)
Expand Down
1 change: 1 addition & 0 deletions test/test_landmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@ def test_set_params():
"n_landmark": 500,
"anisotropy": 0,
"knn": 3,
"knn_max": None,
"decay": None,
"bandwidth": None,
"bandwidth_scale": 1,
Expand Down

0 comments on commit 21aae57

Please sign in to comment.