
Batch k means #2

Merged: 6 commits, May 17, 2011
23 changes: 14 additions & 9 deletions examples/cluster/plot_mini_batch_kmeans.py
@@ -12,7 +12,11 @@
"""
print __doc__

import time

import numpy as np
import pylab as pl
Owner: Out of curiosity, why use pylab instead of matplotlib?

Author: It's the same thing (pylab is a helper module shipped by matplotlib); I changed this line for consistency.
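
For readers following the thread, a minimal check of that equivalence, using nothing beyond the two modules named above:

    import pylab as pl
    import matplotlib.pyplot as plt

    # pylab is assembled from matplotlib.pyplot (plus numpy), so the plotting
    # functions are literally the same objects under both names.
    print pl.figure is plt.figure  # True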


from scikits.learn.cluster import MiniBatchKMeans, KMeans
from scikits.learn.metrics.pairwise import euclidean_distances

@@ -37,9 +41,10 @@

##############################################################################
# Compute clustering with KMeans
k_means = KMeans(init='k-means++',
k=3)
k_means = KMeans(init='k-means++', k=3)
t0 = time.time()
k_means.fit(X)
t_batch = time.time() - t0
k_means_labels = k_means.labels_
k_means_cluster_centers = k_means.cluster_centers_
k_means_labels_unique = np.unique(k_means_labels)
@@ -52,21 +57,19 @@
# KMeans algorithm have been calculated with X.
# It is also unnecessary to copy X.

mbk = MiniBatchKMeans(init='k-means++',
k=3,
chunk_size=batch_size,
mbk = MiniBatchKMeans(init='k-means++', k=3, chunk_size=batch_size,
copy_x=False)
t0 = time.time()
mbk.fit(X, shuffle=False)
t_mini_batch = time.time() - t0
mbk_means_labels = mbk.labels_
mbk_means_cluster_centers = mbk.cluster_centers_
mbk_means_labels_unique = np.unique(mbk_means_labels)

##############################################################################
# Plot result
import matplotlib.pyplot as plt
from itertools import cycle

fig = plt.figure()
fig = pl.figure()
colors = ['#4EACC5', '#FF9C34', '#4E9A06']

# We want to have the same colors for the same cluster from the
@@ -88,6 +91,7 @@
ax.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
markeredgecolor='k', markersize=6)
ax.set_title('KMeans')
pl.text(-3.5, 2.7, 'train time: %.2fs' % t_batch)

# MiniBatchKMeans
ax = fig.add_subplot(1, 3, 2)
@@ -99,6 +103,7 @@
ax.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
markeredgecolor='k', markersize=6)
ax.set_title('MiniBatchKMeans')
pl.text(-3.5, 2.7, 'train time: %.2fs' % t_mini_batch)

# Initialise the different array to all False
different = (mbk_means_labels == 4)
@@ -114,4 +119,4 @@
markerfacecolor='m', marker='.')
ax.set_title('Difference')

plt.show()
pl.show()
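
The substance of this example change is the wall-clock comparison between the two estimators. A condensed sketch of the pattern, using only names visible in the diff; the random data and the batch size of 100 are stand-ins for the example's elided setup:

    import time
    import numpy as np
    from scikits.learn.cluster import KMeans, MiniBatchKMeans

    X = np.random.rand(3000, 2)  # stand-in for the example's generated blobs

    t0 = time.time()
    KMeans(init='k-means++', k=3).fit(X)
    t_batch = time.time() - t0

    t0 = time.time()
    MiniBatchKMeans(init='k-means++', k=3, chunk_size=100,
                    copy_x=False).fit(X, shuffle=False)
    t_mini_batch = time.time() - t0

    print 'KMeans %.2fs, MiniBatchKMeans %.2fs' % (t_batch, t_mini_batch)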
43 changes: 19 additions & 24 deletions scikits/learn/cluster/k_means_.py
@@ -4,13 +4,14 @@
# Thomas Rueckstiess <ruecksti@in.tum.de>
# James Bergstra <james.bergstra@umontreal.ca>
# Jan Schlueter <scikit-learn@jan-schlueter.de>
# Nelle Varoquaux
# License: BSD

import warnings
import itertools

import numpy as np
from math import floor
import itertools

from ..base import BaseEstimator
from ..metrics.pairwise import euclidean_distances
@@ -73,7 +74,9 @@ def k_init(X, k, n_local_trials=None, random_state=None, x_squared_norms=None):

# Initialize list of closest distances and calculate current potential
if x_squared_norms is None:
x_squared_norms = (X ** 2).sum(axis=1)
x_squared_norms = X.copy()
x_squared_norms **= 2
x_squared_norms = x_squared_norms.sum(axis=1)
closest_dist_sq = euclidean_distances(
np.atleast_2d(centers[0]), X, Y_norm_squared=x_squared_norms,
squared=True)
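
The three replacement lines above compute the row norms in place on a copy before k_init hands them to euclidean_distances. A small sketch of the payoff, reusing the exact call shape from the hunk (the data shape is illustrative):

    import numpy as np
    from scikits.learn.metrics.pairwise import euclidean_distances

    X = np.random.rand(1000, 2)

    # Row-wise squared norms, computed in place as in the new code.
    x_squared_norms = X.copy()
    x_squared_norms **= 2
    x_squared_norms = x_squared_norms.sum(axis=1)

    # Passing Y_norm_squared lets euclidean_distances reuse ||x||^2 instead of
    # recomputing it for every center it compares against.
    d2 = euclidean_distances(np.atleast_2d(X[0]), X,
                             Y_norm_squared=x_squared_norms, squared=True)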
@@ -184,7 +187,6 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,

"""
random_state = check_random_state(random_state)
n_samples = X.shape[0]

vdata = np.mean(np.var(X, 0))
best_inertia = np.infty
@@ -351,7 +353,6 @@ def _init_centroids(X, k, init, random_state=None, x_squared_norms=None):
"""
random_state = check_random_state(random_state)


n_samples = X.shape[0]
if init == 'k-means++':
centers = k_init(X, k,
@@ -363,7 +364,7 @@ def _init_centroids(X, k, init, random_state=None, x_squared_norms=None):
elif hasattr(init, '__array__'):
centers = np.asanyarray(init).copy()
elif callable(init):
centers = init(X, k, random_state=randome_state)
centers = init(X, k, random_state=random_state)
else:
raise ValueError("the init parameter for the k-means should "
"be 'k-means++' or 'random' or an ndarray, "
@@ -629,43 +630,37 @@ def fit(self, X, y=None, shuffle=True, **params):

self.random_state = check_random_state(self.random_state)

X = self._check_data(X, **params)

if self.copy_x:
X = X.copy()

if hasattr(self.init, '__array__'):
X = self._check_data(X, **params)
self.init = np.asarray(self.init)

if shuffle:
self.random_state.shuffle(X)

x_squared_norms = X.copy()
x_squared_norms **= 2
x_squared_norms = x_squared_norms.sum(axis=1)

self.cluster_centers_ = _init_centroids(
X, self.k, self.init, random_state=self.random_state,
x_squared_norms=x_squared_norms)
X, self.k, self.init, random_state=self.random_state)

self.counts = np.zeros(self.k)
tol = np.mean(np.var(X, 0)) * self.tol
tol = np.mean(np.var(X, axis=0)) * self.tol
try:
split_X = np.array_split(X, floor(float(len(X)) / self.chunk_size))
except ValueError:
split_X = [X]

squared_norms = [(x ** 2).sum(axis=1) for x in split_X]
data = zip(split_X, squared_norms)
old_centers = []

for i in xrange(self.max_iter):
j = i % len(data)
old_centers[:] = self.cluster_centers_.copy()
for i, (this_x, this_squared_norm) in zip(
xrange(self.max_iter),
itertools.cycle((x, (x ** 2).sum(axis=1))
for x in split_X)):
old_centers = self.cluster_centers_.copy()
self.cluster_centers_, self.counts = _mini_batch_step(
data[j][0], self.cluster_centers_, self.counts,
x_squared_norms=data[j][1])
this_x, self.cluster_centers_, self.counts,
x_squared_norms=this_squared_norm)

if np.sum(old_centers - self.cluster_centers_) ** 2 < tol:
if np.sum((old_centers - self.cluster_centers_) ** 2) < tol:
if self.verbose:
print 'Converged to similar centers at iteration', i
break
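
The convergence-test change near the end of this hunk is easy to miss: np.sum(diff) ** 2 squares the sum of the coordinate shifts, which can cancel to zero while the centers are still moving, whereas np.sum(diff ** 2) is the true squared displacement. A two-center demonstration:

    import numpy as np

    old = np.array([[0., 0.], [1., 1.]])
    new = np.array([[.5, .5], [.5, .5]])  # both centers moved
    diff = old - new

    print np.sum(diff) ** 2   # 0.0: opposite shifts cancel, false convergence
    print np.sum(diff ** 2)   # 1.0: the actual squared displacement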
@@ -682,8 +677,8 @@ def partial_fit(self, X, y=None, **params):

self.random_state = check_random_state(self.random_state)

X = self._check_data(X, **params)
if hasattr(self.init, '__array__'):
X = self._check_data(X, **params)
self.init = np.asarray(self.init)

if len(X) == 0:
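
The partial_fit hunk is truncated above. Going only by the signature shown (partial_fit(self, X, y=None, **params)) and the constructor arguments used in the example, an assumed out-of-core usage pattern would look like the sketch below; the chunking is illustrative, and it is an assumption that partial_fit populates cluster_centers_ the way fit does:

    import numpy as np
    from scikits.learn.cluster import MiniBatchKMeans

    X = np.random.rand(3000, 2)
    mbk = MiniBatchKMeans(init='k-means++', k=3, chunk_size=100)

    # Feed the data one slice at a time, as a streaming caller would.
    for chunk in np.array_split(X, 30):
        mbk.partial_fit(chunk)

    print mbk.cluster_centers_  # assumption: updated incrementally per chunk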