
Batch k means #2

Merged: 6 commits, May 17, 2011
23 changes: 14 additions & 9 deletions examples/cluster/plot_mini_batch_kmeans.py
@@ -12,7 +12,11 @@
"""
print __doc__

import time

import numpy as np
import pylab as pl
Owner: Out of curiosity, why use pylab instead of matplotlib?

Author: It's the same thing (pylab is a helper module shipped by matplotlib); I changed this line for consistency.
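
For readers following the thread, a minimal check of that equivalence, using nothing beyond the two modules named above:

    import pylab as pl
    import matplotlib.pyplot as plt

    # pylab is assembled from matplotlib.pyplot (plus numpy), so the plotting
    # functions are literally the same objects under both names.
    print pl.figure is plt.figure  # True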


from scikits.learn.cluster import MiniBatchKMeans, KMeans
from scikits.learn.metrics.pairwise import euclidean_distances

@@ -37,9 +41,10 @@

##############################################################################
# Compute clustering with KMeans
k_means = KMeans(init='k-means++',
k=3)
k_means = KMeans(init='k-means++', k=3)
t0 = time.time()
k_means.fit(X)
t_batch = time.time() - t0
k_means_labels = k_means.labels_
k_means_cluster_centers = k_means.cluster_centers_
k_means_labels_unique = np.unique(k_means_labels)
@@ -52,21 +57,19 @@
# KMeans algorithm have been calculated with X.
# It is also unnecessary to copy X.

mbk = MiniBatchKMeans(init='k-means++',
k=3,
chunk_size=batch_size,
mbk = MiniBatchKMeans(init='k-means++', k=3, chunk_size=batch_size,
copy_x=False)
t0 = time.time()
mbk.fit(X, shuffle=False)
t_mini_batch = time.time() - t0
mbk_means_labels = mbk.labels_
mbk_means_cluster_centers = mbk.cluster_centers_
mbk_means_labels_unique = np.unique(mbk_means_labels)

##############################################################################
# Plot result
import matplotlib.pyplot as plt
from itertools import cycle

fig = plt.figure()
fig = pl.figure()
colors = ['#4EACC5', '#FF9C34', '#4E9A06']

# We want to have the same colors for the same cluster from the
@@ -88,6 +91,7 @@
ax.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
markeredgecolor='k', markersize=6)
ax.set_title('KMeans')
pl.text(-3.5, 2.7, 'train time: %.2fs' % t_batch)

# MiniBatchKMeans
ax = fig.add_subplot(1, 3, 2)
@@ -99,6 +103,7 @@
ax.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
markeredgecolor='k', markersize=6)
ax.set_title('MiniBatchKMeans')
pl.text(-3.5, 2.7, 'train time: %.2fs' % t_mini_batch)

# Initialise the different array to all False
different = (mbk_means_labels == 4)
@@ -114,4 +119,4 @@
markerfacecolor='m', marker='.')
ax.set_title('Difference')

plt.show()
pl.show()
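
The substance of this example change is the wall-clock comparison between the two estimators. A condensed sketch of the pattern, using only names visible in the diff; the random data and the batch size of 100 are stand-ins for the example's elided setup:

    import time
    import numpy as np
    from scikits.learn.cluster import KMeans, MiniBatchKMeans

    X = np.random.rand(3000, 2)  # stand-in for the example's generated blobs

    t0 = time.time()
    KMeans(init='k-means++', k=3).fit(X)
    t_batch = time.time() - t0

    t0 = time.time()
    MiniBatchKMeans(init='k-means++', k=3, chunk_size=100,
                    copy_x=False).fit(X, shuffle=False)
    t_mini_batch = time.time() - t0

    print 'KMeans %.2fs, MiniBatchKMeans %.2fs' % (t_batch, t_mini_batch)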
43 changes: 19 additions & 24 deletions scikits/learn/cluster/k_means_.py
@@ -4,13 +4,14 @@
# Thomas Rueckstiess <ruecksti@in.tum.de>
# James Bergstra <james.bergstra@umontreal.ca>
# Jan Schlueter <scikit-learn@jan-schlueter.de>
# Nelle Varoquaux
# License: BSD

import warnings
import itertools

import numpy as np
from math import floor
import itertools

from ..base import BaseEstimator
from ..metrics.pairwise import euclidean_distances
@@ -73,7 +74,9 @@ def k_init(X, k, n_local_trials=None, random_state=None, x_squared_norms=None):

# Initialize list of closest distances and calculate current potential
if x_squared_norms is None:
x_squared_norms = (X ** 2).sum(axis=1)
x_squared_norms = X.copy()
x_squared_norms **= 2
x_squared_norms = x_squared_norms.sum(axis=1)
closest_dist_sq = euclidean_distances(
np.atleast_2d(centers[0]), X, Y_norm_squared=x_squared_norms,
squared=True)
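
The three replacement lines above compute the row norms in place on a copy before k_init hands them to euclidean_distances. A small sketch of the payoff, reusing the exact call shape from the hunk (the data shape is illustrative):

    import numpy as np
    from scikits.learn.metrics.pairwise import euclidean_distances

    X = np.random.rand(1000, 2)

    # Row-wise squared norms, computed in place as in the new code.
    x_squared_norms = X.copy()
    x_squared_norms **= 2
    x_squared_norms = x_squared_norms.sum(axis=1)

    # Passing Y_norm_squared lets euclidean_distances reuse ||x||^2 instead of
    # recomputing it for every center it compares against.
    d2 = euclidean_distances(np.atleast_2d(X[0]), X,
                             Y_norm_squared=x_squared_norms, squared=True)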
@@ -184,7 +187,6 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0,

"""
random_state = check_random_state(random_state)
n_samples = X.shape[0]

vdata = np.mean(np.var(X, 0))
best_inertia = np.infty
@@ -351,7 +353,6 @@ def _init_centroids(X, k, init, random_state=None, x_squared_norms=None):
"""
random_state = check_random_state(random_state)


n_samples = X.shape[0]
if init == 'k-means++':
centers = k_init(X, k,
@@ -363,7 +364,7 @@ def _init_centroids(X, k, init, random_state=None, x_squared_norms=None):
elif hasattr(init, '__array__'):
centers = np.asanyarray(init).copy()
elif callable(init):
centers = init(X, k, random_state=randome_state)
centers = init(X, k, random_state=random_state)
else:
raise ValueError("the init parameter for the k-means should "
"be 'k-means++' or 'random' or an ndarray, "
@@ -629,43 +630,37 @@ def fit(self, X, y=None, shuffle=True, **params):

self.random_state = check_random_state(self.random_state)

X = self._check_data(X, **params)

if self.copy_x:
X = X.copy()

if hasattr(self.init, '__array__'):
X = self._check_data(X, **params)
self.init = np.asarray(self.init)

if shuffle:
self.random_state.shuffle(X)

x_squared_norms = X.copy()
x_squared_norms **= 2
x_squared_norms = x_squared_norms.sum(axis=1)

self.cluster_centers_ = _init_centroids(
X, self.k, self.init, random_state=self.random_state,
x_squared_norms=x_squared_norms)
X, self.k, self.init, random_state=self.random_state)

self.counts = np.zeros(self.k)
tol = np.mean(np.var(X, 0)) * self.tol
tol = np.mean(np.var(X, axis=0)) * self.tol
try:
split_X = np.array_split(X, floor(float(len(X)) / self.chunk_size))
except ValueError:
split_X = [X]

squared_norms = [(x ** 2).sum(axis=1) for x in split_X]
data = zip(split_X, squared_norms)
old_centers = []

for i in xrange(self.max_iter):
j = i % len(data)
old_centers[:] = self.cluster_centers_.copy()
for i, (this_x, this_squared_norm) in zip(
xrange(self.max_iter),
itertools.cycle((x, (x ** 2).sum(axis=1))
for x in split_X)):
old_centers = self.cluster_centers_.copy()
self.cluster_centers_, self.counts = _mini_batch_step(
data[j][0], self.cluster_centers_, self.counts,
x_squared_norms=data[j][1])
this_x, self.cluster_centers_, self.counts,
x_squared_norms=this_squared_norm)

if np.sum(old_centers - self.cluster_centers_) ** 2 < tol:
if np.sum((old_centers - self.cluster_centers_) ** 2) < tol:
if self.verbose:
print 'Converged to similar centers at iteration', i
break
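
The convergence-test change near the end of this hunk is easy to miss: np.sum(diff) ** 2 squares the sum of the coordinate shifts, which can cancel to zero while the centers are still moving, whereas np.sum(diff ** 2) is the true squared displacement. A two-center demonstration:

    import numpy as np

    old = np.array([[0., 0.], [1., 1.]])
    new = np.array([[.5, .5], [.5, .5]])  # both centers moved
    diff = old - new

    print np.sum(diff) ** 2   # 0.0: opposite shifts cancel, false convergence
    print np.sum(diff ** 2)   # 1.0: the actual squared displacement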
@@ -682,8 +677,8 @@ def partial_fit(self, X, y=None, **params):

self.random_state = check_random_state(self.random_state)

X = self._check_data(X, **params)
if hasattr(self.init, '__array__'):
X = self._check_data(X, **params)
self.init = np.asarray(self.init)

if len(X) == 0:
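
The partial_fit hunk is truncated above. Going only by the signature shown (partial_fit(self, X, y=None, **params)) and the constructor arguments used in the example, an assumed out-of-core usage pattern would look like the sketch below; the chunking is illustrative, and it is an assumption that partial_fit populates cluster_centers_ the way fit does:

    import numpy as np
    from scikits.learn.cluster import MiniBatchKMeans

    X = np.random.rand(3000, 2)
    mbk = MiniBatchKMeans(init='k-means++', k=3, chunk_size=100)

    # Feed the data one slice at a time, as a streaming caller would.
    for chunk in np.array_split(X, 30):
        mbk.partial_fit(chunk)

    print mbk.cluster_centers_  # assumption: updated incrementally per chunk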