In [1]:
# always import
import sys
from time import time

# numpy & scipy
import numpy as np
import scipy

# sklearn
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.manifold import Isomap, TSNE
from sklearn.metrics import pairwise_distances_argmin, pairwise_distances

# visuals
import matplotlib.pyplot as plt
from matplotlib import offsetbox

# maybe
from numba import jit

In [2]:
# load MNIST data and normalization
from sklearn.datasets import fetch_openml
X, y = fetch_openml('mnist_784', version=1, return_X_y=True, data_home='mnist/')
y = np.asarray(list(map(int, y)))
X = np.asarray(X.astype(float))
X = scale(X)
n_digits = len(np.unique(y))

In [3]:
# TODO: kmeans with PCA initialization of centers, or random initial centers
def kmeans_11(X, n_clusters, init="pca", tol = 1.0e-4, max_iter = 300, n_init=10, rseed=2):
    # 1. Randomly choose clusters
    if init == "pca":
        pca = PCA(n_components=n_clusters).fit(X)
        centers = pca.fit(X).components_
        n_init = 1
        #print('centers:', centers.shape)
    
    best_obj = 0.0
    for trial in range(n_init):
        
        if init == "rand":
            rng = np.random.RandomState(rseed+trial)
            i = rng.permutation(X.shape[0])[:n_clusters]
            centers = X[i]
        
        iter = 0
        obj = np.inf
        while iter < max_iter:
            # 2a. Assign labels based on closest center
            labels = pairwise_distances_argmin(X, centers)
            #print('labels:', np.unique(labels))

            # 2b. Find new centers from means of points
            new_centers = np.array([X[labels == i].mean(0)
                                    for i in range(n_clusters)])

            # 2c. Check for convergence
            new_obj = X - new_centers[labels]
            new_obj = np.sum(new_obj * new_obj)
            if obj - new_obj < tol:
                break
            else:
                centers = new_centers
                obj = new_obj

            iter += 1
            
        if trial == 0 or obj < best_obj:
            best_obj = obj
            best_centers = centers
            best_labels = labels               
    
    return best_centers, best_labels, best_obj

In [4]:
# PCA
pca = PCA(n_components=30).fit(X)
X_pca = pca.transform(X)

In [5]:
# kmeans with PCA pre-processing
t0 = time()
# pca = PCA(n_components=50).fit(X)
# X_pca = pca.transform(X)
# kmeans_pca = KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
# kmeans_pca.fit(X_pca)
best_centers, best_labels1, best_obj = kmeans_11(X, n_clusters=n_digits, init="pca", rseed=2)
# print('%-9s\t%.2fs\t%i\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
#       % ("kmeans", (time() - t0), kmeans.inertia_,
#          metrics.homogeneity_score(y, kmeans.labels_),
#          metrics.completeness_score(y, kmeans.labels_),
#          metrics.v_measure_score(y, kmeans.labels_),
#          metrics.adjusted_rand_score(y, kmeans.labels_),
#          metrics.adjusted_mutual_info_score(y,  kmeans.labels_)))
print('%-9s\t%.2fs\t%i\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
      % ("kmeans", (time() - t0), best_obj,
         metrics.homogeneity_score(y, best_labels1),
         metrics.completeness_score(y, best_labels1),
         metrics.v_measure_score(y, best_labels1),
         metrics.adjusted_rand_score(y, best_labels1),
         metrics.adjusted_mutual_info_score(y, best_labels1)))

kmeans   	135.78s	42569895	0.420	0.442	0.431	0.320	0.420




In [7]:
# kmeans with 10 random trials
t0 = time()
# kmeans_pca = KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
# kmeans_pca.fit(X_pca)
best_centers, best_labels2, best_obj = kmeans_11(X_pca, n_clusters=n_digits, init="rand", rseed=2, n_init=10)
# print('%-9s\t%.2fs\t%i\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
#       % ("kmeans", (time() - t0), kmeans_pca.inertia_,
#          metrics.homogeneity_score(y, kmeans_pca.labels_),
#          metrics.completeness_score(y, kmeans_pca.labels_),
#          metrics.v_measure_score(y, kmeans_pca.labels_),
#          metrics.adjusted_rand_score(y, kmeans_pca.labels_),
#          metrics.adjusted_mutual_info_score(y,  kmeans_pca.labels_)))
print('%-9s\t%.2fs\t%i\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
      % ("kmeans", (time() - t0), best_obj,
         metrics.homogeneity_score(y, best_labels2),
         metrics.completeness_score(y, best_labels2),
         metrics.v_measure_score(y, best_labels2),
         metrics.adjusted_rand_score(y, best_labels2),
         metrics.adjusted_mutual_info_score(y, best_labels2)))

kmeans   	42.05s	15013070	0.423	0.444	0.433	0.323	0.423




In [8]:
# TODO: Hungarian algorithm
class _Hungary(object):
    """State of the Hungarian algorithm.

    Parameters
    ----------
    cost_matrix : 2D matrix
        The cost matrix. Must have shape[1] >= shape[0].
    """

    def __init__(self, cost_matrix):
        self.C = cost_matrix.copy()

        n, m = self.C.shape
        self.row_uncovered = np.ones(n, dtype=bool)
        self.col_uncovered = np.ones(m, dtype=bool)
        self.Z0_r = 0
        self.Z0_c = 0
        self.path = np.zeros((n + m, 2), dtype=int)
        self.marked = np.zeros((n, m), dtype=int)

    def _clear_covers(self):
        """Clear all covered matrix cells"""
        self.row_uncovered[:] = True
        self.col_uncovered[:] = True


# Individual steps of the algorithm follow, as a state machine: they return
# the next step to be taken (function to be called), if any.

def _step1(state):
    """Steps 1 and 2 in the Wikipedia page."""

    # Step 1: For each row of the matrix, find the smallest element and
    # subtract it from every element in its row.
    state.C -= state.C.min(axis=1)[:, np.newaxis]
    # Step 2: Find a zero (Z) in the resulting matrix. If there is no
    # starred zero in its row or column, star Z. Repeat for each element
    # in the matrix.
    for i, j in zip(*np.where(state.C == 0)):
        if state.col_uncovered[j] and state.row_uncovered[i]:
            state.marked[i, j] = 1
            state.col_uncovered[j] = False
            state.row_uncovered[i] = False

    state._clear_covers()
    return _step3


def _step3(state):
    """
    Cover each column containing a starred zero. If n columns are covered,
    the starred zeros describe a complete set of unique assignments.
    In this case, Go to DONE, otherwise, Go to Step 4.
    """
    marked = (state.marked == 1)
    state.col_uncovered[np.any(marked, axis=0)] = False

    if marked.sum() < state.C.shape[0]:
        return _step4


def _step4(state):
    """
    Find a noncovered zero and prime it. If there is no starred zero
    in the row containing this primed zero, Go to Step 5. Otherwise,
    cover this row and uncover the column containing the starred
    zero. Continue in this manner until there are no uncovered zeros
    left. Save the smallest uncovered value and Go to Step 6.
    """
    # We convert to int as numpy operations are faster on int
    C = (state.C == 0).astype(int)
    covered_C = C * state.row_uncovered[:, np.newaxis]
    covered_C *= np.asarray(state.col_uncovered, dtype=int)
    n = state.C.shape[0]
    m = state.C.shape[1]

    while True:
        # Find an uncovered zero
        row, col = np.unravel_index(np.argmax(covered_C), (n, m))
        if covered_C[row, col] == 0:
            return _step6
        else:
            state.marked[row, col] = 2
            # Find the first starred element in the row
            star_col = np.argmax(state.marked[row] == 1)
            if state.marked[row, star_col] != 1:
                # Could not find one
                state.Z0_r = row
                state.Z0_c = col
                return _step5
            else:
                col = star_col
                state.row_uncovered[row] = False
                state.col_uncovered[col] = True
                covered_C[:, col] = C[:, col] * (
                    np.asarray(state.row_uncovered, dtype=int))
                covered_C[row] = 0


def _step5(state):
    """
    Construct a series of alternating primed and starred zeros as follows.
    Let Z0 represent the uncovered primed zero found in Step 4.
    Let Z1 denote the starred zero in the column of Z0 (if any).
    Let Z2 denote the primed zero in the row of Z1 (there will always be one).
    Continue until the series terminates at a primed zero that has no starred
    zero in its column. Unstar each starred zero of the series, star each
    primed zero of the series, erase all primes and uncover every line in the
    matrix. Return to Step 3
    """
    count = 0
    path = state.path
    path[count, 0] = state.Z0_r
    path[count, 1] = state.Z0_c

    while True:
        # Find the first starred element in the col defined by
        # the path.
        row = np.argmax(state.marked[:, path[count, 1]] == 1)
        if state.marked[row, path[count, 1]] != 1:
            # Could not find one
            break
        else:
            count += 1
            path[count, 0] = row
            path[count, 1] = path[count - 1, 1]

        # Find the first prime element in the row defined by the
        # first path step
        col = np.argmax(state.marked[path[count, 0]] == 2)
        if state.marked[row, col] != 2:
            col = -1
        count += 1
        path[count, 0] = path[count - 1, 0]
        path[count, 1] = col

    # Convert paths
    for i in range(count + 1):
        if state.marked[path[i, 0], path[i, 1]] == 1:
            state.marked[path[i, 0], path[i, 1]] = 0
        else:
            state.marked[path[i, 0], path[i, 1]] = 1

    state._clear_covers()
    # Erase all prime markings
    state.marked[state.marked == 2] = 0
    return _step3


def _step6(state):
    """
    Add the value found in Step 4 to every element of each covered row,
    and subtract it from every element of each uncovered column.
    Return to Step 4 without altering any stars, primes, or covered lines.
    """
    # the smallest uncovered value in the matrix
    if np.any(state.row_uncovered) and np.any(state.col_uncovered):
        minval = np.min(state.C[state.row_uncovered], axis=0)
        minval = np.min(minval[state.col_uncovered])
        state.C[~state.row_uncovered] += minval
        state.C[:, state.col_uncovered] -= minval
    return _step4

In [9]:
# utility functions when using Hungarian algorithm for clustering evaluation
# from munkres import Munkres

def make_cost_matrix(c1, c2):
    """
    """
    uc1 = np.unique(c1)
    uc2 = np.unique(c2)
    l1 = uc1.size
    l2 = uc2.size
    assert(l1 == l2 and np.all(uc1 == uc2))

    m = np.ones([l1, l2])
    for i in range(l1):
        it_i = np.nonzero(c1 == uc1[i])[0]
        for j in range(l2):
            it_j = np.nonzero(c2 == uc2[j])[0]
            m_ij = np.intersect1d(it_j, it_i)
            m[i,j] =  -m_ij.size
    return m

def translate_clustering(clt, mapper):
    return np.array([ mapper[i] for i in clt ])

def accuracy(cm):
    """computes accuracy from confusion matrix"""
    return np.trace(cm, dtype=float) / np.sum(cm)

def Hungarian_caller(y_pred, y_true):
    
    cost_matrix = make_cost_matrix(y_pred, y_true)

#     m = Munkres()
#     indexes = m.compute(cost_matrix)
    state = _Hungary(cost_matrix)
    step = None if 0 in cost_matrix.shape else _step1
    while step is not None:
        step = step(state)
    marked = state.marked
    indexes = np.where(marked == 1)
    
    mapper = { old: new for (old, new) in zip(indexes[0], indexes[1])}

    print("---------------------\nmapping:")
    for old, new in mapper.items():
        print("map: %s --> %s" %(old, new))
        
    new_pred = translate_clustering(y_pred, mapper)
    num_labels = len(np.unique(y_true))
    new_cm = confusion_matrix(y_true, new_pred, labels=range(num_labels))
    new_acc = accuracy(new_cm)
    print("---------------------\nnew confusion matrix:\n" \
              " %s\naccuracy: %.2f" % (str(new_cm), new_acc))
    
    return new_pred, new_cm, new_acc

In [10]:
new_pred, new_cm, new_acc = Hungarian_caller(best_labels1, y)

---------------------
mapping:
map: 0 --> 0
map: 1 --> 9
map: 2 --> 2
map: 3 --> 6
map: 4 --> 1
map: 5 --> 3
map: 6 --> 8
map: 7 --> 7
map: 8 --> 5
map: 9 --> 4
---------------------
new confusion matrix:
 [[3810   35  112 1425   12  503  341    6  649   10]
 [   0 7644   13   26    8  154   16    5   10    1]
 [  37  829 2388  717  189   67  846   32 1844   41]
 [   9  574  747 4089  204   97   83   97 1136  105]
 [  55  535   73    5 3967  936  127  753   30  343]
 [  31  464  247 2167  311 2662  101   88  179   63]
 [ 292  564  415  115   28  119 5322    1   15    5]
 [  20  516   16    9 1581  137    3 4082   13  916]
 [  44 1172  206 2635  358 2026   32  188   91   73]
 [  44  332   23  124 3485  124    5 2374   16  431]]
accuracy: 0.49


In [11]:
new_pred, new_cm, new_acc = Hungarian_caller(best_labels2, y)

---------------------
mapping:
map: 0 --> 0
map: 1 --> 6
map: 2 --> 5
map: 3 --> 1
map: 4 --> 2
map: 5 --> 4
map: 6 --> 7
map: 7 --> 3
map: 8 --> 8
map: 9 --> 9
---------------------
new confusion matrix:
 [[3990   32   82  934   20  663  764    7  398   13]
 [   0 7620   13   29    6  172   16    7   13    1]
 [  41  824 2570  790  159   91  696   44 1719   56]
 [  13  671  220 4449  185  149   97   92 1141  124]
 [  50  538   49    3 3948  900  161  673   24  478]
 [  36  528  128 2059  324 2806  129   66  156   81]
 [ 238  497  777   54   37  134 5123    1    9    6]
 [  20  549    7   14 1608  110    3 4054   18  910]
 [  49 1344   96 2227  402 2281   56  184   78  108]
 [  41  352   15  106 3522  118    7 2262   15  520]]
accuracy: 0.50


In [12]:
from sklearn.neighbors import radius_neighbors_graph
from sklearn.neighbors import kneighbors_graph
from sklearn import neighbors
# subset = np.random.choice(len(X), 50000, replace=False)
# knn_graph = radius_neighbors_graph(X_pca[subset], 10.0, mode='distance', metric='minkowski', p=2, metric_params=None, include_self=False)
# A = kneighbors_graph(X[subset], 10, mode='distance', metric='minkowski', p=2, metric_params=None, include_self=False)
nn = neighbors.NearestNeighbors(n_neighbors=500, algorithm='kd_tree', metric='euclidean', n_jobs=8)
# nn = neighbors.NearestNeighbors(n_neighbors=500, metric='l1', n_jobs=8)
t0 = time()
knn_graph = nn.fit(X_pca).kneighbors_graph(mode='distance')
print('knn graph time =', time()-t0)
sigma = knn_graph.data.sum()/float(knn_graph.size)
print('sigma=', sigma)
knn_graph.data = np.exp(-1.0 * np.divide(np.power(knn_graph.data, 2), 2*sigma**2))
print('E_sum=', knn_graph.data.sum())
knn_graph.data /= knn_graph.data.sum()
# knn_graph.data = 1.0 - knn_graph.data
print('nonzeros=', knn_graph.count_nonzero())
                        

knn graph time = 116.75337314605713
sigma= 10.826615024646268
E_sum= 21540402.226940256


  self[i, j] = values


nonzeros= 35070000


In [13]:
# TODO: compute Graph Laplacian (symmetric normalized in hw)
def _setdiag_dense(A, d):
    A.flat[::len(d)+1] = d

def _laplacian_sparse(graph, normed=False, axis=0):
    if graph.format in ('lil', 'dok'):
        m = graph.tocoo()
        needs_copy = False
    else:
        m = graph
        needs_copy = True
    w = m.sum(axis=axis).getA1() - m.diagonal()
    if normed:
        m = m.tocoo(copy=needs_copy)
        isolated_node_mask = (w == 0)
        w = np.where(isolated_node_mask, 1, np.sqrt(w))
        m.data /= w[m.row]
        m.data /= w[m.col]
        m.data *= -1
        m.setdiag(1 - isolated_node_mask)
    else:
        if m.format == 'dia':
            m = m.copy()
        else:
            m = m.tocoo(copy=needs_copy)
        m.data *= -1
        m.setdiag(w)
    return m, w

def _laplacian_dense(graph, normed=False, axis=0):
    m = np.array(graph)
    np.fill_diagonal(m, 0)
    w = m.sum(axis=axis)
    if normed:
        isolated_node_mask = (w == 0)
        w = np.where(isolated_node_mask, 1, np.sqrt(w))
        m /= w
        m /= w[:, np.newaxis]
        m *= -1
        _setdiag_dense(m, 1 - isolated_node_mask)
    else:
        m *= -1
        _setdiag_dense(m, w)
    return m, w

def laplacian(csgraph, normed):
    create_lap = _laplacian_sparse if scipy.sparse.isspmatrix(csgraph) else _laplacian_dense
    lap, d = create_lap(csgraph, normed=normed)
    return lap, d

In [14]:
# TODO: Spectral Clustering
def spectral_clustering(affinity, n_clusters):
    L, d = laplacian((affinity + affinity.transpose())/2.0, normed=True)
#     L, d = laplacian(affinity, normed=True)
    t0 = time()
    eig_val, eig_vect = scipy.sparse.linalg.eigs(L, min(n_clusters+10, 30), which='SM')
    print('eigendecomp time =', time()-t0)
    X = eig_vect[:, 1:].real
    rows_norm = np.linalg.norm(X, axis=1, ord=2)
    Y = (X.T / rows_norm).T
    kmeans = KMeans(init='k-means++', n_clusters=n_clusters, n_init=20)
    kmeans.fit(Y)
#     best_centers, best_labels, best_obj = kmeans_11(Y, n_clusters=n_digits, init="pca", rseed=2)
#     return best_centers, best_labels, Y
    return kmeans.cluster_centers_, kmeans.labels_, Y

In [15]:
sc_centers, sc_pred, _ = spectral_clustering(knn_graph, n_digits)

eigendecomp time = 39.77084422111511


In [29]:
scipy.sparse.isspmatrix_csr(knn_graph)

True

In [22]:
(knn_graph + knn_graph.transpose()).count_nonzero()

50060092

In [16]:
new_pred, new_cm, new_acc = Hungarian_caller(sc_pred, y)

---------------------
mapping:
map: 0 --> 2
map: 1 --> 9
map: 2 --> 3
map: 3 --> 4
map: 4 --> 8
map: 5 --> 6
map: 6 --> 0
map: 7 --> 1
map: 8 --> 7
map: 9 --> 5
---------------------
new confusion matrix:
 [[6643    4   17   36   11   79   58    4   49    2]
 [   0 7750   31   23   16   18   22    3   12    2]
 [  86   79 5437  222   17  828   38   60  187   36]
 [  16   98   91 4760   33 1647   14  111  311   60]
 [  15   90   76   16 3981  109   39   24    9 2465]
 [ 104   40   27 1838  101 3625  143   15  284  136]
 [ 262   75   99   31    7   91 6282    1   21    7]
 [  21  214   39   23  294   55    0 6175    8  464]
 [  76  102   51 1035   86  620   24   28 4680  123]
 [  39   62   24  100 3600   34    1  586   41 2471]]
accuracy: 0.74


In [17]:
# TODO: KNN defined on data selection by kmeans
def find_landmark(X, centers):
    index = pairwise_distances_argmin(centers, X)
    return index

def knn(X, landmarks, y, k):
    dis = pairwise_distances(X, landmarks)
    vote = dis.argpartition(k, axis=1)[:, :k]
    for i in range(k):
        vote[:, i] = y[vote[:, i]]
    labels = list(map(lambda v: max(set(v), key = list(v).count), vote))
    return labels

def eval_knn_kmeans(X, y, k, centers):
    train_index = find_landmark(X, centers)
    test_index = np.setdiff1d(np.arange(len(X)), train_index, assume_unique=True)
    test_pred = knn(X[test_index], X[train_index], y[train_index], k)
    acc = np.sum(test_pred == y[test_index])/float(len(test_index))
    return acc

In [18]:
# TODO: kmeans based KNN: find a sample closest to each kmeans centroid, and use them as training set for KNN (K=1)
# best_centers, best_labels, best_obj = kmeans_11(X, n_clusters=n_digits*5, init="pca", rseed=2)
kmeans_pca = KMeans(init='k-means++', n_clusters=n_digits*10, n_init=10)
kmeans_pca.fit(X_pca)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=100, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [19]:
acc = eval_knn_kmeans(X_pca, y, 1, kmeans_pca.cluster_centers_)
print('kmeans centroids based KNN (K=1) accuracy:', acc)
acc = eval_knn_kmeans(X_pca, y, 3, kmeans_pca.cluster_centers_)
print('kmeans centroids based KNN (K=3) accuracy:', acc)
acc = eval_knn_kmeans(X_pca, y, 5, kmeans_pca.cluster_centers_)
print('kmeans centroids based KNN (K=5) accuracy:', acc)

kmeans centroids based KNN (K=1) accuracy: 0.8140629470672389
kmeans centroids based KNN (K=3) accuracy: 0.7717310443490701
kmeans centroids based KNN (K=5) accuracy: 0.7438483547925608


In [20]:
kmeans_pca = KMeans(init='k-means++', n_clusters=n_digits*20, n_init=10)
kmeans_pca.fit(X_pca)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=200, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [443]:
acc = eval_knn_kmeans(X_pca, y, 1, kmeans_pca.cluster_centers_)
print('kmeans centroids based KNN (K=1) accuracy:', acc)
acc = eval_knn_kmeans(X_pca, y, 3, kmeans_pca.cluster_centers_)
print('kmeans centroids based KNN (K=3) accuracy:', acc)
acc = eval_knn_kmeans(X_pca, y, 5, kmeans_pca.cluster_centers_)
print('kmeans centroids based KNN (K=5) accuracy:', acc)

kmeans centroids based KNN (K=1) accuracy: 0.8614756446991404
kmeans centroids based KNN (K=3) accuracy: 0.8358022922636104
kmeans centroids based KNN (K=5) accuracy: 0.8216905444126075


In [453]:
# TODO: random sampling n_digits*10 samples, and use them as training set for KNN (K=1)
acc = eval_knn_kmeans(X_pca, y, 1, X_pca[np.random.choice(len(X_pca), n_digits*10, replace=False)])
print('random samples based KNN (K=1) accuracy:', acc)
acc = eval_knn_kmeans(X_pca, y, 3, X_pca[np.random.choice(len(X_pca), n_digits*10, replace=False)])
print('random samples based KNN (K=3) accuracy:', acc)
acc = eval_knn_kmeans(X_pca, y, 5, X_pca[np.random.choice(len(X_pca), n_digits*10, replace=False)])
print('random samples based KNN (K=5) accuracy:', acc)

acc = eval_knn_kmeans(X_pca, y, 1, X_pca[np.random.choice(len(X_pca), n_digits*20, replace=False)])
print('random samples based KNN (K=1) accuracy:', acc)
acc = eval_knn_kmeans(X_pca, y, 3, X_pca[np.random.choice(len(X_pca), n_digits*20, replace=False)])
print('random samples based KNN (K=3) accuracy:', acc)
acc = eval_knn_kmeans(X_pca, y, 5, X_pca[np.random.choice(len(X_pca), n_digits*20, replace=False)])
print('random samples based KNN (K=5) accuracy:', acc)

random samples based KNN (K=1) accuracy: 0.680829756795422
random samples based KNN (K=3) accuracy: 0.6800143061516452
random samples based KNN (K=5) accuracy: 0.595450643776824
random samples based KNN (K=1) accuracy: 0.7710888252148997
random samples based KNN (K=3) accuracy: 0.7227363896848138
random samples based KNN (K=5) accuracy: 0.7543696275071633


In [30]:
# TODO: test spectral clustering based KNN
sc_centers, sc_pred, X_lap = spectral_clustering(knn_graph, n_digits*10)

eigendecomp time = 39.07061696052551


In [31]:
sc_acc = eval_knn_kmeans(X_lap, y, 1, sc_centers)
print('spectral clustering based KNN (K=1) accuracy:', sc_acc)
sc_acc = eval_knn_kmeans(X_lap, y, 3, sc_centers)
print('spectral clustering based KNN (K=3) accuracy:', sc_acc)
sc_acc = eval_knn_kmeans(X_lap, y, 5, sc_centers)
print('spectral clustering based KNN (K=5) accuracy:', sc_acc)

spectral clustering based KNN (K=1) accuracy: 0.8470243204577969
spectral clustering based KNN (K=3) accuracy: 0.833447782546495
spectral clustering based KNN (K=5) accuracy: 0.817997138769671


In [32]:
# TODO: test spectral clustering based KNN
sc_centers, sc_pred, X_lap = spectral_clustering(knn_graph, n_digits*20)

eigendecomp time = 39.43999218940735


In [33]:
sc_acc = eval_knn_kmeans(X_lap, y, 1, sc_centers)
print('spectral clustering based KNN (K=1) accuracy:', sc_acc)
sc_acc = eval_knn_kmeans(X_lap, y, 3, sc_centers)
print('spectral clustering based KNN (K=3) accuracy:', sc_acc)
sc_acc = eval_knn_kmeans(X_lap, y, 5, sc_centers)
print('spectral clustering based KNN (K=5) accuracy:', sc_acc)

spectral clustering based KNN (K=1) accuracy: 0.8296131805157593
spectral clustering based KNN (K=3) accuracy: 0.8303151862464183
spectral clustering based KNN (K=5) accuracy: 0.8231375358166189


In [382]:
# Scale and visualize the embedding vectors
def plot_embedding(X, title=None):
    x_min, x_max = np.min(X, 0), np.max(X, 0)
    X = (X - x_min) / (x_max - x_min)

    plt.figure()
    ax = plt.subplot(111)
    for i in range(X.shape[0]):
        plt.text(X[i, 0], X[i, 1], str(y[i]),
                 color=plt.cm.Set1(y[i] / 10.),
                 fontdict={'weight': 'bold', 'size': 9})

    if hasattr(offsetbox, 'AnnotationBbox'):
        # only print thumbnails with matplotlib > 1.0
        shown_images = np.array([[1., 1.]])  # just something big
        for i in range(X.shape[0]):
            dist = np.sum((X[i] - shown_images) ** 2, 1)
            if np.min(dist) < 4e-3:
                # don't show points that are too close
                continue
            shown_images = np.r_[shown_images, [X[i]]]
            imagebox = offsetbox.AnnotationBbox(
                offsetbox.OffsetImage(digits.images[i], cmap=plt.cm.gray_r),
                X[i])
            ax.add_artist(imagebox)
    plt.xticks([]), plt.yticks([])
    if title is not None:
        plt.title(title)

In [None]:
# t-SNE embedding of the digits dataset
print("Computing t-SNE embedding")
tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)
t0 = time()
X_tsne = tsne.fit_transform(X)

plot_embedding(X_tsne,
               "t-SNE embedding of the digits (time %.2fs)" %
               (time() - t0))

plt.show()

In [None]:
# Isomap projection of the digits dataset
print("Computing Isomap embedding")
t0 = time()
X_iso = manifold.Isomap(n_neighbors, n_components=2).fit_transform(X)
print("Done.")
plot_embedding(X_iso,
               "Isomap projection of the digits (time %.2fs)" %
               (time() - t0))