In [1]:
# %matplotlib notebook
import typing
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn import datasets
from sklearn.neighbors import NearestNeighbors
from sklearn.utils import shuffle
from scipy.linalg import eigh
from sklearn import manifold
import matplotlib.cm as cm

xrange = range

In [2]:
data = sklearn.datasets.fetch_mldata("MNIST original")
n = 1000
imgs = data.data
lables = data.target

imgs, lables = shuffle(imgs, lables, random_state=0)

imgs = imgs[:n] * 1.0
lables = lables[:n]

plt.figure()

n_img_per_row = 10
img = np.zeros((30 * n_img_per_row, 30 * n_img_per_row))
for i in range(n_img_per_row):
    ix = 30 * i + 1
    for j in range(n_img_per_row):
        iy = 30 * j + 1
        img[ix:ix + 28, iy:iy + 28] = imgs[i * n_img_per_row + j].reshape((28, 28))

plt.imshow(img, cmap=plt.cm.binary)
plt.xticks([])
plt.yticks([])
plt.title('A selection from the 28x28-dimensional digits dataset')
print(lables)

[ 1.  9.  2.  2.  7.  1.  8.  3.  3.  7.  7.  5.  0.  1.  2.  9.  2.  7.
  7.  9.  9.  8.  5.  8.  1.  2.  3.  7.  5.  5.  6.  0.  9.  0.  2.  5.
  0.  0.  2.  9.  5.  7.  2.  4.  8.  6.  0.  8.  3.  9.  9.  4.  3.  9.
  8.  1.  6.  0.  3.  1.  9.  6.  4.  0.  2.  9.  3.  0.  9.  3.  8.  6.
  3.  1.  2.  2.  3.  2.  5.  1.  1.  8.  3.  7.  4.  6.  3.  1.  0.  9.
  2.  7.  4.  8.  7.  1.  6.  7.  8.  3.  3.  2.  3.  4.  9.  0.  4.  4.
  4.  3.  5.  6.  4.  8.  3.  8.  9.  1.  0.  4.  7.  2.  7.  2.  0.  6.
  8.  6.  8.  1.  8.  5.  5.  1.  2.  9.  2.  7.  5.  9.  7.  5.  6.  8.
  1.  6.  4.  2.  6.  7.  1.  4.  7.  9.  4.  8.  8.  4.  7.  1.  0.  6.
  4.  5.  1.  1.  4.  8.  1.  2.  7.  1.  5.  7.  7.  8.  9.  5.  5.  6.
  0.  3.  7.  4.  0.  2.  8.  8.  1.  9.  1.  0.  5.  2.  9.  4.  4.  1.
  8.  6.  2.  7.  1.  3.  8.  6.  2.  2.  6.  2.  9.  8.  0.  9.  5.  5.
  0.  6.  3.  1.  8.  3.  9.  8.  3.  7.  9.  0.  6.  9.  4.  3.  2.  1.
  3.  0.  6.  8.  0.  0.  0.  1.  2.  3.  0.  4.  2

In [3]:
import matplotlib.pyplot as plt
from matplotlib import offsetbox
from mpl_toolkits.mplot3d import Axes3D

#----------------------------------------------------------------------
# Scale and visualize the embedding vectors
def plot_embedding2d(X, y, title=None):
    x_min, x_max = np.min(X, 0), np.max(X, 0)
    X = (X - x_min) / (x_max - x_min)

    plt.figure()
    ax = plt.subplot(111)
    for i in range(X.shape[0]):
        plt.text(X[i, 0], X[i, 1], str(y[i]),
                 color=plt.cm.Set1(y[i] / 10.),
                 fontdict={'weight': 'bold', 'size': 9})


    plt.xticks([]), plt.yticks([])
    if title is not None:
        plt.title(title)

        
def plot_embedding3d(X, y, title=None):
    x_min, x_max = np.min(X, 0), np.max(X, 0)
    X = (X - x_min) / (x_max - x_min)

    plt.figure()
    ax = plt.subplot(111, projection='3d')
    for i in range(X.shape[0]):
        ax.text(X[i, 0], X[i, 1], X[i, 2], str(y[i]),
                 color=plt.cm.Set1(y[i] / 10.),
                 fontdict={'weight': 'bold', 'size': 9})


    plt.xticks([]), plt.yticks([])
    if title is not None:
        plt.title(title)

In [4]:
def analyze_M(M: np.ndarray):
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(M, interpolation='nearest')
    fig.colorbar(cax)
    plt.title("Matrix M")
    plt.savefig("Matrix_M.png")
    
    U, s, V = np.linalg.svd(M)
    
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.hist(s)
    plt.title("Eigen values")
    plt.xlabel("Bins")
    plt.ylabel("Frequency")
    
    plt.savefig("eigen_histo.png")
    
    reconstruction_error = np.cumsum(s[::-1])
#     print(s)
#     print(reconstruction_error)
    fig = plt.figure()
    plt.plot(np.linspace(1, s.shape[0], s.shape[0]), reconstruction_error, 'ro')
    plt.title("Reconstruction Error")
    plt.xlabel("K")
    plt.ylabel("Error")
    
    plt.savefig("error.png")

In [5]:
def sexy_lle(X: np.ndarray, k: int, m: int = 2, metric: str = 'euclidean', analyze: bool = False) -> np.ndarray:
    n = X.shape[0] # number of data points
    d = X.shape[1] # dimensionality
    
    # Compute the neighbors of each data point
    nbrs = NearestNeighbors(n_neighbors=k + 1, metric=metric).fit(X)
    distances, indices = nbrs.kneighbors(X)
    
    
    W = np.zeros((n,n))
    for i in range(n):        
        ixs = indices[i][1:]
        x = X[i]
        ns = X[ixs]
        deltas = ns - x
        C = np.dot(deltas, np.transpose(deltas))
        factor = np.trace(C) / (k * 1000.0)
        C = C + factor * np.eye(k)

        w = np.linalg.solve(C, np.ones(k))
        W[i][ixs] = w / w.sum()
        
#         C_inv = np.linalg.inv(C)
#         normalizer = np.sum(C_inv, axis=None)
#         W[i][ixs] = np.sum(C_inv, axis=1) / normalizer
        
    eye = np.eye(W.shape[0])
    temp = eye - W
    M = np.dot(np.transpose(temp), temp)
    if analyze:
        analyze_M(M)

    k_skip = 1
    eigen_values, eigen_vectors = eigh(M, eigvals=(k_skip, m + k_skip - 1), overwrite_a=True)
    index = np.argsort(np.abs(eigen_values))
    embedding, reconstruction_error =  eigen_vectors[:, index], np.sum(eigen_values)
    
    return embedding


In [11]:
ks = [5, 10, 20, 30]
metrics = ['euclidean', 'manhattan', 'canberra', 'hamming']

for metric in metrics:
    for k in ks:
        title = "k: " + str(k) + " (" + metric + ")"
        embeded2 = sexy_lle(imgs, k, 2, metric)
        embeded3 = sexy_lle(imgs, k, 3, metric)

        plot_embedding2d(embeded2, lables, title)
        plt.savefig("k_" + str(k) + metric + '.png')
        plot_embedding3d(embeded3, lables, title)
        plt.savefig("k_" + str(k) + metric + '3d.png')


    #     clf = manifold.LocallyLinearEmbedding(k, n_components=2,
    #                                           method='standard')
    #     sk_embeded = clf.fit_transform(imgs)
    #     plot_embedding2d(sk_embeded, lables, title)




In [7]:
sexy_lle(imgs, 10, 2, analyze=True)





array([[ 0.05590969, -0.0826831 ],
       [ 0.02357511,  0.05046604],
       [ 0.00874849, -0.00513027],
       ..., 
       [-0.03052464, -0.01575462],
       [-0.0106847 ,  0.00082979],
       [-0.03393292,  0.00272716]])

In [8]:
def reconstruct(y: np.ndarray, X: np.ndarray, Y: np.ndarray, k: int, metric: str = 'euclidean') -> np.ndarray:
    nbrs = NearestNeighbors(n_neighbors=k + 1, metric=metric).fit(Y)
    distances, indices = nbrs.kneighbors(y)
    indices = indices[:,1:]
    ns = Y[indices]
    deltas = ns - y
    deltas = deltas.reshape(k, y.shape[0])
    C = np.dot(deltas, np.transpose(deltas))
    factor = np.trace(C) / (k * 1000.0)
    C = C + factor * np.eye(k)
    w = np.linalg.solve(C, np.ones(k))
    w = w / w.sum()
    
    reconstructed_x = np.dot(np.transpose(X[indices].reshape((k,-1))), w)
    return reconstructed_x
    

In [9]:
def visualize_digits(xs: typing.List[np.ndarray], title=''):
    plt.figure()
    for i, x in enumerate(xs):
        x = x.reshape((28,28))
        plt.subplot(1,len(xs), i+1)
        plt.imshow(x, cmap=plt.cm.binary)
        plt.axis('off')


In [10]:
im1 = imgs[0]
emb1 = embeded2[0]
reconstructed1 = reconstruct(emb1, imgs, embeded2, 10)
im2 = imgs[1]
emb2 = embeded2[1]
reconstructed2 = reconstruct(emb2, imgs, embeded2, 10)
# visualize_digits([im1,reconstructed1])
# visualize_digits([im2,reconstructed2])

recs = []
ims = []
print("starting interpolation")
for i in np.linspace(0, 1, 10):
    emb = i * emb1 + (1-i) * emb2
    im = i * im1 + (1-i) * im2
    rec = reconstruct(emb, imgs, embeded2, 10)
    recs += [rec]
    ims += [im]

visualize_digits(recs)
plt.savefig("interpolation_reconstruction.png")
visualize_digits(ims)
plt.savefig("interpolation_original.png")



starting interpolation
