In [0]:
# Download the gzip-compressed MNIST training images and training labels
# into the current working directory (shell escapes, run by IPython).
! wget 'http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz'
! wget 'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz'

In [0]:
! gunzip train-images-idx3-ubyte.gz
! gunzip train-labels-idx1-ubyte.gz

In [0]:
# Download the gzip-compressed MNIST test ("t10k") images and labels
# into the current working directory.
! wget 'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz'
! wget 'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz'

In [0]:
! gunzip t10k-images-idx3-ubyte.gz
! gunzip t10k-labels-idx1-ubyte.gz

In [0]:
# List the working directory to check which of the files above are present.
! ls

In [0]:
from mlxtend.data import loadlocal_mnist
import matplotlib.pyplot as plt
import numpy as np
import time
import scipy as sp
from scipy import stats
%matplotlib inline

# Read the training images and labels from the extracted IDX files.
Images, Labels = loadlocal_mnist(
        images_path='train-images-idx3-ubyte',
        labels_path='train-labels-idx1-ubyte')

# Keep only the first `sampleSize` training examples; every later cell that
# uses `Images` / `Labels` therefore works on this subset.
sampleSize = 2000
Images = Images[:sampleSize]
Labels = Labels[:sampleSize]
# NOTE(review): not referenced by any other visible cell.
gray_scale_limit = 256

In [0]:
# Read the test images and labels from the extracted IDX files. Unlike the
# training data, no subset is taken here.
TestImages, TestLabels = loadlocal_mnist(
        images_path='t10k-images-idx3-ubyte',
        labels_path='t10k-labels-idx1-ubyte')

In [0]:
def Euclidean(a, b):
    """Return the Euclidean distance between two equally shaped arrays.

    Both inputs are converted to float64 before subtracting. Without this,
    subtracting two unsigned-integer arrays (presumably the dtype of the raw
    pixel data loaded in this notebook) wraps around, e.g. 0 - 1 becomes 255,
    and the resulting distance is wrong.

    Parameters
    ----------
    a, b : array_like
        Numeric arrays of the same (or broadcastable) shape.

    Returns
    -------
    float
        `np.linalg.norm(a - b)`: the 2-norm for 1-D input and the Frobenius
        norm for 2-D input.
    """
    a = np.asarray(a, dtype=np.float64)
    b = np.asarray(b, dtype=np.float64)
    return np.linalg.norm(a - b)

In [0]:
def closest_centroid(Images, Centroids):
    """Return, for every image, the index of its nearest centroid.

    The number of centroids is taken from `Centroids` itself rather than from
    the notebook-level `K`, so the function stays correct when `K` has been
    changed since the centroids were computed. Inputs are converted to
    float64 first so that unsigned-integer data cannot wrap around during the
    distance computation.

    Parameters
    ----------
    Images : array_like, shape (n_images, n_features)
    Centroids : array_like, shape (n_centroids, n_features)

    Returns
    -------
    numpy.ndarray, shape (n_images,)
        Integer index of the closest centroid (Euclidean distance); on ties
        the lowest index wins, as with `np.argmin`.
    """
    # Local import: only the top-level `scipy` package and `scipy.stats` are
    # imported by the notebook's import cell.
    from scipy.spatial.distance import cdist

    points = np.asarray(Images, dtype=np.float64)
    centers = np.asarray(Centroids, dtype=np.float64)
    # One (n_images, n_centroids) distance matrix instead of a Python loop
    # over every image/centroid pair.
    distances = cdist(points, centers, metric="euclidean")
    return np.argmin(distances, axis=1)

In [0]:
def initialize_centroids(Images, k=None):
    """Pick distinct random rows of `Images` as the initial centroids.

    Only a permutation of row indices is drawn, so the full image array is
    neither copied nor shuffled. The chosen rows are returned as float64 so
    that later centroid arithmetic is done in floating point rather than in
    the (possibly unsigned-integer) pixel dtype.

    Parameters
    ----------
    Images : array_like, shape (n_images, n_features)
        Data to sample from; left unmodified.
    k : int, optional
        Number of centroids. Defaults to the notebook-level `K`.

    Returns
    -------
    numpy.ndarray, shape (min(k, n_images), n_features), dtype float64
    """
    if k is None:
        k = K  # fall back to the notebook-level cluster count
    data = np.asarray(Images)
    chosen = np.random.permutation(len(data))[:k]
    # Fancy indexing already copies, so the caller's array is never aliased.
    return data[chosen].astype(np.float64)

In [0]:
def move_centroids(Images, Closest, Centroids):
    """Recompute every centroid as the mean of the images assigned to it.

    A cluster with no assigned images keeps its previous centroid. Taking the
    mean of an empty selection would instead produce NaN, which then
    contaminates every later distance computation. The number of clusters is
    taken from `Centroids`, not from the notebook-level `K`.

    Parameters
    ----------
    Images : array_like, shape (n_images, n_features)
    Closest : array_like, shape (n_images,)
        Centroid index assigned to each image.
    Centroids : array_like, shape (n_centroids, n_features)
        Current centroids; not modified.

    Returns
    -------
    numpy.ndarray, shape (n_centroids, n_features), dtype float64
    """
    data = np.asarray(Images)
    assignment = np.asarray(Closest)
    # np.array(...) copies, so the caller's centroids stay untouched.
    updated = np.array(Centroids, dtype=np.float64)
    for cluster in range(len(updated)):
        members = data[assignment == cluster]
        if len(members):
            updated[cluster] = members.mean(axis=0)
    return updated

In [0]:
def get_distortion(Images, ClosestCentroids, Centroids):
    """Return the total squared distance of the images to their centroids.

    Computed in float64 with one vectorized expression, so unsigned-integer
    inputs cannot wrap around and no per-image Python loop is needed.

    Parameters
    ----------
    Images : array_like, shape (n_images, n_features)
    ClosestCentroids : array_like of int, shape (n_images,)
        Index into `Centroids` for each image.
    Centroids : array_like, shape (n_centroids, n_features)

    Returns
    -------
    int
        Sum of squared Euclidean distances, truncated to an integer.
    """
    points = np.asarray(Images, dtype=np.float64)
    assignment = np.asarray(ClosestCentroids, dtype=np.intp)
    # Row i of `assigned` is the centroid that image i belongs to.
    assigned = np.asarray(Centroids, dtype=np.float64)[assignment]
    return int(np.sum((points - assigned) ** 2))

In [0]:
def my_run():
    """Run one K-means fit on `Images` and report its test accuracy.

    Reads the notebook-level `K`, `Images`, `Labels`, `TestImages` and
    `TestLabels`. Writes these notebook-level names for later cells:

    - `Centroids`, `ClosestCentroids`, `ClusterLabels`: the fitted model.
    - `Errors`: centroid movement (truncated to int) for each iteration.
    - `Iterations`: the iteration indices matching `Errors`.
    - `LastIteration`: a list that receives the index of the final iteration
      of this run (created on first use if no cell has defined it yet).

    The loop stops when the truncated centroid movement reaches 0, or after
    300 iterations.
    """
    global Centroids, ClosestCentroids, ClusterLabels, Errors, Iterations
    # Work on floating-point centroids from the start so the first assignment
    # step is not computed in the pixel dtype of the images.
    Centroids = np.asarray(initialize_centroids(Images), dtype=np.float64)
    Errors = []

    numberofiterations = 300

    start_time = time.time()
    converged = False
    i = 0
    for i in range(numberofiterations):
        ClosestCentroids = closest_centroid(Images, Centroids)
        oldCentroids = Centroids.copy()
        Centroids = move_centroids(Images, ClosestCentroids, Centroids)
        # int() truncates, so any movement below 1.0 counts as convergence.
        error = int(Euclidean(Centroids, oldCentroids))
        print("Current Iteration :" , i)
        Errors.append(error)
        if error == 0:
            converged = True
            break

    end_time = time.time()

    # Publish the per-iteration history that the plotting and statistics
    # cells read after a run.
    Iterations = list(range(len(Errors)))
    globals().setdefault("LastIteration", []).append(i)

    print("Time taken : " , end_time - start_time)
    print("Has converged : ", converged, " last iteration : " , i)
    ClusterLabels = clusters_labels(ClosestCentroids)
    print("Accuracy =", calcAccuracy(TestImages, TestLabels, Images, Labels, ClusterLabels))

In [0]:
def clusters_labels(ClosestCentroids, labels=None, k=None):
    """Return the most frequent training label of each cluster.

    The majority label is found by counting with `np.unique`, which does not
    depend on the return shape of `scipy.stats.mode` (the original `[0][0]`
    indexing fails on SciPy versions where `mode` returns scalars for 1-D
    input). Ties go to the smallest label. A cluster with no members gets the
    sentinel label -1 instead of raising.

    Parameters
    ----------
    ClosestCentroids : array_like of int, shape (n_images,)
        Cluster index assigned to each training image.
    labels : array_like of int, optional
        Label of each training image. Defaults to the notebook-level `Labels`.
    k : int, optional
        Number of clusters. Defaults to the notebook-level `K`.

    Returns
    -------
    numpy.ndarray, shape (k,), dtype int64
    """
    if labels is None:
        labels = Labels
    if k is None:
        k = K
    assignment = np.asarray(ClosestCentroids)
    # Only the labels that have a matching assignment are considered.
    known = np.asarray(labels)[:len(assignment)]
    majority = np.full(k, -1, dtype=np.int64)
    for cluster in range(k):
        members = known[assignment == cluster]
        if members.size:
            values, counts = np.unique(members, return_counts=True)
            # `values` is sorted, so argmax picks the smallest label on ties.
            majority[cluster] = values[counts.argmax()]
    return majority

In [0]:
def calcAccuracy(TestImages, TestLabels, Images, Labels, ClusterLabels):
    """Return the percentage of test images whose cluster label is correct.

    Each test image is assigned to its nearest centroid (using the
    notebook-level `Centroids`), that cluster index is translated to a digit
    through `ClusterLabels`, and the result is compared with `TestLabels`.

    `Images` and `Labels` are accepted for call compatibility but are not
    used.
    """
    nearest = closest_centroid(TestImages, Centroids)
    # Translate cluster indices to predicted labels in a single lookup.
    predicted = np.asarray(ClusterLabels)[nearest]
    expected = np.asarray(TestLabels)[:len(predicted)]
    return np.sum(predicted == expected) / len(TestImages) * 100

# print(calcAccuracy(TestImages, TestLabels, Images, Labels, ClusterLabels))

In [0]:
# Results collected by runProgram(); the accuracy-vs-K plot reads both lists.
Ks_Used = []
Accuracies = []

#Running this will take some time depending on number of images and K

def runProgram():
    """Fit K-means for K = 5, 10, ..., 50 and save a centroid grid for each K.

    For every K this sets the notebook-level `K`, calls `my_run()`, draws the
    fitted centroids as 28x28 images in a 5-column grid titled with their
    cluster labels, appends K and the test accuracy to `Ks_Used` /
    `Accuracies`, and writes the grid to "<K>centroids.png".
    """
    global K, Ks_Used, Accuracies
    for loops in range(5, 51, 5):
        K = loops
        my_run()
        ncols = 5
        nrows = K // (ncols)
        print(nrows, ncols, K)
        figsize = [10, 2 * nrows]
        fig, ax = plt.subplots(nrows=nrows, ncols=ncols, figsize=figsize)

        for i, axi in enumerate(ax.flat):
            img = Centroids[i].reshape(28, 28)
            axi.imshow(img)
            axi.axis('off')
            axi.set_title("Label : " + str(ClusterLabels[i]))

        Ks_Used.append(loops)
        Accuracies.append(calcAccuracy(TestImages, TestLabels, Images, Labels, ClusterLabels))
        # tight_layout takes keyword-only arguments; the former positional
        # `True` raises TypeError on current matplotlib releases.
        fig.tight_layout()

        name = str(K) + "centroids.png"
        # Save through the figure object so the right figure is written
        # regardless of pyplot's "current figure" state.
        fig.savefig(name, bbox_inches='tight', pad_inches = 0)

runProgram()

In [0]:
from google.colab import files
# Download each centroid grid image; the names follow the
# "<K>centroids.png" pattern for K = 5, 10, ..., 50.
for i in range(5, 51, 5):
    name = str(i) + "centroids.png"
    print(name)
    files.download(name)

In [0]:
from google.colab import files
# Plot accuracy against K from the lists filled by runProgram(), save the
# figure as accuracy.png and download it.
fig = plt.figure(figsize=[15, 7])
plt.plot(Ks_Used, Accuracies, label="Effect of K on accuracy")
plt.xlabel('K')
plt.ylabel('Accuracy')
plt.title('Effect of K on accuracy')
plt.savefig("accuracy.png", bbox_inches='tight', pad_inches = 0)
files.download("accuracy.png")

In [0]:
from google.colab import files
# Plot the per-iteration error history of the most recent my_run() call.
fig = plt.figure(figsize=[20, 10])
# The x values are derived from Errors itself, so this cell does not depend
# on a separately maintained `Iterations` list of matching length.
plt.plot(range(len(Errors)), Errors)
plt.xlabel("Iterations")
plt.ylabel("Errors")
# Report the K currently in effect instead of a hard-coded value.
plt.title("Errors till convergence, K = " + str(K))
# Draw the grid before saving so it is part of the saved image as well.
plt.grid()
plt.savefig("errors.png", bbox_inches="tight", pad_inches = 0)
files.download("errors.png")

In [0]:
# Are results wildly different for random restarts if we keep everything else the same
import statistics
K = 15

loops = 10

# NOTE: this reuses the name `Accuracies`, replacing the per-K list that the
# accuracy-vs-K plot reads; re-run runProgram() before re-drawing that plot.
Accuracies = [0 for i in range(loops)]
# Reset before the runs; filled only if my_run() records its final
# iteration index into this list.
LastIteration = []
i = 0
for i in range(loops):
    my_run()
    Accuracies[i] = calcAccuracy(TestImages, TestLabels, Images, Labels, ClusterLabels)
    print("Accuracy : " , Accuracies[i] , "\n");

meanAccuracy = sum(Accuracies) / len(Accuracies)
print("Mean accuracy = ", meanAccuracy)
standard_deviation = statistics.stdev(Accuracies)
print("Standard deviation of accuracy = " , standard_deviation)

# statistics.stdev needs at least two values and the mean needs at least one,
# so report missing data instead of raising when nothing was recorded.
if len(LastIteration) >= 2:
    meanIter = sum(LastIteration) / len(LastIteration)
    print("Mean iterations = ", meanIter)
    standard_deviation = statistics.stdev(LastIteration)
    print("Standard deviation of iterations = " , standard_deviation)
else:
    print("Iteration statistics unavailable: only", len(LastIteration), "iteration counts were recorded")


In [0]:
# Run the algorithm for different values of K and provide the accuracy of the fit for each
# tried value with your comments.
K = 200
my_run()