In [None]:
import numpy as np
import struct
import os
from array import array
from os.path  import join

# MNIST Data Loader Class
class MnistDataloader(object):
    """Loader for the four MNIST IDX files (train/test images and labels).

    Each file may be a raw IDX file or a gzip-compressed one; compression is
    detected from the leading gzip signature bytes, not from the file name.
    """

    def __init__(self, training_images_filepath,training_labels_filepath,
                 test_images_filepath, test_labels_filepath):
        """Remember the four file paths; nothing is read until load_data()."""
        self.training_images_filepath = training_images_filepath
        self.training_labels_filepath = training_labels_filepath
        self.test_images_filepath = test_images_filepath
        self.test_labels_filepath = test_labels_filepath

    @staticmethod
    def _read_idx_bytes(filepath):
        """Return the whole content of `filepath`, gunzipping it when needed."""
        # Local import so the class does not depend on the notebook's import cell.
        import gzip

        with open(filepath, 'rb') as file:
            payload = file.read()
        # 0x1f 0x8b is the gzip signature; a raw IDX file starts with two zero bytes.
        if payload[:2] == b'\x1f\x8b':
            payload = gzip.decompress(payload)
        return payload

    def read_images_labels(self, images_filepath, labels_filepath):
        """Read one images file and its matching labels file.

        Args:
            images_filepath: path to an IDX image file (header magic 2051).
            labels_filepath: path to an IDX label file (header magic 2049).

        Returns:
            (images, labels): `images` is a list with one entry per image,
            each entry being a list of `rows` uint8 numpy arrays of length
            `cols`; `labels` is an `array('B')` holding every byte that
            follows the label header.

        Raises:
            ValueError: if a magic number does not match, or if the image
                file holds fewer pixel bytes than its header announces.
        """
        label_bytes = self._read_idx_bytes(labels_filepath)
        magic, size = struct.unpack(">II", label_bytes[:8])
        if magic != 2049:
            raise ValueError('Magic number mismatch, expected 2049, got {}'.format(magic))
        labels = array("B", label_bytes[8:])

        image_bytes = self._read_idx_bytes(images_filepath)
        magic, size, rows, cols = struct.unpack(">IIII", image_bytes[:16])
        if magic != 2051:
            raise ValueError('Magic number mismatch, expected 2051, got {}'.format(magic))

        expected = size * rows * cols
        pixel_bytes = image_bytes[16:16 + expected]
        if len(pixel_bytes) != expected:
            raise ValueError('Image file truncated, expected {} pixel bytes, got {}'.format(
                expected, len(pixel_bytes)))

        # Use the dimensions from the header instead of assuming 28x28.
        # copy() makes the pixels writable (frombuffer on bytes is read-only).
        pixels = np.frombuffer(pixel_bytes, dtype=np.uint8).reshape(size, rows, cols).copy()
        # Keep the historical structure: each image is a list of row arrays.
        images = [list(image) for image in pixels]

        return images, labels

    def load_data(self):
        """Load both splits and return ((x_train, y_train), (x_test, y_test))."""
        x_train, y_train = self.read_images_labels(self.training_images_filepath, self.training_labels_filepath)
        x_test, y_test = self.read_images_labels(self.test_images_filepath, self.test_labels_filepath)
        return (x_train, y_train),(x_test, y_test)

In [None]:
import random
import matplotlib.pyplot as plt

# Locate the four MNIST archives inside ./datasets (relative to the working directory)
input_path = join(os.getcwd(), 'datasets')
(training_images_filepath,
 training_labels_filepath,
 test_images_filepath,
 test_labels_filepath) = (
    join(input_path, filename)
    for filename in (
        'train-images-idx3-ubyte.gz',
        'train-labels-idx1-ubyte.gz',
        't10k-images-idx3-ubyte.gz',
        't10k-labels-idx1-ubyte.gz',
    )
)

#
# Helper function to show a list of images with their corresponding titles
#
def show_images(images, title_texts):
    """Draw `images` on a 5-column grid of grayscale subplots.

    Args:
        images: sized sequence of 2-D image arrays accepted by plt.imshow.
        title_texts: one title per image; an empty string means "no title".
            Images and titles are paired with zip, so the shorter sequence
            decides how many subplots are drawn.
    """
    cols = 5
    # Ceiling division: exactly enough rows for the images (the previous
    # int(len/cols) + 1 left an empty row whenever len was a multiple of cols).
    # Keep at least one row so the subplot grid stays valid.
    rows = max(1, -(-len(images) // cols))
    plt.figure(figsize=(30,20))
    for index, (image, title_text) in enumerate(zip(images, title_texts), start=1):
        plt.subplot(rows, cols, index)
        plt.imshow(image, cmap=plt.cm.gray)
        if title_text != '':
            plt.title(title_text, fontsize=15)

#
# Load MNIST dataset
#
mnist_dataloader = MnistDataloader(training_images_filepath, training_labels_filepath, test_images_filepath, test_labels_filepath)
(x_train, y_train), (x_test, y_test) = mnist_dataloader.load_data()

#
# Pick some random training and test images (with titles) for optional display
#
images_2_show = []
titles_2_show = []
for i in range(0, 10):
    # randrange excludes its upper bound, so every valid index (including 0)
    # can be drawn and len(x_train) itself never is; randint(1, 60000) could
    # return 60000 and raise IndexError.
    r = random.randrange(len(x_train))
    images_2_show.append(x_train[r])
    titles_2_show.append('training image [' + str(r) + '] = ' + str(y_train[r]))    

for i in range(0, 5):
    r = random.randrange(len(x_test))
    images_2_show.append(x_test[r])        
    titles_2_show.append('test image [' + str(r) + '] = ' + str(y_test[r]))    

# Display is disabled; uncomment to render the sampled digits.
#show_images(images_2_show, titles_2_show)

# Flatten every image to one feature row: (n_samples, n_pixels).
x_train = np.array(x_train).reshape((len(x_train), -1))
y_train = np.array(y_train)
x_test = np.array(x_test).reshape((len(x_test), -1))
y_test = np.array(y_test)
print("Training Set Data  Shape: ", x_train.shape)
print("Training Set Label Shape: ", y_train.shape)
print("Test Set Data  Shape: ", x_test.shape)
print("Test Set Label Shape: ", y_test.shape)

# Shared result accumulators appended to by condense() and compare_random().
random_prototyping_accuracy = []
cnn_accuracy = []
sample_sizes = []

In [None]:
def run_knn(x_train, y_train, x_test, y_test, k):
    """Fit a k-nearest-neighbours classifier and score it on a second set.

    Args:
        x_train, y_train: samples and labels the classifier is fitted on.
        x_test, y_test: samples to predict and the labels to compare against.
        k: number of neighbours handed to KNeighborsClassifier.

    Returns:
        The value of sklearn's accuracy_score for the predictions on x_test.
    """
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.metrics import accuracy_score

    classifier = KNeighborsClassifier(n_neighbors=k)
    classifier.fit(x_train, y_train)
    predictions = classifier.predict(x_test)
    return accuracy_score(y_test, predictions)

In [None]:
def get_random_idx(sample_size, data_size):
    """Draw `sample_size` random indices into a dataset of `data_size` rows.

    Indices are drawn with replacement from NumPy's global random state, so
    np.random.seed() controls reproducibility and duplicates are possible.

    Args:
        sample_size: number of indices to draw.
        data_size: number of rows in the dataset; indices lie in
            [0, data_size - 1].

    Returns:
        1-D integer array of length `sample_size`.
    """
    # randint gives every index the same probability. Rounding
    # rand() * (data_size - 1) gave the first and last index only half the
    # probability of the others.
    return np.random.randint(0, data_size, size=sample_size)

def plot_accuracies(sample_size, random_accuracy, custom_accuracy):
    """Show a grouped bar chart comparing two accuracy series.

    Args:
        sample_size: sequence of sample sizes, used as x tick labels.
        random_accuracy: accuracy per sample size for random prototypes
            (bars labelled 'random').
        custom_accuracy: accuracy per sample size for the condensed subset
            (bars labelled 'cnn').
    """
    import matplotlib.pyplot as plt

    X_axis = np.arange(len(sample_size))
    plt.bar(X_axis-0.2, random_accuracy, 0.4, label='random')
    # Plot the series that was passed in; the previous code ignored the
    # parameter and read the global cnn_accuracy list instead.
    plt.bar(X_axis+0.2, custom_accuracy, 0.4, label='cnn')
    plt.xticks(X_axis, sample_size)
    plt.xlabel("Sample size")
    plt.ylabel("Test Accuracy")
    plt.legend()
    plt.show()


def compare_random(sample_size):
    """Score a random subset of the training set as a 1-NN baseline.

    Draws `sample_size` indices with get_random_idx, fits 1-NN on those rows
    of the global x_train / y_train, evaluates on the global x_test / y_test
    and appends the accuracy to the global random_prototyping_accuracy list.
    """
    chosen = get_random_idx(sample_size, x_train.shape[0])
    baseline_accuracy = run_knn(
        np.take(x_train, chosen, axis=0),
        np.take(y_train, chosen, axis=0),
        x_test,
        y_test,
        1,
    )
    random_prototyping_accuracy.append(baseline_accuracy)

In [None]:
def initialize_cnn_subset(num_classes=10, search_limit=100):
    """Pick one seed index per class from the start of the global y_train.

    For each label 0 .. num_classes - 1, the first `search_limit` entries of
    y_train are scanned and the index of the first match is kept. A label
    that does not occur in that window is reported and skipped.

    Args:
        num_classes: number of label values to look for (10 for MNIST).
        search_limit: how many leading training labels to scan per class.

    Returns:
        List of training-set indices, ordered by label, with at most one
        entry per class.
    """
    # Never scan past the end of the label array.
    limit = min(search_limit, len(y_train))
    subset_idxs = []
    for label in range(num_classes):
        for j in range(limit):
            if y_train[j] == label:
                subset_idxs.append(j)
                break
        else:
            # for/else replaces the old `found` flag, which was never
            # initialised or reset and so either raised or hid this message.
            print("Couldn't find label {} in {} attempts".format(label, limit))
    return subset_idxs

def condense(subset_idxs, x_train, y_train, x_test, y_test):
    """Grow a prototype subset until 1-NN on it classifies all of x_train.

    Each pass walks the training set in order; a sample that is not yet in
    the subset and is misclassified by 1-NN on the current subset is added
    to it immediately. After every pass the subset is scored on the full
    training and test sets, and the results are printed and recorded.

    Side effects:
        Appends to the global lists sample_sizes and cnn_accuracy, calls
        compare_random() once per pass, and writes the final indices to
        'cnn_subset_idxs.txt' in the working directory.

    Args:
        subset_idxs: initial training-set indices of the subset.
        x_train, y_train: training samples (2-D array) and labels.
        x_test, y_test: held-out samples and labels used for reporting.

    Returns:
        1-D integer array with the final subset indices.
    """
    subset_idxs = np.array(subset_idxs, dtype=int)
    n_train = x_train.shape[0]

    # Boolean mask gives O(1) membership tests instead of scanning the index
    # array for every training sample.
    in_subset = np.zeros(n_train, dtype=bool)
    in_subset[subset_idxs] = True

    # Gather the subset once and refresh it only when it actually grows.
    subset_x = np.take(x_train, subset_idxs, axis=0)
    subset_y = np.take(y_train, subset_idxs, axis=0)

    iteration = 0  # not named `iter`, to avoid shadowing the builtin
    train_acc = 0
    test_acc = 0
    while train_acc < 1.:
        added = 0
        for i in range(n_train):
            if in_subset[i]:
                continue
            found = run_knn(subset_x, subset_y, np.reshape(x_train[i], (1,-1)), np.reshape(y_train[i], (1,-1)), 1)
            if found < 0.5:
                subset_idxs = np.append(subset_idxs, i)
                in_subset[i] = True
                subset_x = np.take(x_train, subset_idxs, axis=0)
                subset_y = np.take(y_train, subset_idxs, axis=0)
                added += 1
                #print("TR idx: ", i, ", Subset size: ", subset_idxs.shape[0])
        train_acc = run_knn(subset_x, subset_y, x_train, y_train, 1)
        test_acc = run_knn(subset_x, subset_y, x_test, y_test, 1)
        print("[Iter ", iteration, "] Subset size: ", subset_idxs.shape[0], " Training accuracy: ", train_acc, " Test Accuracy: ", test_acc)
        sample_sizes.append(subset_idxs.shape[0])
        cnn_accuracy.append(test_acc)
        compare_random(subset_idxs.shape[0])
        iteration += 1
        if added == 0 and train_acc < 1.:
            # Nothing changed in this pass, so every later pass would be
            # identical (e.g. duplicate samples with conflicting labels).
            # Stop instead of looping forever.
            print("Subset stopped growing before reaching 100% training accuracy; stopping.")
            break

    with open('cnn_subset_idxs.txt', 'w') as f:
        for subset_idx in subset_idxs:
            f.write("%s\n" % subset_idx)

    return subset_idxs

def modified_condense(x_train, y_train, x_test, y_test):
    """Build a prototype set from class seeds plus centroids of the errors.

    The set starts with the first sample of each label 0-9 found among the
    first 100 training samples. Each pass evaluates every training sample
    with 1-NN on the current prototypes, prints the training and test
    accuracy, and (while training accuracy is below 0.94) appends, for each
    label, the centroid of that label's misclassified samples as a new
    prototype.

    Args:
        x_train, y_train: training samples (2-D array) and labels.
        x_test, y_test: held-out samples and labels used for reporting.

    Returns:
        (subset_data, subset_labels): the prototypes and their labels as
        they were when the last printed accuracies were computed.
    """
    subset_data = []
    subset_labels = []

    # Never scan past the end of the label array.
    search_limit = min(100, len(y_train))
    for label in range(10): #mnist label classes
        for j in range(search_limit):
            if y_train[j] == label:
                subset_data.append(x_train[j])
                subset_labels.append(label)
                break
        else:
            # for/else replaces the old `found` flag, which was never
            # initialised or reset between labels.
            print("Couldn't find a label in 100 attempts")

    subset_data = np.array(subset_data)
    subset_labels = np.array(subset_labels)
    print("Initial Subset size: ", subset_data.shape, " ", subset_labels.shape)

    iteration = 0  # not named `iter`, to avoid shadowing the builtin
    while True:
        misclassified_data = []
        misclassified_labels = []
        for i in range(x_train.shape[0]):
            found = run_knn(subset_data, subset_labels, np.reshape(x_train[i], (1,-1)), np.reshape(y_train[i], (1,-1)), 1)
            if found < 0.5:
                misclassified_data.append(x_train[i])
                misclassified_labels.append(y_train[i])
        train_acc = run_knn(subset_data, subset_labels, x_train, y_train, 1)
        test_acc = run_knn(subset_data, subset_labels, x_test, y_test, 1)
        print("[Iter ", iteration, "] Subset size: ", subset_data.shape[0], " Misclassified: ", len(misclassified_data), " Training accuracy: ", train_acc, " Test Accuracy: ", test_acc)
        if train_acc >= 0.94:
            # Stop before adding more centroids so the returned prototypes
            # are exactly the ones that were just evaluated.
            break

        misclassified_data = np.array(misclassified_data)
        misclassified_labels = np.array(misclassified_labels)
        for label in range(10):
            idxs = np.flatnonzero(misclassified_labels == label)
            if idxs.shape[0] != 0:
                # Mean of this label's misclassified samples, as a (1, n_features) row.
                centroid_data = np.sum(np.take(misclassified_data, idxs, axis=0), axis=0).reshape(1, misclassified_data.shape[1])/idxs.shape[0]
                subset_data = np.append(subset_data, centroid_data, axis=0)
                subset_labels = np.append(subset_labels, label)
        iteration += 1

    return subset_data, subset_labels


In [None]:
# Seed the subset with one training index per label, taken from the first
# 100 training samples (see initialize_cnn_subset).
cnn_subset_ids = initialize_cnn_subset()

In [None]:
# Grow the seed subset with condense(); each pass also appends to
# sample_sizes, cnn_accuracy and (via compare_random) random_prototyping_accuracy.
cnn_subset_ids = condense(cnn_subset_ids, x_train, y_train, x_test, y_test)
# Bar chart of the random baseline against the condensed subset, per recorded subset size.
plot_accuracies(sample_sizes, random_prototyping_accuracy, cnn_accuracy)
# Centroid-based variant, currently disabled.
#modified_condense(x_train, y_train, x_test, y_test)

In [None]:
# Subset sizes for the random-prototype baseline.
# NOTE(review): 4945 presumably matches a condensed-subset size from an
# earlier run -- confirm.
M = [1000, 4945, 10000]
n_iterations = 5

for m in M:
    # `_trial` instead of `iter`: a module-level `iter` would shadow the
    # builtin for every cell executed afterwards.
    for _trial in range(n_iterations):
        rand_idx = get_random_idx(m, x_train.shape[0])
        rand_data = np.take(x_train, rand_idx, axis=0)
        rand_labels = np.take(y_train, rand_idx, axis=0)

        # NOTE(review): this appends [size, accuracy] pairs to the same list
        # that compare_random() fills with plain accuracies, so re-running
        # plot_accuracies() after this cell would mix both kinds of entries.
        random_prototyping_accuracy.append([m, run_knn(rand_data, rand_labels, x_test, y_test, 1)])

